├── .gitignore
├── LICENSE
├── README.md
├── __images__
│   └── seq-to-seq-DNC.jpg
├── environment.yml
├── notebooks
│   ├── .gitkeep
│   ├── Data Preparation.ipynb
│   ├── Preprocessing.ipynb
│   └── Text Normalization Demo.ipynb
├── results
│   ├── .gitkeep
│   ├── base-paper_classwise_accuracy.csv
│   ├── english
│   │   ├── Semiotic_Class-wise_Accuracy.png
│   │   ├── classwise_accuracy.csv
│   │   ├── mistakes.csv
│   │   └── normalized.csv
│   └── russian
│       ├── .gitkeep
│       ├── Semiotic_Class-wise_Accuracy.png
│       ├── classwise_accuracy.csv
│       ├── mistakes.csv
│       └── normalized.csv
├── setup.sh
└── src
    ├── .gitkeep
    ├── DNCnormalize.py
    ├── Encoder.py
    ├── XGBclassify.py
    ├── classification_report.py
    ├── lib
    │   ├── access.py
    │   ├── addressing.py
    │   ├── dnc.py
    │   ├── seq2seq.py
    │   └── util.py
    ├── preprocessing.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | MANIFEST
2 | build
3 | dist
4 | _build
5 | docs/man/*.gz
6 | docs/source/api/generated
7 | docs/source/config.rst
8 | docs/gh-pages
9 | notebook/i18n/*/LC_MESSAGES/*.mo
10 | notebook/i18n/*/LC_MESSAGES/nbjs.json
11 | notebook/static/components
12 | notebook/static/style/*.min.css*
13 | notebook/static/*/js/built/
14 | notebook/static/*/built/
15 | notebook/static/built/
16 | notebook/static/*/js/main.min.js*
17 | notebook/static/lab/*bundle.js
18 | node_modules
19 | *.py[co]
20 | __pycache__
21 | *.egg-info
22 | *~
23 | *.bak
24 | .ipynb_checkpoints
25 | .tox
26 | .DS_Store
27 | \#*#
28 | .#*
29 | .coverage
30 |
31 | *.swp
32 | *.map
33 | .idea/
34 | Read the Docs
35 | config.rst
36 |
37 | /.project
38 | /.pydevproject
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text Normalization using Memory Augmented Neural Networks
2 |
3 | The Text Normalization Demo notebook and the accompanying paper "Text Normalization using Memory Augmented Neural Networks" demonstrate accuracies of 99.4% (English) and 99.3% (Russian) on the Text Normalization Challenge by Richard Sproat and Navdeep Jaitly. An earlier version of the approach used here secured 6th place in the [Kaggle Russian Text Normalization Challenge](https://www.kaggle.com/c/text-normalization-challenge-russian-language) hosted by Google's Text Normalization Research Group.
4 |
5 | Go straight to the [Text Normalization Demo notebook](notebooks/Text%20Normalization%20Demo.ipynb).
6 |
7 | ## Architecture
8 | Two models are used for text normalization. An XGBoost model first classifies each token as to-be-normalized or remain-self. The to-be-normalized tokens are then fed character by character to our proposed sequence-to-sequence DNC model.
9 |
10 | More details about the architecture and implementation can be found in the original paper.
11 |
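In outline, inference proceeds as sketched below. This is a minimal illustration only: `classifier` and `seq2seq` stand in for the XGBoost classifier (src/XGBclassify.py) and the sequence-to-sequence DNC model (src/DNCnormalize.py), and the interfaces shown here are hypothetical.

```python
# Hedged sketch of the two-stage pipeline; the actual interfaces live in src/.
def normalize_tokens(tokens, classifier, seq2seq):
    normalized = []
    for token in tokens:
        # Stage 1: the XGBoost model tags the token RemainSelf or ToBeNormalized.
        if classifier.predict(token) == 'RemainSelf':
            normalized.append(token)
        else:
            # Stage 2: the seq2seq DNC consumes the token character by character
            # and emits its spoken form, e.g. '2008' -> 'two thousand eight'.
            normalized.append(seq2seq.decode(list(token)))
    return normalized
```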
12 |
13 |
14 | **Sequence to sequence DNC**
15 |
16 | ![Sequence to sequence DNC](__images__/seq-to-seq-DNC.jpg)
17 |
18 |
19 |
20 |
21 | ## Results
22 |
23 | ### 1. Normalizing English Data
24 |
25 |
26 |
27 | **Semiotic Classwise Accuracy**
28 |
29 | | semiotic-class | accuracy | count | correct |
30 | |----------------|--------------------|-------|---------|
31 | | ALL | 0.994267233453397 | 92451 | 91921 |
32 | | ADDRESS | 1.0 | 4 | 4 |
33 | | CARDINAL | 0.9942140790742526 | 1037 | 1031 |
34 | | DATE | 0.9971751412429378 | 2832 | 2824 |
35 | | DECIMAL | 0.9891304347826086 | 92 | 91 |
36 | | DIGIT | 0.7954545454545454 | 44 | 35 |
37 | | ELECTRONIC | 0.7346938775510204 | 49 | 36 |
38 | | FRACTION | 0.6875 | 16 | 11 |
39 | | LETTERS | 0.971611071682044 | 1409 | 1369 |
40 | | MEASURE | 0.971830985915493 | 142 | 138 |
41 | | MONEY | 0.972972972972973 | 37 | 36 |
42 | | ORDINAL | 0.9805825242718447 | 103 | 101 |
43 | | PLAIN | 0.9939611747724394 | 67894 | 67484 |
44 | | PUNCT | 0.9988729854615125 | 17746 | 17726 |
45 | | TELEPHONE | 0.918918918918919 | 37 | 34 |
46 | | TIME | 0.75 | 8 | 6 |
47 | | VERBATIM | 0.994005994005994 | 1001 | 995 |
48 |
49 |
50 |
51 | ### 2. Normalizing Russian Data
52 |
53 | **Semiotic Classwise Accuracy**
54 |
55 |
56 |
57 | | semiotic-class | accuracy | count | correct |
58 | |----------------|--------------------|-------|---------|
59 | | ALL | 0.9928752306965964 | 93196 | 92532 |
60 | | CARDINAL | 0.9417922948073701 | 2388 | 2249 |
61 | | DATE | 0.9732441471571907 | 1495 | 1455 |
62 | | DECIMAL | 0.9 | 60 | 54 |
63 | | DIGIT | 1.0 | 16 | 16 |
64 | | ELECTRONIC | 0.6041666666666666 | 48 | 29 |
65 | | FRACTION | 0.6086956521739131 | 23 | 14 |
66 | | LETTERS | 0.9907608695652174 | 1840 | 1823 |
67 | | MEASURE | 0.8978102189781022 | 411 | 369 |
68 | | MONEY | 0.8947368421052632 | 19 | 17 |
69 | | ORDINAL | 0.9461358313817331 | 427 | 404 |
70 | | PLAIN | 0.994688407139769 | 64764 | 64420 |
71 | | PUNCT | 0.9998519542045006 | 20264 | 20261 |
72 | | TELEPHONE | 0.8202247191011236 | 89 | 73 |
73 | | TIME | 0.75 | 8 | 6 |
74 | | VERBATIM | 0.9985119047619048 | 1344 | 1342 |
75 |
76 |
77 |
78 | ## How to run?
79 |
80 | **Requirements:**
81 | - [Jupyter Notebook](http://jupyter.org/)
82 | - [Anaconda Package Manager](https://anaconda.org/)
83 | - The rest will be installed by conda (see below)
84 |
85 | **Follow these steps for a demonstration:**
86 |
87 | 1. Clone the repo
88 | 2. Download and extract the required data:
89 | ```
90 | $ sh setup.sh
91 | ```
92 | 3. Create and activate an environment using the provided file:
93 | ```
94 | $ conda env create -f environment.yml
95 | $ source activate deep-tf
96 | ```
97 | 4. Start a Jupyter Notebook server
98 | 5. Open 'notebooks/Text Normalization Demo.ipynb'
99 | 6. Set the language to English or Russian below the 'Global Config' cell:
100 | ```python
101 | lang = 'english'
102 | # lang = 'russian'
103 | ```
104 | 7. Run the notebook
105 |
106 | **Full Requirements:**
107 |
108 | - numpy 1.13.3
109 | - pandas 0.21.0
110 | - matplotlib 2.1.0
111 | - watermark 1.5.0
112 | - seaborn 0.8.1
113 | - scikit-learn 0.19.1
114 | - xgboost 0.6
115 | - tensorflow 1.3.0
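
To confirm that the environment resolved to these versions, one quick, optional check (not part of the repo; `getattr` guards packages that may not expose `__version__`) is:

```python
# Print the versions of the key packages in the active deep-tf environment.
import matplotlib
import numpy
import pandas
import sklearn
import tensorflow
import xgboost

for pkg in (numpy, pandas, matplotlib, sklearn, xgboost, tensorflow):
    print(pkg.__name__, getattr(pkg, '__version__', 'unknown'))
```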
116 |
117 | ## Authors
118 | 1. Subhojeet Pramanik (http://github.com/subho406)
119 | 2. Aman Hussain (https://github.com/AmanDaVinci)
120 |
121 | **Acknowledgements**
122 |
123 | Differentiable Neural Computer, Tensorflow Implementation: https://github.com/deepmind/dnc
124 |
--------------------------------------------------------------------------------
/__images__/seq-to-seq-DNC.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/__images__/seq-to-seq-DNC.jpg
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: deep-tf
2 | channels:
3 | - conda-forge
4 | - anaconda
5 | - defaults
6 | dependencies:
7 | - _nb_ext_conf=0.4.0=py36_1
8 | - anaconda-client=1.6.5=py36h19c0dcd_0
9 | - asn1crypto=0.22.0=py36h265ca7c_1
10 | - backports=1.0=py36hfa02d7e_1
11 | - backports.weakref=1.0rc1=py36_0
12 | - bcolz=1.1.2=py36h00f5784_0
13 | - bleach=1.5.0=py36_0
14 | - bokeh=0.12.10=py36hbb0e44a_0
15 | - bzip2=1.0.6=h6d464ef_2
16 | - ca-certificates=2017.08.26=h1d4fec5_0
17 | - certifi=2017.11.5=py36hf29ccca_0
18 | - cffi=1.10.0=py36had8d393_1
19 | - chardet=3.0.4=py36h0f667ec_1
20 | - click=6.7=py36h5253387_0
21 | - cloudpickle=0.5.2=py36h84cdd9c_0
22 | - clyent=1.2.2=py36h7e57e65_1
23 | - cryptography=2.0.3=py36ha225213_1
24 | - cudatoolkit=8.0=3
25 | - cudnn=6.0.21=cuda8.0_0
26 | - cycler=0.10.0=py36h93f1223_0
27 | - dask=0.16.0=py36h73d177f_0
28 | - dask-core=0.16.0=py36ha827fd6_0
29 | - dbus=1.10.22=h3b5a359_0
30 | - decorator=4.1.2=py36hd076ac8_0
31 | - distributed=1.20.0=py36h1c9f478_0
32 | - entrypoints=0.2.3=py36h1aec115_2
33 | - expat=2.2.5=he0dffb1_0
34 | - fontconfig=2.12.4=h88586e7_1
35 | - freetype=2.8=hab7d2ae_1
36 | - glib=2.53.6=h5d9569c_2
37 | - gmp=6.1.2=h6c8ec71_1
38 | - gst-plugins-base=1.12.2=he3457e5_0
39 | - gstreamer=1.12.2=h4f93127_0
40 | - hdf5=1.10.1=h9caa474_1
41 | - heapdict=1.0.0=py36h79797d7_0
42 | - html5lib=0.9999999=py36_0
43 | - icu=58.2=h9c2bf20_1
44 | - idna=2.6=py36h82fb2a8_1
45 | - intel-openmp=2018.0.0=hc7b2577_8
46 | - ipykernel=4.6.1=py36hbf841aa_0
47 | - ipython=6.2.1=py36h88c514a_1
48 | - ipython_genutils=0.2.0=py36hb52b0d5_0
49 | - ipywidgets=7.0.5=py36h31d6531_0
50 | - jedi=0.10.2=py36h552def0_0
51 | - jinja2=2.9.6=py36h489bce4_1
52 | - jpeg=9b=h024ee3a_2
53 | - jsonschema=2.6.0=py36h006f8b5_0
54 | - jupyter_client=5.1.0=py36h614e9ea_0
55 | - jupyter_core=4.4.0=py36h7c827e3_0
56 | - libedit=3.1=heed3624_0
57 | - libffi=3.2.1=hd88cf55_4
58 | - libgcc=7.2.0=h69d50b8_2
59 | - libgcc-ng=7.2.0=h7cc24e2_2
60 | - libgfortran-ng=7.2.0=h9f7466a_2
61 | - libgpuarray=0.6.9=0
62 | - libpng=1.6.32=hbd3595f_4
63 | - libprotobuf=3.4.1=h5b8497f_0
64 | - libsodium=1.0.15=hf101ebd_0
65 | - libstdcxx-ng=7.2.0=h7a57d05_2
66 | - libxcb=1.12=hcd93eb1_4
67 | - libxml2=2.9.4=h2e8b1d7_6
68 | - locket=0.2.0=py36h787c0ad_1
69 | - lzo=2.10=h49e0be7_2
70 | - mako=1.0.7=py36h0727276_0
71 | - markdown=2.6.9=py36_0
72 | - markupsafe=1.0=py36hd9260cd_1
73 | - matplotlib=2.1.0=py36hba5de38_0
74 | - mistune=0.8.1=py36h3d5977c_0
75 | - mkl=2018.0.1=h19d6760_4
76 | - mkl-service=1.1.2=py36h17a0993_4
77 | - msgpack-python=0.4.8=py36hec4c5d1_0
78 | - nb_anacondacloud=1.4.0=py36_0
79 | - nb_conda=2.2.1=py36h8118bb2_0
80 | - nb_conda_kernels=2.1.0=py36_0
81 | - nbconvert=5.3.1=py36hb41ffb7_0
82 | - nbformat=4.4.0=py36h31c9010_0
83 | - nbpresent=3.0.2=py36h5f95a39_1
84 | - ncurses=6.0=h9df7e31_2
85 | - nose=1.3.7=py36hcdf7029_2
86 | - notebook=5.2.1=py36h690a4eb_0
87 | - numexpr=2.6.2=py36hc561933_2
88 | - numpy=1.13.3=py36ha12f23b_0
89 | - openssl=1.0.2m=h26d622b_1
90 | - pandas=0.21.0=py36h78bd809_1
91 | - pandoc=1.19.2.1=hea2e7c5_1
92 | - pandocfilters=1.4.2=py36ha6701b7_1
93 | - partd=0.3.8=py36h36fd896_0
94 | - patsy=0.4.1=py36ha3be15e_0
95 | - pcre=8.41=hc27e229_1
96 | - pexpect=4.2.1=py36h3b9d41b_0
97 | - pickleshare=0.7.4=py36h63277f8_0
98 | - pip=9.0.1=py36h6c6f9ce_4
99 | - prompt_toolkit=1.0.15=py36h17d85b1_0
100 | - protobuf=3.4.1=py36h306e679_0
101 | - psutil=5.4.0=py36h84c53db_0
102 | - ptyprocess=0.5.2=py36h69acd42_0
103 | - pycparser=2.18=py36hf9f622e_1
104 | - pygments=2.2.0=py36h0d3125c_0
105 | - pygpu=0.6.9=py36_0
106 | - pyopenssl=17.2.0=py36h5cc804b_0
107 | - pyparsing=2.2.0=py36hee85983_1
108 | - pyqt=5.6.0=py36h0386399_5
109 | - pysocks=1.6.7=py36hd97a5b1_1
110 | - pytables=3.4.2=py36h3b5282a_2
111 | - python=3.6.3=h6c0c0dc_5
112 | - python-dateutil=2.6.1=py36h88d3b88_1
113 | - pytz=2017.3=py36h63b9c63_0
114 | - pyyaml=3.12=py36hafb9ca4_1
115 | - pyzmq=16.0.3=py36he2533c7_0
116 | - qt=5.6.2=h974d657_12
117 | - readline=7.0=ha6073c6_4
118 | - requests=2.18.4=py36he2e5f8d_1
119 | - scikit-learn=0.19.1=py36h7aa7ec6_0
120 | - scipy=1.0.0=py36hbf646e7_0
121 | - seaborn=0.8.1=py36hfad7ec4_0
122 | - setuptools=36.5.0=py36he42e2e1_0
123 | - simplegeneric=0.8.1=py36h2cb9092_0
124 | - sip=4.18.1=py36h51ed4ed_2
125 | - six=1.11.0=py36h372c433_1
126 | - sortedcontainers=1.5.7=py36hdf89491_0
127 | - sqlite=3.20.1=hb898158_2
128 | - statsmodels=0.8.0=py36h8533d0b_0
129 | - tblib=1.3.2=py36h34cf8b6_0
130 | - tensorflow-gpu=1.3.0=0
131 | - tensorflow-gpu-base=1.3.0=py36cuda8.0cudnn6.0_1
132 | - tensorflow-tensorboard=0.1.5=py36_0
133 | - terminado=0.6=py36ha25a19f_0
134 | - testpath=0.3.1=py36h8cadb63_0
135 | - theano=0.9.0=py36_0
136 | - tk=8.6.7=hc745277_3
137 | - toolz=0.8.2=py36h81f2dff_0
138 | - tornado=4.5.2=py36h1283b2a_0
139 | - traitlets=4.3.2=py36h674d592_0
140 | - urllib3=1.22=py36hbe7ace6_0
141 | - wcwidth=0.1.7=py36hdf4376a_0
142 | - webencodings=0.5.1=py36h800622e_1
143 | - werkzeug=0.12.2=py36hc703753_0
144 | - wheel=0.30.0=py36hfd4bba0_1
145 | - widgetsnbextension=3.0.8=py36h25a1d49_0
146 | - xz=5.2.3=h55aa19d_2
147 | - yaml=0.1.7=had09818_2
148 | - zeromq=4.2.2=hbedb6e5_2
149 | - zict=0.1.3=py36h3a3bf81_0
150 | - zlib=1.2.11=ha838bed_2
151 | - h5py=2.7.1=py36_2
152 | - keras=1.2.2=py36_0
153 | - watermark=1.5.0=py36_0
154 | - xgboost=0.6a2=py36_2
155 | - pip:
156 | - absl-py==0.1.6
157 | - dm-sonnet==1.14
158 | - ipython-genutils==0.2.0
159 | - jupyter-client==5.1.0
160 | - jupyter-core==4.4.0
161 | - nb-anacondacloud==1.4.0
162 | - nb-conda==2.2.1
163 | - nb-conda-kernels==2.1.0
164 | - prompt-toolkit==1.0.15
165 | - tables==3.4.2
166 | - tensorflow==1.3.0
167 | prefix: /home/amanthevinci/anaconda3/envs/deep-tf
168 |
169 |
--------------------------------------------------------------------------------
/notebooks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/notebooks/.gitkeep
--------------------------------------------------------------------------------
/notebooks/Preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Preprocessing\n",
8 | "---"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "These notebooks and the accompanying paper (Text Normalization using Memory Augmented Neural Networks) demonstrates an accuracy of 99.4% (English) and 99.3% (Russian) on the Text Normalization Challenge by Richard Sproat and Navdeep Jaitly. To achieve comparable and objective results, we need to preprocess the data provided by Richard Sproat and Navdeep Jaitly at [https://github.com/rwsproat/text-normalization-data]. From the README of the dataset:\n",
16 | "```\n",
17 | "In practice for the results reported in the paper only the first 100,002 lines\n",
18 | "of output-00099-of-00100 were used (for English), and the first 100,007 lines of\n",
19 | "output-00099-of-00100 for Russian.\n",
20 | "```\n",
21 | "Hence, the 'output-00099-of-00100' file is extracted for further use. \n",
22 | "This notebook prepares the raw data for the next stage of normalization."
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Import Libraries"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "import numpy as np\n",
41 | "import pandas as pd\n",
42 | "import seaborn as sns\n",
43 | "import matplotlib.pyplot as plt\n",
44 | "\n",
45 | "%matplotlib inline"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "## Global Config\n",
53 | "**Language : English or Russian?**"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "lang = 'english'\n",
65 | "# lang = 'russian'"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 3,
71 | "metadata": {
72 | "collapsed": true
73 | },
74 | "outputs": [],
75 | "source": [
76 | "if lang == 'english':\n",
77 | " # input data\n",
78 | " data_directory = '../data/english/'\n",
79 | " data = 'output-00099-of-00100'\n",
80 | " # output\n",
81 | " out = 'output-00099-of-00100_processed.csv'\n",
82 | " # test size \n",
83 | " test_rows = 100002\n",
84 | " \n",
85 | "elif lang == 'russian':\n",
86 | " # input data\n",
87 | " data_directory = '../data/russian/'\n",
88 | " data = 'output-00099-of-00100'\n",
89 | " # output\n",
90 | " out = 'output-00099-of-00100_processed.csv'\n",
91 | " # test size\n",
92 | " test_rows = 100007"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## Load Data"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "By default, Pandas treats double quote as enclosing an entry so it includes all tabs and newlines in that entry until it reaches the next quote. To escape it we need to have the quoting argument set to QUOTE_NONE or 3 as given in the documentation - [https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html]\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 4,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "\n",
119 | "RangeIndex: 100002 entries, 0 to 100001\n",
120 | "Data columns (total 3 columns):\n",
121 | "semiotic 100002 non-null object\n",
122 | "before 100002 non-null object\n",
123 | "after 92451 non-null object\n",
124 | "dtypes: object(3)\n",
125 | "memory usage: 2.3+ MB\n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "raw_data = pd.read_csv(data_directory+data, nrows=test_rows,\n",
131 | " header=None, sep='\\t', quoting = 3,\n",
132 | " names=['semiotic', 'before', 'after'])\n",
133 | "raw_data.info()"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 5,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/html": [
144 | "\n",
145 | "\n",
158 | "
\n",
159 | " \n",
160 | " \n",
161 | " | \n",
162 | " semiotic | \n",
163 | " before | \n",
164 | " after | \n",
165 | "
\n",
166 | " \n",
167 | " \n",
168 | " \n",
169 | " 0 | \n",
170 | " PLAIN | \n",
171 | " It | \n",
172 | " <self> | \n",
173 | "
\n",
174 | " \n",
175 | " 1 | \n",
176 | " PLAIN | \n",
177 | " can | \n",
178 | " <self> | \n",
179 | "
\n",
180 | " \n",
181 | " 2 | \n",
182 | " PLAIN | \n",
183 | " be | \n",
184 | " <self> | \n",
185 | "
\n",
186 | " \n",
187 | " 3 | \n",
188 | " PLAIN | \n",
189 | " summarized | \n",
190 | " <self> | \n",
191 | "
\n",
192 | " \n",
193 | " 4 | \n",
194 | " PLAIN | \n",
195 | " as | \n",
196 | " <self> | \n",
197 | "
\n",
198 | " \n",
199 | " 5 | \n",
200 | " PLAIN | \n",
201 | " an | \n",
202 | " <self> | \n",
203 | "
\n",
204 | " \n",
205 | " 6 | \n",
206 | " PUNCT | \n",
207 | " \" | \n",
208 | " sil | \n",
209 | "
\n",
210 | " \n",
211 | " 7 | \n",
212 | " PLAIN | \n",
213 | " error | \n",
214 | " <self> | \n",
215 | "
\n",
216 | " \n",
217 | " 8 | \n",
218 | " PLAIN | \n",
219 | " driven | \n",
220 | " <self> | \n",
221 | "
\n",
222 | " \n",
223 | " 9 | \n",
224 | " PLAIN | \n",
225 | " transformation | \n",
226 | " <self> | \n",
227 | "
\n",
228 | " \n",
229 | "
\n",
230 | "
"
231 | ],
232 | "text/plain": [
233 | " semiotic before after\n",
234 | "0 PLAIN It \n",
235 | "1 PLAIN can \n",
236 | "2 PLAIN be \n",
237 | "3 PLAIN summarized \n",
238 | "4 PLAIN as \n",
239 | "5 PLAIN an \n",
240 | "6 PUNCT \" sil\n",
241 | "7 PLAIN error \n",
242 | "8 PLAIN driven \n",
243 | "9 PLAIN transformation "
244 | ]
245 | },
246 | "execution_count": 5,
247 | "metadata": {},
248 | "output_type": "execute_result"
249 | }
250 | ],
251 | "source": [
252 | "raw_data.head(10)"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "## Data Analysis"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "**What are the different type of semiotic classes available?**"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 6,
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "data": {
276 | "text/plain": [
277 | "PLAIN 67894\n",
278 | "PUNCT 17746\n",
279 | " 7551\n",
280 | "DATE 2832\n",
281 | "LETTERS 1409\n",
282 | "CARDINAL 1037\n",
283 | "VERBATIM 1001\n",
284 | "MEASURE 142\n",
285 | "ORDINAL 103\n",
286 | "DECIMAL 92\n",
287 | "ELECTRONIC 49\n",
288 | "DIGIT 44\n",
289 | "MONEY 37\n",
290 | "TELEPHONE 37\n",
291 | "FRACTION 16\n",
292 | "TIME 8\n",
293 | "ADDRESS 4\n",
294 | "Name: semiotic, dtype: int64"
295 | ]
296 | },
297 | "execution_count": 6,
298 | "metadata": {},
299 | "output_type": "execute_result"
300 | }
301 | ],
302 | "source": [
303 | "raw_data['semiotic'].value_counts()"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "The semiotic classes mentioned in the paper are:\n",
311 | "\n",
312 | "1. PLAIN\n",
313 | "2. PUNCT\n",
314 | "3. DATE\n",
315 | "4. TRANS\n",
316 | "5. LETTERS\n",
317 | "6. CARDINAL\n",
318 | "7. VERBATIM\n",
319 | "8. MEASURE\n",
320 | "9. ORDINAL\n",
321 | "10. DECIMAL\n",
322 | "11. ELECTRONIC\n",
323 | "12. DIGIT\n",
324 | "13. MONEY\n",
325 | "14. FRACTION\n",
326 | "15. TIME\n"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {},
332 | "source": [
333 | "## Data Preprocessing"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "metadata": {},
339 | "source": [
340 | "**Generating sentence and word token ids**"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "Our text normalization approach requires sentence and token ids to encode and generate batches"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 7,
353 | "metadata": {
354 | "collapsed": true
355 | },
356 | "outputs": [],
357 | "source": [
358 | "# to avoid modifying something we are iterating over\n",
359 | "data = pd.DataFrame(columns=['sentence_id',\n",
360 | " 'token_id',\n",
361 | " 'semiotic',\n",
362 | " 'before',\n",
363 | " 'after'])\n",
364 | "# initialize columns and iterator\n",
365 | "sentence_id = 0\n",
366 | "token_id = -1"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 8,
372 | "metadata": {
373 | "collapsed": true
374 | },
375 | "outputs": [],
376 | "source": [
377 | "for row in raw_data.itertuples():\n",
378 | " # look for end of sentences\n",
379 | " if (row.semiotic == '' and row.before == ''):\n",
380 | " sentence_id += 1\n",
381 | " token_id = -1\n",
382 | " continue\n",
383 | " else:\n",
384 | " token_id += 1\n",
385 | " \n",
386 | " new_row = {'sentence_id': sentence_id,\n",
387 | " 'token_id': token_id,\n",
388 | " 'semiotic': row.semiotic,\n",
389 | " 'before': row.before,\n",
390 | " 'after': row.after}\n",
391 | " data = data.append(new_row, ignore_index=True) "
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": 9,
397 | "metadata": {},
398 | "outputs": [
399 | {
400 | "data": {
401 | "text/html": [
402 | "\n",
403 | "\n",
416 | "
\n",
417 | " \n",
418 | " \n",
419 | " | \n",
420 | " sentence_id | \n",
421 | " token_id | \n",
422 | " semiotic | \n",
423 | " before | \n",
424 | " after | \n",
425 | "
\n",
426 | " \n",
427 | " \n",
428 | " \n",
429 | " 0 | \n",
430 | " 0 | \n",
431 | " 0 | \n",
432 | " PLAIN | \n",
433 | " It | \n",
434 | " <self> | \n",
435 | "
\n",
436 | " \n",
437 | " 1 | \n",
438 | " 0 | \n",
439 | " 1 | \n",
440 | " PLAIN | \n",
441 | " can | \n",
442 | " <self> | \n",
443 | "
\n",
444 | " \n",
445 | " 2 | \n",
446 | " 0 | \n",
447 | " 2 | \n",
448 | " PLAIN | \n",
449 | " be | \n",
450 | " <self> | \n",
451 | "
\n",
452 | " \n",
453 | " 3 | \n",
454 | " 0 | \n",
455 | " 3 | \n",
456 | " PLAIN | \n",
457 | " summarized | \n",
458 | " <self> | \n",
459 | "
\n",
460 | " \n",
461 | " 4 | \n",
462 | " 0 | \n",
463 | " 4 | \n",
464 | " PLAIN | \n",
465 | " as | \n",
466 | " <self> | \n",
467 | "
\n",
468 | " \n",
469 | " 5 | \n",
470 | " 0 | \n",
471 | " 5 | \n",
472 | " PLAIN | \n",
473 | " an | \n",
474 | " <self> | \n",
475 | "
\n",
476 | " \n",
477 | " 6 | \n",
478 | " 0 | \n",
479 | " 6 | \n",
480 | " PUNCT | \n",
481 | " \" | \n",
482 | " sil | \n",
483 | "
\n",
484 | " \n",
485 | " 7 | \n",
486 | " 0 | \n",
487 | " 7 | \n",
488 | " PLAIN | \n",
489 | " error | \n",
490 | " <self> | \n",
491 | "
\n",
492 | " \n",
493 | " 8 | \n",
494 | " 0 | \n",
495 | " 8 | \n",
496 | " PLAIN | \n",
497 | " driven | \n",
498 | " <self> | \n",
499 | "
\n",
500 | " \n",
501 | " 9 | \n",
502 | " 0 | \n",
503 | " 9 | \n",
504 | " PLAIN | \n",
505 | " transformation | \n",
506 | " <self> | \n",
507 | "
\n",
508 | " \n",
509 | "
\n",
510 | "
"
511 | ],
512 | "text/plain": [
513 | " sentence_id token_id semiotic before after\n",
514 | "0 0 0 PLAIN It \n",
515 | "1 0 1 PLAIN can \n",
516 | "2 0 2 PLAIN be \n",
517 | "3 0 3 PLAIN summarized \n",
518 | "4 0 4 PLAIN as \n",
519 | "5 0 5 PLAIN an \n",
520 | "6 0 6 PUNCT \" sil\n",
521 | "7 0 7 PLAIN error \n",
522 | "8 0 8 PLAIN driven \n",
523 | "9 0 9 PLAIN transformation "
524 | ]
525 | },
526 | "execution_count": 9,
527 | "metadata": {},
528 | "output_type": "execute_result"
529 | }
530 | ],
531 | "source": [
532 | "data.head(10)"
533 | ]
534 | },
535 | {
536 | "cell_type": "markdown",
537 | "metadata": {},
538 | "source": [
539 | "**Transforming 'after' tokens** \n",
540 | "From the above mentioned paper:\n",
541 | "```\n",
542 | "Semiotic class instances are verbalized as sequences\n",
543 | "of fully spelled words, most ordinary words are left alone (rep-\n",
544 | "resented here as ), and punctuation symbols are mostly\n",
545 | "transduced to sil (for “silence”).\n",
546 | "```\n",
547 | "Hence we transform as follows:\n",
548 | "1. sil is replaced with < self >\n",
549 | "2. < self > is replaced with the before column\n"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 10,
555 | "metadata": {
556 | "collapsed": true
557 | },
558 | "outputs": [],
559 | "source": [
560 | "sil_mask = (data['after'] == 'sil')\n",
561 | "data.loc[sil_mask, 'after'] = '' "
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 11,
567 | "metadata": {
568 | "collapsed": true
569 | },
570 | "outputs": [],
571 | "source": [
572 | "self_mask = (data['after'] == '')\n",
573 | "data.loc[self_mask, ('after')] = data.loc[self_mask, 'before']"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {},
579 | "source": [
580 | "Sanity Check..."
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 12,
586 | "metadata": {},
587 | "outputs": [
588 | {
589 | "data": {
590 | "text/html": [
591 | "\n",
592 | "\n",
605 | "
\n",
606 | " \n",
607 | " \n",
608 | " | \n",
609 | " sentence_id | \n",
610 | " token_id | \n",
611 | " semiotic | \n",
612 | " before | \n",
613 | " after | \n",
614 | "
\n",
615 | " \n",
616 | " \n",
617 | " \n",
618 | " 27604 | \n",
619 | " 2255 | \n",
620 | " 1 | \n",
621 | " PUNCT | \n",
622 | " : | \n",
623 | " : | \n",
624 | "
\n",
625 | " \n",
626 | " 23472 | \n",
627 | " 1886 | \n",
628 | " 3 | \n",
629 | " PUNCT | \n",
630 | " : | \n",
631 | " : | \n",
632 | "
\n",
633 | " \n",
634 | " 33683 | \n",
635 | " 2775 | \n",
636 | " 15 | \n",
637 | " PUNCT | \n",
638 | " , | \n",
639 | " , | \n",
640 | "
\n",
641 | " \n",
642 | " 69723 | \n",
643 | " 5727 | \n",
644 | " 4 | \n",
645 | " PUNCT | \n",
646 | " , | \n",
647 | " , | \n",
648 | "
\n",
649 | " \n",
650 | " 74352 | \n",
651 | " 6093 | \n",
652 | " 11 | \n",
653 | " PUNCT | \n",
654 | " . | \n",
655 | " . | \n",
656 | "
\n",
657 | " \n",
658 | "
\n",
659 | "
"
660 | ],
661 | "text/plain": [
662 | " sentence_id token_id semiotic before after\n",
663 | "27604 2255 1 PUNCT : :\n",
664 | "23472 1886 3 PUNCT : :\n",
665 | "33683 2775 15 PUNCT , ,\n",
666 | "69723 5727 4 PUNCT , ,\n",
667 | "74352 6093 11 PUNCT . ."
668 | ]
669 | },
670 | "execution_count": 12,
671 | "metadata": {},
672 | "output_type": "execute_result"
673 | }
674 | ],
675 | "source": [
676 | "data[sil_mask].sample(5)"
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": 13,
682 | "metadata": {},
683 | "outputs": [
684 | {
685 | "data": {
686 | "text/html": [
687 | "\n",
688 | "\n",
701 | "
\n",
702 | " \n",
703 | " \n",
704 | " | \n",
705 | " sentence_id | \n",
706 | " token_id | \n",
707 | " semiotic | \n",
708 | " before | \n",
709 | " after | \n",
710 | "
\n",
711 | " \n",
712 | " \n",
713 | " \n",
714 | " 27460 | \n",
715 | " 2242 | \n",
716 | " 11 | \n",
717 | " PUNCT | \n",
718 | " . | \n",
719 | " . | \n",
720 | "
\n",
721 | " \n",
722 | " 9551 | \n",
723 | " 759 | \n",
724 | " 5 | \n",
725 | " PLAIN | \n",
726 | " the | \n",
727 | " the | \n",
728 | "
\n",
729 | " \n",
730 | " 77947 | \n",
731 | " 6381 | \n",
732 | " 11 | \n",
733 | " PLAIN | \n",
734 | " far | \n",
735 | " far | \n",
736 | "
\n",
737 | " \n",
738 | " 4412 | \n",
739 | " 348 | \n",
740 | " 7 | \n",
741 | " PLAIN | \n",
742 | " in | \n",
743 | " in | \n",
744 | "
\n",
745 | " \n",
746 | " 42046 | \n",
747 | " 3427 | \n",
748 | " 7 | \n",
749 | " PLAIN | \n",
750 | " Takayama | \n",
751 | " Takayama | \n",
752 | "
\n",
753 | " \n",
754 | "
\n",
755 | "
"
756 | ],
757 | "text/plain": [
758 | " sentence_id token_id semiotic before after\n",
759 | "27460 2242 11 PUNCT . .\n",
760 | "9551 759 5 PLAIN the the\n",
761 | "77947 6381 11 PLAIN far far\n",
762 | "4412 348 7 PLAIN in in\n",
763 | "42046 3427 7 PLAIN Takayama Takayama"
764 | ]
765 | },
766 | "execution_count": 13,
767 | "metadata": {},
768 | "output_type": "execute_result"
769 | }
770 | ],
771 | "source": [
772 | "data[self_mask].sample(5)"
773 | ]
774 | },
775 | {
776 | "cell_type": "markdown",
777 | "metadata": {},
778 | "source": [
779 | "## Exporting Data"
780 | ]
781 | },
782 | {
783 | "cell_type": "code",
784 | "execution_count": 14,
785 | "metadata": {},
786 | "outputs": [
787 | {
788 | "data": {
789 | "text/html": [
790 | "\n",
791 | "\n",
804 | "
\n",
805 | " \n",
806 | " \n",
807 | " | \n",
808 | " sentence_id | \n",
809 | " token_id | \n",
810 | " semiotic | \n",
811 | " before | \n",
812 | " after | \n",
813 | "
\n",
814 | " \n",
815 | " \n",
816 | " \n",
817 | " 30 | \n",
818 | " 2 | \n",
819 | " 0 | \n",
820 | " PLAIN | \n",
821 | " She | \n",
822 | " She | \n",
823 | "
\n",
824 | " \n",
825 | " 31 | \n",
826 | " 2 | \n",
827 | " 1 | \n",
828 | " PLAIN | \n",
829 | " then | \n",
830 | " then | \n",
831 | "
\n",
832 | " \n",
833 | " 32 | \n",
834 | " 2 | \n",
835 | " 2 | \n",
836 | " PLAIN | \n",
837 | " compelled | \n",
838 | " compelled | \n",
839 | "
\n",
840 | " \n",
841 | " 33 | \n",
842 | " 2 | \n",
843 | " 3 | \n",
844 | " PLAIN | \n",
845 | " her | \n",
846 | " her | \n",
847 | "
\n",
848 | " \n",
849 | " 34 | \n",
850 | " 2 | \n",
851 | " 4 | \n",
852 | " PLAIN | \n",
853 | " tenants | \n",
854 | " tenants | \n",
855 | "
\n",
856 | " \n",
857 | " 35 | \n",
858 | " 2 | \n",
859 | " 5 | \n",
860 | " PLAIN | \n",
861 | " to | \n",
862 | " to | \n",
863 | "
\n",
864 | " \n",
865 | " 36 | \n",
866 | " 2 | \n",
867 | " 6 | \n",
868 | " PLAIN | \n",
869 | " level | \n",
870 | " level | \n",
871 | "
\n",
872 | " \n",
873 | " 37 | \n",
874 | " 2 | \n",
875 | " 7 | \n",
876 | " PLAIN | \n",
877 | " the | \n",
878 | " the | \n",
879 | "
\n",
880 | " \n",
881 | " 38 | \n",
882 | " 2 | \n",
883 | " 8 | \n",
884 | " PLAIN | \n",
885 | " Royalist | \n",
886 | " Royalist | \n",
887 | "
\n",
888 | " \n",
889 | " 39 | \n",
890 | " 2 | \n",
891 | " 9 | \n",
892 | " PLAIN | \n",
893 | " siege | \n",
894 | " siege | \n",
895 | "
\n",
896 | " \n",
897 | "
\n",
898 | "
"
899 | ],
900 | "text/plain": [
901 | " sentence_id token_id semiotic before after\n",
902 | "30 2 0 PLAIN She She\n",
903 | "31 2 1 PLAIN then then\n",
904 | "32 2 2 PLAIN compelled compelled\n",
905 | "33 2 3 PLAIN her her\n",
906 | "34 2 4 PLAIN tenants tenants\n",
907 | "35 2 5 PLAIN to to\n",
908 | "36 2 6 PLAIN level level\n",
909 | "37 2 7 PLAIN the the\n",
910 | "38 2 8 PLAIN Royalist Royalist\n",
911 | "39 2 9 PLAIN siege siege"
912 | ]
913 | },
914 | "execution_count": 14,
915 | "metadata": {},
916 | "output_type": "execute_result"
917 | }
918 | ],
919 | "source": [
920 | "data[30:40]"
921 | ]
922 | },
923 | {
924 | "cell_type": "code",
925 | "execution_count": 15,
926 | "metadata": {
927 | "collapsed": true
928 | },
929 | "outputs": [],
930 | "source": [
931 | "data.to_csv(data_directory+out, index=False)"
932 | ]
933 | },
934 | {
935 | "cell_type": "markdown",
936 | "metadata": {},
937 | "source": [
938 | "___"
939 | ]
940 | }
941 | ],
942 | "metadata": {
943 | "kernelspec": {
944 | "display_name": "Python 3",
945 | "language": "python",
946 | "name": "python3"
947 | },
948 | "language_info": {
949 | "codemirror_mode": {
950 | "name": "ipython",
951 | "version": 3
952 | },
953 | "file_extension": ".py",
954 | "mimetype": "text/x-python",
955 | "name": "python",
956 | "nbconvert_exporter": "python",
957 | "pygments_lexer": "ipython3",
958 | "version": "3.6.5"
959 | }
960 | },
961 | "nbformat": 4,
962 | "nbformat_minor": 2
963 | }
964 |
--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/.gitkeep
--------------------------------------------------------------------------------
/results/base-paper_classwise_accuracy.csv:
--------------------------------------------------------------------------------
1 | Semiotic Class,En Count,En Accuracy,Ru Count,Ru Accuracy
2 | ALL,92416,0.997,93184,0.993
3 | PLAIN,68029,0.998,60747,0.999
4 | PUNCT,17726,1.000,20263,1.000
5 | DATE,2808,0.999,1495,0.976
6 | TRANS,nan,nan,4103,0.921
7 | LETTERS,1404,0.971,1839,0.991
8 | CARDINAL,1067,0.989,2387,0.940
9 | VERBATIM,894,0.980,1298,1.000
10 | MEASURE,142,0.986,409,0.883
11 | ORDINAL,103,0.971,427,0.956
12 | DECIMAL,89,1.000,60,0.867
13 | ELECTRONIC,21,1.000,2,1.000
14 | DIGIT,37,0.865,16,1.000
15 | MONEY,36,0.972,19,0.842
16 | FRACTION,13,0.923,23,0.826
17 | TIME,8,0.750,8,0.750
18 |
--------------------------------------------------------------------------------
/results/english/Semiotic_Class-wise_Accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/english/Semiotic_Class-wise_Accuracy.png
--------------------------------------------------------------------------------
/results/english/classwise_accuracy.csv:
--------------------------------------------------------------------------------
1 | semiotic-class,accuracy,count,correct
2 | ALL,0.994267233453397,92451,91921
3 | ADDRESS,1.0,4,4
4 | CARDINAL,0.9942140790742526,1037,1031
5 | DATE,0.9971751412429378,2832,2824
6 | DECIMAL,0.9891304347826086,92,91
7 | DIGIT,0.7954545454545454,44,35
8 | ELECTRONIC,0.7346938775510204,49,36
9 | FRACTION,0.6875,16,11
10 | LETTERS,0.971611071682044,1409,1369
11 | MEASURE,0.971830985915493,142,138
12 | MONEY,0.972972972972973,37,36
13 | ORDINAL,0.9805825242718447,103,101
14 | PLAIN,0.9939611747724394,67894,67484
15 | PUNCT,0.9988729854615125,17746,17726
16 | TELEPHONE,0.918918918918919,37,34
17 | TIME,0.75,8,6
18 | VERBATIM,0.994005994005994,1001,995
19 |
--------------------------------------------------------------------------------
/results/english/mistakes.csv:
--------------------------------------------------------------------------------
1 | sentence_id,token_id,semiotic,before,class,after,truth
2 | 59,0,PLAIN,UPA,ToBeNormalized,u p a,UPA
3 | 66,6,PLAIN,SEO,ToBeNormalized,s e o,SEO
4 | 68,0,PLAIN,INSIGHT,ToBeNormalized,i n s i g h t,INSIGHT
5 | 80,2,PLAIN,WA,ToBeNormalized,w a,WA
6 | 80,4,PLAIN,SPIE,ToBeNormalized,s p i e,SPIE
7 | 90,0,PLAIN,LiPSA,ToBeNormalized,l i p s a,LiPSA
8 | 94,20,PLAIN,ROD,ToBeNormalized,r o d,ROD
9 | 100,2,PLAIN,LEGAL,ToBeNormalized,l e g a l,LEGAL
10 | 100,3,PLAIN,ASSISTANCE,ToBeNormalized,a s s i s t a n c e,ASSISTANCE
11 | 100,4,PLAIN,CENTRE,ToBeNormalized,c e n t r e,center
12 | 100,6,PLAIN,LAC,ToBeNormalized,l a c,LAC
13 | 156,3,MEASURE,0.001251 g/cm3,ToBeNormalized,zero point o o one two five one sil g per hour,zero point o o one two five one grams per c c
14 | 158,5,LETTERS,V,ToBeNormalized,five,V
15 | 184,24,PLAIN,doi,ToBeNormalized,d o i,doi
16 | 184,28,CARDINAL,14356007,ToBeNormalized,one million four hundred thirty five thousand six hundred seven,fourteen million three hundred fifty six thousand seven
17 | 191,3,PLAIN,RENAMO,ToBeNormalized,r e n a m o,RENAMO
18 | 205,12,CARDINAL,1572225424,ToBeNormalized,one billion five hundred seventy two million two hundred twenty two thousand four hundred twenty four,one billion five hundred seventy two million two hundred twenty five thousand four hundred twenty four
19 | 239,0,LETTERS,Acee,RemainSelf,Acee,a c e e
20 | 243,5,PLAIN,GUS,ToBeNormalized,g u s,GUS
21 | 253,4,PLAIN,DOS,ToBeNormalized,d o s,DOS
22 | 297,8,LETTERS,Vit,RemainSelf,Vit,v i t
23 | 303,15,PLAIN,authorisation,ToBeNormalized,t,authorization
24 | 309,0,FRACTION,1/0,ToBeNormalized,one meter,one zeroth
25 | 316,3,PLAIN,ski,ToBeNormalized,s k i,ski
26 | 349,8,PLAIN,Fei,ToBeNormalized,f e i,Fei
27 | 366,10,PLAIN,JA,ToBeNormalized,j a,JA
28 | 378,10,PLAIN,ser,ToBeNormalized,s e r,ser
29 | 455,3,FRACTION,"10/618,543",ToBeNormalized,ten sixteenth sixty one thousand five hundred forty three,ten six hundred eighteen thousand five hundred forty thirds
30 | 471,15,PLAIN,Up,ToBeNormalized,u p,Up
31 | 473,7,ELECTRONIC,#Selfie,ToBeNormalized,hash tag fourteen,hash tag selfie
32 | 490,7,VERBATIM,-,ToBeNormalized,to,-
33 | 507,0,PLAIN,BibliographyTranslations,ToBeNormalized,b i b l i o g r a p h y t r a n s,BibliographyTranslations
34 | 529,7,PLAIN,EE,ToBeNormalized,e e,EE
35 | 550,0,LETTERS,Ligi'ne,RemainSelf,Ligi'ne,l i g i n e
36 | 555,2,PLAIN,PROFILES,ToBeNormalized,p r o f i l e s,PROFILES
37 | 577,5,PLAIN,GUS,ToBeNormalized,g u s,GUS
38 | 593,15,PLAIN,ups,ToBeNormalized,u p s,ups
39 | 595,10,PLAIN,ANABlog,ToBeNormalized,a n a b l o g,ANABlog
40 | 626,7,PLAIN,ms,ToBeNormalized,m s,ms
41 | 627,8,PUNCT,-,ToBeNormalized,to,-
42 | 627,12,PUNCT,-,ToBeNormalized,to,-
43 | 633,13,PLAIN,ADUM,ToBeNormalized,a d u m,ADUM
44 | 636,15,PLAIN,pluralised,ToBeNormalized,popularized,pluralized
45 | 637,7,PLAIN,TRADOC,ToBeNormalized,t r a d o c,TRADOC
46 | 637,18,PLAIN,CARR,ToBeNormalized,c a r r,CARR
47 | 637,19,PLAIN,OPT,ToBeNormalized,o p t,OPT
48 | 659,0,PLAIN,MATCH,ToBeNormalized,m a t c h,MATCH
49 | 687,0,DIGIT,2007,ToBeNormalized,two thousand seven,two o o seven
50 | 694,11,PLAIN,MAR,ToBeNormalized,m a r,MAR
51 | 698,1,PLAIN,PIX,ToBeNormalized,p i x,PIX
52 | 701,1,PLAIN,WorldwideEastEndersExtrasGrownupsPlanet,ToBeNormalized,w_letter,WorldwideEastEndersExtrasGrownupsPlanet
53 | 704,0,PLAIN,Realising,RemainSelf,Realising,realizing
54 | 704,13,PLAIN,Crystalliser,RemainSelf,Crystalliser,crystallizer
55 | 711,0,PLAIN,LIT,ToBeNormalized,l i t,LIT
56 | 721,11,PLAIN,VI,ToBeNormalized,v i,VI
57 | 754,2,PLAIN,OCAMPO,ToBeNormalized,o c a m p o,OCAMPO
58 | 754,4,PLAIN,FROM,ToBeNormalized,f r o m,FROM
59 | 754,10,PLAIN,BUILDS,ToBeNormalized,b u i l d s,BUILDS
60 | 754,11,TELEPHONE,3-0 LEAD,ToBeNormalized,three sil o sil l e a d,three sil o sil lead
61 | 783,3,PLAIN,OF,ToBeNormalized,o f,OF
62 | 784,5,PLAIN,Ngoc,ToBeNormalized,n g o c,Ngoc
63 | 799,5,PLAIN,Maj,ToBeNormalized,m a j,Maj
64 | 802,10,PLAIN,RAF,ToBeNormalized,r a f,RAF
65 | 831,14,PLAIN,Sol,ToBeNormalized,s o l,Sol
66 | 846,6,PLAIN,CLIO,ToBeNormalized,c l i o,CLIO
67 | 864,1,PLAIN,BEEF,ToBeNormalized,b e e f,BEEF
68 | 918,7,PLAIN,ser,ToBeNormalized,s e r,ser
69 | 923,13,PLAIN,Lik,ToBeNormalized,l i k,Lik
70 | 936,5,PLAIN,CARD,ToBeNormalized,c a r d,CARD
71 | 942,1,PLAIN,Am's,ToBeNormalized,a m s's,Am's
72 | 942,4,DIGIT,314,ToBeNormalized,three hundred fourteen,three one four
73 | 964,18,PLAIN,mrs,ToBeNormalized,mister,mrs
74 | 998,6,PLAIN,Aud,ToBeNormalized,a u d,Aud
75 | 998,10,PLAIN,odd,ToBeNormalized,o d d,odd
76 | 1018,0,PLAIN,Ava,ToBeNormalized,a v a,Ava
77 | 1057,13,PLAIN,INRIA,ToBeNormalized,i n r i a,INRIA
78 | 1060,3,PLAIN,ASIA,ToBeNormalized,a s i a,ASIA
79 | 1071,15,PLAIN,Azo,ToBeNormalized,a z o,Azo
80 | 1079,21,PLAIN,BANYU,ToBeNormalized,b a n y u,BANYU
81 | 1091,4,LETTERS,Crkva,RemainSelf,Crkva,c r k v a
82 | 1092,5,PLAIN,HEW,ToBeNormalized,h e w,HEW
83 | 1101,0,PLAIN,CARE,ToBeNormalized,c a r e,CARE
84 | 1105,5,PLAIN,I,ToBeNormalized,one,I
85 | 1117,15,ELECTRONIC,//web.archive.org/web/20110105051516/http://www.fairfield.edu/x18852.html,ToBeNormalized,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter _letter f_letter i_letter v_letter e_letter _letter o_letter _letter f_letter i_letter v_letter e_letter _letter o_letter n_letter e_letter _letter f_letter i_letter v_letter e_letter _letter o_letter n_letter e_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot f_letter a_letter i_letter r_letter f_letter i_letter e_letter l_letter d_letter dot e_letter _letter d_letter _letter u_letter _letter s_letter l_letter a_letter s_letter h_letter _letter x_letter _letter o_letter n_letter e_letter _letter e_letter i_letter g_letter h_letter t_letter _letter e_letter i_letter g_letter h_letter t_letter _letter f_letter i_letter v_letter e_letter _letter t_letter w_letter o_letter dot h_letter _letter t_letter _letter m_letter _letter l_letter
86 | 1121,10,PLAIN,Abba,ToBeNormalized,a b b a,Abba
87 | 1124,7,PLAIN,Yaesu,ToBeNormalized,y a e s u,Yaesu
88 | 1165,7,PLAIN,ser,ToBeNormalized,s e r,ser
89 | 1189,7,PLAIN,Oh,ToBeNormalized,o h,Oh
90 | 1193,9,PLAIN,centro,ToBeNormalized,center,centro
91 | 1203,2,PLAIN,CHARLES,ToBeNormalized,c h a r l e s,CHARLES
92 | 1203,4,PLAIN,MOODY,ToBeNormalized,m o o d y,MOODY
93 | 1211,20,ELECTRONIC,nethttp://www.pamirian.ru/Wakhi_language_transition.pdfAli,ToBeNormalized,n_letter e_letter t_letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot p_letter a_letter m_letter i_letter r_letter i_letter a_letter n_letter dot r_letter _letter u_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter a_letter k_letter h_letter i_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter,n_letter _letter e_letter _letter t_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot p_letter a_letter m_letter i_letter r_letter i_letter a_letter n_letter dot r_letter _letter u_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter a_letter k_letter h_letter i_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter l_letter a_letter n_letter g_letter u_letter a_letter g_letter e_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter t_letter r_letter a_letter n_letter s_letter i_letter t_letter i_letter o_letter n_letter dot p_letter d_letter f_letter a_letter l_letter i_letter
94 | 1243,13,PLAIN,IZMIR,ToBeNormalized,i z m i r,IZMIR
95 | 1252,6,PUNCT,-,ToBeNormalized,to,-
96 | 1289,2,ELECTRONIC,http://www.knchr.org/dmdocuments/KNCHR%20doc.pdfFollowing,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter _letter m_letter _letter d_letter o_letter c_letter u_letter m_letter e_letter n_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter _letter s_letter i_letter x_letter d_letter o_letter c_letter dot p_letter _letter d_letter _letter f_letter _letter f_letter o_letter l_letter l_letter o_letter n_letter i_letter n_letter g_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter m_letter d_letter o_letter c_letter u_letter m_letter e_letter n_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter _letter p_letter e_letter r_letter c_letter e_letter n_letter t_letter _letter t_letter w_letter e_letter n_letter t_letter y_letter _letter d_letter o_letter c_letter dot p_letter d_letter f_letter f_letter o_letter l_letter l_letter o_letter w_letter i_letter n_letter g_letter
97 | 1302,8,PLAIN,Abor,ToBeNormalized,a b o r,Abor
98 | 1343,6,PLAIN,NeXTSTEP,ToBeNormalized,n e x t s t e p,NeXTSTEP
99 | 1357,10,PLAIN,CARICOM,ToBeNormalized,c a r i c o m,CARICOM
100 | 1369,0,PLAIN,CY,ToBeNormalized,c y,CY
101 | 1385,4,PLAIN,Bodour,ToBeNormalized,b,Bodour
102 | 1388,11,PLAIN,KHAD,ToBeNormalized,k h a d,KHAD
103 | 1410,9,PLAIN,DEFRA,ToBeNormalized,d e f r a,DEFRA
104 | 1411,15,LETTERS,subg,RemainSelf,subg,s u b g
105 | 1414,2,DIGIT,450,ToBeNormalized,four hundred fifty,four five o
106 | 1427,6,PLAIN,SM,ToBeNormalized,s m,SM
107 | 1439,11,PLAIN,odor,ToBeNormalized,o d o r,odor
108 | 1456,18,PLAIN,est,ToBeNormalized,e s t,est
109 | 1475,0,PLAIN,Ava,ToBeNormalized,a v a,Ava
110 | 1520,6,DATE,2017/,ToBeNormalized,two thousand seventeen,twenty seventeen
111 | 1520,7,DATE,2016,ToBeNormalized,two thousand sixteen,twenty sixteen
112 | 1584,3,PLAIN,SPOILERS,ToBeNormalized,s p o i l e r s,SPOILERS
113 | 1612,16,PLAIN,ski,ToBeNormalized,s k i,ski
114 | 1626,1,PLAIN,mrs,ToBeNormalized,mister,mrs
115 | 1631,6,PLAIN,vols,ToBeNormalized,v o l s,vols
116 | 1681,4,PLAIN,Chu,ToBeNormalized,c h u,Chu
117 | 1699,7,ELECTRONIC,http://www.yafc-ftp.com/The,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot y_letter a_letter f_letter c_letter _letter d_letter a_letter s_letter h_letter _letter f_letter t_letter p_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter e_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot y_letter _letter a_letter _letter f_letter _letter c_letter _letter d_letter a_letter s_letter h_letter _letter f_letter _letter t_letter _letter p_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter e_letter
118 | 1707,0,PLAIN,FAO,ToBeNormalized,f a o,FAO
119 | 1726,11,MONEY,88.5 million HRK,ToBeNormalized,eighty eight point five million yen,eighty eight point five million croatian kunas
120 | 1785,1,LETTERS,Oolaa,RemainSelf,Oolaa,o o l a a
121 | 1813,9,PLAIN,Koi,ToBeNormalized,k o i,Koi
122 | 1831,5,LETTERS,xSnxTe,RemainSelf,xSnxTe,x s n x t e
123 | 1892,5,PLAIN,CRI,ToBeNormalized,c r i,CRI
124 | 1900,0,PLAIN,CRI,ToBeNormalized,c r i,CRI
125 | 1901,0,PLAIN,APRA,ToBeNormalized,a p r a,APRA
126 | 1903,6,PLAIN,SnO2,ToBeNormalized,s n o two,tin four oxide
127 | 1925,0,PLAIN,CART,ToBeNormalized,c a r t,CART
128 | 1925,12,PLAIN,CART,ToBeNormalized,c a r t,CART
129 | 1940,8,PLAIN,CRIs,ToBeNormalized,c r i's,CRIs
130 | 1967,0,LETTERS,Ilir,RemainSelf,Ilir,i l i r
131 | 1983,17,LETTERS,anth,RemainSelf,anth,a n t h
132 | 2027,2,PLAIN,obra,ToBeNormalized,o b r a,obra
133 | 2033,2,PLAIN,Mac's,ToBeNormalized,m a c's,Mac's
134 | 2109,3,PLAIN,millimetre,RemainSelf,millimetre,millimeter
135 | 2127,8,PLAIN,Ava,ToBeNormalized,a v a,Ava
136 | 2132,5,PLAIN,mrs,ToBeNormalized,m r s,mrs
137 | 2153,17,PLAIN,Stem,ToBeNormalized,s t e m,Stem
138 | 2153,19,PLAIN,ells,ToBeNormalized,e l l s,ells
139 | 2154,10,PLAIN,SAT,ToBeNormalized,s a t,SAT
140 | 2155,5,PLAIN,RY,ToBeNormalized,r y,RY
141 | 2163,12,PLAIN,Sept,ToBeNormalized,s e p t,Sept
142 | 2186,16,PLAIN,I'd,ToBeNormalized,i d,I'd
143 | 2191,2,PLAIN,ABS,ToBeNormalized,a b s,ABS
144 | 2203,1,PLAIN,THE,ToBeNormalized,t h e,THE
145 | 2203,3,PLAIN,BEHIND,ToBeNormalized,b e h i n d,BEHIND
146 | 2203,4,PLAIN,AVAAZ,ToBeNormalized,a v a a z,AVAAZ
147 | 2270,7,DIGIT,21770,ToBeNormalized,twenty one thousand seven hundred seventy,two one seven seven o
148 | 2274,1,PLAIN,WO,ToBeNormalized,w o,WO
149 | 2274,3,FRACTION,2006/118205,ToBeNormalized,two thousand six one thousand eight hundred fifths,two thousand six one hundred eighteen thousand two hundred fifths
150 | 2275,1,PUNCT,-,ToBeNormalized,to,-
151 | 2298,22,PLAIN,Oh,ToBeNormalized,o h,Oh
152 | 2320,12,PLAIN,ms,ToBeNormalized,m s,ms
153 | 2325,5,PLAIN,GLUT,ToBeNormalized,g l u t,GLUT
154 | 2344,5,PLAIN,APRA,ToBeNormalized,a p r a,APRA
155 | 2373,12,PLAIN,Bok,ToBeNormalized,b o k,Bok
156 | 2398,4,PUNCT,:,ToBeNormalized,to,:
157 | 2402,10,PLAIN,POW,ToBeNormalized,p o w,POW
158 | 2402,12,PLAIN,MIA,ToBeNormalized,m i a,MIA
159 | 2442,17,PLAIN,NOW,ToBeNormalized,n o w,NOW
160 | 2454,4,VERBATIM,-,ToBeNormalized,to,-
161 | 2466,1,PLAIN,YOU,ToBeNormalized,y o u,YOU
162 | 2505,1,PUNCT,-,ToBeNormalized,to,-
163 | 2505,3,VERBATIM,-,ToBeNormalized,to,-
164 | 2523,5,PLAIN,Of,ToBeNormalized,o f,Of
165 | 2528,3,PLAIN,RAFT,ToBeNormalized,r a f t,RAFT
166 | 2536,9,PLAIN,polarisation,ToBeNormalized,globalization,polarization
167 | 2542,1,PLAIN,ARTHUR,ToBeNormalized,a r t h u r,ARTHUR
168 | 2551,3,PLAIN,ES,ToBeNormalized,e s,ES
169 | 2566,9,PLAIN,catalysed,ToBeNormalized,catalogs,catalysed
170 | 2582,8,PLAIN,EVA,ToBeNormalized,e v a,EVA
171 | 2593,0,LETTERS,Akl,RemainSelf,Akl,a k l
172 | 2631,4,PLAIN,Spa,ToBeNormalized,s p a,Spa
173 | 2635,5,LETTERS,I,ToBeNormalized,one,I
174 | 2649,3,PLAIN,Ozat,ToBeNormalized,o z a t,Ozat
175 | 2650,15,PLAIN,AMS,ToBeNormalized,a m s,AMS
176 | 2697,5,PLAIN,AND,ToBeNormalized,a n d,AND
177 | 2701,1,ELECTRONIC,http://www.business-humanrights.org/Links/Repository/308254/,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot b_letter u_letter s_letter i_letter n_letter e_letter s_letter s_letter _letter d_letter a_letter s_letter h_letter _letter h_letter u_letter m_letter a_letter n_letter r_letter i_letter g_letter h_letter t_letter s_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter l_letter i_letter n_letter k_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter r_letter e_letter p_letter o_letter s_letter i_letter t_letter o_letter r_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter r_letter e_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot b_letter u_letter s_letter i_letter n_letter e_letter s_letter s_letter _letter d_letter a_letter s_letter h_letter _letter h_letter u_letter m_letter a_letter n_letter r_letter i_letter g_letter h_letter t_letter s_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter l_letter i_letter n_letter k_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter r_letter e_letter p_letter o_letter s_letter i_letter t_letter o_letter r_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter r_letter e_letter e_letter _letter o_letter _letter e_letter i_letter g_letter h_letter t_letter _letter t_letter w_letter o_letter _letter f_letter i_letter v_letter e_letter _letter f_letter o_letter u_letter r_letter _letter s_letter l_letter a_letter s_letter h_letter
178 | 2747,0,PLAIN,Izadi,ToBeNormalized,i z a d i,Izadi
179 | 2764,1,PLAIN,aBontodBuenasuerteIntusanJose,ToBeNormalized,a_letter b o n t o d b u e n a s u e r t e i n t u s a n j o s e,aBontodBuenasuerteIntusanJose
180 | 2781,9,PUNCT,-,ToBeNormalized,to,-
181 | 2784,3,PLAIN,Luh,ToBeNormalized,l u h,Luh
182 | 2816,6,PLAIN,von,ToBeNormalized,v o n,von
183 | 2816,16,TELEPHONE,40 (1969) 111-124,ToBeNormalized,four o sil one nine six nine sil one four sil one two sil one four,four o sil one nine six nine sil one one one sil one two four
184 | 2880,2,PLAIN,Suzi,ToBeNormalized,s u z i,Suzi
185 | 2891,9,PLAIN,I,ToBeNormalized,the first,I
186 | 2906,11,PLAIN,ACS,ToBeNormalized,a c s,ACS
187 | 2949,6,PLAIN,synagogues,ToBeNormalized,synagog,synagogues
188 | 2979,7,PLAIN,SA,ToBeNormalized,s a,SA
189 | 2979,16,PLAIN,SA,ToBeNormalized,s a,SA
190 | 2981,9,PLAIN,NURBS,ToBeNormalized,n u r b s,NURBS
191 | 3024,13,DIGIT,11,ToBeNormalized,eleven,one one
192 | 3026,11,PLAIN,FAN,ToBeNormalized,f a n,FAN
193 | 3043,15,PLAIN,civilise,RemainSelf,civilise,civilize
194 | 3055,8,PLAIN,oak,ToBeNormalized,o a k,oak
195 | 3060,5,PLAIN,ASTRO,ToBeNormalized,a s t r o,ASTRO
196 | 3085,3,PLAIN,Pi,ToBeNormalized,p i,Pi
197 | 3120,7,PLAIN,LEED,ToBeNormalized,l e e d,LEED
198 | 3125,9,PLAIN,Eslov,ToBeNormalized,e s l o v,Eslov
199 | 3132,10,PUNCT,-,ToBeNormalized,to,-
200 | 3157,24,PLAIN,Sept,ToBeNormalized,s e p t,Sept
201 | 3162,1,ELECTRONIC,informationhttp://dynamic.stlouis-mo.gov/census/neighborhood.cfmhttp://dynamic.stlouis-mo.gov/census/neigh_comp.cfm,ToBeNormalized,i_letter n_letter f_letter o_letter r_letter m_letter a_letter t_letter i_letter o_letter n_letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter n_letter g_letter h_letter i_letter c_letter r_letter h_letter o_letter o_letter d_letter dot c_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter i_letter g_letter h_letter i_letter c_letter r_letter h_letter o_letter o_letter d_letter dot c_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter i_letter g_letter h_letter i_letter c_letter o_letter m_letter p_letter dot c_letter _letter f_letter _letter m_letter,i_letter _letter n_letter _letter f_letter _letter o_letter _letter r_letter _letter m_letter _letter a_letter _letter t_letter _letter i_letter _letter o_letter _letter n_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter y_letter n_letter a_letter m_letter i_letter c_letter dot s_letter t_letter l_letter o_letter u_letter i_letter s_letter _letter d_letter a_letter s_letter h_letter _letter m_letter o_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter i_letter g_letter h_letter b_letter o_letter r_letter h_letter o_letter o_letter d_letter dot c_letter _letter f_letter _letter m_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter y_letter n_letter a_letter m_letter i_letter c_letter dot s_letter t_letter l_letter o_letter u_letter i_letter s_letter _letter d_letter a_letter s_letter h_letter _letter m_letter o_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter i_letter g_letter h_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter c_letter o_letter m_letter p_letter dot c_letter _letter f_letter _letter m_letter
202 | 3166,3,PLAIN,Eva,ToBeNormalized,e v a,Eva
203 | 3167,6,PLAIN,evil,ToBeNormalized,e v i l,evil
204 | 3246,4,PLAIN,valour,ToBeNormalized,vigor,valour
205 | 3268,5,PLAIN,riA,ToBeNormalized,r i a,riA
206 | 3268,8,PLAIN,Cha,ToBeNormalized,c h a,Cha
207 | 3276,1,PLAIN,FUKUHARA,ToBeNormalized,f u k u h a r a,FUKUHARA
208 | 3284,1,LETTERS,Var,RemainSelf,Var,v a r
209 | 3285,0,PLAIN,FOCUS,ToBeNormalized,f o c u s,FOCUS
210 | 3291,15,PLAIN,ESS,ToBeNormalized,e s s,ESS
211 | 3302,1,PLAIN,Ur,ToBeNormalized,u r,Ur
212 | 3311,8,CARDINAL,X,RemainSelf,X,ten
213 | 3337,8,PLAIN,OUTSTANDING,ToBeNormalized,o u t s t a n d i n g,OUTSTANDING
214 | 3337,10,PLAIN,IN,ToBeNormalized,i n,IN
215 | 3337,13,PLAIN,VARIETY,ToBeNormalized,v a r i e t y,VARIETY
216 | 3337,15,PLAIN,MUSIC,ToBeNormalized,m u s i c,MUSIC
217 | 3340,9,PLAIN,AN,ToBeNormalized,a n,AN
218 | 3340,18,PLAIN,RA,ToBeNormalized,r a,RA
219 | 3343,5,PLAIN,MAR,ToBeNormalized,m a r,MAR
220 | 3347,5,MEASURE,1/2 cc,ToBeNormalized,one _letter d_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter,half a c c
221 | 3347,8,PLAIN,ULTRAFINE,ToBeNormalized,u l t r a f i n e,ULTRAFINE
222 | 3347,14,PLAIN,UltraFine,ToBeNormalized,favorite,UltraFine
223 | 3347,16,PLAIN,SHORT,ToBeNormalized,s h o r t,SHORT
224 | 3347,22,PLAIN,MICROFINE,ToBeNormalized,m i c r o f i n e,MICROFINE
225 | 3350,4,PLAIN,FIN,ToBeNormalized,f i n,FIN
226 | 3352,2,PLAIN,Aja,ToBeNormalized,a j a,Aja
227 | 3380,15,PLAIN,SEZ,ToBeNormalized,s e z,SEZ
228 | 3407,7,PLAIN,so,ToBeNormalized,s o,so
229 | 3415,2,PLAIN,No,ToBeNormalized,number,No
230 | 3415,5,PLAIN,Ads,ToBeNormalized,a d s,Ads
231 | 3505,1,PLAIN,TeX,ToBeNormalized,t e x,TeX
232 | 3519,21,PLAIN,Am,ToBeNormalized,a m,Am
233 | 3549,6,PLAIN,UEFA,ToBeNormalized,u e f a,UEFA
234 | 3555,9,VERBATIM,-,ToBeNormalized,to,-
235 | 3562,5,ELECTRONIC,//www.mediacorp.sg/corporate-en/corporatehttp://www.ofcom.org.uk/static/archive/itc/itc_publications/codes_guidance/programme_code/section_4.asp.html,ToBeNormalized,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter t_letter o_letter r_letter a_letter _letter d_letter a_letter s_letter h_letter _letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter p_letter o_letter r_letter a_letter t_letter e_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter a_letter r_letter c_letter h_letter i_letter v_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter c_letter h_letter i_letter t_letter c_letter u_letter p_letter u_letter r_letter t_letter i_letter c_letter a_letter t_letter i_letter o_letter n_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot m_letter e_letter d_letter i_letter a_letter c_letter o_letter r_letter p_letter dot s_letter _letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter p_letter o_letter r_letter a_letter t_letter e_letter _letter d_letter a_letter s_letter h_letter _letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter p_letter o_letter r_letter a_letter t_letter e_letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot o_letter f_letter c_letter o_letter m_letter dot o_letter r_letter g_letter dot u_letter k_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter t_letter a_letter t_letter i_letter c_letter _letter s_letter l_letter a_letter s_letter h_letter _letter a_letter r_letter c_letter h_letter i_letter v_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter c_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter c_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter u_letter b_letter l_letter i_letter c_letter a_letter t_letter i_letter o_letter n_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter d_letter e_letter s_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter g_letter u_letter i_letter d_letter a_letter n_letter c_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter g_letter r_letter a_letter m_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter c_letter o_letter d_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter e_letter c_letter t_letter i_letter o_letter n_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter f_letter o_letter u_letter r_letter dot a_letter _letter s_letter _letter p_letter dot h_letter _letter t_letter _letter m_letter _letter l_letter
236 | 3575,20,PLAIN,AIDS,ToBeNormalized,a i d s,AIDS
237 | 3601,2,PLAIN,DARPA,ToBeNormalized,d a r p a,DARPA
238 | 3601,10,PLAIN,programmed,ToBeNormalized,program,programmed
239 | 3603,1,PLAIN,CAST,ToBeNormalized,c a s t,CAST
240 | 3659,10,PLAIN,Suwo,ToBeNormalized,s u w o,Suwo
241 | 3661,6,PLAIN,EPs,ToBeNormalized,e p's,EPs
242 | 3669,8,LETTERS,Uy,RemainSelf,Uy,u y
243 | 3676,6,PUNCT,-,ToBeNormalized,to,-
244 | 3678,6,PLAIN,Idol,ToBeNormalized,i d o l,Idol
245 | 3736,2,PLAIN,MGR,ToBeNormalized,m g r,MGR
246 | 3738,5,PLAIN,Chi,ToBeNormalized,c h i,Chi
247 | 3749,13,PLAIN,VI,ToBeNormalized,the sixth,VI
248 | 3789,8,PUNCT,-,ToBeNormalized,to,-
249 | 3790,0,PLAIN,THE,ToBeNormalized,t h e,THE
250 | 3808,0,PLAIN,PLoS,ToBeNormalized,p l o s,PLoS
251 | 3907,0,PLAIN,Programmed,ToBeNormalized,program,Programmed
252 | 3920,7,PLAIN,BOO,ToBeNormalized,b o o,BOO
253 | 3951,2,DIGIT,126,ToBeNormalized,one hundred twenty six,one two six
254 | 3952,6,PLAIN,Xtra,ToBeNormalized,x t r a,Xtra
255 | 3961,8,PLAIN,rev,ToBeNormalized,r e v,rev
256 | 3965,3,PLAIN,MARATHON,ToBeNormalized,m a r a t h o n,MARATHON
257 | 3965,4,PLAIN,CUP,ToBeNormalized,c u p,CUP
258 | 3976,8,PLAIN,Sept,ToBeNormalized,s e p t,Sept
259 | 3984,21,PLAIN,stop,ToBeNormalized,s t o p,stop
260 | 3992,9,LETTERS,Aam,RemainSelf,Aam,a a m
261 | 4027,20,PUNCT,-,ToBeNormalized,to,-
262 | 4034,1,PLAIN,COMPANY,ToBeNormalized,c o m p a n y,COMPANY
263 | 4039,18,MEASURE,295 ch,ToBeNormalized,two hundred ninety five hours,two hundred ninety five chains
264 | 4048,0,PLAIN,CAT,ToBeNormalized,c a t,CAT
265 | 4079,8,PLAIN,XI,ToBeNormalized,eleven,XI
266 | 4085,8,VERBATIM,Θ,ToBeNormalized,eta,theta
267 | 4102,1,PLAIN,SM,ToBeNormalized,s m,SM
268 | 4111,5,PLAIN,tri,ToBeNormalized,t r i,tri
269 | 4126,5,PLAIN,SEC,ToBeNormalized,s e c,SEC
270 | 4136,3,ELECTRONIC,http://www.jstor.org/stable/2799027;,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot j_letter _letter s_letter t_letter o_letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter t_letter a_letter b_letter l_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter ,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot j_letter _letter s_letter t_letter o_letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter t_letter a_letter b_letter l_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter n_letter i_letter n_letter e_letter _letter n_letter i_letter n_letter e_letter _letter o_letter _letter t_letter w_letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter m_letter i_letter c_letter o_letter l_letter o_letter n_letter
271 | 4139,21,PLAIN,INTERVIEW,ToBeNormalized,i n t e r v i e w,INTERVIEW
272 | 4162,5,PLAIN,GUS,ToBeNormalized,g u s,GUS
273 | 4171,18,PUNCT,-,ToBeNormalized,to,-
274 | 4172,2,PUNCT,-,ToBeNormalized,to,-
275 | 4178,5,PLAIN,VEVO,ToBeNormalized,v e v o,VEVO
276 | 4189,6,TELEPHONE,1999-2000 BCA,ToBeNormalized,one nine nine nine sil two hundred sil b c a,one nine nine nine sil two thousand sil b c a
277 | 4195,1,PLAIN,Dilweg,RemainSelf,Dilweg,dil weg
278 | 4215,8,PLAIN,Li,ToBeNormalized,l i,Li
279 | 4235,1,PLAIN,Huba,ToBeNormalized,h u b a,Huba
280 | 4244,2,PLAIN,DIN,ToBeNormalized,d i n,DIN
281 | 4244,12,MEASURE,2 mA,ToBeNormalized,two a m,two milli amperes
282 | 4252,2,PLAIN,IRA,ToBeNormalized,i r a,IRA
283 | 4272,0,PLAIN,DIC,ToBeNormalized,d i c,DIC
284 | 4273,12,PLAIN,VIDEO,ToBeNormalized,v i d o i,VIDEO
285 | 4276,11,PLAIN,Ajaji,ToBeNormalized,a j a j i,Ajaji
286 | 4279,5,PLAIN,Rep,ToBeNormalized,r e p,Rep
287 | 4307,2,PLAIN,HOWZE,ToBeNormalized,h o w z e,HOWZE
288 | 4343,7,PLAIN,CAZy,ToBeNormalized,c a z y,CAZy
289 | 4365,7,PLAIN,Esma,ToBeNormalized,e s m a,Esma
290 | 4400,23,PLAIN,CEA,ToBeNormalized,c e a,CEA
291 | 4457,3,PLAIN,Isla,ToBeNormalized,i s l a,Isla
292 | 4458,3,PLAIN,SAT,ToBeNormalized,s a t,SAT
293 | 4480,9,PLAIN,pro,ToBeNormalized,p r o,pro
294 | 4480,14,LETTERS,Nea,RemainSelf,Nea,n e a
295 | 4500,5,PLAIN,BY,ToBeNormalized,b y,BY
296 | 4500,11,PLAIN,I,ToBeNormalized,the first,I
297 | 4520,4,PLAIN,Esma,ToBeNormalized,e s m a,Esma
298 | 4521,0,PLAIN,CURRICULUM,ToBeNormalized,c u r r i c u l u m,CURRICULUM
299 | 4536,3,PLAIN,Esma,ToBeNormalized,e s m a,Esma
300 | 4550,5,PLAIN,SA,ToBeNormalized,s a,SA
301 | 4565,5,PLAIN,UNESCO,ToBeNormalized,u n e s c o,UNESCO
302 | 4572,1,PUNCT,-,ToBeNormalized,to,-
303 | 4600,1,PLAIN,Viz,ToBeNormalized,v i z,Viz
304 | 4601,2,ELECTRONIC,www.cdc.gov/HealthyYouth/shpps/2006/factsheets/pdf/FS_Overview_SHPPS2006.pdf,ToBeNormalized,w_letter _letter w_letter _letter w_letter dot c_letter d_letter c_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter e_letter a_letter l_letter t_letter h_letter y_letter y_letter o_letter u_letter t_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter o_letter _letter o_letter _letter p_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter a_letter c_letter t_letter s_letter h_letter e_letter e_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter _letter d_letter _letter f_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter _letter s_letter _letter t_letter h_letter r_letter e_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter u_letter r_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter r_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter,w_letter _letter w_letter _letter w_letter dot c_letter _letter d_letter _letter c_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter e_letter a_letter l_letter t_letter h_letter y_letter y_letter o_letter u_letter t_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter _letter h_letter _letter p_letter _letter p_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter a_letter c_letter t_letter s_letter h_letter e_letter e_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter _letter d_letter _letter f_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter _letter s_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter o_letter v_letter e_letter r_letter v_letter i_letter e_letter w_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter s_letter _letter h_letter _letter p_letter _letter p_letter _letter s_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter _letter s_letter i_letter x_letter dot p_letter _letter d_letter _letter f_letter
305 | 4614,9,PLAIN,FEBRUARY,ToBeNormalized,f e b r u a r y,FEBRUARY
306 | 4618,2,PLAIN,stylised,RemainSelf,stylised,stylized
307 | 4643,16,PLAIN,V,ToBeNormalized,the fifth,V
308 | 4651,16,PLAIN,IA,ToBeNormalized,i a,IA
309 | 4661,8,PLAIN,RITA,ToBeNormalized,r i t a,RITA
310 | 4677,18,LETTERS,Lun,RemainSelf,Lun,l u n
311 | 4680,7,PLAIN,V,ToBeNormalized,the fifth,V
312 | 4697,3,PLAIN,SEED,ToBeNormalized,s e e d,SEED
313 | 4704,1,PLAIN,NA's,ToBeNormalized,n a's,NA's
314 | 4733,3,PLAIN,Us,ToBeNormalized,u s,Us
315 | 4736,18,PLAIN,programmed,ToBeNormalized,program,programmed
316 | 4748,15,PLAIN,PCs,ToBeNormalized,p c's,PCs
317 | 4766,2,PLAIN,GUJARAT,ToBeNormalized,g u j a r a t,GUJARAT
318 | 4766,3,PLAIN,LEGISLATIVE,ToBeNormalized,l e g i s l a t i v e,LEGISLATIVE
319 | 4784,3,PLAIN,col,ToBeNormalized,colonel,col
320 | 4816,4,PLAIN,KI,ToBeNormalized,k i,KI
321 | 4834,9,PLAIN,LIX,ToBeNormalized,l i x,LIX
322 | 4835,11,PLAIN,ICA,ToBeNormalized,i c a,ICA
323 | 4850,10,PLAIN,ArmandoFabiolaJackieAlexAfter,ToBeNormalized,a r m a n d o f a b i o l a j a c k i e a l e x a x t e r,ArmandoFabiolaJackieAlexAfter
324 | 4859,0,PLAIN,PIN,ToBeNormalized,p i n,PIN
325 | 4885,5,PUNCT,-,ToBeNormalized,to,-
326 | 4895,12,DIGIT,1271,ToBeNormalized,twelve seventy one,one two seven one
327 | 4896,1,PLAIN,EL,ToBeNormalized,e l,EL
328 | 4907,4,PLAIN,Ass,ToBeNormalized,a s s,Ass
329 | 4911,14,PLAIN,WHOI,ToBeNormalized,w h o i,WHOI
330 | 4918,9,PLAIN,idolised,RemainSelf,idolised,idolized
331 | 4929,1,PLAIN,V,ToBeNormalized,five,V
332 | 4931,17,PLAIN,nuoc,ToBeNormalized,n u o c,nuoc
333 | 4931,18,PLAIN,ngoai,ToBeNormalized,n g o a i,ngoai
334 | 4932,19,PLAIN,Etna,ToBeNormalized,e t n a,Etna
335 | 4958,4,PLAIN,MIR,ToBeNormalized,m i r,MIR
336 | 4958,15,PLAIN,miRNA,ToBeNormalized,m i r n a,miRNA
337 | 4960,1,PLAIN,secularisation,ToBeNormalized,saint,secularization
338 | 4997,0,PLAIN,Qui,ToBeNormalized,q u i,Qui
339 | 4997,1,PLAIN,est,ToBeNormalized,e s t,est
340 | 4997,2,LETTERS,Abdu'l,RemainSelf,Abdu'l,a b d u l
341 | 5014,2,PLAIN,IRA,ToBeNormalized,i r a,IRA
342 | 5025,6,PLAIN,Vaux,ToBeNormalized,v a u x,Vaux
343 | 5034,0,PLAIN,Neurocomputational,ToBeNormalized,behavioral,Neurocomputational
344 | 5039,7,PLAIN,resocialisation,ToBeNormalized,w,resocialization
345 | 5064,9,PLAIN,ix,ToBeNormalized,i x,ix
346 | 5071,3,PLAIN,Caichigue,ToBeNormalized,catalog,Caichigue
347 | 5115,12,PLAIN,MIT,ToBeNormalized,m i t,MIT
348 | 5158,4,PLAIN,BAS,ToBeNormalized,b a s,BAS
349 | 5160,2,PLAIN,ABS,ToBeNormalized,a b s,ABS
350 | 5166,5,PLAIN,GUS,ToBeNormalized,g u s,GUS
351 | 5195,5,PLAIN,INTRODUCTION,ToBeNormalized,i n t r o d u c t i o n,INTRODUCTION
352 | 5195,7,PLAIN,CADES,ToBeNormalized,c a d e s,CADES
353 | 5195,10,PLAIN,SERVICE,ToBeNormalized,s e r v i c e,SERVICE
354 | 5195,11,PLAIN,PROVIDERS,ToBeNormalized,p r o v i d e r s,PROVIDERS
355 | 5196,8,PLAIN,czar,ToBeNormalized,c z a r,czar
356 | 5236,12,PLAIN,A's,ToBeNormalized,a s's,A's
357 | 5238,2,PLAIN,mrs,ToBeNormalized,m r s,mrs
358 | 5253,6,PLAIN,Zuk,ToBeNormalized,z u k,Zuk
359 | 5257,4,PLAIN,ANHYDRASE,ToBeNormalized,a n h y d r a s e,ANHYDRASE
360 | 5277,10,PLAIN,miRNA,ToBeNormalized,m i r n a,miRNA
361 | 5290,10,PLAIN,Lug,ToBeNormalized,l u g,Lug
362 | 5301,0,LETTERS,Mpigi,RemainSelf,Mpigi,m p i g i
363 | 5301,10,LETTERS,Mpigi,RemainSelf,Mpigi,m p i g i
364 | 5315,0,PLAIN,Kroh,ToBeNormalized,k r o h,Kroh
365 | 5316,11,LETTERS,Mpigi,RemainSelf,Mpigi,m p i g i
366 | 5381,6,DATE,00s,ToBeNormalized,hundreds,o o
367 | 5383,13,PLAIN,vii,ToBeNormalized,v i i,vii
368 | 5419,0,PLAIN,Nieuw,ToBeNormalized,n i e u w,Nieuw
369 | 5422,5,PLAIN,PANDAS,ToBeNormalized,p a n d a s,PANDAS
370 | 5511,8,ELECTRONIC,//www.nytimes.com/2014/06/19/fashion/no-body-talk-summer-camps.html,ToBeNormalized,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot n_letter _letter y_letter _letter t_letter _letter i_letter _letter m_letter _letter e_letter _letter s_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter u_letter r_letter _letter o_letter n_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot n_letter y_letter t_letter i_letter m_letter e_letter s_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter e_letter n_letter t_letter y_letter _letter f_letter o_letter u_letter r_letter t_letter e_letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter i_letter n_letter e_letter t_letter e_letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter a_letter s_letter h_letter i_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter o_letter _letter d_letter a_letter s_letter h_letter _letter b_letter o_letter d_letter y_letter _letter d_letter a_letter s_letter h_letter _letter t_letter a_letter l_letter k_letter _letter d_letter a_letter s_letter h_letter _letter s_letter u_letter m_letter m_letter e_letter r_letter _letter d_letter a_letter s_letter h_letter _letter c_letter a_letter m_letter p_letter s_letter dot h_letter _letter t_letter _letter m_letter _letter l_letter
371 | 5553,13,PLAIN,SM,ToBeNormalized,s m,SM
372 | 5607,8,LETTERS,zvaz,RemainSelf,zvaz,z v a z
373 | 5633,4,PLAIN,PRU,ToBeNormalized,p r u,PRU
374 | 5633,5,PLAIN,CHA,ToBeNormalized,c h a,CHA
375 | 5634,11,PLAIN,XI,ToBeNormalized,eleven,XI
376 | 5634,15,PLAIN,WILD,ToBeNormalized,w i l d,WILD
377 | 5641,9,LETTERS,Arci,RemainSelf,Arci,a r c i
378 | 5683,3,CARDINAL,310780751,ToBeNormalized,thirty one million seventy eight thousand seventy five,three hundred ten million seven hundred eighty thousand seven hundred fifty one
379 | 5684,4,PLAIN,ZOLaw,ToBeNormalized,z o l a w,ZOLaw
380 | 5688,0,PLAIN,Issa,ToBeNormalized,i s s a,Issa
381 | 5705,14,PLAIN,Rep,ToBeNormalized,r e p,Rep
382 | 5711,7,PLAIN,mrs,ToBeNormalized,mister,mrs
383 | 5744,0,PLAIN,NASA,ToBeNormalized,n a s a,NASA
384 | 5746,8,LETTERS,Ilam,RemainSelf,Ilam,i l a m
385 | 5747,6,PLAIN,PCs,ToBeNormalized,p c's,PCs
386 | 5756,12,PLAIN,COO,ToBeNormalized,c o o,COO
387 | 5757,8,LETTERS,Fase,RemainSelf,Fase,f a s e
388 | 5769,6,PLAIN,Wu,ToBeNormalized,w u,Wu
389 | 5771,9,LETTERS,OAM,RemainSelf,OAM,o a m
390 | 5772,4,PLAIN,MAC,ToBeNormalized,m a c,MAC
391 | 5778,2,PLAIN,Xiu,ToBeNormalized,x i u,Xiu
392 | 5803,3,PLAIN,Isla,ToBeNormalized,i s l a,Isla
393 | 5828,2,FRACTION,-133/94,ToBeNormalized,minus thirty three hundred ninety fourths,minus one hundred thirty three ninety fourths
394 | 5841,2,PLAIN,Centro,ToBeNormalized,center,Centro
395 | 5841,7,PLAIN,sul,ToBeNormalized,s u l,sul
396 | 5848,15,PLAIN,Ach,ToBeNormalized,a c h,Ach
397 | 5851,5,DECIMAL,.763,ToBeNormalized,seven hundred sixty three,point seven six three
398 | 5865,3,FRACTION,⅞,ToBeNormalized,upsilon,seven eighths
399 | 5907,2,PLAIN,Sétif,ToBeNormalized,s e acute t i f,Sétif
400 | 5907,7,PLAIN,ES,ToBeNormalized,e s,ES
401 | 5907,8,PLAIN,Sétif,ToBeNormalized,s e acute t i f,Sétif
402 | 5928,11,PLAIN,DOS,ToBeNormalized,d o s,DOS
403 | 5945,5,DIGIT,68821,ToBeNormalized,sixty eight thousand eight hundred twenty one,six eight eight two one
404 | 5963,7,PLAIN,du,ToBeNormalized,d u,du
405 | 5963,9,PLAIN,er,ToBeNormalized,e r,er
406 | 5966,1,PLAIN,MIT,ToBeNormalized,m i t,MIT
407 | 5977,9,PLAIN,SAVE,ToBeNormalized,s a v e,SAVE
408 | 5988,6,PLAIN,OST,ToBeNormalized,o s t,OST
409 | 5991,3,DATE,10/10/00,ToBeNormalized,ten tenth,the tenth of october o o
410 | 5991,9,VERBATIM,-,ToBeNormalized,to,-
411 | 6009,8,PLAIN,PAC,ToBeNormalized,p a c,PAC
412 | 6021,12,PLAIN,andWissenschaftliche,ToBeNormalized,a_letter,andWissenschaftliche
413 | 6043,1,PLAIN,I'm,ToBeNormalized,one meter,I'm
414 | 6059,13,PLAIN,equalised,ToBeNormalized,e,equalized
415 | 6063,5,PLAIN,SPIN,ToBeNormalized,s p i n,SPIN
416 | 6079,10,ORDINAL,III,ToBeNormalized,three,the third
417 | 6084,1,PLAIN,ORCHESTRA,ToBeNormalized,o r c h e s t r a,ORCHESTRA
418 | 6084,3,PLAIN,SAMPLES,ToBeNormalized,s a m p l e s,SAMPLES
419 | 6099,0,LETTERS,mr,ToBeNormalized,mister,m r
420 | 6101,4,PLAIN,I,ToBeNormalized,the first,I
421 | 6112,7,PLAIN,KANU,ToBeNormalized,k a n u,KANU
422 | 6115,6,PLAIN,d'etre,ToBeNormalized,n e t r e,d'etre
423 | 6123,5,DATE,2010,ToBeNormalized,two o one o,twenty ten
424 | 6124,2,PLAIN,IL,ToBeNormalized,i l,IL
425 | 6142,7,PLAIN,ne's,ToBeNormalized,n e's,ne's
426 | 6151,1,PLAIN,neighbourhood's,ToBeNormalized,neighborhoods,neighbourhood's
427 | 6155,21,PLAIN,fundraise,ToBeNormalized,f_letter,fundraise
428 | 6168,4,PLAIN,AB,ToBeNormalized,a b,AB
429 | 6206,1,LETTERS,Tou,RemainSelf,Tou,t o u
430 | 6218,1,PLAIN,Aamir,ToBeNormalized,a a m i r,Aamir
431 | 6235,5,LETTERS,d'Yeu,RemainSelf,d'Yeu,d y e u
432 | 6235,15,PLAIN,V,ToBeNormalized,the fifth,V
433 | 6258,14,PLAIN,COM,ToBeNormalized,c o m,COM
434 | 6259,2,PLAIN,all,ToBeNormalized,a l l,all
435 | 6261,11,ELECTRONIC,https://web.archive.org/20130716070450/http://www.warriors.co.nz:80/playerprofiledisplay/Warriors/Suaia%20Matagi/7207,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter s_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter t_letter h_letter r_letter e_letter e_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter o_letter n_letter e_letter _letter s_letter i_letter x_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter o_letter _letter f_letter o_letter u_letter r_letter _letter f_letter i_letter v_letter e_letter _letter o_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot w_letter a_letter r_letter r_letter i_letter o_letter r_letter s_letter dot c_letter o_letter dot n_letter _letter z_letter _letter c_letter o_letter l_letter o_letter n_letter _letter e_letter i_letter g_letter h_letter t_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter l_letter a_letter y_letter e_letter r_letter p_letter r_letter o_letter f_letter i_letter l_letter e_letter d_letter i_letter s_letter p_letter l_letter a_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter a_letter r_letter r_letter i_letter o_letter r_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter u_letter a_letter i_letter a_letter _letter p_letter e_letter r_letter c_letter e_letter n_letter t_letter _letter t_letter w_letter e_letter n_letter t_letter y_letter _letter m_letter a_letter t_letter a_letter g_letter i_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter e_letter v_letter e_letter n_letter _letter t_letter w_letter o_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter
436 | 6280,0,PLAIN,EGAN,ToBeNormalized,e g a n,EGAN
437 | 6280,2,PLAIN,JOHN,ToBeNormalized,j o h n,JOHN
438 | 6280,4,PLAIN,THOMAS,ToBeNormalized,t h o m a s,THOMAS
439 | 6284,3,PLAIN,Up,ToBeNormalized,u p,Up
440 | 6289,1,PLAIN,AS,ToBeNormalized,a s,AS
441 | 6314,21,PLAIN,Naama,ToBeNormalized,n a a m a,Naama
442 | 6329,4,PLAIN,Oh,ToBeNormalized,o h,Oh
443 | 6339,2,DIGIT,2013,ToBeNormalized,twenty thirteen,two o one three
444 | 6358,5,PLAIN,Ill,ToBeNormalized,i l l,Ill
445 | 6365,6,DATE,2010,ToBeNormalized,two o one o,twenty ten
446 | 6410,19,PLAIN,DID,ToBeNormalized,d i d,DID
447 | 6431,4,PLAIN,Prus',ToBeNormalized,p r u's,Prus'
448 | 6438,9,PLAIN,Lok,ToBeNormalized,l o k,Lok
449 | 6473,2,PLAIN,Dev,ToBeNormalized,d e v,Dev
450 | 6473,13,PLAIN,Dev,ToBeNormalized,d e v,Dev
451 | 6482,4,PLAIN,Prus',ToBeNormalized,p r u's,Prus'
452 | 6496,4,PLAIN,UCCIO,ToBeNormalized,u c c i o,UCCIO
453 | 6496,10,PLAIN,DE,ToBeNormalized,d e,DE
454 | 6504,6,PLAIN,revolutionise,RemainSelf,revolutionise,revolutionize
455 | 6506,10,PLAIN,ACT,ToBeNormalized,a c t,ACT
456 | 6524,11,PLAIN,AIDS,ToBeNormalized,a i d s,AIDS
457 | 6577,21,PLAIN,MUN,ToBeNormalized,m u n,MUN
458 | 6581,6,LETTERS,nul,RemainSelf,nul,n u l
459 | 6612,5,PLAIN,DOI,ToBeNormalized,d o i,DOI
460 | 6618,7,PLAIN,Ku,ToBeNormalized,k u,Ku
461 | 6637,2,PLAIN,I,ToBeNormalized,the first,I
462 | 6648,11,PLAIN,GOV,ToBeNormalized,g o v,GOV
463 | 6711,9,PLAIN,ISO,ToBeNormalized,i s o,ISO
464 | 6717,1,PLAIN,I,ToBeNormalized,one,I
465 | 6743,1,PLAIN,est,ToBeNormalized,e s t,est
466 | 6777,3,CARDINAL,V,ToBeNormalized,the fifth,five
467 | 6800,0,PLAIN,REGIO,ToBeNormalized,r e g i o,REGIO
468 | 6800,20,PLAIN,ACID,ToBeNormalized,a c i d,ACID
469 | 6814,7,PLAIN,AG,ToBeNormalized,a g,AG
470 | 6818,0,PLAIN,DAR,ToBeNormalized,d a r,DAR
471 | 6844,1,PLAIN,Umro,ToBeNormalized,u m r o,Umro
472 | 6856,5,PLAIN,B's,ToBeNormalized,b's,B's
473 | 6857,9,PLAIN,Scat,ToBeNormalized,s c a t,Scat
474 | 6874,4,PLAIN,I,ToBeNormalized,the first,I
475 | 6878,5,PLAIN,Glas,ToBeNormalized,g l a's,Glas
476 | 6887,18,PUNCT,-,ToBeNormalized,to,-
477 | 6938,5,PLAIN,GUS,ToBeNormalized,g u s,GUS
478 | 6945,1,ORDINAL,II,ToBeNormalized,two,the second
479 | 6946,4,PLAIN,digitised,RemainSelf,digitised,digitized
480 | 6946,11,PLAIN,Digitisation,ToBeNormalized,digitized,Digitisation
481 | 6964,12,PLAIN,cue,ToBeNormalized,c u e,cue
482 | 6970,3,PLAIN,GONZALES,ToBeNormalized,g o n z a l e s,GONZALES
483 | 6972,7,PLAIN,synthestration,ToBeNormalized,s_letter,synthestration
484 | 6978,1,PLAIN,iPad,ToBeNormalized,i p a d,iPad
485 | 6999,4,PLAIN,Shab,ToBeNormalized,s h a b,Shab
486 | 7012,2,ELECTRONIC,http://www.tmaxsoft.com/product/productView.do,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot t_letter _letter m_letter _letter a_letter _letter x_letter _letter s_letter _letter o_letter _letter f_letter _letter t_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter v_letter i_letter e_letter w_letter dot d_letter o_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot t_letter m_letter a_letter x_letter s_letter o_letter f_letter t_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter v_letter i_letter e_letter w_letter dot d_letter o_letter
487 | 7029,3,PLAIN,DOS,ToBeNormalized,d o s,DOS
488 | 7031,14,PLAIN,DOS,ToBeNormalized,d o s,DOS
489 | 7033,19,PLAIN,CASTOR,ToBeNormalized,c a s t o r,CASTOR
490 | 7039,5,PLAIN,ab,ToBeNormalized,a b,ab
491 | 7040,3,PLAIN,IU,ToBeNormalized,i u,IU
492 | 7086,12,PLAIN,LIF,ToBeNormalized,l i f,LIF
493 | 7086,21,PLAIN,STAT,ToBeNormalized,s t a t,STAT
494 | 7088,11,PLAIN,circumcised,RemainSelf,circumcised,circumcized
495 | 7090,6,PLAIN,GRIN,ToBeNormalized,g r i n,GRIN
496 | 7094,0,PLAIN,HSI,ToBeNormalized,h s i,HSI
497 | 7099,0,PUNCT,¿,ToBeNormalized,f,¿
498 | 7109,6,PLAIN,TAP,ToBeNormalized,t a p,TAP
499 | 7112,4,PUNCT,:,ToBeNormalized,to,:
500 | 7113,4,LETTERS,IV,ToBeNormalized,the fourth,i v
501 | 7123,12,LETTERS,Ssese,RemainSelf,Ssese,s s e s e
502 | 7132,13,DATE,1991,ToBeNormalized,one thousand nine hundred ninety one,nineteen ninety one
503 | 7180,9,LETTERS,Ald,RemainSelf,Ald,a l d
504 | 7196,7,PLAIN,XI,ToBeNormalized,eleven,XI
505 | 7196,10,TIME,18:00:00Z,ToBeNormalized,eighteen hours u seconds,eighteen hours zero minutes and zero seconds z
506 | 7230,16,DATE,1968,ToBeNormalized,one thousand nine hundred sixty eight,nineteen sixty eight
507 | 7234,4,PLAIN,ski,ToBeNormalized,s k i,ski
508 | 7235,13,PLAIN,Lok,ToBeNormalized,l o k,Lok
509 | 7250,3,TIME,0:02:01,ToBeNormalized,zero hours two minutes and one seconds,zero hours two minutes and one second
510 | 7256,5,PLAIN,CAB,ToBeNormalized,c a b,CAB
511 | 7286,4,PLAIN,ABA,ToBeNormalized,a b a,ABA
512 | 7291,2,PLAIN,OBE,ToBeNormalized,o b e,OBE
513 | 7295,15,PLAIN,scam,ToBeNormalized,s c a m,scam
514 | 7298,8,PLAIN,est,ToBeNormalized,e s t,est
515 | 7334,1,CARDINAL,1298015,ToBeNormalized,one two nine eight o one five,one million two hundred ninety eight thousand fifteen
516 | 7354,18,PLAIN,IX,ToBeNormalized,nine,IX
517 | 7364,11,PLAIN,Asaf,ToBeNormalized,a s a f,Asaf
518 | 7372,6,LETTERS,Graz'zt,RemainSelf,Graz'zt,g r a z z t
519 | 7385,1,LETTERS,Suat,RemainSelf,Suat,s u a t
520 | 7385,4,PLAIN,odul,ToBeNormalized,o d u l,odul
521 | 7388,20,PUNCT,-,ToBeNormalized,to,-
522 | 7388,24,PUNCT,-,ToBeNormalized,to,-
523 | 7392,7,PLAIN,I,ToBeNormalized,the first,I
524 | 7409,5,PLAIN,LAN,ToBeNormalized,l a n,LAN
525 | 7440,5,PLAIN,GO,ToBeNormalized,g o,GO
526 | 7487,2,LETTERS,Ekow,RemainSelf,Ekow,e k o w
527 | 7492,8,LETTERS,Lwala,RemainSelf,Lwala,l w a l a
528 | 7495,1,PLAIN,Ski,ToBeNormalized,s k i,Ski
529 | 7498,4,LETTERS,bd,RemainSelf,bd,b d
530 | 7530,5,LETTERS,Smer,RemainSelf,Smer,s m e r
531 | 7532,18,LETTERS,Mzee,RemainSelf,Mzee,m z e e
532 |
--------------------------------------------------------------------------------
/results/russian/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/russian/.gitkeep
--------------------------------------------------------------------------------
/results/russian/Semiotic_Class-wise_Accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/russian/Semiotic_Class-wise_Accuracy.png
--------------------------------------------------------------------------------
/results/russian/classwise_accuracy.csv:
--------------------------------------------------------------------------------
1 | semiotic-class,accuracy,count,correct
2 | ALL,0.9928752306965964,93196,92532
3 | CARDINAL,0.9417922948073701,2388,2249
4 | DATE,0.9732441471571907,1495,1455
5 | DECIMAL,0.9,60,54
6 | DIGIT,1.0,16,16
7 | ELECTRONIC,0.6041666666666666,48,29
8 | FRACTION,0.6086956521739131,23,14
9 | LETTERS,0.9907608695652174,1840,1823
10 | MEASURE,0.8978102189781022,411,369
11 | MONEY,0.8947368421052632,19,17
12 | ORDINAL,0.9461358313817331,427,404
13 | PLAIN,0.994688407139769,64764,64420
14 | PUNCT,0.9998519542045006,20264,20261
15 | TELEPHONE,0.8202247191011236,89,73
16 | TIME,0.75,8,6
17 | VERBATIM,0.9985119047619048,1344,1342
18 |
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | echo "Downloading and extracting required files"
3 | wget https://storage.googleapis.com/text_normalization/test_data.zip
4 | wget https://storage.googleapis.com/text_normalization/dnc_model.zip
5 | rm -rf data
6 | rm -rf models
7 | unzip test_data.zip
8 | unzip dnc_model.zip
9 | rm test_data.zip
10 | rm dnc_model.zip
11 | echo "Finished"
12 |
13 |
--------------------------------------------------------------------------------
/src/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/src/.gitkeep
--------------------------------------------------------------------------------
/src/DNCnormalize.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Cognibit Solutions LLP.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """
16 | Text Normalization using Differentiable Neural Computer
17 |
18 | """
19 |
20 | import os
21 | import numpy as np
22 | import tensorflow as tf
23 | from collections import OrderedDict
24 |
25 | from lib.seq2seq import Seq2SeqModel
26 |
27 |
28 | # ----------------------
29 | # Model Flag Parameters
30 | # ----------------------
31 | config={
32 | 'cell_type':'dnc',
33 | 'attention_type':'bahdanau',
34 | 'hidden_units':1024,
35 | 'depth':1,
36 | 'embedding_size':32,
37 | 'memory_size':256,
38 | 'word_size':64,
39 | 'num_writes':1,
40 | 'num_reads':5,
41 | 'clip_value':20,
42 | 'beam_width':1,
43 | 'max_decode_step':150,
44 | 'use_residual':False,
45 | 'attn_input_feeding':True,
46 | 'use_dropout':False,
47 | 'dropout_rate':0.3,
48 | 'use_fp16':False
49 |
50 | }
51 |
52 |
53 | def normalize(enc_data, enc_len, model_path,batch_size=200,use_memory=True):
54 | """Normalize encoded data using the trained DNC model given"""
55 |
56 | # Initiate TF session
57 | tf.reset_default_graph()
58 | dnc_predictions=[]
59 | with tf.Session() as sess:
60 | print('Using DNC model at {}'.format(model_path))
61 | model=create_model_decode(batch_size=batch_size,use_memory=use_memory)
62 | restore_model(model,sess,model_path)
63 |
64 | num_batches=int(enc_data.shape[0]/batch_size)
65 | print('Number of batches: {}'.format(num_batches))
66 |
67 | for i in range(num_batches):
68 | predict=model.predict(sess,enc_data[i*batch_size:i*batch_size+batch_size],
69 | enc_len[i*batch_size:i*batch_size+batch_size])
70 | predict = np.split(predict,batch_size,axis=0)
71 | dnc_predictions.extend(predict)
72 |
 73 |             if i % max(1, num_batches // 25) == 0:
74 | print('Normalized {} out of {}'.format((i+1)*batch_size,
75 | num_batches*batch_size))
76 |
 77 |         # Process the last batch by padding it with zeros to a full batch
 78 |         if enc_data.shape[0] % batch_size != 0:
79 | lastbatch = enc_data[num_batches*batch_size:]
80 | lastbatch_len= enc_len[num_batches*batch_size:]
81 | lastbatch=np.concatenate((lastbatch,np.zeros([batch_size-lastbatch.shape[0],
82 | lastbatch.shape[1]])),axis=0)
83 |
84 | lastbatch_len=np.concatenate((lastbatch_len,
85 | np.ones([batch_size-lastbatch_len.shape[0]])),axis=0)
86 |
87 | predict=model.predict(sess,lastbatch,lastbatch_len)
88 | predict=np.split(predict,batch_size,axis=0)
89 | dnc_predictions.extend(predict)
90 |
91 | return dnc_predictions
92 |
93 | def create_model_decode(batch_size,use_memory):
94 | model = Seq2SeqModel(config,'decode',batch_size,use_memory=use_memory)
95 | return model
96 |
97 | def restore_model(model, sess, model_path):
98 | print('Reloading model parameters...')
99 | model.restore(sess, model_path)
100 | return None
--------------------------------------------------------------------------------
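A minimal sketch of how the `normalize` function above might be driven, assuming TensorFlow 1.x, encoded inputs of the kind produced by the project's Encoder, and a checkpoint at a hypothetical path. Since the final partial batch is zero-padded to a full batch internally, the returned list is trimmed back to the input length:

    import numpy as np
    from DNCnormalize import normalize

    # hypothetical shapes: 500 tokens, each encoded as 60 integer ids
    enc_data = np.random.randint(0, 128, size=(500, 60))
    enc_len = np.full(500, 60)

    # '../models/english/dnc_model' is an assumed checkpoint location
    predictions = normalize(enc_data, enc_len,
                            model_path='../models/english/dnc_model',
                            batch_size=200, use_memory=True)

    # drop the rows that correspond to the zero-padded tail of the last batch
    predictions = predictions[:enc_data.shape[0]]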
/src/Encoder.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Cognibit Solutions LLP.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """
16 | Generate Required Encoding for XGBoost and DNC Model
17 |
18 | """
19 |
20 | import pickle
21 | import numpy as np
22 | from multiprocessing import Pool
23 | import pandas as pd
24 | import itertools
25 |
26 | class XGBoostEncodingGenerator:
27 |
28 | def __init__(self,space_letter=0,max_num_features = 30,pad_size = 1,boundary_letter = -1):
29 | self.space_letter=space_letter
30 | self.max_num_features=max_num_features
31 | self.boundary_letter=boundary_letter
32 | self.pad_size=pad_size
33 |
34 | def context_window_transform(self,data, pad_size,flush_progress=True):
35 | pre = np.zeros(self.max_num_features)
36 | pre = [pre for x in np.arange(pad_size)]
37 | data = pre + data + pre
38 | neo_data = []
39 | for i in np.arange(len(data) - pad_size * 2):
40 | row = []
 41 |             if flush_progress and i % 100 == 0:
 42 |                 print('Processed %f%%' % ((i / max(1, len(data) - pad_size * 2 - 1)) * 100), end='\r')
43 | for x in data[i : i + pad_size * 2 + 1]:
44 | row.append([self.boundary_letter])
45 | row.append(x)
46 | row.append([self.boundary_letter])
47 | merged=list(itertools.chain(*row))
48 | neo_data.append(merged)
49 | if(flush_progress):
50 | print('Processed 100% ',end='\r')
51 | return neo_data
52 |
53 | def encode(self,df):
54 | x_data = []
55 | for x in df['before'].values:
56 | x_row = np.ones(self.max_num_features, dtype=int) * self.space_letter
57 | for xi, i in zip(list(str(x)), np.arange(self.max_num_features)):
58 | x_row[i] = ord(xi)
59 | x_data.append(x_row)
60 | return np.array(self.context_window_transform(x_data, self.pad_size), dtype = np.int16)
61 |
62 | def encode_csv(self,csv_file):
63 | csv=pd.read_csv(csv_file)
64 | encoding=self.encode(csv)
65 | print('Finished Encoding %s'%csv_file)
66 | return encoding
67 |
68 | def encode_csvs_parallel(self,csv_list,n_threads=8):
69 | """
70 | Encode Multiple CSVs in parallel
71 | """
 72 |         if n_threads < 1:
 73 |             raise ValueError('n_threads must be at least 1, cannot proceed!')
74 | threads = Pool(n_threads)
75 | all_enc=threads.map(self.encode_csv,csv_list)
76 | return all_enc
--------------------------------------------------------------------------------
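A hedged usage sketch for `XGBoostEncodingGenerator` above; the three-token dataframe is made up for illustration. With max_num_features=30 and pad_size=1, each output row packs the character codes of a token and its two neighbours, separated by -1 boundary markers, giving 3*(30+1)+1 = 94 columns — the same 94 feature columns expected by XGBclassify.py below:

    import pandas as pd
    from Encoder import XGBoostEncodingGenerator

    df = pd.DataFrame({'before': ['Mr', '2010', 'London']})
    gen = XGBoostEncodingGenerator(max_num_features=30, pad_size=1)

    features = gen.encode(df)
    print(features.shape)  # (3, 94)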
/src/XGBclassify.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Cognibit Solutions LLP.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """
16 | Predict a token as 'ToBeNormalized' or 'RemainSelf' using XGBoost
17 | """
18 | import pickle
19 | import xgboost
20 | import numpy as np
21 | import pandas as pd
22 | from Encoder import XGBoostEncodingGenerator
23 |
24 | class XGB:
25 | """XGBoost
26 |
27 | API wrapper for trained XGBoost model
28 | """
29 |
30 | def __init__(self, path='../models/english/en-xgb[0.5]'):
31 | """Initialize & load model with its parameters"""
32 | # init model
33 | self.model = pickle.load(open(path, "rb"))
34 | # load model
35 | # self.model.load_model(path)
36 | # init parameters
37 | self.max_num_features = 30
38 | self.pad_size = 1
39 | self.boundary_letter = -1
40 | self.space_letter = 0
41 | self.labels = ['RemainSelf', 'ToBeNormalized']
42 | return None
43 |
44 | def predict(self, data):
45 | """XGBoost prediction
46 |
47 | Classifies the dataframe's 'before' tokens
48 |
49 | Args:
50 | data: pandas dataframe having 'before' column
51 |
52 | Returns:
53 | y_labels: list of class labels
54 | """
55 | # pre-process data
56 | encoded_data = self._encode(data)
57 | enc_gen = XGBoostEncodingGenerator()
58 |
59 | contextual_data = np.array(enc_gen.context_window_transform(encoded_data, self.pad_size))
60 |         # 94 features per row: (2*pad_size + 1) tokens of max_num_features
61 |         # encoded characters each, plus 2*pad_size + 2 boundary markers.
62 |         columns = [str(i) for i in range(94)]
68 | X = pd.DataFrame(data=contextual_data, columns=columns)
69 |
70 | # classify as RemainSelf or ToBeNormalized
71 | y = self.model.predict(X)
72 | y_labels = [self.labels[int(i)] for i in y]
73 | return y_labels
74 |
75 | def _encode(self, data):
76 | """Encodes data into vectors"""
77 | encoded_data = []
78 | for x in data['before'].values:
79 | x_row = np.ones(self.max_num_features, dtype=int) * self.space_letter
80 | for xi, i in zip(list(str(x)), np.arange(self.max_num_features)):
81 | x_row[i] = ord(xi)
82 | encoded_data.append(x_row)
83 | return encoded_data
84 |
85 | def _context_window_transform(self, data, pad_size):
86 | """Transforms into a context window"""
87 | pre = np.zeros(self.max_num_features)
88 | pre = [pre for x in np.arange(pad_size)]
89 | data = pre + data + pre
90 | context_data = []
91 | for i in np.arange(len(data) - pad_size * 2):
92 | row = []
93 | for x in data[i: i + pad_size * 2 + 1]:
94 | row.append([self.boundary_letter])
95 | row.append(x)
96 | row.append([self.boundary_letter])
97 | context_data.append([int(x) for y in row for x in y])
98 | return context_data
99 |
--------------------------------------------------------------------------------
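
Note: a hypothetical driver for the XGB wrapper above, run from the src/ directory. The pickled model path is the constructor's default and must exist on disk; the example tokens and predicted labels are illustrative only.

import pandas as pd
from XGBclassify import XGB

# Tokens to classify; only a 'before' column is required.
tokens = pd.DataFrame({'before': ['Hello', '2017', 'km', 'world']})

clf = XGB(path='../models/english/en-xgb[0.5]')  # assumed checkpoint location
labels = clf.predict(tokens)

# Possible output: [('Hello', 'RemainSelf'), ('2017', 'ToBeNormalized'), ...]
print(list(zip(tokens['before'], labels)))
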
/src/classification_report.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Cognibit Solutions LLP.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """
16 | Generates a classification report for the trained XGBoost models.
17 | """
19 |
20 | import itertools
21 | import numpy as np
22 | import matplotlib.pyplot as plt
23 | from sklearn.metrics import confusion_matrix as cm
24 | from sklearn.metrics import precision_recall_curve
25 | from sklearn.metrics import average_precision_score
26 | from sklearn.metrics import classification_report as report
27 |
28 | def preprocessing(results, truth):
29 |     # derive binary class labels from the ground truth's before/after columns
30 | results.loc[truth['before']==truth['after'],'truth']='RemainSelf'
31 | results.loc[truth['before']!=truth['after'],'truth']='ToBeNormalized'
32 | truth['class']=''
33 | truth.loc[truth['before']!=truth['after'],'class']='ToBeNormalized'
34 | truth.loc[truth['before']==truth['after'],'class']='RemainSelf'
35 | return results, truth
36 |
37 | def f1_scores(results, truth):
38 | print(report(truth['class'].tolist(), results['class'].tolist()))
39 |
40 | def confusion_matrix(results, truth, lang):
41 | matrix = cm(truth['class'].tolist(), results['class'].tolist())
42 | plot_confusion_matrix(matrix, classes=['ToBeNormalized', 'RemainSelf'],
43 | title='XGBoost Confusion Matrix [{}]'.format(lang))
44 |
45 | def pr_curve(results, truth, lang):
46 | truth.loc[truth['class']=='ToBeNormalized', 'class'] = 1
47 | truth.loc[truth['class']=='RemainSelf', 'class'] = 0
48 | results.loc[results['class']=='ToBeNormalized', 'class'] = 1
49 | results.loc[results['class']=='RemainSelf', 'class'] = 0
50 |
51 | average_precision = average_precision_score(truth['class'].tolist(), results['class'].tolist())
52 | precision, recall, threshold = precision_recall_curve(truth['class'].tolist(), results['class'].tolist())
53 |
54 | plt.step(recall, precision, color='b', alpha=0.2, where='post')
55 | plt.fill_between(recall, precision, alpha=0.2, color='b')
56 | plt.xlabel('Recall')
57 | plt.ylabel('Precision')
58 | plt.ylim([0.0, 1.05])
59 | plt.xlim([0.0, 1.0])
60 | plt.title('Precision-Recall Curve: AP={0:0.2f} [{1}]'.format(average_precision, lang))
61 | plt.show()
62 |
63 | def plot_confusion_matrix(cm, classes,
64 | title='Confusion matrix',
65 | cmap=plt.cm.Blues):
66 | """
67 |     This function plots the confusion matrix, annotating each cell with its count.
68 | """
69 | plt.imshow(cm, interpolation='nearest', cmap=cmap)
70 | plt.title(title)
71 | plt.colorbar()
72 | tick_marks = np.arange(len(classes))
73 | plt.xticks(tick_marks, classes, rotation=45)
74 | plt.yticks(tick_marks, classes)
75 |
76 | fmt = 'd'
77 | thresh = cm.max() / 2.
78 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
79 | plt.text(j, i, format(cm[i, j], fmt),
80 | horizontalalignment="center",
81 | color="white" if cm[i, j] > thresh else "black")
82 |
83 | plt.ylabel('True label')
84 | plt.xlabel('Predicted label')
85 | plt.tight_layout()
86 |
87 |
--------------------------------------------------------------------------------
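
Note: a sketch of how these helpers fit together, under stated assumptions: `xgb_predictions.csv` (a hypothetical file) carries a 'class' column of predicted labels ('RemainSelf'/'ToBeNormalized'), and the ground-truth CSV carries 'before' and 'after' columns; both paths are placeholders, not repository files.

import pandas as pd
import classification_report as cr

results = pd.read_csv('xgb_predictions.csv')   # assumed: has a 'class' column
truth = pd.read_csv('ground_truth.csv')        # assumed: has 'before'/'after'

# Derive binary ground-truth classes, then report metrics.
results, truth = cr.preprocessing(results, truth)
cr.f1_scores(results, truth)                    # prints precision/recall/F1
cr.confusion_matrix(results, truth, 'English')  # draws the confusion matrix
cr.pr_curve(results, truth, 'English')          # draws the precision-recall curve
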
/src/lib/access.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """DNC access modules."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import sonnet as snt
23 | import tensorflow as tf
24 |
25 | from . import addressing
26 | from . import util
27 |
28 | AccessState = collections.namedtuple('AccessState', (
29 | 'memory', 'read_weights', 'write_weights', 'linkage', 'usage'))
30 |
31 |
32 | def _erase_and_write(memory, address, reset_weights, values):
33 | """Module to erase and write in the external memory.
34 |
35 | Erase operation:
36 | M_t'(i) = M_{t-1}(i) * (1 - w_t(i) * e_t)
37 |
38 | Add operation:
39 | M_t(i) = M_t'(i) + w_t(i) * a_t
40 |
41 | where e are the reset_weights, w the write weights and a the values.
42 |
43 | Args:
44 | memory: 3-D tensor of shape `[batch_size, memory_size, word_size]`.
45 | address: 3-D tensor `[batch_size, num_writes, memory_size]`.
46 | reset_weights: 3-D tensor `[batch_size, num_writes, word_size]`.
47 | values: 3-D tensor `[batch_size, num_writes, word_size]`.
48 |
49 | Returns:
50 | 3-D tensor of shape `[batch_size, num_writes, word_size]`.
51 | """
52 | with tf.name_scope('erase_memory', values=[memory, address, reset_weights]):
53 | expand_address = tf.expand_dims(address, 3)
54 | reset_weights = tf.expand_dims(reset_weights, 2)
55 | weighted_resets = expand_address * reset_weights
56 | reset_gate = tf.reduce_prod(1 - weighted_resets, [1])
57 | memory *= reset_gate
58 |
59 | with tf.name_scope('additive_write', values=[memory, address, values]):
60 | add_matrix = tf.matmul(address, values, adjoint_a=True)
61 | memory += add_matrix
62 |
63 | return memory
64 |
65 |
66 | class MemoryAccess(snt.RNNCore):
67 | """Access module of the Differentiable Neural Computer.
68 |
69 | This memory module supports multiple read and write heads. It makes use of:
70 |
71 | * `addressing.TemporalLinkage` to track the temporal ordering of writes in
72 | memory for each write head.
73 |   * `addressing.Freeness` for keeping track of memory usage, where usage
74 |     increases when a memory location is written to, and decreases when
75 |     memory that the controller says can be freed is read from.
76 |
77 | Write-address selection is done by an interpolation between content-based
78 | lookup and using unused memory.
79 |
80 | Read-address selection is done by an interpolation of content-based lookup
81 | and following the link graph in the forward or backwards read direction.
82 | """
83 |
84 | def __init__(self,
85 | memory_size=128,
86 | word_size=20,
87 | num_reads=1,
88 | num_writes=1,
89 | name='memory_access'):
90 | """Creates a MemoryAccess module.
91 |
92 | Args:
93 | memory_size: The number of memory slots (N in the DNC paper).
94 | word_size: The width of each memory slot (W in the DNC paper)
95 | num_reads: The number of read heads (R in the DNC paper).
96 | num_writes: The number of write heads (fixed at 1 in the paper).
97 | name: The name of the module.
98 | """
99 | super(MemoryAccess, self).__init__(name=name)
100 | self._memory_size = memory_size
101 | self._word_size = word_size
102 | self._num_reads = num_reads
103 | self._num_writes = num_writes
104 |
105 | self._write_content_weights_mod = addressing.CosineWeights(
106 | num_writes, word_size, name='write_content_weights')
107 | self._read_content_weights_mod = addressing.CosineWeights(
108 | num_reads, word_size, name='read_content_weights')
109 |
110 | self._linkage = addressing.TemporalLinkage(memory_size, num_writes)
111 | self._freeness = addressing.Freeness(memory_size)
112 |
113 | def _build(self, inputs, prev_state):
114 | """Connects the MemoryAccess module into the graph.
115 |
116 | Args:
117 | inputs: tensor of shape `[batch_size, input_size]`. This is used to
118 | control this access module.
119 | prev_state: Instance of `AccessState` containing the previous state.
120 |
121 | Returns:
122 | A tuple `(output, next_state)`, where `output` is a tensor of shape
123 | `[batch_size, num_reads, word_size]`, and `next_state` is the new
124 | `AccessState` named tuple at the current time t.
125 | """
126 | inputs = self._read_inputs(inputs)
127 |
128 | # Update usage using inputs['free_gate'] and previous read & write weights.
129 | usage = self._freeness(
130 | write_weights=prev_state.write_weights,
131 | free_gate=inputs['free_gate'],
132 | read_weights=prev_state.read_weights,
133 | prev_usage=prev_state.usage)
134 |
135 | # Write to memory.
136 | write_weights = self._write_weights(inputs, prev_state.memory, usage)
137 | memory = _erase_and_write(
138 | prev_state.memory,
139 | address=write_weights,
140 | reset_weights=inputs['erase_vectors'],
141 | values=inputs['write_vectors'])
142 |
143 | linkage_state = self._linkage(write_weights, prev_state.linkage)
144 |
145 | # Read from memory.
146 | read_weights = self._read_weights(
147 | inputs,
148 | memory=memory,
149 | prev_read_weights=prev_state.read_weights,
150 | link=linkage_state.link)
151 | read_words = tf.matmul(read_weights, memory)
152 |
153 | return (read_words, AccessState(
154 | memory=memory,
155 | read_weights=read_weights,
156 | write_weights=write_weights,
157 | linkage=linkage_state,
158 | usage=usage))
159 |
160 | def _read_inputs(self, inputs):
161 | """Applies transformations to `inputs` to get control for this module."""
162 |
163 | def _linear(first_dim, second_dim, name, activation=None):
164 | """Returns a linear transformation of `inputs`, followed by a reshape."""
165 | linear = snt.Linear(first_dim * second_dim, name=name)(inputs)
166 | if activation is not None:
167 | linear = activation(linear, name=name + '_activation')
168 | return tf.reshape(linear, [-1, first_dim, second_dim])
169 |
170 | # v_t^i - The vectors to write to memory, for each write head `i`.
171 | write_vectors = _linear(self._num_writes, self._word_size, 'write_vectors')
172 |
173 | # e_t^i - Amount to erase the memory by before writing, for each write head.
174 | erase_vectors = _linear(self._num_writes, self._word_size, 'erase_vectors',
175 | tf.sigmoid)
176 |
177 | # f_t^j - Amount that the memory at the locations read from at the previous
178 | # time step can be declared unused, for each read head `j`.
179 | free_gate = tf.sigmoid(
180 | snt.Linear(self._num_reads, name='free_gate')(inputs))
181 |
182 | # g_t^{a, i} - Interpolation between writing to unallocated memory and
183 | # content-based lookup, for each write head `i`. Note: `a` is simply used to
184 | # identify this gate with allocation vs writing (as defined below).
185 | allocation_gate = tf.sigmoid(
186 | snt.Linear(self._num_writes, name='allocation_gate')(inputs))
187 |
188 | # g_t^{w, i} - Overall gating of write amount for each write head.
189 | write_gate = tf.sigmoid(
190 | snt.Linear(self._num_writes, name='write_gate')(inputs))
191 |
192 | # \pi_t^j - Mixing between "backwards" and "forwards" positions (for
193 | # each write head), and content-based lookup, for each read head.
194 | num_read_modes = 1 + 2 * self._num_writes
195 | read_mode = snt.BatchApply(tf.nn.softmax)(
196 | _linear(self._num_reads, num_read_modes, name='read_mode'))
197 |
198 | # Parameters for the (read / write) "weights by content matching" modules.
199 | write_keys = _linear(self._num_writes, self._word_size, 'write_keys')
200 | write_strengths = snt.Linear(self._num_writes, name='write_strengths')(
201 | inputs)
202 |
203 | read_keys = _linear(self._num_reads, self._word_size, 'read_keys')
204 | read_strengths = snt.Linear(self._num_reads, name='read_strengths')(inputs)
205 |
206 | result = {
207 | 'read_content_keys': read_keys,
208 | 'read_content_strengths': read_strengths,
209 | 'write_content_keys': write_keys,
210 | 'write_content_strengths': write_strengths,
211 | 'write_vectors': write_vectors,
212 | 'erase_vectors': erase_vectors,
213 | 'free_gate': free_gate,
214 | 'allocation_gate': allocation_gate,
215 | 'write_gate': write_gate,
216 | 'read_mode': read_mode,
217 | }
218 | return result
219 |
220 | def _write_weights(self, inputs, memory, usage):
221 | """Calculates the memory locations to write to.
222 |
223 | This uses a combination of content-based lookup and finding an unused
224 | location in memory, for each write head.
225 |
226 | Args:
227 |       inputs: Collection of inputs to the access module, including controls
228 |         for how to choose where to write, such as the content to look up and
229 |         the weighting between content-based and allocation-based addressing.
230 | memory: A tensor of shape `[batch_size, memory_size, word_size]`
231 | containing the current memory contents.
232 | usage: Current memory usage, which is a tensor of shape `[batch_size,
233 | memory_size]`, used for allocation-based addressing.
234 |
235 | Returns:
236 | tensor of shape `[batch_size, num_writes, memory_size]` indicating where
237 | to write to (if anywhere) for each write head.
238 | """
239 | with tf.name_scope('write_weights', values=[inputs, memory, usage]):
240 | # c_t^{w, i} - The content-based weights for each write head.
241 | write_content_weights = self._write_content_weights_mod(
242 | memory, inputs['write_content_keys'],
243 | inputs['write_content_strengths'])
244 |
245 | # a_t^i - The allocation weights for each write head.
246 | write_allocation_weights = self._freeness.write_allocation_weights(
247 | usage=usage,
248 | write_gates=(inputs['allocation_gate'] * inputs['write_gate']),
249 | num_writes=self._num_writes)
250 |
251 | # Expands gates over memory locations.
252 | allocation_gate = tf.expand_dims(inputs['allocation_gate'], -1)
253 | write_gate = tf.expand_dims(inputs['write_gate'], -1)
254 |
255 | # w_t^{w, i} - The write weightings for each write head.
256 | return write_gate * (allocation_gate * write_allocation_weights +
257 | (1 - allocation_gate) * write_content_weights)
258 |
259 | def _read_weights(self, inputs, memory, prev_read_weights, link):
260 | """Calculates read weights for each read head.
261 |
262 | The read weights are a combination of following the link graphs in the
263 | forward or backward directions from the previous read position, and doing
264 | content-based lookup. The interpolation between these different modes is
265 | done by `inputs['read_mode']`.
266 |
267 | Args:
268 | inputs: Controls for this access module. This contains the content-based
269 | keys to lookup, and the weightings for the different read modes.
270 | memory: A tensor of shape `[batch_size, memory_size, word_size]`
271 | containing the current memory contents to do content-based lookup.
272 | prev_read_weights: A tensor of shape `[batch_size, num_reads,
273 | memory_size]` containing the previous read locations.
274 | link: A tensor of shape `[batch_size, num_writes, memory_size,
275 | memory_size]` containing the temporal write transition graphs.
276 |
277 | Returns:
278 | A tensor of shape `[batch_size, num_reads, memory_size]` containing the
279 | read weights for each read head.
280 | """
281 | with tf.name_scope(
282 | 'read_weights', values=[inputs, memory, prev_read_weights, link]):
283 | # c_t^{r, i} - The content weightings for each read head.
284 | content_weights = self._read_content_weights_mod(
285 | memory, inputs['read_content_keys'], inputs['read_content_strengths'])
286 |
287 | # Calculates f_t^i and b_t^i.
288 | forward_weights = self._linkage.directional_read_weights(
289 | link, prev_read_weights, forward=True)
290 | backward_weights = self._linkage.directional_read_weights(
291 | link, prev_read_weights, forward=False)
292 |
293 | backward_mode = inputs['read_mode'][:, :, :self._num_writes]
294 | forward_mode = (
295 | inputs['read_mode'][:, :, self._num_writes:2 * self._num_writes])
296 | content_mode = inputs['read_mode'][:, :, 2 * self._num_writes]
297 |
298 | read_weights = (
299 | tf.expand_dims(content_mode, 2) * content_weights + tf.reduce_sum(
300 | tf.expand_dims(forward_mode, 3) * forward_weights, 2) +
301 | tf.reduce_sum(tf.expand_dims(backward_mode, 3) * backward_weights, 2))
302 |
303 | return read_weights
304 |
305 | @property
306 | def state_size(self):
307 | """Returns a tuple of the shape of the state tensors."""
308 | return AccessState(
309 | memory=tf.TensorShape([self._memory_size, self._word_size]),
310 | read_weights=tf.TensorShape([self._num_reads, self._memory_size]),
311 | write_weights=tf.TensorShape([self._num_writes, self._memory_size]),
312 | linkage=self._linkage.state_size,
313 | usage=self._freeness.state_size)
314 |
315 | @property
316 | def output_size(self):
317 | """Returns the output shape."""
318 | return tf.TensorShape([self._num_reads, self._word_size])
319 |
--------------------------------------------------------------------------------
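
Note: a NumPy sketch (not repository code) of the erase/add update that `_erase_and_write` implements, for batch_size = 1 and num_writes = 1; the concrete memory contents and weights are made up to show the effect.

import numpy as np

memory = np.ones((1, 4, 3))                 # [batch, memory_size, word_size]
address = np.array([[[0., 1., 0., 0.]]])    # w_t: write entirely to slot 1
reset_weights = np.array([[[1., 1., 1.]]])  # e_t: erase the full word
values = np.array([[[5., 6., 7.]]])         # a_t: the word to write

# Erase: M_t'(i) = M_{t-1}(i) * (1 - w_t(i) * e_t)
weighted_resets = address[..., None] * reset_weights[:, :, None, :]
memory = memory * np.prod(1 - weighted_resets, axis=1)

# Add: M_t(i) = M_t'(i) + w_t(i) * a_t
memory = memory + np.matmul(address.transpose(0, 2, 1), values)

print(memory[0])  # slot 1 is now [5, 6, 7]; the other slots are untouched
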
/src/lib/addressing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """DNC addressing modules."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import sonnet as snt
23 | import tensorflow as tf
24 |
25 | from . import util
26 |
27 | # Ensure values are greater than epsilon to avoid numerical instability.
28 | _EPSILON = 1e-6
29 |
30 | TemporalLinkageState = collections.namedtuple('TemporalLinkageState',
31 | ('link', 'precedence_weights'))
32 |
33 |
34 | def _vector_norms(m):
35 | squared_norms = tf.reduce_sum(m * m, axis=2, keep_dims=True)
36 | return tf.sqrt(squared_norms + _EPSILON)
37 |
38 |
39 | def weighted_softmax(activations, strengths, strengths_op):
40 | """Returns softmax over activations multiplied by positive strengths.
41 |
42 | Args:
43 | activations: A tensor of shape `[batch_size, num_heads, memory_size]`, of
44 | activations to be transformed. Softmax is taken over the last dimension.
45 | strengths: A tensor of shape `[batch_size, num_heads]` containing strengths to
46 | multiply by the activations prior to the softmax.
47 | strengths_op: An operation to transform strengths before softmax.
48 |
49 | Returns:
50 | A tensor of same shape as `activations` with weighted softmax applied.
51 | """
52 | transformed_strengths = tf.expand_dims(strengths_op(strengths), -1)
53 | sharp_activations = activations * transformed_strengths
54 | softmax = snt.BatchApply(module_or_op=tf.nn.softmax)
55 | return softmax(sharp_activations)
56 |
57 |
58 | class CosineWeights(snt.AbstractModule):
59 | """Cosine-weighted attention.
60 |
61 | Calculates the cosine similarity between a query and each word in memory, then
62 | applies a weighted softmax to return a sharp distribution.
63 | """
64 |
65 | def __init__(self,
66 | num_heads,
67 | word_size,
68 | strength_op=tf.nn.softplus,
69 | name='cosine_weights'):
70 | """Initializes the CosineWeights module.
71 |
72 | Args:
73 | num_heads: number of memory heads.
74 | word_size: memory word size.
75 | strength_op: operation to apply to strengths (default is tf.nn.softplus).
76 | name: module name (default 'cosine_weights')
77 | """
78 | super(CosineWeights, self).__init__(name=name)
79 | self._num_heads = num_heads
80 | self._word_size = word_size
81 | self._strength_op = strength_op
82 |
83 |   def _build(self, memory, keys, strengths):
84 | """Connects the CosineWeights module into the graph.
85 |
86 | Args:
87 | memory: A 3-D tensor of shape `[batch_size, memory_size, word_size]`.
88 | keys: A 3-D tensor of shape `[batch_size, num_heads, word_size]`.
89 | strengths: A 2-D tensor of shape `[batch_size, num_heads]`.
90 |
91 | Returns:
92 | Weights tensor of shape `[batch_size, num_heads, memory_size]`.
93 | """
94 | # Calculates the inner product between the query vector and words in memory.
95 | dot = tf.matmul(keys, memory, adjoint_b=True)
96 |
97 | # Outer product to compute denominator (euclidean norm of query and memory).
98 | memory_norms = _vector_norms(memory)
99 | key_norms = _vector_norms(keys)
100 | norm = tf.matmul(key_norms, memory_norms, adjoint_b=True)
101 |
102 | # Calculates cosine similarity between the query vector and words in memory.
103 | similarity = dot / (norm + _EPSILON)
104 |
105 | return weighted_softmax(similarity, strengths, self._strength_op)
106 |
107 |
108 | class TemporalLinkage(snt.RNNCore):
109 | """Keeps track of write order for forward and backward addressing.
110 |
111 | This is a pseudo-RNNCore module, whose state is a pair `(link,
112 | precedence_weights)`, where `link` is a (collection of) graphs for (possibly
113 | multiple) write heads (represented by a tensor with values in the range
114 | [0, 1]), and `precedence_weights` records the "previous write locations" used
115 | to build the link graphs.
116 |
117 | The function `directional_read_weights` computes addresses following the
118 | forward and backward directions in the link graphs.
119 | """
120 |
121 | def __init__(self, memory_size, num_writes, name='temporal_linkage'):
122 | """Construct a TemporalLinkage module.
123 |
124 | Args:
125 | memory_size: The number of memory slots.
126 | num_writes: The number of write heads.
127 | name: Name of the module.
128 | """
129 | super(TemporalLinkage, self).__init__(name=name)
130 | self._memory_size = memory_size
131 | self._num_writes = num_writes
132 |
133 | def _build(self, write_weights, prev_state):
134 | """Calculate the updated linkage state given the write weights.
135 |
136 | Args:
137 | write_weights: A tensor of shape `[batch_size, num_writes, memory_size]`
138 | containing the memory addresses of the different write heads.
139 |       prev_state: `TemporalLinkageState` tuple containing a tensor `link` of
140 | shape `[batch_size, num_writes, memory_size, memory_size]`, and a
141 | tensor `precedence_weights` of shape `[batch_size, num_writes,
142 | memory_size]` containing the aggregated history of recent writes.
143 |
144 | Returns:
145 | A `TemporalLinkageState` tuple `next_state`, which contains the updated
146 | link and precedence weights.
147 | """
148 | link = self._link(prev_state.link, prev_state.precedence_weights,
149 | write_weights)
150 | precedence_weights = self._precedence_weights(prev_state.precedence_weights,
151 | write_weights)
152 | return TemporalLinkageState(
153 | link=link, precedence_weights=precedence_weights)
154 |
155 | def directional_read_weights(self, link, prev_read_weights, forward):
156 | """Calculates the forward or the backward read weights.
157 |
158 | For each read head (at a given address), there are `num_writes` link graphs
159 | to follow. Thus this function computes a read address for each of the
160 | `num_reads * num_writes` pairs of read and write heads.
161 |
162 | Args:
163 | link: tensor of shape `[batch_size, num_writes, memory_size,
164 | memory_size]` representing the link graphs L_t.
165 | prev_read_weights: tensor of shape `[batch_size, num_reads,
166 | memory_size]` containing the previous read weights w_{t-1}^r.
167 | forward: Boolean indicating whether to follow the "future" direction in
168 | the link graph (True) or the "past" direction (False).
169 |
170 | Returns:
171 | tensor of shape `[batch_size, num_reads, num_writes, memory_size]`
172 | """
173 | with tf.name_scope('directional_read_weights'):
174 | # We calculate the forward and backward directions for each pair of
175 | # read and write heads; hence we need to tile the read weights and do a
176 | # sort of "outer product" to get this.
177 | expanded_read_weights = tf.stack([prev_read_weights] * self._num_writes,
178 | 1)
179 | result = tf.matmul(expanded_read_weights, link, adjoint_b=forward)
180 | # Swap dimensions 1, 2 so order is [batch, reads, writes, memory]:
181 | return tf.transpose(result, perm=[0, 2, 1, 3])
182 |
183 | def _link(self, prev_link, prev_precedence_weights, write_weights):
184 | """Calculates the new link graphs.
185 |
186 | For each write head, the link is a directed graph (represented by a matrix
187 | with entries in range [0, 1]) whose vertices are the memory locations, and
188 | an edge indicates temporal ordering of writes.
189 |
190 | Args:
191 | prev_link: A tensor of shape `[batch_size, num_writes, memory_size,
192 | memory_size]` representing the previous link graphs for each write
193 | head.
194 | prev_precedence_weights: A tensor of shape `[batch_size, num_writes,
195 | memory_size]` which is the previous "aggregated" write weights for
196 | each write head.
197 | write_weights: A tensor of shape `[batch_size, num_writes, memory_size]`
198 | containing the new locations in memory written to.
199 |
200 | Returns:
201 | A tensor of shape `[batch_size, num_writes, memory_size, memory_size]`
202 | containing the new link graphs for each write head.
203 | """
204 | with tf.name_scope('link'):
205 | batch_size = prev_link.get_shape()[0].value
206 | write_weights_i = tf.expand_dims(write_weights, 3)
207 | write_weights_j = tf.expand_dims(write_weights, 2)
208 | prev_precedence_weights_j = tf.expand_dims(prev_precedence_weights, 2)
209 | prev_link_scale = 1 - write_weights_i - write_weights_j
210 | new_link = write_weights_i * prev_precedence_weights_j
211 | link = prev_link_scale * prev_link + new_link
212 | # Return the link with the diagonal set to zero, to remove self-looping
213 | # edges.
214 | return tf.matrix_set_diag(
215 | link,
216 | tf.zeros(
217 | [batch_size, self._num_writes, self._memory_size],
218 | dtype=link.dtype))
219 |
220 | def _precedence_weights(self, prev_precedence_weights, write_weights):
221 | """Calculates the new precedence weights given the current write weights.
222 |
223 | The precedence weights are the "aggregated write weights" for each write
224 | head, where write weights with sum close to zero will leave the precedence
225 | weights unchanged, but with sum close to one will replace the precedence
226 | weights.
227 |
228 | Args:
229 | prev_precedence_weights: A tensor of shape `[batch_size, num_writes,
230 | memory_size]` containing the previous precedence weights.
231 | write_weights: A tensor of shape `[batch_size, num_writes, memory_size]`
232 | containing the new write weights.
233 |
234 | Returns:
235 | A tensor of shape `[batch_size, num_writes, memory_size]` containing the
236 | new precedence weights.
237 | """
238 | with tf.name_scope('precedence_weights'):
239 | write_sum = tf.reduce_sum(write_weights, 2, keep_dims=True)
240 | return (1 - write_sum) * prev_precedence_weights + write_weights
241 |
242 | @property
243 | def state_size(self):
244 | """Returns a `TemporalLinkageState` tuple of the state tensors' shapes."""
245 | return TemporalLinkageState(
246 | link=tf.TensorShape(
247 | [self._num_writes, self._memory_size, self._memory_size]),
248 | precedence_weights=tf.TensorShape([self._num_writes,
249 | self._memory_size]),)
250 |
251 |
252 | class Freeness(snt.RNNCore):
253 | """Memory usage that is increased by writing and decreased by reading.
254 |
255 | This module is a pseudo-RNNCore whose state is a tensor with values in
256 | the range [0, 1] indicating the usage of each of `memory_size` memory slots.
257 |
258 | The usage is:
259 |
260 | * Increased by writing, where usage is increased towards 1 at the write
261 | addresses.
262 | * Decreased by reading, where usage is decreased after reading from a
263 | location when free_gate is close to 1.
264 |
265 | The function `write_allocation_weights` can be invoked to get free locations
266 | to write to for a number of write heads.
267 | """
268 |
269 | def __init__(self, memory_size, name='freeness'):
270 | """Creates a Freeness module.
271 |
272 | Args:
273 | memory_size: Number of memory slots.
274 | name: Name of the module.
275 | """
276 | super(Freeness, self).__init__(name=name)
277 | self._memory_size = memory_size
278 |
279 | def _build(self, write_weights, free_gate, read_weights, prev_usage):
280 | """Calculates the new memory usage u_t.
281 |
282 | Memory that was written to in the previous time step will have its usage
283 | increased; memory that was read from and the controller says can be "freed"
284 | will have its usage decreased.
285 |
286 | Args:
287 | write_weights: tensor of shape `[batch_size, num_writes,
288 | memory_size]` giving write weights at previous time step.
289 | free_gate: tensor of shape `[batch_size, num_reads]` which indicates
290 | which read heads read memory that can now be freed.
291 | read_weights: tensor of shape `[batch_size, num_reads,
292 | memory_size]` giving read weights at previous time step.
293 | prev_usage: tensor of shape `[batch_size, memory_size]` giving
294 | usage u_{t - 1} at the previous time step, with entries in range
295 | [0, 1].
296 |
297 | Returns:
298 | tensor of shape `[batch_size, memory_size]` representing updated memory
299 | usage.
300 | """
301 | # Calculation of usage is not differentiable with respect to write weights.
302 | write_weights = tf.stop_gradient(write_weights)
303 | usage = self._usage_after_write(prev_usage, write_weights)
304 | usage = self._usage_after_read(usage, free_gate, read_weights)
305 | return usage
306 |
307 | def write_allocation_weights(self, usage, write_gates, num_writes):
308 | """Calculates freeness-based locations for writing to.
309 |
310 | This finds unused memory by ranking the memory locations by usage, for each
311 | write head. (For more than one write head, we use a "simulated new usage"
312 | which takes into account the fact that the previous write head will increase
313 | the usage in that area of the memory.)
314 |
315 | Args:
316 | usage: A tensor of shape `[batch_size, memory_size]` representing
317 | current memory usage.
318 | write_gates: A tensor of shape `[batch_size, num_writes]` with values in
319 | the range [0, 1] indicating how much each write head does writing
320 | based on the address returned here (and hence how much usage
321 | increases).
322 | num_writes: The number of write heads to calculate write weights for.
323 |
324 | Returns:
325 | tensor of shape `[batch_size, num_writes, memory_size]` containing the
326 | freeness-based write locations. Note that this isn't scaled by
327 | `write_gate`; this scaling must be applied externally.
328 | """
329 | with tf.name_scope('write_allocation_weights'):
330 | # expand gatings over memory locations
331 | write_gates = tf.expand_dims(write_gates, -1)
332 |
333 | allocation_weights = []
334 | for i in range(num_writes):
335 | allocation_weights.append(self._allocation(usage))
336 | # update usage to take into account writing to this new allocation
337 | usage += ((1 - usage) * write_gates[:, i, :] * allocation_weights[i])
338 |
339 | # Pack the allocation weights for the write heads into one tensor.
340 | return tf.stack(allocation_weights, axis=1)
341 |
342 | def _usage_after_write(self, prev_usage, write_weights):
343 |     """Calculates the new usage after writing to memory.
344 |
345 | Args:
346 | prev_usage: tensor of shape `[batch_size, memory_size]`.
347 | write_weights: tensor of shape `[batch_size, num_writes, memory_size]`.
348 |
349 | Returns:
350 | New usage, a tensor of shape `[batch_size, memory_size]`.
351 | """
352 | with tf.name_scope('usage_after_write'):
353 | # Calculate the aggregated effect of all write heads
354 | write_weights = 1 - tf.reduce_prod(1 - write_weights, [1])
355 | return prev_usage + (1 - prev_usage) * write_weights
356 |
357 | def _usage_after_read(self, prev_usage, free_gate, read_weights):
358 |     """Calculates the new usage after reading and freeing from memory.
359 |
360 | Args:
361 | prev_usage: tensor of shape `[batch_size, memory_size]`.
362 | free_gate: tensor of shape `[batch_size, num_reads]` with entries in the
363 | range [0, 1] indicating the amount that locations read from can be
364 | freed.
365 | read_weights: tensor of shape `[batch_size, num_reads, memory_size]`.
366 |
367 | Returns:
368 | New usage, a tensor of shape `[batch_size, memory_size]`.
369 | """
370 | with tf.name_scope('usage_after_read'):
371 | free_gate = tf.expand_dims(free_gate, -1)
372 | free_read_weights = free_gate * read_weights
373 | phi = tf.reduce_prod(1 - free_read_weights, [1], name='phi')
374 | return prev_usage * phi
375 |
376 | def _allocation(self, usage):
377 | r"""Computes allocation by sorting `usage`.
378 |
379 | This corresponds to the value a = a_t[\phi_t[j]] in the paper.
380 |
381 | Args:
382 | usage: tensor of shape `[batch_size, memory_size]` indicating current
383 | memory usage. This is equal to u_t in the paper when we only have one
384 | write head, but for multiple write heads, one should update the usage
385 | while iterating through the write heads to take into account the
386 | allocation returned by this function.
387 |
388 | Returns:
389 | Tensor of shape `[batch_size, memory_size]` corresponding to allocation.
390 | """
391 | with tf.name_scope('allocation'):
392 | # Ensure values are not too small prior to cumprod.
393 | usage = _EPSILON + (1 - _EPSILON) * usage
394 |
395 | nonusage = 1 - usage
396 | sorted_nonusage, indices = tf.nn.top_k(
397 | nonusage, k=self._memory_size, name='sort')
398 | sorted_usage = 1 - sorted_nonusage
399 | prod_sorted_usage = tf.cumprod(sorted_usage, axis=1, exclusive=True)
400 | sorted_allocation = sorted_nonusage * prod_sorted_usage
401 | inverse_indices = util.batch_invert_permutation(indices)
402 |
403 | # This final line "unsorts" sorted_allocation, so that the indexing
404 | # corresponds to the original indexing of `usage`.
405 | return util.batch_gather(sorted_allocation, inverse_indices)
406 |
407 | @property
408 | def state_size(self):
409 | """Returns the shape of the state tensor."""
410 | return tf.TensorShape([self._memory_size])
411 |
--------------------------------------------------------------------------------
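
Note: a NumPy sketch (not repository code) of `Freeness._allocation` for a single batch element, showing how sorting by usage and the exclusive cumulative product concentrate allocation weight on the least-used slots; the usage vector is made up.

import numpy as np

_EPSILON = 1e-6
usage = np.array([0.9, 0.1, 0.5, 0.3])
usage = _EPSILON + (1 - _EPSILON) * usage  # keep values away from zero

nonusage = 1 - usage
order = np.argsort(-nonusage)              # like tf.nn.top_k: most-free first
sorted_usage = usage[order]

# Exclusive cumulative product of the sorted usage values.
prod_sorted_usage = np.cumprod(np.concatenate(([1.0], sorted_usage[:-1])))
sorted_allocation = (1 - sorted_usage) * prod_sorted_usage

# "Unsort" back to the original slot order.
allocation = np.empty_like(sorted_allocation)
allocation[order] = sorted_allocation
print(allocation)  # ~[0.0015, 0.9, 0.015, 0.07]: slot 1 (least used) dominates
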
/src/lib/dnc.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """DNC Cores.
16 |
17 | These modules create a DNC core. They take input, pass parameters to the memory
18 | access module, and integrate the output of memory to form an output.
19 | """
20 |
21 | from __future__ import absolute_import
22 | from __future__ import division
23 | from __future__ import print_function
24 |
25 | import collections
26 | import numpy as np
27 | import sonnet as snt
28 | import tensorflow as tf
29 |
30 | from . import access
31 |
32 | DNCState = collections.namedtuple('DNCState', ('access_output', 'access_state',
33 | 'controller_state'))
34 |
35 |
36 | class DNC(snt.RNNCore):
37 | """DNC core module.
38 |
39 | Contains controller and memory access module.
40 | """
41 |
42 | def __init__(self,
43 | access_config,
44 | controller_config,
45 | output_size,
46 | clip_value=None,use_memory=True,
47 | name='dnc'):
48 | """Initializes the DNC core.
49 |
50 | Args:
51 | access_config: dictionary of access module configurations.
52 | controller_config: dictionary of controller (LSTM) module configurations.
53 | output_size: output dimension size of core.
54 |       clip_value: clips controller and core output values to between
55 |         `[-clip_value, clip_value]` if specified.
56 |       use_memory: if False, the memory read words are zeroed out before they
57 |         reach the controller and the output layer (useful for ablations).
58 |       name: module name (default 'dnc').
59 |     """
62 | super(DNC, self).__init__(name=name)
63 |
64 | with self._enter_variable_scope():
65 | self._controller = snt.LSTM(**controller_config)
66 | self._access = access.MemoryAccess(**access_config)
67 | self.use_memory=use_memory
68 | self._access_output_size = np.prod(self._access.output_size.as_list())
69 | self._output_size = output_size
70 | self._clip_value = clip_value or 0
71 |
72 | self._output_size = tf.TensorShape([output_size])
73 | self._state_size = DNCState(
74 | access_output=self._access_output_size,
75 | access_state=self._access.state_size,
76 | controller_state=self._controller.state_size)
77 |
78 | def _clip_if_enabled(self, x):
79 | if self._clip_value > 0:
80 | return tf.clip_by_value(x, -self._clip_value, self._clip_value)
81 | else:
82 | return x
83 |
84 | def _build(self, inputs, prev_state):
85 | """Connects the DNC core into the graph.
86 |
87 | Args:
88 | inputs: Tensor input.
89 |       prev_state: A `DNCState` tuple containing the fields `access_output`,
90 |         `access_state` and `controller_state`. `access_output` is a 3-D Tensor
91 |         of shape `[batch_size, num_reads, word_size]` containing read words.
92 |         `access_state` is a tuple of the access module's state, and
93 |         `controller_state` is a tuple of the controller module's state.
94 |
95 | Returns:
96 | A tuple `(output, next_state)` where `output` is a tensor and `next_state`
97 | is a `DNCState` tuple containing the fields `access_output`,
98 | `access_state`, and `controller_state`.
99 | """
100 |
101 | prev_access_output = prev_state.access_output
102 | prev_access_state = prev_state.access_state
103 | prev_controller_state = prev_state.controller_state
104 |
105 | batch_flatten = snt.BatchFlatten()
106 | if self.use_memory is False:
107 | prev_access_output=prev_access_output*0
108 | controller_input = tf.concat(
109 | [batch_flatten(inputs), batch_flatten(prev_access_output)], 1)
110 |
111 | controller_output, controller_state = self._controller(
112 | controller_input, prev_controller_state)
113 |
114 | controller_output = self._clip_if_enabled(controller_output)
115 | controller_state = snt.nest.map(self._clip_if_enabled, controller_state)
116 |
117 | access_output, access_state = self._access(controller_output,
118 | prev_access_state)
119 | if self.use_memory is False:
120 | access_output=access_output*0
121 | output = tf.concat([controller_output, batch_flatten(access_output)], 1)
122 | output = snt.Linear(
123 | output_size=self._output_size.as_list()[0],
124 | name='output_linear')(output)
125 | output = self._clip_if_enabled(output)
126 |
127 | return output, DNCState(
128 | access_output=access_output,
129 | access_state=access_state,
130 | controller_state=controller_state)
131 |
132 | def initial_state(self, batch_size, dtype=tf.float32):
133 | return DNCState(
134 | controller_state=self._controller.initial_state(batch_size, dtype),
135 | access_state=self._access.initial_state(batch_size, dtype),
136 | access_output=tf.zeros(
137 | [batch_size] + self._access.output_size.as_list(), dtype))
138 |
139 | @property
140 | def state_size(self):
141 | return self._state_size
142 |
143 | @property
144 | def output_size(self):
145 | return self._output_size
146 |
--------------------------------------------------------------------------------
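
Note: a minimal sketch of driving the DNC core above, mirroring how seq2seq.py wires it up; it assumes TensorFlow 1.x with Sonnet installed and is run from the src/ directory. The hyperparameters and static batch size are illustrative, not repository settings.

import tensorflow as tf
from lib.dnc import DNC

access_config = {"memory_size": 128, "word_size": 20,
                 "num_reads": 1, "num_writes": 1}
controller_config = {"hidden_size": 256}
batch_size, seq_len, input_size = 16, 10, 64  # assumed toy dimensions

dnc_cell = DNC(access_config=access_config,
               controller_config=controller_config,
               output_size=256, clip_value=20)

inputs = tf.placeholder(tf.float32, [batch_size, seq_len, input_size])
initial_state = dnc_cell.initial_state(batch_size)

# Unroll the core over time, exactly as the seq2seq encoder does.
outputs, final_state = tf.nn.dynamic_rnn(
    cell=dnc_cell, inputs=inputs,
    initial_state=initial_state, time_major=False)
# outputs: [batch_size, seq_len, 256]
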
/src/lib/seq2seq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # Copyright 2018 Cognibit Solutions LLP.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # ==============================================================================
17 | """
18 | Sequence to sequence DNC model, Training and prediction library.
19 | """
20 | import math
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 | import tensorflow.contrib.seq2seq as seq2seq
25 |
26 | from tensorflow.python.ops.rnn_cell import GRUCell
27 | from tensorflow.python.ops.rnn_cell import LSTMCell
28 | from tensorflow.python.ops.rnn_cell import MultiRNNCell
29 | from tensorflow.python.ops.rnn_cell import DropoutWrapper, ResidualWrapper
30 | from tensorflow.python.ops import array_ops
31 | from tensorflow.python.ops import control_flow_ops
32 | from tensorflow.python.framework import constant_op
33 | from tensorflow.python.framework import dtypes
34 | from tensorflow.python.layers.core import Dense
35 | from tensorflow.python.util import nest
36 |
37 | from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
38 | from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
39 |
40 | from .dnc import DNC
41 |
42 |
43 | class Seq2SeqModel(object):
44 | def __init__(self, config, mode, dnc_batch_size=None,use_memory=True):
45 | assert mode.lower() in ['train', 'decode']
46 |
47 | self.config = config
48 | self.mode = mode.lower()
49 |
50 | self.cell_type = config['cell_type']
51 | self.hidden_units = config['hidden_units']
52 | self.depth = config['depth']
53 | self.attention_type = config['attention_type']
54 | self.embedding_size = config['embedding_size']
55 | # self.bidirectional = config.bidirectional
56 | self.num_encoder_symbols = config['num_encoder_symbols']
57 | self.num_decoder_symbols = config['num_decoder_symbols']
58 | self.start_token = config['start_token']
59 | self.end_token = config['end_token']
60 | self.use_residual = config['use_residual']
61 | self.attn_input_feeding = config['attn_input_feeding']
62 | self.use_dropout = config['use_dropout']
63 | self.keep_prob = 1.0 - config['dropout_rate']
64 | self.global_step = tf.Variable(0, trainable=False, name='global_step')
65 | self.global_epoch_step = tf.Variable(0, trainable=False, name='global_epoch_step')
66 | self.global_epoch_step_op = \
67 | tf.assign(self.global_epoch_step, self.global_epoch_step + 1)
68 |
69 | self.dtype = tf.float16 if config['use_fp16'] else tf.float32
70 | self.keep_prob_placeholder = tf.placeholder(self.dtype, shape=[], name='keep_prob')
71 |
72 | self.use_beamsearch_decode = False
73 | if self.mode == 'decode':
74 | self.beam_width = config['beam_width']
75 |             self.use_beamsearch_decode = self.beam_width > 1
76 | self.max_decode_step = config['max_decode_step']
77 | else:
78 | self.optimizer = config['optimizer']
79 | self.learning_rate = config['learning_rate']
80 | self.max_gradient_norm = config['max_gradient_norm']
81 |
82 | if (self.cell_type == 'dnc'):
83 | self.num_reads = config['num_reads']
84 | self.num_writes = config['num_writes']
85 | self.word_size = config['word_size']
86 | self.memory_size = config['memory_size']
87 | self.clip_value = config['clip_value']
88 | cell_type = DNC
89 | access_config = {
90 | "memory_size": self.memory_size,
91 | "word_size": self.word_size,
92 | "num_reads": self.num_reads,
93 | "num_writes": self.num_writes,
94 | }
95 | controller_config = {
96 | "hidden_size": self.hidden_units,
97 | }
98 | self.dnc_cell = cell_type(access_config=access_config, controller_config=controller_config,
99 | output_size=self.hidden_units, clip_value=self.clip_value,use_memory=use_memory)
100 | self.dncInitial = self.dnc_cell.initial_state
101 |             # Dynamic batch size is not yet supported for the DNC, hence we use a static batch size
102 | self.dnc_batch_size=dnc_batch_size
103 |
104 | self.build_model()
105 |
106 | def build_model(self):
107 | print("building model..")
108 |
109 | # Building encoder and decoder networks
110 | self.init_placeholders()
111 | self.build_encoder()
112 | self.build_decoder()
113 |
114 | # Merge all the training summaries
115 | self.summary_op = tf.summary.merge_all()
116 |
117 | def init_placeholders(self):
118 | # encoder_inputs: [batch_size, max_time_steps]
119 |
120 | self.encoder_inputs = tf.placeholder(dtype=tf.int32,
121 | shape=(None, None), name='encoder_inputs')
122 |
123 | # encoder_inputs_length: [batch_size]
124 | self.encoder_inputs_length = tf.placeholder(
125 | dtype=tf.int32, shape=(None,), name='encoder_inputs_length')
126 |
127 | # get dynamic batch_size
128 | self.batch_size = tf.shape(self.encoder_inputs)[0]
129 | if self.mode == 'train':
130 | # decoder_inputs: [batch_size, max_time_steps]
131 | self.decoder_inputs = tf.placeholder(
132 | dtype=tf.int32, shape=(None, None), name='decoder_inputs')
133 | # decoder_inputs_length: [batch_size]
134 | self.decoder_inputs_length = tf.placeholder(
135 | dtype=tf.int32, shape=(None,), name='decoder_inputs_length')
136 |
137 | decoder_start_token = tf.ones(
138 | shape=[self.batch_size, 1], dtype=tf.int32) * self.start_token
139 | decoder_end_token = tf.ones(
140 | shape=[self.batch_size, 1], dtype=tf.int32) * self.end_token
141 |
142 | # decoder_inputs_train: [batch_size , max_time_steps + 1]
143 | # insert _GO symbol in front of each decoder input
144 | self.decoder_inputs_train = tf.concat([decoder_start_token,
145 | self.decoder_inputs], axis=1)
146 |
147 | # decoder_inputs_length_train: [batch_size]
148 | self.decoder_inputs_length_train = self.decoder_inputs_length + 1
149 |
150 | # decoder_targets_train: [batch_size, max_time_steps + 1]
151 | # insert EOS symbol at the end of each decoder input
152 | self.decoder_targets_train = tf.concat([self.decoder_inputs,
153 | decoder_end_token], axis=1)
154 |
155 | def init_encoder_variable(self):
156 | self.encoder_cell = self.build_encoder_cell()
157 |
158 | # Initialize encoder_embeddings to have variance=1.
159 | sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1.
160 | initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=self.dtype)
161 |
162 | self.encoder_embeddings = tf.get_variable(name='embedding',
163 | shape=[self.num_encoder_symbols, self.embedding_size],
164 | initializer=initializer, dtype=self.dtype)
165 |
166 | # Embedded_inputs: [batch_size, time_step, embedding_size]
167 | self.encoder_inputs_embedded = tf.nn.embedding_lookup(
168 | params=self.encoder_embeddings, ids=self.encoder_inputs)
169 |
170 | # Input projection layer to feed embedded inputs to the cell
171 | # ** Essential when use_residual=True to match input/output dims
172 | input_layer = Dense(self.hidden_units, dtype=self.dtype, name='input_projection')
173 |
174 | # Embedded inputs having gone through input projection layer
175 | self.encoder_inputs_embedded = input_layer(self.encoder_inputs_embedded)
176 |
177 | # Encode input sequences into context vectors:
178 | # encoder_outputs: [batch_size, max_time_step, cell_output_size]
179 | # encoder_state: [batch_size, cell_output_size]
180 |
181 | if (self.cell_type == 'dnc'):
182 | initial_state = self.dncInitial(self.dnc_batch_size)
183 | self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn(
184 | cell=self.encoder_cell, inputs=self.encoder_inputs_embedded,
185 | sequence_length=self.encoder_inputs_length, dtype=self.dtype,
186 | time_major=False, initial_state=initial_state)
187 | else:
188 | self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn(
189 | cell=self.encoder_cell, inputs=self.encoder_inputs_embedded,
190 | sequence_length=self.encoder_inputs_length, dtype=self.dtype,
191 | time_major=False)
192 |
193 | def build_encoder(self):
194 | print("building encoder..")
195 | try:
196 | with tf.variable_scope('encoder'):
197 | self.init_encoder_variable()
198 |         except ValueError:  # encoder variables already exist, so reuse them
199 |             with tf.variable_scope('encoder', reuse=True):
200 |                 self.init_encoder_variable()
201 |
202 | def init_decoder_variable(self):
203 | # Building decoder_cell and decoder_initial_state
204 | self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()
205 |
206 | # Initialize decoder embeddings to have variance=1.
207 | sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1.
208 | initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=self.dtype)
209 |
210 | self.decoder_embeddings = tf.get_variable(name='embedding',
211 | shape=[self.num_decoder_symbols, self.embedding_size],
212 | initializer=initializer, dtype=self.dtype)
213 |
214 | # Input projection layer to feed embedded inputs to the cell
215 | # ** Essential when use_residual=True to match input/output dims
216 | input_layer = Dense(self.hidden_units, dtype=self.dtype, name='input_projection')
217 |
218 | # Output projection layer to convert cell_outputs to logits
219 | output_layer = Dense(self.num_decoder_symbols, name='output_projection')
220 |
221 | if self.mode == 'train':
222 | # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
223 | self.decoder_inputs_embedded = tf.nn.embedding_lookup(
224 | params=self.decoder_embeddings, ids=self.decoder_inputs_train)
225 |
226 | # Embedded inputs having gone through input projection layer
227 | self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded)
228 |
229 | # Helper to feed inputs for training: read inputs from dense ground truth vectors
230 | training_helper = seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded,
231 | sequence_length=self.decoder_inputs_length_train,
232 | time_major=False,
233 | name='training_helper')
234 |
235 | training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
236 | helper=training_helper,
237 | initial_state=self.decoder_initial_state,
238 | output_layer=output_layer)
239 | # output_layer=None)
240 |
241 | # Maximum decoder time_steps in current batch
242 | max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)
243 |
244 | # decoder_outputs_train: BasicDecoderOutput
245 | # namedtuple(rnn_outputs, sample_id)
246 | # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
247 | # [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
248 | # decoder_outputs_train.sample_id: [batch_size], tf.int32
249 | (self.decoder_outputs_train, self.decoder_last_state_train,
250 | self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
251 | decoder=training_decoder,
252 | output_time_major=False,
253 | impute_finished=True,
254 | maximum_iterations=max_decoder_length))
255 |
256 | # More efficient to do the projection on the batch-time-concatenated tensor
257 | # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
258 | # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
259 | self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output)
260 | # Use argmax to extract decoder symbols to emit
261 | self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1,
262 | name='decoder_pred_train')
263 |
264 | # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
265 | masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train,
266 | maxlen=max_decoder_length, dtype=self.dtype, name='masks')
267 |
268 | # Computes per word average cross-entropy over a batch
269 | # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
270 | self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train,
271 | targets=self.decoder_targets_train,
272 | weights=masks,
273 | average_across_timesteps=True,
274 | average_across_batch=True, )
275 | # Training summary for the current batch_loss
276 | tf.summary.scalar('loss', self.loss)
277 |
278 |             # Construct the graph for minimizing the loss
279 | self.init_optimizer()
280 |
281 | elif self.mode == 'decode':
282 |
283 | # Start_tokens: [batch_size,] `int32` vector
284 | start_tokens = tf.ones([self.batch_size, ], tf.int32) * self.start_token
285 | end_token = self.end_token
286 |
287 | def embed_and_input_proj(inputs):
288 | return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs))
289 |
290 | if not self.use_beamsearch_decode:
291 | # Helper to feed inputs for greedy decoding: uses the argmax of the output
292 | decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens,
293 | end_token=end_token,
294 | embedding=embed_and_input_proj)
295 | # Basic decoder performs greedy decoding at each time step
296 | print("building greedy decoder..")
297 | inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
298 | helper=decoding_helper,
299 | initial_state=self.decoder_initial_state,
300 | output_layer=output_layer)
301 | else:
302 | # Beamsearch is used to approximately find the most likely translation
303 | print("building beamsearch decoder..")
304 | inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.decoder_cell,
305 | embedding=embed_and_input_proj,
306 | start_tokens=start_tokens,
307 | end_token=end_token,
308 | initial_state=self.decoder_initial_state,
309 | beam_width=self.beam_width,
310 | output_layer=output_layer, )
311 |
312 | (self.decoder_outputs_decode, self.decoder_last_state_decode,
313 | self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
314 | decoder=inference_decoder,
315 | output_time_major=False,
316 | # impute_finished=True, # error occurs
317 | maximum_iterations=self.max_decode_step))
318 |
319 | if not self.use_beamsearch_decode:
320 | # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
321 | # Or use argmax to find decoder symbols to emit:
322 | # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
323 | # axis=-1, name='decoder_pred_decode')
324 |
325 | # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
326 | # decoder_pred_decode: [batch_size, max_time_step, 1] (output_time_major=False)
327 | self.decoder_pred_decode = tf.expand_dims(self.decoder_outputs_decode.sample_id, -1)
328 |
329 | else:
330 | # Use beam search to approximately find the most likely translation
331 | # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_time_major=False)
332 | self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
333 |
334 | def build_decoder(self):
335 | print("building decoder and attention..")
336 | try:
337 | with tf.variable_scope('decoder'):
338 | self.init_decoder_variable()
339 | except ValueError:  # variables already exist, so reopen the scope with reuse
340 | with tf.variable_scope('decoder', reuse=True):
341 | self.init_decoder_variable()
342 |
343 | def build_single_cell(self):
344 |
345 | if (self.cell_type.lower() == 'gru'):
346 | cell_type = GRUCell
347 | cell = cell_type(self.hidden_units)
348 | else:
349 | cell_type = LSTMCell
350 | cell = cell_type(self.hidden_units)
351 |
352 | if self.use_dropout:
353 | cell = DropoutWrapper(cell, dtype=self.dtype,
354 | output_keep_prob=self.keep_prob_placeholder, )
355 | if self.use_residual:
356 | cell = ResidualWrapper(cell)
357 |
358 | return cell
359 |
360 | # Building encoder cell
361 | def build_encoder_cell(self):
362 | if (self.cell_type.lower() != 'dnc'):
363 | return MultiRNNCell([self.build_single_cell() for i in range(self.depth)])
364 | else:
365 | return self.dnc_cell
366 |
367 | # Building decoder cell and attention. Also returns decoder_initial_state
368 | def build_decoder_cell(self):
369 |
370 | encoder_outputs = self.encoder_outputs
371 | encoder_last_state = self.encoder_last_state
372 | encoder_inputs_length = self.encoder_inputs_length
373 | # To use BeamSearchDecoder, encoder_outputs, encoder_last_state, encoder_inputs_length
374 | # needs to be tiled so that: [batch_size, .., ..] -> [batch_size x beam_width, .., ..]
375 | if self.use_beamsearch_decode:
376 | print("use beamsearch decoding..")
377 | encoder_outputs = seq2seq.tile_batch(
378 | self.encoder_outputs, multiplier=self.beam_width)
379 | encoder_last_state = nest.map_structure(
380 | lambda s: seq2seq.tile_batch(s, self.beam_width), self.encoder_last_state)
381 | encoder_inputs_length = seq2seq.tile_batch(
382 | self.encoder_inputs_length, multiplier=self.beam_width)
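# Tiling illustration (our note): with batch_size = 32, beam_width = 5 and
# encoder_outputs of shape [32, 20, 256], tile_batch repeats each batch entry
# 5 times, yielding [160, 20, 256], so every beam hypothesis attends over its
# own copy of the encoder memory.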
383 |
384 | # Building attention mechanism: Default Bahdanau
385 | # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473
386 | self.attention_mechanism = attention_wrapper.BahdanauAttention(
387 | num_units=self.hidden_units, memory=encoder_outputs,
388 | memory_sequence_length=encoder_inputs_length, )
389 | # 'Luong' style attention: https://arxiv.org/abs/1508.04025
390 | if self.attention_type.lower() == 'luong':
391 | self.attention_mechanism = attention_wrapper.LuongAttention(
392 | num_units=self.hidden_units, memory=encoder_outputs,
393 | memory_sequence_length=encoder_inputs_length, )
394 |
395 | def attn_decoder_input_fn(inputs, attention):
396 | if not self.attn_input_feeding:
397 | return inputs
398 |
399 | # Essential when use_residual=True
400 | _input_layer = Dense(self.hidden_units, dtype=self.dtype,
401 | name='attn_input_feeding')
402 | return _input_layer(array_ops.concat([inputs, attention], -1))
403 |
404 | if (self.cell_type.lower() != 'dnc'):
405 | # Building decoder_cell
406 | self.decoder_cell_list = [
407 | self.build_single_cell() for i in range(self.depth)]
408 | decoder_initial_state = encoder_last_state
409 |
410 | # AttentionWrapper wraps RNNCell with the attention_mechanism
411 | # Note: We implement Attention mechanism only on the top decoder layer
412 | self.decoder_cell_list[-1] = attention_wrapper.AttentionWrapper(
413 | cell=self.decoder_cell_list[-1],
414 | attention_mechanism=self.attention_mechanism,
415 | attention_layer_size=self.hidden_units,
416 | cell_input_fn=attn_decoder_input_fn,
417 | initial_cell_state=encoder_last_state[-1],
418 | alignment_history=False,
419 | name='Attention_Wrapper')
420 |
421 | # To be compatible with AttentionWrapper, the encoder last state
422 | # of the top layer should be converted into the AttentionWrapperState form
423 | # We can easily do this by calling AttentionWrapper.zero_state
424 |
425 | # Also, if beamsearch decoding is used, the batch_size argument of .zero_state
426 | # should be ${beam_width} times the original batch_size
427 | batch_size = self.batch_size if not self.use_beamsearch_decode \
428 | else self.batch_size * self.beam_width
429 | # Initialised with this encoder state
430 |
431 | initial_state = list(encoder_last_state)
432 |
433 | initial_state[-1] = self.decoder_cell_list[-1].zero_state(
434 | batch_size=batch_size, dtype=self.dtype)
435 | decoder_initial_state = tuple(initial_state)
436 |
437 | return MultiRNNCell(self.decoder_cell_list), decoder_initial_state
438 | else:
439 | decoder_cell = attention_wrapper.AttentionWrapper(
440 | cell=self.dnc_cell,
441 | attention_mechanism=self.attention_mechanism,
442 | attention_layer_size=self.hidden_units,
443 | cell_input_fn=attn_decoder_input_fn,
444 | initial_cell_state=encoder_last_state,
445 | alignment_history=False,
446 | name='Attention_Wrapper')
447 | decoder_initial_state = decoder_cell.zero_state(batch_size=self.dnc_batch_size, dtype=self.dtype)
448 | return decoder_cell, decoder_initial_state
449 |
450 | def init_optimizer(self):
451 | print("setting optimizer..")
452 | # Gradients and SGD update operation for training the model
453 | trainable_params = tf.trainable_variables()
454 | if self.optimizer.lower() == 'adadelta':
455 | self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate)
456 | elif self.optimizer.lower() == 'adam':
457 | self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
458 | elif self.optimizer.lower() == 'rmsprop':
459 | self.opt = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate)
460 | else:
461 | self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
462 |
463 | # Compute gradients of loss w.r.t. all trainable variables
464 | gradients = tf.gradients(self.loss, trainable_params)
465 |
466 | # Clip gradients by a given maximum_gradient_norm
467 | clip_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
468 |
469 | # Update the model
470 | self.updates = self.opt.apply_gradients(
471 | zip(clip_gradients, trainable_params), global_step=self.global_step)
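# Clipping illustration (our note): for gradients g1 = [3.] and g2 = [4.] the
# global norm is sqrt(3^2 + 4^2) = 5.0; with max_gradient_norm = 1.0,
#     clipped, norm = tf.clip_by_global_norm([g1, g2], clip_norm=1.0)
# scales both by 1/5, giving clipped == ([0.6], [0.8]) and norm == 5.0
# (the pre-clip global norm).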
472 |
473 | def save(self, sess, path, var_list=None, global_step=None):
474 | # var_list=None saves all saveable variables
475 | saver = tf.train.Saver(var_list)
476 |
477 | # temporary code
478 | # del tf.get_collection_ref('LAYER_NAME_UIDS')[0]
479 | save_path = saver.save(sess, save_path=path, global_step=global_step)
480 | print('model saved at %s' % save_path)
481 |
482 | def restore(self, sess, path, var_list=None):
483 | # var_list=None restores all saveable variables
484 | saver = tf.train.Saver(var_list, reshape=True)
485 | saver.restore(sess, save_path=path)
486 | print('model restored from %s' % path)
487 |
488 | def train(self, sess, encoder_inputs, encoder_inputs_length,
489 | decoder_inputs, decoder_inputs_length):
490 | """Run a train step of the model feeding the given inputs.
491 |
492 | Args:
493 | sess: TensorFlow session to use.
494 | encoder_inputs: a numpy int matrix of [batch_size, max_source_time_steps]
495 | to feed as encoder inputs
496 | encoder_inputs_length: a numpy int vector of [batch_size]
497 | to feed as sequence lengths for each element in the given batch
498 | decoder_inputs: a numpy int matrix of [batch_size, max_target_time_steps]
499 | to feed as decoder inputs
500 | decoder_inputs_length: a numpy int vector of [batch_size]
501 | to feed as sequence lengths for each element in the given batch
502 |
503 | Returns:
504 | A pair consisting of the loss for the current batch and the
505 | training summary.
506 | """
507 | # Check if the model is 'training' mode
508 | if self.mode.lower() != 'train':
509 | raise ValueError("The train step can only be run in train mode")
510 |
511 | input_feed = self.check_feeds(encoder_inputs, encoder_inputs_length,
512 | decoder_inputs, decoder_inputs_length, False)
513 | # Input feeds for dropout
514 | input_feed[self.keep_prob_placeholder.name] = self.keep_prob
515 |
516 | output_feed = [self.updates, # Update Op that does optimization
517 | self.loss, # Loss for current batch
518 | self.summary_op] # Training summary
519 |
520 | outputs = sess.run(output_feed, input_feed)
521 | return outputs[1], outputs[2] # loss, summary
522 |
523 | def eval(self, sess, encoder_inputs, encoder_inputs_length,
524 | decoder_inputs, decoder_inputs_length):
525 | """Run a evaluation step of the model feeding the given inputs.
526 |
527 | Args:
528 | sess: TensorFlow session to use.
529 | encoder_inputs: a numpy int matrix of [batch_size, max_source_time_steps]
530 | to feed as encoder inputs
531 | encoder_inputs_length: a numpy int vector of [batch_size]
532 | to feed as sequence lengths for each element in the given batch
533 | decoder_inputs: a numpy int matrix of [batch_size, max_target_time_steps]
534 | to feed as decoder inputs
535 | decoder_inputs_length: a numpy int vector of [batch_size]
536 | to feed as sequence lengths for each element in the given batch
537 |
538 | Returns:
539 | A pair consisting of the loss for the current batch and the
540 | evaluation summary.
541 | """
542 |
543 | input_feed = self.check_feeds(encoder_inputs, encoder_inputs_length,
544 | decoder_inputs, decoder_inputs_length, False)
545 | # Input feeds for dropout
546 | input_feed[self.keep_prob_placeholder.name] = 1.0
547 |
548 | output_feed = [self.loss, # Loss for current batch
549 | self.summary_op] # Training summary
550 | outputs = sess.run(output_feed, input_feed)
551 | return outputs[0], outputs[1]  # loss, summary
552 |
553 | def predict(self, sess, encoder_inputs, encoder_inputs_length):
554 |
555 | input_feed = self.check_feeds(encoder_inputs, encoder_inputs_length,
556 | decoder_inputs=None, decoder_inputs_length=None,
557 | decode=True)
558 |
559 | # Input feeds for dropout
560 | input_feed[self.keep_prob_placeholder.name] = 1.0
561 |
562 | output_feed = [self.decoder_pred_decode]
563 | outputs = sess.run(output_feed, input_feed)
564 |
565 | # GreedyDecoder: [batch_size, max_time_step]
566 | return outputs[0] # BeamSearchDecoder: [batch_size, max_time_step, beam_width]
567 |
568 | def check_feeds(self, encoder_inputs, encoder_inputs_length,
569 | decoder_inputs, decoder_inputs_length, decode):
570 | """
571 | Args:
572 | encoder_inputs: a numpy int matrix of [batch_size, max_source_time_steps]
573 | to feed as encoder inputs
574 | encoder_inputs_length: a numpy int vector of [batch_size]
575 | to feed as sequence lengths for each element in the given batch
576 | decoder_inputs: a numpy int matrix of [batch_size, max_target_time_steps]
577 | to feed as decoder inputs
578 | decoder_inputs_length: a numpy int vector of [batch_size]
579 | to feed as sequence lengths for each element in the given batch
580 | decode: a scalar boolean that indicates decode mode
581 | Returns:
582 | A feed for the model that consists of encoder_inputs, encoder_inputs_length,
583 | decoder_inputs, decoder_inputs_length
584 | """
585 |
586 | input_batch_size = encoder_inputs.shape[0]
587 | if input_batch_size != encoder_inputs_length.shape[0]:
588 | raise ValueError("Encoder inputs and their lengths must be equal in their "
589 | "batch_size, %d != %d" % (input_batch_size, encoder_inputs_length.shape[0]))
590 |
591 | if not decode:
592 | target_batch_size = decoder_inputs.shape[0]
593 | if target_batch_size != input_batch_size:
594 | raise ValueError("Encoder inputs and Decoder inputs must be equal in their "
595 | "batch_size, %d != %d" % (input_batch_size, target_batch_size))
596 | if target_batch_size != decoder_inputs_length.shape[0]:
597 | raise ValueError("Decoder targets and their lengths must be equal in their "
598 | "batch_size, %d != %d" % (target_batch_size, decoder_inputs_length.shape[0]))
599 |
600 | input_feed = {}
601 |
602 | input_feed[self.encoder_inputs.name] = encoder_inputs
603 | input_feed[self.encoder_inputs_length.name] = encoder_inputs_length
604 |
605 | if not decode:
606 | input_feed[self.decoder_inputs.name] = decoder_inputs
607 | input_feed[self.decoder_inputs_length.name] = decoder_inputs_length
608 |
609 | return input_feed
610 |
--------------------------------------------------------------------------------
/src/lib/util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """DNC util ops and modules."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import numpy as np
22 | import tensorflow as tf
23 |
24 |
25 | def batch_invert_permutation(permutations):
26 | """Returns batched `tf.invert_permutation` for every row in `permutations`."""
27 | with tf.name_scope('batch_invert_permutation', values=[permutations]):
28 | unpacked = tf.unstack(permutations)
29 | inverses = [tf.invert_permutation(permutation) for permutation in unpacked]
30 | return tf.stack(inverses)
31 |
32 |
33 | def batch_gather(values, indices):
34 | """Returns batched `tf.gather` for every row in the input."""
35 | with tf.name_scope('batch_gather', values=[values, indices]):
36 | unpacked = zip(tf.unstack(values), tf.unstack(indices))
37 | result = [tf.gather(value, index) for value, index in unpacked]
38 | return tf.stack(result)
39 |
40 |
41 | def one_hot(length, index):
42 | """Return an nd array of given `length` filled with 0s and a 1 at `index`."""
43 | result = np.zeros(length)
44 | result[index] = 1
45 | return result
46 |
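# A minimal usage sketch (our addition, not part of the original module),
# exercising the three helpers on toy data with a TF1-style session:
if __name__ == '__main__':
    perms = tf.constant([[1, 2, 0], [2, 0, 1]])
    vals = tf.constant([[10., 20., 30.], [40., 50., 60.]])
    with tf.Session() as sess:
        # each row is the inverse permutation of the input row:
        print(sess.run(batch_invert_permutation(perms)))  # [[2 0 1] [1 2 0]]
        # row-wise gather:
        print(sess.run(batch_gather(vals, perms)))  # [[20. 30. 10.] [60. 40. 50.]]
    print(one_hot(4, 2))  # [0. 0. 1. 0.]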
--------------------------------------------------------------------------------
/src/preprocessing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Cognibit Solutions LLP.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """
16 | This script prepares the raw data for the next stage of normalization.
17 | """
18 |
19 | import os
20 | import sys
21 | import pandas as pd
22 | from multiprocessing import Pool
23 |
24 |
25 | def preprocessing(file):
26 | print('Launch Processing of {}'.format(file))
27 | output = file+'_processed.csv'
28 |
29 | # By default, Pandas treats a double quote as enclosing an entry, so it includes all tabs and newlines
30 | # in that entry until it reaches the next quote. To escape this we set the quoting argument to
31 | # QUOTE_NONE (3) as given in the documentation - [https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html]
32 | raw_data = pd.read_csv(file, header=None, sep='\t', quoting=3, names=['semiotic', 'before', 'after'])
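# Illustration (our note, not original): a raw line such as
#     PUNCT\t"\tsil
# contains a bare double quote; with default quoting, pandas would treat it as
# opening a quoted field and silently merge the following rows into one entry.
# QUOTE_NONE makes the quote an ordinary character.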
33 |
34 | # Generating sentence and word token ids
35 | # Our text normalization approach requires sentence and token ids to encode and generate batches
36 | data = pd.DataFrame(columns=['sentence_id',
37 | 'token_id',
38 | 'semiotic',
39 | 'before',
40 | 'after'])
41 | # initialize columns and iterator
42 | sentence_id = 0
43 | token_id = -1
44 |
45 | # heavy processing ahead
46 | for row in raw_data.itertuples():
47 | # look for end of sentences
48 | if row.semiotic == '<eos>' and row.before == '<eos>':
49 | sentence_id += 1
50 | token_id = -1
51 | continue
52 | else:
53 | token_id += 1
54 |
55 | new_row = {'sentence_id': sentence_id,
56 | 'token_id': token_id,
57 | 'semiotic': row.semiotic,
58 | 'before': row.before,
59 | 'after': row.after}
60 | data = data.append(new_row, ignore_index=True)
61 | print('Processing Sentence#{} of {}'.format(sentence_id, file))
62 |
63 | # **Transforming 'after' tokens**
64 | # From the above-mentioned paper:
65 | # ```
66 | # Semiotic class instances are verbalized as sequences
67 | # of fully spelled words, most ordinary words are left alone
68 | # (represented here as <self>), and punctuation symbols are mostly
69 | # transduced to sil (for “silence”).
70 | # ```
71 | # Hence we transform as follows:
72 | # 1. 'sil' is replaced with <self>
73 | # 2. <self> is replaced with the 'before' column
74 | #
75 | sil_mask = (data['after'] == 'sil')
76 | data.loc[sil_mask, 'after'] = '<self>'
77 | self_mask = (data['after'] == '<self>')
78 | data.loc[self_mask, 'after'] = data.loc[self_mask, 'before']
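# Worked example (illustration only): a row ('PUNCT', ',', 'sil') becomes
# ('PUNCT', ',', '<self>') after step 1 and ('PUNCT', ',', ',') after step 2,
# while ('PLAIN', 'hello', '<self>') becomes ('PLAIN', 'hello', 'hello').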
79 |
80 | # Exporting Data
81 | data.to_csv(output, index=False)
82 | print('Done {}'.format(file))
83 | return True
84 |
85 | def split_dataframe(df, size=10*1024*1024):
86 | """Splits huge dataframes(CSVs) into smaller segments of given size in bytes"""
87 |
88 | # size of each row
89 | row_size = df.memory_usage().sum() / len(df)
90 | # maximum number of rows in each segment
91 | row_limit = int(size // row_size)
92 | # number of segments
93 | seg_num = (len(df)+row_limit-1)//row_limit
94 | # split df into segments
95 | segments = [df.iloc[i*row_limit : (i+1)*row_limit] for i in range(seg_num)]
96 |
97 | return segments
98 |
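# Usage sketch (our addition, assuming a DataFrame df is in scope):
#     parts = split_dataframe(df)            # default limit: 10 MiB per part
#     assert sum(len(p) for p in parts) == len(df)
# Rows are never split across segments; only the last part may be shorter.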
99 |
100 | if __name__ == '__main__':
101 | path = sys.argv[1]
102 | jobs = int(sys.argv[2])
103 |
104 | # split large CSVs
105 | for dirpath, _, filenames in os.walk(path):
106 | for file in filenames:
107 | df = pd.read_csv(os.path.join(dirpath, file), header=None, sep='\t', quoting=3, names=['semiotic', 'before', 'after'])
108 | df_splits = split_dataframe(df, 10*1024*1024)
109 | # save each split and delete original
110 | for i in range(len(df_splits)):
111 | split_file = file+'_part{}'.format(i+1)
112 | df_splits[i].to_csv(os.path.join(dirpath, split_file), sep='\t', header=False, index=False)  # keep the raw tab-separated format
113 | os.remove(os.path.join(dirpath, file))
114 | print("Splitted original file into chunks...")
115 |
116 | files=[]
117 | for dirpath, _, filenames in os.walk(path):
118 | for file in filenames:
119 | files.append(os.path.join(dirpath, file))
120 |
121 | pool=Pool(jobs)
122 | pool.map(preprocessing, files)
123 |
124 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Cognibit Solutions LLP.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """
16 |
17 | Utility Modules for Text Normalization
18 | """
19 |
20 | import pickle
21 | import numpy as np
22 | from multiprocessing import Pool
23 |
24 |
25 | class Encoder:
26 | def __init__(self, vocab_file, wlook=3, time_major=False):
27 | self.vocab_file = vocab_file
28 | self.wlook = wlook
29 | self.time_major = time_major
30 |
31 | def encode(self, df, nthreads=8):
32 | if (nthreads < 1):
33 | raise ValueError('nthreads must be at least 1, cannot proceed!')
34 | else:
35 | row_len = df.shape[0]
36 | batch_len = int(row_len / nthreads)
37 | last_batch = row_len % nthreads
38 | batches = []
39 | for i in range(nthreads):
40 | if (i != nthreads - 1):
41 | batches.append(df.iloc[i * batch_len:i * batch_len + batch_len])
42 | else:
43 | batches.append(df.iloc[i * batch_len:])
44 | threads = Pool(nthreads)
45 | encoded_dfs = threads.map(self.run_single_batch, batches)
46 | encoding, encoding_len = zip(*encoded_dfs)
47 | col_len = 0
48 | for e in encoding:
49 | if (e.shape[1] > col_len):
50 | col_len = e.shape[1]
51 | encoding = list(encoding)
52 | for i in range(len(encoding)):
53 | encoding[i] = np.concatenate((encoding[i],
54 | np.zeros([encoding[i].shape[0], col_len - encoding[i].shape[1]])), axis=1)
55 | encoding = np.concatenate(encoding)
56 | encoding_len = np.concatenate(encoding_len)
57 | return encoding, encoding_len
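# Shape sketch (illustration): if two workers return encodings of shape
# (50, 12) and (50, 17), the first is right-padded with zeros to (50, 17) and
# the two are stacked into a single (100, 17) matrix, while encoding_len keeps
# the true, unpadded length of every row.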
58 |
59 | def run_single_batch(self, df):
60 | batch_gen = EncodingGenerator(self.vocab_file, self.wlook, self.time_major)
61 | return batch_gen.encode(df)
62 |
63 |
64 | class EncodingGenerator:
65 | def __init__(self, vocab_file, wlook=3, time_major=False):
66 | self.train_grp = None
67 | self.row_len = None
68 | with open(vocab_file, 'rb') as handle:
69 | self.vocab_dict = pickle.loads(handle.read())
70 | self.sent_id = 0
71 | self.token_id = 0
72 | self.row_count = 0
73 | self.wlook = wlook
74 | self.time_major = time_major
75 | self.group_keys = None
76 |
77 | def __input_lookup(self, char):
78 | if (char in self.vocab_dict['input']):
79 | return self.vocab_dict['input'][char]
80 | else:
81 | return self.vocab_dict['input']['<unk>']
82 |
83 | def __input_word_lookup(self, word):
84 | lookups = []
85 | word = str(word)
87 | for c in word:
88 | lookups.append(self.__input_lookup(c))
89 | return lookups
90 |
91 | def __next_element(self):
92 | sent = self.train_grp.get_group(self.group_keys[self.sent_id])
93 | if (self.token_id > sent.shape[0] - 1):
94 | self.sent_id = (self.sent_id + 1) % self.train_grp.ngroups
95 | self.token_id = 0
96 | sent = self.train_grp.get_group(self.group_keys[self.sent_id])
97 | token_count = sent.shape[0]
98 | row_dict = dict()
99 | new_row = []
100 | for k in range(-self.wlook, self.wlook + 1):
101 | if (k == 0):
102 | new_row.append(self.__input_lookup('<norm>'))
103 | lookup = self.__input_word_lookup(sent.iloc[k + self.token_id, :]['before'])
104 | new_row.extend(lookup)
105 | new_row.append(self.__input_lookup('</norm>'))
106 | new_row.append(self.__input_lookup(' '))
107 | elif 0 <= self.token_id + k <= token_count - 1:
108 | lookup = self.__input_word_lookup(sent.iloc[k + self.token_id, :]['before'])
109 | new_row.extend(lookup)
110 | new_row.append(self.__input_lookup(' '))
111 | new_row.append(self.__input_lookup('<eos>'))
112 | self.token_id = self.token_id + 1
113 | return new_row
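# Layout sketch (illustration, assuming the marker tokens used above): with
# wlook = 3, the row for the token "120" in "I have 120 cats ." is encoded
# character by character as
#     I _ h a v e _ <norm> 1 2 0 </norm> _ c a t s _ . _ <eos>
# where _ denotes the space token and out-of-sentence context is skipped.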
114 |
115 | def encode(self, df):
116 | self.train_grp = df.groupby(by='sentence_id')
117 | self.row_len = df.shape[0]
118 | self.group_keys = list(self.train_grp.groups.keys())
119 | input_batches = []
120 | max_inp_len = 0
121 | for b in range(self.row_len):
122 | i = self.__next_element()
123 | input_batches.append(i)
124 | if (len(i) > max_inp_len):
125 | max_inp_len = len(i)
126 | # Add the padding characters
127 | input_batches_len = np.zeros([self.row_len])
128 | count = 0
129 | for b in input_batches:
130 | input_batches_len[count] = len(b)
131 | count = count + 1
132 | for i in range(0, max_inp_len - len(b)):
133 | b.append(self.__input_lookup('<pad>'))
134 |
135 | input_batches = np.array(input_batches)
136 |
137 | if self.time_major:
138 | input_batches = input_batches.T
139 |
140 | return input_batches, input_batches_len
141 |
142 |
143 | class Normalized2String:
144 | def __init__(self, vocab_file):
145 | with open(vocab_file, 'rb') as handle:
146 | self.vocab_dict = pickle.loads(handle.read())
147 | output_id_dict = self.vocab_dict['output']
148 | self.output_id_dict_rev = {v: k for k, v in output_id_dict.items()}
149 |
150 | def to_str(self, prediction):
151 | """
152 | prediction : A 1D numpy array
153 | """
154 | final_str = ''
155 | for id in prediction:
156 | word = self.__output_lookup_inverse(id)
157 | if word == '<eos>':
158 | break
159 | else:
160 | final_str = final_str +' '+ str(word)
161 | return final_str[1:]
162 |
163 | def __output_lookup_inverse(self, id):
164 | return self.output_id_dict_rev[id]
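# Usage sketch (illustration with a made-up vocabulary file and ids): if the
# output vocab maps {'twelve': 7, 'thirty': 9, '<eos>': 1}, then
#     Normalized2String('vocab.pkl').to_str(np.array([7, 9, 1, 7]))
# reverse-looks-up each id, stops at the first '<eos>' and returns
# 'twelve thirty'.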
--------------------------------------------------------------------------------