├── CONTRIBUTING
├── LICENSE
├── README.md
├── papers
    ├── gorman-sproat-2016.pdf
    └── wu-etal-2016.pdf
└── src
    ├── Makefile
    ├── en
        ├── README.md
        └── verbalizer
        │   ├── cardinals.tsv
        │   ├── extra_numbers.grm
        │   ├── factorization.grm
        │   ├── float.grm
        │   ├── g.fst
        │   ├── lexical_map.grm
        │   ├── lexical_map.tsv
        │   ├── math.grm
        │   ├── miscellaneous.grm
        │   ├── money.grm
        │   ├── money.tsv
        │   ├── number_names.grm
        │   ├── numbers.grm
        │   ├── numbers_plus.grm
        │   ├── ordinals.tsv
        │   ├── params.tsv
        │   ├── spelled.grm
        │   ├── spoken_punct.grm
        │   ├── time.grm
        │   ├── urls.grm
        │   └── verbalizer.grm
    ├── number_data
        ├── README.md
        ├── minimal.txt
        ├── random-trn.txt
        └── random-tst.txt
    ├── ru
        ├── README.md
        ├── classifier
        │   └── cyrillic.grm
        └── verbalizer
        │   ├── cardinals-lex.grm
        │   ├── cardinals.tsv
        │   ├── extra_numbers.grm
        │   ├── factorization.grm
        │   ├── float.grm
        │   ├── g.fst
        │   ├── lexical_map.grm
        │   ├── lexical_map.tsv
        │   ├── math.grm
        │   ├── miscellaneous.grm
        │   ├── money.grm
        │   ├── money.tsv
        │   ├── nominatives.tsv
        │   ├── number_names.grm
        │   ├── numbers.grm
        │   ├── numbers_plus.grm
        │   ├── ordinal_endings.tsv
        │   ├── ordinals-lex.grm
        │   ├── ordinals.tsv
        │   ├── spelled.grm
        │   ├── spoken_punct.grm
        │   ├── time.grm
        │   ├── urls.grm
        │   └── verbalizer.grm
    ├── universal
        ├── README.md
        ├── roman_numerals.tsv
        └── thousands_punct.grm
    └── util
        ├── README.md
        ├── arithmetic.grm
        ├── byte.grm
        ├── case.grm
        ├── germanic.tsv
        └── util.grm


/CONTRIBUTING:
--------------------------------------------------------------------------------
 1 | Want to contribute? Great! First, read this page (including the small print at
 2 | the end).
 3 | 
 4 | ### Before you contribute
 5 | 
 6 | Before we can use your code, you must sign the
 7 | [Google Individual Contributor License
 8 | Agreement](https://cla.developers.google.com/about/google-individual)
 9 | (CLA), which you can do online. The CLA is necessary mainly because you own the
10 | copyright to your changes, even after your contribution becomes part of our
11 | codebase, so we need your permission to use and distribute your code. We also
12 | need to be sure of various other things—for instance that you'll tell us if you
13 | know that your code infringes on other people's patents. You don't have to sign
14 | the CLA until after you've submitted your code for review and a member has
15 | approved it, but you must do it before we can put your code into our codebase.
16 | Before you start working on a larger contribution, you should get in touch with
17 | us first through the issue tracker with your idea so that we can help out and
18 | possibly guide you. Coordinating up front makes it much easier to avoid
19 | frustration later on.
20 | 
21 | ### Code reviews
22 | 
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose.
25 | 
26 | ### The small print
27 | 
28 | Contributions made by corporations are covered by a different agreement than
29 | the one above, the [Software Grant and Corporate Contributor License
30 | Agreement](https://cla.developers.google.com/about/google-corporate).
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Text normalization covering grammars
 2 | 
 3 | This repository provides covering grammars for English and Russian text normalization as
 4 | documented in:
 5 |   
 6 |   Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
 7 |   _Transactions of the Association for Computational Linguistics_ 4: 507-519.
 8 |   
 9 |   Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised
10 |   written-to-spoken text normalization. In _ASRU_, pages 665-670.
11 | 
12 | If you use these grammars in a publication, we would appreciate it if you cited these works.
13 | 
14 | ## Building
15 | 
16 | The grammars are written in [Thrax](http://thrax.opengrm.org) and compile into [OpenFst](http://openfst.org) FAR (FstARchive) files. To compile, simply run `make` in the `src/` directory.
17 | 
18 | ## License
19 | 
20 | See `LICENSE`.
21 | 
22 | ## Mandatory disclaimer
23 | 
24 | This is not an official Google product.
25 | 


--------------------------------------------------------------------------------
/papers/gorman-sproat-2016.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/papers/gorman-sproat-2016.pdf


--------------------------------------------------------------------------------
/papers/wu-etal-2016.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/papers/wu-etal-2016.pdf


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
  1 | # Default goal: build the English and Russian verbalizer archives.
  2 | .PHONY: all
  3 | all: en/verbalizer/verbalizer.far ru/verbalizer/verbalizer.far
  4 | # Language-universal definitions.
  5 | # Each rule compiles its first prerequisite ($<) into the target archive ($@).
  6 | universal/thousands_punct.far: universal/thousands_punct.grm util/byte.far util/util.far
  7 | 	thraxcompiler --input_grammar=$< --output_far=$@
  8 | 
  9 | util/util.far: util/util.grm util/byte.far util/case.far
 10 | 	thraxcompiler --input_grammar=$< --output_far=$@
 11 | 
 12 | util/byte.far: util/byte.grm 
 13 | 	thraxcompiler --input_grammar=$< --output_far=$@
 14 | 
 15 | util/case.far: util/case.grm util/byte.far
 16 | 	thraxcompiler --input_grammar=$< --output_far=$@
 17 | 
 18 | util/arithmetic.far: util/arithmetic.grm util/germanic.tsv util/byte.far
 19 | 	thraxcompiler --input_grammar=$< --output_far=$@
 20 | 
 21 | # English verbalizer.
 22 | # Each rule runs thraxcompiler on its first prerequisite ($<) and writes $@.
 23 | en/verbalizer/verbalizer.far: en/verbalizer/verbalizer.grm util/util.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far
 24 | 	thraxcompiler --input_grammar=$< --output_far=$@
 25 | 
 26 | en/verbalizer/number_names.far: en/verbalizer/number_names.grm en/verbalizer/cardinals.tsv en/verbalizer/g.fst en/verbalizer/ordinals.tsv util/arithmetic.far
 27 | 	thraxcompiler --input_grammar=$< --output_far=$@
 28 | 
 29 | en/verbalizer/extra_numbers.far: en/verbalizer/extra_numbers.grm util/byte.far en/verbalizer/numbers.far
 30 | 	thraxcompiler --input_grammar=$< --output_far=$@
 31 | 
 32 | en/verbalizer/numbers.far: en/verbalizer/numbers.grm en/verbalizer/number_names.far util/byte.far universal/thousands_punct.far
 33 | 	thraxcompiler --input_grammar=$< --output_far=$@
 34 | 
 35 | en/verbalizer/float.far: en/verbalizer/float.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
 36 | 	thraxcompiler --input_grammar=$< --output_far=$@
 37 | 
 38 | en/verbalizer/factorization.far: en/verbalizer/factorization.grm util/byte.far util/util.far en/verbalizer/numbers.far
 39 | 	thraxcompiler --input_grammar=$< --output_far=$@
 40 | 
 41 | en/verbalizer/lexical_map.far: en/verbalizer/lexical_map.grm util/byte.far en/verbalizer/lexical_map.tsv
 42 | 	thraxcompiler --input_grammar=$< --output_far=$@
 43 | 
 44 | en/verbalizer/math.far: en/verbalizer/math.grm en/verbalizer/float.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
 45 | 	thraxcompiler --input_grammar=$< --output_far=$@
 46 | # NOTE: miscellaneous and spelled also list ru/classifier/cyrillic.far as a prerequisite.
 47 | en/verbalizer/miscellaneous.far: en/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/extra_numbers.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/spelled.far
 48 | 	thraxcompiler --input_grammar=$< --output_far=$@
 49 | 
 50 | en/verbalizer/spelled.far: en/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
 51 | 	thraxcompiler --input_grammar=$< --output_far=$@
 52 | 
 53 | en/verbalizer/money.far: en/verbalizer/money.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/money.tsv
 54 | 	thraxcompiler --input_grammar=$< --output_far=$@
 55 | 
 56 | en/verbalizer/numbers_plus.far: en/verbalizer/numbers_plus.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
 57 | 	thraxcompiler --input_grammar=$< --output_far=$@
 58 | 
 59 | en/verbalizer/spoken_punct.far: en/verbalizer/spoken_punct.grm en/verbalizer/lexical_map.far
 60 | 	thraxcompiler --input_grammar=$< --output_far=$@
 61 | 
 62 | en/verbalizer/time.far: en/verbalizer/time.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far
 63 | 	thraxcompiler --input_grammar=$< --output_far=$@
 64 | 
 65 | en/verbalizer/urls.far: en/verbalizer/urls.grm util/byte.far en/verbalizer/lexical_map.far
 66 | 	thraxcompiler --input_grammar=$< --output_far=$@
 67 | 
 68 | 
 69 | # Russian verbalizer.
 70 | # Each rule runs thraxcompiler on its first prerequisite ($<) and writes $@.
 71 | ru/verbalizer/verbalizer.far: ru/verbalizer/verbalizer.grm util/util.far ru/verbalizer/extra_numbers.far ru/verbalizer/float.far ru/verbalizer/math.far ru/verbalizer/miscellaneous.far ru/verbalizer/money.far ru/verbalizer/numbers.far ru/verbalizer/numbers_plus.far ru/verbalizer/spelled.far ru/verbalizer/spoken_punct.far ru/verbalizer/time.far ru/verbalizer/urls.far
 72 | 	thraxcompiler --input_grammar=$< --output_far=$@
 73 | 
 74 | ru/verbalizer/number_names.far: ru/verbalizer/number_names.grm ru/verbalizer/cardinals.tsv ru/verbalizer/g.fst ru/verbalizer/ordinals.tsv util/arithmetic.far
 75 | 	thraxcompiler --input_grammar=$< --output_far=$@
 76 | 
 77 | ru/verbalizer/extra_numbers.far: ru/verbalizer/extra_numbers.grm util/byte.far ru/verbalizer/numbers.far
 78 | 	thraxcompiler --input_grammar=$< --output_far=$@
 79 | 
 80 | ru/verbalizer/numbers.far: ru/verbalizer/numbers.grm util/byte.far universal/thousands_punct.far ru/verbalizer/nominatives.tsv ru/verbalizer/number_names.far
 81 | 	thraxcompiler --input_grammar=$< --output_far=$@
 82 | 
 83 | ru/verbalizer/cardinals-lex.far: ru/verbalizer/cardinals-lex.grm util/byte.far
 84 | 	thraxcompiler --input_grammar=$< --output_far=$@
 85 | 
 86 | ru/verbalizer/ordinals-lex.far: ru/verbalizer/ordinals-lex.grm util/byte.far
 87 | 	thraxcompiler --input_grammar=$< --output_far=$@
 88 | 
 89 | ru/verbalizer/float.far: ru/verbalizer/float.grm ru/verbalizer/factorization.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far
 90 | 	thraxcompiler --input_grammar=$< --output_far=$@
 91 | 
 92 | ru/verbalizer/factorization.far: ru/verbalizer/factorization.grm util/byte.far util/util.far ru/verbalizer/numbers.far
 93 | 	thraxcompiler --input_grammar=$< --output_far=$@
 94 | # lexical_map.tsv is listed so table edits trigger a rebuild, as in the en rule (presumably read via StringFile - confirm).
 95 | ru/verbalizer/lexical_map.far: ru/verbalizer/lexical_map.grm util/byte.far ru/verbalizer/lexical_map.tsv
 96 | 	thraxcompiler --input_grammar=$< --output_far=$@
 97 | 
 98 | ru/verbalizer/math.far: ru/verbalizer/math.grm ru/verbalizer/float.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far
 99 | 	thraxcompiler --input_grammar=$< --output_far=$@
100 | 
101 | ru/verbalizer/miscellaneous.far: ru/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far ru/verbalizer/extra_numbers.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far ru/verbalizer/spelled.far
102 | 	thraxcompiler --input_grammar=$< --output_far=$@
103 | 
104 | ru/classifier/cyrillic.far: ru/classifier/cyrillic.grm 
105 | 	thraxcompiler --input_grammar=$< --output_far=$@
106 | 
107 | ru/verbalizer/spelled.far: ru/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far
108 | 	thraxcompiler --input_grammar=$< --output_far=$@
109 | 
110 | ru/verbalizer/money.far: ru/verbalizer/money.grm util/byte.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far ru/verbalizer/money.tsv
111 | 	thraxcompiler --input_grammar=$< --output_far=$@
112 | 
113 | ru/verbalizer/numbers_plus.far: ru/verbalizer/numbers_plus.grm ru/verbalizer/factorization.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far
114 | 	thraxcompiler --input_grammar=$< --output_far=$@
115 | 
116 | ru/verbalizer/spoken_punct.far: ru/verbalizer/spoken_punct.grm ru/verbalizer/lexical_map.far
117 | 	thraxcompiler --input_grammar=$< --output_far=$@
118 | 
119 | ru/verbalizer/time.far: ru/verbalizer/time.grm util/byte.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far
120 | 	thraxcompiler --input_grammar=$< --output_far=$@
121 | 
122 | ru/verbalizer/urls.far: ru/verbalizer/urls.grm util/byte.far ru/verbalizer/lexical_map.far
123 | 	thraxcompiler --input_grammar=$< --output_far=$@
124 | 
125 | 
126 | # Cleanup: delete every compiled .far one or two directories below this one.
127 | .PHONY: clean
128 | clean:
129 | 	$(RM) */*.far */*/*.far
130 | 


--------------------------------------------------------------------------------
/src/en/README.md:
--------------------------------------------------------------------------------
1 | # English covering grammar definitions
2 | 
3 | This directory defines an English text normalization covering grammar. The
4 | primary entry-point is the FST `VERBALIZER`, defined in
5 | `verbalizer/verbalizer.grm` and compiled in the FST archive
6 | `verbalizer/verbalizer.far`.
7 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/cardinals.tsv:
--------------------------------------------------------------------------------
 1 | 0	zero
 2 | 1	one
 3 | 2	two
 4 | 3	three
 5 | 4	four
 6 | 5	five
 7 | 6	six
 8 | 7	seven
 9 | 8	eight
10 | 9	nine
11 | 10	ten
12 | 11	eleven
13 | 12	twelve
14 | 13	thirteen
15 | 14	fourteen
16 | 15	fifteen
17 | 16	sixteen
18 | 17	seventeen
19 | 18	eighteen
20 | 19	nineteen
21 | 20	twenty
22 | 30	thirty
23 | 40	forty
24 | 50	fifty
25 | 60	sixty
26 | 70	seventy
27 | 80	eighty
28 | 90	ninety
29 | 100	hundred
30 | 1000	thousand
31 | 1000000	million
32 | 1000000000	billion
33 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/extra_numbers.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'en/verbalizer/numbers.grm' as n;
17 | 
18 | # Separator between chunks (n.I presumably inserts its argument - confirm).
19 | sep = n.I[" "];
20 | # One digit through n.CARDINAL_NUMBERS; "0" may instead map to a tag.
21 | single = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");
22 | # Digit-by-digit reading: one or more singles joined by the separator.
23 | export DIGITS = single (sep single)*;
24 | 
25 | # Various common factorizations: two- and three-digit chunks.
26 | pair = b.kDigit{2} @ n.CARDINAL_NUMBERS;
27 | triple = b.kDigit{3} @ n.CARDINAL_NUMBERS;
28 | 
29 | # Chunkings 1+2, 2+2, 2+3 and 2+2+2.
30 | chunked = (single sep pair)
31 |         | (pair sep pair)
32 |         | (pair sep triple)
33 |         | (pair sep pair sep pair);
34 | 
35 | export MIXED_NUMBERS = Optimize[chunked];
36 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/factorization.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'util/util.grm' as u;
17 | import 'en/verbalizer/numbers.grm' as n;
18 | 
19 | # Composes expr with space-separated n.CARDINAL_NUMBERS and optimizes it.
20 | func ToNumberName[expr] {
21 |   return Optimize[expr @ (n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*)];
22 | }
23 | 
24 | digit = b.kDigit;
25 | gap = n.I[" "];
26 | # Applies `gap` right after a "0" that starts the string or follows a space.
27 | zero_split = CDRewrite[gap, ("[BOS]" | " ") "0", "", b.kBytes*];
28 | # Chunks of one, two or three digits, each followed by `gap`.
29 | singles = digit gap;
30 | pairs = (digit{2} @ zero_split) gap;
31 | triples = (digit{3} @ zero_split) gap;
32 | # Any number of pairs, closed by one chunk of any of the three sizes.
33 | chunked = pairs* (triples | pairs | singles);
34 | 
35 | export FRACTIONAL_PART_UNGROUPED =
36 |   Optimize[ToNumberName[singles+ @ u.CLEAN_SPACES]];
37 | export FRACTIONAL_PART_GROUPED =
38 |   Optimize[ToNumberName[chunked @ u.CLEAN_SPACES]];
39 | export FRACTIONAL_PART_UNPARSED =
40 |   Optimize[ToNumberName[digit*]];
41 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/float.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'en/verbalizer/factorization.grm' as f;
16 | import 'en/verbalizer/lexical_map.grm' as l;
17 | import 'en/verbalizer/numbers.grm' as n;
18 | # FLOAT: n.CARDINAL_NUMBERS, then ".", then a fractional part, composed with l.LEXICAL_MAP.
19 | fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
20 | fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
21 | fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;
22 | # NOTE(review): fractional_part_grouped is unused below; the __name__ variables look like tunable slots - confirm.
23 | __fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed;
24 | __decimal_marker__ = ".";
25 | # The decimal marker is rewritten to the tag " @@DECIMAL_DOT_EXPRESSION@@ " (spaces included).
26 | export FLOAT = Optimize[
27 |  (n.CARDINAL_NUMBERS
28 |   (__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
29 |   __fractional_part__) @ l.LEXICAL_MAP]
30 | ;
31 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/g.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/src/en/verbalizer/g.fst


--------------------------------------------------------------------------------
/src/en/verbalizer/lexical_map.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | # Tag/word pairs loaded from the TSV file.
17 | lexical_map = StringFile['en/verbalizer/lexical_map.tsv'];
18 | 
19 | sigma_star = b.kBytes*;
20 | # Deletes every occurrence of the literal "__NULL__".
21 | del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];
22 | # Rewrites tags to words anywhere in the string, then removes "__NULL__".
23 | export LEXICAL_MAP = Optimize[
24 |   CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
25 | ;
26 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/lexical_map.tsv:
--------------------------------------------------------------------------------
 1 | @@CONNECTOR_RANGE@@	to
 2 | @@CONNECTOR_RATIO@@	to
 3 | @@CONNECTOR_BY@@	by
 4 | @@CONNECTOR_CONSECUTIVE_YEAR@@	to
 5 | @@JANUARY@@	january
 6 | @@FEBRUARY@@	february
 7 | @@MARCH@@	march
 8 | @@APRIL@@	april
 9 | @@MAY@@	may
10 | @@JUNE@@	june
11 | @@JULY@@	july
12 | @@AUGUST@@	august
13 | @@SEPTEMBER@@	september
14 | @@OCTOBER@@	october
15 | @@NOVEMBER@@	november
16 | @@DECEMBER@@	december
17 | @@MINUS@@	minus
18 | @@DECIMAL_DOT_EXPRESSION@@	point
19 | @@URL_DOT_EXPRESSION@@	dot
20 | @@DECIMAL_EXPONENT@@	to the
21 | @@DECIMAL_EXPONENT@@	to the power of
22 | @@COLON@@	colon
23 | @@SLASH@@	slash
24 | @@SLASH@@	forward slash
25 | @@DASH@@	dash
26 | @@PASSWORD@@	password
27 | @@AT@@	at
28 | @@PORT@@	port
29 | @@QUESTION_MARK@@	question mark
30 | @@HASH@@	hash
31 | @@HASH@@	hash tag
32 | @@FRACTION_OVER@@	over
33 | @@MONEY_AND@@	and
34 | @@AND@@	and
35 | @@PHONE_PLUS@@	plus
36 | @@PHONE_EXTENSION@@	extension
37 | @@TIME_AM@@		a m
38 | @@TIME_PM@@		p m
39 | @@HOUR@@		o'clock
40 | @@MINUTE@@		minute
41 | @@MINUTE@@		minutes
42 | @@TIME_AFTER@@		after
43 | @@TIME_AFTER@@		past
44 | @@TIME_BEFORE@@		to
45 | @@TIME_BEFORE@@		till
46 | @@TIME_QUARTER@@	quarter
47 | @@TIME_HALF@@		half
48 | @@TIME_ZERO@@		oh
49 | @@TIME_THREE_QUARTER@@	three quarters
50 | @@ARITHMETIC_PLUS@@	plus
51 | @@ARITHMETIC_TIMES@@	times
52 | @@ARITHMETIC_TIMES@@	multiplied by
53 | @@ARITHMETIC_MINUS@@	minus
54 | @@ARITHMETIC_DIVISION@@	divided by
55 | @@ARITHMETIC_DIVISION@@	over
56 | @@ARITHMETIC_EQUALS@@	equals
57 | @@PERCENT@@		percent
58 | @@DEGREE@@		degree
59 | @@DEGREE@@		degrees
60 | @@SQUARE_ROOT@@		square root of
61 | @@SQUARE_ROOT@@		the square root of
62 | @@STAR@@		star
63 | @@HYPHEN@@		hyphen
64 | @@AT@@			at
65 | @@PER@@			per
66 | @@PERIOD@@		period
67 | @@PERIOD@@		full stop
68 | @@PERIOD@@		dot
69 | @@EXCLAMATION_MARK@@	exclamation mark
70 | @@EXCLAMATION_MARK@@	exclamation point
71 | @@COMMA@@		comma
72 | @@POSITIVE@@		positive
73 | @@NEGATIVE@@		negative
74 | @@OTHER_ZERO_VERBALIZATIONS@@	oh
75 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/math.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'en/verbalizer/float.grm' as f;
16 | import 'en/verbalizer/lexical_map.grm' as l;
17 | import 'en/verbalizer/numbers.grm' as n;
18 | # An operand is either a cardinal or a float.
19 | float = f.FLOAT;
20 | card = n.CARDINAL_NUMBERS;
21 | number = card | float;
22 | # Each operator symbol becomes a placeholder tag padded with spaces.
23 | plus = "+" : " @@ARITHMETIC_PLUS@@ ";
24 | times = "*" : " @@ARITHMETIC_TIMES@@ ";
25 | minus = "-" : " @@ARITHMETIC_MINUS@@ ";
26 | division = "/" : " @@ARITHMETIC_DIVISION@@ ";
27 | 
28 | operator = plus | times | minus | division;
29 | # The percent tag gets a leading space only.
30 | percent = "%" : " @@PERCENT@@";
31 | # Either "number operator number" or "number%"; tags are replaced by LEXICAL_MAP.
32 | export ARITHMETIC =
33 |   Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
34 | ;
35 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/miscellaneous.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'ru/classifier/cyrillic.grm' as c;
17 | import 'en/verbalizer/extra_numbers.grm' as e;
18 | import 'en/verbalizer/lexical_map.grm' as l;
19 | import 'en/verbalizer/numbers.grm' as n;
20 | import 'en/verbalizer/spelled.grm' as s;
21 | # Words: letter runs joined by "-" or "." (read as a space); a final "." is dropped.
22 | letter = b.kAlpha | c.kCyrillicAlpha;
23 | dash   = "-";
24 | word = letter+;
25 | possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
26 | # Symbols that may follow a word; "+" and "-" each have two alternative tags.
27 | post_word_symbol =
28 |    ("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
29 |    ("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
30 |    ("*" : "@@STAR@@")
31 | ;
32 | # Symbols that may precede a word.
33 | pre_word_symbol =
34 |    ("@" : "@@AT@@") |
35 |    ("/" : "@@SLASH@@") |
36 |    ("#" : "@@HASH@@")
37 | ;
38 | # Word then symbol, and symbol then word, with a space inserted in between.
39 | post_word = possibly_split_word n.I[" "] post_word_symbol;
40 | 
41 | pre_word = pre_word_symbol n.I[" "] possibly_split_word;
42 | 
43 | ## Number/digit sequence combos, maybe with a dash
44 | # A word may alternatively be passed through s.SPELLED_NO_LETTER.
45 | spelled_word = word @ s.SPELLED_NO_LETTER;
46 | # Word then number; a space is inserted between them or replaces a dash.
47 | word_number =
48 |   (word | spelled_word)
49 |   (n.I[" "] | (dash : " "))
50 |   (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
51 | ;
52 | # Number then word, joined the same way.
53 | number_word =
54 |   (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
55 |   (n.I[" "] | (dash : " "))
56 |   (word | spelled_word)
57 | ;
58 | 
59 | ## Two-digit year.
60 | 
61 | # Note that in this case to be fair we really have to allow ordinals too since
62 | # in some languages that's what you would have.
63 | # NOTE(review): the rule below composes in cardinal and digit readings only.
64 | two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
65 | # ".com": the dot becomes a tag, then an inserted space and a literal "com".
66 | dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
67 | # Union of every pattern defined in this file.
68 | miscellaneous = Optimize[
69 |     possibly_split_word
70 |   | post_word
71 |   | pre_word
72 |   | word_number
73 |   | number_word
74 |   | two_digit_year
75 |   | dot_com
76 | ];
77 | # Placeholder tags are replaced with words by l.LEXICAL_MAP.
78 | export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
79 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/money.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'en/verbalizer/lexical_map.grm' as l;
17 | import 'en/verbalizer/numbers.grm' as n;
18 | # Amounts are read as cardinal numbers.
19 | card = n.CARDINAL_NUMBERS;
20 | # Currency names keyed by unit id (usd_maj, usd_min); see money.tsv.
21 | __currency__ = StringFile['en/verbalizer/money.tsv'];
22 | # d: any digit; D: any digit except "0".
23 | d = b.kDigit;
24 | D = d - "0";
25 | # Two-digit cents read as a cardinal; a leading "0" is deleted first.
26 | cents = ((n.D["0"] | D) d) @ card;
27 | 
28 | # Only dollar for the verbalizer tests for English. Will need to add other
29 | # currencies.
30 | usd_maj = Project["usd_maj" @ __currency__, 'output'];
31 | usd_min = Project["usd_min" @ __currency__, 'output'];
32 | and = " @@MONEY_AND@@ " | " ";
33 | # "$N.CC": N, major unit, the "and" tag or a plain space, CC, minor unit.
34 | dollar1 =
35 |   n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
36 | ;
37 | # "$N.00": N and the major unit; the ".00" is deleted.
38 | dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
39 | # "$N": N and the major unit.
40 | dollar3 = n.D["$"] card n.I[" " usd_maj];
41 | 
42 | dollar = Optimize[dollar1 | dollar2 | dollar3];
43 | # Placeholder tags are replaced with words by l.LEXICAL_MAP.
44 | export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
45 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/money.tsv:
--------------------------------------------------------------------------------
1 | usd_maj	dollar
2 | usd_maj	dollars
3 | usd_min	cent
4 | usd_min	cents
5 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/number_names.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # English minimally supervised number grammar.
16 | #
17 | # Supports both cardinals and ordinals without overt marking.
18 | #
19 | # The language-specific acceptor G was compiled with digit, teen, and decade
20 | # preterminals. The lexicon transducer L is unambiguous so no LM is used.
21 | 
22 | import 'util/arithmetic.grm' as a;
23 | 
24 | # Intersects the universal factorization transducer (F) with the
25 | # language-specific acceptor (G).
26 | # f is composed in three times below; each testN is an AssertEqual check.
27 | d = a.DELTA_STAR;
28 | f = a.IARITHMETIC_RESTRICTED;
29 | g = LoadFst['en/verbalizer/g.fst'];
30 | fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
31 | test1 = AssertEqual["230" @ fg, "(+ (* 2 100 *) 30 +)"];
32 | 
33 | # Compiles lexicon transducer (L).
34 | # Each lexicon reads a space-separated sequence of entries from its TSV file.
35 | cardinal_name = StringFile['en/verbalizer/cardinals.tsv'];
36 | cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
37 | test2 = AssertEqual["2 100 30" @ cardinal_l, "two hundred thirty"];
38 | 
39 | ordinal_name = StringFile['en/verbalizer/ordinals.tsv'];
40 | # In English, ordinals have the same syntax as cardinals and all but the final
41 | # element is verbalized using a cardinal number word; e.g., "two hundred
42 | # thirtieth".
43 | ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
44 | test3 = AssertEqual["2 100 30" @ ordinal_l, "two hundred thirtieth"];
45 | 
46 | # Composes L with the leaf transducer (P), then composes that with FG.
47 | 
48 | p = a.LEAVES;
49 | # Digit string in, cardinal number name out.
50 | export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
51 | test4 = AssertEqual["230" @ CARDINAL_NUMBER_NAME, "two hundred thirty"];
52 | # Digit string in, ordinal number name out.
53 | export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
54 | test5 = AssertEqual["230" @ ORDINAL_NUMBER_NAME, "two hundred thirtieth"];
55 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/numbers.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'en/verbalizer/number_names.grm' as n;
16 | import 'util/byte.grm' as bytelib;
17 | import 'universal/thousands_punct.grm' as t;
18 | # Number-name transducers exported by number_names.grm.
19 | cardinal = n.CARDINAL_NUMBER_NAME;
20 | ordinal = n.ORDINAL_NUMBER_NAME;
21 | 
22 | # Putting these here since this grammar gets incorporated by all the others.
23 | # I[expr] inserts expr: it rewrites the empty string to expr.
24 | func I[expr] {
25 |   return "" : expr;
26 | }
27 | # D[expr] deletes expr: it rewrites expr to the empty string.
28 | func D[expr] {
29 |   return expr : "";
30 | }
31 | # Accepted digit layouts; presumably comma-delimited thousands or no delimiter.
32 | separators = t.comma_thousands | t.no_delimiter;
33 | 
34 | # Language specific endings for ordinals.
35 | d = bytelib.kDigit;
36 | endings = "st" | "nd" | "rd" | "th";
37 | # Digit strings grouped by the ordinal suffix that matches their ending.
38 | st = (d* "1") - (d* "11");  # Ends in 1 but not in 11.
39 | nd = (d* "2") - (d* "12");  # Ends in 2 but not in 12.
40 | rd = (d* "3") - (d* "13");  # Ends in 3 but not in 13.
41 | th = Optimize[d* - st - nd - rd];  # Every other digit string.
42 | first = st ("st" : "");  # These four delete the matching suffix.
43 | second = nd ("nd" : "");
44 | third = rd ("rd" : "");
45 | other = th ("th" : "");
46 | marked_ordinal = Optimize[first | second | third | other];
47 | 
48 | # The separator is a no-op here but will be needed once we replace
49 | # the above targets.
50 | # Cardinal reading of a digit string in one of the accepted layouts.
51 | export CARDINAL_NUMBERS = Optimize[separators @ cardinal];
52 | # Ordinal reading of digits that carry a matching st/nd/rd/th suffix.
53 | export ORDINAL_NUMBERS =
54 |   Optimize[(separators endings) @ marked_ordinal @ ordinal]
55 | ;
56 | # Ordinal reading of digits that carry no suffix.
57 | export ORDINAL_NUMBERS_UNMARKED = Optimize[separators @ ordinal];
58 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/numbers_plus.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # Grammar for things built mostly on numbers.
 16 | 
 17 | import 'en/verbalizer/factorization.grm' as f;
 18 | import 'en/verbalizer/lexical_map.grm' as l;
 19 | import 'en/verbalizer/numbers.grm' as n;
 20 | # Cardinal, unmarked-ordinal and digit-by-digit readings used below.
 21 | num = n.CARDINAL_NUMBERS;
 22 | ord = n.ORDINAL_NUMBERS_UNMARKED;
 23 | digits = f.FRACTIONAL_PART_UNGROUPED;
 24 | 
 25 | # Various symbols.
 26 | # Each one is rewritten to a placeholder tag that l.LEXICAL_MAP later replaces.
 27 | plus = "+" : "@@ARITHMETIC_PLUS@@";
 28 | minus = "-" : "@@ARITHMETIC_MINUS@@";
 29 | slash = "/" : "@@SLASH@@";
 30 | dot = "." : "@@URL_DOT_EXPRESSION@@";
 31 | dash = "-" : "@@DASH@@";  # Same input symbol as minus, different tag.
 32 | equals = "=" : "@@ARITHMETIC_EQUALS@@";
 33 | 
 34 | degree = "°" : "@@DEGREE@@";
 35 | 
 36 | division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
 37 | 
 38 | times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
 39 | 
 40 | power = "^" : "@@DECIMAL_EXPONENT@@";
 41 | 
 42 | square_root = "√" : "@@SQUARE_ROOT@@";
 43 | 
 44 | percent = "%" : "@@PERCENT@@";
 45 | 
 46 | # Safe roman numbers.
 47 | 
 48 | # NB: Do not change the formatting here. NO_EDIT must be on the same
 49 | # line as the path.
 50 | rfile = 
 51 |   'universal/roman_numerals.tsv' # NO_EDIT
 52 | ;
 53 | # Pairs loaded from the file named by rfile; composed with num/ord in cat_roman.
 54 | roman = StringFile[rfile];
 55 | 
 56 | ## Main categories.
 57 | # At least three numbers separated by dots (the repeated part uses "+").
 58 | cat_dot_number =
 59 |    num
 60 |    n.I[" "] dot n.I[" "] num
 61 |    (n.I[" "] dot n.I[" "] num)+
 62 | ;
 63 | # Two or more numbers separated by slashes.
 64 | cat_slash_number =
 65 |    num
 66 |    n.I[" "] slash n.I[" "] num
 67 |    (n.I[" "] slash n.I[" "] num)*
 68 | ;
 69 | # Two or more numbers separated by dashes.
 70 | cat_dash_number =
 71 |    num
 72 |    n.I[" "] dash n.I[" "] num
 73 |    (n.I[" "] dash n.I[" "] num)*
 74 | ;
 75 | # A number with an optional leading plus or minus.
 76 | cat_signed_number = ((plus | minus) n.I[" "])? num;
 77 | # A signed number followed by the degree sign.
 78 | cat_degree = cat_signed_number n.I[" "] degree;
 79 | # A plus followed by a number read as a cardinal or digit by digit.
 80 | cat_country_code = plus n.I[" "] (num | digits);
 81 | # A single operator symbol on its own.
 82 | cat_math_operations =
 83 |      plus
 84 |    | minus
 85 |    | division
 86 |    | times
 87 |    | equals
 88 |    | percent
 89 |    | power
 90 |    | square_root
 91 | ;
 92 | 
 93 | # Roman numbers are often either cardinals or ordinals in various languages.
 94 | cat_roman = roman @ (num | ord);
 95 | 
 96 | # Allow
 97 | #
 98 | # number:number
 99 | # number-number
100 | #
101 | # to just be
102 | #
103 | # number number.
104 | 
105 | cat_number_number =
106 |    num ((":" | "-") : " ") num
107 | ;
108 | 
109 | # Additional readings for bare symbols. Every tag emitted here must have a
110 | # row in lexical_map.tsv, otherwise the raw tag text reaches the output.
111 | cat_additional_readings =
112 |   ("/" : "@@PER@@") |
113 |   ("+" : "@@AND@@") |
114 |   ("-" : ("@@HYPHEN@@" | "@@CONNECTOR_RANGE@@")) |  # The range tag maps to "to".
115 |   ("*" : "@@STAR@@") |
116 |   ("x" : ("x" | "@@CONNECTOR_BY@@")) |
117 |   ("@" : "@@AT@@")
118 | ;
119 | # Union of all categories defined above.
120 | numbers_plus = Optimize[
121 |    cat_dot_number
122 |  | cat_slash_number
123 |  | cat_dash_number
124 |  | cat_signed_number
125 |  | cat_degree
126 |  | cat_country_code
127 |  | cat_math_operations
128 |  | cat_roman
129 |  | cat_number_number
130 |  | cat_additional_readings
131 | ];
132 | # Placeholder tags are replaced with words by l.LEXICAL_MAP.
133 | export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];
134 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/ordinals.tsv:
--------------------------------------------------------------------------------
 1 | 0	zeroth
 2 | 1	first
 3 | 2	second
 4 | 3	third
 5 | 4	fourth
 6 | 5	fifth
 7 | 6	sixth
 8 | 7	seventh
 9 | 8	eighth
10 | 9	ninth
11 | 10	tenth
12 | 11	eleventh
13 | 12	twelfth
14 | 13	thirteenth
15 | 14	fourteenth
16 | 15	fifteenth
17 | 16	sixteenth
18 | 17	seventeenth
19 | 18	eighteenth
20 | 19	nineteenth
21 | 20	twentieth
22 | 30	thirtieth
23 | 40	fortieth
24 | 50	fiftieth
25 | 60	sixtieth
26 | 70	seventieth
27 | 80	eightieth
28 | 90	ninetieth
29 | 100	hundredth
30 | 1000	thousandth
31 | 1000000	millionth
32 | 1000000000	billionth
33 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/params.tsv:
--------------------------------------------------------------------------------
1 | float.grm	__fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed;
2 | telephone.grm	__grouping__ = f.UNGROUPED;
3 | measure.grm	__measure__ = StringFile['en/verbalizer/measures.tsv'];
4 | money.grm	__currency__ = StringFile['en/verbalizer/money.tsv'];
5 | time.grm	__sep__ = ":";
6 | time.grm	__am__ = "a.m." | "am" | "AM";
7 | time.grm	__pm__ = "p.m." | "pm" | "PM";
8 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/spelled.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This verbalizer is used whenever there is an LM symbol that consists of
16 | # letters immediately followed by "{spelled}". This strips the "{spelled}"
17 | # suffix.
18 | 
19 | import 'util/byte.grm' as b;
20 | import 'ru/classifier/cyrillic.grm' as c;
21 | import 'en/verbalizer/lexical_map.grm' as l;
22 | import 'en/verbalizer/numbers.grm' as n;
23 | # A single digit read as a cardinal number.
24 | digit = b.kDigit @ n.CARDINAL_NUMBERS;
25 | # Reading of one character; Latin letters of either case become "letter-x".
26 | char_set = (("a" | "A") : "letter-a")
27 |         | (("b" | "B") : "letter-b")
28 |         | (("c" | "C") : "letter-c")
29 |         | (("d" | "D") : "letter-d")
30 |         | (("e" | "E") : "letter-e")
31 |         | (("f" | "F") : "letter-f")
32 |         | (("g" | "G") : "letter-g")
33 |         | (("h" | "H") : "letter-h")
34 |         | (("i" | "I") : "letter-i")
35 |         | (("j" | "J") : "letter-j")
36 |         | (("k" | "K") : "letter-k")
37 |         | (("l" | "L") : "letter-l")
38 |         | (("m" | "M") : "letter-m")
39 |         | (("n" | "N") : "letter-n")
40 |         | (("o" | "O") : "letter-o")
41 |         | (("p" | "P") : "letter-p")
42 |         | (("q" | "Q") : "letter-q")
43 |         | (("r" | "R") : "letter-r")
44 |         | (("s" | "S") : "letter-s")
45 |         | (("t" | "T") : "letter-t")
46 |         | (("u" | "U") : "letter-u")
47 |         | (("v" | "V") : "letter-v")
48 |         | (("w" | "W") : "letter-w")
49 |         | (("x" | "X") : "letter-x")
50 |         | (("y" | "Y") : "letter-y")
51 |         | (("z" | "Z") : "letter-z")
52 |         | (digit)
53 |         | ("&" : "@@AND@@")
54 |         | ("." : "")  # This and the next three symbols are deleted.
55 |         | ("-" : "")
56 |         | ("_" : "")
57 |         | ("/" : "")
58 |         | (n.I["letter-"] c.kCyrillicAlpha)  # "letter-" is put before the character.
59 |         ;
60 | # Characters are separated by inserted spaces; the "{spelled}" marker is deleted.
61 | ins_space = "" : " ";
62 | 
63 | suffix = "{spelled}" : "";
64 | 
65 | spelled = Optimize[char_set (ins_space char_set)* suffix];
66 | # Placeholder tags are replaced with words by l.LEXICAL_MAP.
67 | export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];
68 | 
69 | sigma_star = b.kBytes*;
70 | 
71 | # Gets rid of the letter- prefix since in some cases we don't want it.
72 | 
73 | del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];
74 | # Same character sequence as spelled, but without the "{spelled}" suffix.
75 | spelled_no_tag = Optimize[char_set (ins_space char_set)*];
76 | # NOTE(review): this export is not composed with l.LEXICAL_MAP, unlike SPELLED.
77 | export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];
78 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/spoken_punct.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'en/verbalizer/lexical_map.grm' as l;
16 | # Punctuation marks rewritten to placeholder tags.
17 | punct =
18 |    ("." : "@@PERIOD@@")
19 |  | ("," : "@@COMMA@@")
20 |  | ("!" : "@@EXCLAMATION_MARK@@")
21 |  | ("?" : "@@QUESTION_MARK@@")
22 | ;
23 | # Placeholder tags are replaced with words by l.LEXICAL_MAP.
24 | export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];
25 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/time.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import 'util/byte.grm' as b;
 16 | import 'en/verbalizer/lexical_map.grm' as l;
 17 | import 'en/verbalizer/numbers.grm' as n;
 18 | 
 19 | # Only handles 24-hour time with quarter-to, half-past and quarter-past.
 20 | # Maps each hour string "0".."23" to the hour that follows it.
 21 | increment_hour =
 22 |     ("0" : "1")
 23 |   | ("1" : "2")
 24 |   | ("2" : "3")
 25 |   | ("3" : "4")
 26 |   | ("4" : "5")
 27 |   | ("5" : "6")
 28 |   | ("6" : "7")
 29 |   | ("7" : "8")
 30 |   | ("8" : "9")
 31 |   | ("9" : "10")
 32 |   | ("10" : "11")
 33 |   | ("11" : "12")
 34 |   | ("12" : "1")  # If someone uses 12, we assume 12-hour by default.
 35 |   | ("13" : "14")
 36 |   | ("14" : "15")
 37 |   | ("15" : "16")
 38 |   | ("16" : "17")
 39 |   | ("17" : "18")
 40 |   | ("18" : "19")
 41 |   | ("19" : "20")
 42 |   | ("20" : "21")
 43 |   | ("21" : "22")
 44 |   | ("22" : "23")
 45 |   | ("23" : "12")  # NOTE(review): 23 yields 12 rather than 0 or 24; confirm intended.
 46 | ;
 47 | # The accepted hour strings are exactly the inputs of increment_hour.
 48 | hours = Project[increment_hour, 'input'];
 49 | # d: any digit; D: any digit except "0".
 50 | d = b.kDigit;
 51 | D = d - "0";
 52 | # Minutes "01" to "09".
 53 | minutes09 = "0" D;
 54 | # Minutes "10" to "59".
 55 | minutes = ("1" | "2" | "3" | "4" | "5") d;
 56 | # Hour/minute separator (also listed for time.grm in params.tsv); read as a space.
 57 | __sep__ = ":";
 58 | sep_space = __sep__ : " ";
 59 | 
 60 | verbalize_hours = hours @ n.CARDINAL_NUMBERS;
 61 | # "00" -> @@HOUR@@ tag; "0X" -> @@TIME_ZERO@@ tag, space, X; otherwise a cardinal.
 62 | verbalize_minutes =
 63 |    ("00" : "@@HOUR@@")
 64 |  | (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))
 65 |  | (minutes @ n.CARDINAL_NUMBERS)
 66 | ;
 67 | # Plain reading: hours, a space, then minutes.
 68 | time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];
 69 | 
 70 | # Special cases we handle right now.
 71 | # TODO: Need to allow for cases like
 72 | #
 73 | #   half twelve (in the UK English sense)
 74 | #   half twaalf (in the Dutch sense)
 75 | # "H:15": quarter and after tags are inserted before H; ":15" is deleted.
 76 | time_quarter_past =
 77 |    n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
 78 |    verbalize_hours
 79 |    n.D[__sep__ "15"];
 80 | # "H:30": half and after tags are inserted before H; ":30" is deleted.
 81 | time_half_past =
 82 |    n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
 83 |    verbalize_hours
 84 |    n.D[__sep__ "30"];
 85 | # "H:45": quarter and before tags, then the hour after H via increment_hour.
 86 | time_quarter_to =
 87 |    n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
 88 |    (increment_hour @ verbalize_hours)
 89 |    n.D[__sep__ "45"];
 90 | 
 91 | time_extra = Optimize[
 92 |   time_quarter_past | time_half_past | time_quarter_to]
 93 | ;
 94 | 
 95 | # Basic time periods which most languages can be expected to have.
 96 | __am__ = "a.m." | "am" | "AM";
 97 | __pm__ = "p.m." | "pm" | "PM";
 98 | # Either period marker is rewritten to its placeholder tag.
 99 | period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");
100 | 
101 | time_variants = time_basic | time_extra;
102 | # Optional period before or after the time; a space is kept or inserted between.
103 | time = Optimize[
104 |     (period (" " | n.I[" "]))? time_variants
105 |  |  time_variants ((" " | n.I[" "]) period)?]
106 | ;
107 | # Placeholder tags are replaced with words by l.LEXICAL_MAP.
108 | export TIME = Optimize[time @ l.LEXICAL_MAP];
109 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/urls.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Rules for URLs and email addresses.
16 | 
17 | import 'util/byte.grm' as bytelib;
18 | import 'en/verbalizer/lexical_map.grm' as l;
19 | # Building blocks: an inserted space, and tags for "." and "@".
20 | ins_space = "" : " ";
21 | dot = "." : "@@URL_DOT_EXPRESSION@@";
22 | at = "@" : "@@AT@@";
23 | # The five supported suffixes; ".edu" is written out as "e d u".
24 | url_suffix =
25 |   (".com" : dot ins_space "com") |
26 |   (".gov" : dot ins_space "gov") |
27 |   (".edu" : dot ins_space "e d u") |
28 |   (".org" : dot ins_space "org") |
29 |   (".net" : dot ins_space "net")
30 | ;
31 | # One or more b.kAlnum characters, passed through unchanged.
32 | letter_string = (bytelib.kAlnum)* bytelib.kAlnum;
33 | # One or more letter_string segments joined by dots with spaces inserted around.
34 | letter_string_dot =
35 |   ((letter_string ins_space dot ins_space)* letter_string)
36 | ;
37 | 
38 | # Rules for URLs.
39 | export URL = Optimize[
40 |  ((letter_string_dot) (ins_space)
41 |   (url_suffix)) @ l.LEXICAL_MAP
42 | ];
43 | 
44 | # Rules for email addresses.
45 | letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);
46 | # Character-by-character segments joined by dots.
47 | letter_by_letter_dot =
48 |   ((letter_by_letter ins_space dot ins_space)*
49 |   letter_by_letter)
50 | ;
51 | # Local part and domain are both read character by character.
52 | export EMAIL1 = Optimize[
53 |  ((letter_by_letter) (ins_space)
54 |   (at) (ins_space)
55 |   (letter_by_letter_dot) (ins_space)
56 |   (url_suffix)) @ l.LEXICAL_MAP
57 | ];
58 | # Local part character by character; domain segments kept as whole strings.
59 | export EMAIL2 = Optimize[
60 |  ((letter_by_letter) (ins_space)
61 |   (at) (ins_space)
62 |   (letter_string_dot) (ins_space)
63 |   (url_suffix)) @ l.LEXICAL_MAP
64 | ];
65 | # Either email reading.
66 | export EMAILS = Optimize[
67 |   EMAIL1 | EMAIL2
68 | ];
69 | 


--------------------------------------------------------------------------------
/src/en/verbalizer/verbalizer.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/util.grm' as util;
16 | import 'en/verbalizer/extra_numbers.grm' as e;
17 | import 'en/verbalizer/float.grm' as f;
18 | import 'en/verbalizer/math.grm' as ma;
19 | import 'en/verbalizer/miscellaneous.grm' as mi;
20 | import 'en/verbalizer/money.grm' as mo;
21 | import 'en/verbalizer/numbers.grm' as n;
22 | import 'en/verbalizer/numbers_plus.grm' as np;
23 | import 'en/verbalizer/spelled.grm' as s;
24 | import 'en/verbalizer/spoken_punct.grm' as sp;
25 | import 'en/verbalizer/time.grm' as t;
26 | import 'en/verbalizer/urls.grm' as u;
27 | # Union of the exported verbalizers, composed with util.CLEAN_SPACES, weights removed.
28 | export VERBALIZER = Optimize[RmWeight[  # NOTE(review): u.EMAILS is not in this union.
29 |  (  e.MIXED_NUMBERS
30 |   | e.DIGITS
31 |   | f.FLOAT
32 |   | ma.ARITHMETIC
33 |   | mi.MISCELLANEOUS
34 |   | mo.MONEY
35 |   | n.CARDINAL_NUMBERS
36 |   | n.ORDINAL_NUMBERS
37 |   | np.NUMBERS_PLUS
38 |   | s.SPELLED
39 |   | sp.SPOKEN_PUNCT
40 |   | t.TIME
41 |   | u.URL) @ util.CLEAN_SPACES
42 | ]];
43 | 


--------------------------------------------------------------------------------
/src/number_data/README.md:
--------------------------------------------------------------------------------
 1 | This directory contains data used in:
 2 | 
 3 |   Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization.
 4 |   Transactions of the Association for Computational Linguistics 4: 507-519.
 5 | 
 6 | * `minimal.txt`: A list of 300 curated numbers used as the "minimal" training
 7 |   set.
 8 | * `random-trn.txt`: A list of 9000 randomly-generated numbers used as the
 9 |   "medium" training set.
10 | * `random-tst.txt`: A list of 1000 randomly-generated numbers used as the test
11 |   set.
12 | 
13 | Note that `random-trn.txt` and `random-tst.txt` are totally disjoint, but that
14 | a small number of examples occur both in `minimal.txt` and `random-tst.txt`.
15 | 
16 | For information about the sampling procedure used to generate the random data
17 | sets, see appendix A of the aforementioned paper.
18 | 


--------------------------------------------------------------------------------
/src/number_data/minimal.txt:
--------------------------------------------------------------------------------
  1 | 0
  2 | 1
  3 | 2
  4 | 3
  5 | 4
  6 | 5
  7 | 6
  8 | 7
  9 | 8
 10 | 9
 11 | 10
 12 | 11
 13 | 12
 14 | 13
 15 | 14
 16 | 15
 17 | 16
 18 | 17
 19 | 18
 20 | 19
 21 | 20
 22 | 21
 23 | 22
 24 | 23
 25 | 24
 26 | 25
 27 | 26
 28 | 27
 29 | 28
 30 | 29
 31 | 30
 32 | 31
 33 | 32
 34 | 33
 35 | 34
 36 | 35
 37 | 36
 38 | 37
 39 | 38
 40 | 39
 41 | 40
 42 | 41
 43 | 42
 44 | 43
 45 | 44
 46 | 45
 47 | 46
 48 | 47
 49 | 48
 50 | 49
 51 | 50
 52 | 51
 53 | 52
 54 | 53
 55 | 54
 56 | 55
 57 | 56
 58 | 57
 59 | 58
 60 | 59
 61 | 60
 62 | 61
 63 | 62
 64 | 63
 65 | 64
 66 | 65
 67 | 66
 68 | 67
 69 | 68
 70 | 69
 71 | 70
 72 | 71
 73 | 72
 74 | 73
 75 | 74
 76 | 75
 77 | 76
 78 | 77
 79 | 78
 80 | 79
 81 | 80
 82 | 81
 83 | 82
 84 | 83
 85 | 84
 86 | 85
 87 | 86
 88 | 87
 89 | 88
 90 | 89
 91 | 90
 92 | 91
 93 | 92
 94 | 93
 95 | 94
 96 | 95
 97 | 96
 98 | 97
 99 | 98
100 | 99
101 | 100
102 | 101
103 | 102
104 | 103
105 | 104
106 | 105
107 | 106
108 | 107
109 | 108
110 | 109
111 | 110
112 | 111
113 | 112
114 | 113
115 | 114
116 | 115
117 | 116
118 | 117
119 | 118
120 | 119
121 | 120
122 | 121
123 | 122
124 | 123
125 | 124
126 | 125
127 | 126
128 | 127
129 | 128
130 | 129
131 | 130
132 | 131
133 | 132
134 | 133
135 | 134
136 | 135
137 | 136
138 | 137
139 | 138
140 | 139
141 | 140
142 | 141
143 | 142
144 | 143
145 | 144
146 | 145
147 | 146
148 | 147
149 | 148
150 | 149
151 | 150
152 | 151
153 | 152
154 | 153
155 | 154
156 | 155
157 | 156
158 | 157
159 | 158
160 | 159
161 | 160
162 | 161
163 | 162
164 | 163
165 | 164
166 | 165
167 | 166
168 | 167
169 | 168
170 | 169
171 | 170
172 | 171
173 | 172
174 | 173
175 | 174
176 | 175
177 | 176
178 | 177
179 | 178
180 | 179
181 | 180
182 | 181
183 | 182
184 | 183
185 | 184
186 | 185
187 | 186
188 | 187
189 | 188
190 | 189
191 | 190
192 | 191
193 | 192
194 | 193
195 | 194
196 | 195
197 | 196
198 | 197
199 | 198
200 | 199
201 | 200
202 | 201
203 | 202
204 | 203
205 | 204
206 | 205
207 | 206
208 | 207
209 | 208
210 | 209
211 | 210
212 | 211
213 | 212
214 | 220
215 | 221
216 | 230
217 | 300
218 | 400
219 | 500
220 | 600
221 | 700
222 | 800
223 | 900
224 | 1000
225 | 1001
226 | 1002
227 | 1003
228 | 1004
229 | 1005
230 | 1006
231 | 1007
232 | 1008
233 | 1009
234 | 1010
235 | 1011
236 | 1012
237 | 1020
238 | 1021
239 | 1030
240 | 1200
241 | 2000
242 | 2001
243 | 2002
244 | 2003
245 | 2004
246 | 2005
247 | 2006
248 | 2007
249 | 2008
250 | 2009
251 | 2010
252 | 2011
253 | 2012
254 | 2020
255 | 2021
256 | 2030
257 | 2100
258 | 2200
259 | 5001
260 | 10000
261 | 12000
262 | 20000
263 | 21000
264 | 50001
265 | 100000
266 | 120000
267 | 200000
268 | 210000
269 | 500001
270 | 1000000
271 | 1001000
272 | 1200000
273 | 2000000
274 | 2100000
275 | 5000001
276 | 10000000
277 | 10001000
278 | 12000000
279 | 20000000
280 | 50000001
281 | 100000000
282 | 100001000
283 | 120000000
284 | 200000000
285 | 500000001
286 | 1000000000
287 | 1000001000
288 | 1200000000
289 | 2000000000
290 | 5000000001
291 | 10000000000
292 | 10000001000
293 | 12000000000
294 | 20000000000
295 | 50000000001
296 | 100000000000
297 | 100000001000
298 | 120000000000
299 | 200000000000
300 | 500000000001
301 | 


--------------------------------------------------------------------------------
/src/number_data/random-tst.txt:
--------------------------------------------------------------------------------
   1 | 209
   2 | 220
   3 | 250
   4 | 254
   5 | 263
   6 | 266
   7 | 276
   8 | 303
   9 | 310
  10 | 317
  11 | 322
  12 | 364
  13 | 386
  14 | 405
  15 | 414
  16 | 424
  17 | 429
  18 | 489
  19 | 505
  20 | 520
  21 | 523
  22 | 525
  23 | 554
  24 | 624
  25 | 627
  26 | 640
  27 | 665
  28 | 680
  29 | 704
  30 | 715
  31 | 723
  32 | 741
  33 | 742
  34 | 775
  35 | 776
  36 | 845
  37 | 847
  38 | 851
  39 | 868
  40 | 898
  41 | 921
  42 | 927
  43 | 972
  44 | 973
  45 | 984
  46 | 986
  47 | 994
  48 | 1038
  49 | 1055
  50 | 1077
  51 | 1079
  52 | 1083
  53 | 1090
  54 | 1123
  55 | 1137
  56 | 1161
  57 | 1184
  58 | 1186
  59 | 1235
  60 | 1257
  61 | 1258
  62 | 1285
  63 | 1302
  64 | 1307
  65 | 1311
  66 | 1358
  67 | 1369
  68 | 1372
  69 | 1383
  70 | 1391
  71 | 1418
  72 | 1441
  73 | 1442
  74 | 1447
  75 | 1476
  76 | 1478
  77 | 1509
  78 | 1535
  79 | 1548
  80 | 1550
  81 | 1571
  82 | 1581
  83 | 1593
  84 | 1615
  85 | 1623
  86 | 1639
  87 | 1660
  88 | 1686
  89 | 1688
  90 | 1717
  91 | 1735
  92 | 1782
  93 | 1813
  94 | 1815
  95 | 1824
  96 | 1831
  97 | 1875
  98 | 1881
  99 | 1924
 100 | 1931
 101 | 1949
 102 | 1951
 103 | 1966
 104 | 1970
 105 | 1984
 106 | 1990
 107 | 1992
 108 | 2012
 109 | 2013
 110 | 2024
 111 | 2040
 112 | 2058
 113 | 2062
 114 | 2064
 115 | 2067
 116 | 2075
 117 | 2116
 118 | 2130
 119 | 2135
 120 | 2171
 121 | 2197
 122 | 2200
 123 | 2215
 124 | 2220
 125 | 2226
 126 | 2246
 127 | 2259
 128 | 2277
 129 | 2294
 130 | 2303
 131 | 2318
 132 | 2342
 133 | 2347
 134 | 2349
 135 | 2355
 136 | 2364
 137 | 2413
 138 | 2419
 139 | 2420
 140 | 2433
 141 | 2441
 142 | 2445
 143 | 2451
 144 | 2468
 145 | 2488
 146 | 2498
 147 | 2499
 148 | 2500
 149 | 2502
 150 | 2514
 151 | 2523
 152 | 2524
 153 | 2557
 154 | 2568
 155 | 2598
 156 | 2609
 157 | 2612
 158 | 2629
 159 | 2685
 160 | 2697
 161 | 2718
 162 | 2724
 163 | 2734
 164 | 2739
 165 | 2760
 166 | 2763
 167 | 2779
 168 | 2796
 169 | 2797
 170 | 2809
 171 | 2818
 172 | 2828
 173 | 2839
 174 | 2842
 175 | 2850
 176 | 2857
 177 | 2864
 178 | 2916
 179 | 2923
 180 | 2984
 181 | 2987
 182 | 2991
 183 | 2994
 184 | 3021
 185 | 3025
 186 | 3026
 187 | 3054
 188 | 3070
 189 | 3080
 190 | 3086
 191 | 3098
 192 | 3114
 193 | 3121
 194 | 3130
 195 | 3136
 196 | 3137
 197 | 3157
 198 | 3175
 199 | 3182
 200 | 3200
 201 | 3233
 202 | 3245
 203 | 3250
 204 | 3270
 205 | 3298
 206 | 3303
 207 | 3330
 208 | 3341
 209 | 3347
 210 | 3368
 211 | 3392
 212 | 3394
 213 | 3398
 214 | 3400
 215 | 3427
 216 | 3435
 217 | 3441
 218 | 3449
 219 | 3474
 220 | 3477
 221 | 3497
 222 | 3501
 223 | 3525
 224 | 3526
 225 | 3551
 226 | 3570
 227 | 3576
 228 | 3597
 229 | 3612
 230 | 3630
 231 | 3636
 232 | 3639
 233 | 3649
 234 | 3651
 235 | 3675
 236 | 3692
 237 | 3719
 238 | 3742
 239 | 3773
 240 | 3785
 241 | 3790
 242 | 3850
 243 | 3870
 244 | 3873
 245 | 3875
 246 | 3885
 247 | 3910
 248 | 3926
 249 | 3927
 250 | 3928
 251 | 3941
 252 | 3943
 253 | 3945
 254 | 3950
 255 | 3961
 256 | 3971
 257 | 3990
 258 | 3992
 259 | 3996
 260 | 4010
 261 | 4013
 262 | 4018
 263 | 4024
 264 | 4032
 265 | 4047
 266 | 4065
 267 | 4069
 268 | 4079
 269 | 4089
 270 | 4097
 271 | 4114
 272 | 4125
 273 | 4127
 274 | 4148
 275 | 4155
 276 | 4173
 277 | 4180
 278 | 4206
 279 | 4249
 280 | 4256
 281 | 4284
 282 | 4298
 283 | 4303
 284 | 4305
 285 | 4345
 286 | 4354
 287 | 4409
 288 | 4417
 289 | 4433
 290 | 4437
 291 | 4470
 292 | 4474
 293 | 4486
 294 | 4494
 295 | 4527
 296 | 4538
 297 | 4544
 298 | 4572
 299 | 4629
 300 | 4630
 301 | 4634
 302 | 4647
 303 | 4652
 304 | 4654
 305 | 4658
 306 | 4680
 307 | 4699
 308 | 4747
 309 | 4748
 310 | 4773
 311 | 4791
 312 | 4852
 313 | 4863
 314 | 4884
 315 | 4907
 316 | 4927
 317 | 4943
 318 | 4953
 319 | 5027
 320 | 5032
 321 | 5037
 322 | 5080
 323 | 5095
 324 | 5108
 325 | 5134
 326 | 5163
 327 | 5168
 328 | 5186
 329 | 5210
 330 | 5236
 331 | 5237
 332 | 5265
 333 | 5273
 334 | 5283
 335 | 5330
 336 | 5351
 337 | 5362
 338 | 5396
 339 | 5438
 340 | 5446
 341 | 5465
 342 | 5495
 343 | 5511
 344 | 5526
 345 | 5534
 346 | 5556
 347 | 5567
 348 | 5611
 349 | 5639
 350 | 5642
 351 | 5725
 352 | 5738
 353 | 5751
 354 | 5774
 355 | 5777
 356 | 5786
 357 | 5813
 358 | 5837
 359 | 5864
 360 | 5879
 361 | 5885
 362 | 5889
 363 | 5898
 364 | 5921
 365 | 5924
 366 | 5946
 367 | 5955
 368 | 5959
 369 | 5968
 370 | 5976
 371 | 5981
 372 | 6021
 373 | 6047
 374 | 6049
 375 | 6080
 376 | 6158
 377 | 6162
 378 | 6170
 379 | 6176
 380 | 6206
 381 | 6214
 382 | 6220
 383 | 6243
 384 | 6253
 385 | 6261
 386 | 6284
 387 | 6307
 388 | 6322
 389 | 6330
 390 | 6338
 391 | 6367
 392 | 6413
 393 | 6430
 394 | 6434
 395 | 6437
 396 | 6470
 397 | 6492
 398 | 6499
 399 | 6504
 400 | 6512
 401 | 6660
 402 | 6670
 403 | 6680
 404 | 6699
 405 | 6710
 406 | 6737
 407 | 6741
 408 | 6751
 409 | 6776
 410 | 6779
 411 | 6802
 412 | 6819
 413 | 6890
 414 | 6892
 415 | 6969
 416 | 6970
 417 | 7040
 418 | 7045
 419 | 7052
 420 | 7063
 421 | 7065
 422 | 7088
 423 | 7128
 424 | 7129
 425 | 7133
 426 | 7155
 427 | 7164
 428 | 7166
 429 | 7181
 430 | 7210
 431 | 7219
 432 | 7234
 433 | 7236
 434 | 7256
 435 | 7266
 436 | 7270
 437 | 7303
 438 | 7364
 439 | 7370
 440 | 7378
 441 | 7499
 442 | 7593
 443 | 7629
 444 | 7633
 445 | 7640
 446 | 7675
 447 | 7709
 448 | 7753
 449 | 7791
 450 | 7792
 451 | 7812
 452 | 7838
 453 | 7860
 454 | 7890
 455 | 7972
 456 | 8014
 457 | 8025
 458 | 8096
 459 | 8106
 460 | 8123
 461 | 8154
 462 | 8159
 463 | 8200
 464 | 8228
 465 | 8343
 466 | 8381
 467 | 8429
 468 | 8490
 469 | 8515
 470 | 8526
 471 | 8560
 472 | 8568
 473 | 8579
 474 | 8658
 475 | 8668
 476 | 8672
 477 | 8688
 478 | 8710
 479 | 8731
 480 | 8739
 481 | 8752
 482 | 8771
 483 | 8790
 484 | 8833
 485 | 8900
 486 | 8917
 487 | 8929
 488 | 9002
 489 | 9035
 490 | 9043
 491 | 9067
 492 | 9078
 493 | 9122
 494 | 9138
 495 | 9144
 496 | 9183
 497 | 9199
 498 | 9211
 499 | 9235
 500 | 9240
 501 | 9257
 502 | 9330
 503 | 9385
 504 | 9390
 505 | 9450
 506 | 9512
 507 | 9523
 508 | 9530
 509 | 9535
 510 | 9564
 511 | 9596
 512 | 9601
 513 | 9602
 514 | 9603
 515 | 9626
 516 | 9655
 517 | 9691
 518 | 9695
 519 | 9772
 520 | 9780
 521 | 9808
 522 | 9849
 523 | 9881
 524 | 9911
 525 | 9923
 526 | 9946
 527 | 9970
 528 | 9986
 529 | 10009
 530 | 10019
 531 | 10168
 532 | 10178
 533 | 10180
 534 | 10190
 535 | 10290
 536 | 10348
 537 | 10470
 538 | 10520
 539 | 10525
 540 | 10535
 541 | 10545
 542 | 10627
 543 | 10675
 544 | 10715
 545 | 10757
 546 | 10772
 547 | 10786
 548 | 10896
 549 | 10940
 550 | 10970
 551 | 11000
 552 | 11101
 553 | 11120
 554 | 11132
 555 | 11192
 556 | 11201
 557 | 11209
 558 | 11265
 559 | 11337
 560 | 11392
 561 | 11549
 562 | 11557
 563 | 11567
 564 | 11736
 565 | 11767
 566 | 11807
 567 | 11814
 568 | 11866
 569 | 11881
 570 | 11913
 571 | 12073
 572 | 12098
 573 | 12111
 574 | 12137
 575 | 12291
 576 | 12370
 577 | 12376
 578 | 12397
 579 | 12435
 580 | 12439
 581 | 12443
 582 | 12511
 583 | 12520
 584 | 12567
 585 | 12575
 586 | 12615
 587 | 12700
 588 | 12710
 589 | 12726
 590 | 12729
 591 | 12814
 592 | 12822
 593 | 12883
 594 | 12890
 595 | 12910
 596 | 12915
 597 | 12980
 598 | 13069
 599 | 13075
 600 | 13127
 601 | 13193
 602 | 13209
 603 | 13386
 604 | 13390
 605 | 13393
 606 | 13511
 607 | 13586
 608 | 13607
 609 | 13625
 610 | 13630
 611 | 13647
 612 | 13656
 613 | 13763
 614 | 13810
 615 | 13910
 616 | 13979
 617 | 13991
 618 | 14073
 619 | 14096
 620 | 14111
 621 | 14170
 622 | 14210
 623 | 14259
 624 | 14306
 625 | 14350
 626 | 14351
 627 | 14360
 628 | 14479
 629 | 14587
 630 | 14613
 631 | 14736
 632 | 14745
 633 | 14797
 634 | 14810
 635 | 14822
 636 | 14824
 637 | 14830
 638 | 15020
 639 | 15068
 640 | 15118
 641 | 15197
 642 | 15230
 643 | 15270
 644 | 15310
 645 | 15404
 646 | 15510
 647 | 15603
 648 | 15680
 649 | 15700
 650 | 15721
 651 | 15820
 652 | 15928
 653 | 15990
 654 | 16012
 655 | 16018
 656 | 16030
 657 | 16073
 658 | 16123
 659 | 16243
 660 | 16275
 661 | 16501
 662 | 16690
 663 | 16710
 664 | 16765
 665 | 16870
 666 | 16958
 667 | 17014
 668 | 17030
 669 | 17138
 670 | 17190
 671 | 17272
 672 | 17409
 673 | 17424
 674 | 17430
 675 | 17477
 676 | 17678
 677 | 17684
 678 | 17687
 679 | 17820
 680 | 17840
 681 | 17898
 682 | 18097
 683 | 18219
 684 | 18284
 685 | 18349
 686 | 18525
 687 | 18634
 688 | 18680
 689 | 19042
 690 | 19070
 691 | 19084
 692 | 19120
 693 | 19151
 694 | 19250
 695 | 19389
 696 | 19679
 697 | 19932
 698 | 20080
 699 | 20100
 700 | 20133
 701 | 20321
 702 | 20440
 703 | 20801
 704 | 20819
 705 | 20969
 706 | 21190
 707 | 21300
 708 | 21340
 709 | 21350
 710 | 21360
 711 | 21490
 712 | 21531
 713 | 21640
 714 | 21728
 715 | 21796
 716 | 21831
 717 | 21860
 718 | 22040
 719 | 22208
 720 | 22282
 721 | 22410
 722 | 22566
 723 | 22850
 724 | 23060
 725 | 23196
 726 | 23380
 727 | 24190
 728 | 24350
 729 | 24360
 730 | 24380
 731 | 24475
 732 | 24480
 733 | 24491
 734 | 24521
 735 | 24644
 736 | 24695
 737 | 24747
 738 | 24760
 739 | 24945
 740 | 25000
 741 | 25510
 742 | 25754
 743 | 25870
 744 | 26200
 745 | 26300
 746 | 26410
 747 | 26447
 748 | 26472
 749 | 26510
 750 | 27000
 751 | 27017
 752 | 27400
 753 | 27430
 754 | 27531
 755 | 27600
 756 | 27740
 757 | 27870
 758 | 28200
 759 | 28544
 760 | 28570
 761 | 28618
 762 | 28629
 763 | 28716
 764 | 28753
 765 | 28850
 766 | 29027
 767 | 29040
 768 | 29045
 769 | 29129
 770 | 29190
 771 | 29404
 772 | 29600
 773 | 29970
 774 | 30030
 775 | 30050
 776 | 30190
 777 | 30375
 778 | 30500
 779 | 30700
 780 | 30778
 781 | 30790
 782 | 30838
 783 | 31310
 784 | 31379
 785 | 31480
 786 | 31547
 787 | 31698
 788 | 31986
 789 | 32600
 790 | 32991
 791 | 33417
 792 | 33603
 793 | 34751
 794 | 34900
 795 | 34980
 796 | 35059
 797 | 35101
 798 | 35190
 799 | 35496
 800 | 35500
 801 | 35707
 802 | 35761
 803 | 36320
 804 | 36496
 805 | 36893
 806 | 37200
 807 | 37520
 808 | 37780
 809 | 38370
 810 | 38500
 811 | 38600
 812 | 39200
 813 | 39575
 814 | 39580
 815 | 40324
 816 | 40560
 817 | 41222
 818 | 41300
 819 | 41485
 820 | 41973
 821 | 43110
 822 | 43229
 823 | 44097
 824 | 44550
 825 | 44666
 826 | 45078
 827 | 45085
 828 | 45090
 829 | 45600
 830 | 46170
 831 | 46772
 832 | 47060
 833 | 48280
 834 | 48500
 835 | 48518
 836 | 49400
 837 | 49430
 838 | 50100
 839 | 50167
 840 | 50359
 841 | 50800
 842 | 51386
 843 | 51390
 844 | 51531
 845 | 51800
 846 | 52092
 847 | 52100
 848 | 52590
 849 | 52663
 850 | 52670
 851 | 52738
 852 | 52990
 853 | 53025
 854 | 53450
 855 | 53600
 856 | 53620
 857 | 54070
 858 | 54505
 859 | 56160
 860 | 56165
 861 | 57100
 862 | 57730
 863 | 58825
 864 | 58900
 865 | 60151
 866 | 60500
 867 | 61306
 868 | 61710
 869 | 62250
 870 | 62270
 871 | 62400
 872 | 63310
 873 | 63960
 874 | 64235
 875 | 64760
 876 | 65200
 877 | 65654
 878 | 66240
 879 | 66400
 880 | 66600
 881 | 68670
 882 | 68920
 883 | 71000
 884 | 71400
 885 | 72630
 886 | 72700
 887 | 72860
 888 | 73700
 889 | 75841
 890 | 76108
 891 | 77122
 892 | 79220
 893 | 79400
 894 | 79670
 895 | 81110
 896 | 83574
 897 | 84100
 898 | 84500
 899 | 86090
 900 | 87078
 901 | 87300
 902 | 87860
 903 | 88340
 904 | 88880
 905 | 89154
 906 | 89950
 907 | 92600
 908 | 96220
 909 | 96870
 910 | 97503
 911 | 99600
 912 | 101000
 913 | 104000
 914 | 105100
 915 | 105570
 916 | 106900
 917 | 108290
 918 | 108400
 919 | 110840
 920 | 110975
 921 | 113773
 922 | 115000
 923 | 116500
 924 | 119200
 925 | 124720
 926 | 127000
 927 | 127780
 928 | 128200
 929 | 128966
 930 | 138900
 931 | 140900
 932 | 141000
 933 | 141228
 934 | 144000
 935 | 145000
 936 | 145061
 937 | 147245
 938 | 147562
 939 | 148450
 940 | 152218
 941 | 154990
 942 | 158775
 943 | 159940
 944 | 161000
 945 | 161300
 946 | 163500
 947 | 165500
 948 | 170559
 949 | 176000
 950 | 178000
 951 | 184000
 952 | 188800
 953 | 196100
 954 | 204400
 955 | 204880
 956 | 210900
 957 | 216616
 958 | 220930
 959 | 238000
 960 | 239740
 961 | 257226
 962 | 265000
 963 | 271590
 964 | 273200
 965 | 285810
 966 | 309620
 967 | 315612
 968 | 320959
 969 | 321500
 970 | 341400
 971 | 348697
 972 | 350260
 973 | 359030
 974 | 360000
 975 | 360600
 976 | 376500
 977 | 378265
 978 | 383070
 979 | 394740
 980 | 410000
 981 | 446000
 982 | 471750
 983 | 497384
 984 | 510600
 985 | 560000
 986 | 590000
 987 | 608400
 988 | 696900
 989 | 704000
 990 | 1448374
 991 | 2256800
 992 | 3275000
 993 | 3980000
 994 | 4500000
 995 | 5066940
 996 | 5166299
 997 | 7113500
 998 | 9842447
 999 | 13020696
1000 | 70477170
1001 | 


--------------------------------------------------------------------------------
/src/ru/README.md:
--------------------------------------------------------------------------------
1 | # Russian covering grammar definitions
2 | 
3 | This directory defines a Russian text normalization covering grammar. The
4 | primary entry-point is the FST `VERBALIZER`, defined in
5 | `verbalizer/verbalizer.grm` and compiled in the FST archive
6 | `verbalizer/verbalizer.far`.
7 | 


--------------------------------------------------------------------------------
/src/ru/classifier/cyrillic.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | export kRussianLowerAlpha = Optimize[  # The 33 lowercase Russian letters.
16 |     "а" | "б" | "в" | "г" | "д" | "е" | "ё" | "ж" | "з" | "и" | "й" |
17 |     "к" | "л" | "м" | "н" | "о" | "п" | "р" | "с" | "т" | "у" | "ф" |
18 |     "х" | "ц" | "ч" | "ш" | "щ" | "ъ" | "ы" | "ь" | "э" | "ю" | "я" ];
19 | 
20 | export kRussianUpperAlpha = Optimize[  # The 33 uppercase Russian letters.
21 |     "А" | "Б" | "В" | "Г" | "Д" | "Е" | "Ё" | "Ж" | "З" | "И" | "Й" |
22 |     "К" | "Л" | "М" | "Н" | "О" | "П" | "Р" | "С" | "Т" | "У" | "Ф" |
23 |     "Х" | "Ц" | "Ч" | "Ш" | "Щ" | "Ъ" | "Ы" | "Ь" | "Э" | "Ю" | "Я" ];
24 | 
25 | export kRussianLowerAlphaStressed = Optimize[  # Ten accented lowercase vowels.
26 |     "а́" | "е́" | "ё́" | "и́" | "о́" | "у́" | "ы́" | "э́" | "ю́" | "я́" ];
27 | 
28 | export kRussianUpperAlphaStressed = Optimize[  # Ten accented uppercase vowels.
29 |     "А́" | "Е́" | "Ё́" | "И́" | "О́" | "У́" | "Ы́" | "Э́" | "Ю́" | "Я́" ];
30 | 
31 | export kRussianRewriteStress = Optimize[  # Accented vowel -> vowel plus "'".
32 |     ("А́" : "А'") | ("Е́" : "Е'") | ("Ё́" : "Ё'") | ("И́" : "И'") |
33 |     ("О́" : "О'") | ("У́" : "У'") | ("Ы́" : "Ы'") | ("Э́" : "Э'") |
34 |     ("Ю́" : "Ю'") | ("Я́" : "Я'") |
35 |     ("а́" : "а'") | ("е́" : "е'") | ("ё́" : "ё'") | ("и́" : "и'") |
36 |     ("о́" : "о'") | ("у́" : "у'") | ("ы́" : "ы'") | ("э́" : "э'") |
37 |     ("ю́" : "ю'") | ("я́" : "я'")
38 | ];
39 | 
40 | export kRussianRemoveStress = Optimize[  # Accented vowel -> the bare vowel.
41 |     ("А́" : "А") | ("Е́" : "Е") | ("Ё́" : "Ё") | ("И́" : "И") | ("О́" : "О") |
42 |     ("У́" : "У") | ("Ы́" : "Ы") | ("Э́" : "Э") | ("Ю́" : "Ю") | ("Я́" : "Я") |
43 |     ("а́" : "а") | ("е́" : "е") | ("ё́" : "ё") | ("и́" : "и") | ("о́" : "о") |
44 |     ("у́" : "у") | ("ы́" : "ы") | ("э́" : "э") | ("ю́" : "ю") | ("я́" : "я")
45 | ];
46 | 
47 | # Pre-reform characters, just in case: lower- and uppercase yat.
48 | export kRussianPreReform = Optimize[
49 |     "ѣ" | "Ѣ"   # http://en.wikipedia.org/wiki/Yat
50 | ];
51 | 
52 | export kCyrillicAlphaStressed = Optimize[  # Accented vowels of either case.
53 |   kRussianLowerAlphaStressed | kRussianUpperAlphaStressed
54 | ];
55 | 
56 | export kCyrillicAlpha = Optimize[  # Unaccented letters of both cases, plus yat.
57 |     kRussianLowerAlpha | kRussianUpperAlpha | kRussianPreReform
58 | ];
59 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/cardinals-lex.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # AUTOMATICALLY GENERATED: DO NOT EDIT.
 16 | import 'util/byte.grm' as b;
 17 | 
 18 | # Utilities for insertion and deletion.
 19 | 
 20 | func I[expr] {
 21 |   return "" : expr;  # Inserts expr: the empty string is rewritten to expr.
 22 | }
 23 | 
 24 | func D[expr] {
 25 |   return expr : "";  # Deletes expr: expr is rewritten to the empty string.
 26 | }
 27 | 
 28 | # Powers of base 10: the marker [En] stands for 10^n, for n from 1 to 15.
 29 | export POWERS =
 30 |     "[E15]"
 31 |   | "[E14]"
 32 |   | "[E13]"
 33 |   | "[E12]"
 34 |   | "[E11]"
 35 |   | "[E10]"
 36 |   | "[E9]"
 37 |   | "[E8]"
 38 |   | "[E7]"
 39 |   | "[E6]"
 40 |   | "[E5]"
 41 |   | "[E4]"
 42 |   | "[E3]"
 43 |   | "[E2]"
 44 |   | "[E1]"
 45 | ;
 46 | 
 47 | export SIGMA = b.kBytes | POWERS;  # One symbol: a b.kBytes byte or a power marker.
 48 | 
 49 | export SIGMA_STAR = SIGMA*;
 50 | 
 51 | export SIGMA_PLUS = SIGMA+;
 52 | 
 53 | ################################################################################
 54 | # BEGIN LANGUAGE SPECIFIC DATA
 55 | revaluations =  # Power remapping applied by REVALUE: 4->1, 5->2, 7->1, 8->2.
 56 |     ("[E4]" : "[E1]")
 57 |   | ("[E5]" : "[E2]")
 58 |   | ("[E7]" : "[E1]")
 59 |   | ("[E8]" : "[E2]")
 60 | ;
 61 | 
 62 | Ms = "[E3]" | "[E6]" | "[E9]";  # Powers with their own word in lexset1 (10^3, 10^6, 10^9).
 63 | 
 64 | 
 65 | func Zero[expr] {
 66 |   return expr : ("");  # Deletes expr; the same mapping as D[expr].
 67 | }
 68 | 
 69 | space = " ";
 70 | 
 71 | lexset3 = Optimize[
 72 |     ("1[E1]+1" : "одиннадцати")
 73 |   | ("1[E1]+1" : "одиннадцать")
 74 |   | ("1[E1]+1" : "одиннадцатью")
 75 |   | ("1[E1]+2" : "двенадцати")
 76 |   | ("1[E1]+2" : "двенадцать")
 77 |   | ("1[E1]+2" : "двенадцатью")
 78 |   | ("1[E1]+3" : "тринадцати")
 79 |   | ("1[E1]+3" : "тринадцать")
 80 |   | ("1[E1]+3" : "тринадцатью")
 81 |   | ("1[E1]+4" : "четырнадцати")
 82 |   | ("1[E1]+4" : "четырнадцать")
 83 |   | ("1[E1]+4" : "четырнадцатью")
 84 |   | ("1[E1]+5" : "пятнадцати")
 85 |   | ("1[E1]+5" : "пятнадцать")
 86 |   | ("1[E1]+5" : "пятнадцатью")
 87 |   | ("1[E1]+6" : "шестнадцати")
 88 |   | ("1[E1]+6" : "шестнадцать")
 89 |   | ("1[E1]+6" : "шестнадцатью")
 90 |   | ("1[E1]+7" : "семнадцати")
 91 |   | ("1[E1]+7" : "семнадцать")
 92 |   | ("1[E1]+7" : "семнадцатью")
 93 |   | ("1[E1]+8" : "восемнадцати")
 94 |   | ("1[E1]+8" : "восемнадцать")
 95 |   | ("1[E1]+8" : "восемнадцатью")
 96 |   | ("1[E1]+9" : "девятнадцати")
 97 |   | ("1[E1]+9" : "девятнадцать")
 98 |   | ("1[E1]+9" : "девятнадцатью")]
 99 | ;
100 | 
101 | lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];  # 11-19; each word is followed by a space.
102 | 
103 | lexset2 = Optimize[
104 |     ("1[E1]" : "десяти")
105 |   | ("1[E1]" : "десять")
106 |   | ("1[E1]" : "десятью")
107 |   | ("1[E2]" : "ста")
108 |   | ("1[E2]" : "сто")
109 |   | ("2[E1]" : "двадцати")
110 |   | ("2[E1]" : "двадцать")
111 |   | ("2[E1]" : "двадцатью")
112 |   | ("2[E2]" : "двести")
113 |   | ("2[E2]" : "двумстам")
114 |   | ("2[E2]" : "двумястами")
115 |   | ("2[E2]" : "двухсот")
116 |   | ("2[E2]" : "двухстах")
117 |   | ("3[E1]" : "тридцати")
118 |   | ("3[E1]" : "тридцать")
119 |   | ("3[E1]" : "тридцатью")
120 |   | ("3[E2]" : "тремстам")
121 |   | ("3[E2]" : "тремястами")
122 |   | ("3[E2]" : "трехсот")
123 |   | ("3[E2]" : "трехстах")
124 |   | ("3[E2]" : "триста")
125 |   | ("4[E1]" : "сорок")
126 |   | ("4[E1]" : "сорока")
127 |   | ("4[E2]" : "четыремстам")
128 |   | ("4[E2]" : "четыреста")
129 |   | ("4[E2]" : "четырехсот")
130 |   | ("4[E2]" : "четырехстах")
131 |   | ("4[E2]" : "четырьмястами")
132 |   | ("5[E1]" : "пятидесяти")
133 |   | ("5[E1]" : "пятьдесят")
134 |   | ("5[E1]" : "пятьюдесятью")
135 |   | ("5[E2]" : "пятисот")
136 |   | ("5[E2]" : "пятистам")
137 |   | ("5[E2]" : "пятистах")
138 |   | ("5[E2]" : "пятьсот")
139 |   | ("5[E2]" : "пятьюстами")
140 |   | ("6[E1]" : "шестидесяти")
141 |   | ("6[E1]" : "шестьдесят")
142 |   | ("6[E1]" : "шестьюдесятью")
143 |   | ("6[E2]" : "шестисот")
144 |   | ("6[E2]" : "шестистам")
145 |   | ("6[E2]" : "шестистах")
146 |   | ("6[E2]" : "шестьсот")
147 |   | ("6[E2]" : "шестьюстами")
148 |   | ("7[E1]" : "семидесяти")
149 |   | ("7[E1]" : "семьдесят")
150 |   | ("7[E1]" : "семьюдесятью")
151 |   | ("7[E2]" : "семисот")
152 |   | ("7[E2]" : "семистам")
153 |   | ("7[E2]" : "семистах")
154 |   | ("7[E2]" : "семьсот")
155 |   | ("7[E2]" : "семьюстами")
156 |   | ("8[E1]" : "восемьдесят")
157 |   | ("8[E1]" : "восьмидесяти")
158 |   | ("8[E1]" : "восьмьюдесятью")
159 |   | ("8[E2]" : "восемьсот")
160 |   | ("8[E2]" : "восемьюстами")
161 |   | ("8[E2]" : "восьмисот")
162 |   | ("8[E2]" : "восьмистам")
163 |   | ("8[E2]" : "восьмистах")
164 |   | ("8[E2]" : "восьмьюстами")
165 |   | ("9[E1]" : "девяноста")
166 |   | ("9[E1]" : "девяносто")
167 |   | ("9[E2]" : "девятисот")
168 |   | ("9[E2]" : "девятистам")
169 |   | ("9[E2]" : "девятистах")
170 |   | ("9[E2]" : "девятьсот")
171 |   | ("9[E2]" : "девятьюстами")]
172 | ;
173 | 
174 | lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];  # Tens and hundreds; each word is followed by a space.
175 | 
176 | lexset1 = Optimize[
177 |     ("+" : "")
178 |   | ("1" : "один")
179 |   | ("1" : "одна")
180 |   | ("1" : "одни")
181 |   | ("1" : "одним")
182 |   | ("1" : "одними")
183 |   | ("1" : "одних")
184 |   | ("1" : "одно")
185 |   | ("1" : "одного")
186 |   | ("1" : "одной")
187 |   | ("1" : "одном")
188 |   | ("1" : "одному")
189 |   | ("1" : "одною")
190 |   | ("1" : "одну")
191 |   | ("2" : "два")
192 |   | ("2" : "две")
193 |   | ("2" : "двум")
194 |   | ("2" : "двумя")
195 |   | ("2" : "двух")
196 |   | ("3" : "трем")
197 |   | ("3" : "тремя")
198 |   | ("3" : "трех")
199 |   | ("3" : "три")
200 |   | ("4" : "четыре")
201 |   | ("4" : "четырем")
202 |   | ("4" : "четырех")
203 |   | ("4" : "четырьмя")
204 |   | ("5" : "пяти")
205 |   | ("5" : "пять")
206 |   | ("5" : "пятью")
207 |   | ("6" : "шести")
208 |   | ("6" : "шесть")
209 |   | ("6" : "шестью")
210 |   | ("7" : "семи")
211 |   | ("7" : "семь")
212 |   | ("7" : "семью")
213 |   | ("8" : "восемь")
214 |   | ("8" : "восьми")
215 |   | ("8" : "восьмью")
216 |   | ("9" : "девяти")
217 |   | ("9" : "девять")
218 |   | ("9" : "девятью")
219 |   | ("[E3]" : "тысяч")
220 |   | ("[E3]" : "тысяча")
221 |   | ("[E3]" : "тысячам")
222 |   | ("[E3]" : "тысячами")
223 |   | ("[E3]" : "тысячах")
224 |   | ("[E3]" : "тысяче")
225 |   | ("[E3]" : "тысячей")
226 |   | ("[E3]" : "тысячи")
227 |   | ("[E3]" : "тысячу")
228 |   | ("[E3]" : "тысячью")
229 |   | ("[E6]" : "миллион")
230 |   | ("[E6]" : "миллиона")
231 |   | ("[E6]" : "миллионам")
232 |   | ("[E6]" : "миллионами")
233 |   | ("[E6]" : "миллионах")
234 |   | ("[E6]" : "миллионе")
235 |   | ("[E6]" : "миллионов")
236 |   | ("[E6]" : "миллионом")
237 |   | ("[E6]" : "миллиону")
238 |   | ("[E6]" : "миллионы")
239 |   | ("[E9]" : "миллиард")
240 |   | ("[E9]" : "миллиарда")
241 |   | ("[E9]" : "миллиардам")
242 |   | ("[E9]" : "миллиардами")
243 |   | ("[E9]" : "миллиардах")
244 |   | ("[E9]" : "миллиарде")
245 |   | ("[E9]" : "миллиардов")
246 |   | ("[E9]" : "миллиардом")
247 |   | ("[E9]" : "миллиарду")
248 |   | ("[E9]" : "миллиарды")
249 |   | ("|0|" : "ноле")
250 |   | ("|0|" : "нолем")
251 |   | ("|0|" : "ноль")
252 |   | ("|0|" : "нолю")
253 |   | ("|0|" : "ноля")
254 |   | ("|0|" : "нуле")
255 |   | ("|0|" : "нулем")
256 |   | ("|0|" : "нуль")
257 |   | ("|0|" : "нулю")
258 |   | ("|0|" : "нуля")]
259 | ;
260 | 
261 | lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];  # Units, zero and the M words; a "+" becomes a space.
262 | 
263 | export LEX = Optimize[lex3 @ lex2 @ lex1];  # lex3 runs first, so the longest patterns are rewritten first.
264 | 
265 | export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";  # Same set as Ms.
266 | 
267 | # END LANGUAGE SPECIFIC DATA
268 | ################################################################################
269 | # Inserts the boundary marker "%" right after each M ([E3], [E6], [E9]).
270 | export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
271 | 
272 | # Deletes all powers and "+".
273 | export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
274 | 
275 | # Deletes leading zeros (a "0" plus its power marker and "+") at the start of
276 | # a number, so that "0003" does not get treated as an ordinary number.
277 | export DELETE_INITIAL_ZEROS =
278 |   CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
279 | ;
280 | 
281 | NonMs = Optimize[POWERS - Ms];  # Every power marker other than the Ms.
282 | 
283 | # Deletes (usually) zeros before a non-M. E.g., +0[E1] should be deleted.
284 | export DELETE_INTERMEDIATE_ZEROS1 =
285 |   CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
286 | ;
287 | 
288 | # Deletes (usually) zeros before an M, if there is no non-zero element between
289 | # that and the previous boundary. Thus, if after the result of the rule above we
290 | # end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
291 | # zero.
292 | export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
293 |    CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]
294 |  @ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]
295 | ;
296 | 
297 | # Final clean up of stray zeros.
298 | export DELETE_REMAINING_ZEROS = Optimize[
299 |    CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
300 |  @ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
301 | ;
302 | 
303 | # Applies the revaluation map. For example in English, changes [E4] to [E1] as a
304 | # modifier of [E3].
305 | export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
306 | 
307 | # Deletes the various marks and powers in the input and output.
308 | export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
309 | 
310 | export CLEAN_SPACES = Optimize[  # Normalizes spaces in three composed steps.
311 |    CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]  # Collapse inner runs to one space.
312 |  @ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]  # Delete spaces at the start.
313 |  @ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]  # Delete spaces at the end.
314 | ;
315 | 
316 | d = b.kDigit;
317 | 
318 | # Germanic inversion rule: rewrites d[E1]+u as u+d[E1] for a units digit u.
319 | germanic =
320 |     (I["1+"] d "[E1]" D["+1"])
321 |   | (I["2+"] d "[E1]" D["+2"])
322 |   | (I["3+"] d "[E1]" D["+3"])
323 |   | (I["4+"] d "[E1]" D["+4"])
324 |   | (I["5+"] d "[E1]" D["+5"])
325 |   | (I["6+"] d "[E1]" D["+6"])
326 |   | (I["7+"] d "[E1]" D["+7"])
327 |   | (I["8+"] d "[E1]" D["+8"])
328 |   | (I["9+"] d "[E1]" D["+9"])
329 | ;
330 | 
331 | germanic_inversion =  # Not used by GERMANIC_INVERSION, which is SIGMA_STAR.
332 |   CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']  # Left-to-right, optional.
333 | ;
334 | 
335 | export GERMANIC_INVERSION = SIGMA_STAR;  # No inversion: any SIGMA string passes unchanged.
336 | export ORDINAL_RESTRICTION = SIGMA_STAR;  # No restriction: any SIGMA string passes unchanged.
337 | nondigits = b.kBytes - b.kDigit;
338 | export ORDINAL_SUFFIX = D[nondigits*];  # Deletes a (possibly empty) run of non-digits.
339 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/cardinals.tsv:
--------------------------------------------------------------------------------
  1 | 0	ноле
  2 | 0	ноль
  3 | 0	нолю
  4 | 0	ноля
  5 | 0	нолём
  6 | 0	нуле
  7 | 0	нуль
  8 | 0	нулю
  9 | 0	нуля
 10 | 0	нулём
 11 | 1	один
 12 | 1	одна
 13 | 1	одни
 14 | 1	одним
 15 | 1	одними
 16 | 1	одних
 17 | 1	одно
 18 | 1	одного
 19 | 1	одной
 20 | 1	одном
 21 | 1	одному
 22 | 1	одною
 23 | 1	раз
 24 | 1	одну
 25 | 2	два
 26 | 2	две
 27 | 2	двум
 28 | 2	двумя
 29 | 2	двух
 30 | 3	тремя
 31 | 3	три
 32 | 3	трём
 33 | 3	трёх
 34 | 4	четыре
 35 | 4	четырьмя
 36 | 4	четырём
 37 | 4	четырёх
 38 | 5	пяти
 39 | 5	пять
 40 | 5	пятью
 41 | 6	шести
 42 | 6	шесть
 43 | 6	шестью
 44 | 7	семи
 45 | 7	семь
 46 | 7	семью
 47 | 8	восемь
 48 | 8	восьми
 49 | 8	восьмью
 50 | 9	девяти
 51 | 9	девять
 52 | 9	девятью
 53 | 10	десяти
 54 | 10	десять
 55 | 10	десятью
 56 | 11	одиннадцати
 57 | 11	одиннадцать
 58 | 11	одиннадцатью
 59 | 12	двенадцати
 60 | 12	двенадцать
 61 | 12	двенадцатью
 62 | 13	тринадцати
 63 | 13	тринадцать
 64 | 13	тринадцатью
 65 | 14	четырнадцати
 66 | 14	четырнадцать
 67 | 14	четырнадцатью
 68 | 15	пятнадцати
 69 | 15	пятнадцать
 70 | 15	пятнадцатью
 71 | 16	шестнадцати
 72 | 16	шестнадцать
 73 | 16	шестнадцатью
 74 | 17	семнадцати
 75 | 17	семнадцать
 76 | 17	семнадцатью
 77 | 18	восемнадцати
 78 | 18	восемнадцать
 79 | 18	восемнадцатью
 80 | 19	девятнадцати
 81 | 19	девятнадцать
 82 | 19	девятнадцатью
 83 | 20	двадцати
 84 | 20	двадцать
 85 | 20	двадцатью
 86 | 30	тридцати
 87 | 30	тридцать
 88 | 30	тридцатью
 89 | 40	сорок
 90 | 40	сорока
 91 | 50	пятидесяти
 92 | 50	пятьдесят
 93 | 50	пятьюдесятью
 94 | 60	шестидесяти
 95 | 60	шестьдесят
 96 | 60	шестьюдесятью
 97 | 70	семидесяти
 98 | 70	семьдесят
 99 | 70	семьюдесятью
100 | 80	восемьдесят
101 | 80	восьмидесяти
102 | 80	восьмьюдесятью
103 | 90	девяноста
104 | 90	девяносто
105 | 100	ста
106 | 100	сто
107 | 200	двести
108 | 200	двумстам
109 | 200	двумястами
110 | 200	двухсот
111 | 200	двухстах
112 | 300	тремястами
113 | 300	трехсот
114 | 300	триста
115 | 300	трёмстам
116 | 300	трёхстах
117 | 400	четыреста
118 | 400	четырьмястами
119 | 400	четырёмстам
120 | 400	четырёхсот
121 | 400	четырёхстах
122 | 500	пятисот
123 | 500	пятистам
124 | 500	пятистах
125 | 500	пятьсот
126 | 500	пятьюстами
127 | 600	шестисот
128 | 600	шестистам
129 | 600	шестистах
130 | 600	шестьсот
131 | 600	шестьюстами
132 | 700	семисот
133 | 700	семистам
134 | 700	семистах
135 | 700	семьсот
136 | 700	семьюстами
137 | 800	восемьсот
138 | 800	восемьюстами
139 | 800	восьмисот
140 | 800	восьмистам
141 | 800	восьмистах
142 | 800	восьмьюстами
143 | 900	девятисот
144 | 900	девятистам
145 | 900	девятистах
146 | 900	девятьсот
147 | 900	девятьюстами
148 | 1000	тысяч
149 | 1000	тысяча
150 | 1000	тысячам
151 | 1000	тысячами
152 | 1000	тысячах
153 | 1000	тысяче
154 | 1000	тысячей
155 | 1000	тысячи
156 | 1000	тысячу
157 | 1000	тысячью
158 | 1000000	миллион
159 | 1000000	миллиона
160 | 1000000	миллионам
161 | 1000000	миллионами
162 | 1000000	миллионах
163 | 1000000	миллионе
164 | 1000000	миллионов
165 | 1000000	миллионом
166 | 1000000	миллиону
167 | 1000000	миллионы
168 | 1000000000	миллиард
169 | 1000000000	миллиарда
170 | 1000000000	миллиардам
171 | 1000000000	миллиардами
172 | 1000000000	миллиардах
173 | 1000000000	миллиарде
174 | 1000000000	миллиардов
175 | 1000000000	миллиардом
176 | 1000000000	миллиарду
177 | 1000000000	миллиарды
178 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/extra_numbers.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'ru/verbalizer/numbers.grm' as n;
17 | # Single digit: read via the cardinal grammar, or "0" as a zero-variant marker.
18 | digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@");
19 | # One or more digits read one at a time, with a space inserted between readings.
20 | export DIGITS  = digit (n.I[" "] digit)*;
21 | 
22 | # Various common factorizations: two- and three-digit chunks read as cardinals.
23 | 
24 | two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS;
25 | 
26 | three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS;
27 | # Chunkings by digit count (1+2, 2+2, 2+3, 2+2+2), a space inserted between chunks.
28 | mixed =
29 |    (digit n.I[" "] two_digits)
30 |  | (two_digits n.I[" "] two_digits)
31 |  | (two_digits n.I[" "] three_digits)
32 |  | (two_digits n.I[" "] two_digits n.I[" "] two_digits)
33 | ;
34 | # Optimized, exported form of the chunked readings above.
35 | export MIXED_NUMBERS = Optimize[mixed];
36 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/factorization.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'util/util.grm' as u;
17 | import 'ru/verbalizer/numbers.grm' as n;
18 | # Composes expr with one or more cardinal readings separated by single spaces.
19 | func ToNumberName[expr] {
20 |   number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*;
21 |   return Optimize[expr @ number_name_seq];
22 | }
23 | # Digit-grouping helpers; every group is followed by an inserted space.
24 | d = b.kDigit;
25 | # Inserts a space after a "0" that follows the start of the string or a space.
26 | leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*];
27 | 
28 | by_ones = d n.I[" "];
29 | by_twos = (d{2} @ leading_zero) n.I[" "];
30 | by_threes = (d{3} @ leading_zero) n.I[" "];
31 | # Any number of pairs, then a final group of three, two or one digit(s).
32 | groupings = by_twos* (by_threes | by_twos | by_ones);
33 | # Digit-by-digit split; u.CLEAN_SPACES is applied before ToNumberName.
34 | export FRACTIONAL_PART_UNGROUPED =
35 |   Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]]
36 | ;
37 | export FRACTIONAL_PART_GROUPED =  # Same pipeline, but split by the groupings above.
38 |   Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]]
39 | ;
40 | export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]];  # Digit string passed to ToNumberName with no splitting.
41 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/float.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'ru/verbalizer/factorization.grm' as f;
16 | import 'ru/verbalizer/lexical_map.grm' as l;
17 | import 'ru/verbalizer/numbers.grm' as n;
18 | # Local aliases for the three fractional-part strategies from factorization.grm.
19 | fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
20 | fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
21 | fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;
22 | # Strategy and decimal separator actually used by FLOAT below.
23 | __fractional_part__ = fractional_part_unparsed;
24 | __decimal_marker__ = ",";
25 | # Cardinal, "," rewritten to a space-padded marker, fraction; then the lexical map.
26 | export FLOAT = Optimize[
27 |  (n.CARDINAL_NUMBERS
28 |   (__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ")
29 |   __fractional_part__) @ l.LEXICAL_MAP]
30 | ;
31 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/g.fst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/src/ru/verbalizer/g.fst


--------------------------------------------------------------------------------
/src/ru/verbalizer/lexical_map.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | # Marker-to-word pairs loaded from the TSV lexicon.
17 | lexical_map = StringFile['ru/verbalizer/lexical_map.tsv'];
18 | 
19 | sigma_star = b.kBytes*;
20 | # Removes the literal "__NULL__" placeholder wherever it occurs.
21 | del_null = CDRewrite["__NULL__" : "", "", "", sigma_star];
22 | # Rewrites lexicon markers in any context, then strips "__NULL__".
23 | export LEXICAL_MAP = Optimize[
24 |   CDRewrite[lexical_map, "", "", sigma_star] @ del_null]
25 | ;
26 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/lexical_map.tsv:
--------------------------------------------------------------------------------
  1 | @@CONNECTOR_RANGE@@	до
  2 | @@CONNECTOR_RATIO@@	к
  3 | @@CONNECTOR_BY@@	на
  4 | @@CONNECTOR_CONSECUTIVE_YEAR@@	до
  5 | @@JANUARY@@	январь
  6 | @@JANUARY@@	январи
  7 | @@JANUARY@@	января
  8 | @@JANUARY@@	январей
  9 | @@JANUARY@@	январю
 10 | @@JANUARY@@	январям
 11 | @@JANUARY@@	январь
 12 | @@JANUARY@@	январи
 13 | @@JANUARY@@	январём
 14 | @@JANUARY@@	январями
 15 | @@JANUARY@@	январе
 16 | @@JANUARY@@	январях
 17 | @@FEBRUARY@@	февраль
 18 | @@FEBRUARY@@	феврали
 19 | @@FEBRUARY@@	февраля
 20 | @@FEBRUARY@@	февралей
 21 | @@FEBRUARY@@	февралю
 22 | @@FEBRUARY@@	февралям
 23 | @@FEBRUARY@@	февраль
 24 | @@FEBRUARY@@	феврали
 25 | @@FEBRUARY@@	февралём
 26 | @@FEBRUARY@@	февралями
 27 | @@FEBRUARY@@	феврале
 28 | @@FEBRUARY@@	февралях
 29 | @@MARCH@@	март
 30 | @@MARCH@@	марты
 31 | @@MARCH@@	марта
 32 | @@MARCH@@	мартов
 33 | @@MARCH@@	марту
 34 | @@MARCH@@	мартам
 35 | @@MARCH@@	март
 36 | @@MARCH@@	марты
 37 | @@MARCH@@	мартом
 38 | @@MARCH@@	мартами
 39 | @@MARCH@@	марте
 40 | @@MARCH@@	мартах
 41 | @@APRIL@@	апрель
 42 | @@APRIL@@	апрели
 43 | @@APRIL@@	апреля
 44 | @@APRIL@@	апрелей
 45 | @@APRIL@@	апрелю
 46 | @@APRIL@@	апрелям
 47 | @@APRIL@@	апрель
 48 | @@APRIL@@	апрели
 49 | @@APRIL@@	апрелем
 50 | @@APRIL@@	апрелями
 51 | @@APRIL@@	апреле
 52 | @@APRIL@@	апрелях
 53 | @@MAY@@	май
 54 | @@MAY@@	маи
 55 | @@MAY@@	мая
 56 | @@MAY@@	маев
 57 | @@MAY@@	маю
 58 | @@MAY@@	маям
 59 | @@MAY@@	май
 60 | @@MAY@@	маи
 61 | @@MAY@@	маем
 62 | @@MAY@@	маями
 63 | @@MAY@@	мае
 64 | @@MAY@@	маях
 65 | @@JUN@@	июнь
 66 | @@JUN@@	июни
 67 | @@JUN@@	июня
 68 | @@JUN@@	июней
 69 | @@JUN@@	июню
 70 | @@JUN@@	июням
 71 | @@JUN@@	июнь
 72 | @@JUN@@	июни
 73 | @@JUN@@	июнем
 74 | @@JUN@@	июнями
 75 | @@JUN@@	июне
 76 | @@JUN@@	июнях
 77 | @@JUL@@	июль
 78 | @@JUL@@	июли
 79 | @@JUL@@	июля
 80 | @@JUL@@	июлей
 81 | @@JUL@@	июлю
 82 | @@JUL@@	июлям
 83 | @@JUL@@	июль
 84 | @@JUL@@	июли
 85 | @@JUL@@	июлем
 86 | @@JUL@@	июлями
 87 | @@JUL@@	июле
 88 | @@JUL@@	июлях
 89 | @@AUGUST@@	август
 90 | @@AUGUST@@	августы
 91 | @@AUGUST@@	августа
 92 | @@AUGUST@@	августов
 93 | @@AUGUST@@	августу
 94 | @@AUGUST@@	августам
 95 | @@AUGUST@@	август
 96 | @@AUGUST@@	августы
 97 | @@AUGUST@@	августом
 98 | @@AUGUST@@	августами
 99 | @@AUGUST@@	августе
100 | @@AUGUST@@	августах
101 | @@SEPTEMBER@@	сентябрь
102 | @@SEPTEMBER@@	сентябри
103 | @@SEPTEMBER@@	сентября
104 | @@SEPTEMBER@@	сентябрей
105 | @@SEPTEMBER@@	сентябрю
106 | @@SEPTEMBER@@	сентябрям
107 | @@SEPTEMBER@@	сентябрь
108 | @@SEPTEMBER@@	сентябри
109 | @@SEPTEMBER@@	сентябрём
110 | @@SEPTEMBER@@	сентябрями
111 | @@SEPTEMBER@@	сентябре
112 | @@SEPTEMBER@@	сентябрях
113 | @@OCTOBER@@	октябрь
114 | @@OCTOBER@@	октябри
115 | @@OCTOBER@@	октября
116 | @@OCTOBER@@	октябрей
117 | @@OCTOBER@@	октябрю
118 | @@OCTOBER@@	октябрям
119 | @@OCTOBER@@	октябрь
120 | @@OCTOBER@@	октябри
121 | @@OCTOBER@@	октябрём
122 | @@OCTOBER@@	октябрями
123 | @@OCTOBER@@	октябре
124 | @@OCTOBER@@	октябрях
125 | @@NOVEMBER@@	ноябрь
126 | @@NOVEMBER@@	ноябри
127 | @@NOVEMBER@@	ноября
128 | @@NOVEMBER@@	ноябрей
129 | @@NOVEMBER@@	ноябрю
130 | @@NOVEMBER@@	ноябрям
131 | @@NOVEMBER@@	ноябрь
132 | @@NOVEMBER@@	ноябри
133 | @@NOVEMBER@@	ноябрём
134 | @@NOVEMBER@@	ноябрями
135 | @@NOVEMBER@@	ноябре
136 | @@NOVEMBER@@	ноябрях
137 | @@DECEMBER@@	декабрь
138 | @@DECEMBER@@	декабри
139 | @@DECEMBER@@	декабря
140 | @@DECEMBER@@	декабрей
141 | @@DECEMBER@@	декабрю
142 | @@DECEMBER@@	декабрям
143 | @@DECEMBER@@	декабрь
144 | @@DECEMBER@@	декабри
145 | @@DECEMBER@@	декабрём
146 | @@DECEMBER@@	декабрями
147 | @@DECEMBER@@	декабре
148 | @@DECEMBER@@	декабрях
149 | @@MINUS@@	минус
150 | @@DECIMAL_DOT_EXPRESSION@@	целая
151 | @@DECIMAL_DOT_EXPRESSION@@	целой
152 | @@DECIMAL_DOT_EXPRESSION@@	целой
153 | @@DECIMAL_DOT_EXPRESSION@@	целую
154 | @@DECIMAL_DOT_EXPRESSION@@	целой
155 | @@DECIMAL_DOT_EXPRESSION@@	целой
156 | @@DECIMAL_DOT_EXPRESSION@@	целым
157 | @@DECIMAL_DOT_EXPRESSION@@	целыми
158 | @@DECIMAL_DOT_EXPRESSION@@	целых
159 | @@DECIMAL_DOT_EXPRESSION@@	целых
160 | @@URL_DOT_EXPRESSION@@	точка
161 | @@PERIOD@@	точка
162 | @@DECIMAL_EXPONENT@@	умножить на десять в степени
163 | @@COLON@@	двоеточие
164 | @@SLASH@@	косая черта
165 | @@PASSWORD@@	пароль
166 | @@AT@@	собака
167 | @@PORT@@	порт
168 | @@QUESTION_MARK@@	вопросительный знак
169 | @@HASH@@	решётка
170 | @@HASH@@	решетка
171 | @@MONEY_AND@@	и
172 | @@AND@@	и
173 | @@PHONE_PLUS@@	плюс
174 | @@ARITHMETIC_PLUS@@	плюс
175 | @@PHONE_EXTENSION@@	добавочный номер
176 | @@TIME_AM@@	утра
177 | @@TIME_PM@@	вечера
178 | @@HOUR@@	час
179 | @@HOUR@@	часа
180 | @@HOUR@@	часам
181 | @@HOUR@@	часами
182 | @@HOUR@@	часах
183 | @@HOUR@@	часе
184 | @@HOUR@@	часов
185 | @@HOUR@@	часом
186 | @@HOUR@@	часу
187 | @@HOUR@@	часы
188 | @@MINUTE@@	минут
189 | @@MINUTE@@	минута
190 | @@MINUTE@@	минутам
191 | @@MINUTE@@	минутами
192 | @@MINUTE@@	минутах
193 | @@MINUTE@@	минуте
194 | @@MINUTE@@	минутой
195 | @@MINUTE@@	минутою
196 | @@MINUTE@@	минуту
197 | @@MINUTE@@	минуты
198 | @@TIME_AFTER@@	__NULL__
199 | @@TIME_BEFORE_PRE@@	без
200 | @@TIME_QUARTER@@	четверть
201 | @@TIME_QUARTER@@	четверти
202 | @@TIME_HALF@@	половина
203 | @@TIME_HALF@@	половины
204 | @@TIME_HALF@@	половину
205 | @@TIME_HALF@@	половин
206 | @@TIME_HALF@@	половине
207 | @@TIME_HALF@@	половинам
208 | @@TIME_HALF@@	половиной
209 | @@TIME_HALF@@	половинами
210 | @@TIME_HALF@@	половинах
211 | @@PERCENT@@	процент
212 | @@PERCENT@@	процента
213 | @@PERCENT@@	процентам
214 | @@PERCENT@@	процентами
215 | @@PERCENT@@	процентах
216 | @@PERCENT@@	проценте
217 | @@PERCENT@@	процентов
218 | @@PERCENT@@	процентом
219 | @@PERCENT@@	проценту
220 | @@PERCENT@@	проценты
221 | @@PERCENT@@	проценты
222 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/math.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'ru/verbalizer/float.grm' as f;
16 | import 'ru/verbalizer/lexical_map.grm' as l;
17 | import 'ru/verbalizer/numbers.grm' as n;
18 | # Operands: a cardinal or a float.
19 | float = f.FLOAT;
20 | card = n.CARDINAL_NUMBERS;
21 | number = card | float;
22 | # Operator symbols, each rewritten to a space-padded lexical-map marker.
23 | plus = "+" : " @@ARITHMETIC_PLUS@@ ";
24 | times = "*" : " @@ARITHMETIC_TIMES@@ ";
25 | minus = "-" : " @@ARITHMETIC_MINUS@@ ";
26 | division = "/" : " @@ARITHMETIC_DIVISION@@ ";
27 | # NOTE(review): lexical_map.tsv has no TIMES/MINUS/DIVISION entries; confirm intended.
28 | operator = plus | times | minus | division;
29 | # "%" becomes " @@PERCENT@@" (leading space only).
30 | percent = "%" : " @@PERCENT@@";
31 | # Accepts "number operator number" or "number%", then applies the lexical map.
32 | export ARITHMETIC =
33 |   Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP]
34 | ;
35 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/miscellaneous.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'ru/classifier/cyrillic.grm' as c;
17 | import 'ru/verbalizer/extra_numbers.grm' as e;
18 | import 'ru/verbalizer/lexical_map.grm' as l;
19 | import 'ru/verbalizer/numbers.grm' as n;
20 | import 'ru/verbalizer/spelled.grm' as s;
21 | # Words: one or more Latin (b.kAlpha) or Cyrillic (c.kCyrillicAlpha) letters.
22 | letter = b.kAlpha | c.kCyrillicAlpha;
23 | dash   = "-";
24 | word = letter+;
25 | possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?;
26 | # Above: inner "-" or "." becomes a space; one trailing "." may be deleted.
27 | post_word_symbol =
28 |    ("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) |
29 |    ("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) |
30 |    ("*" : "@@STAR@@")
31 | ;
32 | # Symbols that may precede a word, each mapped to its marker.
33 | pre_word_symbol =
34 |    ("@" : "@@AT@@") |
35 |    ("/" : "@@SLASH@@") |
36 |    ("#" : "@@HASH@@")
37 | ;
38 | # A word followed by a symbol, with a space inserted between them.
39 | post_word = possibly_split_word n.I[" "] post_word_symbol;
40 | # A symbol followed by a word, with a space inserted between them.
41 | pre_word = pre_word_symbol n.I[" "] possibly_split_word;
42 | 
43 | ## Number/digit sequence combos, maybe with a dash
44 | # The word side may be kept as is or passed through s.SPELLED_NO_LETTER.
45 | spelled_word = word @ s.SPELLED_NO_LETTER;
46 | # Word, then an inserted space or a dash turned into a space, then a number.
47 | word_number =
48 |   (word | spelled_word)
49 |   (n.I[" "] | (dash : " "))
50 |   (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
51 | ;
52 | # The mirror image: number first, word second.
53 | number_word =
54 |   (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS)
55 |   (n.I[" "] | (dash : " "))
56 |   (word | spelled_word)
57 | ;
58 | 
59 | ## Two-digit year.
60 | 
61 | # Note that, to be fair, ordinals really ought to be allowed here too, since in
62 | # some languages that is how such a year would be read.
63 | # NOTE(review): the rule below admits only cardinals and digit sequences.
64 | two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));
65 | # ".com": the dot becomes the URL dot marker, followed by an inserted space.
66 | dot_com = ("." : "@@URL_DOT_EXPRESSION@@") n.I[" "] "com";
67 | # Union of all the patterns defined in this file.
68 | miscellaneous = Optimize[
69 |     possibly_split_word
70 |   | post_word
71 |   | pre_word
72 |   | word_number
73 |   | number_word
74 |   | two_digit_year
75 |   | dot_com
76 | ];
77 | # Exported form: the union above with markers rewritten by the lexical map.
78 | export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP];
79 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/money.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/byte.grm' as b;
16 | import 'ru/verbalizer/lexical_map.grm' as l;
17 | import 'ru/verbalizer/numbers.grm' as n;
18 | # Cardinal reading used for both the major and the minor amount.
19 | card = n.CARDINAL_NUMBERS;
20 | # Currency lexicon: unit keys (usd_maj, usd_min) paired with word forms.
21 | __currency__ = StringFile['ru/verbalizer/money.tsv'];
22 | 
23 | d = b.kDigit;
24 | D = d - "0";
25 | # Two-digit minor amount; a leading "0" is deleted before the cardinal reading.
26 | cents = ((n.D["0"] | D) d) @ card;
27 | 
28 | # Only the dollar is handled, as in the verbalizer tests for English. Other
29 | # currencies still need to be added.
30 | usd_maj = Project["usd_maj" @ __currency__, 'output'];
31 | usd_min = Project["usd_min" @ __currency__, 'output'];
32 | and = " @@MONEY_AND@@ " | " ";
33 | # $X.YY: major amount and unit, the "and" marker or a space, minor amount and unit.
34 | dollar1 =
35 |   n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min]
36 | ;
37 | # $X.00: the ".00" is deleted and only the major amount is read.
38 | dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"];
39 | # $X with no decimal part.
40 | dollar3 = n.D["$"] card n.I[" " usd_maj];
41 | 
42 | dollar = Optimize[dollar1 | dollar2 | dollar3];
43 | # Exported form: the dollar patterns with markers rewritten by the lexical map.
44 | export MONEY = Optimize[dollar @ l.LEXICAL_MAP];
45 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/money.tsv:
--------------------------------------------------------------------------------
 1 | usd_maj	доллара
 2 | usd_maj	долларами
 3 | usd_maj	долларам
 4 | usd_maj	долларах
 5 | usd_maj	долларе
 6 | usd_maj	долларов
 7 | usd_maj	долларом
 8 | usd_maj	доллар
 9 | usd_maj	доллар
10 | usd_maj	доллару
11 | usd_maj	доллары
12 | usd_maj	доллары
13 | usd_min	цент
14 | usd_min	цент
15 | usd_min	цента
16 | usd_min	центам
17 | usd_min	центами
18 | usd_min	центах
19 | usd_min	центе
20 | usd_min	центов
21 | usd_min	центом
22 | usd_min	центу
23 | usd_min	центы
24 | usd_min	центы
25 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/nominatives.tsv:
--------------------------------------------------------------------------------
  1 | нуль
  2 | ноль
  3 | один
  4 | два
  5 | две
  6 | три
  7 | четыре
  8 | пять
  9 | шесть
 10 | семь
 11 | восемь
 12 | девять
 13 | десять
 14 | одиннадцать
 15 | двенадцать
 16 | тринадцать
 17 | четырнадцать
 18 | пятнадцать
 19 | шестнадцать
 20 | семнадцать
 21 | восемнадцать
 22 | девятнадцать
 23 | двадцать
 24 | тридцать
 25 | сорок
 26 | пятьдесят
 27 | шестьдесят
 28 | семьдесят
 29 | восемьдесят
 30 | девяносто
 31 | сто
 32 | двести
 33 | триста
 34 | четыреста
 35 | пятьсот
 36 | шестьсот
 37 | семьсот
 38 | восемьсот
 39 | девятьсот
 40 | тысячи
 41 | тысяч
 42 | тысяча
 43 | миллионов
 44 | миллион
 45 | миллиона
 46 | миллиардов
 47 | миллиард
 48 | миллиарда
 49 | первая
 50 | первого
 51 | первое
 52 | первый
 53 | вторая
 54 | второе
 55 | второй
 56 | третий
 57 | третье
 58 | третья
 59 | четвертая
 60 | четвертое
 61 | четвертой
 62 | пятая
 63 | пятое
 64 | пятой
 65 | шестая
 66 | шестое
 67 | шестой
 68 | седьмая
 69 | седьмое
 70 | седьмой
 71 | восьмая
 72 | восьмое
 73 | восьмой
 74 | девятая
 75 | девятое
 76 | девятой
 77 | десятая
 78 | десятое
 79 | десятой
 80 | одиннадцатая
 81 | одиннадцатое
 82 | одиннадцатой
 83 | двенадцатая
 84 | двенадцатое
 85 | двенадцатой
 86 | тринадцатая
 87 | тринадцатое
 88 | тринадцатой
 89 | четырнадцатая
 90 | четырнадцатое
 91 | четырнадцатой
 92 | пятнадцатая
 93 | пятнадцатое
 94 | пятнадцатой
 95 | шестнадцатая
 96 | шестнадцатое
 97 | шестнадцатой
 98 | семнадцатая
 99 | семнадцатое
100 | семнадцатой
101 | восемнадцатая
102 | восемнадцатое
103 | восемнадцатой
104 | девятнадцатая
105 | девятнадцатое
106 | девятнадцатой
107 | двадцатая
108 | двадцатое
109 | двадцатой
110 | тридцатая
111 | тридцатое
112 | тридцатой
113 | сороковая
114 | сороковое
115 | сороковой
116 | пятидесятая
117 | пятидесятое
118 | пятидесятой
119 | шестидесятая
120 | шестидесятое
121 | шестидесятой
122 | семидесятая
123 | семидесятое
124 | семидесятой
125 | восьмидесятая
126 | восьмидесятое
127 | восьмидесятой
128 | девяностая
129 | девяностое
130 | девяностой
131 | сотая
132 | сотое
133 | сотой
134 | двухсотая
135 | двухсотое
136 | двухсотой
137 | трехсотая
138 | трехсотое
139 | трехсотой
140 | четырехсотая
141 | четырехсотое
142 | четырехсотой
143 | пятисотая
144 | пятисотое
145 | пятисотой
146 | шестисотая
147 | шестисотое
148 | шестисотой
149 | семисотая
150 | семисотое
151 | семисотой
152 | восьмисотая
153 | восьмисотое
154 | восьмисотой
155 | девятисотая
156 | девятисотое
157 | девятисотой
158 | тысячная
159 | тысячное
160 | тысячной
161 | миллионная
162 | миллионное
163 | миллионной
164 | миллиардная
165 | миллиардное
166 | миллиардной
167 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/number_names.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Russian minimally supervised number grammar.
16 | #
17 | # Supports cardinals and ordinals in all inflected forms.
18 | #
19 | # The language-specific acceptor G was compiled with digit, teen, decade,
20 | # century, and big power-of-ten preterminals. The lexicon transducer is
21 | # highly ambiguous, but no LM is used.
22 | 
23 | import 'util/arithmetic.grm' as a;
24 | 
25 | # Intersects the universal factorization transducer (F) with the
26 | # language-specific acceptor (G).
27 | # f is composed onto g three times before d is applied; test1 checks "230".
28 | d = a.DELTA_STAR;
29 | f = a.IARITHMETIC_RESTRICTED;
30 | g = LoadFst['ru/verbalizer/g.fst'];
31 | fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]];
32 | test1 = AssertEqual["230" @ fg, "(+ 200 30 +)"];
33 | 
34 | # Compiles lexicon transducers (L).
35 | # Cardinal lexicon: one or more cardinal entries separated by single spaces.
36 | cardinal_name = StringFile['ru/verbalizer/cardinals.tsv'];
37 | cardinal_l = Optimize[(cardinal_name " ")* cardinal_name];
38 | # Ordinal lexicon: zero or more cardinal entries, then exactly one ordinal entry.
39 | ordinal_name = StringFile['ru/verbalizer/ordinals.tsv'];
40 | ordinal_l = Optimize[(cardinal_name " ")* ordinal_name];
41 | 
42 | # Composes L with the leaf transducer (P), then composes that with FG.
43 | 
44 | p = a.LEAVES;
45 | 
46 | export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)];
47 | 
48 | export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)];
49 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/numbers.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'ru/verbalizer/number_names.grm' as n;
16 | import 'universal/thousands_punct.grm' as t;
17 | import 'util/byte.grm' as b;
18 | # Word list loaded as single-column strings from nominatives.tsv.
19 | nominatives = StringFile['ru/verbalizer/nominatives.tsv'];
20 | 
21 | sigma_star = b.kBytes*;
22 | # Adds weight -1 to a listed form between [BOS]/space and space/[EOS].
23 | nominative_filter =
24 |  CDRewrite[nominatives ("" : "" <-1>), "[BOS]" | " ", " " | "[EOS]", sigma_star]
25 | ;
26 | # NOTE(review): nominative_filter is not referenced again in this file.
27 | cardinal = n.CARDINAL_NUMBER_NAME;
28 | ordinal = n.ORDINAL_NUMBER_NAME;
29 | 
30 | # Putting these here since this grammar gets incorporated by all the others.
31 | # I[expr] inserts expr: it maps the empty string to expr.
32 | func I[expr] {
33 |   return "" : expr;
34 | }
35 | # D[expr] deletes expr: it maps expr to the empty string.
36 | func D[expr] {
37 |   return expr : "";
38 | }
39 | 
40 | # Since we know this is the default for Russian, it's fair game to set it.
41 | separators = t.dot_thousands | t.no_delimiter;
42 | # Cardinal readings of input passed through either separator convention.
43 | export CARDINAL_NUMBERS = Optimize[
44 |    separators
45 |  @ cardinal
46 | ];
47 | # Ordinal readings of a bare number, i.e. with no "-ending" suffix in the input.
48 | export ORDINAL_NUMBERS_UNMARKED = Optimize[
49 |    separators
50 |  @ ordinal
51 | ];
52 | 
53 | 
54 | endings = StringFile['ru/verbalizer/ordinal_endings.tsv'];
55 | # Each endings entry has the form <reading ending>-<written suffix>.
56 | not_dash = (b.kBytes - "-")+;
57 | del_ending = CDRewrite[("-" not_dash) : "", "", "[EOS]", sigma_star];
58 | # Ordinal reading + "-suffix" must end in an endings entry; "-suffix" is then deleted.
59 | # Needs nominative_filter here if we take out Kyle's models.
60 | export ORDINAL_NUMBERS_MARKED = Optimize[
61 |    Optimize[Optimize[separators @ ordinal] "-" not_dash]
62 |  @ Optimize[sigma_star endings]
63 |  @ del_ending]
64 | ;
65 | # All ordinals: with or without a written ending.
66 | export ORDINAL_NUMBERS =
67 |   Optimize[ORDINAL_NUMBERS_MARKED | ORDINAL_NUMBERS_UNMARKED]
68 | ;
69 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/numbers_plus.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | #
 15 | # Grammar for things built mostly on numbers.
 16 | 
 17 | import 'ru/verbalizer/factorization.grm' as f;
 18 | import 'ru/verbalizer/lexical_map.grm' as l;
 19 | import 'ru/verbalizer/numbers.grm' as n;
 20 | # Building blocks: cardinals, unmarked ordinals and digit-by-digit readings.
 21 | num = n.CARDINAL_NUMBERS;
 22 | ord = n.ORDINAL_NUMBERS_UNMARKED;
 23 | digits = f.FRACTIONAL_PART_UNGROUPED;
 24 | 
 25 | # Various symbols.
 26 | # Each maps a written symbol to a marker; "-" and "/" get two readings each.
 27 | plus = "+" : "@@ARITHMETIC_PLUS@@";
 28 | minus = "-" : "@@ARITHMETIC_MINUS@@";
 29 | slash = "/" : "@@SLASH@@";
 30 | dot = "." : "@@URL_DOT_EXPRESSION@@";
 31 | dash = "-" : "@@DASH@@";
 32 | equals = "=" : "@@ARITHMETIC_EQUALS@@";
 33 | 
 34 | degree = "°" : "@@DEGREE@@";
 35 | 
 36 | division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";
 37 | 
 38 | times = ("x" | "*") : "@@ARITHMETIC_TIMES@@";
 39 | 
 40 | power = "^" : "@@DECIMAL_EXPONENT@@";
 41 | 
 42 | square_root = "√" : "@@SQUARE_ROOT@@";
 43 | 
 44 | percent = "%" : "@@PERCENT@@";
 45 | 
 46 | # Safe roman numbers.
 47 | 
 48 | # NB: Do not change the formatting here. NO_EDIT must be on the same
 49 | # line as the path.
 50 | rfile =
 51 |   'universal/roman_numerals.tsv' # NO_EDIT
 52 | ;
 53 | # Roman-numeral lexicon loaded from the file named by rfile.
 54 | roman = StringFile[rfile];
 55 | 
 56 | ## Main categories.
 57 | # Three or more numbers joined by dots (the last group is "+", so two dots minimum).
 58 | cat_dot_number =
 59 |    num
 60 |    n.I[" "] dot n.I[" "] num
 61 |    (n.I[" "] dot n.I[" "] num)+
 62 | ;
 63 | # Two or more numbers joined by slashes.
 64 | cat_slash_number =
 65 |    num
 66 |    n.I[" "] slash n.I[" "] num
 67 |    (n.I[" "] slash n.I[" "] num)*
 68 | ;
 69 | # Two or more numbers joined by dashes.
 70 | cat_dash_number =
 71 |    num
 72 |    n.I[" "] dash n.I[" "] num
 73 |    (n.I[" "] dash n.I[" "] num)*
 74 | ;
 75 | # A number with an optional leading plus or minus.
 76 | cat_signed_number = ((plus | minus) n.I[" "])? num;
 77 | # A signed number followed by the degree sign.
 78 | cat_degree = cat_signed_number n.I[" "] degree;
 79 | # "+" followed by a number read as a cardinal or digit by digit.
 80 | cat_country_code = plus n.I[" "] (num | digits);
 81 | # A single operator symbol on its own.
 82 | cat_math_operations =
 83 |      plus
 84 |    | minus
 85 |    | division
 86 |    | times
 87 |    | equals
 88 |    | percent
 89 |    | power
 90 |    | square_root
 91 | ;
 92 | 
 93 | # Roman numbers are often either cardinals or ordinals in various languages.
 94 | cat_roman = roman @ (num | ord);
 95 | 
 96 | # Allow
 97 | #
 98 | # number:number
 99 | # number-number
100 | #
101 | # to be read simply as
102 | #
103 | # number number.
104 | 
105 | cat_number_number =  # the ":" or "-" separator is rewritten to one space
106 |    num ((":" | "-") : " ") num
107 | ;
108 | 
109 | # Some additional readings for these symbols, used when they stand alone.
110 | 
111 | cat_additional_readings =
112 |   ("/" : "@@PER@@") |
113 |   ("+" : "@@AND@@") |
114 |   ("-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@")) |  # two alternative readings
115 |   ("*" : "@@STAR@@") |
116 |   ("x" : ("x" | "@@CONNECTOR_BY@@")) |  # may also stay a literal "x"
117 |   ("@" : "@@AT@@")
118 | ;
119 | 
120 | numbers_plus = Optimize[  # union of every category defined above
121 |    cat_dot_number
122 |  | cat_slash_number
123 |  | cat_dash_number
124 |  | cat_signed_number
125 |  | cat_degree
126 |  | cat_country_code
127 |  | cat_math_operations
128 |  | cat_roman
129 |  | cat_number_number
130 |  | cat_additional_readings
131 | ];
132 | 
133 | export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP];  # presumably resolves the @@...@@ placeholders; TODO confirm in l
134 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/ordinal_endings.tsv:
--------------------------------------------------------------------------------
 1 | ая-ая
 2 | ого-го
 3 | ьего-го
 4 | ьего-его
 5 | ьей-ей
 6 | ьему-ему
 7 | ьем-ем
 8 | ое-е
 9 | ые-е
10 | ье-е
11 | ий-ий
12 | ьими-ими
13 | ьим-им
14 | ьих-их
15 | ьи-и
16 | ий-й
17 | ой-й
18 | ый-й
19 | ыми-ми
20 | ьими-ми
21 | ому-му
22 | ьему-му
23 | ого-ого
24 | ое-ое
25 | ой-ой
26 | ом-ом
27 | ому-ому
28 | ую-ую
29 | ых-х
30 | ьих-х
31 | ые-ые
32 | ый-ый
33 | ыми-ыми
34 | ым-ым
35 | ых-ых
36 | ую-ю
37 | ью-ю
38 | ая-я
39 | ья-я
40 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/ordinals-lex.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # AUTOMATICALLY GENERATED: DO NOT EDIT.
 16 | import 'util/byte.grm' as b;
 17 | 
 18 | # Utilities for insertion and deletion.
 19 | 
 20 | func I[expr] {
 21 |   return "" : expr;  # empty input -> expr, i.e. inserts expr
 22 | }
 23 | 
 24 | func D[expr] {
 25 |   return expr : "";  # expr -> empty output, i.e. deletes expr
 26 | }
 27 | 
 28 | # Power-of-ten marker symbols, [E1] through [E15].
 29 | export POWERS =
 30 |     "[E15]"
 31 |   | "[E14]"
 32 |   | "[E13]"
 33 |   | "[E12]"
 34 |   | "[E11]"
 35 |   | "[E10]"
 36 |   | "[E9]"
 37 |   | "[E8]"
 38 |   | "[E7]"
 39 |   | "[E6]"
 40 |   | "[E5]"
 41 |   | "[E4]"
 42 |   | "[E3]"
 43 |   | "[E2]"
 44 |   | "[E1]"
 45 | ;
 46 | 
 47 | export SIGMA = b.kBytes | POWERS;  # alphabet: b.kBytes plus the power markers
 48 | 
 49 | export SIGMA_STAR = SIGMA*;  # zero or more alphabet symbols
 50 | 
 51 | export SIGMA_PLUS = SIGMA+;  # one or more alphabet symbols
 52 | 
 53 | ################################################################################
 54 | # BEGIN LANGUAGE SPECIFIC DATA
 55 | revaluations =  # power markers re-read as smaller ones; applied by REVALUE
 56 |     ("[E4]" : "[E1]")
 57 |   | ("[E5]" : "[E2]")
 58 |   | ("[E7]" : "[E1]")
 59 |   | ("[E8]" : "[E2]")
 60 | ;
 61 | 
 62 | Ms = "[E3]" | "[E6]" | "[E9]";  # the powers that get their own word in lexset1
 63 | 
 64 | 
 65 | func Zero[expr] {
 66 |   return expr : ("");  # expr -> empty output; same mapping as D[expr]
 67 | }
 68 | 
 69 | space = " ";  # inserted after each lexical item by lex1, lex2 and lex3
 70 | 
 71 | lexset3 = Optimize[
 72 |     ("1[E1]+1" : "одиннадцатая@")
 73 |   | ("1[E1]+1" : "одиннадцати")
 74 |   | ("1[E1]+1" : "одиннадцатого@")
 75 |   | ("1[E1]+1" : "одиннадцатое@")
 76 |   | ("1[E1]+1" : "одиннадцатой@")
 77 |   | ("1[E1]+1" : "одиннадцатом@")
 78 |   | ("1[E1]+1" : "одиннадцатому@")
 79 |   | ("1[E1]+1" : "одиннадцатую@")
 80 |   | ("1[E1]+1" : "одиннадцатые@")
 81 |   | ("1[E1]+1" : "одиннадцатый@")
 82 |   | ("1[E1]+1" : "одиннадцатым@")
 83 |   | ("1[E1]+1" : "одиннадцатыми@")
 84 |   | ("1[E1]+1" : "одиннадцатых@")
 85 |   | ("1[E1]+1" : "одиннадцать")
 86 |   | ("1[E1]+1" : "одиннадцатью")
 87 |   | ("1[E1]+2" : "двенадцатая@")
 88 |   | ("1[E1]+2" : "двенадцати")
 89 |   | ("1[E1]+2" : "двенадцатого@")
 90 |   | ("1[E1]+2" : "двенадцатое@")
 91 |   | ("1[E1]+2" : "двенадцатой@")
 92 |   | ("1[E1]+2" : "двенадцатом@")
 93 |   | ("1[E1]+2" : "двенадцатому@")
 94 |   | ("1[E1]+2" : "двенадцатую@")
 95 |   | ("1[E1]+2" : "двенадцатые@")
 96 |   | ("1[E1]+2" : "двенадцатый@")
 97 |   | ("1[E1]+2" : "двенадцатым@")
 98 |   | ("1[E1]+2" : "двенадцатыми@")
 99 |   | ("1[E1]+2" : "двенадцатых@")
100 |   | ("1[E1]+2" : "двенадцать")
101 |   | ("1[E1]+2" : "двенадцатью")
102 |   | ("1[E1]+3" : "тринадцатая@")
103 |   | ("1[E1]+3" : "тринадцати")
104 |   | ("1[E1]+3" : "тринадцатого@")
105 |   | ("1[E1]+3" : "тринадцатое@")
106 |   | ("1[E1]+3" : "тринадцатой@")
107 |   | ("1[E1]+3" : "тринадцатом@")
108 |   | ("1[E1]+3" : "тринадцатому@")
109 |   | ("1[E1]+3" : "тринадцатую@")
110 |   | ("1[E1]+3" : "тринадцатые@")
111 |   | ("1[E1]+3" : "тринадцатый@")
112 |   | ("1[E1]+3" : "тринадцатым@")
113 |   | ("1[E1]+3" : "тринадцатыми@")
114 |   | ("1[E1]+3" : "тринадцатых@")
115 |   | ("1[E1]+3" : "тринадцать")
116 |   | ("1[E1]+3" : "тринадцатью")
117 |   | ("1[E1]+4" : "четырнадцатая@")
118 |   | ("1[E1]+4" : "четырнадцати")
119 |   | ("1[E1]+4" : "четырнадцатого@")
120 |   | ("1[E1]+4" : "четырнадцатое@")
121 |   | ("1[E1]+4" : "четырнадцатой@")
122 |   | ("1[E1]+4" : "четырнадцатом@")
123 |   | ("1[E1]+4" : "четырнадцатому@")
124 |   | ("1[E1]+4" : "четырнадцатую@")
125 |   | ("1[E1]+4" : "четырнадцатые@")
126 |   | ("1[E1]+4" : "четырнадцатый@")
127 |   | ("1[E1]+4" : "четырнадцатым@")
128 |   | ("1[E1]+4" : "четырнадцатыми@")
129 |   | ("1[E1]+4" : "четырнадцатых@")
130 |   | ("1[E1]+4" : "четырнадцать")
131 |   | ("1[E1]+4" : "четырнадцатью")
132 |   | ("1[E1]+5" : "пятнадцатая@")
133 |   | ("1[E1]+5" : "пятнадцати")
134 |   | ("1[E1]+5" : "пятнадцатого@")
135 |   | ("1[E1]+5" : "пятнадцатое@")
136 |   | ("1[E1]+5" : "пятнадцатой@")
137 |   | ("1[E1]+5" : "пятнадцатом@")
138 |   | ("1[E1]+5" : "пятнадцатому@")
139 |   | ("1[E1]+5" : "пятнадцатую@")
140 |   | ("1[E1]+5" : "пятнадцатые@")
141 |   | ("1[E1]+5" : "пятнадцатый@")
142 |   | ("1[E1]+5" : "пятнадцатым@")
143 |   | ("1[E1]+5" : "пятнадцатыми@")
144 |   | ("1[E1]+5" : "пятнадцатых@")
145 |   | ("1[E1]+5" : "пятнадцать")
146 |   | ("1[E1]+5" : "пятнадцатью")
147 |   | ("1[E1]+6" : "шестнадцатая@")
148 |   | ("1[E1]+6" : "шестнадцати")
149 |   | ("1[E1]+6" : "шестнадцатого@")
150 |   | ("1[E1]+6" : "шестнадцатое@")
151 |   | ("1[E1]+6" : "шестнадцатой@")
152 |   | ("1[E1]+6" : "шестнадцатом@")
153 |   | ("1[E1]+6" : "шестнадцатому@")
154 |   | ("1[E1]+6" : "шестнадцатую@")
155 |   | ("1[E1]+6" : "шестнадцатые@")
156 |   | ("1[E1]+6" : "шестнадцатый@")
157 |   | ("1[E1]+6" : "шестнадцатым@")
158 |   | ("1[E1]+6" : "шестнадцатыми@")
159 |   | ("1[E1]+6" : "шестнадцатых@")
160 |   | ("1[E1]+6" : "шестнадцать")
161 |   | ("1[E1]+6" : "шестнадцатью")
162 |   | ("1[E1]+7" : "семнадцатая@")
163 |   | ("1[E1]+7" : "семнадцати")
164 |   | ("1[E1]+7" : "семнадцатого@")
165 |   | ("1[E1]+7" : "семнадцатое@")
166 |   | ("1[E1]+7" : "семнадцатой@")
167 |   | ("1[E1]+7" : "семнадцатом@")
168 |   | ("1[E1]+7" : "семнадцатому@")
169 |   | ("1[E1]+7" : "семнадцатую@")
170 |   | ("1[E1]+7" : "семнадцатые@")
171 |   | ("1[E1]+7" : "семнадцатый@")
172 |   | ("1[E1]+7" : "семнадцатым@")
173 |   | ("1[E1]+7" : "семнадцатыми@")
174 |   | ("1[E1]+7" : "семнадцатых@")
175 |   | ("1[E1]+7" : "семнадцать")
176 |   | ("1[E1]+7" : "семнадцатью")
177 |   | ("1[E1]+8" : "восемнадцатая@")
178 |   | ("1[E1]+8" : "восемнадцати")
179 |   | ("1[E1]+8" : "восемнадцатого@")
180 |   | ("1[E1]+8" : "восемнадцатое@")
181 |   | ("1[E1]+8" : "восемнадцатой@")
182 |   | ("1[E1]+8" : "восемнадцатом@")
183 |   | ("1[E1]+8" : "восемнадцатому@")
184 |   | ("1[E1]+8" : "восемнадцатую@")
185 |   | ("1[E1]+8" : "восемнадцатые@")
186 |   | ("1[E1]+8" : "восемнадцатый@")
187 |   | ("1[E1]+8" : "восемнадцатым@")
188 |   | ("1[E1]+8" : "восемнадцатыми@")
189 |   | ("1[E1]+8" : "восемнадцатых@")
190 |   | ("1[E1]+8" : "восемнадцать")
191 |   | ("1[E1]+8" : "восемнадцатью")
192 |   | ("1[E1]+9" : "девятнадцатая@")
193 |   | ("1[E1]+9" : "девятнадцати")
194 |   | ("1[E1]+9" : "девятнадцатого@")
195 |   | ("1[E1]+9" : "девятнадцатое@")
196 |   | ("1[E1]+9" : "девятнадцатой@")
197 |   | ("1[E1]+9" : "девятнадцатом@")
198 |   | ("1[E1]+9" : "девятнадцатому@")
199 |   | ("1[E1]+9" : "девятнадцатую@")
200 |   | ("1[E1]+9" : "девятнадцатые@")
201 |   | ("1[E1]+9" : "девятнадцатый@")
202 |   | ("1[E1]+9" : "девятнадцатым@")
203 |   | ("1[E1]+9" : "девятнадцатыми@")
204 |   | ("1[E1]+9" : "девятнадцатых@")
205 |   | ("1[E1]+9" : "девятнадцать")
206 |   | ("1[E1]+9" : "девятнадцатью")]
207 | ;
208 | 
209 | lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR];  # teens: 1[E1]+1 .. 1[E1]+9
210 | 
211 | lexset2 = Optimize[
212 |     ("1[E1]" : "десятая@")
213 |   | ("1[E1]" : "десяти")
214 |   | ("1[E1]" : "десятого@")
215 |   | ("1[E1]" : "десятое@")
216 |   | ("1[E1]" : "десятой@")
217 |   | ("1[E1]" : "десятом@")
218 |   | ("1[E1]" : "десятому@")
219 |   | ("1[E1]" : "десятую@")
220 |   | ("1[E1]" : "десятые@")
221 |   | ("1[E1]" : "десятый@")
222 |   | ("1[E1]" : "десятым@")
223 |   | ("1[E1]" : "десятыми@")
224 |   | ("1[E1]" : "десятых@")
225 |   | ("1[E1]" : "десять")
226 |   | ("1[E1]" : "десятью")
227 |   | ("1[E2]" : "сотая@")
228 |   | ("1[E2]" : "сотого@")
229 |   | ("1[E2]" : "сотое@")
230 |   | ("1[E2]" : "сотой@")
231 |   | ("1[E2]" : "сотом@")
232 |   | ("1[E2]" : "сотому@")
233 |   | ("1[E2]" : "сотую@")
234 |   | ("1[E2]" : "сотые@")
235 |   | ("1[E2]" : "сотый@")
236 |   | ("1[E2]" : "сотым@")
237 |   | ("1[E2]" : "сотыми@")
238 |   | ("1[E2]" : "сотых@")
239 |   | ("1[E2]" : "ста")
240 |   | ("1[E2]" : "сто")
241 |   | ("1[E3]" : "тысячная@")
242 |   | ("1[E3]" : "тысячного@")
243 |   | ("1[E3]" : "тысячное@")
244 |   | ("1[E3]" : "тысячной@")
245 |   | ("1[E3]" : "тысячном@")
246 |   | ("1[E3]" : "тысячному@")
247 |   | ("1[E3]" : "тысячную@")
248 |   | ("1[E3]" : "тысячные@")
249 |   | ("1[E3]" : "тысячный@")
250 |   | ("1[E3]" : "тысячным@")
251 |   | ("1[E3]" : "тысячными@")
252 |   | ("1[E3]" : "тысячных@")
253 |   | ("1[E6]" : "миллионная@")
254 |   | ("1[E6]" : "миллионного@")
255 |   | ("1[E6]" : "миллионное@")
256 |   | ("1[E6]" : "миллионной@")
257 |   | ("1[E6]" : "миллионном@")
258 |   | ("1[E6]" : "миллионному@")
259 |   | ("1[E6]" : "миллионную@")
260 |   | ("1[E6]" : "миллионные@")
261 |   | ("1[E6]" : "миллионный@")
262 |   | ("1[E6]" : "миллионным@")
263 |   | ("1[E6]" : "миллионными@")
264 |   | ("1[E6]" : "миллионных@")
265 |   | ("1[E9]" : "миллиардная@")
266 |   | ("1[E9]" : "миллиардного@")
267 |   | ("1[E9]" : "миллиардное@")
268 |   | ("1[E9]" : "миллиардной@")
269 |   | ("1[E9]" : "миллиардном@")
270 |   | ("1[E9]" : "миллиардному@")
271 |   | ("1[E9]" : "миллиардную@")
272 |   | ("1[E9]" : "миллиардные@")
273 |   | ("1[E9]" : "миллиардный@")
274 |   | ("1[E9]" : "миллиардным@")
275 |   | ("1[E9]" : "миллиардными@")
276 |   | ("1[E9]" : "миллиардных@")
277 |   | ("2[E1]" : "двадцатая@")
278 |   | ("2[E1]" : "двадцати")
279 |   | ("2[E1]" : "двадцатого@")
280 |   | ("2[E1]" : "двадцатое@")
281 |   | ("2[E1]" : "двадцатой@")
282 |   | ("2[E1]" : "двадцатом@")
283 |   | ("2[E1]" : "двадцатому@")
284 |   | ("2[E1]" : "двадцатую@")
285 |   | ("2[E1]" : "двадцатые@")
286 |   | ("2[E1]" : "двадцатый@")
287 |   | ("2[E1]" : "двадцатым@")
288 |   | ("2[E1]" : "двадцатыми@")
289 |   | ("2[E1]" : "двадцатых@")
290 |   | ("2[E1]" : "двадцать")
291 |   | ("2[E1]" : "двадцатью")
292 |   | ("2[E2]" : "двести")
293 |   | ("2[E2]" : "двумстам")
294 |   | ("2[E2]" : "двумястами")
295 |   | ("2[E2]" : "двухсот")
296 |   | ("2[E2]" : "двухсотая@")
297 |   | ("2[E2]" : "двухсотого@")
298 |   | ("2[E2]" : "двухсотое@")
299 |   | ("2[E2]" : "двухсотой@")
300 |   | ("2[E2]" : "двухсотом@")
301 |   | ("2[E2]" : "двухсотому@")
302 |   | ("2[E2]" : "двухсотую@")
303 |   | ("2[E2]" : "двухсотые@")
304 |   | ("2[E2]" : "двухсотый@")
305 |   | ("2[E2]" : "двухсотым@")
306 |   | ("2[E2]" : "двухсотыми@")
307 |   | ("2[E2]" : "двухсотых@")
308 |   | ("2[E2]" : "двухстах")
309 |   | ("3[E1]" : "тридцатая@")
310 |   | ("3[E1]" : "тридцати")
311 |   | ("3[E1]" : "тридцатого@")
312 |   | ("3[E1]" : "тридцатое@")
313 |   | ("3[E1]" : "тридцатой@")
314 |   | ("3[E1]" : "тридцатом@")
315 |   | ("3[E1]" : "тридцатому@")
316 |   | ("3[E1]" : "тридцатую@")
317 |   | ("3[E1]" : "тридцатые@")
318 |   | ("3[E1]" : "тридцатый@")
319 |   | ("3[E1]" : "тридцатым@")
320 |   | ("3[E1]" : "тридцатыми@")
321 |   | ("3[E1]" : "тридцатых@")
322 |   | ("3[E1]" : "тридцать")
323 |   | ("3[E1]" : "тридцатью")
324 |   | ("3[E2]" : "тремстам")
325 |   | ("3[E2]" : "тремястами")
326 |   | ("3[E2]" : "трехсот")
327 |   | ("3[E2]" : "трехсотая@")
328 |   | ("3[E2]" : "трехсотого@")
329 |   | ("3[E2]" : "трехсотое@")
330 |   | ("3[E2]" : "трехсотой@")
331 |   | ("3[E2]" : "трехсотом@")
332 |   | ("3[E2]" : "трехсотому@")
333 |   | ("3[E2]" : "трехсотую@")
334 |   | ("3[E2]" : "трехсотые@")
335 |   | ("3[E2]" : "трехсотый@")
336 |   | ("3[E2]" : "трехсотым@")
337 |   | ("3[E2]" : "трехсотыми@")
338 |   | ("3[E2]" : "трехсотых@")
339 |   | ("3[E2]" : "трехстах")
340 |   | ("3[E2]" : "триста")
341 |   | ("4[E1]" : "сорок")
342 |   | ("4[E1]" : "сорока")
343 |   | ("4[E1]" : "сороковая@")
344 |   | ("4[E1]" : "сорокового@")
345 |   | ("4[E1]" : "сороковое@")
346 |   | ("4[E1]" : "сороковой@")
347 |   | ("4[E1]" : "сороковом@")
348 |   | ("4[E1]" : "сороковому@")
349 |   | ("4[E1]" : "сороковую@")
350 |   | ("4[E1]" : "сороковые@")
351 |   | ("4[E1]" : "сороковым@")
352 |   | ("4[E1]" : "сороковыми@")
353 |   | ("4[E1]" : "сороковых@")
354 |   | ("4[E2]" : "четыремстам")
355 |   | ("4[E2]" : "четыреста")
356 |   | ("4[E2]" : "четырехсот")
357 |   | ("4[E2]" : "четырехсотая@")
358 |   | ("4[E2]" : "четырехсотого@")
359 |   | ("4[E2]" : "четырехсотое@")
360 |   | ("4[E2]" : "четырехсотой@")
361 |   | ("4[E2]" : "четырехсотом@")
362 |   | ("4[E2]" : "четырехсотому@")
363 |   | ("4[E2]" : "четырехсотую@")
364 |   | ("4[E2]" : "четырехсотые@")
365 |   | ("4[E2]" : "четырехсотый@")
366 |   | ("4[E2]" : "четырехсотым@")
367 |   | ("4[E2]" : "четырехсотыми@")
368 |   | ("4[E2]" : "четырехсотых@")
369 |   | ("4[E2]" : "четырехстах")
370 |   | ("4[E2]" : "четырьмястами")
371 |   | ("5[E1]" : "пятидесятая@")
372 |   | ("5[E1]" : "пятидесяти")
373 |   | ("5[E1]" : "пятидесятого@")
374 |   | ("5[E1]" : "пятидесятое@")
375 |   | ("5[E1]" : "пятидесятой@")
376 |   | ("5[E1]" : "пятидесятом@")
377 |   | ("5[E1]" : "пятидесятому@")
378 |   | ("5[E1]" : "пятидесятую@")
379 |   | ("5[E1]" : "пятидесятые@")
380 |   | ("5[E1]" : "пятидесятый@")
381 |   | ("5[E1]" : "пятидесятым@")
382 |   | ("5[E1]" : "пятидесятыми@")
383 |   | ("5[E1]" : "пятидесятых@")
384 |   | ("5[E1]" : "пятьдесят")
385 |   | ("5[E1]" : "пятьюдесятью")
386 |   | ("5[E2]" : "пятисот")
387 |   | ("5[E2]" : "пятисотая@")
388 |   | ("5[E2]" : "пятисотого@")
389 |   | ("5[E2]" : "пятисотое@")
390 |   | ("5[E2]" : "пятисотой@")
391 |   | ("5[E2]" : "пятисотом@")
392 |   | ("5[E2]" : "пятисотому@")
393 |   | ("5[E2]" : "пятисотую@")
394 |   | ("5[E2]" : "пятисотые@")
395 |   | ("5[E2]" : "пятисотый@")
396 |   | ("5[E2]" : "пятисотым@")
397 |   | ("5[E2]" : "пятисотыми@")
398 |   | ("5[E2]" : "пятисотых@")
399 |   | ("5[E2]" : "пятистам")
400 |   | ("5[E2]" : "пятистах")
401 |   | ("5[E2]" : "пятьсот")
402 |   | ("5[E2]" : "пятьюстами")
403 |   | ("6[E1]" : "шестидесятая@")
404 |   | ("6[E1]" : "шестидесяти")
405 |   | ("6[E1]" : "шестидесятого@")
406 |   | ("6[E1]" : "шестидесятое@")
407 |   | ("6[E1]" : "шестидесятой@")
408 |   | ("6[E1]" : "шестидесятом@")
409 |   | ("6[E1]" : "шестидесятому@")
410 |   | ("6[E1]" : "шестидесятую@")
411 |   | ("6[E1]" : "шестидесятые@")
412 |   | ("6[E1]" : "шестидесятый@")
413 |   | ("6[E1]" : "шестидесятым@")
414 |   | ("6[E1]" : "шестидесятыми@")
415 |   | ("6[E1]" : "шестидесятых@")
416 |   | ("6[E1]" : "шестьдесят")
417 |   | ("6[E1]" : "шестьюдесятью")
418 |   | ("6[E2]" : "шестисот")
419 |   | ("6[E2]" : "шестисотая@")
420 |   | ("6[E2]" : "шестисотого@")
421 |   | ("6[E2]" : "шестисотое@")
422 |   | ("6[E2]" : "шестисотой@")
423 |   | ("6[E2]" : "шестисотом@")
424 |   | ("6[E2]" : "шестисотому@")
425 |   | ("6[E2]" : "шестисотую@")
426 |   | ("6[E2]" : "шестисотые@")
427 |   | ("6[E2]" : "шестисотый@")
428 |   | ("6[E2]" : "шестисотым@")
429 |   | ("6[E2]" : "шестисотыми@")
430 |   | ("6[E2]" : "шестисотых@")
431 |   | ("6[E2]" : "шестистам")
432 |   | ("6[E2]" : "шестистах")
433 |   | ("6[E2]" : "шестьсот")
434 |   | ("6[E2]" : "шестьюстами")
435 |   | ("7[E1]" : "семидесятая@")
436 |   | ("7[E1]" : "семидесяти")
437 |   | ("7[E1]" : "семидесятого@")
438 |   | ("7[E1]" : "семидесятое@")
439 |   | ("7[E1]" : "семидесятой@")
440 |   | ("7[E1]" : "семидесятом@")
441 |   | ("7[E1]" : "семидесятому@")
442 |   | ("7[E1]" : "семидесятую@")
443 |   | ("7[E1]" : "семидесятые@")
444 |   | ("7[E1]" : "семидесятый@")
445 |   | ("7[E1]" : "семидесятым@")
446 |   | ("7[E1]" : "семидесятыми@")
447 |   | ("7[E1]" : "семидесятых@")
448 |   | ("7[E1]" : "семьдесят")
449 |   | ("7[E1]" : "семьюдесятью")
450 |   | ("7[E2]" : "семисот")
451 |   | ("7[E2]" : "семисотая@")
452 |   | ("7[E2]" : "семисотого@")
453 |   | ("7[E2]" : "семисотое@")
454 |   | ("7[E2]" : "семисотой@")
455 |   | ("7[E2]" : "семисотом@")
456 |   | ("7[E2]" : "семисотому@")
457 |   | ("7[E2]" : "семисотую@")
458 |   | ("7[E2]" : "семисотые@")
459 |   | ("7[E2]" : "семисотый@")
460 |   | ("7[E2]" : "семисотым@")
461 |   | ("7[E2]" : "семисотыми@")
462 |   | ("7[E2]" : "семисотых@")
463 |   | ("7[E2]" : "семистам")
464 |   | ("7[E2]" : "семистах")
465 |   | ("7[E2]" : "семьсот")
466 |   | ("7[E2]" : "семьюстами")
467 |   | ("8[E1]" : "восемьдесят")
468 |   | ("8[E1]" : "восьмидесятая@")
469 |   | ("8[E1]" : "восьмидесяти")
470 |   | ("8[E1]" : "восьмидесятого@")
471 |   | ("8[E1]" : "восьмидесятое@")
472 |   | ("8[E1]" : "восьмидесятой@")
473 |   | ("8[E1]" : "восьмидесятом@")
474 |   | ("8[E1]" : "восьмидесятому@")
475 |   | ("8[E1]" : "восьмидесятую@")
476 |   | ("8[E1]" : "восьмидесятые@")
477 |   | ("8[E1]" : "восьмидесятый@")
478 |   | ("8[E1]" : "восьмидесятым@")
479 |   | ("8[E1]" : "восьмидесятыми@")
480 |   | ("8[E1]" : "восьмидесятых@")
481 |   | ("8[E1]" : "восьмьюдесятью")
482 |   | ("8[E2]" : "восемьсот")
483 |   | ("8[E2]" : "восемьюстами")
484 |   | ("8[E2]" : "восьмисот")
485 |   | ("8[E2]" : "восьмисотая@")
486 |   | ("8[E2]" : "восьмисотого@")
487 |   | ("8[E2]" : "восьмисотое@")
488 |   | ("8[E2]" : "восьмисотой@")
489 |   | ("8[E2]" : "восьмисотом@")
490 |   | ("8[E2]" : "восьмисотому@")
491 |   | ("8[E2]" : "восьмисотую@")
492 |   | ("8[E2]" : "восьмисотые@")
493 |   | ("8[E2]" : "восьмисотый@")
494 |   | ("8[E2]" : "восьмисотым@")
495 |   | ("8[E2]" : "восьмисотыми@")
496 |   | ("8[E2]" : "восьмисотых@")
497 |   | ("8[E2]" : "восьмистам")
498 |   | ("8[E2]" : "восьмистах")
499 |   | ("8[E2]" : "восьмьюстами")
500 |   | ("9[E1]" : "девяноста")
501 |   | ("9[E1]" : "девяностая@")
502 |   | ("9[E1]" : "девяносто")
503 |   | ("9[E1]" : "девяностого@")
504 |   | ("9[E1]" : "девяностое@")
505 |   | ("9[E1]" : "девяностой@")
506 |   | ("9[E1]" : "девяностом@")
507 |   | ("9[E1]" : "девяностому@")
508 |   | ("9[E1]" : "девяностую@")
509 |   | ("9[E1]" : "девяностые@")
510 |   | ("9[E1]" : "девяностый@")
511 |   | ("9[E1]" : "девяностым@")
512 |   | ("9[E1]" : "девяностыми@")
513 |   | ("9[E1]" : "девяностых@")
514 |   | ("9[E2]" : "девятисот")
515 |   | ("9[E2]" : "девятисотая@")
516 |   | ("9[E2]" : "девятисотого@")
517 |   | ("9[E2]" : "девятисотое@")
518 |   | ("9[E2]" : "девятисотой@")
519 |   | ("9[E2]" : "девятисотом@")
520 |   | ("9[E2]" : "девятисотому@")
521 |   | ("9[E2]" : "девятисотую@")
522 |   | ("9[E2]" : "девятисотые@")
523 |   | ("9[E2]" : "девятисотый@")
524 |   | ("9[E2]" : "девятисотым@")
525 |   | ("9[E2]" : "девятисотыми@")
526 |   | ("9[E2]" : "девятисотых@")
527 |   | ("9[E2]" : "девятистам")
528 |   | ("9[E2]" : "девятистах")
529 |   | ("9[E2]" : "девятьсот")
530 |   | ("9[E2]" : "девятьюстами")]
531 | ;
532 | 
533 | lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR];  # digit+[E1]/[E2] and 1[E3]/1[E6]/1[E9]
534 | 
535 | lexset1 = Optimize[
536 |     ("+" : "")
537 |   | ("1" : "один")
538 |   | ("1" : "одна")
539 |   | ("1" : "одни")
540 |   | ("1" : "одним")
541 |   | ("1" : "одними")
542 |   | ("1" : "одних")
543 |   | ("1" : "одно")
544 |   | ("1" : "одного")
545 |   | ("1" : "одной")
546 |   | ("1" : "одном")
547 |   | ("1" : "одному")
548 |   | ("1" : "одною")
549 |   | ("1" : "одну")
550 |   | ("1" : "первая@")
551 |   | ("1" : "первого@")
552 |   | ("1" : "первое@")
553 |   | ("1" : "первой@")
554 |   | ("1" : "первом@")
555 |   | ("1" : "первому@")
556 |   | ("1" : "первую@")
557 |   | ("1" : "первые@")
558 |   | ("1" : "первый@")
559 |   | ("1" : "первым@")
560 |   | ("1" : "первыми@")
561 |   | ("1" : "первых@")
562 |   | ("2" : "вторая@")
563 |   | ("2" : "второго@")
564 |   | ("2" : "второе@")
565 |   | ("2" : "второй@")
566 |   | ("2" : "втором@")
567 |   | ("2" : "второму@")
568 |   | ("2" : "вторую@")
569 |   | ("2" : "вторые@")
570 |   | ("2" : "вторым@")
571 |   | ("2" : "вторыми@")
572 |   | ("2" : "вторых@")
573 |   | ("2" : "два")
574 |   | ("2" : "две")
575 |   | ("2" : "двум")
576 |   | ("2" : "двумя")
577 |   | ("2" : "двух")
578 |   | ("3" : "трем")
579 |   | ("3" : "тремя")
580 |   | ("3" : "третий@")
581 |   | ("3" : "третье@")
582 |   | ("3" : "третьего@")
583 |   | ("3" : "третьей@")
584 |   | ("3" : "третьем@")
585 |   | ("3" : "третьему@")
586 |   | ("3" : "третьи@")
587 |   | ("3" : "третьим@")
588 |   | ("3" : "третьими@")
589 |   | ("3" : "третьих@")
590 |   | ("3" : "третью@")
591 |   | ("3" : "третья@")
592 |   | ("3" : "трех")
593 |   | ("3" : "три")
594 |   | ("4" : "четвертая@")
595 |   | ("4" : "четвертого@")
596 |   | ("4" : "четвертое@")
597 |   | ("4" : "четвертой@")
598 |   | ("4" : "четвертом@")
599 |   | ("4" : "четвертому@")
600 |   | ("4" : "четвертую@")
601 |   | ("4" : "четвертые@")
602 |   | ("4" : "четвертый@")
603 |   | ("4" : "четвертым@")
604 |   | ("4" : "четвертыми@")
605 |   | ("4" : "четвертых@")
606 |   | ("4" : "четыре")
607 |   | ("4" : "четырем")
608 |   | ("4" : "четырех")
609 |   | ("4" : "четырьмя")
610 |   | ("5" : "пятая@")
611 |   | ("5" : "пяти")
612 |   | ("5" : "пятого@")
613 |   | ("5" : "пятое@")
614 |   | ("5" : "пятой@")
615 |   | ("5" : "пятом@")
616 |   | ("5" : "пятому@")
617 |   | ("5" : "пятую@")
618 |   | ("5" : "пятые@")
619 |   | ("5" : "пятый@")
620 |   | ("5" : "пятым@")
621 |   | ("5" : "пятыми@")
622 |   | ("5" : "пятых@")
623 |   | ("5" : "пять")
624 |   | ("5" : "пятью")
625 |   | ("6" : "шестая@")
626 |   | ("6" : "шести")
627 |   | ("6" : "шестого@")
628 |   | ("6" : "шестое@")
629 |   | ("6" : "шестой@")
630 |   | ("6" : "шестом@")
631 |   | ("6" : "шестому@")
632 |   | ("6" : "шестую@")
633 |   | ("6" : "шестые@")
634 |   | ("6" : "шестым@")
635 |   | ("6" : "шестыми@")
636 |   | ("6" : "шестых@")
637 |   | ("6" : "шесть")
638 |   | ("6" : "шестью")
639 |   | ("7" : "седьмая@")
640 |   | ("7" : "седьмого@")
641 |   | ("7" : "седьмое@")
642 |   | ("7" : "седьмой@")
643 |   | ("7" : "седьмом@")
644 |   | ("7" : "седьмому@")
645 |   | ("7" : "седьмую@")
646 |   | ("7" : "седьмые@")
647 |   | ("7" : "седьмым@")
648 |   | ("7" : "седьмыми@")
649 |   | ("7" : "седьмых@")
650 |   | ("7" : "семи")
651 |   | ("7" : "семь")
652 |   | ("7" : "семью")
653 |   | ("8" : "восемь")
654 |   | ("8" : "восьмая@")
655 |   | ("8" : "восьми")
656 |   | ("8" : "восьмого@")
657 |   | ("8" : "восьмое@")
658 |   | ("8" : "восьмой@")
659 |   | ("8" : "восьмом@")
660 |   | ("8" : "восьмому@")
661 |   | ("8" : "восьмую@")
662 |   | ("8" : "восьмые@")
663 |   | ("8" : "восьмым@")
664 |   | ("8" : "восьмыми@")
665 |   | ("8" : "восьмых@")
666 |   | ("8" : "восьмью")
667 |   | ("9" : "девятая@")
668 |   | ("9" : "девяти")
669 |   | ("9" : "девятого@")
670 |   | ("9" : "девятое@")
671 |   | ("9" : "девятой@")
672 |   | ("9" : "девятом@")
673 |   | ("9" : "девятому@")
674 |   | ("9" : "девятую@")
675 |   | ("9" : "девятые@")
676 |   | ("9" : "девятый@")
677 |   | ("9" : "девятым@")
678 |   | ("9" : "девятыми@")
679 |   | ("9" : "девятых@")
680 |   | ("9" : "девять")
681 |   | ("9" : "девятью")
682 |   | ("[E3]" : "тысяч")
683 |   | ("[E3]" : "тысяча")
684 |   | ("[E3]" : "тысячам")
685 |   | ("[E3]" : "тысячами")
686 |   | ("[E3]" : "тысячах")
687 |   | ("[E3]" : "тысяче")
688 |   | ("[E3]" : "тысячей")
689 |   | ("[E3]" : "тысячи")
690 |   | ("[E3]" : "тысячу")
691 |   | ("[E3]" : "тысячью")
692 |   | ("[E6]" : "миллион")
693 |   | ("[E6]" : "миллиона")
694 |   | ("[E6]" : "миллионам")
695 |   | ("[E6]" : "миллионами")
696 |   | ("[E6]" : "миллионах")
697 |   | ("[E6]" : "миллионе")
698 |   | ("[E6]" : "миллионов")
699 |   | ("[E6]" : "миллионом")
700 |   | ("[E6]" : "миллиону")
701 |   | ("[E6]" : "миллионы")
702 |   | ("[E9]" : "миллиард")
703 |   | ("[E9]" : "миллиарда")
704 |   | ("[E9]" : "миллиардам")
705 |   | ("[E9]" : "миллиардами")
706 |   | ("[E9]" : "миллиардах")
707 |   | ("[E9]" : "миллиарде")
708 |   | ("[E9]" : "миллиардов")
709 |   | ("[E9]" : "миллиардом")
710 |   | ("[E9]" : "миллиарду")
711 |   | ("[E9]" : "миллиарды")
712 |   | ("|0|" : "ноле")
713 |   | ("|0|" : "нолем")
714 |   | ("|0|" : "ноль")
715 |   | ("|0|" : "нолю")
716 |   | ("|0|" : "ноля")
717 |   | ("|0|" : "нуле")
718 |   | ("|0|" : "нулем")
719 |   | ("|0|" : "нуль")
720 |   | ("|0|" : "нулю")
721 |   | ("|0|" : "нуля")]
722 | ;
723 | 
724 | lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR];  # single digits, bare Ms, "|0|" and "+"
725 | 
726 | export LEX = Optimize[lex3 @ lex2 @ lex1];  # applied in the order lex3, lex2, lex1
727 | 
728 | export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]";  # same set as Ms
729 | 
730 | # END LANGUAGE SPECIFIC DATA
731 | ################################################################################
732 | # Inserts the boundary marker "%" after each of the Ms ([E3], [E6], [E9]).
733 | export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR];
734 | 
735 | # Deletes all power markers and every "+".
736 | export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR];
737 | 
738 | # Deletes leading zeros at the beginning of a number, so that "0003" does not
739 | # get treated as an ordinary number.
740 | export DELETE_INITIAL_ZEROS =
741 |   CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR]
742 | ;
743 | 
744 | NonMs = Optimize[POWERS - Ms];  # every power marker except [E3], [E6], [E9]
745 | 
746 | # Deletes (usually) zeros before a non-M. E.g., +0[E1] should be
747 | # deleted.
748 | export DELETE_INTERMEDIATE_ZEROS1 =
749 |   CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR]
750 | ;
751 | 
752 | # Deletes (usually) zeros before an M, if there is no non-zero element between
753 | # that and the previous boundary. Thus, if after the result of the rule above we
754 | # end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final
755 | # zero.
756 | export DELETE_INTERMEDIATE_ZEROS2 = Optimize[
757 |    CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR]  # "%+0" followed by an M
758 |  @ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]]  # "+0" at end of string
759 | ;
760 | 
761 | # Final cleanup of stray zeros: first every "+0", then any remaining "0".
762 | export DELETE_REMAINING_ZEROS = Optimize[
763 |    CDRewrite[Zero["+0"], "", "", SIGMA_STAR]
764 |  @ CDRewrite[Zero["0"], "", "", SIGMA_STAR]]
765 | ;
766 | 
767 | # Applies the revaluation map. For example in English, change [E4] to [E1] as a
768 | # modifier of [E3].
769 | export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR];
770 | 
771 | # Deletes the "%" and "+" marks and all power markers.
772 | export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR];
773 | 
774 | export CLEAN_SPACES = Optimize[  # normalizes spacing in three passes
775 |    CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR]  # collapse inner runs
776 |  @ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR]  # strip spaces at the start
777 |  @ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]]  # strip spaces at the end
778 | ;
779 | 
780 | d = b.kDigit;  # shorthand used by the germanic rule below
781 | 
782 | # Germanic inversion rule: d[E1]+u is rewritten to u+d[E1] for u in 1..9.
783 | germanic =
784 |     (I["1+"] d "[E1]" D["+1"])
785 |   | (I["2+"] d "[E1]" D["+2"])
786 |   | (I["3+"] d "[E1]" D["+3"])
787 |   | (I["4+"] d "[E1]" D["+4"])
788 |   | (I["5+"] d "[E1]" D["+5"])
789 |   | (I["6+"] d "[E1]" D["+6"])
790 |   | (I["7+"] d "[E1]" D["+7"])
791 |   | (I["8+"] d "[E1]" D["+8"])
792 |   | (I["9+"] d "[E1]" D["+9"])
793 | ;
794 | 
795 | germanic_inversion =  # optional, left-to-right; not exported from this file
796 |   CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt']
797 | ;
798 | 
799 | export GERMANIC_INVERSION = SIGMA_STAR;  # no inversion here: germanic_inversion is not used
800 | export ORDINAL_RESTRICTION =  # keeps strings whose only "@" is final, then drops it
801 |   Optimize[((SIGMA - "@")* "@") @ CDRewrite[D["@"], "", "", SIGMA_STAR]]
802 | ;
803 | nondigits = b.kBytes - b.kDigit;  # b.kBytes minus the digits
804 | export ORDINAL_SUFFIX = D[nondigits*];  # deletes any run of non-digit bytes
805 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/ordinals.tsv:
--------------------------------------------------------------------------------
  1 | 0	нулевая
  2 | 0	нулевого
  3 | 0	нулевое
  4 | 0	нулевой
  5 | 0	нулевом
  6 | 0	нулевому
  7 | 0	нулевую
  8 | 0	нулевые
  9 | 0	нулевым
 10 | 0	нулевым
 11 | 0	нулевыми
 12 | 0	нулевых
 13 | 1	первая
 14 | 1	первого
 15 | 1	первое
 16 | 1	первой
 17 | 1	первом
 18 | 1	первому
 19 | 1	первую
 20 | 1	первые
 21 | 1	первый
 22 | 1	первым
 23 | 1	первым
 24 | 1	первыми
 25 | 1	первых
 26 | 2	вторая
 27 | 2	второго
 28 | 2	второе
 29 | 2	второй
 30 | 2	втором
 31 | 2	второму
 32 | 2	вторую
 33 | 2	вторые
 34 | 2	вторым
 35 | 2	вторым
 36 | 2	вторыми
 37 | 2	вторых
 38 | 3	третий
 39 | 3	третье
 40 | 3	третьего
 41 | 3	третьей
 42 | 3	третьем
 43 | 3	третьему
 44 | 3	третьи
 45 | 3	третьим
 46 | 3	третьим
 47 | 3	третьими
 48 | 3	третьих
 49 | 3	третью
 50 | 3	третья
 51 | 4	четвертая
 52 | 4	четвертого
 53 | 4	четвертое
 54 | 4	четвертой
 55 | 4	четвертом
 56 | 4	четвертому
 57 | 4	четвертую
 58 | 4	четвертые
 59 | 4	четвертый
 60 | 4	четвертым
 61 | 4	четвертым
 62 | 4	четвертыми
 63 | 4	четвертых
 64 | 4	четвёртая
 65 | 4	четвёртого
 66 | 4	четвёртое
 67 | 4	четвёртой
 68 | 4	четвёртом
 69 | 4	четвёртому
 70 | 4	четвёртую
 71 | 4	четвёртые
 72 | 4	четвёртый
 73 | 4	четвёртым
 74 | 4	четвёртым
 75 | 4	четвёртыми
 76 | 4	четвёртых
 77 | 5	пятая
 78 | 5	пятого
 79 | 5	пятое
 80 | 5	пятой
 81 | 5	пятом
 82 | 5	пятому
 83 | 5	пятую
 84 | 5	пятые
 85 | 5	пятый
 86 | 5	пятым
 87 | 5	пятым
 88 | 5	пятыми
 89 | 5	пятых
 90 | 6	шестая
 91 | 6	шестого
 92 | 6	шестое
 93 | 6	шестой
 94 | 6	шестом
 95 | 6	шестому
 96 | 6	шестую
 97 | 6	шестые
 98 | 6	шестым
 99 | 6	шестым
100 | 6	шестыми
101 | 6	шестых
102 | 7	седьмая
103 | 7	седьмого
104 | 7	седьмое
105 | 7	седьмой
106 | 7	седьмом
107 | 7	седьмому
108 | 7	седьмую
109 | 7	седьмые
110 | 7	седьмым
111 | 7	седьмым
112 | 7	седьмыми
113 | 7	седьмых
114 | 8	восьмая
115 | 8	восьмого
116 | 8	восьмое
117 | 8	восьмой
118 | 8	восьмом
119 | 8	восьмому
120 | 8	восьмую
121 | 8	восьмые
122 | 8	восьмым
123 | 8	восьмым
124 | 8	восьмыми
125 | 8	восьмых
126 | 9	девятая
127 | 9	девятого
128 | 9	девятое
129 | 9	девятой
130 | 9	девятом
131 | 9	девятому
132 | 9	девятую
133 | 9	девятые
134 | 9	девятый
135 | 9	девятым
136 | 9	девятым
137 | 9	девятыми
138 | 9	девятых
139 | 10	десятая
140 | 10	десятого
141 | 10	десятое
142 | 10	десятой
143 | 10	десятом
144 | 10	десятому
145 | 10	десятую
146 | 10	десятые
147 | 10	десятый
148 | 10	десятым
149 | 10	десятым
150 | 10	десятыми
151 | 10	десятых
152 | 11	одиннадцатая
153 | 11	одиннадцатого
154 | 11	одиннадцатое
155 | 11	одиннадцатой
156 | 11	одиннадцатом
157 | 11	одиннадцатому
158 | 11	одиннадцатую
159 | 11	одиннадцатые
160 | 11	одиннадцатый
161 | 11	одиннадцатым
162 | 11	одиннадцатым
163 | 11	одиннадцатыми
164 | 11	одиннадцатых
165 | 12	двенадцатая
166 | 12	двенадцатого
167 | 12	двенадцатое
168 | 12	двенадцатой
169 | 12	двенадцатом
170 | 12	двенадцатому
171 | 12	двенадцатую
172 | 12	двенадцатые
173 | 12	двенадцатый
174 | 12	двенадцатым
175 | 12	двенадцатым
176 | 12	двенадцатыми
177 | 12	двенадцатых
178 | 13	тринадцатая
179 | 13	тринадцатого
180 | 13	тринадцатое
181 | 13	тринадцатой
182 | 13	тринадцатом
183 | 13	тринадцатому
184 | 13	тринадцатую
185 | 13	тринадцатые
186 | 13	тринадцатый
187 | 13	тринадцатым
188 | 13	тринадцатым
189 | 13	тринадцатыми
190 | 13	тринадцатых
191 | 14	четырнадцатая
192 | 14	четырнадцатого
193 | 14	четырнадцатое
194 | 14	четырнадцатой
195 | 14	четырнадцатом
196 | 14	четырнадцатому
197 | 14	четырнадцатую
198 | 14	четырнадцатые
199 | 14	четырнадцатый
200 | 14	четырнадцатым
201 | 14	четырнадцатым
202 | 14	четырнадцатыми
203 | 14	четырнадцатых
204 | 15	пятнадцатая
205 | 15	пятнадцатого
206 | 15	пятнадцатое
207 | 15	пятнадцатой
208 | 15	пятнадцатом
209 | 15	пятнадцатому
210 | 15	пятнадцатую
211 | 15	пятнадцатые
212 | 15	пятнадцатый
213 | 15	пятнадцатым
214 | 15	пятнадцатым
215 | 15	пятнадцатыми
216 | 15	пятнадцатых
217 | 16	шестнадцатая
218 | 16	шестнадцатого
219 | 16	шестнадцатое
220 | 16	шестнадцатой
221 | 16	шестнадцатом
222 | 16	шестнадцатому
223 | 16	шестнадцатую
224 | 16	шестнадцатые
225 | 16	шестнадцатый
226 | 16	шестнадцатым
227 | 16	шестнадцатым
228 | 16	шестнадцатыми
229 | 16	шестнадцатых
230 | 17	семнадцатая
231 | 17	семнадцатого
232 | 17	семнадцатое
233 | 17	семнадцатой
234 | 17	семнадцатом
235 | 17	семнадцатому
236 | 17	семнадцатую
237 | 17	семнадцатые
238 | 17	семнадцатый
239 | 17	семнадцатым
240 | 17	семнадцатым
241 | 17	семнадцатыми
242 | 17	семнадцатых
243 | 18	восемнадцатая
244 | 18	восемнадцатого
245 | 18	восемнадцатое
246 | 18	восемнадцатой
247 | 18	восемнадцатом
248 | 18	восемнадцатому
249 | 18	восемнадцатую
250 | 18	восемнадцатые
251 | 18	восемнадцатый
252 | 18	восемнадцатым
253 | 18	восемнадцатым
254 | 18	восемнадцатыми
255 | 18	восемнадцатых
256 | 19	девятнадцатая
257 | 19	девятнадцатого
258 | 19	девятнадцатое
259 | 19	девятнадцатой
260 | 19	девятнадцатом
261 | 19	девятнадцатому
262 | 19	девятнадцатую
263 | 19	девятнадцатые
264 | 19	девятнадцатый
265 | 19	девятнадцатым
266 | 19	девятнадцатым
267 | 19	девятнадцатыми
268 | 19	девятнадцатых
269 | 20	двадцатая
270 | 20	двадцатого
271 | 20	двадцатое
272 | 20	двадцатой
273 | 20	двадцатом
274 | 20	двадцатому
275 | 20	двадцатую
276 | 20	двадцатые
277 | 20	двадцатый
278 | 20	двадцатым
279 | 20	двадцатым
280 | 20	двадцатыми
281 | 20	двадцатых
282 | 30	тридцатая
283 | 30	тридцатого
284 | 30	тридцатое
285 | 30	тридцатой
286 | 30	тридцатом
287 | 30	тридцатому
288 | 30	тридцатую
289 | 30	тридцатые
290 | 30	тридцатый
291 | 30	тридцатым
292 | 30	тридцатым
293 | 30	тридцатыми
294 | 30	тридцатых
295 | 40	сороковая
296 | 40	сорокового
297 | 40	сороковое
298 | 40	сороковой
299 | 40	сороковом
300 | 40	сороковому
301 | 40	сороковую
302 | 40	сороковые
303 | 40	сороковым
304 | 40	сороковым
305 | 40	сороковыми
306 | 40	сороковых
307 | 50	пятидесятая
308 | 50	пятидесятого
309 | 50	пятидесятое
310 | 50	пятидесятой
311 | 50	пятидесятом
312 | 50	пятидесятому
313 | 50	пятидесятую
314 | 50	пятидесятые
315 | 50	пятидесятый
316 | 50	пятидесятым
317 | 50	пятидесятым
318 | 50	пятидесятыми
319 | 50	пятидесятых
320 | 60	шестидесятая
321 | 60	шестидесятого
322 | 60	шестидесятое
323 | 60	шестидесятой
324 | 60	шестидесятом
325 | 60	шестидесятому
326 | 60	шестидесятую
327 | 60	шестидесятые
328 | 60	шестидесятый
329 | 60	шестидесятым
330 | 60	шестидесятым
331 | 60	шестидесятыми
332 | 60	шестидесятых
333 | 70	семидесятая
334 | 70	семидесятого
335 | 70	семидесятое
336 | 70	семидесятой
337 | 70	семидесятом
338 | 70	семидесятому
339 | 70	семидесятую
340 | 70	семидесятые
341 | 70	семидесятый
342 | 70	семидесятым
343 | 70	семидесятым
344 | 70	семидесятыми
345 | 70	семидесятых
346 | 80	восьмидесятая
347 | 80	восьмидесятого
348 | 80	восьмидесятое
349 | 80	восьмидесятой
350 | 80	восьмидесятом
351 | 80	восьмидесятому
352 | 80	восьмидесятую
353 | 80	восьмидесятые
354 | 80	восьмидесятый
355 | 80	восьмидесятым
356 | 80	восьмидесятым
357 | 80	восьмидесятыми
358 | 80	восьмидесятых
359 | 90	девяностая
360 | 90	девяностого
361 | 90	девяностое
362 | 90	девяностой
363 | 90	девяностом
364 | 90	девяностому
365 | 90	девяностую
366 | 90	девяностые
367 | 90	девяностый
368 | 90	девяностым
369 | 90	девяностым
370 | 90	девяностыми
371 | 90	девяностых
372 | 100	сотая
373 | 100	сотого
374 | 100	сотое
375 | 100	сотой
376 | 100	сотом
377 | 100	сотому
378 | 100	сотую
379 | 100	сотые
380 | 100	сотый
381 | 100	сотым
382 | 100	сотым
383 | 100	сотыми
384 | 100	сотых
385 | 200	двухсотая
386 | 200	двухсотого
387 | 200	двухсотое
388 | 200	двухсотой
389 | 200	двухсотом
390 | 200	двухсотому
391 | 200	двухсотую
392 | 200	двухсотые
393 | 200	двухсотый
394 | 200	двухсотым
395 | 200	двухсотым
396 | 200	двухсотыми
397 | 200	двухсотых
398 | 300	трехсотая
399 | 300	трехсотого
400 | 300	трехсотое
401 | 300	трехсотой
402 | 300	трехсотом
403 | 300	трехсотому
404 | 300	трехсотую
405 | 300	трехсотые
406 | 300	трехсотый
407 | 300	трехсотым
408 | 300	трехсотым
409 | 300	трехсотыми
410 | 300	трехсотых
411 | 400	четырехсотая
412 | 400	четырехсотого
413 | 400	четырехсотое
414 | 400	четырехсотой
415 | 400	четырехсотом
416 | 400	четырехсотому
417 | 400	четырехсотую
418 | 400	четырехсотые
419 | 400	четырехсотый
420 | 400	четырехсотым
421 | 400	четырехсотым
422 | 400	четырехсотыми
423 | 400	четырехсотых
424 | 500	пятисотая
425 | 500	пятисотого
426 | 500	пятисотое
427 | 500	пятисотой
428 | 500	пятисотом
429 | 500	пятисотому
430 | 500	пятисотую
431 | 500	пятисотые
432 | 500	пятисотый
433 | 500	пятисотым
434 | 500	пятисотым
435 | 500	пятисотыми
436 | 500	пятисотых
437 | 600	шестисотая
438 | 600	шестисотого
439 | 600	шестисотое
440 | 600	шестисотой
441 | 600	шестисотом
442 | 600	шестисотому
443 | 600	шестисотую
444 | 600	шестисотые
445 | 600	шестисотый
446 | 600	шестисотым
447 | 600	шестисотым
448 | 600	шестисотыми
449 | 600	шестисотых
450 | 700	семисотая
451 | 700	семисотого
452 | 700	семисотое
453 | 700	семисотой
454 | 700	семисотом
455 | 700	семисотому
456 | 700	семисотую
457 | 700	семисотые
458 | 700	семисотый
459 | 700	семисотым
460 | 700	семисотым
461 | 700	семисотыми
462 | 700	семисотых
463 | 800	восьмисотая
464 | 800	восьмисотого
465 | 800	восьмисотое
466 | 800	восьмисотой
467 | 800	восьмисотом
468 | 800	восьмисотому
469 | 800	восьмисотую
470 | 800	восьмисотые
471 | 800	восьмисотый
472 | 800	восьмисотым
473 | 800	восьмисотым
474 | 800	восьмисотыми
475 | 800	восьмисотых
476 | 900	девятисотая
477 | 900	девятисотого
478 | 900	девятисотое
479 | 900	девятисотой
480 | 900	девятисотом
481 | 900	девятисотому
482 | 900	девятисотую
483 | 900	девятисотые
484 | 900	девятисотый
485 | 900	девятисотым
486 | 900	девятисотым
487 | 900	девятисотыми
488 | 900	девятисотых
489 | 1000	тысячная
490 | 1000	тысячного
491 | 1000	тысячное
492 | 1000	тысячной
493 | 1000	тысячном
494 | 1000	тысячному
495 | 1000	тысячную
496 | 1000	тысячные
497 | 1000	тысячный
498 | 1000	тысячным
499 | 1000	тысячным
500 | 1000	тысячными
501 | 1000	тысячных
502 | 1000000	миллионная
503 | 1000000	миллионного
504 | 1000000	миллионное
505 | 1000000	миллионной
506 | 1000000	миллионном
507 | 1000000	миллионному
508 | 1000000	миллионную
509 | 1000000	миллионные
510 | 1000000	миллионный
511 | 1000000	миллионным
512 | 1000000	миллионным
513 | 1000000	миллионными
514 | 1000000	миллионных
515 | 1000000000	миллиардная
516 | 1000000000	миллиардного
517 | 1000000000	миллиардное
518 | 1000000000	миллиардной
519 | 1000000000	миллиардном
520 | 1000000000	миллиардному
521 | 1000000000	миллиардную
522 | 1000000000	миллиардные
523 | 1000000000	миллиардный
524 | 1000000000	миллиардным
525 | 1000000000	миллиардным
526 | 1000000000	миллиардными
527 | 1000000000	миллиардных
528 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/spelled.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # This verbalizer is used whenever there is an LM symbol that consists of
16 | # letters immediately followed by "{spelled}". This strips the "{spelled}"
17 | # suffix.
18 | 
19 | import 'util/byte.grm' as b;
20 | import 'ru/classifier/cyrillic.grm' as c;
21 | import 'ru/verbalizer/lexical_map.grm' as l;
22 | import 'ru/verbalizer/numbers.grm' as n;
23 | 
24 | digit = b.kDigit @ n.CARDINAL_NUMBERS;  # A digit from b.kDigit, verbalized by the cardinal grammar.
25 | 
26 | char_set = (("a" | "A") : "letter-a")  # Latin letters: both cases map to one lowercase "letter-x" token.
27 |         | (("b" | "B") : "letter-b")
28 |         | (("c" | "C") : "letter-c")
29 |         | (("d" | "D") : "letter-d")
30 |         | (("e" | "E") : "letter-e")
31 |         | (("f" | "F") : "letter-f")
32 |         | (("g" | "G") : "letter-g")
33 |         | (("h" | "H") : "letter-h")
34 |         | (("i" | "I") : "letter-i")
35 |         | (("j" | "J") : "letter-j")
36 |         | (("k" | "K") : "letter-k")
37 |         | (("l" | "L") : "letter-l")
38 |         | (("m" | "M") : "letter-m")
39 |         | (("n" | "N") : "letter-n")
40 |         | (("o" | "O") : "letter-o")
41 |         | (("p" | "P") : "letter-p")
42 |         | (("q" | "Q") : "letter-q")
43 |         | (("r" | "R") : "letter-r")
44 |         | (("s" | "S") : "letter-s")
45 |         | (("t" | "T") : "letter-t")
46 |         | (("u" | "U") : "letter-u")
47 |         | (("v" | "V") : "letter-v")
48 |         | (("w" | "W") : "letter-w")
49 |         | (("x" | "X") : "letter-x")
50 |         | (("y" | "Y") : "letter-y")
51 |         | (("z" | "Z") : "letter-z")
52 |         | (digit)  # Digits are verbalized as cardinals (see digit above).
53 |         | ("&" : "@@AND@@")  # Placeholder token; presumably resolved by l.LEXICAL_MAP in SPELLED - confirm.
54 |         | ("." : "")  # These four separator characters are deleted.
55 |         | ("-" : "")
56 |         | ("_" : "")
57 |         | ("/" : "")
58 |         | (n.I["letter-"] c.kCyrillicAlpha)  # Cyrillic letters are kept, prefixed with "letter-" (assuming n.I inserts its argument).
59 |         ;
60 | 
61 | ins_space = "" : " ";  # Inserts a single space on the output side.
62 | 
63 | suffix = "{spelled}" : "";  # Deletes the "{spelled}" tag.
64 | 
65 | spelled = Optimize[char_set (ins_space char_set)* suffix];  # One or more characters, space-separated on output, followed by the tag.
66 | 
67 | export SPELLED = Optimize[spelled @ l.LEXICAL_MAP];  # The output of spelled is passed through the lexical map.
68 | 
69 | sigma_star = b.kBytes*;  # Closure of b.kBytes; the alphabet argument of the CDRewrite below.
70 | 
71 | # Gets rid of the letter- prefix since in some cases we don't want it.
72 | 
73 | del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star];  # Both contexts are empty, so the rewrite applies everywhere.
74 | 
75 | spelled_no_tag = Optimize[char_set (ins_space char_set)*];  # Same as spelled, but without the "{spelled}" suffix.
76 | 
77 | export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter];  # Unlike SPELLED, this is not composed with l.LEXICAL_MAP.
78 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/spoken_punct.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'ru/verbalizer/lexical_map.grm' as l;
16 | 
17 | punct =  # Maps each punctuation mark to a placeholder token (presumably a key of l.LEXICAL_MAP - confirm).
18 |    ("." : "@@PERIOD@@")
19 |  | ("," : "@@COMMA@@")
20 |  | ("!" : "@@EXCLAMATION_MARK@@")
21 |  | ("?" : "@@QUESTION_MARK@@")
22 | ;
23 | 
24 | export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP];  # The placeholder tokens are passed through the lexical map.
25 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/time.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import 'util/byte.grm' as b;
 16 | import 'ru/verbalizer/lexical_map.grm' as l;
 17 | import 'ru/verbalizer/numbers.grm' as n;
 18 | 
 19 | # Only handles 24-hour time with quarter-to, half-past and quarter-past.
 20 | 
 21 | increment_hour =  # Maps an hour string to the following hour; used by time_quarter_to.
 22 |     ("0" : "1")
 23 |   | ("1" : "2")
 24 |   | ("2" : "3")
 25 |   | ("3" : "4")
 26 |   | ("4" : "5")
 27 |   | ("5" : "6")
 28 |   | ("6" : "7")
 29 |   | ("7" : "8")
 30 |   | ("8" : "9")
 31 |   | ("9" : "10")
 32 |   | ("10" : "11")
 33 |   | ("11" : "12")
 34 |   | ("12" : "1")  # If someone uses 12, we assume 12-hour by default.
 35 |   | ("13" : "14")
 36 |   | ("14" : "15")
 37 |   | ("15" : "16")
 38 |   | ("16" : "17")
 39 |   | ("17" : "18")
 40 |   | ("18" : "19")
 41 |   | ("19" : "20")
 42 |   | ("20" : "21")
 43 |   | ("21" : "22")
 44 |   | ("22" : "23")
 45 |   | ("23" : "12")  # NOTE(review): wraps to 12 rather than 0 or 24 - presumably read as midnight; confirm.
 46 | ;
 47 | 
 48 | hours = Project[increment_hour, 'input'];  # Acceptor for the hour strings listed above ("0" through "23", no leading zeros).
 49 | 
 50 | d = b.kDigit;
 51 | D = d - "0";  # Any symbol of b.kDigit other than "0".
 52 | 
 53 | minutes09 = "0" D;  # A "0" followed by a non-zero digit, i.e. minutes 01-09 (assuming b.kDigit is 0-9).
 54 | 
 55 | minutes = ("1" | "2" | "3" | "4" | "5") d;  # A tens digit 1-5 followed by any digit, i.e. minutes 10-59 (assuming b.kDigit is 0-9).
 56 | 
 57 | __sep__ = ":";
 58 | sep_space = __sep__ : " ";  # The colon between hours and minutes becomes a space.
 59 | 
 60 | verbalize_hours = hours @ n.CARDINAL_NUMBERS;  # Hour strings restricted to `hours`, verbalized by the cardinal grammar.
 61 | 
 62 | verbalize_minutes =
 63 |    ("00" : "@@HOUR@@")  # Exactly on the hour.
 64 |  | (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS))  # 01-09: leading zero becomes @@TIME_ZERO@@, then the digit as a cardinal.
 65 |  | (minutes @ n.CARDINAL_NUMBERS)  # 10-59: read as a cardinal.
 66 | ;
 67 | 
 68 | time_basic = Optimize[verbalize_hours sep_space verbalize_minutes];  # "H:MM" -> "<hours> <minutes>".
 69 | 
 70 | # Special cases we handle right now.
 71 | # TODO: Need to allow for cases like
 72 | #
 73 | #   half twelve (in the UK English sense)
 74 | #   half twaalf (in the Dutch sense)
 75 | 
 76 | time_quarter_past =  # "H:15": emits the quarter/after placeholders, then the hour (assuming n.I inserts and n.D deletes).
 77 |    n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "]
 78 |    verbalize_hours
 79 |    n.D[__sep__ "15"];
 80 | 
 81 | time_half_past =  # "H:30": emits the half/after placeholders, then the hour.
 82 |    n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "]
 83 |    verbalize_hours
 84 |    n.D[__sep__ "30"];
 85 | 
 86 | time_quarter_to =  # "H:45": emits the quarter/before placeholders, then the *next* hour.
 87 |    n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "]
 88 |    (increment_hour @ verbalize_hours)  # The hour is incremented before being verbalized.
 89 |    n.D[__sep__ "45"];
 90 | 
 91 | time_extra = Optimize[
 92 |   time_quarter_past | time_half_past | time_quarter_to]
 93 | ;
 94 | 
 95 | # Basic time periods which most languages can be expected to have.
 96 | __am__ = "a.m." | "am" | "AM" | "утра";
 97 | __pm__ = "p.m." | "pm" | "PM" | "вечера";
 98 | 
 99 | period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@");  # Each period spelling maps to its placeholder token.
100 | 
101 | time_variants = time_basic | time_extra;
102 | 
103 | time = Optimize[  # An optional period either before or after the time.
104 |     (period (" " | n.I[" "]))? time_variants  # The separating space is kept if present in the input, otherwise added via n.I.
105 |  |  time_variants ((" " | n.I[" "]) period)?]
106 | ;
107 | 
108 | export TIME = Optimize[time @ l.LEXICAL_MAP];  # Placeholder tokens are passed through the lexical map.
109 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/urls.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Rules for URLs and email addresses.
16 | 
17 | import 'util/byte.grm' as bytelib;
18 | import 'ru/verbalizer/lexical_map.grm' as l;
19 | 
20 | ins_space = "" : " ";  # Inserts a single space on the output side.
21 | dot = "." : "@@URL_DOT_EXPRESSION@@";  # A dot becomes a placeholder token.
22 | at = "@" : "@@AT@@";  # An at-sign becomes a placeholder token.
23 | 
24 | url_suffix =  # NOTE(review): the right side of ":" uses the transducers dot and ins_space; presumably Thrax takes their output side here - confirm.
25 |   (".com" : dot ins_space "com") |
26 |   (".gov" : dot ins_space "gov") |
27 |   (".edu" : dot ins_space "e d u") |  # Only ".edu" is written with spaces between its letters.
28 |   (".org" : dot ins_space "org") |
29 |   (".net" : dot ins_space "net")
30 | ;
31 | 
32 | letter_string = (bytelib.kAlnum)* bytelib.kAlnum;  # One or more bytelib.kAlnum symbols, passed through unchanged.
33 | 
34 | letter_string_dot =  # One or more letter_strings joined by dots; each dot is verbalized with a space on either side.
35 |   ((letter_string ins_space dot ins_space)* letter_string)
36 | ;
37 | 
38 | # Rules for URLs.
39 | export URL = Optimize[  # Dotted name, a space, then one of the known suffixes; the result goes through the lexical map.
40 |  ((letter_string_dot) (ins_space)
41 |   (url_suffix)) @ l.LEXICAL_MAP
42 | ];
43 | 
44 | # Rules for email addresses.
45 | letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum);  # One or more symbols with a space inserted between each pair.
46 | 
47 | letter_by_letter_dot =  # Dot-separated labels, each spelled symbol by symbol.
48 |   ((letter_by_letter ins_space dot ins_space)*
49 |   letter_by_letter)
50 | ;
51 | 
52 | export EMAIL1 = Optimize[  # Both the part before "@" and the domain are spelled symbol by symbol.
53 |  ((letter_by_letter) (ins_space)
54 |   (at) (ins_space)
55 |   (letter_by_letter_dot) (ins_space)
56 |   (url_suffix)) @ l.LEXICAL_MAP
57 | ];
58 | 
59 | export EMAIL2 = Optimize[  # The part before "@" is spelled out; the domain labels are kept as whole strings.
60 |  ((letter_by_letter) (ins_space)
61 |   (at) (ins_space)
62 |   (letter_string_dot) (ins_space)
63 |   (url_suffix)) @ l.LEXICAL_MAP
64 | ];
65 | 
66 | export EMAILS = Optimize[  # Union of the two email readings.
67 |   EMAIL1 | EMAIL2
68 | ];
69 | 


--------------------------------------------------------------------------------
/src/ru/verbalizer/verbalizer.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import 'util/util.grm' as util;
16 | import 'ru/verbalizer/extra_numbers.grm' as e;
17 | import 'ru/verbalizer/float.grm' as f;
18 | import 'ru/verbalizer/math.grm' as ma;
19 | import 'ru/verbalizer/miscellaneous.grm' as mi;
20 | import 'ru/verbalizer/money.grm' as mo;
21 | import 'ru/verbalizer/numbers.grm' as n;
22 | import 'ru/verbalizer/numbers_plus.grm' as np;
23 | import 'ru/verbalizer/spelled.grm' as s;
24 | import 'ru/verbalizer/spoken_punct.grm' as sp;
25 | import 'ru/verbalizer/time.grm' as t;
26 | import 'ru/verbalizer/urls.grm' as u;
27 | 
28 | export VERBALIZER = Optimize[RmWeight[  # Union of the individual verbalizers, composed with util.CLEAN_SPACES, with weights removed.
29 |  (  e.MIXED_NUMBERS
30 |   | e.DIGITS
31 |   | f.FLOAT
32 |   | ma.ARITHMETIC
33 |   | mi.MISCELLANEOUS
34 |   | mo.MONEY
35 |   | n.CARDINAL_NUMBERS
36 |   | n.ORDINAL_NUMBERS
37 |   | np.NUMBERS_PLUS
38 |   | s.SPELLED
39 |   | sp.SPOKEN_PUNCT
40 |   | t.TIME
41 |   | u.URL) @ util.CLEAN_SPACES  # NOTE(review): urls.grm also exports EMAILS, which is not part of this union - confirm intended.
42 | ]];
43 | 


--------------------------------------------------------------------------------
/src/universal/README.md:
--------------------------------------------------------------------------------
1 | # Language-universal grammar definitions
2 | 
3 | This directory contains various language-universal grammar definitions.
4 | 


--------------------------------------------------------------------------------
/src/universal/roman_numerals.tsv:
--------------------------------------------------------------------------------
 1 | i	1
 2 | ii	2
 3 | iii	3
 4 | iv	4
 5 | v	5
 6 | vi	6
 7 | vii	7
 8 | viii	8
 9 | ix	9
10 | x	10
11 | xi	11
12 | xii	12
13 | xiii	13
14 | xiv	14
15 | xv	15
16 | xvi	16
17 | xvii	17
18 | xviii	18
19 | xix	19
20 | xx	20
21 | xxi	21
22 | xxii	22
23 | xxiii	23
24 | xxiv	24
25 | xxv	25
26 | xxvi	26
27 | xxvii	27
28 | xxviii	28
29 | xxix	29
30 | xxx	30
31 | xxxi	31
32 | xxxii	32
33 | xxxiii	33
34 | xxxiv	34
35 | xxxv	35
36 | xxxvi	36
37 | xxxvii	37
38 | xxxviii	38
39 | xxxix	39
40 | xl	40
41 | xli	41
42 | xlii	42
43 | xliii	43
44 | xliv	44
45 | xlv	45
46 | xlvi	46
47 | xlvii	47
48 | xlviii	48
49 | xlix	49
50 | mcmxciv	1994
51 | mcmxcv	1995
52 | mcmxcvi	1996
53 | mcmxcvii	1997
54 | mcmxcviii	1998
55 | mcmxcix	1999
56 | mm	2000
57 | mmi	2001
58 | mmii	2002
59 | mmiii	2003
60 | mmiv	2004
61 | mmv	2005
62 | mmvi	2006
63 | mmvii	2007
64 | mmviii	2008
65 | mmix	2009
66 | mmx	2010
67 | mmxi	2011
68 | mmxii	2012
69 | mmxiii	2013
70 | mmxiv	2014
71 | mmxv	2015
72 | mmxvi	2016
73 | mmxvii	2017
74 | mmxviii	2018
75 | mmxix	2019
76 | mmxx	2020
77 | mmxxi	2021
78 | mmxxii	2022
79 | mmxxiii	2023
80 | mmxxiv	2024
81 | mmxxv	2025
82 | mmxxvi	2026
83 | mmxxvii	2027
84 | mmxxviii	2028
85 | mmxxix	2029
86 | mmxxx	2030
87 | mmxxxi	2031
88 | mmxxxii	2032
89 | mmxxxiii	2033
90 | mmxxxiv	2034
91 | mmxxxv	2035
92 | 


--------------------------------------------------------------------------------
/src/universal/thousands_punct.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | #
 15 | # Specifies common ways of delimiting thousands in digit strings.
 16 | 
 17 | import 'util/byte.grm' as bytelib;
 18 | import 'util/util.grm' as util;
 19 | 
 20 | killcomma = "," : "";  # Deletes a comma.
 21 | dot2comma = "." : ",";  # Rewrites a dot as a comma.
 22 | spaces2comma = " "+ : ",";  # Rewrites a run of one or more spaces as a single comma.
 23 | 
 24 | zero = "0";
 25 | 
 26 | # no_delimiter = zero | "[1-9][0-9]*";
 27 | export no_delimiter = zero | (util.d1to9 bytelib.kDigit*);  # A lone zero, or a digit string with no leading zero and no delimiters.
 28 | 
 29 | # delim_map_dot = ("[0-9]" | ("\." : ","))*;
 30 | delim_map_dot = (bytelib.kDigit | dot2comma)*;  # Passes digits through and turns every dot into a comma.
 31 | 
 32 | # delim_map_space = ("[0-9]" | (" +" : ","))*;
 33 | delim_map_space = (bytelib.kDigit | spaces2comma)*;  # Passes digits through and turns every run of spaces into a comma.
 34 | 
 35 | ## Western systems group thousands. Korean goes this way too.
 36 | 
 37 | # comma_thousands = zero | ("[1-9][0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9]")*);
 38 | export comma_thousands = zero | (util.d1to9 bytelib.kDigit{0,2} (killcomma bytelib.kDigit{3})*);  # 1-3 leading digits, then comma-separated groups of three; commas are deleted.
 39 | 
 40 | # ComposeFst: 1st argument cannot match on output labels and 2nd argument
 41 | # cannot match on input labels (sort?).
 42 | export dot_thousands = delim_map_dot @ comma_thousands;  # Dots are first rewritten as commas, then handled as in comma_thousands.
 43 | 
 44 | # ComposeFst: 1st argument cannot match on output labels and 2nd argument
 45 | # cannot match on input labels (sort?).
 46 | export space_thousands = delim_map_space @ comma_thousands;  # Runs of spaces are first rewritten as commas, then handled as in comma_thousands.
 47 | 
 48 | ## Chinese prefers grouping by fours (by ten-thousands).
 49 | 
 50 | # chinese_comma =
 51 | #   zero | ("[1-9][0-9]?[0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9][0-9]")*);
 52 | export chinese_comma = zero | (util.d1to9 (bytelib.kDigit{0,3}) (killcomma bytelib.kDigit{4})*);  # 1-4 leading digits, then comma-separated groups of four; commas are deleted.
 53 | 
 54 | ## The Indian system is more complex because of the Stravinskian alternation
 55 | ## between lakhs and crores.
 56 | ##
 57 | ## According to Wikipedia:
 58 | ##
 59 | ## Indian English       Value
 60 | ## One                  1
 61 | ## Ten                  10
 62 | ## Hundred              100
 63 | ## Thousand             1,000
 64 | ## Lakh                 1,00,000
 65 | ## Crore                1,00,00,000
 66 | ## Arab                 1,00,00,00,000
 67 | ## Kharab               1,00,00,00,00,000
 68 | 
 69 | # indian_hundreds = "[1-9][0-9]?[0-9]?";
 70 | indian_hundreds = util.d1to9 bytelib.kDigit{0,2};  # One to three digits with no leading zero and no delimiter.
 71 | 
 72 | ## Up to 99,999.
 73 | 
 74 | # indian_comma_thousands = "[1-9][0-9]?" ("," : "") "[0-9][0-9][0-9]";
 75 | indian_comma_thousands = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{3};  # 1-2 digits, comma (deleted), 3 digits.
 76 | 
 77 | ## Up to 99,99,999.
 78 | 
 79 | # indian_comma_lakhs = "[1-9][0-9]?" ("," : "") "[0-9][0-9]" ("," : "") "[0-9][0-9][0-9]";
 80 | indian_comma_lakhs = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{2} killcomma bytelib.kDigit{3};  # 1-2 digits, then groups of 2 and 3; commas are deleted.
 81 | 
 82 | ## Up to 999,99,99,999
 83 | 
 84 | indian_comma_crores =  # 1-3 leading digits, an optional 2-digit group, then groups of 2 and 3; commas are deleted.
 85 |     util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
 86 |     (bytelib.kDigit{2} killcomma)?  # When this group is absent the pattern also matches lakh-sized strings such as "1,00,000".
 87 |     bytelib.kDigit{2} killcomma
 88 |     bytelib.kDigit{3}
 89 | ;
 90 | 
 91 | ## Up to 99,999,99,99,999.
 92 | 
 93 | indian_comma_thousand_crores =  # 1-2 leading digits, then groups of 3, 2, 2 and 3; commas are deleted.
 94 |     util.d1to9 bytelib.kDigit? killcomma
 95 |     bytelib.kDigit{3} killcomma
 96 |     bytelib.kDigit{2} killcomma
 97 |     bytelib.kDigit{2} killcomma
 98 |     bytelib.kDigit{3}
 99 | ;
100 | 
101 | ## Up to 999,99,999,99,99,999.
102 | 
103 | indian_comma_lakh_crores =  # 1-3 leading digits, then groups of 2, 3, 2, 2 and 3; commas are deleted.
104 |     util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma
105 |     bytelib.kDigit{2} killcomma
106 |     bytelib.kDigit{3} killcomma
107 |     bytelib.kDigit{2} killcomma
108 |     bytelib.kDigit{2} killcomma
109 |     bytelib.kDigit{3}
110 | ;
111 | 
112 | export indian_comma =  # Union of all the Indian-style groupings defined above, plus a lone zero.
113 |     zero
114 |   | indian_hundreds
115 |   | indian_comma_thousands
116 |   | indian_comma_lakhs
117 |   | indian_comma_crores
118 |   | indian_comma_thousand_crores
119 |   | indian_comma_lakh_crores
120 | ;
121 | 
122 | # Indian number system with dots.
123 | export indian_dot_number = delim_map_dot @ indian_comma;  # Dots are rewritten as commas before matching indian_comma.
124 | 
125 | # Indian number system with spaces.
126 | export indian_space_number = delim_map_space @ indian_comma;  # Runs of spaces are rewritten as commas before matching indian_comma.
127 | 


--------------------------------------------------------------------------------
/src/util/README.md:
--------------------------------------------------------------------------------
1 | # Utility grammar definitions
2 | 
3 | This directory contains various utility grammar definitions.
4 | 


--------------------------------------------------------------------------------
/src/util/arithmetic.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | #
 15 | # Basic arithmetic on S-expressions. Exported arithmetic transducers may either:
 16 | #
 17 | # * Support weak vigesimal addition and multiplication...
 18 | #
 19 | #   (+ 20 17 +) -> 37
 20 | #   (+ 20 10 7 +) -> 37
 21 | #   (* 4 20 *) -> 80
 22 | #
 23 | #   ...or not.
 24 | #
 25 | # * Support "Germanic decade flop" addition....
 26 | #
 27 | #   (+ 8 20 +) -> 28
 28 | #   (+ 4 60 +) -> 64
 29 | #
 30 | #   ...or not.
 31 | #
 32 | # * Support multiplication where the left-hand side multiplicand is of a higher
 33 | #   order than the right-hand side multiplicand.
 34 | #
  35 | #   (* 1000 100 *) -> 100000
 36 | #
 37 | #   ...or not.
 38 | #
 39 | # However, modulo these exceptions, arithmetic transducers do not support
 40 | # addition that requires "carrying", or multiplication where the right-hand
 41 | # side multiplicand is not a power of ten. So this is not a *generic*
 42 | # S-expression evaluator.
 43 | #
 44 | # LEAVES is a transducer that accepts symbols in delta but deletes symbols
 45 | # in sigma - delta. So it essentially removes markup.
 46 | #
 47 | # REPEAT_FILTER is an acceptor which blocks derivations of the form
 48 | #
  49 | #   (+ (* 50 1000 *) (* 4 1000 *) ...)   "fifty thousand four thousand..."
 50 | #
 51 | # in languages where that is not licensed.
 52 | 
 53 | import 'util/byte.grm' as b;
 54 | 
  55 | # Deleter FST: rewrites whatever expr matches to the empty string.
  56 | func D[expr] {
  57 |   return expr : "";
  58 | }
 59 | 
  60 | delta = b.kDigit;  # The leaf symbols (digits).
  61 | sigma = delta | " " | "(" | ")" | "+" | "*";  # The full alphabet: digits plus the S-expression markup symbols.
  62 | 
  63 | sigmastar = sigma*;
  64 | deltastar = delta*;
  65 | 
  66 | rparen = Optimize["+)" | "*)"];  # The two closing tokens, for sums and for products.
  67 | space_or_rparen = Optimize[" " | rparen];  # A space or a closing token.
 68 | 
 69 | ## Multiplication.
 70 | 
 71 | # Generic multiplication where the RHS is a power of ten.
 72 | 
  73 | del_one = Optimize[delta+ D[" 1"] "0"+];  # Deletes the " 1" that starts the RHS, so the LHS digits are followed directly by the RHS zeros.
  74 | 
  75 | test1_1 = AssertEqual["2 10"      @ del_one,      "20"];
  76 | test1_2 = AssertEqual["20 10"     @ del_one,     "200"];
  77 | test1_3 = AssertEqual["2 100"     @ del_one,     "200"];
  78 | test1_4 = AssertEqual["20 100"    @ del_one,    "2000"];
  79 | test1_5 = AssertEqual["200 100"   @ del_one,   "20000"];
  80 | test1_6 = AssertEqual["2 1000"    @ del_one,    "2000"];
  81 | test1_7 = AssertEqual["20 1000"   @ del_one,   "20000"];
  82 | test1_8 = AssertEqual["200 1000"  @ del_one,  "200000"];
  83 | test1_9 = AssertEqual["2000 1000" @ del_one, "2000000"];
 84 | 
  85 | # Generic multiplication where the RHS is a power of ten and the LHS has fewer
  86 | # trailing zeros than the RHS.
  87 | del_one_restricted = Optimize[ # e.g., "2 x 10", "2 x 100", etc.
  88 |                                delta      D[" 1"]        "0"+ |
  89 |                                # e.g., "20 x 100", etc.
  90 |                                delta{1,2} D[" 1"] "0"    "0"+ |
  91 |                                # e.g., "200 x 1000", etc.
  92 |                                delta{2,3} D[" 1"] "0"{2} "0"+ |
  93 |                                delta{3,4} D[" 1"] "0"{3} "0"+ |  # e.g., "2000 x 10000", etc.
  94 |                                delta{4,5} D[" 1"] "0"{4} "0"+];  # In every alternative the LHS has at most as many digits as the RHS has zeros.
  95 | 
  96 | test2_01 = AssertEqual["2 10"     @ del_one_restricted,               "20"];
  97 | test2_02 = AssertNull["20 10"     @ del_one_restricted];
  98 | test2_03 = AssertEqual["2 100"    @ del_one_restricted,              "200"];
  99 | test2_04 = AssertEqual["20 100"   @ del_one_restricted,             "2000"];
 100 | test2_05 = AssertNull[ "200 100"  @ del_one_restricted];
 101 | test2_06 = AssertEqual["2 1000"   @ del_one_restricted,             "2000"];
 102 | test2_07 = AssertEqual["20 1000"  @ del_one_restricted,            "20000"];
 103 | test2_08 = AssertEqual["200 1000" @ del_one_restricted,           "200000"];
 104 | test2_09 = AssertNull["2000 1000" @ del_one_restricted];
 105 | test2_10 = AssertEqual["1000 10000000" @ del_one_restricted, "10000000000"];
106 | 
 107 | # Multiplication of vigesimal base for weak vigesimal systems
 108 | 
 109 | vigesimal_times_map = ("1" : "2") | ("2" : "4") | ("3" : "6") | ("4" : "8");  # Doubles a single digit in the range 1-4.
 110 | 
 111 | del_two = Optimize[vigesimal_times_map D[" 2"] "0"+];  # Doubles the LHS digit, deletes the " 2" of the RHS, keeps the RHS zeros.
 112 | 
 113 | test3_1 = AssertEqual["1 20" @ del_two, "20"];
 114 | test3_2 = AssertEqual["2 20" @ del_two, "40"];
 115 | test3_3 = AssertEqual["3 20" @ del_two, "60"];
 116 | test3_4 = AssertEqual["4 20" @ del_two, "80"];
 117 | 
 118 | # Multiplication of vigesimal base restricted to cases where the LHS is [1-4]
 119 | # and the RHS is a 2 followed by one or more zeros (20, 200, ...).
 120 | 
 121 | del_two_restricted = Optimize[vigesimal_times_map D[" 2"] "0"+];  # Same definition as del_two; vigesimal_times_map already limits the LHS to 1-4.
 122 | 
 123 | test4_1 = AssertEqual["1 20" @ del_two_restricted, "20"];
 124 | test4_2 = AssertEqual["2 20" @ del_two_restricted, "40"];
 125 | test4_3 = AssertEqual["3 20" @ del_two_restricted, "60"];
 126 | test4_4 = AssertEqual["4 20" @ del_two_restricted, "80"];
 127 | test4_5 = AssertNull["5 20" @ del_two_restricted];
 128 | test4_6 = AssertNull["10 20" @ del_two_restricted];
129 | 
130 | products = del_one | del_two;
131 | products_restricted = del_one_restricted | del_two_restricted;
132 | 
133 | multiplication = CDRewrite[D["(* "] products D[" *)"], "", "", sigmastar];
134 | multiplication_restricted = CDRewrite[D["(* "] products_restricted D[" *)"],
135 |                                       "", "", sigmastar];
136 | 
137 | test5_1 = AssertEqual["(* 8 100 *)"    @ multiplication, "800"];
138 | test5_2 = AssertEqual["(* 1 100 *)"    @ multiplication, "100"];
139 | test5_3 = AssertEqual["(* 4 20 *)"     @ multiplication, "80"];
140 | test5_4 = AssertEqual["(* 13 1000 *)"  @ multiplication, "13000"];
141 | test5_5 = AssertEqual["(* 13000 10 *)" @ multiplication, "130000"];
142 | test5_6 = AssertEqual["(* 13000 10 *)" @ multiplication_restricted,
143 |                       "(* 13000 10 *)"];  # Can't reduce this.
144 | 
145 | ## Addition.
146 | 
147 | insum = "+" (sigma - "(")*;
148 | rcon = insum deltastar;
149 | 
150 | # Generic zero deletion up to 12.
151 | del_zero = Optimize[
152 |    # Handles lone zero inside a plus statement.
153 |    CDRewrite[D[" 0"], rcon, space_or_rparen, sigmastar] @
154 |    # If we need to go any larger, we probably should switch to a PDT.
155 |    CDRewrite[D["0"{12} " "] delta{12}, rcon, space_or_rparen, sigmastar] @
156 |    CDRewrite[D["0"{11} " "] delta{11}, rcon, space_or_rparen, sigmastar] @
157 |    CDRewrite[D["0"{10} " "] delta{10}, rcon, space_or_rparen, sigmastar] @
158 |    CDRewrite[D["0"{9} " "]  delta{9},  rcon, space_or_rparen, sigmastar] @
159 |    CDRewrite[D["0"{8} " "]  delta{8},  rcon, space_or_rparen, sigmastar] @
160 |    CDRewrite[D["0"{7} " "]  delta{7},  rcon, space_or_rparen, sigmastar] @
161 |    CDRewrite[D["0"{6} " "]  delta{6},  rcon, space_or_rparen, sigmastar] @
162 |    CDRewrite[D["0"{5} " "]  delta{5},  rcon, space_or_rparen, sigmastar] @
163 |    CDRewrite[D["0"{4} " "]  delta{4},  rcon, space_or_rparen, sigmastar] @
164 |    CDRewrite[D["0"{3} " "]  delta{3},  rcon, space_or_rparen, sigmastar] @
165 |    CDRewrite[D["0"{2} " "]  delta{2},  rcon, space_or_rparen, sigmastar] @
166 |    CDRewrite[D["0" " "]     delta,     rcon, space_or_rparen, sigmastar]];
167 | 
168 | ## Weak vigesimal cases involving scores and teens.
169 | 
170 | vigesimal_plus_map = Optimize[("20 1" : "3") delta |
171 |                               ("40 1" : "5") delta |
172 |                               ("60 1" : "7") delta |
173 |                               ("80 1" : "9") delta];
174 | 
175 | vigesimal = CDRewrite[vigesimal_plus_map, insum, space_or_rparen, sigmastar];
176 | 
177 | ## Germanic decade flop.
178 | 
179 | germanic_map = StringFile['util/germanic.tsv'];
180 | 
181 | germanic = CDRewrite[germanic_map, insum, space_or_rparen, sigmastar];
182 | 
183 | sums = Optimize[germanic @ vigesimal @ del_zero];
184 | 
185 | # Deletes the surrounding "(+ +)" around a successful reduction.
186 | 
187 | del_plus = CDRewrite[D["(+ "] delta+ D[" +)"], "", "", sigmastar];
188 | 
189 | addition = Optimize[sums @ del_plus];
190 | 
191 | test6_1 = AssertEqual["(+ 30 2 +)" @ addition, "32"];
192 | test6_2 = AssertEqual["(+ 300 20 1 +)" @ addition, "321"];
193 | test6_3 = AssertEqual["(+ 80 17 +)" @ addition, "97"];
194 | test6_4 = AssertEqual["(+ 4 50 +)" @ addition, "54"];
195 | test6_5 = AssertEqual["(+ 3000 80 17 +)" @ addition, "3097"];
196 | test6_6 = AssertEqual["(+ 3000 4 50 +)" @ addition, "3054"];
197 | test6_7 = AssertEqual["(+ 0 10 +)" @ addition, "10"];
198 | test6_8 = AssertEqual["(+ 0 20 +)" @ addition, "20"];
199 | test6_9 = AssertEqual["(+ 200 (+ 0 20 +) +)" @ addition @ addition, "220"];
200 | 
201 | ## Export statements.
202 | 
203 | export ARITHMETIC = Optimize[multiplication @ addition];
204 | export ARITHMETIC_RESTRICTED = Optimize[multiplication_restricted @ addition];
205 | 
206 | # Lightweight versions that lack the vigesimal /vɪˈdʒɛsɪməl/ or Germanic decade
207 | # flop, or both.
208 | 
209 | export ARITHMETIC_BASIC = Optimize[multiplication @ del_zero @ del_plus];
210 | export ARITHMETIC_BASIC_RESTRICTED = Optimize[multiplication_restricted @
211 |                                               del_zero @ del_plus];
212 | 
213 | export ARITHMETIC_GERMANIC = Optimize[multiplication @ germanic @ del_zero @
214 |                                       del_plus];
215 | 
216 | export ARITHMETIC_GERMANIC_RESTRICTED = Optimize[multiplication_restricted @
217 |                                                  germanic @ del_zero @
218 |                                                  del_plus];
219 | 
220 | export ARITHMETIC_VIGESIMAL = Optimize[multiplication @ vigesimal @ del_zero @
221 |                                        del_plus];
222 | export ARITHMETIC_VIGESIMAL_RESTRICTED = Optimize[multiplication_restricted @
223 |                                                   vigesimal @ del_zero @
224 |                                                   del_plus];
225 | 
226 | ## LEAVES transducer.
227 | 
228 | nonterm = "+" | "*";
229 | export LEAVES = Optimize[CDRewrite["(" nonterm " " | " " nonterm ")" : "",
230 |                                    "", "", sigmastar]];
231 | 
232 | test7 = AssertEqual["(* (+ (* 4 20 *) 10 7 +) 1000 *)" @ LEAVES,
233 |                     "4 20 10 7 1000"];
234 | 
235 | ## Optional filter for repeated large powers of ten, to be applied to leaves.
236 | # Returns sigstar minus every string that contains expr as a substring.
237 | func Filter[expr, sigstar] {
238 |   return Optimize[sigstar - (sigstar expr sigstar)];
239 | }
240 | # Filters out strings with two occurrences of expr split by " " or " ... ".
241 | func FilterMoreThanOne[expr, sigstar] {
242 |   return Filter[expr " " (sigstar " ")? expr, sigstar];
243 | }
244 | 
245 | filter_sigstar = (delta | " ")*;
246 | 
247 | export REPEAT_FILTER =
248 |   Optimize[FilterMoreThanOne["1000", filter_sigstar] @
249 |            FilterMoreThanOne["10000", filter_sigstar] @
250 |            FilterMoreThanOne["100000", filter_sigstar] @
251 |            FilterMoreThanOne["1000000", filter_sigstar] @
252 |            FilterMoreThanOne["1000000000", filter_sigstar] @
253 |            FilterMoreThanOne["1000000000000", filter_sigstar]];
254 | 
255 | test8_1 = AssertNull["50 1000 4 1000" @ REPEAT_FILTER];
256 | test8_2 = AssertNull["50 1000000 4 1000000" @ REPEAT_FILTER];
257 | test8_3 = AssertEqual["50 100 1000" @ REPEAT_FILTER, "50 100 1000"];
258 | test8_4 = AssertNull["20 1000 1000 20" @ REPEAT_FILTER];
259 | test8_5 = AssertEqual[
260 |     "70 1000000 400 0 70 0 7 1000 100 0 70" @ REPEAT_FILTER,
261 |     "70 1000000 400 0 70 0 7 1000 100 0 70"];  # Must pass unchanged.
262 | test8_6 = AssertNull[
263 |     "70 1000000 400 0 70 1000 0 7 1000 100 0 70" @ REPEAT_FILTER];
264 | 
265 | # Filters to force the output of *inverting* the arithmetic as applied to a
266 | # digit string to be a well-formed sexpr:
267 | 
268 | not_space = b.kNotSpace;
269 | 
270 | # Things like (+ 1 +)(+ 9 +).
271 | 
272 | bad_parens  =
273 |      sigmastar ")" not_space sigmastar
274 |  |   sigmastar not_space "("  sigmastar
275 | ;
276 | 
277 | no_bad_parens = sigmastar - bad_parens;
278 | 
279 | # Things like (+ 1 +) or (* 3 *).
280 | 
281 | spurious_operators =
282 |     sigmastar "(+ " delta+ " +)" sigmastar
283 |   | sigmastar "(* " delta+ " *)" sigmastar
284 | ;
285 | 
286 | no_spurious_operators = sigmastar - spurious_operators;
287 | 
288 | no_strings_of_zeros =
289 |   sigmastar - (sigmastar " " "0"+ " " "0"+ " " sigmastar)
290 | ;
291 | 
292 | no_bad_sequences =
293 |   Optimize[no_bad_parens @ no_strings_of_zeros]
294 | ;
295 | 
296 | export SEXP_FILTER = Optimize[
297 |  (  delta+
298 |   | "(* " no_bad_sequences " *)"
299 |   | "(+ " no_bad_sequences " +)") @ no_spurious_operators]
300 | ;
301 | 
302 | # For convenience adds inverses of the arithmetic rules:
303 | 
304 | export IARITHMETIC = Invert[ARITHMETIC];
305 | 
306 | export IARITHMETIC_RESTRICTED = Invert[ARITHMETIC_RESTRICTED];
307 | 
308 | export IARITHMETIC_BASIC = Invert[ARITHMETIC_BASIC];
309 | 
310 | export IARITHMETIC_BASIC_RESTRICTED = Invert[ARITHMETIC_BASIC_RESTRICTED];
311 | 
312 | export IARITHMETIC_GERMANIC = Invert[ARITHMETIC_GERMANIC];
313 | 
314 | export IARITHMETIC_GERMANIC_RESTRICTED =
315 |   Invert[ARITHMETIC_GERMANIC_RESTRICTED]
316 | ;
317 | 
318 | export IARITHMETIC_VIGESIMAL = Invert[ARITHMETIC_VIGESIMAL];
319 | 
320 | export IARITHMETIC_VIGESIMAL_RESTRICTED =
321 |     Invert[ARITHMETIC_VIGESIMAL_RESTRICTED]
322 | ;
323 | 
324 | ## This should be applied on the lefthand side of FG to ensure that only
325 | ## digit input is permitted.
326 | export DELTA_STAR = deltastar;
327 | 


--------------------------------------------------------------------------------
/src/util/byte.grm:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Standard constants for ASCII (byte) based strings.  This mirrors the
16 | # functions provided by C/C++'s ctype.h library.
17 | 
18 | # Note that [0] is missing; matching the string-termination character is kinda weird.
19 | export kBytes = Optimize[
20 |   "[1]" |   "[2]" |   "[3]" |   "[4]" |   "[5]" |   "[6]" |   "[7]" |   "[8]" |   "[9]" |  "[10]" |
21 |  "[11]" |  "[12]" |  "[13]" |  "[14]" |  "[15]" |  "[16]" |  "[17]" |  "[18]" |  "[19]" |  "[20]" |
22 |  "[21]" |  "[22]" |  "[23]" |  "[24]" |  "[25]" |  "[26]" |  "[27]" |  "[28]" |  "[29]" |  "[30]" |
23 |  "[31]" |  "[32]" |  "[33]" |  "[34]" |  "[35]" |  "[36]" |  "[37]" |  "[38]" |  "[39]" |  "[40]" |
24 |  "[41]" |  "[42]" |  "[43]" |  "[44]" |  "[45]" |  "[46]" |  "[47]" |  "[48]" |  "[49]" |  "[50]" |
25 |  "[51]" |  "[52]" |  "[53]" |  "[54]" |  "[55]" |  "[56]" |  "[57]" |  "[58]" |  "[59]" |  "[60]" |
26 |  "[61]" |  "[62]" |  "[63]" |  "[64]" |  "[65]" |  "[66]" |  "[67]" |  "[68]" |  "[69]" |  "[70]" |
27 |  "[71]" |  "[72]" |  "[73]" |  "[74]" |  "[75]" |  "[76]" |  "[77]" |  "[78]" |  "[79]" |  "[80]" |
28 |  "[81]" |  "[82]" |  "[83]" |  "[84]" |  "[85]" |  "[86]" |  "[87]" |  "[88]" |  "[89]" |  "[90]" |
29 |  "[91]" |  "[92]" |  "[93]" |  "[94]" |  "[95]" |  "[96]" |  "[97]" |  "[98]" |  "[99]" | "[100]" |
30 | "[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" |
31 | "[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" |
32 | "[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" |
33 | "[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" |
34 | "[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" |
35 | "[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" |
36 | "[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" |
37 | "[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" |
38 | "[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" |
39 | "[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" | "[200]" |
40 | "[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" |
41 | "[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" |
42 | "[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" |
43 | "[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" |
44 | "[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" |
45 | "[251]" | "[252]" | "[253]" | "[254]" | "[255]"
46 | ];
47 | 
48 | export kDigit = Optimize[
49 |     "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
50 | ];
51 | 
52 | export kLower = Optimize[
53 |     "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" |
54 |     "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
55 | ];
56 | export kUpper = Optimize[
57 |     "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" |
58 |     "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
59 | ];
60 | export kAlpha = Optimize[kLower | kUpper];
61 | 
62 | export kAlnum = Optimize[kDigit | kAlpha];
63 | 
64 | export kSpace = Optimize[
65 |     " " | "\t" | "\n" | "\r"
66 | ];
67 | export kNotSpace = Optimize[kBytes - kSpace];
68 | 
69 | export kPunct = Optimize[
70 |     "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," |
71 |     "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" | "@" | "\[" | "\\" |
72 |     "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~"
73 | ];
74 | 
75 | export kGraph = Optimize[kAlnum | kPunct];
76 | 


--------------------------------------------------------------------------------
/src/util/germanic.tsv:
--------------------------------------------------------------------------------
 1 | 1 10	11
 2 | 2 10	12
 3 | 3 10	13
 4 | 4 10	14
 5 | 5 10	15
 6 | 6 10	16
 7 | 7 10	17
 8 | 8 10	18
 9 | 9 10	19
10 | 1 20	21
11 | 2 20	22
12 | 3 20	23
13 | 4 20	24
14 | 5 20	25
15 | 6 20	26
16 | 7 20	27
17 | 8 20	28
18 | 9 20	29
19 | 1 30	31
20 | 2 30	32
21 | 3 30	33
22 | 4 30	34
23 | 5 30	35
24 | 6 30	36
25 | 7 30	37
26 | 8 30	38
27 | 9 30	39
28 | 1 40	41
29 | 2 40	42
30 | 3 40	43
31 | 4 40	44
32 | 5 40	45
33 | 6 40	46
34 | 7 40	47
35 | 8 40	48
36 | 9 40	49
37 | 1 50	51
38 | 2 50	52
39 | 3 50	53
40 | 4 50	54
41 | 5 50	55
42 | 6 50	56
43 | 7 50	57
44 | 8 50	58
45 | 9 50	59
46 | 1 60	61
47 | 2 60	62
48 | 3 60	63
49 | 4 60	64
50 | 5 60	65
51 | 6 60	66
52 | 7 60	67
53 | 8 60	68
54 | 9 60	69
55 | 1 70	71
56 | 2 70	72
57 | 3 70	73
58 | 4 70	74
59 | 5 70	75
60 | 6 70	76
61 | 7 70	77
62 | 8 70	78
63 | 9 70	79
64 | 1 80	81
65 | 2 80	82
66 | 3 80	83
67 | 4 80	84
68 | 5 80	85
69 | 6 80	86
70 | 7 80	87
71 | 8 80	88
72 | 9 80	89
73 | 1 90	91
74 | 2 90	92
75 | 3 90	93
76 | 4 90	94
77 | 5 90	95
78 | 6 90	96
79 | 7 90	97
80 | 8 90	98
81 | 9 90	99
82 | 


--------------------------------------------------------------------------------
/src/util/util.grm:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc.
  2 | # 
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | # 
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | #
 15 | # Utility functions.
 16 | 
 17 | import 'util/byte.grm' as bytelib;
 18 | import 'util/case.grm' as case;
 19 | 
 20 | # A simplification helper function that encapsulates the left-to-right and
 21 | # obligatory options.
 22 | func CDR[t, l, r, s] {
 23 |   return CDRewrite[t, l, r, s, 'ltr', 'obl'];
 24 | }
 25 | 
 26 | # Useful insertion and deletion functions.
 27 | # I[expr] inserts expr: it maps the empty string to expr.
 28 | func I[expr] {
 29 |   return "" : expr;
 30 | }
 31 | # D[expr] deletes expr: it maps expr to the empty string.
 32 | func D[expr] {
 33 |   return expr : "";
 34 | }
 35 | 
 36 | # A machine that accepts nothing.
 37 | export NULL = Optimize["" - ""];
 38 | 
 39 | export d1to9 = Optimize[bytelib.kDigit - "0"];
 40 | export d02to9 = Optimize[bytelib.kDigit - "1"];
 41 | export d2to9 = Optimize[d02to9 - "0"];
 42 | # Any number that isn't zero. May have leading zeroes.
 43 | export non_zero_number = Optimize["0"* d1to9 bytelib.kDigit*];
 44 | # Any number, allowing for factorization markers.
 45 | export factorized_number = Optimize[(bytelib.kDigit | "\[" | "E" | "\]")*];
 46 | export non_zero_factorized_number = Optimize["0"* d1to9 factorized_number];
 47 | 
 48 | export ins_space = I[" "];
 49 | export ins_sil = I[" sil "];
 50 | export ins_short_sil = I[" sil|short "];
 51 | export ins_quote = I["\""];
 52 | 
 53 | # Caveat: pass_anything does not pass stuff like "[~~]".
 54 | export pass_anything = bytelib.kBytes*;
 55 | export pass_any_word = bytelib.kNotSpace+;
 56 | 
 57 | export pass_space_plus = bytelib.kSpace+;
 58 | export pass_space_star = bytelib.kSpace*;
 59 | 
 60 | export clear_space = bytelib.kSpace : "";
 61 | export clear_space_plus = bytelib.kSpace+ : "";
 62 | export clear_space_star = bytelib.kSpace* : "";
 63 | 
 64 | export space_to_underscore = (bytelib.kAlnum | (" " : "_"))*;
 65 | export one_space = clear_space_star ins_space;
 66 | 
 67 | export CLEAN_SPACES = Optimize[
 68 |   "" | (clear_space_star
 69 |         (pass_any_word (bytelib.kSpace+ : " "))*
 70 |         pass_any_word clear_space_star)]
 71 | ;
 72 | 
 73 | export del_space_star = " "* : "";
 74 | export del_space_plus = " "+ : "";
 75 | 
 76 | export sigma_star = Optimize[pass_anything];
 77 | 
 78 | export DELETE_SPACES =
 79 |   CDRewrite[clear_space_plus, "", "", sigma_star];
 80 | 
 81 | export REMOVE_LEADING_SPACES =
 82 |   CDRewrite[clear_space_plus, "[BOS]", "", sigma_star];
 83 | 
 84 | export REMOVE_FINAL_SPACES =
 85 |   CDRewrite[clear_space_plus, "", "[EOS]", sigma_star];
 86 | 
 87 | export REMOVE_BOUNDARY_SPACES = REMOVE_LEADING_SPACES @ REMOVE_FINAL_SPACES;
 88 | 
 89 | export delete_initial_zero =
 90 |   CDRewrite["0" : "", "[BOS]", bytelib.kDigit, sigma_star];
 91 | 
 92 | export lower_case_letter = Optimize[case.tolower | case.LOWER | bytelib.kLower];
 93 | export lower_case = Optimize[lower_case_letter+];
 94 | export lower_case_anything = case.TOLOWER;
 95 | 
 96 | export upper_case_letter = Optimize[case.toupper | case.UPPER | bytelib.kUpper];
 97 | export upper_case = Optimize[upper_case_letter+];
 98 | export upper_case_anything = case.TOUPPER;
 99 | 
100 | export opening_brace = del_space_star ("{" : "") del_space_star;
101 | export closing_brace = del_space_star ("}" : "") del_space_star;
102 | 
103 | export quote = del_space_star ("\"" : "") del_space_star;
104 | export double_quote = del_space_star ("\"\"" : "") del_space_star;
105 | 
106 | export VOWELS = Optimize["a" | "e" | "i" | "o" | "u"];
107 | export VOWELS_Y = Optimize[VOWELS | "y"];
108 | export VOWELS_INSENSITIVE = Optimize[VOWELS_Y | "A" | "E" | "I"
109 |                                               | "O" | "U" | "Y"];
110 | export CONSONANTS = Optimize[bytelib.kLower - VOWELS];
111 | export CONSONANTS_INSENSITIVE = Optimize[bytelib.kAlpha - VOWELS_INSENSITIVE];
112 | 
113 | # LSEQs that can be used for URL verbalization for all languages;
114 | # mainly protocol names & file extensions.
115 | export URL_LSEQS = Optimize["www" | "edu" | "ftp" | "htm" | "html" | "imdb" |
116 |                             "php" | "asp" | "aspx" | "bbc" | "cgi" | "xhtml" |
117 |                             "shtml" | "jsp"];
118 | 
119 | # Rule for swapping cardinal to decimal; useful for measures where
120 | # both can appear in the proto but may be handled similarly.
121 | export CARDINAL_TO_DECIMAL = Optimize[
122 |   CDRewrite["cardinal" : "decimal", "", "", sigma_star] @
123 |   CDRewrite["integer:" : "integer_part:", "", "", sigma_star]
124 | ];
125 | 
126 | export escape_quotes_and_backslashes =
127 |   ((bytelib.kBytes - "\"" - "\\") | ("\"" : "\\\"") | ("\\" : "\\\\"))*
128 | ;
129 | 
130 | ## Generally useful definition:
131 | 
132 | export hours =
133 |     "0"
134 |   | "1"
135 |   | "2"
136 |   | "3"
137 |   | "4"
138 |   | "5"
139 |   | "6"
140 |   | "7"
141 |   | "8"
142 |   | "9"
143 |   | "10"
144 |   | "11"
145 |   | "12"
146 |   | "13"
147 |   | "14"
148 |   | "15"
149 |   | "16"
150 |   | "17"
151 |   | "18"
152 |   | "19"
153 |   | "20"
154 |   | "21"
155 |   | "22"
156 |   | "23"
157 |   | "24"
158 | ;
159 | 
160 | export hours_shift =
161 |     ("0" : "1")
162 |   | ("1" : "2")
163 |   | ("2" : "3")
164 |   | ("3" : "4")
165 |   | ("4" : "5")
166 |   | ("5" : "6")
167 |   | ("6" : "7")
168 |   | ("7" : "8")
169 |   | ("8" : "9")
170 |   | ("9" : "10")
171 |   | ("10" : "11")
172 |   | ("11" : "12")
173 |   | ("12" : "13")
174 |   | ("13" : "14")
175 |   | ("14" : "15")
176 |   | ("15" : "16")
177 |   | ("16" : "17")
178 |   | ("17" : "18")
179 |   | ("18" : "19")
180 |   | ("19" : "20")
181 |   | ("20" : "21")
182 |   | ("21" : "22")
183 |   | ("22" : "23")
184 |   | ("23" : "24")
185 |   | ("24" : "1")
186 | ;
187 | 
188 | export hours_24_to_12 =
189 |     ("0" : "12")
190 |   | "1"
191 |   | "2"
192 |   | "3"
193 |   | "4"
194 |   | "5"
195 |   | "6"
196 |   | "7"
197 |   | "8"
198 |   | "9"
199 |   | "10"
200 |   | "11"
201 |   | "12"
202 |   | ("13" : "1")
203 |   | ("14" : "2")
204 |   | ("15" : "3")
205 |   | ("16" : "4")
206 |   | ("17" : "5")
207 |   | ("18" : "6")
208 |   | ("19" : "7")
209 |   | ("20" : "8")
210 |   | ("21" : "9")
211 |   | ("22" : "10")
212 |   | ("23" : "11")
213 |   | ("24" : "12")
214 | ;
215 | 
216 | export hours_24_to_12_next =
217 |     ("0" : "1")
218 |   | ("1" : "2")
219 |   | ("2" : "3")
220 |   | ("3" : "4")
221 |   | ("4" : "5")
222 |   | ("5" : "6")
223 |   | ("6" : "7")
224 |   | ("7" : "8")
225 |   | ("8" : "9")
226 |   | ("9" : "10")
227 |   | ("10" : "11")
228 |   | ("11" : "12")
229 |   | ("12" : "1")
230 |   | ("13" : "2")
231 |   | ("14" : "3")
232 |   | ("15" : "4")
233 |   | ("16" : "5")
234 |   | ("17" : "6")
235 |   | ("18" : "7")
236 |   | ("19" : "8")
237 |   | ("20" : "9")
238 |   | ("21" : "10")
239 |   | ("22" : "11")
240 |   | ("23" : "12")
241 |   | ("24" : "1")
242 | ;
243 | 
244 | export minutes =
245 |     "0"
246 |   | "1"
247 |   | "2"
248 |   | "3"
249 |   | "4"
250 |   | "5"
251 |   | "6"
252 |   | "7"
253 |   | "8"
254 |   | "9"
255 |   | "10"
256 |   | "11"
257 |   | "12"
258 |   | "13"
259 |   | "14"
260 |   | "15"
261 |   | "16"
262 |   | "17"
263 |   | "18"
264 |   | "19"
265 |   | "20"
266 |   | "21"
267 |   | "22"
268 |   | "23"
269 |   | "24"
270 |   | "25"
271 |   | "26"
272 |   | "27"
273 |   | "28"
274 |   | "29"
275 |   | "30"
276 |   | "31"
277 |   | "32"
278 |   | "33"
279 |   | "34"
280 |   | "35"
281 |   | "36"
282 |   | "37"
283 |   | "38"
284 |   | "39"
285 |   | "40"
286 |   | "41"
287 |   | "42"
288 |   | "43"
289 |   | "44"
290 |   | "45"
291 |   | "46"
292 |   | "47"
293 |   | "48"
294 |   | "49"
295 |   | "50"
296 |   | "51"
297 |   | "52"
298 |   | "53"
299 |   | "54"
300 |   | "55"
301 |   | "56"
302 |   | "57"
303 |   | "58"
304 |   | "59"
305 | ;
306 | 
307 | export round_minutes =
308 |     ("1" : "0")
309 |   | ("2" : "0")
310 |   | ("3" : "5")
311 |   | ("4" : "5")
312 |   | ("6" : "5")
313 |   | ("7" : "5")
314 |   | ("8" : "10")
315 |   | ("9" : "10")
316 |   | ("11" : "10")
317 |   | ("12" : "10")
318 |   | ("13" : "15")
319 |   | ("14" : "15")
320 |   | ("16" : "15")
321 |   | ("17" : "15")
322 |   | ("18" : "20")
323 |   | ("19" : "20")
324 |   | ("21" : "20")
325 |   | ("22" : "20")
326 |   | ("23" : "25")
327 |   | ("24" : "25")
328 |   | ("26" : "25")
329 |   | ("27" : "25")
330 |   | ("28" : "30")
331 |   | ("29" : "30")
332 |   | ("31" : "30")
333 |   | ("32" : "30")
334 |   | ("33" : "35")
335 |   | ("34" : "35")
336 |   | ("36" : "35")
337 |   | ("37" : "35")
338 |   | ("38" : "40")
339 |   | ("39" : "40")
340 |   | ("41" : "40")
341 |   | ("42" : "40")
342 |   | ("43" : "45")
343 |   | ("44" : "45")
344 |   | ("46" : "45")
345 |   | ("47" : "45")
346 |   | ("48" : "50")
347 |   | ("49" : "50")
348 |   | ("51" : "50")
349 |   | ("52" : "50")
350 |   | ("53" : "55")
351 |   | ("54" : "55")
352 |   | ("56" : "55")
353 |   | ("57" : "55")
354 | ;
355 | 
356 | export unrounded_minutes =
357 |     ("0" : "0")
358 |   | ("5" : "5")
359 |   | ("10" : "10")
360 |   | ("15" : "15")
361 |   | ("20" : "20")
362 |   | ("25" : "25")
363 |   | ("30" : "30")
364 |   | ("35" : "35")
365 |   | ("40" : "40")
366 |   | ("45" : "45")
367 |   | ("50" : "50")
368 |   | ("55" : "55")
369 | ;
370 | 
371 | export round_minutes_next_hour =
372 |     ("58" : "0")
373 |   | ("59" : "0")
374 | ;
375 | 
376 | export subtract_from_60 =
377 |     "30"
378 |   | ("31" : "29" )
379 |   | ("32" : "28" )
380 |   | ("33" : "27" )
381 |   | ("34" : "26" )
382 |   | ("35" : "25" )
383 |   | ("36" : "24" )
384 |   | ("37" : "23" )
385 |   | ("38" : "22" )
386 |   | ("39" : "21" )
387 |   | ("40" : "20" )
388 |   | ("41" : "19" )
389 |   | ("42" : "18" )
390 |   | ("43" : "17" )
391 |   | ("44" : "16" )
392 |   | ("45" : "15" )
393 |   | ("46" : "14" )
394 |   | ("47" : "13" )
395 |   | ("48" : "12" )
396 |   | ("49" : "11" )
397 |   | ("50" : "10" )
398 |   | ("51" : "9" )
399 |   | ("52" : "8" )
400 |   | ("53" : "7" )
401 |   | ("54" : "6" )
402 |   | ("55" : "5" )
403 |   | ("56" : "4" )
404 |   | ("57" : "3" )
405 |   | ("58" : "2" )
406 |   | ("59" : "1" )
407 | ;
408 | 
409 | export any_month =
410 |   (("0" : "")?
411 |   (
412 |     "1"
413 |   | "2"
414 |   | "3"
415 |   | "4"
416 |   | "5"
417 |   | "6"
418 |   | "7"
419 |   | "8"
420 |   | "9"
421 |   ))
422 |   | "10"
423 |   | "11"
424 |   | "12"
425 | ;
426 | 
427 | export any_day =
428 |   (("0" : "")?
429 |   (
430 |     "1"
431 |   | "2"
432 |   | "3"
433 |   | "4"
434 |   | "5"
435 |   | "6"
436 |   | "7"
437 |   | "8"
438 |   | "9"
439 |   ))
440 |   | "10"
441 |   | "11"
442 |   | "12"
443 |   | "13"
444 |   | "14"
445 |   | "15"
446 |   | "16"
447 |   | "17"
448 |   | "18"
449 |   | "19"
450 |   | "20"
451 |   | "21"
452 |   | "22"
453 |   | "23"
454 |   | "24"
455 |   | "25"
456 |   | "26"
457 |   | "27"
458 |   | "28"
459 |   | "29"
460 |   | "30"
461 |   | "31"
462 | ;
463 | 
464 | ## TODO: These rules need to be coordinated with the markup since that may
465 | ## change.
466 | 
467 | export approximately = "[~~]";
468 | 
469 | ## Rounded: say "approximately".
470 | 
471 | approx1 = Optimize[
472 |   "minutes:"
473 |   ("" : approximately) (minutes @ round_minutes)
474 |   "|"
475 |   "hours:"
476 |   hours
477 |   "|"
478 |   pass_anything]
479 | ;
480 | 
481 | ## Rounded to next hour.
482 | 
483 | approx2 = Optimize[
484 |   "minutes:"
485 |   ("" : approximately) round_minutes_next_hour
486 |   "|"
487 |   "hours:"
488 |   hours_shift
489 |   "|"
490 |   pass_anything]
491 | ;
492 | 
493 | ## Not rounded: don't say "approximately".
494 | 
495 | approx3 = Optimize[
496 |   "minutes:"
497 |   (minutes @ unrounded_minutes)
498 |   "|"
499 |   "hours:"
500 |   hours
501 |   "|"
502 |   pass_anything]
503 | ;
504 | 
505 | export approx = Optimize[
506 |   approx1 | approx2 | approx3
507 | ];
508 | 
509 | # "|" and "\" are escaped in the new serialization scheme using a backslash, so
510 | # we need to adjust these in the verbatim mappings.
511 | 
512 | func EscapedMappings[raw_mappings] {
513 |   escapes = ("\\\\" : "\\") | ("\\|" : "|");
514 |   return Optimize[
515 |     ((Project[raw_mappings, 'input'] - Project[escapes, 'output']) | escapes)
516 |     @ raw_mappings
517 |   ];
518 | }
519 | 
520 | # Allows verbatim grammars to be more permissive by accepting all inputs: any
521 | # byte that is not an input of the raw mappings is deleted, at weight 20.
522 | 
523 | func ConsumeUnmapped[raw_mappings] {
524 |   unmapped = bytelib.kBytes - Project[raw_mappings, 'input'];
525 |   return Optimize[
526 |     D[unmapped]<20>
527 |   ];
528 | }
529 | 


--------------------------------------------------------------------------------