├── CONTRIBUTING ├── LICENSE ├── README.md ├── papers ├── gorman-sproat-2016.pdf └── wu-etal-2016.pdf └── src ├── Makefile ├── en ├── README.md └── verbalizer │ ├── cardinals.tsv │ ├── extra_numbers.grm │ ├── factorization.grm │ ├── float.grm │ ├── g.fst │ ├── lexical_map.grm │ ├── lexical_map.tsv │ ├── math.grm │ ├── miscellaneous.grm │ ├── money.grm │ ├── money.tsv │ ├── number_names.grm │ ├── numbers.grm │ ├── numbers_plus.grm │ ├── ordinals.tsv │ ├── params.tsv │ ├── spelled.grm │ ├── spoken_punct.grm │ ├── time.grm │ ├── urls.grm │ └── verbalizer.grm ├── number_data ├── README.md ├── minimal.txt ├── random-trn.txt └── random-tst.txt ├── ru ├── README.md ├── classifier │ └── cyrillic.grm └── verbalizer │ ├── cardinals-lex.grm │ ├── cardinals.tsv │ ├── extra_numbers.grm │ ├── factorization.grm │ ├── float.grm │ ├── g.fst │ ├── lexical_map.grm │ ├── lexical_map.tsv │ ├── math.grm │ ├── miscellaneous.grm │ ├── money.grm │ ├── money.tsv │ ├── nominatives.tsv │ ├── number_names.grm │ ├── numbers.grm │ ├── numbers_plus.grm │ ├── ordinal_endings.tsv │ ├── ordinals-lex.grm │ ├── ordinals.tsv │ ├── spelled.grm │ ├── spoken_punct.grm │ ├── time.grm │ ├── urls.grm │ └── verbalizer.grm ├── universal ├── README.md ├── roman_numerals.tsv └── thousands_punct.grm └── util ├── README.md ├── arithmetic.grm ├── byte.grm ├── case.grm ├── germanic.tsv └── util.grm /CONTRIBUTING: -------------------------------------------------------------------------------- 1 | Want to contribute? Great! First, read this page (including the small print at 2 | the end). 3 | 4 | ### Before you contribute 5 | 6 | Before we can use your code, you must sign the 7 | [Google Individual Contributor License Agreement] 8 | (https://cla.developers.google.com/about/google-individual) 9 | (CLA), which you can do online. 
The CLA is necessary mainly because you own the 10 | copyright to your changes, even after your contribution becomes part of our 11 | codebase, so we need your permission to use and distribute your code. We also 12 | need to be sure of various other things—for instance that you'll tell us if you 13 | know that your code infringes on other people's patents. You don't have to sign 14 | the CLA until after you've submitted your code for review and a member has 15 | approved it, but you must do it before we can put your code into our codebase. 16 | Before you start working on a larger contribution, you should get in touch with 17 | us first through the issue tracker with your idea so that we can help out and 18 | possibly guide you. Coordinating up front makes it much easier to avoid 19 | frustration later on. 20 | 21 | ### Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use Github pull requests for this purpose. 25 | 26 | ### The small print 27 | 28 | Contributions made by corporations are covered by a different agreement than 29 | the one above, the [Software Grant and Corporate Contributor License Agreement] 30 | (https://cla.developers.google.com/about/google-corporate). 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text normalization covering grammars 2 | 3 | This repository provides covering grammars for English and Russian text normalization as 4 | documented in: 5 | 6 | Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization. 7 | _Transactions of the Association for Computational Linguistics_ 4: 507-519. 8 | 9 | Ng, A. H., Gorman, K., and Sproat, R. 2017. Minimally supervised 10 | written-to-spoken text normalization. In _ASRU_, pages 665-670. 11 | 12 | If you use these grammars in a publication, we would appreciate it if you cite these works. 13 | 14 | ## Building 15 | 16 | The grammars are written in [Thrax](http://thrax.opengrm.org) and compile into [OpenFst](http://openfst.org) FAR (FstARchive) files. To compile, simply run `make` in the `src/` directory. 17 | 18 | ## License 19 | 20 | See `LICENSE`. 21 | 22 | ## Mandatory disclaimer 23 | 24 | This is not an official Google product. 
25 | -------------------------------------------------------------------------------- /papers/gorman-sproat-2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/papers/gorman-sproat-2016.pdf -------------------------------------------------------------------------------- /papers/wu-etal-2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/papers/wu-etal-2016.pdf -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | all: en/verbalizer/verbalizer.far ru/verbalizer/verbalizer.far 2 | # `all` and `clean` name actions, not files; .PHONY keeps a stray file of either name from masking them. 3 | .PHONY: all clean 4 | # Language-universal definitions. 5 | 6 | universal/thousands_punct.far: universal/thousands_punct.grm util/byte.far util/util.far 7 | thraxcompiler --input_grammar=$< --output_far=$@ 8 | 9 | util/util.far: util/util.grm util/byte.far util/case.far 10 | thraxcompiler --input_grammar=$< --output_far=$@ 11 | 12 | util/byte.far: util/byte.grm 13 | thraxcompiler --input_grammar=$< --output_far=$@ 14 | 15 | util/case.far: util/case.grm util/byte.far 16 | thraxcompiler --input_grammar=$< --output_far=$@ 17 | 18 | util/arithmetic.far: util/arithmetic.grm util/germanic.tsv util/byte.far 19 | thraxcompiler --input_grammar=$< --output_far=$@ 20 | 21 | # English verbalizer. 
22 | # In every rule the .grm comes first so the recipe can pass it as $<; the other prerequisites are not passed on the command line. 23 | en/verbalizer/verbalizer.far: en/verbalizer/verbalizer.grm util/util.far en/verbalizer/extra_numbers.far en/verbalizer/float.far en/verbalizer/math.far en/verbalizer/miscellaneous.far en/verbalizer/money.far en/verbalizer/numbers.far en/verbalizer/numbers_plus.far en/verbalizer/spelled.far en/verbalizer/spoken_punct.far en/verbalizer/time.far en/verbalizer/urls.far 24 | thraxcompiler --input_grammar=$< --output_far=$@ 25 | 26 | en/verbalizer/number_names.far: en/verbalizer/number_names.grm en/verbalizer/cardinals.tsv en/verbalizer/g.fst en/verbalizer/ordinals.tsv util/arithmetic.far 27 | thraxcompiler --input_grammar=$< --output_far=$@ 28 | 29 | en/verbalizer/extra_numbers.far: en/verbalizer/extra_numbers.grm util/byte.far en/verbalizer/numbers.far 30 | thraxcompiler --input_grammar=$< --output_far=$@ 31 | 32 | en/verbalizer/numbers.far: en/verbalizer/numbers.grm en/verbalizer/number_names.far util/byte.far universal/thousands_punct.far 33 | thraxcompiler --input_grammar=$< --output_far=$@ 34 | 35 | en/verbalizer/float.far: en/verbalizer/float.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far 36 | thraxcompiler --input_grammar=$< --output_far=$@ 37 | 38 | en/verbalizer/factorization.far: en/verbalizer/factorization.grm util/byte.far util/util.far en/verbalizer/numbers.far 39 | thraxcompiler --input_grammar=$< --output_far=$@ 40 | 41 | en/verbalizer/lexical_map.far: en/verbalizer/lexical_map.grm util/byte.far en/verbalizer/lexical_map.tsv 42 | thraxcompiler --input_grammar=$< --output_far=$@ 43 | 44 | en/verbalizer/math.far: en/verbalizer/math.grm en/verbalizer/float.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far 45 | thraxcompiler --input_grammar=$< --output_far=$@ 46 | # NOTE(review): this rule and spelled.far below depend on ru/classifier/cyrillic.far; presumably the English grammars import the Cyrillic classifier - confirm against the .grm imports. 47 | en/verbalizer/miscellaneous.far: en/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/extra_numbers.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/spelled.far 48 | 
thraxcompiler --input_grammar=$< --output_far=$@ 49 | 50 | en/verbalizer/spelled.far: en/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far 51 | thraxcompiler --input_grammar=$< --output_far=$@ 52 | 53 | en/verbalizer/money.far: en/verbalizer/money.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far en/verbalizer/money.tsv 54 | thraxcompiler --input_grammar=$< --output_far=$@ 55 | 56 | en/verbalizer/numbers_plus.far: en/verbalizer/numbers_plus.grm en/verbalizer/factorization.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far 57 | thraxcompiler --input_grammar=$< --output_far=$@ 58 | 59 | en/verbalizer/spoken_punct.far: en/verbalizer/spoken_punct.grm en/verbalizer/lexical_map.far 60 | thraxcompiler --input_grammar=$< --output_far=$@ 61 | 62 | en/verbalizer/time.far: en/verbalizer/time.grm util/byte.far en/verbalizer/lexical_map.far en/verbalizer/numbers.far 63 | thraxcompiler --input_grammar=$< --output_far=$@ 64 | 65 | en/verbalizer/urls.far: en/verbalizer/urls.grm util/byte.far en/verbalizer/lexical_map.far 66 | thraxcompiler --input_grammar=$< --output_far=$@ 67 | 68 | 69 | # Russian verbalizer. 
70 | 71 | ru/verbalizer/verbalizer.far: ru/verbalizer/verbalizer.grm util/util.far ru/verbalizer/extra_numbers.far ru/verbalizer/float.far ru/verbalizer/math.far ru/verbalizer/miscellaneous.far ru/verbalizer/money.far ru/verbalizer/numbers.far ru/verbalizer/numbers_plus.far ru/verbalizer/spelled.far ru/verbalizer/spoken_punct.far ru/verbalizer/time.far ru/verbalizer/urls.far 72 | thraxcompiler --input_grammar=$< --output_far=$@ 73 | 74 | ru/verbalizer/number_names.far: ru/verbalizer/number_names.grm ru/verbalizer/cardinals.tsv ru/verbalizer/g.fst ru/verbalizer/ordinals.tsv util/arithmetic.far 75 | thraxcompiler --input_grammar=$< --output_far=$@ 76 | 77 | ru/verbalizer/extra_numbers.far: ru/verbalizer/extra_numbers.grm util/byte.far ru/verbalizer/numbers.far 78 | thraxcompiler --input_grammar=$< --output_far=$@ 79 | 80 | ru/verbalizer/numbers.far: ru/verbalizer/numbers.grm util/byte.far universal/thousands_punct.far ru/verbalizer/nominatives.tsv ru/verbalizer/number_names.far 81 | thraxcompiler --input_grammar=$< --output_far=$@ 82 | # NOTE(review): no rule in this Makefile lists cardinals-lex.far or ordinals-lex.far as a prerequisite, so `make all` never builds them; confirm whether a grammar imports them. 83 | ru/verbalizer/cardinals-lex.far: ru/verbalizer/cardinals-lex.grm util/byte.far 84 | thraxcompiler --input_grammar=$< --output_far=$@ 85 | 86 | ru/verbalizer/ordinals-lex.far: ru/verbalizer/ordinals-lex.grm util/byte.far 87 | thraxcompiler --input_grammar=$< --output_far=$@ 88 | 89 | ru/verbalizer/float.far: ru/verbalizer/float.grm ru/verbalizer/factorization.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far 90 | thraxcompiler --input_grammar=$< --output_far=$@ 91 | 92 | ru/verbalizer/factorization.far: ru/verbalizer/factorization.grm util/byte.far util/util.far ru/verbalizer/numbers.far 93 | thraxcompiler --input_grammar=$< --output_far=$@ 94 | # lexical_map.tsv is a prerequisite so that editing the table rebuilds the FAR, matching the English rule (presumably the grammar loads it via StringFile - confirm). 95 | ru/verbalizer/lexical_map.far: ru/verbalizer/lexical_map.grm util/byte.far ru/verbalizer/lexical_map.tsv 96 | thraxcompiler --input_grammar=$< --output_far=$@ 97 | 98 | ru/verbalizer/math.far: ru/verbalizer/math.grm ru/verbalizer/float.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far 99 | 
thraxcompiler --input_grammar=$< --output_far=$@ 100 | 101 | ru/verbalizer/miscellaneous.far: ru/verbalizer/miscellaneous.grm util/byte.far ru/classifier/cyrillic.far ru/verbalizer/extra_numbers.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far ru/verbalizer/spelled.far 102 | thraxcompiler --input_grammar=$< --output_far=$@ 103 | 104 | ru/classifier/cyrillic.far: ru/classifier/cyrillic.grm 105 | thraxcompiler --input_grammar=$< --output_far=$@ 106 | 107 | ru/verbalizer/spelled.far: ru/verbalizer/spelled.grm util/byte.far ru/classifier/cyrillic.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far 108 | thraxcompiler --input_grammar=$< --output_far=$@ 109 | 110 | ru/verbalizer/money.far: ru/verbalizer/money.grm util/byte.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far ru/verbalizer/money.tsv 111 | thraxcompiler --input_grammar=$< --output_far=$@ 112 | 113 | ru/verbalizer/numbers_plus.far: ru/verbalizer/numbers_plus.grm ru/verbalizer/factorization.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far 114 | thraxcompiler --input_grammar=$< --output_far=$@ 115 | 116 | ru/verbalizer/spoken_punct.far: ru/verbalizer/spoken_punct.grm ru/verbalizer/lexical_map.far 117 | thraxcompiler --input_grammar=$< --output_far=$@ 118 | 119 | ru/verbalizer/time.far: ru/verbalizer/time.grm util/byte.far ru/verbalizer/lexical_map.far ru/verbalizer/numbers.far 120 | thraxcompiler --input_grammar=$< --output_far=$@ 121 | 122 | ru/verbalizer/urls.far: ru/verbalizer/urls.grm util/byte.far ru/verbalizer/lexical_map.far 123 | thraxcompiler --input_grammar=$< --output_far=$@ 124 | 125 | 126 | # Cleanup. 127 | 128 | clean: 129 | $(RM) */*.far */*/*.far 130 | -------------------------------------------------------------------------------- /src/en/README.md: -------------------------------------------------------------------------------- 1 | # English covering grammar definitions 2 | 3 | This directory defines an English text normalization covering grammar. 
The 4 | primary entry-point is the FST `VERBALIZER`, defined in 5 | `verbalizer/verbalizer.grm` and compiled in the FST archive 6 | `verbalizer/verbalizer.far`. 7 | -------------------------------------------------------------------------------- /src/en/verbalizer/cardinals.tsv: -------------------------------------------------------------------------------- 1 | 0 zero 2 | 1 one 3 | 2 two 4 | 3 three 5 | 4 four 6 | 5 five 7 | 6 six 8 | 7 seven 9 | 8 eight 10 | 9 nine 11 | 10 ten 12 | 11 eleven 13 | 12 twelve 14 | 13 thirteen 15 | 14 fourteen 16 | 15 fifteen 17 | 16 sixteen 18 | 17 seventeen 19 | 18 eighteen 20 | 19 nineteen 21 | 20 twenty 22 | 30 thirty 23 | 40 forty 24 | 50 fifty 25 | 60 sixty 26 | 70 seventy 27 | 80 eighty 28 | 90 ninety 29 | 100 hundred 30 | 1000 thousand 31 | 1000000 million 32 | 1000000000 billion 33 | -------------------------------------------------------------------------------- /src/en/verbalizer/extra_numbers.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import 'util/byte.grm' as b; 16 | import 'en/verbalizer/numbers.grm' as n; 17 | # A single digit composed with n.CARDINAL_NUMBERS, or "0" rewritten to the @@OTHER_ZERO_VERBALIZATIONS@@ tag. 18 | digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@"); 19 | # Digit-by-digit reading: one or more `digit`s joined by n.I[" "] (defined in numbers.grm, not shown; presumably inserts a space). 20 | export DIGITS = digit (n.I[" "] digit)*; 21 | 22 | # Various common factorizations 23 | # Two- and three-digit strings, each composed with n.CARDINAL_NUMBERS as one unit. 24 | two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS; 25 | 26 | three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS; 27 | # Chunked readings with digit groupings 1+2, 2+2, 2+3 and 2+2+2. 28 | mixed = 29 | (digit n.I[" "] two_digits) 30 | | (two_digits n.I[" "] two_digits) 31 | | (two_digits n.I[" "] three_digits) 32 | | (two_digits n.I[" "] two_digits n.I[" "] two_digits) 33 | ; 34 | 35 | export MIXED_NUMBERS = Optimize[mixed]; 36 | -------------------------------------------------------------------------------- /src/en/verbalizer/factorization.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import 'util/byte.grm' as b; 16 | import 'util/util.grm' as u; 17 | import 'en/verbalizer/numbers.grm' as n; 18 | # Composes expr with a sequence of one or more n.CARDINAL_NUMBERS separated by single spaces, then optimizes the result. 19 | func ToNumberName[expr] { 20 | number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*; 21 | return Optimize[expr @ number_name_seq]; 22 | } 23 | 24 | d = b.kDigit; 25 | # Applies n.I[" "] directly after a "0" that is at the start of the string or follows a space. 26 | leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*]; 27 | # Chunks of one, two or three digits, each followed by n.I[" "]; the longer chunks also go through leading_zero. 28 | by_ones = d n.I[" "]; 29 | by_twos = (d{2} @ leading_zero) n.I[" "]; 30 | by_threes = (d{3} @ leading_zero) n.I[" "]; 31 | # Zero or more two-digit chunks, then one final chunk of three, two or one digits. 32 | groupings = by_twos* (by_threes | by_twos | by_ones); 33 | # UNGROUPED reads digit by digit, GROUPED uses `groupings`, UNPARSED composes the raw digit string; ToNumberName already optimizes its result. 34 | export FRACTIONAL_PART_UNGROUPED = 35 | Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]] 36 | ; 37 | export FRACTIONAL_PART_GROUPED = 38 | Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]] 39 | ; 40 | export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]]; 41 | -------------------------------------------------------------------------------- /src/en/verbalizer/float.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import 'en/verbalizer/factorization.grm' as f; 16 | import 'en/verbalizer/lexical_map.grm' as l; 17 | import 'en/verbalizer/numbers.grm' as n; 18 | # Local aliases for the three fractional-part readings exported by factorization.grm. 19 | fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED; 20 | fractional_part_grouped = f.FRACTIONAL_PART_GROUPED; 21 | fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED; 22 | # Only the ungrouped and unparsed readings are admitted; fractional_part_grouped is defined above but not used in this file. 23 | __fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed; 24 | __decimal_marker__ = "."; 25 | # Integer part, then "." rewritten to the @@DECIMAL_DOT_EXPRESSION@@ tag, then the fractional part; l.LEXICAL_MAP is composed last. 26 | export FLOAT = Optimize[ 27 | (n.CARDINAL_NUMBERS 28 | (__decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ") 29 | __fractional_part__) @ l.LEXICAL_MAP] 30 | ; 31 | -------------------------------------------------------------------------------- /src/en/verbalizer/g.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/src/en/verbalizer/g.fst -------------------------------------------------------------------------------- /src/en/verbalizer/lexical_map.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import 'util/byte.grm' as b; 16 | # Tag-to-word pairs loaded from the TSV; a tag that appears on several rows gets several alternative outputs. 17 | lexical_map = StringFile['en/verbalizer/lexical_map.tsv']; 18 | 19 | sigma_star = b.kBytes*; 20 | # Deletes the literal "__NULL__" wherever it occurs. 21 | del_null = CDRewrite["__NULL__" : "", "", "", sigma_star]; 22 | # Rewrites tags anywhere in the string (empty left and right contexts), then strips "__NULL__". 23 | export LEXICAL_MAP = Optimize[ 24 | CDRewrite[lexical_map, "", "", sigma_star] @ del_null] 25 | ; 26 | -------------------------------------------------------------------------------- /src/en/verbalizer/lexical_map.tsv: -------------------------------------------------------------------------------- 1 | @@CONNECTOR_RANGE@@ to 2 | @@CONNECTOR_RATIO@@ to 3 | @@CONNECTOR_BY@@ by 4 | @@CONNECTOR_CONSECUTIVE_YEAR@@ to 5 | @@JANUARY@@ january 6 | @@FEBRUARY@@ february 7 | @@MARCH@@ march 8 | @@APRIL@@ april 9 | @@MAY@@ may 10 | @@JUNE@@ june 11 | @@JULY@@ july 12 | @@AUGUST@@ august 13 | @@SEPTEMBER@@ september 14 | @@OCTOBER@@ october 15 | @@NOVEMBER@@ november 16 | @@DECEMBER@@ december 17 | @@MINUS@@ minus 18 | @@DECIMAL_DOT_EXPRESSION@@ point 19 | @@URL_DOT_EXPRESSION@@ dot 20 | @@DECIMAL_EXPONENT@@ to the 21 | @@DECIMAL_EXPONENT@@ to the power of 22 | @@COLON@@ colon 23 | @@SLASH@@ slash 24 | @@SLASH@@ forward slash 25 | @@DASH@@ dash 26 | @@PASSWORD@@ password 27 | @@AT@@ at 28 | @@PORT@@ port 29 | @@QUESTION_MARK@@ question mark 30 | @@HASH@@ hash 31 | @@HASH@@ hash tag 32 | @@FRACTION_OVER@@ over 33 | @@MONEY_AND@@ and 34 | @@AND@@ and 35 | @@PHONE_PLUS@@ plus 36 | @@PHONE_EXTENSION@@ extension 37 | @@TIME_AM@@ a m 38 | @@TIME_PM@@ p m 39 | @@HOUR@@ o'clock 40 | @@MINUTE@@ minute 41 | @@MINUTE@@ minutes 42 | @@TIME_AFTER@@ after 43 | @@TIME_AFTER@@ past 44 | @@TIME_BEFORE@@ to 45 | @@TIME_BEFORE@@ till 46 | @@TIME_QUARTER@@ quarter 47 | @@TIME_HALF@@ half 48 | @@TIME_ZERO@@ oh 49 | @@TIME_THREE_QUARTER@@ three quarters 50 | @@ARITHMETIC_PLUS@@ plus 51 | @@ARITHMETIC_TIMES@@ times 52 | @@ARITHMETIC_TIMES@@ multiplied by 53 | @@ARITHMETIC_MINUS@@ minus 54 | @@ARITHMETIC_DIVISION@@ divided by 55 | @@ARITHMETIC_DIVISION@@ over 56 | 
@@ARITHMETIC_EQUALS@@ equals 57 | @@PERCENT@@ percent 58 | @@DEGREE@@ degree 59 | @@DEGREE@@ degrees 60 | @@SQUARE_ROOT@@ square root of 61 | @@SQUARE_ROOT@@ the square root of 62 | @@STAR@@ star 63 | @@HYPHEN@@ hyphen 64 | @@AT@@ at 65 | @@PER@@ per 66 | @@PERIOD@@ period 67 | @@PERIOD@@ full stop 68 | @@PERIOD@@ dot 69 | @@EXCLAMATION_MARK@@ exclamation mark 70 | @@EXCLAMATION_MARK@@ exclamation point 71 | @@COMMA@@ comma 72 | @@POSITIVE@@ positive 73 | @@NEGATIVE@@ negative 74 | @@OTHER_ZERO_VERBALIZATIONS@@ oh 75 | -------------------------------------------------------------------------------- /src/en/verbalizer/math.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # Verbalizes simple arithmetic: "number operator number" and "number%". 15 | import 'en/verbalizer/float.grm' as f; 16 | import 'en/verbalizer/lexical_map.grm' as l; 17 | import 'en/verbalizer/numbers.grm' as n; 18 | # An operand is either a cardinal or a float reading. 19 | float = f.FLOAT; 20 | card = n.CARDINAL_NUMBERS; 21 | number = card | float; 22 | # Each operator symbol becomes its tag with a space on both sides. 23 | plus = "+" : " @@ARITHMETIC_PLUS@@ "; 24 | times = "*" : " @@ARITHMETIC_TIMES@@ "; 25 | minus = "-" : " @@ARITHMETIC_MINUS@@ "; 26 | division = "/" : " @@ARITHMETIC_DIVISION@@ "; 27 | 28 | operator = plus | times | minus | division; 29 | # The percent tag gets a leading space only, since it ends the expression. 30 | percent = "%" : " @@PERCENT@@"; 31 | # Exactly one binary operator between two operands, or one operand followed by "%"; tags are then verbalized by l.LEXICAL_MAP. 32 | export ARITHMETIC = 33 | Optimize[((number operator number) | (number percent)) @ l.LEXICAL_MAP] 34 | ; 35 | -------------------------------------------------------------------------------- /src/en/verbalizer/miscellaneous.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # Miscellaneous readings: plain and split words, words with a leading or trailing symbol, word/number combinations, two-digit years and ".com". 15 | import 'util/byte.grm' as b; 16 | import 'ru/classifier/cyrillic.grm' as c; 17 | import 'en/verbalizer/extra_numbers.grm' as e; 18 | import 'en/verbalizer/lexical_map.grm' as l; 19 | import 'en/verbalizer/numbers.grm' as n; 20 | import 'en/verbalizer/spelled.grm' as s; 21 | # A word is one or more b.kAlpha or c.kCyrillicAlpha letters. Inside a split word, "-" and "." become spaces and one trailing "." is dropped. 22 | letter = b.kAlpha | c.kCyrillicAlpha; 23 | dash = "-"; 24 | word = letter+; 25 | possibly_split_word = word (((dash | ".") : " ") word)* n.D["."]?; 26 | # Symbols that may follow a word; "+" and "-" each have two tag readings. 27 | post_word_symbol = 28 | ("+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@")) | 29 | ("-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@")) | 30 | ("*" : "@@STAR@@") 31 | ; 32 | # Symbols that may precede a word. 33 | pre_word_symbol = 34 | ("@" : "@@AT@@") | 35 | ("/" : "@@SLASH@@") | 36 | ("#" : "@@HASH@@") 37 | ; 38 | # In both rules a space is inserted between the word and its symbol. 39 | post_word = possibly_split_word n.I[" "] post_word_symbol; 40 | 41 | pre_word = pre_word_symbol n.I[" "] possibly_split_word; 42 | 43 | ## Number/digit sequence combos, maybe with a dash 44 | # A word read character by character through s.SPELLED_NO_LETTER. 45 | spelled_word = word @ s.SPELLED_NO_LETTER; 46 | # Word followed by a number reading; the joiner is an inserted space or a dash rewritten to a space. 47 | word_number = 48 | (word | spelled_word) 49 | (n.I[" "] | (dash : " ")) 50 | (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS) 51 | ; 52 | # Same as word_number with the order reversed. 53 | number_word = 54 | (e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS) 55 | (n.I[" "] | (dash : " ")) 56 | (word | spelled_word) 57 | ; 58 | 59 | ## Two-digit year. 60 | 61 | # Note that in this case to be fair we really have to allow ordinals too since 62 | # in some languages that's what you would have. 63 | # The rule below deletes the apostrophe and reads the two digits as a cardinal or via e.DIGITS; no ordinal reading is included despite the note above. 64 | two_digit_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS)); 65 | # "." becomes the URL-dot tag, a space is inserted, and "com" is kept as is. 66 | dot_com = ("." 
: "@@URL_DOT_EXPRESSION@@") n.I[" "] "com"; 67 | # Union of all readings defined above. 68 | miscellaneous = Optimize[ 69 | possibly_split_word 70 | | post_word 71 | | pre_word 72 | | word_number 73 | | number_word 74 | | two_digit_year 75 | | dot_com 76 | ]; 77 | # Tags emitted above are verbalized by l.LEXICAL_MAP. 78 | export MISCELLANEOUS = Optimize[miscellaneous @ l.LEXICAL_MAP]; 79 | -------------------------------------------------------------------------------- /src/en/verbalizer/money.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Verbalizes dollar amounts written with a leading "$". 15 | import 'util/byte.grm' as b; 16 | import 'en/verbalizer/lexical_map.grm' as l; 17 | import 'en/verbalizer/numbers.grm' as n; 18 | 19 | card = n.CARDINAL_NUMBERS; 20 | # Currency-code to currency-name pairs; money.tsv has a singular and a plural row for each code. 21 | __currency__ = StringFile['en/verbalizer/money.tsv']; 22 | # d: any digit; D: any digit except "0". 23 | d = b.kDigit; 24 | D = d - "0"; 25 | # Two-digit cents read as a cardinal; a leading "0" is deleted first. 26 | cents = ((n.D["0"] | D) d) @ card; 27 | 28 | # Only dollar for the verbalizer tests for English. Will need to add other 29 | # currencies. 
30 | usd_maj = Project["usd_maj" @ __currency__, 'output']; 31 | usd_min = Project["usd_min" @ __currency__, 'output']; 32 | and = " @@MONEY_AND@@ " | " "; 33 | # "$N.CC": dollars, then either the @@MONEY_AND@@ connective or just a space, then cents. Both currency words are inserted, in any of their money.tsv forms. 34 | dollar1 = 35 | n.D["$"] card n.I[" " usd_maj] n.I[and] n.D["."] cents n.I[" " usd_min] 36 | ; 37 | # "$N.00": the ".00" is deleted and only the dollar amount is read. 38 | dollar2 = n.D["$"] card n.I[" " usd_maj] n.D["."] n.D["00"]; 39 | # "$N": no cents part at all. 40 | dollar3 = n.D["$"] card n.I[" " usd_maj]; 41 | 42 | dollar = Optimize[dollar1 | dollar2 | dollar3]; 43 | # l.LEXICAL_MAP verbalizes the @@MONEY_AND@@ tag. 44 | export MONEY = Optimize[dollar @ l.LEXICAL_MAP]; 45 | -------------------------------------------------------------------------------- /src/en/verbalizer/money.tsv: -------------------------------------------------------------------------------- 1 | usd_maj dollar 2 | usd_maj dollars 3 | usd_min cent 4 | usd_min cents 5 | -------------------------------------------------------------------------------- /src/en/verbalizer/number_names.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # English minimally supervised number grammar. 16 | # 17 | # Supports both cardinals and ordinals without overt marking. 18 | # 19 | # The language-specific acceptor G was compiled with digit, teen, and decade 20 | # preterminals. The lexicon transducer L is unambiguous so no LM is used. 
21 | 22 | import 'util/arithmetic.grm' as a; 23 | 24 | # Intersects the universal factorization transducer (F) with the 25 | # language-specific acceptor (G). 26 | # fg applies a.IARITHMETIC_RESTRICTED three times in front of G and a.DELTA_STAR in front of that; test1 pins the bracketed factorization produced for "230". 27 | d = a.DELTA_STAR; 28 | f = a.IARITHMETIC_RESTRICTED; 29 | g = LoadFst['en/verbalizer/g.fst']; 30 | fg = Optimize[d @ Optimize[f @ Optimize[f @ Optimize[f @ g]]]]; 31 | test1 = AssertEqual["230" @ fg, "(+ (* 2 100 *) 30 +)"]; 32 | 33 | # Compiles lexicon transducer (L). 34 | # A cardinal reading is one or more space-separated factor names from cardinals.tsv. 35 | cardinal_name = StringFile['en/verbalizer/cardinals.tsv']; 36 | cardinal_l = Optimize[(cardinal_name " ")* cardinal_name]; 37 | test2 = AssertEqual["2 100 30" @ cardinal_l, "two hundred thirty"]; 38 | 39 | ordinal_name = StringFile['en/verbalizer/ordinals.tsv']; 40 | # In English, ordinals have the same syntax as cardinals and all but the final 41 | # element is verbalized using a cardinal number word; e.g., "two hundred 42 | # thirtieth". 43 | ordinal_l = Optimize[(cardinal_name " ")* ordinal_name]; 44 | test3 = AssertEqual["2 100 30" @ ordinal_l, "two hundred thirtieth"]; 45 | 46 | # Composes L with the leaf transducer (P), then composes that with FG. 47 | 48 | p = a.LEAVES; 49 | # Exported digit-string to number-name transducers; test4 and test5 pin the readings of "230". 50 | export CARDINAL_NUMBER_NAME = Optimize[fg @ (p @ cardinal_l)]; 51 | test4 = AssertEqual["230" @ CARDINAL_NUMBER_NAME, "two hundred thirty"]; 52 | 53 | export ORDINAL_NUMBER_NAME = Optimize[fg @ (p @ ordinal_l)]; 54 | test5 = AssertEqual["230" @ ORDINAL_NUMBER_NAME, "two hundred thirtieth"]; 55 | -------------------------------------------------------------------------------- /src/en/verbalizer/numbers.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Cardinal and ordinal number readings, plus the I[]/D[] helpers used by the other grammars. 15 | import 'en/verbalizer/number_names.grm' as n; 16 | import 'util/byte.grm' as bytelib; 17 | import 'universal/thousands_punct.grm' as t; 18 | 19 | cardinal = n.CARDINAL_NUMBER_NAME; 20 | ordinal = n.ORDINAL_NUMBER_NAME; 21 | 22 | # Putting these here since this grammar gets incorporated by all the others. 23 | # I[expr] inserts expr: it maps the empty string to expr. 24 | func I[expr] { 25 | return "" : expr; 26 | } 27 | # D[expr] deletes expr: it maps expr to the empty string. 28 | func D[expr] { 29 | return expr : ""; 30 | } 31 | # Accepted digit groupings, as defined in thousands_punct.grm. 32 | separators = t.comma_thousands | t.no_delimiter; 33 | 34 | # Language specific endings for ordinals. 35 | d = bytelib.kDigit; 36 | endings = "st" | "nd" | "rd" | "th"; 37 | # st/nd/rd: digit strings ending in 1/2/3 but not in 11/12/13; th: every other digit string. Each rule below deletes the ending that matches its digit class. 38 | st = (d* "1") - (d* "11"); 39 | nd = (d* "2") - (d* "12"); 40 | rd = (d* "3") - (d* "13"); 41 | th = Optimize[d* - st - nd - rd]; 42 | first = st ("st" : ""); 43 | second = nd ("nd" : ""); 44 | third = rd ("rd" : ""); 45 | other = th ("th" : ""); 46 | marked_ordinal = Optimize[first | second | third | other]; 47 | 48 | # The separator is a no-op here but will be needed once we replace 49 | # the above targets. 50 | 51 | export CARDINAL_NUMBERS = Optimize[separators @ cardinal]; 52 | # Marked ordinals: a number plus an ending; marked_ordinal only accepts an ending that agrees with the final digits, and removes it before the ordinal name is produced. 53 | export ORDINAL_NUMBERS = 54 | Optimize[(separators endings) @ marked_ordinal @ ordinal] 55 | ; 56 | # Ordinal reading of a number written without any ending. 57 | export ORDINAL_NUMBERS_UNMARKED = Optimize[separators @ ordinal]; 58 | -------------------------------------------------------------------------------- /src/en/verbalizer/numbers_plus.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Grammar for things built mostly on numbers. 16 | 17 | import 'en/verbalizer/factorization.grm' as f; 18 | import 'en/verbalizer/lexical_map.grm' as l; 19 | import 'en/verbalizer/numbers.grm' as n; 20 | # Short aliases for the number readings used throughout this file. 21 | num = n.CARDINAL_NUMBERS; 22 | ord = n.ORDINAL_NUMBERS_UNMARKED; 23 | digits = f.FRACTIONAL_PART_UNGROUPED; 24 | 25 | # Various symbols. 26 | # Each rule rewrites one symbol to a tag that l.LEXICAL_MAP later verbalizes. "-" has two readings here (minus, dash) and so does "/" (slash, division). 27 | plus = "+" : "@@ARITHMETIC_PLUS@@"; 28 | minus = "-" : "@@ARITHMETIC_MINUS@@"; 29 | slash = "/" : "@@SLASH@@"; 30 | dot = "." : "@@URL_DOT_EXPRESSION@@"; 31 | dash = "-" : "@@DASH@@"; 32 | equals = "=" : "@@ARITHMETIC_EQUALS@@"; 33 | 34 | degree = "°" : "@@DEGREE@@"; 35 | 36 | division = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@"; 37 | # Both the letter "x" and "*" are read as the times tag. 38 | times = ("x" | "*") : "@@ARITHMETIC_TIMES@@"; 39 | 40 | power = "^" : "@@DECIMAL_EXPONENT@@"; 41 | 42 | square_root = "√" : "@@SQUARE_ROOT@@"; 43 | 44 | percent = "%" : "@@PERCENT@@"; 45 | 46 | # Safe roman numbers. 47 | 48 | # NB: Do not change the formatting here. NO_EDIT must be on the same 49 | # line as the path. 50 | rfile = 51 | 'universal/roman_numerals.tsv' # NO_EDIT 52 | ; 53 | 54 | roman = StringFile[rfile]; 55 | 56 | ## Main categories. 
57 | # Three or more numbers joined by "." (the trailing "+" requires at least two dots). 58 | cat_dot_number = 59 | num 60 | n.I[" "] dot n.I[" "] num 61 | (n.I[" "] dot n.I[" "] num)+ 62 | ; 63 | # Two or more numbers joined by "/". 64 | cat_slash_number = 65 | num 66 | n.I[" "] slash n.I[" "] num 67 | (n.I[" "] slash n.I[" "] num)* 68 | ; 69 | # Two or more numbers joined by "-", read with the dash tag. 70 | cat_dash_number = 71 | num 72 | n.I[" "] dash n.I[" "] num 73 | (n.I[" "] dash n.I[" "] num)* 74 | ; 75 | # A number with an optional leading "+" or "-". 76 | cat_signed_number = ((plus | minus) n.I[" "])? num; 77 | 78 | cat_degree = cat_signed_number n.I[" "] degree; 79 | # "+" followed by either a cardinal reading or the f.FRACTIONAL_PART_UNGROUPED reading of the digits. 80 | cat_country_code = plus n.I[" "] (num | digits); 81 | # Operator symbols standing on their own. 82 | cat_math_operations = 83 | plus 84 | | minus 85 | | division 86 | | times 87 | | equals 88 | | percent 89 | | power 90 | | square_root 91 | ; 92 | 93 | # Roman numbers are often either cardinals or ordinals in various languages. 94 | cat_roman = roman @ (num | ord); 95 | 96 | # Allow 97 | # 98 | # number:number 99 | # number-number 100 | # 101 | # to just be 102 | # 103 | # number number. 104 | 105 | cat_number_number = 106 | num ((":" | "-") : " ") num 107 | ; 108 | 109 | # Some additional readings for these symbols. 
110 | # Every tag emitted here needs a row in lexical_map.tsv, otherwise LEXICAL_MAP leaves the raw tag in the output. The "to" reading of "-" uses @@CONNECTOR_RANGE@@ (verbalized "to"); lexical_map.tsv has no @@CONNECTOR_TO@@ row. 111 | cat_additional_readings = 112 | ("/" : "@@PER@@") | 113 | ("+" : "@@AND@@") | 114 | ("-" : ("@@HYPHEN@@" | "@@CONNECTOR_RANGE@@")) | 115 | ("*" : "@@STAR@@") | 116 | ("x" : ("x" | "@@CONNECTOR_BY@@")) | 117 | ("@" : "@@AT@@") 118 | ; 119 | # Union of every category defined in this file. 120 | numbers_plus = Optimize[ 121 | cat_dot_number 122 | | cat_slash_number 123 | | cat_dash_number 124 | | cat_signed_number 125 | | cat_degree 126 | | cat_country_code 127 | | cat_math_operations 128 | | cat_roman 129 | | cat_number_number 130 | | cat_additional_readings 131 | ]; 132 | # Tags are verbalized by l.LEXICAL_MAP. 133 | export NUMBERS_PLUS = Optimize[numbers_plus @ l.LEXICAL_MAP]; 134 | -------------------------------------------------------------------------------- /src/en/verbalizer/ordinals.tsv: -------------------------------------------------------------------------------- 1 | 0 zeroth 2 | 1 first 3 | 2 second 4 | 3 third 5 | 4 fourth 6 | 5 fifth 7 | 6 sixth 8 | 7 seventh 9 | 8 eighth 10 | 9 ninth 11 | 10 tenth 12 | 11 eleventh 13 | 12 twelfth 14 | 13 thirteenth 15 | 14 fourteenth 16 | 15 fifteenth 17 | 16 sixteenth 18 | 17 seventeenth 19 | 18 eighteenth 20 | 19 nineteenth 21 | 20 twentieth 22 | 30 thirtieth 23 | 40 fortieth 24 | 50 fiftieth 25 | 60 sixtieth 26 | 70 seventieth 27 | 80 eightieth 28 | 90 ninetieth 29 | 100 hundredth 30 | 1000 thousandth 31 | 1000000 millionth 32 | 1000000000 billionth 33 | -------------------------------------------------------------------------------- /src/en/verbalizer/params.tsv: -------------------------------------------------------------------------------- 1 | float.grm __fractional_part__ = fractional_part_ungrouped | fractional_part_unparsed; 2 | telephone.grm __grouping__ = f.UNGROUPED; 3 | measure.grm __measure__ = StringFile['en/verbalizer/measures.tsv']; 4 | money.grm __currency__ = StringFile['en/verbalizer/money.tsv']; 5 | time.grm __sep__ = ":"; 6 | time.grm __am__ = "a.m." | "am" | "AM"; 7 | time.grm __pm__ = "p.m." 
| "pm" | "PM"; 8 | -------------------------------------------------------------------------------- /src/en/verbalizer/spelled.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This verbalizer is used whenever there is an LM symbol that consists of 16 | # letters immediately followed by "{spelled}". This strips the "{spelled}" 17 | # suffix. 
18 | 19 | import 'util/byte.grm' as b; 20 | import 'ru/classifier/cyrillic.grm' as c; 21 | import 'en/verbalizer/lexical_map.grm' as l; 22 | import 'en/verbalizer/numbers.grm' as n; 23 | # A single digit read as a cardinal number. 24 | digit = b.kDigit @ n.CARDINAL_NUMBERS; 25 | # Per-character readings: ASCII letters of either case become lower-case "letter-x"; digits are read as cardinals; "&" becomes the @@AND@@ tag; ".", "-", "_" and "/" are deleted; Cyrillic letters get a "letter-" prefix. 26 | char_set = (("a" | "A") : "letter-a") 27 | | (("b" | "B") : "letter-b") 28 | | (("c" | "C") : "letter-c") 29 | | (("d" | "D") : "letter-d") 30 | | (("e" | "E") : "letter-e") 31 | | (("f" | "F") : "letter-f") 32 | | (("g" | "G") : "letter-g") 33 | | (("h" | "H") : "letter-h") 34 | | (("i" | "I") : "letter-i") 35 | | (("j" | "J") : "letter-j") 36 | | (("k" | "K") : "letter-k") 37 | | (("l" | "L") : "letter-l") 38 | | (("m" | "M") : "letter-m") 39 | | (("n" | "N") : "letter-n") 40 | | (("o" | "O") : "letter-o") 41 | | (("p" | "P") : "letter-p") 42 | | (("q" | "Q") : "letter-q") 43 | | (("r" | "R") : "letter-r") 44 | | (("s" | "S") : "letter-s") 45 | | (("t" | "T") : "letter-t") 46 | | (("u" | "U") : "letter-u") 47 | | (("v" | "V") : "letter-v") 48 | | (("w" | "W") : "letter-w") 49 | | (("x" | "X") : "letter-x") 50 | | (("y" | "Y") : "letter-y") 51 | | (("z" | "Z") : "letter-z") 52 | | (digit) 53 | | ("&" : "@@AND@@") 54 | | ("." : "") 55 | | ("-" : "") 56 | | ("_" : "") 57 | | ("/" : "") 58 | | (n.I["letter-"] c.kCyrillicAlpha) 59 | ; 60 | 61 | ins_space = "" : " "; 62 | 63 | suffix = "{spelled}" : ""; 64 | # One or more characters with a space inserted between consecutive ones, then the "{spelled}" marker is removed. 65 | spelled = Optimize[char_set (ins_space char_set)* suffix]; 66 | 67 | export SPELLED = Optimize[spelled @ l.LEXICAL_MAP]; 68 | 69 | sigma_star = b.kBytes*; 70 | 71 | # Gets rid of the letter- prefix since in some cases we don't want it. 
72 | 73 | del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star]; 74 | # Same character sequence as spelled, but with no "{spelled}" suffix to remove. 75 | spelled_no_tag = Optimize[char_set (ins_space char_set)*]; 76 | # Unlike SPELLED, this export is not composed with l.LEXICAL_MAP in this file. 77 | export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter]; 78 | -------------------------------------------------------------------------------- /src/en/verbalizer/spoken_punct.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Reads a single punctuation mark aloud. 15 | import 'en/verbalizer/lexical_map.grm' as l; 16 | # Each mark maps to a tag; lexical_map.tsv lists several verbalizations for some of them (e.g. @@PERIOD@@). 17 | punct = 18 | ("." : "@@PERIOD@@") 19 | | ("," : "@@COMMA@@") 20 | | ("!" : "@@EXCLAMATION_MARK@@") 21 | | ("?" : "@@QUESTION_MARK@@") 22 | ; 23 | 24 | export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP]; 25 | -------------------------------------------------------------------------------- /src/en/verbalizer/time.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import 'util/byte.grm' as b; 16 | import 'en/verbalizer/lexical_map.grm' as l; 17 | import 'en/verbalizer/numbers.grm' as n; 18 | 19 | # Only handles 24-hour time with quarter-to, half-past and quarter-past. 20 | # Maps an hour to the following hour (used for "quarter to" readings). There are no zero-padded entries ("00".."09"), and "23" maps to "12". 21 | increment_hour = 22 | ("0" : "1") 23 | | ("1" : "2") 24 | | ("2" : "3") 25 | | ("3" : "4") 26 | | ("4" : "5") 27 | | ("5" : "6") 28 | | ("6" : "7") 29 | | ("7" : "8") 30 | | ("8" : "9") 31 | | ("9" : "10") 32 | | ("10" : "11") 33 | | ("11" : "12") 34 | | ("12" : "1") # If someone uses 12, we assume 12-hour by default. 35 | | ("13" : "14") 36 | | ("14" : "15") 37 | | ("15" : "16") 38 | | ("16" : "17") 39 | | ("17" : "18") 40 | | ("18" : "19") 41 | | ("19" : "20") 42 | | ("20" : "21") 43 | | ("21" : "22") 44 | | ("22" : "23") 45 | | ("23" : "12") 46 | ; 47 | # The accepted hours are exactly the inputs of increment_hour. 48 | hours = Project[increment_hour, 'input']; 49 | 50 | d = b.kDigit; 51 | D = d - "0"; 52 | # Minutes 01-09. 53 | minutes09 = "0" D; 54 | # Minutes 10-59. 55 | minutes = ("1" | "2" | "3" | "4" | "5") d; 56 | 57 | __sep__ = ":"; 58 | sep_space = __sep__ : " "; 59 | 60 | verbalize_hours = hours @ n.CARDINAL_NUMBERS; 61 | # "00" becomes the @@HOUR@@ tag; 01-09 become @@TIME_ZERO@@ followed by the digit's cardinal; 10-59 are read as cardinals. 62 | verbalize_minutes = 63 | ("00" : "@@HOUR@@") 64 | | (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS)) 65 | | (minutes @ n.CARDINAL_NUMBERS) 66 | ; 67 | # Hours, the separator rewritten to a space, then minutes. 68 | time_basic = Optimize[verbalize_hours sep_space verbalize_minutes]; 69 | 70 | # Special cases we handle right now. 
71 | # TODO: Need to allow for cases like 72 | # 73 | # half twelve (in the UK English sense) 74 | # half twaalf (in the Dutch sense) 75 | # "H:15": the quarter and after tags are inserted before the hour and ":15" is deleted. 76 | time_quarter_past = 77 | n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "] 78 | verbalize_hours 79 | n.D[__sep__ "15"]; 80 | # "H:30": the half and after tags are inserted before the hour and ":30" is deleted. 81 | time_half_past = 82 | n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "] 83 | verbalize_hours 84 | n.D[__sep__ "30"]; 85 | # "H:45": the quarter and before tags are inserted before the hour after H (via increment_hour) and ":45" is deleted. 86 | time_quarter_to = 87 | n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "] 88 | (increment_hour @ verbalize_hours) 89 | n.D[__sep__ "45"]; 90 | 91 | time_extra = Optimize[ 92 | time_quarter_past | time_half_past | time_quarter_to] 93 | ; 94 | 95 | # Basic time periods which most languages can be expected to have. 96 | __am__ = "a.m." | "am" | "AM"; 97 | __pm__ = "p.m." | "pm" | "PM"; 98 | 99 | period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@"); 100 | 101 | time_variants = time_basic | time_extra; 102 | # An optional period marker may come before or after the time; the separating space is either present in the input or inserted. 103 | time = Optimize[ 104 | (period (" " | n.I[" "]))? time_variants 105 | | time_variants ((" " | n.I[" "]) period)?] 106 | ; 107 | # Tags are verbalized by l.LEXICAL_MAP. 108 | export TIME = Optimize[time @ l.LEXICAL_MAP]; 109 | -------------------------------------------------------------------------------- /src/en/verbalizer/urls.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Rules for URLs and email addresses. 
16 | 17 | import 'util/byte.grm' as bytelib; 18 | import 'en/verbalizer/lexical_map.grm' as l; 19 | 20 | ins_space = "" : " "; 21 | dot = "." : "@@URL_DOT_EXPRESSION@@"; 22 | at = "@" : "@@AT@@"; 23 | # Recognized suffixes: ".edu" is spelled out as "e d u"; the others keep the suffix as a word. 24 | url_suffix = 25 | (".com" : dot ins_space "com") | 26 | (".gov" : dot ins_space "gov") | 27 | (".edu" : dot ins_space "e d u") | 28 | (".org" : dot ins_space "org") | 29 | (".net" : dot ins_space "net") 30 | ; 31 | # One or more alphanumeric characters, passed through unchanged. 32 | letter_string = (bytelib.kAlnum)* bytelib.kAlnum; 33 | # One or more letter_strings, with each "." between them read as the dot tag surrounded by spaces. 34 | letter_string_dot = 35 | ((letter_string ins_space dot ins_space)* letter_string) 36 | ; 37 | 38 | # Rules for URLs. 39 | export URL = Optimize[ 40 | ((letter_string_dot) (ins_space) 41 | (url_suffix)) @ l.LEXICAL_MAP 42 | ]; 43 | 44 | # Rules for email addresses. 45 | letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum); 46 | 47 | letter_by_letter_dot = 48 | ((letter_by_letter ins_space dot ins_space)* 49 | letter_by_letter) 50 | ; 51 | # EMAIL1 reads both the local part and the domain character by character. 52 | export EMAIL1 = Optimize[ 53 | ((letter_by_letter) (ins_space) 54 | (at) (ins_space) 55 | (letter_by_letter_dot) (ins_space) 56 | (url_suffix)) @ l.LEXICAL_MAP 57 | ]; 58 | # EMAIL2 reads the local part character by character but leaves the domain labels unspaced. 59 | export EMAIL2 = Optimize[ 60 | ((letter_by_letter) (ins_space) 61 | (at) (ins_space) 62 | (letter_string_dot) (ins_space) 63 | (url_suffix)) @ l.LEXICAL_MAP 64 | ]; 65 | # NOTE(review): verbalizer.grm's VERBALIZER union lists u.URL but not u.EMAILS — confirm that is intended. 66 | export EMAILS = Optimize[ 67 | EMAIL1 | EMAIL2 68 | ]; 69 | -------------------------------------------------------------------------------- /src/en/verbalizer/verbalizer.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import 'util/util.grm' as util; 16 | import 'en/verbalizer/extra_numbers.grm' as e; 17 | import 'en/verbalizer/float.grm' as f; 18 | import 'en/verbalizer/math.grm' as ma; 19 | import 'en/verbalizer/miscellaneous.grm' as mi; 20 | import 'en/verbalizer/money.grm' as mo; 21 | import 'en/verbalizer/numbers.grm' as n; 22 | import 'en/verbalizer/numbers_plus.grm' as np; 23 | import 'en/verbalizer/spelled.grm' as s; 24 | import 'en/verbalizer/spoken_punct.grm' as sp; 25 | import 'en/verbalizer/time.grm' as t; 26 | import 'en/verbalizer/urls.grm' as u; 27 | 28 | export VERBALIZER = Optimize[RmWeight[ 29 | ( e.MIXED_NUMBERS 30 | | e.DIGITS 31 | | f.FLOAT 32 | | ma.ARITHMETIC 33 | | mi.MISCELLANEOUS 34 | | mo.MONEY 35 | | n.CARDINAL_NUMBERS 36 | | n.ORDINAL_NUMBERS 37 | | np.NUMBERS_PLUS 38 | | s.SPELLED 39 | | sp.SPOKEN_PUNCT 40 | | t.TIME 41 | | u.URL) @ util.CLEAN_SPACES 42 | ]]; 43 | -------------------------------------------------------------------------------- /src/number_data/README.md: -------------------------------------------------------------------------------- 1 | This directory contains data used in: 2 | 3 | Gorman, K., and Sproat, R. 2016. Minimally supervised number normalization. 4 | Transactions of the Association for Computational Linguistics 4: 507-519. 5 | 6 | * `minimal.txt`: A list of 300 curated numbers used as the "minimal" training 7 | set. 8 | * `random-trn.txt`: A list of 9000 randomly-generated numbers used as the 9 | "medium" training set. 
10 | * `random-tst.txt`: A list of 1000 randomly-generated numbers used as the test 11 | set. 12 | 13 | Note that `random-trn.txt` and `random-tst.txt` are totally disjoint, but that 14 | a small number of examples occur both in `minimal.txt` and `random-tst.txt`. 15 | 16 | For information about the sampling procedure used to generate the random data 17 | sets, see appendix A of the aforementioned paper. 18 | -------------------------------------------------------------------------------- /src/number_data/minimal.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | 10 12 | 11 13 | 12 14 | 13 15 | 14 16 | 15 17 | 16 18 | 17 19 | 18 20 | 19 21 | 20 22 | 21 23 | 22 24 | 23 25 | 24 26 | 25 27 | 26 28 | 27 29 | 28 30 | 29 31 | 30 32 | 31 33 | 32 34 | 33 35 | 34 36 | 35 37 | 36 38 | 37 39 | 38 40 | 39 41 | 40 42 | 41 43 | 42 44 | 43 45 | 44 46 | 45 47 | 46 48 | 47 49 | 48 50 | 49 51 | 50 52 | 51 53 | 52 54 | 53 55 | 54 56 | 55 57 | 56 58 | 57 59 | 58 60 | 59 61 | 60 62 | 61 63 | 62 64 | 63 65 | 64 66 | 65 67 | 66 68 | 67 69 | 68 70 | 69 71 | 70 72 | 71 73 | 72 74 | 73 75 | 74 76 | 75 77 | 76 78 | 77 79 | 78 80 | 79 81 | 80 82 | 81 83 | 82 84 | 83 85 | 84 86 | 85 87 | 86 88 | 87 89 | 88 90 | 89 91 | 90 92 | 91 93 | 92 94 | 93 95 | 94 96 | 95 97 | 96 98 | 97 99 | 98 100 | 99 101 | 100 102 | 101 103 | 102 104 | 103 105 | 104 106 | 105 107 | 106 108 | 107 109 | 108 110 | 109 111 | 110 112 | 111 113 | 112 114 | 113 115 | 114 116 | 115 117 | 116 118 | 117 119 | 118 120 | 119 121 | 120 122 | 121 123 | 122 124 | 123 125 | 124 126 | 125 127 | 126 128 | 127 129 | 128 130 | 129 131 | 130 132 | 131 133 | 132 134 | 133 135 | 134 136 | 135 137 | 136 138 | 137 139 | 138 140 | 139 141 | 140 142 | 141 143 | 142 144 | 143 145 | 144 146 | 145 147 | 146 148 | 147 149 | 148 150 | 149 151 | 150 152 | 151 153 | 152 154 | 153 155 | 154 156 | 155 157 | 156 158 | 157 159 | 158 160 | 159 161 | 160 
162 | 161 163 | 162 164 | 163 165 | 164 166 | 165 167 | 166 168 | 167 169 | 168 170 | 169 171 | 170 172 | 171 173 | 172 174 | 173 175 | 174 176 | 175 177 | 176 178 | 177 179 | 178 180 | 179 181 | 180 182 | 181 183 | 182 184 | 183 185 | 184 186 | 185 187 | 186 188 | 187 189 | 188 190 | 189 191 | 190 192 | 191 193 | 192 194 | 193 195 | 194 196 | 195 197 | 196 198 | 197 199 | 198 200 | 199 201 | 200 202 | 201 203 | 202 204 | 203 205 | 204 206 | 205 207 | 206 208 | 207 209 | 208 210 | 209 211 | 210 212 | 211 213 | 212 214 | 220 215 | 221 216 | 230 217 | 300 218 | 400 219 | 500 220 | 600 221 | 700 222 | 800 223 | 900 224 | 1000 225 | 1001 226 | 1002 227 | 1003 228 | 1004 229 | 1005 230 | 1006 231 | 1007 232 | 1008 233 | 1009 234 | 1010 235 | 1011 236 | 1012 237 | 1020 238 | 1021 239 | 1030 240 | 1200 241 | 2000 242 | 2001 243 | 2002 244 | 2003 245 | 2004 246 | 2005 247 | 2006 248 | 2007 249 | 2008 250 | 2009 251 | 2010 252 | 2011 253 | 2012 254 | 2020 255 | 2021 256 | 2030 257 | 2100 258 | 2200 259 | 5001 260 | 10000 261 | 12000 262 | 20000 263 | 21000 264 | 50001 265 | 100000 266 | 120000 267 | 200000 268 | 210000 269 | 500001 270 | 1000000 271 | 1001000 272 | 1200000 273 | 2000000 274 | 2100000 275 | 5000001 276 | 10000000 277 | 10001000 278 | 12000000 279 | 20000000 280 | 50000001 281 | 100000000 282 | 100001000 283 | 120000000 284 | 200000000 285 | 500000001 286 | 1000000000 287 | 1000001000 288 | 1200000000 289 | 2000000000 290 | 5000000001 291 | 10000000000 292 | 10000001000 293 | 12000000000 294 | 20000000000 295 | 50000000001 296 | 100000000000 297 | 100000001000 298 | 120000000000 299 | 200000000000 300 | 500000000001 301 | -------------------------------------------------------------------------------- /src/number_data/random-tst.txt: -------------------------------------------------------------------------------- 1 | 209 2 | 220 3 | 250 4 | 254 5 | 263 6 | 266 7 | 276 8 | 303 9 | 310 10 | 317 11 | 322 12 | 364 13 | 386 14 | 405 15 | 414 16 | 424 17 | 429 18 | 
489 19 | 505 20 | 520 21 | 523 22 | 525 23 | 554 24 | 624 25 | 627 26 | 640 27 | 665 28 | 680 29 | 704 30 | 715 31 | 723 32 | 741 33 | 742 34 | 775 35 | 776 36 | 845 37 | 847 38 | 851 39 | 868 40 | 898 41 | 921 42 | 927 43 | 972 44 | 973 45 | 984 46 | 986 47 | 994 48 | 1038 49 | 1055 50 | 1077 51 | 1079 52 | 1083 53 | 1090 54 | 1123 55 | 1137 56 | 1161 57 | 1184 58 | 1186 59 | 1235 60 | 1257 61 | 1258 62 | 1285 63 | 1302 64 | 1307 65 | 1311 66 | 1358 67 | 1369 68 | 1372 69 | 1383 70 | 1391 71 | 1418 72 | 1441 73 | 1442 74 | 1447 75 | 1476 76 | 1478 77 | 1509 78 | 1535 79 | 1548 80 | 1550 81 | 1571 82 | 1581 83 | 1593 84 | 1615 85 | 1623 86 | 1639 87 | 1660 88 | 1686 89 | 1688 90 | 1717 91 | 1735 92 | 1782 93 | 1813 94 | 1815 95 | 1824 96 | 1831 97 | 1875 98 | 1881 99 | 1924 100 | 1931 101 | 1949 102 | 1951 103 | 1966 104 | 1970 105 | 1984 106 | 1990 107 | 1992 108 | 2012 109 | 2013 110 | 2024 111 | 2040 112 | 2058 113 | 2062 114 | 2064 115 | 2067 116 | 2075 117 | 2116 118 | 2130 119 | 2135 120 | 2171 121 | 2197 122 | 2200 123 | 2215 124 | 2220 125 | 2226 126 | 2246 127 | 2259 128 | 2277 129 | 2294 130 | 2303 131 | 2318 132 | 2342 133 | 2347 134 | 2349 135 | 2355 136 | 2364 137 | 2413 138 | 2419 139 | 2420 140 | 2433 141 | 2441 142 | 2445 143 | 2451 144 | 2468 145 | 2488 146 | 2498 147 | 2499 148 | 2500 149 | 2502 150 | 2514 151 | 2523 152 | 2524 153 | 2557 154 | 2568 155 | 2598 156 | 2609 157 | 2612 158 | 2629 159 | 2685 160 | 2697 161 | 2718 162 | 2724 163 | 2734 164 | 2739 165 | 2760 166 | 2763 167 | 2779 168 | 2796 169 | 2797 170 | 2809 171 | 2818 172 | 2828 173 | 2839 174 | 2842 175 | 2850 176 | 2857 177 | 2864 178 | 2916 179 | 2923 180 | 2984 181 | 2987 182 | 2991 183 | 2994 184 | 3021 185 | 3025 186 | 3026 187 | 3054 188 | 3070 189 | 3080 190 | 3086 191 | 3098 192 | 3114 193 | 3121 194 | 3130 195 | 3136 196 | 3137 197 | 3157 198 | 3175 199 | 3182 200 | 3200 201 | 3233 202 | 3245 203 | 3250 204 | 3270 205 | 3298 206 | 3303 207 | 3330 208 | 3341 209 | 3347 210 
| 3368 211 | 3392 212 | 3394 213 | 3398 214 | 3400 215 | 3427 216 | 3435 217 | 3441 218 | 3449 219 | 3474 220 | 3477 221 | 3497 222 | 3501 223 | 3525 224 | 3526 225 | 3551 226 | 3570 227 | 3576 228 | 3597 229 | 3612 230 | 3630 231 | 3636 232 | 3639 233 | 3649 234 | 3651 235 | 3675 236 | 3692 237 | 3719 238 | 3742 239 | 3773 240 | 3785 241 | 3790 242 | 3850 243 | 3870 244 | 3873 245 | 3875 246 | 3885 247 | 3910 248 | 3926 249 | 3927 250 | 3928 251 | 3941 252 | 3943 253 | 3945 254 | 3950 255 | 3961 256 | 3971 257 | 3990 258 | 3992 259 | 3996 260 | 4010 261 | 4013 262 | 4018 263 | 4024 264 | 4032 265 | 4047 266 | 4065 267 | 4069 268 | 4079 269 | 4089 270 | 4097 271 | 4114 272 | 4125 273 | 4127 274 | 4148 275 | 4155 276 | 4173 277 | 4180 278 | 4206 279 | 4249 280 | 4256 281 | 4284 282 | 4298 283 | 4303 284 | 4305 285 | 4345 286 | 4354 287 | 4409 288 | 4417 289 | 4433 290 | 4437 291 | 4470 292 | 4474 293 | 4486 294 | 4494 295 | 4527 296 | 4538 297 | 4544 298 | 4572 299 | 4629 300 | 4630 301 | 4634 302 | 4647 303 | 4652 304 | 4654 305 | 4658 306 | 4680 307 | 4699 308 | 4747 309 | 4748 310 | 4773 311 | 4791 312 | 4852 313 | 4863 314 | 4884 315 | 4907 316 | 4927 317 | 4943 318 | 4953 319 | 5027 320 | 5032 321 | 5037 322 | 5080 323 | 5095 324 | 5108 325 | 5134 326 | 5163 327 | 5168 328 | 5186 329 | 5210 330 | 5236 331 | 5237 332 | 5265 333 | 5273 334 | 5283 335 | 5330 336 | 5351 337 | 5362 338 | 5396 339 | 5438 340 | 5446 341 | 5465 342 | 5495 343 | 5511 344 | 5526 345 | 5534 346 | 5556 347 | 5567 348 | 5611 349 | 5639 350 | 5642 351 | 5725 352 | 5738 353 | 5751 354 | 5774 355 | 5777 356 | 5786 357 | 5813 358 | 5837 359 | 5864 360 | 5879 361 | 5885 362 | 5889 363 | 5898 364 | 5921 365 | 5924 366 | 5946 367 | 5955 368 | 5959 369 | 5968 370 | 5976 371 | 5981 372 | 6021 373 | 6047 374 | 6049 375 | 6080 376 | 6158 377 | 6162 378 | 6170 379 | 6176 380 | 6206 381 | 6214 382 | 6220 383 | 6243 384 | 6253 385 | 6261 386 | 6284 387 | 6307 388 | 6322 389 | 6330 390 | 6338 391 | 6367 
392 | 6413 393 | 6430 394 | 6434 395 | 6437 396 | 6470 397 | 6492 398 | 6499 399 | 6504 400 | 6512 401 | 6660 402 | 6670 403 | 6680 404 | 6699 405 | 6710 406 | 6737 407 | 6741 408 | 6751 409 | 6776 410 | 6779 411 | 6802 412 | 6819 413 | 6890 414 | 6892 415 | 6969 416 | 6970 417 | 7040 418 | 7045 419 | 7052 420 | 7063 421 | 7065 422 | 7088 423 | 7128 424 | 7129 425 | 7133 426 | 7155 427 | 7164 428 | 7166 429 | 7181 430 | 7210 431 | 7219 432 | 7234 433 | 7236 434 | 7256 435 | 7266 436 | 7270 437 | 7303 438 | 7364 439 | 7370 440 | 7378 441 | 7499 442 | 7593 443 | 7629 444 | 7633 445 | 7640 446 | 7675 447 | 7709 448 | 7753 449 | 7791 450 | 7792 451 | 7812 452 | 7838 453 | 7860 454 | 7890 455 | 7972 456 | 8014 457 | 8025 458 | 8096 459 | 8106 460 | 8123 461 | 8154 462 | 8159 463 | 8200 464 | 8228 465 | 8343 466 | 8381 467 | 8429 468 | 8490 469 | 8515 470 | 8526 471 | 8560 472 | 8568 473 | 8579 474 | 8658 475 | 8668 476 | 8672 477 | 8688 478 | 8710 479 | 8731 480 | 8739 481 | 8752 482 | 8771 483 | 8790 484 | 8833 485 | 8900 486 | 8917 487 | 8929 488 | 9002 489 | 9035 490 | 9043 491 | 9067 492 | 9078 493 | 9122 494 | 9138 495 | 9144 496 | 9183 497 | 9199 498 | 9211 499 | 9235 500 | 9240 501 | 9257 502 | 9330 503 | 9385 504 | 9390 505 | 9450 506 | 9512 507 | 9523 508 | 9530 509 | 9535 510 | 9564 511 | 9596 512 | 9601 513 | 9602 514 | 9603 515 | 9626 516 | 9655 517 | 9691 518 | 9695 519 | 9772 520 | 9780 521 | 9808 522 | 9849 523 | 9881 524 | 9911 525 | 9923 526 | 9946 527 | 9970 528 | 9986 529 | 10009 530 | 10019 531 | 10168 532 | 10178 533 | 10180 534 | 10190 535 | 10290 536 | 10348 537 | 10470 538 | 10520 539 | 10525 540 | 10535 541 | 10545 542 | 10627 543 | 10675 544 | 10715 545 | 10757 546 | 10772 547 | 10786 548 | 10896 549 | 10940 550 | 10970 551 | 11000 552 | 11101 553 | 11120 554 | 11132 555 | 11192 556 | 11201 557 | 11209 558 | 11265 559 | 11337 560 | 11392 561 | 11549 562 | 11557 563 | 11567 564 | 11736 565 | 11767 566 | 11807 567 | 11814 568 | 11866 569 | 11881 
570 | 11913 571 | 12073 572 | 12098 573 | 12111 574 | 12137 575 | 12291 576 | 12370 577 | 12376 578 | 12397 579 | 12435 580 | 12439 581 | 12443 582 | 12511 583 | 12520 584 | 12567 585 | 12575 586 | 12615 587 | 12700 588 | 12710 589 | 12726 590 | 12729 591 | 12814 592 | 12822 593 | 12883 594 | 12890 595 | 12910 596 | 12915 597 | 12980 598 | 13069 599 | 13075 600 | 13127 601 | 13193 602 | 13209 603 | 13386 604 | 13390 605 | 13393 606 | 13511 607 | 13586 608 | 13607 609 | 13625 610 | 13630 611 | 13647 612 | 13656 613 | 13763 614 | 13810 615 | 13910 616 | 13979 617 | 13991 618 | 14073 619 | 14096 620 | 14111 621 | 14170 622 | 14210 623 | 14259 624 | 14306 625 | 14350 626 | 14351 627 | 14360 628 | 14479 629 | 14587 630 | 14613 631 | 14736 632 | 14745 633 | 14797 634 | 14810 635 | 14822 636 | 14824 637 | 14830 638 | 15020 639 | 15068 640 | 15118 641 | 15197 642 | 15230 643 | 15270 644 | 15310 645 | 15404 646 | 15510 647 | 15603 648 | 15680 649 | 15700 650 | 15721 651 | 15820 652 | 15928 653 | 15990 654 | 16012 655 | 16018 656 | 16030 657 | 16073 658 | 16123 659 | 16243 660 | 16275 661 | 16501 662 | 16690 663 | 16710 664 | 16765 665 | 16870 666 | 16958 667 | 17014 668 | 17030 669 | 17138 670 | 17190 671 | 17272 672 | 17409 673 | 17424 674 | 17430 675 | 17477 676 | 17678 677 | 17684 678 | 17687 679 | 17820 680 | 17840 681 | 17898 682 | 18097 683 | 18219 684 | 18284 685 | 18349 686 | 18525 687 | 18634 688 | 18680 689 | 19042 690 | 19070 691 | 19084 692 | 19120 693 | 19151 694 | 19250 695 | 19389 696 | 19679 697 | 19932 698 | 20080 699 | 20100 700 | 20133 701 | 20321 702 | 20440 703 | 20801 704 | 20819 705 | 20969 706 | 21190 707 | 21300 708 | 21340 709 | 21350 710 | 21360 711 | 21490 712 | 21531 713 | 21640 714 | 21728 715 | 21796 716 | 21831 717 | 21860 718 | 22040 719 | 22208 720 | 22282 721 | 22410 722 | 22566 723 | 22850 724 | 23060 725 | 23196 726 | 23380 727 | 24190 728 | 24350 729 | 24360 730 | 24380 731 | 24475 732 | 24480 733 | 24491 734 | 24521 735 | 24644 736 | 
24695 737 | 24747 738 | 24760 739 | 24945 740 | 25000 741 | 25510 742 | 25754 743 | 25870 744 | 26200 745 | 26300 746 | 26410 747 | 26447 748 | 26472 749 | 26510 750 | 27000 751 | 27017 752 | 27400 753 | 27430 754 | 27531 755 | 27600 756 | 27740 757 | 27870 758 | 28200 759 | 28544 760 | 28570 761 | 28618 762 | 28629 763 | 28716 764 | 28753 765 | 28850 766 | 29027 767 | 29040 768 | 29045 769 | 29129 770 | 29190 771 | 29404 772 | 29600 773 | 29970 774 | 30030 775 | 30050 776 | 30190 777 | 30375 778 | 30500 779 | 30700 780 | 30778 781 | 30790 782 | 30838 783 | 31310 784 | 31379 785 | 31480 786 | 31547 787 | 31698 788 | 31986 789 | 32600 790 | 32991 791 | 33417 792 | 33603 793 | 34751 794 | 34900 795 | 34980 796 | 35059 797 | 35101 798 | 35190 799 | 35496 800 | 35500 801 | 35707 802 | 35761 803 | 36320 804 | 36496 805 | 36893 806 | 37200 807 | 37520 808 | 37780 809 | 38370 810 | 38500 811 | 38600 812 | 39200 813 | 39575 814 | 39580 815 | 40324 816 | 40560 817 | 41222 818 | 41300 819 | 41485 820 | 41973 821 | 43110 822 | 43229 823 | 44097 824 | 44550 825 | 44666 826 | 45078 827 | 45085 828 | 45090 829 | 45600 830 | 46170 831 | 46772 832 | 47060 833 | 48280 834 | 48500 835 | 48518 836 | 49400 837 | 49430 838 | 50100 839 | 50167 840 | 50359 841 | 50800 842 | 51386 843 | 51390 844 | 51531 845 | 51800 846 | 52092 847 | 52100 848 | 52590 849 | 52663 850 | 52670 851 | 52738 852 | 52990 853 | 53025 854 | 53450 855 | 53600 856 | 53620 857 | 54070 858 | 54505 859 | 56160 860 | 56165 861 | 57100 862 | 57730 863 | 58825 864 | 58900 865 | 60151 866 | 60500 867 | 61306 868 | 61710 869 | 62250 870 | 62270 871 | 62400 872 | 63310 873 | 63960 874 | 64235 875 | 64760 876 | 65200 877 | 65654 878 | 66240 879 | 66400 880 | 66600 881 | 68670 882 | 68920 883 | 71000 884 | 71400 885 | 72630 886 | 72700 887 | 72860 888 | 73700 889 | 75841 890 | 76108 891 | 77122 892 | 79220 893 | 79400 894 | 79670 895 | 81110 896 | 83574 897 | 84100 898 | 84500 899 | 86090 900 | 87078 901 | 87300 902 | 87860 
903 | 88340 904 | 88880 905 | 89154 906 | 89950 907 | 92600 908 | 96220 909 | 96870 910 | 97503 911 | 99600 912 | 101000 913 | 104000 914 | 105100 915 | 105570 916 | 106900 917 | 108290 918 | 108400 919 | 110840 920 | 110975 921 | 113773 922 | 115000 923 | 116500 924 | 119200 925 | 124720 926 | 127000 927 | 127780 928 | 128200 929 | 128966 930 | 138900 931 | 140900 932 | 141000 933 | 141228 934 | 144000 935 | 145000 936 | 145061 937 | 147245 938 | 147562 939 | 148450 940 | 152218 941 | 154990 942 | 158775 943 | 159940 944 | 161000 945 | 161300 946 | 163500 947 | 165500 948 | 170559 949 | 176000 950 | 178000 951 | 184000 952 | 188800 953 | 196100 954 | 204400 955 | 204880 956 | 210900 957 | 216616 958 | 220930 959 | 238000 960 | 239740 961 | 257226 962 | 265000 963 | 271590 964 | 273200 965 | 285810 966 | 309620 967 | 315612 968 | 320959 969 | 321500 970 | 341400 971 | 348697 972 | 350260 973 | 359030 974 | 360000 975 | 360600 976 | 376500 977 | 378265 978 | 383070 979 | 394740 980 | 410000 981 | 446000 982 | 471750 983 | 497384 984 | 510600 985 | 560000 986 | 590000 987 | 608400 988 | 696900 989 | 704000 990 | 1448374 991 | 2256800 992 | 3275000 993 | 3980000 994 | 4500000 995 | 5066940 996 | 5166299 997 | 7113500 998 | 9842447 999 | 13020696 1000 | 70477170 1001 | -------------------------------------------------------------------------------- /src/ru/README.md: -------------------------------------------------------------------------------- 1 | # Russian covering grammar definitions 2 | 3 | This directory defines a Russian text normalization covering grammar. The 4 | primary entry-point is the FST `VERBALIZER`, defined in 5 | `verbalizer/verbalizer.grm` and compiled in the FST archive 6 | `verbalizer/verbalizer.far`. 7 | -------------------------------------------------------------------------------- /src/ru/classifier/cyrillic.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export kRussianLowerAlpha = Optimize[ 16 | "а" | "б" | "в" | "г" | "д" | "е" | "ё" | "ж" | "з" | "и" | "й" | 17 | "к" | "л" | "м" | "н" | "о" | "п" | "р" | "с" | "т" | "у" | "ф" | 18 | "х" | "ц" | "ч" | "ш" | "щ" | "ъ" | "ы" | "ь" | "э" | "ю" | "я" ]; 19 | 20 | export kRussianUpperAlpha = Optimize[ 21 | "А" | "Б" | "В" | "Г" | "Д" | "Е" | "Ё" | "Ж" | "З" | "И" | "Й" | 22 | "К" | "Л" | "М" | "Н" | "О" | "П" | "Р" | "С" | "Т" | "У" | "Ф" | 23 | "Х" | "Ц" | "Ч" | "Ш" | "Щ" | "Ъ" | "Ы" | "Ь" | "Э" | "Ю" | "Я" ]; 24 | 25 | export kRussianLowerAlphaStressed = Optimize[ 26 | "а́" | "е́" | "ё́" | "и́" | "о́" | "у́" | "ы́" | "э́" | "ю́" | "я́" ]; 27 | 28 | export kRussianUpperAlphaStressed = Optimize[ 29 | "А́" | "Е́" | "Ё́" | "И́" | "О́" | "У́" | "Ы́" | "Э́" | "Ю́" | "Я́" ]; 30 | 31 | export kRussianRewriteStress = Optimize[ 32 | ("А́" : "А'") | ("Е́" : "Е'") | ("Ё́" : "Ё'") | ("И́" : "И'") | 33 | ("О́" : "О'") | ("У́" : "У'") | ("Ы́" : "Ы'") | ("Э́" : "Э'") | 34 | ("Ю́" : "Ю'") | ("Я́" : "Я'") | 35 | ("а́" : "а'") | ("е́" : "е'") | ("ё́" : "ё'") | ("и́" : "и'") | 36 | ("о́" : "о'") | ("у́" : "у'") | ("ы́" : "ы'") | ("э́" : "э'") | 37 | ("ю́" : "ю'") | ("я́" : "я'") 38 | ]; 39 | 40 | export kRussianRemoveStress = Optimize[ 41 | ("А́" : "А") | ("Е́" : "Е") | ("Ё́" : "Ё") | ("И́" : "И") | ("О́" : "О") | 42 | ("У́" : "У") | ("Ы́" : "Ы") | ("Э́" : "Э") | ("Ю́" : "Ю") | ("Я́" : "Я") | 
43 | ("а́" : "а") | ("е́" : "е") | ("ё́" : "ё") | ("и́" : "и") | ("о́" : "о") | 44 | ("у́" : "у") | ("ы́" : "ы") | ("э́" : "э") | ("ю́" : "ю") | ("я́" : "я") 45 | ]; 46 | 47 | # Pre-reform characters, just in case. 48 | export kRussianPreReform = Optimize[ 49 | "ѣ" | "Ѣ" # http://en.wikipedia.org/wiki/Yat 50 | ]; 51 | 52 | export kCyrillicAlphaStressed = Optimize[ 53 | kRussianLowerAlphaStressed | kRussianUpperAlphaStressed 54 | ]; 55 | 56 | export kCyrillicAlpha = Optimize[ 57 | kRussianLowerAlpha | kRussianUpperAlpha | kRussianPreReform 58 | ]; 59 | -------------------------------------------------------------------------------- /src/ru/verbalizer/cardinals-lex.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # AUTOMATICALLY GENERATED: DO NOT EDIT. 16 | import 'util/byte.grm' as b; 17 | 18 | # Utilities for insertion and deletion. 19 | 20 | func I[expr] { 21 | return "" : expr; 22 | } 23 | 24 | func D[expr] { 25 | return expr : ""; 26 | } 27 | 28 | # Powers of base 10. 
29 | export POWERS = 30 | "[E15]" 31 | | "[E14]" 32 | | "[E13]" 33 | | "[E12]" 34 | | "[E11]" 35 | | "[E10]" 36 | | "[E9]" 37 | | "[E8]" 38 | | "[E7]" 39 | | "[E6]" 40 | | "[E5]" 41 | | "[E4]" 42 | | "[E3]" 43 | | "[E2]" 44 | | "[E1]" 45 | ; 46 | 47 | export SIGMA = b.kBytes | POWERS; 48 | 49 | export SIGMA_STAR = SIGMA*; 50 | 51 | export SIGMA_PLUS = SIGMA+; 52 | 53 | ################################################################################ 54 | # BEGIN LANGUAGE SPECIFIC DATA 55 | revaluations = 56 | ("[E4]" : "[E1]") 57 | | ("[E5]" : "[E2]") 58 | | ("[E7]" : "[E1]") 59 | | ("[E8]" : "[E2]") 60 | ; 61 | 62 | Ms = "[E3]" | "[E6]" | "[E9]"; 63 | 64 | 65 | func Zero[expr] { 66 | return expr : (""); 67 | } 68 | 69 | space = " "; 70 | 71 | lexset3 = Optimize[ 72 | ("1[E1]+1" : "одиннадцати") 73 | | ("1[E1]+1" : "одиннадцать") 74 | | ("1[E1]+1" : "одиннадцатью") 75 | | ("1[E1]+2" : "двенадцати") 76 | | ("1[E1]+2" : "двенадцать") 77 | | ("1[E1]+2" : "двенадцатью") 78 | | ("1[E1]+3" : "тринадцати") 79 | | ("1[E1]+3" : "тринадцать") 80 | | ("1[E1]+3" : "тринадцатью") 81 | | ("1[E1]+4" : "четырнадцати") 82 | | ("1[E1]+4" : "четырнадцать") 83 | | ("1[E1]+4" : "четырнадцатью") 84 | | ("1[E1]+5" : "пятнадцати") 85 | | ("1[E1]+5" : "пятнадцать") 86 | | ("1[E1]+5" : "пятнадцатью") 87 | | ("1[E1]+6" : "шестнадцати") 88 | | ("1[E1]+6" : "шестнадцать") 89 | | ("1[E1]+6" : "шестнадцатью") 90 | | ("1[E1]+7" : "семнадцати") 91 | | ("1[E1]+7" : "семнадцать") 92 | | ("1[E1]+7" : "семнадцатью") 93 | | ("1[E1]+8" : "восемнадцати") 94 | | ("1[E1]+8" : "восемнадцать") 95 | | ("1[E1]+8" : "восемнадцатью") 96 | | ("1[E1]+9" : "девятнадцати") 97 | | ("1[E1]+9" : "девятнадцать") 98 | | ("1[E1]+9" : "девятнадцатью")] 99 | ; 100 | 101 | lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR]; 102 | 103 | lexset2 = Optimize[ 104 | ("1[E1]" : "десяти") 105 | | ("1[E1]" : "десять") 106 | | ("1[E1]" : "десятью") 107 | | ("1[E2]" : "ста") 108 | | ("1[E2]" : "сто") 109 | | ("2[E1]" : "двадцати") 
110 | | ("2[E1]" : "двадцать") 111 | | ("2[E1]" : "двадцатью") 112 | | ("2[E2]" : "двести") 113 | | ("2[E2]" : "двумстам") 114 | | ("2[E2]" : "двумястами") 115 | | ("2[E2]" : "двухсот") 116 | | ("2[E2]" : "двухстах") 117 | | ("3[E1]" : "тридцати") 118 | | ("3[E1]" : "тридцать") 119 | | ("3[E1]" : "тридцатью") 120 | | ("3[E2]" : "тремстам") 121 | | ("3[E2]" : "тремястами") 122 | | ("3[E2]" : "трехсот") 123 | | ("3[E2]" : "трехстах") 124 | | ("3[E2]" : "триста") 125 | | ("4[E1]" : "сорок") 126 | | ("4[E1]" : "сорока") 127 | | ("4[E2]" : "четыремстам") 128 | | ("4[E2]" : "четыреста") 129 | | ("4[E2]" : "четырехсот") 130 | | ("4[E2]" : "четырехстах") 131 | | ("4[E2]" : "четырьмястами") 132 | | ("5[E1]" : "пятидесяти") 133 | | ("5[E1]" : "пятьдесят") 134 | | ("5[E1]" : "пятьюдесятью") 135 | | ("5[E2]" : "пятисот") 136 | | ("5[E2]" : "пятистам") 137 | | ("5[E2]" : "пятистах") 138 | | ("5[E2]" : "пятьсот") 139 | | ("5[E2]" : "пятьюстами") 140 | | ("6[E1]" : "шестидесяти") 141 | | ("6[E1]" : "шестьдесят") 142 | | ("6[E1]" : "шестьюдесятью") 143 | | ("6[E2]" : "шестисот") 144 | | ("6[E2]" : "шестистам") 145 | | ("6[E2]" : "шестистах") 146 | | ("6[E2]" : "шестьсот") 147 | | ("6[E2]" : "шестьюстами") 148 | | ("7[E1]" : "семидесяти") 149 | | ("7[E1]" : "семьдесят") 150 | | ("7[E1]" : "семьюдесятью") 151 | | ("7[E2]" : "семисот") 152 | | ("7[E2]" : "семистам") 153 | | ("7[E2]" : "семистах") 154 | | ("7[E2]" : "семьсот") 155 | | ("7[E2]" : "семьюстами") 156 | | ("8[E1]" : "восемьдесят") 157 | | ("8[E1]" : "восьмидесяти") 158 | | ("8[E1]" : "восьмьюдесятью") 159 | | ("8[E2]" : "восемьсот") 160 | | ("8[E2]" : "восемьюстами") 161 | | ("8[E2]" : "восьмисот") 162 | | ("8[E2]" : "восьмистам") 163 | | ("8[E2]" : "восьмистах") 164 | | ("8[E2]" : "восьмьюстами") 165 | | ("9[E1]" : "девяноста") 166 | | ("9[E1]" : "девяносто") 167 | | ("9[E2]" : "девятисот") 168 | | ("9[E2]" : "девятистам") 169 | | ("9[E2]" : "девятистах") 170 | | ("9[E2]" : "девятьсот") 171 | | ("9[E2]" : "девятьюстами")] 
172 | ; 173 | 174 | lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR]; 175 | 176 | lexset1 = Optimize[ 177 | ("+" : "") 178 | | ("1" : "один") 179 | | ("1" : "одна") 180 | | ("1" : "одни") 181 | | ("1" : "одним") 182 | | ("1" : "одними") 183 | | ("1" : "одних") 184 | | ("1" : "одно") 185 | | ("1" : "одного") 186 | | ("1" : "одной") 187 | | ("1" : "одном") 188 | | ("1" : "одному") 189 | | ("1" : "одною") 190 | | ("1" : "одну") 191 | | ("2" : "два") 192 | | ("2" : "две") 193 | | ("2" : "двум") 194 | | ("2" : "двумя") 195 | | ("2" : "двух") 196 | | ("3" : "трем") 197 | | ("3" : "тремя") 198 | | ("3" : "трех") 199 | | ("3" : "три") 200 | | ("4" : "четыре") 201 | | ("4" : "четырем") 202 | | ("4" : "четырех") 203 | | ("4" : "четырьмя") 204 | | ("5" : "пяти") 205 | | ("5" : "пять") 206 | | ("5" : "пятью") 207 | | ("6" : "шести") 208 | | ("6" : "шесть") 209 | | ("6" : "шестью") 210 | | ("7" : "семи") 211 | | ("7" : "семь") 212 | | ("7" : "семью") 213 | | ("8" : "восемь") 214 | | ("8" : "восьми") 215 | | ("8" : "восьмью") 216 | | ("9" : "девяти") 217 | | ("9" : "девять") 218 | | ("9" : "девятью") 219 | | ("[E3]" : "тысяч") 220 | | ("[E3]" : "тысяча") 221 | | ("[E3]" : "тысячам") 222 | | ("[E3]" : "тысячами") 223 | | ("[E3]" : "тысячах") 224 | | ("[E3]" : "тысяче") 225 | | ("[E3]" : "тысячей") 226 | | ("[E3]" : "тысячи") 227 | | ("[E3]" : "тысячу") 228 | | ("[E3]" : "тысячью") 229 | | ("[E6]" : "миллион") 230 | | ("[E6]" : "миллиона") 231 | | ("[E6]" : "миллионам") 232 | | ("[E6]" : "миллионами") 233 | | ("[E6]" : "миллионах") 234 | | ("[E6]" : "миллионе") 235 | | ("[E6]" : "миллионов") 236 | | ("[E6]" : "миллионом") 237 | | ("[E6]" : "миллиону") 238 | | ("[E6]" : "миллионы") 239 | | ("[E9]" : "миллиард") 240 | | ("[E9]" : "миллиарда") 241 | | ("[E9]" : "миллиардам") 242 | | ("[E9]" : "миллиардами") 243 | | ("[E9]" : "миллиардах") 244 | | ("[E9]" : "миллиарде") 245 | | ("[E9]" : "миллиардов") 246 | | ("[E9]" : "миллиардом") 247 | | ("[E9]" : "миллиарду") 248 | | ("[E9]" 
: "миллиарды") 249 | | ("|0|" : "ноле") 250 | | ("|0|" : "нолем") 251 | | ("|0|" : "ноль") 252 | | ("|0|" : "нолю") 253 | | ("|0|" : "ноля") 254 | | ("|0|" : "нуле") 255 | | ("|0|" : "нулем") 256 | | ("|0|" : "нуль") 257 | | ("|0|" : "нулю") 258 | | ("|0|" : "нуля")] 259 | ; 260 | 261 | lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR]; 262 | 263 | export LEX = Optimize[lex3 @ lex2 @ lex1]; 264 | 265 | export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]"; 266 | 267 | # END LANGUAGE SPECIFIC DATA 268 | ################################################################################ 269 | # Inserts a marker after the Ms. 270 | export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR]; 271 | 272 | # Deletes all powers and "+". 273 | export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR]; 274 | 275 | # Deletes leading zeros at the beginning of a number, so that "0003" does not 276 | # get treated as an ordinary number. 277 | export DELETE_INITIAL_ZEROS = 278 | CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR] 279 | ; 280 | 281 | NonMs = Optimize[POWERS - Ms]; 282 | 283 | # Deletes (usually) zeros before a non-M. E.g., +0[E1] should be deleted. 284 | export DELETE_INTERMEDIATE_ZEROS1 = 285 | CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR] 286 | ; 287 | 288 | # Deletes (usually) zeros before an M, if there is no non-zero element between 289 | # that and the previous boundary. Thus, if after the result of the rule above we 290 | # end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final 291 | # zero. 292 | export DELETE_INTERMEDIATE_ZEROS2 = Optimize[ 293 | CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR] 294 | @ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]] 295 | ; 296 | 297 | # Final clean up of stray zeros. 298 | export DELETE_REMAINING_ZEROS = Optimize[ 299 | CDRewrite[Zero["+0"], "", "", SIGMA_STAR] 300 | @ CDRewrite[Zero["0"], "", "", SIGMA_STAR]] 301 | ; 302 | 303 | # Applies the revaluation map.
For example in English, changes [E4] to [E1] as a 304 | # modifier of [E3]. 305 | export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR]; 306 | 307 | # Deletes the various marks and powers in the input and output. 308 | export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR]; 309 | 310 | export CLEAN_SPACES = Optimize[ 311 | CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR] 312 | @ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR] 313 | @ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]] 314 | ; 315 | 316 | d = b.kDigit; 317 | 318 | # Germanic inversion rule. 319 | germanic = 320 | (I["1+"] d "[E1]" D["+1"]) 321 | | (I["2+"] d "[E1]" D["+2"]) 322 | | (I["3+"] d "[E1]" D["+3"]) 323 | | (I["4+"] d "[E1]" D["+4"]) 324 | | (I["5+"] d "[E1]" D["+5"]) 325 | | (I["6+"] d "[E1]" D["+6"]) 326 | | (I["7+"] d "[E1]" D["+7"]) 327 | | (I["8+"] d "[E1]" D["+8"]) 328 | | (I["9+"] d "[E1]" D["+9"]) 329 | ; 330 | 331 | germanic_inversion = 332 | CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt'] 333 | ; 334 | 335 | export GERMANIC_INVERSION = SIGMA_STAR; 336 | export ORDINAL_RESTRICTION = SIGMA_STAR; 337 | nondigits = b.kBytes - b.kDigit; 338 | export ORDINAL_SUFFIX = D[nondigits*]; 339 | -------------------------------------------------------------------------------- /src/ru/verbalizer/cardinals.tsv: -------------------------------------------------------------------------------- 1 | 0 ноле 2 | 0 ноль 3 | 0 нолю 4 | 0 ноля 5 | 0 нолём 6 | 0 нуле 7 | 0 нуль 8 | 0 нулю 9 | 0 нуля 10 | 0 нулём 11 | 1 один 12 | 1 одна 13 | 1 одни 14 | 1 одним 15 | 1 одними 16 | 1 одних 17 | 1 одно 18 | 1 одного 19 | 1 одной 20 | 1 одном 21 | 1 одному 22 | 1 одною 23 | 1 раз 24 | 1 одну 25 | 2 два 26 | 2 две 27 | 2 двум 28 | 2 двумя 29 | 2 двух 30 | 3 тремя 31 | 3 три 32 | 3 трём 33 | 3 трёх 34 | 4 четыре 35 | 4 четырьмя 36 | 4 четырём 37 | 4 четырёх 38 | 5 пяти 39 | 5 пять 40 | 5 пятью 41 | 6 шести 42 | 6 шесть 43 | 6 шестью 44 | 7 семи 45 | 7 семь 46 | 7 семью 47 | 8 
восемь 48 | 8 восьми 49 | 8 восьмью 50 | 9 девяти 51 | 9 девять 52 | 9 девятью 53 | 10 десяти 54 | 10 десять 55 | 10 десятью 56 | 11 одиннадцати 57 | 11 одиннадцать 58 | 11 одиннадцатью 59 | 12 двенадцати 60 | 12 двенадцать 61 | 12 двенадцатью 62 | 13 тринадцати 63 | 13 тринадцать 64 | 13 тринадцатью 65 | 14 четырнадцати 66 | 14 четырнадцать 67 | 14 четырнадцатью 68 | 15 пятнадцати 69 | 15 пятнадцать 70 | 15 пятнадцатью 71 | 16 шестнадцати 72 | 16 шестнадцать 73 | 16 шестнадцатью 74 | 17 семнадцати 75 | 17 семнадцать 76 | 17 семнадцатью 77 | 18 восемнадцати 78 | 18 восемнадцать 79 | 18 восемнадцатью 80 | 19 девятнадцати 81 | 19 девятнадцать 82 | 19 девятнадцатью 83 | 20 двадцати 84 | 20 двадцать 85 | 20 двадцатью 86 | 30 тридцати 87 | 30 тридцать 88 | 30 тридцатью 89 | 40 сорок 90 | 40 сорока 91 | 50 пятидесяти 92 | 50 пятьдесят 93 | 50 пятьюдесятью 94 | 60 шестидесяти 95 | 60 шестьдесят 96 | 60 шестьюдесятью 97 | 70 семидесяти 98 | 70 семьдесят 99 | 70 семьюдесятью 100 | 80 восемьдесят 101 | 80 восьмидесяти 102 | 80 восьмьюдесятью 103 | 90 девяноста 104 | 90 девяносто 105 | 100 ста 106 | 100 сто 107 | 200 двести 108 | 200 двумстам 109 | 200 двумястами 110 | 200 двухсот 111 | 200 двухстах 112 | 300 тремястами 113 | 300 трехсот 114 | 300 триста 115 | 300 трёмстам 116 | 300 трёхстах 117 | 400 четыреста 118 | 400 четырьмястами 119 | 400 четырёмстам 120 | 400 четырёхсот 121 | 400 четырёхстах 122 | 500 пятисот 123 | 500 пятистам 124 | 500 пятистах 125 | 500 пятьсот 126 | 500 пятьюстами 127 | 600 шестисот 128 | 600 шестистам 129 | 600 шестистах 130 | 600 шестьсот 131 | 600 шестьюстами 132 | 700 семисот 133 | 700 семистам 134 | 700 семистах 135 | 700 семьсот 136 | 700 семьюстами 137 | 800 восемьсот 138 | 800 восемьюстами 139 | 800 восьмисот 140 | 800 восьмистам 141 | 800 восьмистах 142 | 800 восьмьюстами 143 | 900 девятисот 144 | 900 девятистам 145 | 900 девятистах 146 | 900 девятьсот 147 | 900 девятьюстами 148 | 1000 тысяч 149 | 1000 тысяча 150 | 1000 тысячам 151 | 1000 
тысячами 152 | 1000 тысячах 153 | 1000 тысяче 154 | 1000 тысячей 155 | 1000 тысячи 156 | 1000 тысячу 157 | 1000 тысячью 158 | 1000000 миллион 159 | 1000000 миллиона 160 | 1000000 миллионам 161 | 1000000 миллионами 162 | 1000000 миллионах 163 | 1000000 миллионе 164 | 1000000 миллионов 165 | 1000000 миллионом 166 | 1000000 миллиону 167 | 1000000 миллионы 168 | 1000000000 миллиард 169 | 1000000000 миллиарда 170 | 1000000000 миллиардам 171 | 1000000000 миллиардами 172 | 1000000000 миллиардах 173 | 1000000000 миллиарде 174 | 1000000000 миллиардов 175 | 1000000000 миллиардом 176 | 1000000000 миллиарду 177 | 1000000000 миллиарды 178 | -------------------------------------------------------------------------------- /src/ru/verbalizer/extra_numbers.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import 'util/byte.grm' as b; 16 | import 'ru/verbalizer/numbers.grm' as n; 17 | 18 | digit = b.kDigit @ n.CARDINAL_NUMBERS | ("0" : "@@OTHER_ZERO_VERBALIZATIONS@@"); 19 | 20 | export DIGITS = digit (n.I[" "] digit)*; 21 | 22 | # Various common factorizations 23 | 24 | two_digits = b.kDigit{2} @ n.CARDINAL_NUMBERS; 25 | 26 | three_digits = b.kDigit{3} @ n.CARDINAL_NUMBERS; 27 | 28 | mixed = 29 | (digit n.I[" "] two_digits) 30 | | (two_digits n.I[" "] two_digits) 31 | | (two_digits n.I[" "] three_digits) 32 | | (two_digits n.I[" "] two_digits n.I[" "] two_digits) 33 | ; 34 | 35 | export MIXED_NUMBERS = Optimize[mixed]; 36 | -------------------------------------------------------------------------------- /src/ru/verbalizer/factorization.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import 'util/byte.grm' as b; 16 | import 'util/util.grm' as u; 17 | import 'ru/verbalizer/numbers.grm' as n; 18 | 19 | func ToNumberName[expr] { 20 | number_name_seq = n.CARDINAL_NUMBERS (" " n.CARDINAL_NUMBERS)*; 21 | return Optimize[expr @ number_name_seq]; 22 | } 23 | 24 | d = b.kDigit; 25 | 26 | leading_zero = CDRewrite[n.I[" "], ("[BOS]" | " ") "0", "", b.kBytes*]; 27 | 28 | by_ones = d n.I[" "]; 29 | by_twos = (d{2} @ leading_zero) n.I[" "]; 30 | by_threes = (d{3} @ leading_zero) n.I[" "]; 31 | 32 | groupings = by_twos* (by_threes | by_twos | by_ones); 33 | 34 | export FRACTIONAL_PART_UNGROUPED = 35 | Optimize[ToNumberName[by_ones+ @ u.CLEAN_SPACES]] 36 | ; 37 | export FRACTIONAL_PART_GROUPED = 38 | Optimize[ToNumberName[groupings @ u.CLEAN_SPACES]] 39 | ; 40 | export FRACTIONAL_PART_UNPARSED = Optimize[ToNumberName[d*]]; 41 | -------------------------------------------------------------------------------- /src/ru/verbalizer/float.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import 'ru/verbalizer/factorization.grm' as f;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;

# The three available readings of the digits after the decimal marker.
fractional_part_ungrouped = f.FRACTIONAL_PART_UNGROUPED;
fractional_part_grouped = f.FRACTIONAL_PART_GROUPED;
fractional_part_unparsed = f.FRACTIONAL_PART_UNPARSED;

# Settings: which fractional reading is used, and the decimal marker character.
__fractional_part__ = fractional_part_unparsed;
__decimal_marker__ = ",";

# Integer part read as a cardinal number.
integer_part = n.CARDINAL_NUMBERS;

# The decimal marker is replaced by a tag that LEXICAL_MAP expands afterwards.
marker = __decimal_marker__ : " @@DECIMAL_DOT_EXPRESSION@@ ";

tagged_float = integer_part marker __fractional_part__;

export FLOAT = Optimize[tagged_float @ l.LEXICAL_MAP];
-------------------------------------------------------------------------------- /src/ru/verbalizer/g.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research-datasets/TextNormalizationCoveringGrammars/37f5fb8a9e81a2512f7ea78e88fad3210acf6fe6/src/ru/verbalizer/g.fst -------------------------------------------------------------------------------- /src/ru/verbalizer/lexical_map.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
import 'util/byte.grm' as b;

# Tag-to-word pairs loaded from the lexicon file.
tag_to_word = StringFile['ru/verbalizer/lexical_map.tsv'];

any_bytes = b.kBytes*;

# Rewrites every occurrence of a tag, in any context, to one of its words.
expand_tags = CDRewrite[tag_to_word, "", "", any_bytes];

# Removes the "__NULL__" placeholder wherever it occurs.
strip_null = CDRewrite["__NULL__" : "", "", "", any_bytes];

export LEXICAL_MAP = Optimize[expand_tags @ strip_null];
-------------------------------------------------------------------------------- /src/ru/verbalizer/lexical_map.tsv: -------------------------------------------------------------------------------- 1 | @@CONNECTOR_RANGE@@ до 2 | @@CONNECTOR_RATIO@@ к 3 | @@CONNECTOR_BY@@ на 4 | @@CONNECTOR_CONSECUTIVE_YEAR@@ до 5 | @@JANUARY@@ январь 6 | @@JANUARY@@ январи 7 | @@JANUARY@@ января 8 | @@JANUARY@@ январей 9 | @@JANUARY@@ январю 10 | @@JANUARY@@ январям 11 | @@JANUARY@@ январь 12 | @@JANUARY@@ январи 13 | @@JANUARY@@ январём 14 | @@JANUARY@@ январями 15 | @@JANUARY@@ январе 16 | @@JANUARY@@ январях 17 | @@FEBRUARY@@ февраль 18 | @@FEBRUARY@@ феврали 19 | @@FEBRUARY@@ февраля 20 | @@FEBRUARY@@ февралей 21 | @@FEBRUARY@@ февралю 22 | @@FEBRUARY@@ февралям 23 | @@FEBRUARY@@ февраль 24 | @@FEBRUARY@@ феврали 25 | @@FEBRUARY@@ февралём 26 | @@FEBRUARY@@ февралями 27 | @@FEBRUARY@@ феврале 28 | @@FEBRUARY@@ февралях 29 | @@MARCH@@ март 30 | @@MARCH@@ марты 31 | @@MARCH@@ марта 32 | @@MARCH@@ мартов 33 | @@MARCH@@ марту 34 | @@MARCH@@ мартам 35 | @@MARCH@@ март 36 | @@MARCH@@ марты 37 | @@MARCH@@ мартом 38 | @@MARCH@@ мартами 39 | @@MARCH@@ марте 40 | @@MARCH@@ мартах 41 | @@APRIL@@ апрель 42 | @@APRIL@@ апрели 43 | @@APRIL@@ апреля 44 | @@APRIL@@ апрелей 45 | @@APRIL@@ апрелю 46 | @@APRIL@@ апрелям 47 | @@APRIL@@ апрель 48 | @@APRIL@@ апрели 49 | @@APRIL@@ апрелем 50 | @@APRIL@@ апрелями 51 | @@APRIL@@ апреле 52 | @@APRIL@@ апрелях 53 | @@MAY@@ май 54 | @@MAY@@ маи 55 | @@MAY@@ мая 56 | @@MAY@@ маев 57 | @@MAY@@ маю 58 | @@MAY@@ маям 59 | @@MAY@@ май 60 | @@MAY@@ маи 61 | @@MAY@@ маем 62 | @@MAY@@ маями 63 | @@MAY@@ мае 64 | @@MAY@@ маях 65 | @@JUN@@ июнь
66 | @@JUN@@ июни 67 | @@JUN@@ июня 68 | @@JUN@@ июней 69 | @@JUN@@ июню 70 | @@JUN@@ июням 71 | @@JUN@@ июнь 72 | @@JUN@@ июни 73 | @@JUN@@ июнем 74 | @@JUN@@ июнями 75 | @@JUN@@ июне 76 | @@JUN@@ июнях 77 | @@JUL@@ июль 78 | @@JUL@@ июли 79 | @@JUL@@ июля 80 | @@JUL@@ июлей 81 | @@JUL@@ июлю 82 | @@JUL@@ июлям 83 | @@JUL@@ июль 84 | @@JUL@@ июли 85 | @@JUL@@ июлем 86 | @@JUL@@ июлями 87 | @@JUL@@ июле 88 | @@JUL@@ июлях 89 | @@AUGUST@@ август 90 | @@AUGUST@@ августы 91 | @@AUGUST@@ августа 92 | @@AUGUST@@ августов 93 | @@AUGUST@@ августу 94 | @@AUGUST@@ августам 95 | @@AUGUST@@ август 96 | @@AUGUST@@ августы 97 | @@AUGUST@@ августом 98 | @@AUGUST@@ августами 99 | @@AUGUST@@ августе 100 | @@AUGUST@@ августах 101 | @@SEPTEMBER@@ сентябрь 102 | @@SEPTEMBER@@ сентябри 103 | @@SEPTEMBER@@ сентября 104 | @@SEPTEMBER@@ сентябрей 105 | @@SEPTEMBER@@ сентябрю 106 | @@SEPTEMBER@@ сентябрям 107 | @@SEPTEMBER@@ сентябрь 108 | @@SEPTEMBER@@ сентябри 109 | @@SEPTEMBER@@ сентябрём 110 | @@SEPTEMBER@@ сентябрями 111 | @@SEPTEMBER@@ сентябре 112 | @@SEPTEMBER@@ сентябрях 113 | @@OCTOBER@@ октябрь 114 | @@OCTOBER@@ октябри 115 | @@OCTOBER@@ октября 116 | @@OCTOBER@@ октябрей 117 | @@OCTOBER@@ октябрю 118 | @@OCTOBER@@ октябрям 119 | @@OCTOBER@@ октябрь 120 | @@OCTOBER@@ октябри 121 | @@OCTOBER@@ октябрём 122 | @@OCTOBER@@ октябрями 123 | @@OCTOBER@@ октябре 124 | @@OCTOBER@@ октябрях 125 | @@NOVEMBER@@ ноябрь 126 | @@NOVEMBER@@ ноябри 127 | @@NOVEMBER@@ ноября 128 | @@NOVEMBER@@ ноябрей 129 | @@NOVEMBER@@ ноябрю 130 | @@NOVEMBER@@ ноябрям 131 | @@NOVEMBER@@ ноябрь 132 | @@NOVEMBER@@ ноябри 133 | @@NOVEMBER@@ ноябрём 134 | @@NOVEMBER@@ ноябрями 135 | @@NOVEMBER@@ ноябре 136 | @@NOVEMBER@@ ноябрях 137 | @@DECEMBER@@ декабрь 138 | @@DECEMBER@@ декабри 139 | @@DECEMBER@@ декабря 140 | @@DECEMBER@@ декабрей 141 | @@DECEMBER@@ декабрю 142 | @@DECEMBER@@ декабрям 143 | @@DECEMBER@@ декабрь 144 | @@DECEMBER@@ декабри 145 | @@DECEMBER@@ декабрём 146 | @@DECEMBER@@ декабрями 147 | 
@@DECEMBER@@ декабре 148 | @@DECEMBER@@ декабрях 149 | @@MINUS@@ минус 150 | @@DECIMAL_DOT_EXPRESSION@@ целая 151 | @@DECIMAL_DOT_EXPRESSION@@ целой 152 | @@DECIMAL_DOT_EXPRESSION@@ целой 153 | @@DECIMAL_DOT_EXPRESSION@@ целую 154 | @@DECIMAL_DOT_EXPRESSION@@ целой 155 | @@DECIMAL_DOT_EXPRESSION@@ целой 156 | @@DECIMAL_DOT_EXPRESSION@@ целым 157 | @@DECIMAL_DOT_EXPRESSION@@ целыми 158 | @@DECIMAL_DOT_EXPRESSION@@ целых 159 | @@DECIMAL_DOT_EXPRESSION@@ целых 160 | @@URL_DOT_EXPRESSION@@ точка 161 | @@PERIOD@@ точка 162 | @@DECIMAL_EXPONENT@@ умножить на десять в степени 163 | @@COLON@@ двоеточие 164 | @@SLASH@@ косая черта 165 | @@PASSWORD@@ пароль 166 | @@AT@@ собака 167 | @@PORT@@ порт 168 | @@QUESTION_MARK@@ вопросительный знак 169 | @@HASH@@ решётка 170 | @@HASH@@ решетка 171 | @@MONEY_AND@@ и 172 | @@AND@@ и 173 | @@PHONE_PLUS@@ плюс 174 | @@ARITHMETIC_PLUS@@ плюс 175 | @@PHONE_EXTENSION@@ добавочный номер 176 | @@TIME_AM@@ утра 177 | @@TIME_PM@@ вечера 178 | @@HOUR@@ час 179 | @@HOUR@@ часа 180 | @@HOUR@@ часам 181 | @@HOUR@@ часами 182 | @@HOUR@@ часах 183 | @@HOUR@@ часе 184 | @@HOUR@@ часов 185 | @@HOUR@@ часом 186 | @@HOUR@@ часу 187 | @@HOUR@@ часы 188 | @@MINUTE@@ минут 189 | @@MINUTE@@ минута 190 | @@MINUTE@@ минутам 191 | @@MINUTE@@ минутами 192 | @@MINUTE@@ минутах 193 | @@MINUTE@@ минуте 194 | @@MINUTE@@ минутой 195 | @@MINUTE@@ минутою 196 | @@MINUTE@@ минуту 197 | @@MINUTE@@ минуты 198 | @@TIME_AFTER@@ __NULL__ 199 | @@TIME_BEFORE_PRE@@ без 200 | @@TIME_QUARTER@@ четверть 201 | @@TIME_QUARTER@@ четверти 202 | @@TIME_HALF@@ половина 203 | @@TIME_HALF@@ половины 204 | @@TIME_HALF@@ половину 205 | @@TIME_HALF@@ половин 206 | @@TIME_HALF@@ половине 207 | @@TIME_HALF@@ половинам 208 | @@TIME_HALF@@ половиной 209 | @@TIME_HALF@@ половинами 210 | @@TIME_HALF@@ половинах 211 | @@PERCENT@@ процент 212 | @@PERCENT@@ процента 213 | @@PERCENT@@ процентам 214 | @@PERCENT@@ процентами 215 | @@PERCENT@@ процентах 216 | @@PERCENT@@ проценте 217 | @@PERCENT@@ 
процентов 218 | @@PERCENT@@ процентом 219 | @@PERCENT@@ проценту 220 | @@PERCENT@@ проценты 221 | @@PERCENT@@ проценты 222 | -------------------------------------------------------------------------------- /src/ru/verbalizer/math.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.

import 'ru/verbalizer/float.grm' as f;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;

# An operand is either a cardinal number or a decimal number.
operand = n.CARDINAL_NUMBERS | f.FLOAT;

# Binary operator signs, each replaced by a space-padded tag.
# NOTE(review): lexical_map.tsv as shown in this dump has no rows for
# @@ARITHMETIC_TIMES@@, @@ARITHMETIC_MINUS@@ or @@ARITHMETIC_DIVISION@@, so
# those tags appear to pass through LEXICAL_MAP unchanged -- confirm intended.
binary_op =
    ("+" : " @@ARITHMETIC_PLUS@@ ")
  | ("*" : " @@ARITHMETIC_TIMES@@ ")
  | ("-" : " @@ARITHMETIC_MINUS@@ ")
  | ("/" : " @@ARITHMETIC_DIVISION@@ ")
;

# Percent sign; the tag gets a leading space only.
percent_sign = "%" : " @@PERCENT@@";

binary_expression = operand binary_op operand;
percentage = operand percent_sign;

# Either "<number> <operator> <number>" or "<number>%", with tags expanded.
export ARITHMETIC =
  Optimize[(binary_expression | percentage) @ l.LEXICAL_MAP]
;
-------------------------------------------------------------------------------- /src/ru/verbalizer/miscellaneous.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.

import 'util/byte.grm' as b;
import 'ru/classifier/cyrillic.grm' as c;
import 'ru/verbalizer/extra_numbers.grm' as e;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;
import 'ru/verbalizer/spelled.grm' as s;

# A word is a non-empty run of Latin or Cyrillic letters.
alpha = b.kAlpha | c.kCyrillicAlpha;
hyphen = "-";
word = alpha+;

# Inserted word separator.
gap = n.I[" "];

# Words joined by "-" or "." are read as separate words (the joiner becomes a
# space); one trailing "." is dropped.
joiner_to_space = (hyphen | ".") : " ";
split_word = word (joiner_to_space word)* n.D["."]?;

# Symbols that may follow a word, with their alternative tags.
plus_after = "+" : ("@@ARITHMETIC_PLUS@@" | "@@POSITIVE@@");
minus_after = "-" : ("@@ARITHMETIC_MINUS@@" | "@@NEGATIVE@@");
star_after = "*" : "@@STAR@@";
trailing_symbol = plus_after | minus_after | star_after;

# Symbols that may precede a word.
leading_symbol =
    ("@" : "@@AT@@")
  | ("/" : "@@SLASH@@")
  | ("#" : "@@HASH@@")
;

word_then_symbol = split_word gap trailing_symbol;

symbol_then_word = leading_symbol gap split_word;

## Number/digit sequence combos, maybe with a dash

# A word read through the spelling grammar s.SPELLED_NO_LETTER.
spelled = word @ s.SPELLED_NO_LETTER;

wordlike = word | spelled;
numeric = e.DIGITS | n.CARDINAL_NUMBERS | e.MIXED_NUMBERS;

# Either nothing or a dash stands between the two parts; both yield a space.
link = gap | (hyphen : " ");

word_then_number = wordlike link numeric;

number_then_word = numeric link wordlike;

## Two-digit year.

# An apostrophe followed by two digits; the apostrophe is dropped and the
# digits are read as a cardinal number or digit by digit.
# NOTE(review): the original author remarked that ordinals arguably have to be
# allowed here too, since some languages read years that way; only cardinal and
# digit readings are produced.
apostrophe_year = n.D["'"] (b.kDigit{2} @ (n.CARDINAL_NUMBERS | e.DIGITS));

# ".com" with the dot replaced by the URL-dot tag.
dot_com_suffix = ("." : "@@URL_DOT_EXPRESSION@@") gap "com";

all_patterns = Optimize[
    split_word
  | word_then_symbol
  | symbol_then_word
  | word_then_number
  | number_then_word
  | apostrophe_year
  | dot_com_suffix
];

export MISCELLANEOUS = Optimize[all_patterns @ l.LEXICAL_MAP];
-------------------------------------------------------------------------------- /src/ru/verbalizer/money.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.

import 'util/byte.grm' as b;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;

card = n.CARDINAL_NUMBERS;

# Currency-unit word forms, keyed by unit identifier.
__currency__ = StringFile['ru/verbalizer/money.tsv'];

any_digit = b.kDigit;
nonzero_digit = any_digit - "0";

# Exactly two digits read as a cardinal; a leading "0" is dropped first.
cents = ((n.D["0"] | nonzero_digit) any_digit) @ card;

# Only dollar for the verbalizer tests for English. Will need to add other
# currencies.
# All word forms listed in money.tsv for the major and the minor dollar unit.
major_unit = Project["usd_maj" @ __currency__, 'output'];
minor_unit = Project["usd_min" @ __currency__, 'output'];

# What is inserted between the dollar part and the cent part: either the
# @@MONEY_AND@@ tag or a plain space.
connector = " @@MONEY_AND@@ " | " ";

# "$<amount>": the "$" is dropped and the major unit is inserted after the
# amount. This prefix is shared by every pattern below.
whole_dollars = n.D["$"] card n.I[" " major_unit];

# "$<amount>.<two digits>": dollars, connector, cents, minor unit.
with_cents =
  whole_dollars n.I[connector] n.D["."] cents n.I[" " minor_unit]
;

# "$<amount>.00": the zero cents are dropped altogether.
zero_cents = whole_dollars n.D["."] n.D["00"];

usd = Optimize[with_cents | zero_cents | whole_dollars];

export MONEY = Optimize[usd @ l.LEXICAL_MAP];
-------------------------------------------------------------------------------- /src/ru/verbalizer/money.tsv: -------------------------------------------------------------------------------- 1 | usd_maj доллара 2 | usd_maj долларами 3 | usd_maj долларам 4 | usd_maj долларах 5 | usd_maj долларе 6 | usd_maj долларов 7 | usd_maj долларом 8 | usd_maj доллар 9 | usd_maj доллар 10 | usd_maj доллару 11 | usd_maj доллары 12 | usd_maj доллары 13 | usd_min цент 14 | usd_min цент 15 | usd_min цента 16 | usd_min центам 17 | usd_min центами 18 | usd_min центах 19 | usd_min центе 20 | usd_min центов 21 | usd_min центом 22 | usd_min центу 23 | usd_min центы 24 | usd_min центы 25 | -------------------------------------------------------------------------------- /src/ru/verbalizer/nominatives.tsv: -------------------------------------------------------------------------------- 1 | нуль 2 | ноль 3 | один 4 | два 5 | две 6 | три 7 | четыре 8 | пять 9 | шесть 10 | семь 11 | восемь 12 | девять 13 | десять 14 | одиннадцать 15 | двенадцать 16 | тринадцать 17 | четырнадцать 18 | пятнадцать 19 | шестнадцать 20 | семнадцать 21 | восемнадцать 22 | девятнадцать 23 | двадцать 24 | тридцать 25 | сорок 26 | пятьдесят 27 | шестьдесят 28 | семьдесят 29 | восемьдесят 30 | девяносто 31 | сто 32 | двести 33 | триста 34 | четыреста 35 | пятьсот 36 | шестьсот 37 | семьсот 38 | восемьсот 39 | девятьсот 40 | тысячи 41 | тысяч 42 | тысяча 43 | миллионов 44 | миллион 45 | миллиона 46 | миллиардов 47 | миллиард
48 | миллиарда 49 | первая 50 | первого 51 | первое 52 | первый 53 | вторая 54 | второе 55 | второй 56 | третий 57 | третье 58 | третья 59 | четвертая 60 | четвертое 61 | четвертой 62 | пятая 63 | пятое 64 | пятой 65 | шестая 66 | шестое 67 | шестой 68 | седьмая 69 | седьмое 70 | седьмой 71 | восьмая 72 | восьмое 73 | восьмой 74 | девятая 75 | девятое 76 | девятой 77 | десятая 78 | десятое 79 | десятой 80 | одиннадцатая 81 | одиннадцатое 82 | одиннадцатой 83 | двенадцатая 84 | двенадцатое 85 | двенадцатой 86 | тринадцатая 87 | тринадцатое 88 | тринадцатой 89 | четырнадцатая 90 | четырнадцатое 91 | четырнадцатой 92 | пятнадцатая 93 | пятнадцатое 94 | пятнадцатой 95 | шестнадцатая 96 | шестнадцатое 97 | шестнадцатой 98 | семнадцатая 99 | семнадцатое 100 | семнадцатой 101 | восемнадцатая 102 | восемнадцатое 103 | восемнадцатой 104 | девятнадцатая 105 | девятнадцатое 106 | девятнадцатой 107 | двадцатая 108 | двадцатое 109 | двадцатой 110 | тридцатая 111 | тридцатое 112 | тридцатой 113 | сороковая 114 | сороковое 115 | сороковой 116 | пятидесятая 117 | пятидесятое 118 | пятидесятой 119 | шестидесятая 120 | шестидесятое 121 | шестидесятой 122 | семидесятая 123 | семидесятое 124 | семидесятой 125 | восьмидесятая 126 | восьмидесятое 127 | восьмидесятой 128 | девяностая 129 | девяностое 130 | девяностой 131 | сотая 132 | сотое 133 | сотой 134 | двухсотая 135 | двухсотое 136 | двухсотой 137 | трехсотая 138 | трехсотое 139 | трехсотой 140 | четырехсотая 141 | четырехсотое 142 | четырехсотой 143 | пятисотая 144 | пятисотое 145 | пятисотой 146 | шестисотая 147 | шестисотое 148 | шестисотой 149 | семисотая 150 | семисотое 151 | семисотой 152 | восьмисотая 153 | восьмисотое 154 | восьмисотой 155 | девятисотая 156 | девятисотое 157 | девятисотой 158 | тысячная 159 | тысячное 160 | тысячной 161 | миллионная 162 | миллионное 163 | миллионной 164 | миллиардная 165 | миллиардное 166 | миллиардной 167 | -------------------------------------------------------------------------------- 
/src/ru/verbalizer/number_names.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
#
# Russian minimally supervised number grammar.
#
# Supports cardinals and ordinals in all inflected forms.
#
# The language-specific acceptor G was compiled with digit, teen, decade,
# century, and big power-of-ten preterminals. The lexicon transducer is
# highly ambiguous, but no LM is used.

import 'util/arithmetic.grm' as a;

# Intersects the universal factorization transducer (F) with the
# language-specific acceptor (G). F is composed onto G three times, optimizing
# after each step, before a.DELTA_STAR is composed on top.

delta = a.DELTA_STAR;
factorizer = a.IARITHMETIC_RESTRICTED;
acceptor = LoadFst['ru/verbalizer/g.fst'];

pass1 = Optimize[factorizer @ acceptor];
pass2 = Optimize[factorizer @ pass1];
pass3 = Optimize[factorizer @ pass2];
fg = Optimize[delta @ pass3];

# Compile-time check: "230" must factor as 200 + 30.
test1 = AssertEqual["230" @ fg, "(+ 200 30 +)"];

# Compiles lexicon transducers (L): space-separated sequences of number words.

# Cardinal lexicon: one or more cardinal entries.
cardinal_entry = StringFile['ru/verbalizer/cardinals.tsv'];
cardinal_l = Optimize[(cardinal_entry " ")* cardinal_entry];

# Ordinal lexicon: any number of cardinal entries followed by one final
# ordinal entry.
ordinal_entry = StringFile['ru/verbalizer/ordinals.tsv'];
ordinal_l = Optimize[(cardinal_entry " ")* ordinal_entry];

# Composes L with the leaf transducer (P), then composes that with FG.
leaves = a.LEAVES;

cardinal_leaves = leaves @ cardinal_l;
ordinal_leaves = leaves @ ordinal_l;

export CARDINAL_NUMBER_NAME = Optimize[fg @ cardinal_leaves];

export ORDINAL_NUMBER_NAME = Optimize[fg @ ordinal_leaves];
-------------------------------------------------------------------------------- /src/ru/verbalizer/numbers.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.

import 'ru/verbalizer/number_names.grm' as n;
import 'universal/thousands_punct.grm' as t;
import 'util/byte.grm' as b;

# Word forms listed in nominatives.tsv.
nominative_forms = StringFile['ru/verbalizer/nominatives.tsv'];

sigma_star = b.kBytes*;

# Attaches weight -1 to every listed form that stands as a whole token, i.e.
# is bounded by string start or a space on the left and by a space or string
# end on the right. No export in this file references it at present.
token_start = "[BOS]" | " ";
token_end = " " | "[EOS]";
nominative_filter =
  CDRewrite[nominative_forms ("" : "" <-1>), token_start, token_end, sigma_star]
;

cardinal = n.CARDINAL_NUMBER_NAME;
ordinal = n.ORDINAL_NUMBER_NAME;

# Putting these here since this grammar gets incorporated by all the others.

# Inserts expr: maps the empty string to expr.
func I[expr] {
  return "" : expr;
}

# Deletes expr: maps expr to the empty string.
func D[expr] {
  return expr : "";
}

# Since we know this is the default for Russian, it's fair game to set it.
# Accepted digit-string layouts: dot-delimited thousands or no delimiter.
delimited = t.dot_thousands | t.no_delimiter;

export CARDINAL_NUMBERS = Optimize[delimited @ cardinal];

export ORDINAL_NUMBERS_UNMARKED = Optimize[delimited @ ordinal];


# Entries of ordinal_endings.tsv; judging by rows such as "ый-й", each entry
# appears to join a word-final ending and a written suffix with a dash.
ending_pairs = StringFile['ru/verbalizer/ordinal_endings.tsv'];

# One or more bytes, none of which is a dash.
suffix_chars = (b.kBytes - "-")+;

# Deletes a string-final "-<suffix>".
strip_suffix = CDRewrite[("-" suffix_chars) : "", "", "[EOS]", sigma_star];

# Step 1: read the digits as an ordinal and carry the written "-<suffix>" along.
ordinal_plus_suffix = Optimize[Optimize[delimited @ ordinal] "-" suffix_chars];

# Step 2: keep only results that end in one of the listed ending entries.
ends_in_listed_pair = Optimize[sigma_star ending_pairs];

# Needs nominative_filter here if we take out Kyle's models.
export ORDINAL_NUMBERS_MARKED = Optimize[
  ordinal_plus_suffix
  @ ends_in_listed_pair
  @ strip_suffix]
;

export ORDINAL_NUMBERS =
  Optimize[ORDINAL_NUMBERS_MARKED | ORDINAL_NUMBERS_UNMARKED]
;
-------------------------------------------------------------------------------- /src/ru/verbalizer/numbers_plus.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
#
# Grammar for things built mostly on numbers.

import 'ru/verbalizer/factorization.grm' as f;
import 'ru/verbalizer/lexical_map.grm' as l;
import 'ru/verbalizer/numbers.grm' as n;

# Shorthands used by the categories that follow.
num = n.CARDINAL_NUMBERS;
ord = n.ORDINAL_NUMBERS_UNMARKED;
digits = f.FRACTIONAL_PART_UNGROUPED;

# Various symbols.
# Each symbol is replaced by a tag that the lexical map expands later.
plus_sym = "+" : "@@ARITHMETIC_PLUS@@";
minus_sym = "-" : "@@ARITHMETIC_MINUS@@";
slash_sym = "/" : "@@SLASH@@";
dot_sym = "." : "@@URL_DOT_EXPRESSION@@";
dash_sym = "-" : "@@DASH@@";
equals_sym = "=" : "@@ARITHMETIC_EQUALS@@";

degree_sym = "°" : "@@DEGREE@@";

division_sym = ("/" | "÷") : "@@ARITHMETIC_DIVISION@@";

times_sym = ("x" | "*") : "@@ARITHMETIC_TIMES@@";

power_sym = "^" : "@@DECIMAL_EXPONENT@@";

square_root_sym = "√" : "@@SQUARE_ROOT@@";

percent_sym = "%" : "@@PERCENT@@";

# Safe roman numbers.

# NB: Do not change the formatting here. NO_EDIT must be on the same
# line as the path.
rfile =
  'universal/roman_numerals.tsv'  # NO_EDIT
;

roman = StringFile[rfile];

## Main categories.

# Inserted word separator.
sp = n.I[" "];

# Numbers chained by dots: at least three numbers, i.e. at least two dots.
dot_then_num = sp dot_sym sp num;
cat_dot_number = num dot_then_num dot_then_num+;

# Numbers chained by slashes: at least two numbers.
slash_then_num = sp slash_sym sp num;
cat_slash_number = num slash_then_num slash_then_num*;

# Numbers chained by dashes: at least two numbers.
dash_then_num = sp dash_sym sp num;
cat_dash_number = num dash_then_num dash_then_num*;

# A number with an optional leading plus or minus sign.
sign = plus_sym | minus_sym;
cat_signed_number = (sign sp)? num;

# A possibly signed number followed by the degree sign.
cat_degree = cat_signed_number sp degree_sym;

# "+" followed by a number read as a cardinal or digit by digit.
cat_country_code = plus_sym sp (num | digits);

# A bare mathematical operator symbol.
cat_math_operations =
    plus_sym
  | minus_sym
  | division_sym
  | times_sym
  | equals_sym
  | percent_sym
  | power_sym
  | square_root_sym
;

# Roman numbers are often either cardinals or ordinals in various languages.
cat_roman = roman @ (num | ord);

# Allow
#
#   number:number
#   number-number
#
# to just be
#
#   number number.

colon_or_dash_to_space = (":" | "-") : " ";
cat_number_number = num colon_or_dash_to_space num;

# Some additional readings for these symbols.
# Alternative tags for stand-alone symbols; "x" may also stay a literal "x".
per_reading = "/" : "@@PER@@";
and_reading = "+" : "@@AND@@";
hyphen_reading = "-" : ("@@HYPHEN@@" | "@@CONNECTOR_TO@@");
star_reading = "*" : "@@STAR@@";
by_reading = "x" : ("x" | "@@CONNECTOR_BY@@");
at_reading = "@" : "@@AT@@";

cat_additional_readings =
    per_reading
  | and_reading
  | hyphen_reading
  | star_reading
  | by_reading
  | at_reading
;

# Union of every category defined in this grammar.
all_categories = Optimize[
    cat_dot_number
  | cat_slash_number
  | cat_dash_number
  | cat_signed_number
  | cat_degree
  | cat_country_code
  | cat_math_operations
  | cat_roman
  | cat_number_number
  | cat_additional_readings
];

# The categories with their tags expanded through the lexical map.
export NUMBERS_PLUS = Optimize[all_categories @ l.LEXICAL_MAP];
-------------------------------------------------------------------------------- /src/ru/verbalizer/ordinal_endings.tsv: -------------------------------------------------------------------------------- 1 | ая-ая 2 | ого-го 3 | ьего-го 4 | ьего-его 5 | ьей-ей 6 | ьему-ему 7 | ьем-ем 8 | ое-е 9 | ые-е 10 | ье-е 11 | ий-ий 12 | ьими-ими 13 | ьим-им 14 | ьих-их 15 | ьи-и 16 | ий-й 17 | ой-й 18 | ый-й 19 | ыми-ми 20 | ьими-ми 21 | ому-му 22 | ьему-му 23 | ого-ого 24 | ое-ое 25 | ой-ой 26 | ом-ом 27 | ому-ому 28 | ую-ую 29 | ых-х 30 | ьих-х 31 | ые-ые 32 | ый-ый 33 | ыми-ыми 34 | ым-ым 35 | ых-ых 36 | ую-ю 37 | ью-ю 38 | ая-я 39 | ья-я 40 | -------------------------------------------------------------------------------- /src/ru/verbalizer/ordinals-lex.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # AUTOMATICALLY GENERATED: DO NOT EDIT. 16 | import 'util/byte.grm' as b; 17 | 18 | # Utilities for insertion and deletion. 19 | 20 | func I[expr] { 21 | return "" : expr; 22 | } 23 | 24 | func D[expr] { 25 | return expr : ""; 26 | } 27 | 28 | # Powers of base 10. 29 | export POWERS = 30 | "[E15]" 31 | | "[E14]" 32 | | "[E13]" 33 | | "[E12]" 34 | | "[E11]" 35 | | "[E10]" 36 | | "[E9]" 37 | | "[E8]" 38 | | "[E7]" 39 | | "[E6]" 40 | | "[E5]" 41 | | "[E4]" 42 | | "[E3]" 43 | | "[E2]" 44 | | "[E1]" 45 | ; 46 | 47 | export SIGMA = b.kBytes | POWERS; 48 | 49 | export SIGMA_STAR = SIGMA*; 50 | 51 | export SIGMA_PLUS = SIGMA+; 52 | 53 | ################################################################################ 54 | # BEGIN LANGUAGE SPECIFIC DATA 55 | revaluations = 56 | ("[E4]" : "[E1]") 57 | | ("[E5]" : "[E2]") 58 | | ("[E7]" : "[E1]") 59 | | ("[E8]" : "[E2]") 60 | ; 61 | 62 | Ms = "[E3]" | "[E6]" | "[E9]"; 63 | 64 | 65 | func Zero[expr] { 66 | return expr : (""); 67 | } 68 | 69 | space = " "; 70 | 71 | lexset3 = Optimize[ 72 | ("1[E1]+1" : "одиннадцатая@") 73 | | ("1[E1]+1" : "одиннадцати") 74 | | ("1[E1]+1" : "одиннадцатого@") 75 | | ("1[E1]+1" : "одиннадцатое@") 76 | | ("1[E1]+1" : "одиннадцатой@") 77 | | ("1[E1]+1" : "одиннадцатом@") 78 | | ("1[E1]+1" : "одиннадцатому@") 79 | | ("1[E1]+1" : "одиннадцатую@") 80 | | ("1[E1]+1" : "одиннадцатые@") 81 | | ("1[E1]+1" : "одиннадцатый@") 82 | | ("1[E1]+1" : "одиннадцатым@") 83 | | ("1[E1]+1" : "одиннадцатыми@") 84 | | ("1[E1]+1" : "одиннадцатых@") 85 | | ("1[E1]+1" : "одиннадцать") 86 | | ("1[E1]+1" : "одиннадцатью") 87 | | ("1[E1]+2" : "двенадцатая@") 88 | | ("1[E1]+2" : "двенадцати") 89 | | ("1[E1]+2" : "двенадцатого@") 90 | | ("1[E1]+2" : "двенадцатое@") 91 | | ("1[E1]+2" : "двенадцатой@") 92 | | ("1[E1]+2" : "двенадцатом@") 93 | | ("1[E1]+2" : "двенадцатому@") 94 | | ("1[E1]+2" : "двенадцатую@") 
95 | | ("1[E1]+2" : "двенадцатые@") 96 | | ("1[E1]+2" : "двенадцатый@") 97 | | ("1[E1]+2" : "двенадцатым@") 98 | | ("1[E1]+2" : "двенадцатыми@") 99 | | ("1[E1]+2" : "двенадцатых@") 100 | | ("1[E1]+2" : "двенадцать") 101 | | ("1[E1]+2" : "двенадцатью") 102 | | ("1[E1]+3" : "тринадцатая@") 103 | | ("1[E1]+3" : "тринадцати") 104 | | ("1[E1]+3" : "тринадцатого@") 105 | | ("1[E1]+3" : "тринадцатое@") 106 | | ("1[E1]+3" : "тринадцатой@") 107 | | ("1[E1]+3" : "тринадцатом@") 108 | | ("1[E1]+3" : "тринадцатому@") 109 | | ("1[E1]+3" : "тринадцатую@") 110 | | ("1[E1]+3" : "тринадцатые@") 111 | | ("1[E1]+3" : "тринадцатый@") 112 | | ("1[E1]+3" : "тринадцатым@") 113 | | ("1[E1]+3" : "тринадцатыми@") 114 | | ("1[E1]+3" : "тринадцатых@") 115 | | ("1[E1]+3" : "тринадцать") 116 | | ("1[E1]+3" : "тринадцатью") 117 | | ("1[E1]+4" : "четырнадцатая@") 118 | | ("1[E1]+4" : "четырнадцати") 119 | | ("1[E1]+4" : "четырнадцатого@") 120 | | ("1[E1]+4" : "четырнадцатое@") 121 | | ("1[E1]+4" : "четырнадцатой@") 122 | | ("1[E1]+4" : "четырнадцатом@") 123 | | ("1[E1]+4" : "четырнадцатому@") 124 | | ("1[E1]+4" : "четырнадцатую@") 125 | | ("1[E1]+4" : "четырнадцатые@") 126 | | ("1[E1]+4" : "четырнадцатый@") 127 | | ("1[E1]+4" : "четырнадцатым@") 128 | | ("1[E1]+4" : "четырнадцатыми@") 129 | | ("1[E1]+4" : "четырнадцатых@") 130 | | ("1[E1]+4" : "четырнадцать") 131 | | ("1[E1]+4" : "четырнадцатью") 132 | | ("1[E1]+5" : "пятнадцатая@") 133 | | ("1[E1]+5" : "пятнадцати") 134 | | ("1[E1]+5" : "пятнадцатого@") 135 | | ("1[E1]+5" : "пятнадцатое@") 136 | | ("1[E1]+5" : "пятнадцатой@") 137 | | ("1[E1]+5" : "пятнадцатом@") 138 | | ("1[E1]+5" : "пятнадцатому@") 139 | | ("1[E1]+5" : "пятнадцатую@") 140 | | ("1[E1]+5" : "пятнадцатые@") 141 | | ("1[E1]+5" : "пятнадцатый@") 142 | | ("1[E1]+5" : "пятнадцатым@") 143 | | ("1[E1]+5" : "пятнадцатыми@") 144 | | ("1[E1]+5" : "пятнадцатых@") 145 | | ("1[E1]+5" : "пятнадцать") 146 | | ("1[E1]+5" : "пятнадцатью") 147 | | ("1[E1]+6" : "шестнадцатая@") 148 | | ("1[E1]+6" : 
"шестнадцати") 149 | | ("1[E1]+6" : "шестнадцатого@") 150 | | ("1[E1]+6" : "шестнадцатое@") 151 | | ("1[E1]+6" : "шестнадцатой@") 152 | | ("1[E1]+6" : "шестнадцатом@") 153 | | ("1[E1]+6" : "шестнадцатому@") 154 | | ("1[E1]+6" : "шестнадцатую@") 155 | | ("1[E1]+6" : "шестнадцатые@") 156 | | ("1[E1]+6" : "шестнадцатый@") 157 | | ("1[E1]+6" : "шестнадцатым@") 158 | | ("1[E1]+6" : "шестнадцатыми@") 159 | | ("1[E1]+6" : "шестнадцатых@") 160 | | ("1[E1]+6" : "шестнадцать") 161 | | ("1[E1]+6" : "шестнадцатью") 162 | | ("1[E1]+7" : "семнадцатая@") 163 | | ("1[E1]+7" : "семнадцати") 164 | | ("1[E1]+7" : "семнадцатого@") 165 | | ("1[E1]+7" : "семнадцатое@") 166 | | ("1[E1]+7" : "семнадцатой@") 167 | | ("1[E1]+7" : "семнадцатом@") 168 | | ("1[E1]+7" : "семнадцатому@") 169 | | ("1[E1]+7" : "семнадцатую@") 170 | | ("1[E1]+7" : "семнадцатые@") 171 | | ("1[E1]+7" : "семнадцатый@") 172 | | ("1[E1]+7" : "семнадцатым@") 173 | | ("1[E1]+7" : "семнадцатыми@") 174 | | ("1[E1]+7" : "семнадцатых@") 175 | | ("1[E1]+7" : "семнадцать") 176 | | ("1[E1]+7" : "семнадцатью") 177 | | ("1[E1]+8" : "восемнадцатая@") 178 | | ("1[E1]+8" : "восемнадцати") 179 | | ("1[E1]+8" : "восемнадцатого@") 180 | | ("1[E1]+8" : "восемнадцатое@") 181 | | ("1[E1]+8" : "восемнадцатой@") 182 | | ("1[E1]+8" : "восемнадцатом@") 183 | | ("1[E1]+8" : "восемнадцатому@") 184 | | ("1[E1]+8" : "восемнадцатую@") 185 | | ("1[E1]+8" : "восемнадцатые@") 186 | | ("1[E1]+8" : "восемнадцатый@") 187 | | ("1[E1]+8" : "восемнадцатым@") 188 | | ("1[E1]+8" : "восемнадцатыми@") 189 | | ("1[E1]+8" : "восемнадцатых@") 190 | | ("1[E1]+8" : "восемнадцать") 191 | | ("1[E1]+8" : "восемнадцатью") 192 | | ("1[E1]+9" : "девятнадцатая@") 193 | | ("1[E1]+9" : "девятнадцати") 194 | | ("1[E1]+9" : "девятнадцатого@") 195 | | ("1[E1]+9" : "девятнадцатое@") 196 | | ("1[E1]+9" : "девятнадцатой@") 197 | | ("1[E1]+9" : "девятнадцатом@") 198 | | ("1[E1]+9" : "девятнадцатому@") 199 | | ("1[E1]+9" : "девятнадцатую@") 200 | | ("1[E1]+9" : "девятнадцатые@") 201 
| | ("1[E1]+9" : "девятнадцатый@") 202 | | ("1[E1]+9" : "девятнадцатым@") 203 | | ("1[E1]+9" : "девятнадцатыми@") 204 | | ("1[E1]+9" : "девятнадцатых@") 205 | | ("1[E1]+9" : "девятнадцать") 206 | | ("1[E1]+9" : "девятнадцатью")] 207 | ; 208 | 209 | lex3 = CDRewrite[lexset3 I[space], "", "", SIGMA_STAR]; 210 | 211 | lexset2 = Optimize[ 212 | ("1[E1]" : "десятая@") 213 | | ("1[E1]" : "десяти") 214 | | ("1[E1]" : "десятого@") 215 | | ("1[E1]" : "десятое@") 216 | | ("1[E1]" : "десятой@") 217 | | ("1[E1]" : "десятом@") 218 | | ("1[E1]" : "десятому@") 219 | | ("1[E1]" : "десятую@") 220 | | ("1[E1]" : "десятые@") 221 | | ("1[E1]" : "десятый@") 222 | | ("1[E1]" : "десятым@") 223 | | ("1[E1]" : "десятыми@") 224 | | ("1[E1]" : "десятых@") 225 | | ("1[E1]" : "десять") 226 | | ("1[E1]" : "десятью") 227 | | ("1[E2]" : "сотая@") 228 | | ("1[E2]" : "сотого@") 229 | | ("1[E2]" : "сотое@") 230 | | ("1[E2]" : "сотой@") 231 | | ("1[E2]" : "сотом@") 232 | | ("1[E2]" : "сотому@") 233 | | ("1[E2]" : "сотую@") 234 | | ("1[E2]" : "сотые@") 235 | | ("1[E2]" : "сотый@") 236 | | ("1[E2]" : "сотым@") 237 | | ("1[E2]" : "сотыми@") 238 | | ("1[E2]" : "сотых@") 239 | | ("1[E2]" : "ста") 240 | | ("1[E2]" : "сто") 241 | | ("1[E3]" : "тысячная@") 242 | | ("1[E3]" : "тысячного@") 243 | | ("1[E3]" : "тысячное@") 244 | | ("1[E3]" : "тысячной@") 245 | | ("1[E3]" : "тысячном@") 246 | | ("1[E3]" : "тысячному@") 247 | | ("1[E3]" : "тысячную@") 248 | | ("1[E3]" : "тысячные@") 249 | | ("1[E3]" : "тысячный@") 250 | | ("1[E3]" : "тысячным@") 251 | | ("1[E3]" : "тысячными@") 252 | | ("1[E3]" : "тысячных@") 253 | | ("1[E6]" : "миллионная@") 254 | | ("1[E6]" : "миллионного@") 255 | | ("1[E6]" : "миллионное@") 256 | | ("1[E6]" : "миллионной@") 257 | | ("1[E6]" : "миллионном@") 258 | | ("1[E6]" : "миллионному@") 259 | | ("1[E6]" : "миллионную@") 260 | | ("1[E6]" : "миллионные@") 261 | | ("1[E6]" : "миллионный@") 262 | | ("1[E6]" : "миллионным@") 263 | | ("1[E6]" : "миллионными@") 264 | | ("1[E6]" : "миллионных@") 
265 | | ("1[E9]" : "миллиардная@") 266 | | ("1[E9]" : "миллиардного@") 267 | | ("1[E9]" : "миллиардное@") 268 | | ("1[E9]" : "миллиардной@") 269 | | ("1[E9]" : "миллиардном@") 270 | | ("1[E9]" : "миллиардному@") 271 | | ("1[E9]" : "миллиардную@") 272 | | ("1[E9]" : "миллиардные@") 273 | | ("1[E9]" : "миллиардный@") 274 | | ("1[E9]" : "миллиардным@") 275 | | ("1[E9]" : "миллиардными@") 276 | | ("1[E9]" : "миллиардных@") 277 | | ("2[E1]" : "двадцатая@") 278 | | ("2[E1]" : "двадцати") 279 | | ("2[E1]" : "двадцатого@") 280 | | ("2[E1]" : "двадцатое@") 281 | | ("2[E1]" : "двадцатой@") 282 | | ("2[E1]" : "двадцатом@") 283 | | ("2[E1]" : "двадцатому@") 284 | | ("2[E1]" : "двадцатую@") 285 | | ("2[E1]" : "двадцатые@") 286 | | ("2[E1]" : "двадцатый@") 287 | | ("2[E1]" : "двадцатым@") 288 | | ("2[E1]" : "двадцатыми@") 289 | | ("2[E1]" : "двадцатых@") 290 | | ("2[E1]" : "двадцать") 291 | | ("2[E1]" : "двадцатью") 292 | | ("2[E2]" : "двести") 293 | | ("2[E2]" : "двумстам") 294 | | ("2[E2]" : "двумястами") 295 | | ("2[E2]" : "двухсот") 296 | | ("2[E2]" : "двухсотая@") 297 | | ("2[E2]" : "двухсотого@") 298 | | ("2[E2]" : "двухсотое@") 299 | | ("2[E2]" : "двухсотой@") 300 | | ("2[E2]" : "двухсотом@") 301 | | ("2[E2]" : "двухсотому@") 302 | | ("2[E2]" : "двухсотую@") 303 | | ("2[E2]" : "двухсотые@") 304 | | ("2[E2]" : "двухсотый@") 305 | | ("2[E2]" : "двухсотым@") 306 | | ("2[E2]" : "двухсотыми@") 307 | | ("2[E2]" : "двухсотых@") 308 | | ("2[E2]" : "двухстах") 309 | | ("3[E1]" : "тридцатая@") 310 | | ("3[E1]" : "тридцати") 311 | | ("3[E1]" : "тридцатого@") 312 | | ("3[E1]" : "тридцатое@") 313 | | ("3[E1]" : "тридцатой@") 314 | | ("3[E1]" : "тридцатом@") 315 | | ("3[E1]" : "тридцатому@") 316 | | ("3[E1]" : "тридцатую@") 317 | | ("3[E1]" : "тридцатые@") 318 | | ("3[E1]" : "тридцатый@") 319 | | ("3[E1]" : "тридцатым@") 320 | | ("3[E1]" : "тридцатыми@") 321 | | ("3[E1]" : "тридцатых@") 322 | | ("3[E1]" : "тридцать") 323 | | ("3[E1]" : "тридцатью") 324 | | ("3[E2]" : "тремстам") 325 | 
| ("3[E2]" : "тремястами") 326 | | ("3[E2]" : "трехсот") 327 | | ("3[E2]" : "трехсотая@") 328 | | ("3[E2]" : "трехсотого@") 329 | | ("3[E2]" : "трехсотое@") 330 | | ("3[E2]" : "трехсотой@") 331 | | ("3[E2]" : "трехсотом@") 332 | | ("3[E2]" : "трехсотому@") 333 | | ("3[E2]" : "трехсотую@") 334 | | ("3[E2]" : "трехсотые@") 335 | | ("3[E2]" : "трехсотый@") 336 | | ("3[E2]" : "трехсотым@") 337 | | ("3[E2]" : "трехсотыми@") 338 | | ("3[E2]" : "трехсотых@") 339 | | ("3[E2]" : "трехстах") 340 | | ("3[E2]" : "триста") 341 | | ("4[E1]" : "сорок") 342 | | ("4[E1]" : "сорока") 343 | | ("4[E1]" : "сороковая@") 344 | | ("4[E1]" : "сорокового@") 345 | | ("4[E1]" : "сороковое@") 346 | | ("4[E1]" : "сороковой@") 347 | | ("4[E1]" : "сороковом@") 348 | | ("4[E1]" : "сороковому@") 349 | | ("4[E1]" : "сороковую@") 350 | | ("4[E1]" : "сороковые@") 351 | | ("4[E1]" : "сороковым@") 352 | | ("4[E1]" : "сороковыми@") 353 | | ("4[E1]" : "сороковых@") 354 | | ("4[E2]" : "четыремстам") 355 | | ("4[E2]" : "четыреста") 356 | | ("4[E2]" : "четырехсот") 357 | | ("4[E2]" : "четырехсотая@") 358 | | ("4[E2]" : "четырехсотого@") 359 | | ("4[E2]" : "четырехсотое@") 360 | | ("4[E2]" : "четырехсотой@") 361 | | ("4[E2]" : "четырехсотом@") 362 | | ("4[E2]" : "четырехсотому@") 363 | | ("4[E2]" : "четырехсотую@") 364 | | ("4[E2]" : "четырехсотые@") 365 | | ("4[E2]" : "четырехсотый@") 366 | | ("4[E2]" : "четырехсотым@") 367 | | ("4[E2]" : "четырехсотыми@") 368 | | ("4[E2]" : "четырехсотых@") 369 | | ("4[E2]" : "четырехстах") 370 | | ("4[E2]" : "четырьмястами") 371 | | ("5[E1]" : "пятидесятая@") 372 | | ("5[E1]" : "пятидесяти") 373 | | ("5[E1]" : "пятидесятого@") 374 | | ("5[E1]" : "пятидесятое@") 375 | | ("5[E1]" : "пятидесятой@") 376 | | ("5[E1]" : "пятидесятом@") 377 | | ("5[E1]" : "пятидесятому@") 378 | | ("5[E1]" : "пятидесятую@") 379 | | ("5[E1]" : "пятидесятые@") 380 | | ("5[E1]" : "пятидесятый@") 381 | | ("5[E1]" : "пятидесятым@") 382 | | ("5[E1]" : "пятидесятыми@") 383 | | ("5[E1]" : "пятидесятых@") 
384 | | ("5[E1]" : "пятьдесят") 385 | | ("5[E1]" : "пятьюдесятью") 386 | | ("5[E2]" : "пятисот") 387 | | ("5[E2]" : "пятисотая@") 388 | | ("5[E2]" : "пятисотого@") 389 | | ("5[E2]" : "пятисотое@") 390 | | ("5[E2]" : "пятисотой@") 391 | | ("5[E2]" : "пятисотом@") 392 | | ("5[E2]" : "пятисотому@") 393 | | ("5[E2]" : "пятисотую@") 394 | | ("5[E2]" : "пятисотые@") 395 | | ("5[E2]" : "пятисотый@") 396 | | ("5[E2]" : "пятисотым@") 397 | | ("5[E2]" : "пятисотыми@") 398 | | ("5[E2]" : "пятисотых@") 399 | | ("5[E2]" : "пятистам") 400 | | ("5[E2]" : "пятистах") 401 | | ("5[E2]" : "пятьсот") 402 | | ("5[E2]" : "пятьюстами") 403 | | ("6[E1]" : "шестидесятая@") 404 | | ("6[E1]" : "шестидесяти") 405 | | ("6[E1]" : "шестидесятого@") 406 | | ("6[E1]" : "шестидесятое@") 407 | | ("6[E1]" : "шестидесятой@") 408 | | ("6[E1]" : "шестидесятом@") 409 | | ("6[E1]" : "шестидесятому@") 410 | | ("6[E1]" : "шестидесятую@") 411 | | ("6[E1]" : "шестидесятые@") 412 | | ("6[E1]" : "шестидесятый@") 413 | | ("6[E1]" : "шестидесятым@") 414 | | ("6[E1]" : "шестидесятыми@") 415 | | ("6[E1]" : "шестидесятых@") 416 | | ("6[E1]" : "шестьдесят") 417 | | ("6[E1]" : "шестьюдесятью") 418 | | ("6[E2]" : "шестисот") 419 | | ("6[E2]" : "шестисотая@") 420 | | ("6[E2]" : "шестисотого@") 421 | | ("6[E2]" : "шестисотое@") 422 | | ("6[E2]" : "шестисотой@") 423 | | ("6[E2]" : "шестисотом@") 424 | | ("6[E2]" : "шестисотому@") 425 | | ("6[E2]" : "шестисотую@") 426 | | ("6[E2]" : "шестисотые@") 427 | | ("6[E2]" : "шестисотый@") 428 | | ("6[E2]" : "шестисотым@") 429 | | ("6[E2]" : "шестисотыми@") 430 | | ("6[E2]" : "шестисотых@") 431 | | ("6[E2]" : "шестистам") 432 | | ("6[E2]" : "шестистах") 433 | | ("6[E2]" : "шестьсот") 434 | | ("6[E2]" : "шестьюстами") 435 | | ("7[E1]" : "семидесятая@") 436 | | ("7[E1]" : "семидесяти") 437 | | ("7[E1]" : "семидесятого@") 438 | | ("7[E1]" : "семидесятое@") 439 | | ("7[E1]" : "семидесятой@") 440 | | ("7[E1]" : "семидесятом@") 441 | | ("7[E1]" : "семидесятому@") 442 | | ("7[E1]" : 
"семидесятую@") 443 | | ("7[E1]" : "семидесятые@") 444 | | ("7[E1]" : "семидесятый@") 445 | | ("7[E1]" : "семидесятым@") 446 | | ("7[E1]" : "семидесятыми@") 447 | | ("7[E1]" : "семидесятых@") 448 | | ("7[E1]" : "семьдесят") 449 | | ("7[E1]" : "семьюдесятью") 450 | | ("7[E2]" : "семисот") 451 | | ("7[E2]" : "семисотая@") 452 | | ("7[E2]" : "семисотого@") 453 | | ("7[E2]" : "семисотое@") 454 | | ("7[E2]" : "семисотой@") 455 | | ("7[E2]" : "семисотом@") 456 | | ("7[E2]" : "семисотому@") 457 | | ("7[E2]" : "семисотую@") 458 | | ("7[E2]" : "семисотые@") 459 | | ("7[E2]" : "семисотый@") 460 | | ("7[E2]" : "семисотым@") 461 | | ("7[E2]" : "семисотыми@") 462 | | ("7[E2]" : "семисотых@") 463 | | ("7[E2]" : "семистам") 464 | | ("7[E2]" : "семистах") 465 | | ("7[E2]" : "семьсот") 466 | | ("7[E2]" : "семьюстами") 467 | | ("8[E1]" : "восемьдесят") 468 | | ("8[E1]" : "восьмидесятая@") 469 | | ("8[E1]" : "восьмидесяти") 470 | | ("8[E1]" : "восьмидесятого@") 471 | | ("8[E1]" : "восьмидесятое@") 472 | | ("8[E1]" : "восьмидесятой@") 473 | | ("8[E1]" : "восьмидесятом@") 474 | | ("8[E1]" : "восьмидесятому@") 475 | | ("8[E1]" : "восьмидесятую@") 476 | | ("8[E1]" : "восьмидесятые@") 477 | | ("8[E1]" : "восьмидесятый@") 478 | | ("8[E1]" : "восьмидесятым@") 479 | | ("8[E1]" : "восьмидесятыми@") 480 | | ("8[E1]" : "восьмидесятых@") 481 | | ("8[E1]" : "восьмьюдесятью") 482 | | ("8[E2]" : "восемьсот") 483 | | ("8[E2]" : "восемьюстами") 484 | | ("8[E2]" : "восьмисот") 485 | | ("8[E2]" : "восьмисотая@") 486 | | ("8[E2]" : "восьмисотого@") 487 | | ("8[E2]" : "восьмисотое@") 488 | | ("8[E2]" : "восьмисотой@") 489 | | ("8[E2]" : "восьмисотом@") 490 | | ("8[E2]" : "восьмисотому@") 491 | | ("8[E2]" : "восьмисотую@") 492 | | ("8[E2]" : "восьмисотые@") 493 | | ("8[E2]" : "восьмисотый@") 494 | | ("8[E2]" : "восьмисотым@") 495 | | ("8[E2]" : "восьмисотыми@") 496 | | ("8[E2]" : "восьмисотых@") 497 | | ("8[E2]" : "восьмистам") 498 | | ("8[E2]" : "восьмистах") 499 | | ("8[E2]" : "восьмьюстами") 500 | | 
("9[E1]" : "девяноста") 501 | | ("9[E1]" : "девяностая@") 502 | | ("9[E1]" : "девяносто") 503 | | ("9[E1]" : "девяностого@") 504 | | ("9[E1]" : "девяностое@") 505 | | ("9[E1]" : "девяностой@") 506 | | ("9[E1]" : "девяностом@") 507 | | ("9[E1]" : "девяностому@") 508 | | ("9[E1]" : "девяностую@") 509 | | ("9[E1]" : "девяностые@") 510 | | ("9[E1]" : "девяностый@") 511 | | ("9[E1]" : "девяностым@") 512 | | ("9[E1]" : "девяностыми@") 513 | | ("9[E1]" : "девяностых@") 514 | | ("9[E2]" : "девятисот") 515 | | ("9[E2]" : "девятисотая@") 516 | | ("9[E2]" : "девятисотого@") 517 | | ("9[E2]" : "девятисотое@") 518 | | ("9[E2]" : "девятисотой@") 519 | | ("9[E2]" : "девятисотом@") 520 | | ("9[E2]" : "девятисотому@") 521 | | ("9[E2]" : "девятисотую@") 522 | | ("9[E2]" : "девятисотые@") 523 | | ("9[E2]" : "девятисотый@") 524 | | ("9[E2]" : "девятисотым@") 525 | | ("9[E2]" : "девятисотыми@") 526 | | ("9[E2]" : "девятисотых@") 527 | | ("9[E2]" : "девятистам") 528 | | ("9[E2]" : "девятистах") 529 | | ("9[E2]" : "девятьсот") 530 | | ("9[E2]" : "девятьюстами")] 531 | ; 532 | 533 | lex2 = CDRewrite[lexset2 I[space], "", "", SIGMA_STAR]; 534 | 535 | lexset1 = Optimize[ 536 | ("+" : "") 537 | | ("1" : "один") 538 | | ("1" : "одна") 539 | | ("1" : "одни") 540 | | ("1" : "одним") 541 | | ("1" : "одними") 542 | | ("1" : "одних") 543 | | ("1" : "одно") 544 | | ("1" : "одного") 545 | | ("1" : "одной") 546 | | ("1" : "одном") 547 | | ("1" : "одному") 548 | | ("1" : "одною") 549 | | ("1" : "одну") 550 | | ("1" : "первая@") 551 | | ("1" : "первого@") 552 | | ("1" : "первое@") 553 | | ("1" : "первой@") 554 | | ("1" : "первом@") 555 | | ("1" : "первому@") 556 | | ("1" : "первую@") 557 | | ("1" : "первые@") 558 | | ("1" : "первый@") 559 | | ("1" : "первым@") 560 | | ("1" : "первыми@") 561 | | ("1" : "первых@") 562 | | ("2" : "вторая@") 563 | | ("2" : "второго@") 564 | | ("2" : "второе@") 565 | | ("2" : "второй@") 566 | | ("2" : "втором@") 567 | | ("2" : "второму@") 568 | | ("2" : "вторую@") 569 | | 
("2" : "вторые@") 570 | | ("2" : "вторым@") 571 | | ("2" : "вторыми@") 572 | | ("2" : "вторых@") 573 | | ("2" : "два") 574 | | ("2" : "две") 575 | | ("2" : "двум") 576 | | ("2" : "двумя") 577 | | ("2" : "двух") 578 | | ("3" : "трем") 579 | | ("3" : "тремя") 580 | | ("3" : "третий@") 581 | | ("3" : "третье@") 582 | | ("3" : "третьего@") 583 | | ("3" : "третьей@") 584 | | ("3" : "третьем@") 585 | | ("3" : "третьему@") 586 | | ("3" : "третьи@") 587 | | ("3" : "третьим@") 588 | | ("3" : "третьими@") 589 | | ("3" : "третьих@") 590 | | ("3" : "третью@") 591 | | ("3" : "третья@") 592 | | ("3" : "трех") 593 | | ("3" : "три") 594 | | ("4" : "четвертая@") 595 | | ("4" : "четвертого@") 596 | | ("4" : "четвертое@") 597 | | ("4" : "четвертой@") 598 | | ("4" : "четвертом@") 599 | | ("4" : "четвертому@") 600 | | ("4" : "четвертую@") 601 | | ("4" : "четвертые@") 602 | | ("4" : "четвертый@") 603 | | ("4" : "четвертым@") 604 | | ("4" : "четвертыми@") 605 | | ("4" : "четвертых@") 606 | | ("4" : "четыре") 607 | | ("4" : "четырем") 608 | | ("4" : "четырех") 609 | | ("4" : "четырьмя") 610 | | ("5" : "пятая@") 611 | | ("5" : "пяти") 612 | | ("5" : "пятого@") 613 | | ("5" : "пятое@") 614 | | ("5" : "пятой@") 615 | | ("5" : "пятом@") 616 | | ("5" : "пятому@") 617 | | ("5" : "пятую@") 618 | | ("5" : "пятые@") 619 | | ("5" : "пятый@") 620 | | ("5" : "пятым@") 621 | | ("5" : "пятыми@") 622 | | ("5" : "пятых@") 623 | | ("5" : "пять") 624 | | ("5" : "пятью") 625 | | ("6" : "шестая@") 626 | | ("6" : "шести") 627 | | ("6" : "шестого@") 628 | | ("6" : "шестое@") 629 | | ("6" : "шестой@") 630 | | ("6" : "шестом@") 631 | | ("6" : "шестому@") 632 | | ("6" : "шестую@") 633 | | ("6" : "шестые@") 634 | | ("6" : "шестым@") 635 | | ("6" : "шестыми@") 636 | | ("6" : "шестых@") 637 | | ("6" : "шесть") 638 | | ("6" : "шестью") 639 | | ("7" : "седьмая@") 640 | | ("7" : "седьмого@") 641 | | ("7" : "седьмое@") 642 | | ("7" : "седьмой@") 643 | | ("7" : "седьмом@") 644 | | ("7" : "седьмому@") 645 | | ("7" : 
"седьмую@") 646 | | ("7" : "седьмые@") 647 | | ("7" : "седьмым@") 648 | | ("7" : "седьмыми@") 649 | | ("7" : "седьмых@") 650 | | ("7" : "семи") 651 | | ("7" : "семь") 652 | | ("7" : "семью") 653 | | ("8" : "восемь") 654 | | ("8" : "восьмая@") 655 | | ("8" : "восьми") 656 | | ("8" : "восьмого@") 657 | | ("8" : "восьмое@") 658 | | ("8" : "восьмой@") 659 | | ("8" : "восьмом@") 660 | | ("8" : "восьмому@") 661 | | ("8" : "восьмую@") 662 | | ("8" : "восьмые@") 663 | | ("8" : "восьмым@") 664 | | ("8" : "восьмыми@") 665 | | ("8" : "восьмых@") 666 | | ("8" : "восьмью") 667 | | ("9" : "девятая@") 668 | | ("9" : "девяти") 669 | | ("9" : "девятого@") 670 | | ("9" : "девятое@") 671 | | ("9" : "девятой@") 672 | | ("9" : "девятом@") 673 | | ("9" : "девятому@") 674 | | ("9" : "девятую@") 675 | | ("9" : "девятые@") 676 | | ("9" : "девятый@") 677 | | ("9" : "девятым@") 678 | | ("9" : "девятыми@") 679 | | ("9" : "девятых@") 680 | | ("9" : "девять") 681 | | ("9" : "девятью") 682 | | ("[E3]" : "тысяч") 683 | | ("[E3]" : "тысяча") 684 | | ("[E3]" : "тысячам") 685 | | ("[E3]" : "тысячами") 686 | | ("[E3]" : "тысячах") 687 | | ("[E3]" : "тысяче") 688 | | ("[E3]" : "тысячей") 689 | | ("[E3]" : "тысячи") 690 | | ("[E3]" : "тысячу") 691 | | ("[E3]" : "тысячью") 692 | | ("[E6]" : "миллион") 693 | | ("[E6]" : "миллиона") 694 | | ("[E6]" : "миллионам") 695 | | ("[E6]" : "миллионами") 696 | | ("[E6]" : "миллионах") 697 | | ("[E6]" : "миллионе") 698 | | ("[E6]" : "миллионов") 699 | | ("[E6]" : "миллионом") 700 | | ("[E6]" : "миллиону") 701 | | ("[E6]" : "миллионы") 702 | | ("[E9]" : "миллиард") 703 | | ("[E9]" : "миллиарда") 704 | | ("[E9]" : "миллиардам") 705 | | ("[E9]" : "миллиардами") 706 | | ("[E9]" : "миллиардах") 707 | | ("[E9]" : "миллиарде") 708 | | ("[E9]" : "миллиардов") 709 | | ("[E9]" : "миллиардом") 710 | | ("[E9]" : "миллиарду") 711 | | ("[E9]" : "миллиарды") 712 | | ("|0|" : "ноле") 713 | | ("|0|" : "нолем") 714 | | ("|0|" : "ноль") 715 | | ("|0|" : "нолю") 716 | | ("|0|" : 
"ноля") 717 | | ("|0|" : "нуле") 718 | | ("|0|" : "нулем") 719 | | ("|0|" : "нуль") 720 | | ("|0|" : "нулю") 721 | | ("|0|" : "нуля")] 722 | ; 723 | 724 | lex1 = CDRewrite[lexset1 I[space], "", "", SIGMA_STAR]; 725 | 726 | export LEX = Optimize[lex3 @ lex2 @ lex1]; 727 | 728 | export INDEPENDENT_EXPONENTS = "[E3]" | "[E6]" | "[E9]"; 729 | 730 | # END LANGUAGE SPECIFIC DATA 731 | ################################################################################ 732 | # Inserts a marker after the Ms. 733 | export INSERT_BOUNDARY = CDRewrite["" : "%", Ms, "", SIGMA_STAR]; 734 | 735 | # Deletes all powers and "+". 736 | export DELETE_POWERS = CDRewrite[D[POWERS | "+"], "", "", SIGMA_STAR]; 737 | 738 | # Deletes leading zeros at the beginning of a number, so that "0003" does not 739 | # get treated as an ordinary number. 740 | export DELETE_INITIAL_ZEROS = 741 | CDRewrite[("0" POWERS "+") : "", "[BOS]", "", SIGMA_STAR] 742 | ; 743 | 744 | NonMs = Optimize[POWERS - Ms]; 745 | 746 | # Deletes (usually) zeros before a non-M. E.g., +0[E1] should be 747 | # deleted. 748 | export DELETE_INTERMEDIATE_ZEROS1 = 749 | CDRewrite[Zero["+0" NonMs], "", "", SIGMA_STAR] 750 | ; 751 | 752 | # Deletes (usually) zeros before an M, if there is no non-zero element between 753 | # that and the previous boundary. Thus, if after the result of the rule above we 754 | # end up with "%+0[E3]", then that gets deleted. Also (really) deletes a final 755 | # zero. 756 | export DELETE_INTERMEDIATE_ZEROS2 = Optimize[ 757 | CDRewrite[Zero["%+0" Ms], "", "", SIGMA_STAR] 758 | @ CDRewrite[D["+0"], "", "[EOS]", SIGMA_STAR]] 759 | ; 760 | 761 | # Final clean up of stray zeros. 762 | export DELETE_REMAINING_ZEROS = Optimize[ 763 | CDRewrite[Zero["+0"], "", "", SIGMA_STAR] 764 | @ CDRewrite[Zero["0"], "", "", SIGMA_STAR]] 765 | ; 766 | 767 | # Applies the revaluation map.
For example in English, change [E4] to [E1] as a 768 | # modifier of [E3] 769 | export REVALUE = CDRewrite[revaluations, "", "", SIGMA_STAR]; 770 | 771 | # Deletes the various marks and powers in the input and output. 772 | export DELETE_MARKS = CDRewrite[D["%" | "+" | POWERS], "", "", SIGMA_STAR]; 773 | # Collapses each run of spaces between two non-space characters into a single space, then removes any spaces at the beginning and at the end of the string. 774 | export CLEAN_SPACES = Optimize[ 775 | CDRewrite[" "+ : " ", b.kNotSpace, b.kNotSpace, SIGMA_STAR] 776 | @ CDRewrite[" "* : "", "[BOS]", "", SIGMA_STAR] 777 | @ CDRewrite[" "* : "", "", "[EOS]", SIGMA_STAR]] 778 | ; 779 | # Shorthand for any single digit. 780 | d = b.kDigit; 781 | 782 | # Germanic inversion rule. 783 | germanic = 784 | (I["1+"] d "[E1]" D["+1"]) 785 | | (I["2+"] d "[E1]" D["+2"]) 786 | | (I["3+"] d "[E1]" D["+3"]) 787 | | (I["4+"] d "[E1]" D["+4"]) 788 | | (I["5+"] d "[E1]" D["+5"]) 789 | | (I["6+"] d "[E1]" D["+6"]) 790 | | (I["7+"] d "[E1]" D["+7"]) 791 | | (I["8+"] d "[E1]" D["+8"]) 792 | | (I["9+"] d "[E1]" D["+9"]) 793 | ; 794 | # Applies the inversion above as an optional, left-to-right rewrite. 795 | germanic_inversion = 796 | CDRewrite[germanic, "", "", SIGMA_STAR, 'ltr', 'opt'] 797 | ; 798 | # NOTE(review): the exported rule is SIGMA_STAR rather than germanic_inversion, which presumably makes inversion a no-op for this language - confirm. 799 | export GERMANIC_INVERSION = SIGMA_STAR; 800 | export ORDINAL_RESTRICTION = 801 | Optimize[((SIGMA - "@")* "@") @ CDRewrite[D["@"], "", "", SIGMA_STAR]] 802 | ; 803 | nondigits = b.kBytes - b.kDigit; 804 | export ORDINAL_SUFFIX = D[nondigits*]; 805 | -------------------------------------------------------------------------------- /src/ru/verbalizer/ordinals.tsv: -------------------------------------------------------------------------------- 1 | 0 нулевая 2 | 0 нулевого 3 | 0 нулевое 4 | 0 нулевой 5 | 0 нулевом 6 | 0 нулевому 7 | 0 нулевую 8 | 0 нулевые 9 | 0 нулевым 10 | 0 нулевым 11 | 0 нулевыми 12 | 0 нулевых 13 | 1 первая 14 | 1 первого 15 | 1 первое 16 | 1 первой 17 | 1 первом 18 | 1 первому 19 | 1 первую 20 | 1 первые 21 | 1 первый 22 | 1 первым 23 | 1 первым 24 | 1 первыми 25 | 1 первых 26 | 2 вторая 27 | 2 второго 28 | 2 второе 29 | 2 второй 30 | 2 втором 31 | 2 второму 32 | 2 вторую 33 | 2 вторые 34 | 2 вторым 35 | 2 вторым 36 | 2
вторыми 37 | 2 вторых 38 | 3 третий 39 | 3 третье 40 | 3 третьего 41 | 3 третьей 42 | 3 третьем 43 | 3 третьему 44 | 3 третьи 45 | 3 третьим 46 | 3 третьим 47 | 3 третьими 48 | 3 третьих 49 | 3 третью 50 | 3 третья 51 | 4 четвертая 52 | 4 четвертого 53 | 4 четвертое 54 | 4 четвертой 55 | 4 четвертом 56 | 4 четвертому 57 | 4 четвертую 58 | 4 четвертые 59 | 4 четвертый 60 | 4 четвертым 61 | 4 четвертым 62 | 4 четвертыми 63 | 4 четвертых 64 | 4 четвёртая 65 | 4 четвёртого 66 | 4 четвёртое 67 | 4 четвёртой 68 | 4 четвёртом 69 | 4 четвёртому 70 | 4 четвёртую 71 | 4 четвёртые 72 | 4 четвёртый 73 | 4 четвёртым 74 | 4 четвёртым 75 | 4 четвёртыми 76 | 4 четвёртых 77 | 5 пятая 78 | 5 пятого 79 | 5 пятое 80 | 5 пятой 81 | 5 пятом 82 | 5 пятому 83 | 5 пятую 84 | 5 пятые 85 | 5 пятый 86 | 5 пятым 87 | 5 пятым 88 | 5 пятыми 89 | 5 пятых 90 | 6 шестая 91 | 6 шестого 92 | 6 шестое 93 | 6 шестой 94 | 6 шестом 95 | 6 шестому 96 | 6 шестую 97 | 6 шестые 98 | 6 шестым 99 | 6 шестым 100 | 6 шестыми 101 | 6 шестых 102 | 7 седьмая 103 | 7 седьмого 104 | 7 седьмое 105 | 7 седьмой 106 | 7 седьмом 107 | 7 седьмому 108 | 7 седьмую 109 | 7 седьмые 110 | 7 седьмым 111 | 7 седьмым 112 | 7 седьмыми 113 | 7 седьмых 114 | 8 восьмая 115 | 8 восьмого 116 | 8 восьмое 117 | 8 восьмой 118 | 8 восьмом 119 | 8 восьмому 120 | 8 восьмую 121 | 8 восьмые 122 | 8 восьмым 123 | 8 восьмым 124 | 8 восьмыми 125 | 8 восьмых 126 | 9 девятая 127 | 9 девятого 128 | 9 девятое 129 | 9 девятой 130 | 9 девятом 131 | 9 девятому 132 | 9 девятую 133 | 9 девятые 134 | 9 девятый 135 | 9 девятым 136 | 9 девятым 137 | 9 девятыми 138 | 9 девятых 139 | 10 десятая 140 | 10 десятого 141 | 10 десятое 142 | 10 десятой 143 | 10 десятом 144 | 10 десятому 145 | 10 десятую 146 | 10 десятые 147 | 10 десятый 148 | 10 десятым 149 | 10 десятым 150 | 10 десятыми 151 | 10 десятых 152 | 11 одиннадцатая 153 | 11 одиннадцатого 154 | 11 одиннадцатое 155 | 11 одиннадцатой 156 | 11 одиннадцатом 157 | 11 одиннадцатому 158 | 11 одиннадцатую 159 | 11 
одиннадцатые 160 | 11 одиннадцатый 161 | 11 одиннадцатым 162 | 11 одиннадцатым 163 | 11 одиннадцатыми 164 | 11 одиннадцатых 165 | 12 двенадцатая 166 | 12 двенадцатого 167 | 12 двенадцатое 168 | 12 двенадцатой 169 | 12 двенадцатом 170 | 12 двенадцатому 171 | 12 двенадцатую 172 | 12 двенадцатые 173 | 12 двенадцатый 174 | 12 двенадцатым 175 | 12 двенадцатым 176 | 12 двенадцатыми 177 | 12 двенадцатых 178 | 13 тринадцатая 179 | 13 тринадцатого 180 | 13 тринадцатое 181 | 13 тринадцатой 182 | 13 тринадцатом 183 | 13 тринадцатому 184 | 13 тринадцатую 185 | 13 тринадцатые 186 | 13 тринадцатый 187 | 13 тринадцатым 188 | 13 тринадцатым 189 | 13 тринадцатыми 190 | 13 тринадцатых 191 | 14 четырнадцатая 192 | 14 четырнадцатого 193 | 14 четырнадцатое 194 | 14 четырнадцатой 195 | 14 четырнадцатом 196 | 14 четырнадцатому 197 | 14 четырнадцатую 198 | 14 четырнадцатые 199 | 14 четырнадцатый 200 | 14 четырнадцатым 201 | 14 четырнадцатым 202 | 14 четырнадцатыми 203 | 14 четырнадцатых 204 | 15 пятнадцатая 205 | 15 пятнадцатого 206 | 15 пятнадцатое 207 | 15 пятнадцатой 208 | 15 пятнадцатом 209 | 15 пятнадцатому 210 | 15 пятнадцатую 211 | 15 пятнадцатые 212 | 15 пятнадцатый 213 | 15 пятнадцатым 214 | 15 пятнадцатым 215 | 15 пятнадцатыми 216 | 15 пятнадцатых 217 | 16 шестнадцатая 218 | 16 шестнадцатого 219 | 16 шестнадцатое 220 | 16 шестнадцатой 221 | 16 шестнадцатом 222 | 16 шестнадцатому 223 | 16 шестнадцатую 224 | 16 шестнадцатые 225 | 16 шестнадцатый 226 | 16 шестнадцатым 227 | 16 шестнадцатым 228 | 16 шестнадцатыми 229 | 16 шестнадцатых 230 | 17 семнадцатая 231 | 17 семнадцатого 232 | 17 семнадцатое 233 | 17 семнадцатой 234 | 17 семнадцатом 235 | 17 семнадцатому 236 | 17 семнадцатую 237 | 17 семнадцатые 238 | 17 семнадцатый 239 | 17 семнадцатым 240 | 17 семнадцатым 241 | 17 семнадцатыми 242 | 17 семнадцатых 243 | 18 восемнадцатая 244 | 18 восемнадцатого 245 | 18 восемнадцатое 246 | 18 восемнадцатой 247 | 18 восемнадцатом 248 | 18 восемнадцатому 249 | 18 восемнадцатую 250 | 18 
восемнадцатые 251 | 18 восемнадцатый 252 | 18 восемнадцатым 253 | 18 восемнадцатым 254 | 18 восемнадцатыми 255 | 18 восемнадцатых 256 | 19 девятнадцатая 257 | 19 девятнадцатого 258 | 19 девятнадцатое 259 | 19 девятнадцатой 260 | 19 девятнадцатом 261 | 19 девятнадцатому 262 | 19 девятнадцатую 263 | 19 девятнадцатые 264 | 19 девятнадцатый 265 | 19 девятнадцатым 266 | 19 девятнадцатым 267 | 19 девятнадцатыми 268 | 19 девятнадцатых 269 | 20 двадцатая 270 | 20 двадцатого 271 | 20 двадцатое 272 | 20 двадцатой 273 | 20 двадцатом 274 | 20 двадцатому 275 | 20 двадцатую 276 | 20 двадцатые 277 | 20 двадцатый 278 | 20 двадцатым 279 | 20 двадцатым 280 | 20 двадцатыми 281 | 20 двадцатых 282 | 30 тридцатая 283 | 30 тридцатого 284 | 30 тридцатое 285 | 30 тридцатой 286 | 30 тридцатом 287 | 30 тридцатому 288 | 30 тридцатую 289 | 30 тридцатые 290 | 30 тридцатый 291 | 30 тридцатым 292 | 30 тридцатым 293 | 30 тридцатыми 294 | 30 тридцатых 295 | 40 сороковая 296 | 40 сорокового 297 | 40 сороковое 298 | 40 сороковой 299 | 40 сороковом 300 | 40 сороковому 301 | 40 сороковую 302 | 40 сороковые 303 | 40 сороковым 304 | 40 сороковым 305 | 40 сороковыми 306 | 40 сороковых 307 | 50 пятидесятая 308 | 50 пятидесятого 309 | 50 пятидесятое 310 | 50 пятидесятой 311 | 50 пятидесятом 312 | 50 пятидесятому 313 | 50 пятидесятую 314 | 50 пятидесятые 315 | 50 пятидесятый 316 | 50 пятидесятым 317 | 50 пятидесятым 318 | 50 пятидесятыми 319 | 50 пятидесятых 320 | 60 шестидесятая 321 | 60 шестидесятого 322 | 60 шестидесятое 323 | 60 шестидесятой 324 | 60 шестидесятом 325 | 60 шестидесятому 326 | 60 шестидесятую 327 | 60 шестидесятые 328 | 60 шестидесятый 329 | 60 шестидесятым 330 | 60 шестидесятым 331 | 60 шестидесятыми 332 | 60 шестидесятых 333 | 70 семидесятая 334 | 70 семидесятого 335 | 70 семидесятое 336 | 70 семидесятой 337 | 70 семидесятом 338 | 70 семидесятому 339 | 70 семидесятую 340 | 70 семидесятые 341 | 70 семидесятый 342 | 70 семидесятым 343 | 70 семидесятым 344 | 70 семидесятыми 345 | 70 
семидесятых 346 | 80 восьмидесятая 347 | 80 восьмидесятого 348 | 80 восьмидесятое 349 | 80 восьмидесятой 350 | 80 восьмидесятом 351 | 80 восьмидесятому 352 | 80 восьмидесятую 353 | 80 восьмидесятые 354 | 80 восьмидесятый 355 | 80 восьмидесятым 356 | 80 восьмидесятым 357 | 80 восьмидесятыми 358 | 80 восьмидесятых 359 | 90 девяностая 360 | 90 девяностого 361 | 90 девяностое 362 | 90 девяностой 363 | 90 девяностом 364 | 90 девяностому 365 | 90 девяностую 366 | 90 девяностые 367 | 90 девяностый 368 | 90 девяностым 369 | 90 девяностым 370 | 90 девяностыми 371 | 90 девяностых 372 | 100 сотая 373 | 100 сотого 374 | 100 сотое 375 | 100 сотой 376 | 100 сотом 377 | 100 сотому 378 | 100 сотую 379 | 100 сотые 380 | 100 сотый 381 | 100 сотым 382 | 100 сотым 383 | 100 сотыми 384 | 100 сотых 385 | 200 двухсотая 386 | 200 двухсотого 387 | 200 двухсотое 388 | 200 двухсотой 389 | 200 двухсотом 390 | 200 двухсотому 391 | 200 двухсотую 392 | 200 двухсотые 393 | 200 двухсотый 394 | 200 двухсотым 395 | 200 двухсотым 396 | 200 двухсотыми 397 | 200 двухсотых 398 | 300 трехсотая 399 | 300 трехсотого 400 | 300 трехсотое 401 | 300 трехсотой 402 | 300 трехсотом 403 | 300 трехсотому 404 | 300 трехсотую 405 | 300 трехсотые 406 | 300 трехсотый 407 | 300 трехсотым 408 | 300 трехсотым 409 | 300 трехсотыми 410 | 300 трехсотых 411 | 400 четырехсотая 412 | 400 четырехсотого 413 | 400 четырехсотое 414 | 400 четырехсотой 415 | 400 четырехсотом 416 | 400 четырехсотому 417 | 400 четырехсотую 418 | 400 четырехсотые 419 | 400 четырехсотый 420 | 400 четырехсотым 421 | 400 четырехсотым 422 | 400 четырехсотыми 423 | 400 четырехсотых 424 | 500 пятисотая 425 | 500 пятисотого 426 | 500 пятисотое 427 | 500 пятисотой 428 | 500 пятисотом 429 | 500 пятисотому 430 | 500 пятисотую 431 | 500 пятисотые 432 | 500 пятисотый 433 | 500 пятисотым 434 | 500 пятисотым 435 | 500 пятисотыми 436 | 500 пятисотых 437 | 600 шестисотая 438 | 600 шестисотого 439 | 600 шестисотое 440 | 600 шестисотой 441 | 600 шестисотом 442 | 600 
шестисотому 443 | 600 шестисотую 444 | 600 шестисотые 445 | 600 шестисотый 446 | 600 шестисотым 447 | 600 шестисотым 448 | 600 шестисотыми 449 | 600 шестисотых 450 | 700 семисотая 451 | 700 семисотого 452 | 700 семисотое 453 | 700 семисотой 454 | 700 семисотом 455 | 700 семисотому 456 | 700 семисотую 457 | 700 семисотые 458 | 700 семисотый 459 | 700 семисотым 460 | 700 семисотым 461 | 700 семисотыми 462 | 700 семисотых 463 | 800 восьмисотая 464 | 800 восьмисотого 465 | 800 восьмисотое 466 | 800 восьмисотой 467 | 800 восьмисотом 468 | 800 восьмисотому 469 | 800 восьмисотую 470 | 800 восьмисотые 471 | 800 восьмисотый 472 | 800 восьмисотым 473 | 800 восьмисотым 474 | 800 восьмисотыми 475 | 800 восьмисотых 476 | 900 девятисотая 477 | 900 девятисотого 478 | 900 девятисотое 479 | 900 девятисотой 480 | 900 девятисотом 481 | 900 девятисотому 482 | 900 девятисотую 483 | 900 девятисотые 484 | 900 девятисотый 485 | 900 девятисотым 486 | 900 девятисотым 487 | 900 девятисотыми 488 | 900 девятисотых 489 | 1000 тысячная 490 | 1000 тысячного 491 | 1000 тысячное 492 | 1000 тысячной 493 | 1000 тысячном 494 | 1000 тысячному 495 | 1000 тысячную 496 | 1000 тысячные 497 | 1000 тысячный 498 | 1000 тысячным 499 | 1000 тысячным 500 | 1000 тысячными 501 | 1000 тысячных 502 | 1000000 миллионная 503 | 1000000 миллионного 504 | 1000000 миллионное 505 | 1000000 миллионной 506 | 1000000 миллионном 507 | 1000000 миллионному 508 | 1000000 миллионную 509 | 1000000 миллионные 510 | 1000000 миллионный 511 | 1000000 миллионным 512 | 1000000 миллионным 513 | 1000000 миллионными 514 | 1000000 миллионных 515 | 1000000000 миллиардная 516 | 1000000000 миллиардного 517 | 1000000000 миллиардное 518 | 1000000000 миллиардной 519 | 1000000000 миллиардном 520 | 1000000000 миллиардному 521 | 1000000000 миллиардную 522 | 1000000000 миллиардные 523 | 1000000000 миллиардный 524 | 1000000000 миллиардным 525 | 1000000000 миллиардным 526 | 1000000000 миллиардными 527 | 1000000000 миллиардных 528 | 
-------------------------------------------------------------------------------- /src/ru/verbalizer/spelled.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # This verbalizer is used whenever there is an LM symbol that consists of 16 | # letters immediately followed by "{spelled}". This strips the "{spelled}" 17 | # suffix. 
18 | 19 | import 'util/byte.grm' as b; 20 | import 'ru/classifier/cyrillic.grm' as c; 21 | import 'ru/verbalizer/lexical_map.grm' as l; 22 | import 'ru/verbalizer/numbers.grm' as n; 23 | # Reads a single digit by composing it with the cardinal-number grammar. 24 | digit = b.kDigit @ n.CARDINAL_NUMBERS; 25 | # Per-character readings: a Latin letter in either case maps to a "letter-x" token, a digit is read via digit, "&" maps to @@AND@@, and ".", "-", "_" and "/" map to the empty string. The last alternative applies n.I["letter-"] in front of a Cyrillic letter (presumably inserting that prefix - confirm in numbers.grm). 26 | char_set = (("a" | "A") : "letter-a") 27 | | (("b" | "B") : "letter-b") 28 | | (("c" | "C") : "letter-c") 29 | | (("d" | "D") : "letter-d") 30 | | (("e" | "E") : "letter-e") 31 | | (("f" | "F") : "letter-f") 32 | | (("g" | "G") : "letter-g") 33 | | (("h" | "H") : "letter-h") 34 | | (("i" | "I") : "letter-i") 35 | | (("j" | "J") : "letter-j") 36 | | (("k" | "K") : "letter-k") 37 | | (("l" | "L") : "letter-l") 38 | | (("m" | "M") : "letter-m") 39 | | (("n" | "N") : "letter-n") 40 | | (("o" | "O") : "letter-o") 41 | | (("p" | "P") : "letter-p") 42 | | (("q" | "Q") : "letter-q") 43 | | (("r" | "R") : "letter-r") 44 | | (("s" | "S") : "letter-s") 45 | | (("t" | "T") : "letter-t") 46 | | (("u" | "U") : "letter-u") 47 | | (("v" | "V") : "letter-v") 48 | | (("w" | "W") : "letter-w") 49 | | (("x" | "X") : "letter-x") 50 | | (("y" | "Y") : "letter-y") 51 | | (("z" | "Z") : "letter-z") 52 | | (digit) 53 | | ("&" : "@@AND@@") 54 | | ("." : "") 55 | | ("-" : "") 56 | | ("_" : "") 57 | | ("/" : "") 58 | | (n.I["letter-"] c.kCyrillicAlpha) 59 | ; 60 | # Maps the empty string to a single space; used between consecutive characters. 61 | ins_space = "" : " "; 62 | # Removes the "{spelled}" tag. 63 | suffix = "{spelled}" : ""; 64 | # One or more characters, separated by inserted spaces, followed by the "{spelled}" tag. 65 | spelled = Optimize[char_set (ins_space char_set)* suffix]; 66 | # Exported rule: the spelled-out sequence composed with the lexical map. 67 | export SPELLED = Optimize[spelled @ l.LEXICAL_MAP]; 68 | # Closure over all bytes. 69 | sigma_star = b.kBytes*; 70 | 71 | # Gets rid of the letter- prefix since in some cases we don't want it.
72 | 73 | del_letter = CDRewrite[n.D["letter-"], "", "", sigma_star]; 74 | 75 | spelled_no_tag = Optimize[char_set (ins_space char_set)*]; 76 | 77 | export SPELLED_NO_LETTER = Optimize[spelled_no_tag @ del_letter]; 78 | -------------------------------------------------------------------------------- /src/ru/verbalizer/spoken_punct.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import 'ru/verbalizer/lexical_map.grm' as l; 16 | 17 | punct = 18 | ("." : "@@PERIOD@@") 19 | | ("," : "@@COMMA@@") 20 | | ("!" : "@@EXCLAMATION_MARK@@") 21 | | ("?" : "@@QUESTION_MARK@@") 22 | ; 23 | 24 | export SPOKEN_PUNCT = Optimize[punct @ l.LEXICAL_MAP]; 25 | -------------------------------------------------------------------------------- /src/ru/verbalizer/time.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import 'util/byte.grm' as b; 16 | import 'ru/verbalizer/lexical_map.grm' as l; 17 | import 'ru/verbalizer/numbers.grm' as n; 18 | 19 | # Only handles 24-hour time with quarter-to, half-past and quarter-past. 20 | 21 | increment_hour = 22 | ("0" : "1") 23 | | ("1" : "2") 24 | | ("2" : "3") 25 | | ("3" : "4") 26 | | ("4" : "5") 27 | | ("5" : "6") 28 | | ("6" : "7") 29 | | ("7" : "8") 30 | | ("8" : "9") 31 | | ("9" : "10") 32 | | ("10" : "11") 33 | | ("11" : "12") 34 | | ("12" : "1") # If someone uses 12, we assume 12-hour by default. 35 | | ("13" : "14") 36 | | ("14" : "15") 37 | | ("15" : "16") 38 | | ("16" : "17") 39 | | ("17" : "18") 40 | | ("18" : "19") 41 | | ("19" : "20") 42 | | ("20" : "21") 43 | | ("21" : "22") 44 | | ("22" : "23") 45 | | ("23" : "12") 46 | ; 47 | 48 | hours = Project[increment_hour, 'input']; 49 | 50 | d = b.kDigit; 51 | D = d - "0"; 52 | 53 | minutes09 = "0" D; 54 | 55 | minutes = ("1" | "2" | "3" | "4" | "5") d; 56 | 57 | __sep__ = ":"; 58 | sep_space = __sep__ : " "; 59 | 60 | verbalize_hours = hours @ n.CARDINAL_NUMBERS; 61 | 62 | verbalize_minutes = 63 | ("00" : "@@HOUR@@") 64 | | (minutes09 @ (("0" : "@@TIME_ZERO@@") n.I[" "] n.CARDINAL_NUMBERS)) 65 | | (minutes @ n.CARDINAL_NUMBERS) 66 | ; 67 | 68 | time_basic = Optimize[verbalize_hours sep_space verbalize_minutes]; 69 | 70 | # Special cases we handle right now. 
71 | # TODO: Need to allow for cases like 72 | # 73 | # half twelve (in the UK English sense) 74 | # half twaalf (in the Dutch sense) 75 | 76 | time_quarter_past = 77 | n.I["@@TIME_QUARTER@@ @@TIME_AFTER@@ "] 78 | verbalize_hours 79 | n.D[__sep__ "15"]; 80 | 81 | time_half_past = 82 | n.I["@@TIME_HALF@@ @@TIME_AFTER@@ "] 83 | verbalize_hours 84 | n.D[__sep__ "30"]; 85 | 86 | time_quarter_to = 87 | n.I["@@TIME_QUARTER@@ @@TIME_BEFORE@@ "] 88 | (increment_hour @ verbalize_hours) 89 | n.D[__sep__ "45"]; 90 | 91 | time_extra = Optimize[ 92 | time_quarter_past | time_half_past | time_quarter_to] 93 | ; 94 | 95 | # Basic time periods which most languages can be expected to have. 96 | __am__ = "a.m." | "am" | "AM" | "утра"; 97 | __pm__ = "p.m." | "pm" | "PM" | "вечера"; 98 | 99 | period = (__am__ : "@@TIME_AM@@") | (__pm__ : "@@TIME_PM@@"); 100 | 101 | time_variants = time_basic | time_extra; 102 | 103 | time = Optimize[ 104 | (period (" " | n.I[" "]))? time_variants 105 | | time_variants ((" " | n.I[" "]) period)?] 106 | ; 107 | 108 | export TIME = Optimize[time @ l.LEXICAL_MAP]; 109 | -------------------------------------------------------------------------------- /src/ru/verbalizer/urls.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Rules for URLs and email addresses. 
16 | 17 | import 'util/byte.grm' as bytelib; 18 | import 'ru/verbalizer/lexical_map.grm' as l; 19 | 20 | ins_space = "" : " "; 21 | dot = "." : "@@URL_DOT_EXPRESSION@@"; 22 | at = "@" : "@@AT@@"; 23 | 24 | url_suffix = 25 | (".com" : dot ins_space "com") | 26 | (".gov" : dot ins_space "gov") | 27 | (".edu" : dot ins_space "e d u") | 28 | (".org" : dot ins_space "org") | 29 | (".net" : dot ins_space "net") 30 | ; 31 | 32 | letter_string = (bytelib.kAlnum)* bytelib.kAlnum; 33 | 34 | letter_string_dot = 35 | ((letter_string ins_space dot ins_space)* letter_string) 36 | ; 37 | 38 | # Rules for URLs. 39 | export URL = Optimize[ 40 | ((letter_string_dot) (ins_space) 41 | (url_suffix)) @ l.LEXICAL_MAP 42 | ]; 43 | 44 | # Rules for email addresses. 45 | letter_by_letter = ((bytelib.kAlnum ins_space)* bytelib.kAlnum); 46 | 47 | letter_by_letter_dot = 48 | ((letter_by_letter ins_space dot ins_space)* 49 | letter_by_letter) 50 | ; 51 | 52 | export EMAIL1 = Optimize[ 53 | ((letter_by_letter) (ins_space) 54 | (at) (ins_space) 55 | (letter_by_letter_dot) (ins_space) 56 | (url_suffix)) @ l.LEXICAL_MAP 57 | ]; 58 | 59 | export EMAIL2 = Optimize[ 60 | ((letter_by_letter) (ins_space) 61 | (at) (ins_space) 62 | (letter_string_dot) (ins_space) 63 | (url_suffix)) @ l.LEXICAL_MAP 64 | ]; 65 | 66 | export EMAILS = Optimize[ 67 | EMAIL1 | EMAIL2 68 | ]; 69 | -------------------------------------------------------------------------------- /src/ru/verbalizer/verbalizer.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import 'util/util.grm' as util; 16 | import 'ru/verbalizer/extra_numbers.grm' as e; 17 | import 'ru/verbalizer/float.grm' as f; 18 | import 'ru/verbalizer/math.grm' as ma; 19 | import 'ru/verbalizer/miscellaneous.grm' as mi; 20 | import 'ru/verbalizer/money.grm' as mo; 21 | import 'ru/verbalizer/numbers.grm' as n; 22 | import 'ru/verbalizer/numbers_plus.grm' as np; 23 | import 'ru/verbalizer/spelled.grm' as s; 24 | import 'ru/verbalizer/spoken_punct.grm' as sp; 25 | import 'ru/verbalizer/time.grm' as t; 26 | import 'ru/verbalizer/urls.grm' as u; 27 | 28 | export VERBALIZER = Optimize[RmWeight[ 29 | ( e.MIXED_NUMBERS 30 | | e.DIGITS 31 | | f.FLOAT 32 | | ma.ARITHMETIC 33 | | mi.MISCELLANEOUS 34 | | mo.MONEY 35 | | n.CARDINAL_NUMBERS 36 | | n.ORDINAL_NUMBERS 37 | | np.NUMBERS_PLUS 38 | | s.SPELLED 39 | | sp.SPOKEN_PUNCT 40 | | t.TIME 41 | | u.URL) @ util.CLEAN_SPACES 42 | ]]; 43 | -------------------------------------------------------------------------------- /src/universal/README.md: -------------------------------------------------------------------------------- 1 | # Language-universal grammar definitions 2 | 3 | This directory contains various language-universal grammar definitions. 
4 | -------------------------------------------------------------------------------- /src/universal/roman_numerals.tsv: -------------------------------------------------------------------------------- 1 | i 1 2 | ii 2 3 | iii 3 4 | iv 4 5 | v 5 6 | vi 6 7 | vii 7 8 | viii 8 9 | ix 9 10 | x 10 11 | xi 11 12 | xii 12 13 | xiii 13 14 | xiv 14 15 | xv 15 16 | xvi 16 17 | xvii 17 18 | xviii 18 19 | xix 19 20 | xx 20 21 | xxi 21 22 | xxii 22 23 | xxiii 23 24 | xxiv 24 25 | xxv 25 26 | xxvi 26 27 | xxvii 27 28 | xxviii 28 29 | xxix 29 30 | xxx 30 31 | xxxi 31 32 | xxxii 32 33 | xxxiii 33 34 | xxxiv 34 35 | xxxv 35 36 | xxxvi 36 37 | xxxvii 37 38 | xxxviii 38 39 | xxxix 39 40 | xl 40 41 | xli 41 42 | xlii 42 43 | xliii 43 44 | xliv 44 45 | xlv 45 46 | xlvi 46 47 | xlvii 47 48 | xlviii 48 49 | xlix 49 50 | mcmxciv 1994 51 | mcmxcv 1995 52 | mcmxcvi 1996 53 | mcmxcvii 1997 54 | mcmxcviii 1998 55 | mcmxcix 1999 56 | mm 2000 57 | mmi 2001 58 | mmii 2002 59 | mmiii 2003 60 | mmiv 2004 61 | mmv 2005 62 | mmvi 2006 63 | mmvii 2007 64 | mmviii 2008 65 | mmix 2009 66 | mmx 2010 67 | mmxi 2011 68 | mmxii 2012 69 | mmxiii 2013 70 | mmxiv 2014 71 | mmxv 2015 72 | mmxvi 2016 73 | mmxvii 2017 74 | mmxviii 2018 75 | mmxix 2019 76 | mmxx 2020 77 | mmxxi 2021 78 | mmxxii 2022 79 | mmxxiii 2023 80 | mmxxiv 2024 81 | mmxxv 2025 82 | mmxxvi 2026 83 | mmxxvii 2027 84 | mmxxviii 2028 85 | mmxxix 2029 86 | mmxxx 2030 87 | mmxxxi 2031 88 | mmxxxii 2032 89 | mmxxxiii 2033 90 | mmxxxiv 2034 91 | mmxxxv 2035 92 | -------------------------------------------------------------------------------- /src/universal/thousands_punct.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Specifies common ways of delimiting thousands in digit strings. 16 | 17 | import 'util/byte.grm' as bytelib; 18 | import 'util/util.grm' as util; 19 | 20 | killcomma = "," : ""; 21 | dot2comma = "." : ","; 22 | spaces2comma = " "+ : ","; 23 | 24 | zero = "0"; 25 | 26 | # no_delimiter = zero | "[1-9][0-9]*"; 27 | export no_delimiter = zero | (util.d1to9 bytelib.kDigit*); 28 | 29 | # delim_map_dot = ("[0-9]" | ("\." : ","))*; 30 | delim_map_dot = (bytelib.kDigit | dot2comma)*; 31 | 32 | # delim_map_space = ("[0-9]" | (" +" : ","))*; 33 | delim_map_space = (bytelib.kDigit | spaces2comma)*; 34 | 35 | ## Western systems group thousands. Korean goes this way too. 36 | 37 | # comma_thousands = zero | ("[1-9][0-9]?[0-9]?" (("," : "") "[0-9][0-9][0-9]")*); 38 | export comma_thousands = zero | (util.d1to9 bytelib.kDigit{0,2} (killcomma bytelib.kDigit{3})*); 39 | 40 | # ComposeFst: 1st argument cannot match on output labels and 2nd argument 41 | # cannot match on input labels (sort?). 42 | export dot_thousands = delim_map_dot @ comma_thousands; 43 | 44 | # ComposeFst: 1st argument cannot match on output labels and 2nd argument 45 | # cannot match on input labels (sort?). 46 | export space_thousands = delim_map_space @ comma_thousands; 47 | 48 | ## Chinese prefers grouping by fours (by ten-thousands). 49 | 50 | # chinese_comma = 51 | # zero | ("[1-9][0-9]?[0-9]?[0-9]?" 
(("," : "") "[0-9][0-9][0-9][0-9]")*); 52 | export chinese_comma = zero | (util.d1to9 (bytelib.kDigit{0,3}) (killcomma bytelib.kDigit{4})*); 53 | 54 | ## The Indian system is more complex because of the Stravinskian alternation 55 | ## between lakhs and crores. 56 | ## 57 | ## According to Wikipedia: 58 | ## 59 | ## Indian English Value 60 | ## One 1 61 | ## Ten 10 62 | ## Hundred 100 63 | ## Thousand 1,000 64 | ## Lakh 1,00,000 65 | ## Crore 1,00,00,000 66 | ## Arab 1,00,00,00,000 67 | ## Kharab 1,00,00,00,00,000 68 | 69 | # indian_hundreds = "[1-9][0-9]?[0-9]?"; 70 | indian_hundreds = util.d1to9 bytelib.kDigit{0,2}; 71 | 72 | ## Up to 99,999. 73 | 74 | # indian_comma_thousands = "[1-9][0-9]?" ("," : "") "[0-9][0-9][0-9]"; 75 | indian_comma_thousands = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{3}; 76 | 77 | ## Up to 99,99,999. 78 | 79 | # indian_comma_lakhs = "[1-9][0-9]?" ("," : "") "[0-9][0-9]" ("," : "") "[0-9][0-9][0-9]"; 80 | indian_comma_lakhs = util.d1to9 bytelib.kDigit? killcomma bytelib.kDigit{2} killcomma bytelib.kDigit{3}; 81 | 82 | ## Up to 999,99,99,999 83 | 84 | indian_comma_crores = 85 | util.d1to9 bytelib.kDigit? bytelib.kDigit? killcomma 86 | (bytelib.kDigit{2} killcomma)? 87 | bytelib.kDigit{2} killcomma 88 | bytelib.kDigit{3} 89 | ; 90 | 91 | ## Up to 99,999,99,99,999. 92 | 93 | indian_comma_thousand_crores = 94 | util.d1to9 bytelib.kDigit? killcomma 95 | bytelib.kDigit{3} killcomma 96 | bytelib.kDigit{2} killcomma 97 | bytelib.kDigit{2} killcomma 98 | bytelib.kDigit{3} 99 | ; 100 | 101 | ## Up to 999,99,999,99,99,999. 102 | 103 | indian_comma_lakh_crores = 104 | util.d1to9 bytelib.kDigit? bytelib.kDigit? 
killcomma 105 | bytelib.kDigit{2} killcomma 106 | bytelib.kDigit{3} killcomma 107 | bytelib.kDigit{2} killcomma 108 | bytelib.kDigit{2} killcomma 109 | bytelib.kDigit{3} 110 | ; 111 | 112 | export indian_comma = 113 | zero 114 | | indian_hundreds 115 | | indian_comma_thousands 116 | | indian_comma_lakhs 117 | | indian_comma_crores 118 | | indian_comma_thousand_crores 119 | | indian_comma_lakh_crores 120 | ; 121 | 122 | # Indian number system with dots. 123 | export indian_dot_number = delim_map_dot @ indian_comma; 124 | 125 | # Indian number system with spaces. 126 | export indian_space_number = delim_map_space @ indian_comma; 127 | -------------------------------------------------------------------------------- /src/util/README.md: -------------------------------------------------------------------------------- 1 | # Utility grammar definitions 2 | 3 | This directory contains various utility grammar definitions. 4 | -------------------------------------------------------------------------------- /src/util/arithmetic.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Basic arithmetic on S-expressions. Exported arithmetic transducers may either: 16 | # 17 | # * Support weak vigesimal addition and multiplication... 
18 | # 19 | # (+ 20 17 +) -> 37 20 | # (+ 20 10 7 +) -> 37 21 | # (* 4 20 *) -> 80 22 | # 23 | # ...or not. 24 | # 25 | # * Support "Germanic decade flop" addition.... 26 | # 27 | # (+ 8 20 +) -> 28 28 | # (+ 4 60 +) -> 64 29 | # 30 | # ...or not. 31 | # 32 | # * Support multiplication where the left-hand side multiplicand is of a higher 33 | # order than the right-hand side multiplicand. 34 | # 35 | # (* 1000 100 *) -> 100000 36 | # 37 | # ...or not. 38 | # 39 | # However, modulo these exceptions, arithmetic transducers do not support 40 | # addition that requires "carrying", or multiplication where the right-hand 41 | # side multiplicand is not a power of ten. So this is not a *generic* 42 | # S-expression evaluator. 43 | # 44 | # LEAVES is a transducer that keeps the digit leaves (and the single spaces 45 | # between them) but deletes the "(+ ", " +)", "(* " and " *)" markup. 46 | # 47 | # REPEAT_FILTER is an acceptor which blocks derivations of the form 48 | # 49 | # (+ (* 50 1000 *) (* 4 1000 *) ...) "fifty thousand four thousand..." 50 | # 51 | # in languages where that is not licensed. 52 | 53 | import 'util/byte.grm' as b; 54 | 55 | # Deleter FST. 56 | func D[expr] { 57 | return expr : ""; 58 | } 59 | 60 | delta = b.kDigit; 61 | sigma = delta | " " | "(" | ")" | "+" | "*"; 62 | 63 | sigmastar = sigma*; 64 | deltastar = delta*; 65 | 66 | rparen = Optimize["+)" | "*)"]; 67 | space_or_rparen = Optimize[" " | rparen]; 68 | 69 | ## Multiplication. 70 | 71 | # Generic multiplication where the RHS is a power of ten.
72 | 73 | del_one = Optimize[delta+ D[" 1"] "0"+]; 74 | 75 | test1_1 = AssertEqual["2 10" @ del_one, "20"]; 76 | test1_2 = AssertEqual["20 10" @ del_one, "200"]; 77 | test1_3 = AssertEqual["2 100" @ del_one, "200"]; 78 | test1_4 = AssertEqual["20 100" @ del_one, "2000"]; 79 | test1_5 = AssertEqual["200 100" @ del_one, "20000"]; 80 | test1_6 = AssertEqual["2 1000" @ del_one, "2000"]; 81 | test1_7 = AssertEqual["20 1000" @ del_one, "20000"]; 82 | test1_8 = AssertEqual["200 1000" @ del_one, "200000"]; 83 | test1_9 = AssertEqual["2000 1000" @ del_one, "2000000"]; 84 | 85 | # Generic multiplication where the RHS is a power of ten and the LHS has fewer 86 | # trailing zeros than the RHS; as written, the LHS (at most five digits) may have no more digits than the RHS has zeros. 87 | del_one_restricted = Optimize[ # e.g., "2 x 10", "2 x 100", etc. 88 | delta D[" 1"] "0"+ | 89 | # e.g., "20 x 100", etc. 90 | delta{1,2} D[" 1"] "0" "0"+ | 91 | # e.g., "200 x 1000", etc. 92 | delta{2,3} D[" 1"] "0"{2} "0"+ | 93 | delta{3,4} D[" 1"] "0"{3} "0"+ | 94 | delta{4,5} D[" 1"] "0"{4} "0"+]; 95 | 96 | test2_01 = AssertEqual["2 10" @ del_one_restricted, "20"]; 97 | test2_02 = AssertNull["20 10" @ del_one_restricted]; 98 | test2_03 = AssertEqual["2 100" @ del_one_restricted, "200"]; 99 | test2_04 = AssertEqual["20 100" @ del_one_restricted, "2000"]; 100 | test2_05 = AssertNull[ "200 100" @ del_one_restricted]; 101 | test2_06 = AssertEqual["2 1000" @ del_one_restricted, "2000"]; 102 | test2_07 = AssertEqual["20 1000" @ del_one_restricted, "20000"]; 103 | test2_08 = AssertEqual["200 1000" @ del_one_restricted, "200000"]; 104 | test2_09 = AssertNull["2000 1000" @ del_one_restricted]; 105 | test2_10 = AssertEqual["1000 10000000" @ del_one_restricted, "10000000000"]; 106 | 107 | # Multiplication of vigesimal base for weak vigesimal systems. 108 | 109 | vigesimal_times_map = ("1" : "2") | ("2" : "4") | ("3" : "6") | ("4" : "8"); 110 | 111 | del_two = Optimize[vigesimal_times_map D[" 2"] "0"+]; 112 | 113 | test3_1 = AssertEqual["1 20" @ del_two, "20"]; 114 | test3_2 = AssertEqual["2
20" @ del_two, "40"]; 115 | test3_3 = AssertEqual["3 20" @ del_two, "60"]; 116 | test3_4 = AssertEqual["4 20" @ del_two, "80"]; 117 | 118 | # Multiplication of vigesimal base restricted to cases where the LHS is [1-4] 119 | # and the RHS is 2 followed by one or more zeros (20, 200, ...). Currently identical to del_two. 120 | 121 | del_two_restricted = Optimize[vigesimal_times_map D[" 2"] "0"+]; 122 | 123 | test4_1 = AssertEqual["1 20" @ del_two_restricted, "20"]; 124 | test4_2 = AssertEqual["2 20" @ del_two_restricted, "40"]; 125 | test4_3 = AssertEqual["3 20" @ del_two_restricted, "60"]; 126 | test4_4 = AssertEqual["4 20" @ del_two_restricted, "80"]; 127 | test4_5 = AssertNull["5 20" @ del_two_restricted]; 128 | test4_6 = AssertNull["10 20" @ del_two_restricted]; 129 | 130 | products = del_one | del_two; 131 | products_restricted = del_one_restricted | del_two_restricted; 132 | 133 | multiplication = CDRewrite[D["(* "] products D[" *)"], "", "", sigmastar]; 134 | multiplication_restricted = CDRewrite[D["(* "] products_restricted D[" *)"], 135 | "", "", sigmastar]; 136 | 137 | test5_1 = AssertEqual["(* 8 100 *)" @ multiplication, "800"]; 138 | test5_2 = AssertEqual["(* 1 100 *)" @ multiplication, "100"]; 139 | test5_3 = AssertEqual["(* 4 20 *)" @ multiplication, "80"]; 140 | test5_4 = AssertEqual["(* 13 1000 *)" @ multiplication, "13000"]; 141 | test5_5 = AssertEqual["(* 13000 10 *)" @ multiplication, "130000"]; 142 | test5_6 = AssertEqual["(* 13000 10 *)" @ multiplication_restricted, 143 | "(* 13000 10 *)"]; # Can't reduce this. 144 | 145 | ## Addition. 146 | 147 | insum = "+" (sigma - "(")*; 148 | rcon = insum deltastar; 149 | 150 | # Generic zero deletion, for runs of up to 12 zeros. 151 | del_zero = Optimize[ 152 | # Handles lone zero inside a plus statement. 153 | CDRewrite[D[" 0"], rcon, space_or_rparen, sigmastar] @ 154 | # If we need to go any larger, we probably should switch to a PDT.
155 | CDRewrite[D["0"{12} " "] delta{12}, rcon, space_or_rparen, sigmastar] @ 156 | CDRewrite[D["0"{11} " "] delta{11}, rcon, space_or_rparen, sigmastar] @ 157 | CDRewrite[D["0"{10} " "] delta{10}, rcon, space_or_rparen, sigmastar] @ 158 | CDRewrite[D["0"{9} " "] delta{9}, rcon, space_or_rparen, sigmastar] @ 159 | CDRewrite[D["0"{8} " "] delta{8}, rcon, space_or_rparen, sigmastar] @ 160 | CDRewrite[D["0"{7} " "] delta{7}, rcon, space_or_rparen, sigmastar] @ 161 | CDRewrite[D["0"{6} " "] delta{6}, rcon, space_or_rparen, sigmastar] @ 162 | CDRewrite[D["0"{5} " "] delta{5}, rcon, space_or_rparen, sigmastar] @ 163 | CDRewrite[D["0"{4} " "] delta{4}, rcon, space_or_rparen, sigmastar] @ 164 | CDRewrite[D["0"{3} " "] delta{3}, rcon, space_or_rparen, sigmastar] @ 165 | CDRewrite[D["0"{2} " "] delta{2}, rcon, space_or_rparen, sigmastar] @ 166 | CDRewrite[D["0" " "] delta, rcon, space_or_rparen, sigmastar]]; 167 | 168 | ## Weak vigesimal cases involving scores and teens. 169 | 170 | vigesimal_plus_map = Optimize[("20 1" : "3") delta | 171 | ("40 1" : "5") delta | 172 | ("60 1" : "7") delta | 173 | ("80 1" : "9") delta]; 174 | 175 | vigesimal = CDRewrite[vigesimal_plus_map, insum, space_or_rparen, sigmastar]; 176 | 177 | ## Germanic decade flop. 178 | 179 | germanic_map = StringFile['util/germanic.tsv']; 180 | 181 | germanic = CDRewrite[germanic_map, insum, space_or_rparen, sigmastar]; 182 | 183 | sums = Optimize[germanic @ vigesimal @ del_zero]; 184 | 185 | # Deletes the surrounding "(+ +)" around a successful reduction.
186 | 187 | del_plus = CDRewrite[D["(+ "] delta+ D[" +)"], "", "", sigmastar]; 188 | 189 | addition = Optimize[sums @ del_plus]; 190 | 191 | test6_1 = AssertEqual["(+ 30 2 +)" @ addition, "32"]; 192 | test6_2 = AssertEqual["(+ 300 20 1 +)" @ addition, "321"]; 193 | test6_3 = AssertEqual["(+ 80 17 +)" @ addition, "97"]; 194 | test6_4 = AssertEqual["(+ 4 50 +)" @ addition, "54"]; 195 | test6_5 = AssertEqual["(+ 3000 80 17 +)" @ addition, "3097"]; 196 | test6_6 = AssertEqual["(+ 3000 4 50 +)" @ addition, "3054"]; 197 | test6_7 = AssertEqual["(+ 0 10 +)" @ addition, "10"]; 198 | test6_8 = AssertEqual["(+ 0 20 +)" @ addition, "20"]; 199 | test6_9 = AssertEqual["(+ 200 (+ 0 20 +) +)" @ addition @ addition, "220"]; 200 | 201 | ## Export statements. 202 | 203 | export ARITHMETIC = Optimize[multiplication @ addition]; 204 | export ARITHMETIC_RESTRICTED = Optimize[multiplication_restricted @ addition]; 205 | 206 | # Lightweight versions that lack the vigesimal /vɪˈdʒɛsɪməl/ or Germanic decade 207 | # flop, or both. 208 | 209 | export ARITHMETIC_BASIC = Optimize[multiplication @ del_zero @ del_plus]; 210 | export ARITHMETIC_BASIC_RESTRICTED = Optimize[multiplication_restricted @ 211 | del_zero @ del_plus]; 212 | 213 | export ARITHMETIC_GERMANIC = Optimize[multiplication @ germanic @ del_zero @ 214 | del_plus]; 215 | 216 | export ARITHMETIC_GERMANIC_RESTRICTED = Optimize[multiplication_restricted @ 217 | germanic @ del_zero @ 218 | del_plus]; 219 | 220 | export ARITHMETIC_VIGESIMAL = Optimize[multiplication @ vigesimal @ del_zero @ 221 | del_plus]; 222 | export ARITHMETIC_VIGESIMAL_RESTRICTED = Optimize[multiplication_restricted @ 223 | vigesimal @ del_zero @ 224 | del_plus]; 225 | 226 | ## LEAVES transducer. 
227 | 228 | nonterm = "+" | "*"; 229 | export LEAVES = Optimize[CDRewrite["(" nonterm " " | " " nonterm ")" : "", 230 | "", "", sigmastar]]; 231 | 232 | test7 = AssertEqual["(* (+ (* 4 20 *) 10 7 +) 1000 *)" @ LEAVES, 233 | "4 20 10 7 1000"]; 234 | 235 | ## Optional filter for repeated large powers of ten, to be applied to leaves. 236 | 237 | func Filter[expr, sigstar] { 238 | return Optimize[sigstar - (sigstar expr sigstar)]; 239 | } 240 | 241 | func FilterMoreThanOne[expr, sigstar] { 242 | return Filter[expr " " (sigstar " ")? expr, sigstar]; 243 | } 244 | 245 | filter_sigstar = (delta | " ")*; 246 | 247 | export REPEAT_FILTER = 248 | Optimize[FilterMoreThanOne["1000", filter_sigstar] @ 249 | FilterMoreThanOne["10000", filter_sigstar] @ 250 | FilterMoreThanOne["100000", filter_sigstar] @ 251 | FilterMoreThanOne["1000000", filter_sigstar] @ 252 | FilterMoreThanOne["1000000000", filter_sigstar] @ 253 | FilterMoreThanOne["1000000000000", filter_sigstar]]; 254 | 255 | test8_1 = AssertNull["50 1000 4 1000" @ REPEAT_FILTER]; 256 | test8_2 = AssertNull["50 1000000 4 1000000" @ REPEAT_FILTER]; 257 | test8_3 = AssertEqual["50 100 1000" @ REPEAT_FILTER, "50 100 1000"]; 258 | test8_4 = AssertNull["20 1000 1000 20" @ REPEAT_FILTER]; 259 | test8_5 = AssertEqual[ 260 | "70 1000000 400 0 70 0 7 1000 100 0 70" @ REPEAT_FILTER, 261 | "70 1000000 400 0 70 0 7 1000 100 0 70"]; # Expected side is the literal string (as in test8_3) so the check is not tautological. 262 | test8_6 = AssertNull[ 263 | "70 1000000 400 0 70 1000 0 7 1000 100 0 70" @ REPEAT_FILTER]; 264 | 265 | # Filters to force the output of *inverting* the arithmetic as applied to a 266 | # digit string to be a well-formed sexpr: 267 | 268 | not_space = b.kNotSpace; 269 | 270 | # Things like (+ 1 +)(+ 9 +). 271 | 272 | bad_parens = 273 | sigmastar ")" not_space sigmastar 274 | | sigmastar not_space "(" sigmastar 275 | ; 276 | 277 | no_bad_parens = sigmastar - bad_parens; 278 | 279 | # Things like (+ 1 +) or (* 3 *).
280 | 281 | spurious_operators = 282 | sigmastar "(+ " delta+ " +)" sigmastar 283 | | sigmastar "(* " delta+ " *)" sigmastar 284 | ; 285 | 286 | no_spurious_operators = sigmastar - spurious_operators; 287 | 288 | no_strings_of_zeros = 289 | sigmastar - (sigmastar " " "0"+ " " "0"+ " " sigmastar) 290 | ; 291 | 292 | no_bad_sequences = 293 | Optimize[no_bad_parens @ no_strings_of_zeros] 294 | ; 295 | 296 | export SEXP_FILTER = Optimize[ 297 | ( delta+ 298 | | "(* " no_bad_sequences " *)" 299 | | "(+ " no_bad_sequences " +)") @ no_spurious_operators] 300 | ; 301 | 302 | # For convenience adds inverses of the arithmetic rules: 303 | 304 | export IARITHMETIC = Invert[ARITHMETIC]; 305 | 306 | export IARITHMETIC_RESTRICTED = Invert[ARITHMETIC_RESTRICTED]; 307 | 308 | export IARITHMETIC_BASIC = Invert[ARITHMETIC_BASIC]; 309 | 310 | export IARITHMETIC_BASIC_RESTRICTED = Invert[ARITHMETIC_BASIC_RESTRICTED]; 311 | 312 | export IARITHMETIC_GERMANIC = Invert[ARITHMETIC_GERMANIC]; 313 | 314 | export IARITHMETIC_GERMANIC_RESTRICTED = 315 | Invert[ARITHMETIC_GERMANIC_RESTRICTED] 316 | ; 317 | 318 | export IARITHMETIC_VIGESIMAL = Invert[ARITHMETIC_VIGESIMAL]; 319 | 320 | export IARITHMETIC_VIGESIMAL_RESTRICTED = 321 | Invert[ARITHMETIC_VIGESIMAL_RESTRICTED] 322 | ; 323 | 324 | ## This should be applied on the lefthand side of FG to ensure that only 325 | ## digit input is permitted. 326 | export DELTA_STAR = deltastar; 327 | -------------------------------------------------------------------------------- /src/util/byte.grm: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Standard constants for ASCII (byte) based strings. This mirrors the
# functions provided by C/C++'s ctype.h library.

# Every single byte value from 1 through 255, laid out sixteen per row.
# Byte [0] is deliberately left out: matching the string-termination
# character is kinda weird.
export kBytes = Optimize[
  "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" |
  "[9]" | "[10]" | "[11]" | "[12]" | "[13]" | "[14]" | "[15]" |
  "[16]" | "[17]" | "[18]" | "[19]" | "[20]" | "[21]" | "[22]" | "[23]" |
  "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" | "[31]" |
  "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" |
  "[40]" | "[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" |
  "[48]" | "[49]" | "[50]" | "[51]" | "[52]" | "[53]" | "[54]" | "[55]" |
  "[56]" | "[57]" | "[58]" | "[59]" | "[60]" | "[61]" | "[62]" | "[63]" |
  "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" | "[71]" |
  "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" |
  "[80]" | "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" |
  "[88]" | "[89]" | "[90]" | "[91]" | "[92]" | "[93]" | "[94]" | "[95]" |
  "[96]" | "[97]" | "[98]" | "[99]" | "[100]" | "[101]" | "[102]" | "[103]" |
  "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" | "[111]" |
  "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" |
  "[120]" | "[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" |
  "[128]" | "[129]" | "[130]" | "[131]" | "[132]" | "[133]" | "[134]" | "[135]" |
  "[136]" | "[137]" | "[138]" | "[139]" | "[140]" | "[141]" | "[142]" | "[143]" |
  "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" | "[151]" |
  "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" |
  "[160]" | "[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" |
  "[168]" | "[169]" | "[170]" | "[171]" | "[172]" | "[173]" | "[174]" | "[175]" |
  "[176]" | "[177]" | "[178]" | "[179]" | "[180]" | "[181]" | "[182]" | "[183]" |
  "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" | "[191]" |
  "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | "[198]" | "[199]" |
  "[200]" | "[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" |
  "[208]" | "[209]" | "[210]" | "[211]" | "[212]" | "[213]" | "[214]" | "[215]" |
  "[216]" | "[217]" | "[218]" | "[219]" | "[220]" | "[221]" | "[222]" | "[223]" |
  "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" | "[231]" |
  "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" |
  "[240]" | "[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" |
  "[248]" | "[249]" | "[250]" | "[251]" | "[252]" | "[253]" | "[254]" | "[255]"
];

# The ten ASCII decimal digits.
export kDigit = Optimize[
  "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
];

# The 26 lower-case ASCII letters.
export kLower = Optimize[
  "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
  "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
  "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
];

# The 26 upper-case ASCII letters.
export kUpper = Optimize[
  "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
  "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
  "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
];

# Any ASCII letter, either case.
export kAlpha = Optimize[kUpper | kLower];

# Any ASCII letter or digit.
export kAlnum = Optimize[kAlpha | kDigit];

# Whitespace: exactly space, tab, newline and carriage return.
export kSpace = Optimize[
  " " | "\t" | "\n" | "\r"
];

# Any byte in kBytes that is not one of the four kSpace characters.
export kNotSpace = Optimize[kBytes - kSpace];

# ASCII punctuation, one row per contiguous code range. The square brackets,
# the backslash and the double quote are escaped because they are special
# inside Thrax string literals.
export kPunct = Optimize[
  "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," |
  "-" | "." | "/" |
  ":" | ";" | "<" | "=" | ">" | "?" | "@" |
  "\[" | "\\" | "\]" | "^" | "_" | "`" |
  "{" | "|" | "}" | "~"
];

# Letters, digits and punctuation (everything in kAlnum or kPunct).
export kGraph = Optimize[kPunct | kAlnum];

--------------------------------------------------------------------------------
/src/util/germanic.tsv:
--------------------------------------------------------------------------------
1	10	11
2	10	12
3	10	13
4	10	14
5	10	15
6	10	16
7	10	17
8	10	18
9	10	19
1	20	21
2	20	22
3	20	23
4	20	24
5	20	25
6	20	26
7	20	27
8	20	28
9	20	29
1	30	31
2	30	32
3	30	33
4	30	34
5	30	35
6	30	36
7	30	37
8	30	38
9	30	39
1	40	41
2	40	42
3	40	43
4	40	44
5	40	45
6	40	46
7	40	47
8	40	48
9	40	49
1	50	51
2	50	52
3	50	53
4	50	54
5	50	55
6	50	56
7	50	57
8	50	58
9	50	59
1	60	61
2	60	62
3	60	63
4	60	64
5	60	65
6	60	66
7	60	67
8	60	68
9	60	69
1	70	71
2	70	72
3	70	73
4	70	74
5	70	75
6	70	76
7	70	77
8	70	78
9	70	79
1	80	81
2	80	82
3	80	83
4	80	84
5	80	85
6	80	86
7	80	87
8	80	88
9	80	89
1	90	91
2	90	92
3	90	93
4	90	94
5	90	95
6	90	96
7	90	97
8	90	98
9	90	99

--------------------------------------------------------------------------------
/src/util/util.grm:
--------------------------------------------------------------------------------
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Utility functions.

import 'util/byte.grm' as bytelib;
import 'util/case.grm' as case;

# Shorthand for a context-dependent rewrite of t between left context l and
# right context r over alphabet s, with the direction fixed to left-to-right
# ('ltr') and the application mode fixed to obligatory ('obl').
func CDR[t, l, r, s] {
  return CDRewrite[t, l, r, s, 'ltr', 'obl'];
}

# Insertion helper: maps the empty string to expr.
func I[expr] {
  return ("" : expr);
}

# Deletion helper: maps expr to the empty string.
func D[expr] {
  return (expr : "");
}

# A machine that accepts nothing (the empty string minus itself).
export NULL = Optimize["" - ""];

# Digit classes. The name spells out which digits are kept:
#   d1to9  - every digit except "0"
#   d02to9 - every digit except "1"
#   d2to9  - every digit except "0" and "1"
export d1to9 = Optimize[bytelib.kDigit - "0"];
export d02to9 = Optimize[bytelib.kDigit - "1"];
export d2to9 = Optimize[d1to9 - "1"];

# Any number that isn't zero. May have leading zeroes.
export non_zero_number = Optimize["0"* d1to9 bytelib.kDigit*];

# Any number, allowing for factorization markers: a possibly empty run of
# digits, literal square brackets and the letter "E".
export factorized_number = Optimize[(bytelib.kDigit | "\[" | "E" | "\]")*];

# As factorized_number, but required to start with optional zeroes followed by
# a non-zero digit.
export non_zero_factorized_number = Optimize["0"* d1to9 factorized_number];

# Fixed insertions, built with the I[] helper above.
export ins_space = I[" "];
export ins_sil = I[" sil "];
export ins_short_sil = I[" sil|short "];
export ins_quote = I["\""];

# Caveat: pass_anything does not pass stuff like "[~~]".
# Pass-through acceptors: any byte sequence (possibly empty), and any
# non-empty run of non-space bytes.
export pass_anything = bytelib.kBytes*;
export pass_any_word = bytelib.kNotSpace+;

# Whitespace passed through unchanged: one-or-more / zero-or-more.
export pass_space_plus = bytelib.kSpace+;
export pass_space_star = bytelib.kSpace*;

# Whitespace deleted: exactly one / one-or-more / zero-or-more.
export clear_space = D[bytelib.kSpace];
export clear_space_plus = D[bytelib.kSpace+];
export clear_space_star = D[bytelib.kSpace*];

# Alphanumerics pass through; each literal space becomes an underscore.
export space_to_underscore = ((" " : "_") | bytelib.kAlnum)*;

# Deletes any amount of whitespace and then inserts a single space.
export one_space = clear_space_star ins_space;

# Accepts either the empty string or a sequence of words: leading and trailing
# whitespace is deleted and every whitespace run between words is rewritten to
# a single space.
export CLEAN_SPACES = Optimize[
  "" |
  (clear_space_star
   (pass_any_word (pass_space_plus : " "))*
   pass_any_word
   clear_space_star)
];

# Deletion of literal spaces only (not tabs or newlines).
export del_space_star = D[" "*];
export del_space_plus = D[" "+];

# The alphabet closure handed to the context-dependent rewrites below.
export sigma_star = Optimize[pass_anything];

# Deletes whitespace runs everywhere in the string.
export DELETE_SPACES =
    CDRewrite[clear_space_plus, "", "", sigma_star];

# Deletes whitespace at the beginning of the string only.
export REMOVE_LEADING_SPACES =
    CDRewrite[clear_space_plus, "[BOS]", "", sigma_star];

# Deletes whitespace at the end of the string only.
export REMOVE_FINAL_SPACES =
    CDRewrite[clear_space_plus, "", "[EOS]", sigma_star];

# Leading-space removal followed by final-space removal.
export REMOVE_BOUNDARY_SPACES = REMOVE_LEADING_SPACES @ REMOVE_FINAL_SPACES;

# Deletes a "0" that sits at the beginning of the string and is followed by
# another digit.
export delete_initial_zero =
    CDRewrite[D["0"], "[BOS]", bytelib.kDigit, sigma_star];

# Case handling. The tolower/LOWER/TOLOWER and toupper/UPPER/TOUPPER machines
# come from util/case.grm and are not defined in this file.
export lower_case_letter = Optimize[case.tolower | case.LOWER | bytelib.kLower];
export lower_case = Optimize[lower_case_letter+];
export lower_case_anything = case.TOLOWER;

export upper_case_letter = Optimize[case.toupper | case.UPPER | bytelib.kUpper];
export upper_case = Optimize[upper_case_letter+];
export upper_case_anything = case.TOUPPER;

# Delimiters deleted together with any literal spaces on either side.
export opening_brace = del_space_star D["{"] del_space_star;
export closing_brace = del_space_star D["}"] del_space_star;

export quote = del_space_star D["\""] del_space_star;
export double_quote = del_space_star D["\"\""] del_space_star;

# Vowel and consonant classes.
# NOTE(review): CONSONANTS is computed against VOWELS, so "y" counts as a
# consonant there, whereas CONSONANTS_INSENSITIVE is computed against
# VOWELS_INSENSITIVE, which contains "y" and "Y", so neither is a consonant
# there. Confirm with callers whether this asymmetry is intended.
export VOWELS = Optimize["a" | "e" | "i" | "o" | "u"];
export VOWELS_Y = Optimize["a" | "e" | "i" | "o" | "u" | "y"];
export VOWELS_INSENSITIVE = Optimize[
  VOWELS_Y | "A" | "E" | "I" | "O" | "U" | "Y"
];
export CONSONANTS = Optimize[bytelib.kLower - VOWELS];
export CONSONANTS_INSENSITIVE = Optimize[bytelib.kAlpha - VOWELS_INSENSITIVE];

# LSEQs that can be used for URL verbalization for all languages;
# mainly protocol names & file extensions.
export URL_LSEQS = Optimize[
  "www" | "edu" | "ftp" | "htm" | "html" | "imdb" | "php" |
  "asp" | "aspx" | "bbc" | "cgi" | "xhtml" | "shtml" | "jsp"
];

# Rule for swapping cardinal to decimal; useful for measures where
# both can appear in the proto but may be handled similarly. It first rewrites
# "cardinal" to "decimal" and then "integer:" to "integer_part:".
export CARDINAL_TO_DECIMAL = Optimize[
  CDRewrite[("cardinal" : "decimal"), "", "", sigma_star] @
  CDRewrite[("integer:" : "integer_part:"), "", "", sigma_star]
];

# Puts a backslash in front of every double quote and every backslash; all
# other bytes pass through unchanged.
export escape_quotes_and_backslashes =
    (("\"" : "\\\"") | ("\\" : "\\\\") | (bytelib.kBytes - "\"" - "\\"))*
;

## Generally useful definitions:

# Hour strings "0" through "24", written without a leading zero.
export hours =
    "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" |
    "10" | "11" | "12" | "13" | "14" | "15" | "16" | "17" | "18" | "19" |
    "20" | "21" | "22" | "23" | "24"
;

# Each hour mapped to the following hour; "23" becomes "24" and "24" becomes
# "1".
export hours_shift =
    ("0" : "1") | ("1" : "2") | ("2" : "3") | ("3" : "4") | ("4" : "5") |
    ("5" : "6") | ("6" : "7") | ("7" : "8") | ("8" : "9") | ("9" : "10") |
    ("10" : "11") | ("11" : "12") | ("12" : "13") | ("13" : "14") |
    ("14" : "15") | ("15" : "16") | ("16" : "17") | ("17" : "18") |
    ("18" : "19") | ("19" : "20") | ("20" : "21") | ("21" : "22") |
    ("22" : "23") | ("23" : "24") | ("24" : "1")
;

# 24-hour value to 12-hour value: "0" and "24" both become "12", "1" through
# "12" are unchanged, and "13" through "23" become "1" through "11".
export hours_24_to_12 =
    ("0" : "12") |
    "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10" | "11" | "12" |
    ("13" : "1") | ("14" : "2") | ("15" : "3") | ("16" : "4") |
    ("17" : "5") | ("18" : "6") | ("19" : "7") | ("20" : "8") |
    ("21" : "9") | ("22" : "10") | ("23" : "11") | ("24" : "12")
;

# 24-hour value to the 12-hour value of the following hour.
export hours_24_to_12_next =
    ("0" : "1") | ("1" : "2") | ("2" : "3") | ("3" : "4") | ("4" : "5") |
    ("5" : "6") | ("6" : "7") | ("7" : "8") | ("8" : "9") | ("9" : "10") |
    ("10" : "11") | ("11" : "12") | ("12" : "1") |
    ("13" : "2") | ("14" : "3") | ("15" : "4") | ("16" : "5") |
    ("17" : "6") | ("18" : "7") | ("19" : "8") | ("20" : "9") |
    ("21" : "10") | ("22" : "11") | ("23" : "12") | ("24" : "1")
;

# Minute strings "0" through "59", written without a leading zero.
export minutes =
    "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" |
    "10" | "11" | "12" | "13" | "14" | "15" | "16" | "17" | "18" | "19" |
    "20" | "21" | "22" | "23" | "24" | "25" | "26" | "27" | "28" | "29" |
    "30" | "31" | "32" | "33" | "34" | "35" | "36" | "37" | "38" | "39" |
    "40" | "41" | "42" | "43" | "44" | "45" | "46" | "47" | "48" | "49" |
    "50" | "51" | "52" | "53" | "54" | "55" | "56" | "57" | "58" | "59"
;

# Minutes "1" through "57" that are not multiples of five, mapped to the
# nearest multiple of five. Each row lists the inputs that share one target.
# Exact multiples of five are handled by unrounded_minutes, and "58"/"59" by
# round_minutes_next_hour.
export round_minutes =
    (("1" | "2") : "0") |
    (("3" | "4" | "6" | "7") : "5") |
    (("8" | "9" | "11" | "12") : "10") |
    (("13" | "14" | "16" | "17") : "15") |
    (("18" | "19" | "21" | "22") : "20") |
    (("23" | "24" | "26" | "27") : "25") |
    (("28" | "29" | "31" | "32") : "30") |
    (("33" | "34" | "36" | "37") : "35") |
    (("38" | "39" | "41" | "42") : "40") |
    (("43" | "44" | "46" | "47") : "45") |
    (("48" | "49" | "51" | "52") : "50") |
    (("53" | "54" | "56" | "57") : "55")
;

# Minutes that are already a multiple of five, mapped to themselves.
export unrounded_minutes =
    ("0" : "0") | ("5" : "5") | ("10" : "10") | ("15" : "15") |
    ("20" : "20") | ("25" : "25") | ("30" : "30") | ("35" : "35") |
    ("40" : "40") | ("45" : "45") | ("50" : "50") | ("55" : "55")
;

# "58" and "59" become "0"; approx2 below pairs this with hours_shift.
export round_minutes_next_hour =
    ("58" : "0") | ("59" : "0")
;

# For minutes "30" through "59", the number of minutes left until the full
# hour; "30" maps to itself.
export subtract_from_60 =
    "30" |
    ("31" : "29") | ("32" : "28") | ("33" : "27") | ("34" : "26") |
    ("35" : "25") | ("36" : "24") | ("37" : "23") | ("38" : "22") |
    ("39" : "21") | ("40" : "20") | ("41" : "19") | ("42" : "18") |
    ("43" : "17") | ("44" : "16") | ("45" : "15") | ("46" : "14") |
    ("47" : "13") | ("48" : "12") | ("49" : "11") | ("50" : "10") |
    ("51" : "9") | ("52" : "8") | ("53" : "7") | ("54" : "6") |
    ("55" : "5") | ("56" : "4") | ("57" : "3") | ("58" : "2") |
    ("59" : "1")
;

# Month numbers "1" through "12". A leading zero in front of a single-digit
# month is accepted and deleted.
export any_month =
    (("0" : "")?
     ("1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9")) |
    "10" | "11" | "12"
;

# Day-of-month numbers "1" through "31". A leading zero in front of a
# single-digit day is accepted and deleted.
export any_day =
    (("0" : "")?
     ("1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9")) |
    "10" | "11" | "12" | "13" | "14" | "15" | "16" | "17" | "18" | "19" |
    "20" | "21" | "22" | "23" | "24" | "25" | "26" | "27" | "28" | "29" |
    "30" | "31"
;

## TODO: These rules need to be coordinated with the markup since that may
## change.

# Marker inserted in front of a minute value that has been rounded.
export approximately = "[~~]";

## Rounded: say "approximately". The minutes are rounded to a multiple of five
## and the hour is passed through unchanged.

approx1 = Optimize[
  "minutes:" I[approximately] (minutes @ round_minutes)
  "|"
  "hours:" hours
  "|"
  pass_anything
];

## Rounded to next hour: minutes "58"/"59" become "0" and the hour is shifted
## with hours_shift.

approx2 = Optimize[
  "minutes:" I[approximately] round_minutes_next_hour
  "|"
  "hours:" hours_shift
  "|"
  pass_anything
];

## Not rounded: don't say "approximately". Only minutes that are already a
## multiple of five are accepted.

approx3 = Optimize[
  "minutes:" (minutes @ unrounded_minutes)
  "|"
  "hours:" hours
  "|"
  pass_anything
];

# Union of the three cases above.
export approx = Optimize[
  approx1 | approx2 | approx3
];

# "|" and "\" are escaped in the new serialization scheme using a backslash, so
# we need to adjust these in the verbatim mappings.

# Wraps raw_mappings so that it can be applied to escaped input.
#
# The front end accepts either
#   * an input string of raw_mappings other than a bare backslash or a bare
#     pipe, which is passed through unchanged, or
#   * one of the escaped forms backslash-backslash / backslash-pipe, which is
#     rewritten to the bare backslash / pipe,
# and its output is then composed with raw_mappings.
func EscapedMappings[raw_mappings] {
  unescape = ("\\\\" : "\\") | ("\\|" : "|");
  mapped_inputs = Project[raw_mappings, 'input'];
  bare_specials = Project[unescape, 'output'];
  return Optimize[
    ((mapped_inputs - bare_specials) | unescape) @ raw_mappings
  ];
}

# Allows verbatim grammars to be more permissive by accepting all inputs: any
# single byte of bytelib.kBytes that is not an input of raw_mappings is
# consumed (deleted), and that deletion carries the weight <20>.

func ConsumeUnmapped[raw_mappings] {
  mapped_inputs = Project[raw_mappings, 'input'];
  unmapped = bytelib.kBytes - mapped_inputs;
  return Optimize[
    D[unmapped]<20>
  ];
}

--------------------------------------------------------------------------------