├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── g2p_id ├── __init__.py ├── data │ └── dict.json ├── g2p.py ├── model │ └── bert_pron.onnx └── syllable_splitter.py ├── setup.py └── test_g2p.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include 
Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .DS_Store 163 | .backup/ 164 | .data/ 165 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. 
Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. 
Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. 
For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. 
Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 
195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 
235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. 
The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. 
automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. 
No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. 
438 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include g2p_id/data/dict.json 2 | include g2p_id/model/bert_pron.onnx 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Indonesian Grapheme-to-Phoneme 2 | 3 | This module is designed to convert Indonesian graphemes (spelling) into phonemes (pronunciation). Fortunately, most Indonesian word pronunciations can be inferred from their spelling. 4 | 5 | Big thanks to Wilson Wongso for sharing about [Predicting Phonemes with BERT](https://w11wo.github.io/posts/2022/04/predicting-phonemes-with-bert/). I used his code to implement the predictor used in this module. 6 | 7 | ## Installation 8 | 9 | ```bash 10 | pip install git+https://github.com/Wikidepia/g2p-id 11 | ``` 12 | 13 | ## Example usage 14 | 15 | ```python 16 | from g2p_id import G2P 17 | 18 | g2p = G2P() 19 | g2p("Rumah Agus terbakar.") # ˈrumah ˈaɡʊs tərˈbakar. 20 | ``` 21 | 22 | ## References 23 | 24 | - [Variasi Bunyi Vokal - Narabahasa](https://narabahasa.id/linguistik-umum/fonologi/variasi-bunyi-vokal) 25 | - [Predicting Phonemes with BERT - Wilson Wongso](https://w11wo.github.io/posts/2022/04/predicting-phonemes-with-bert/) 26 | - Moeliono, Anton M., dkk. 2017. Tata Bahasa Baku Bahasa Indonesia Edisi Keempat. Jakarta: Badan Pengembangan dan Pembinaan Bahasa. 
27 | 28 | ## TODO 29 | 30 | - [x] Add test cases 31 | - [ ] Better model for predicting "e" 32 | - [ ] Handle heteronym 33 | -------------------------------------------------------------------------------- /g2p_id/__init__.py: -------------------------------------------------------------------------------- 1 | from .g2p import G2P 2 | 3 | __version__ = "0.0.5" 4 | -------------------------------------------------------------------------------- /g2p_id/g2p.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | 5 | import numpy as np 6 | import onnxruntime 7 | from nltk.tokenize import TweetTokenizer 8 | from sacremoses import MosesDetokenizer 9 | 10 | from .syllable_splitter import SyllableSplitter 11 | 12 | ABJAD_MAPPING = { 13 | "a": "a", 14 | "b": "bé", 15 | "c": "cé", 16 | "d": "dé", 17 | "e": "é", 18 | "f": "èf", 19 | "g": "gé", 20 | "h": "ha", 21 | "i": "i", 22 | "j": "jé", 23 | "k": "ka", 24 | "l": "èl", 25 | "m": "èm", 26 | "n": "èn", 27 | "o": "o", 28 | "p": "pé", 29 | "q": "ki", 30 | "r": "èr", 31 | "s": "ès", 32 | "t": "té", 33 | "u": "u", 34 | "v": "vé", 35 | "w": "wé", 36 | "x": "èks", 37 | "y": "yé", 38 | "z": "zèt", 39 | } 40 | 41 | PHONETIC_MAPPING = { 42 | "sy": "ʃ", 43 | "ny": "ɲ", 44 | "ng": "ŋ", 45 | "dj": "dʒ", 46 | "'": "ʔ", 47 | "c": "tʃ", 48 | "é": "e", 49 | "è": "ɛ", 50 | "ê": "ə", 51 | "g": "ɡ", 52 | "I": "ɪ", 53 | "j": "dʒ", 54 | "ô": "ɔ", 55 | "q": "k", 56 | "U": "ʊ", 57 | "v": "f", 58 | "x": "ks", 59 | "y": "j", 60 | } 61 | 62 | 63 | dirname = os.path.dirname(__file__) 64 | 65 | # Predict pronounciation with BERT Masking 66 | # Read more: https://w11wo.github.io/posts/2022/04/predicting-phonemes-with-bert/ 67 | class Predictor: 68 | def __init__(self, model_path): 69 | # fmt: off 70 | self.vocab = ['', '[UNK]', 'a', 'n', 'ê', 'e', 'i', 'r', 'k', 's', 't', 'g', 'm', 'u', 'l', 'p', 'o', 'd', 'b', 'h', 'c', 'j', 'y', 'f', 'w', 'v', 'z', 'x', 'q', '[mask]'] 71 | 
self.mask_token_id = self.vocab.index("[mask]") 72 | # fmt: on 73 | self.session = onnxruntime.InferenceSession(model_path) 74 | 75 | def predict(self, word: str) -> str: 76 | """ 77 | Predict the phonetic representation of a word. 78 | 79 | Args: 80 | word (str): The word to predict. 81 | 82 | Returns: 83 | str: The predicted phonetic representation of the word. 84 | """ 85 | text = [self.vocab.index(c) if c != "e" else self.mask_token_id for c in word] 86 | text.extend([0] * (32 - len(text))) # Pad to 32 tokens 87 | inputs = np.array([text], dtype=np.int64) 88 | (predictions,) = self.session.run(None, {"input_4": inputs}) 89 | 90 | # find masked idx token 91 | _, masked_index = np.where(inputs == self.mask_token_id) 92 | 93 | # get prediction at those masked index only 94 | mask_prediction = predictions[0][masked_index] 95 | predicted_ids = np.argmax(mask_prediction, axis=1) 96 | 97 | # replace mask with predicted token 98 | for i, idx in enumerate(masked_index): 99 | text[idx] = predicted_ids[i] 100 | 101 | return "".join([self.vocab[i] for i in text if i != 0]) 102 | 103 | 104 | class G2P: 105 | def __init__(self): 106 | self.tokenizer = TweetTokenizer() 107 | self.detokenizer = MosesDetokenizer(lang="id") 108 | 109 | dict_path = os.path.join(dirname, "data/dict.json") 110 | with open(dict_path) as f: 111 | self.dict = json.load(f) 112 | 113 | model_path = os.path.join(dirname, "model/bert_pron.onnx") 114 | self.predictor = Predictor(model_path) 115 | 116 | self.syllable_splitter = SyllableSplitter() 117 | 118 | def __call__(self, text: str) -> str: 119 | """ 120 | Convert text to phonetic representation. 121 | 122 | Args: 123 | text (str): The text to convert. 124 | 125 | Returns: 126 | str: The phonetic representation of the text. 
127 | """ 128 | text = text.lower() 129 | text = re.sub(r"[^ a-z0-9'\.,?!-]", "", text) 130 | text = text.replace("-", " ") 131 | 132 | prons = [] 133 | words = self.tokenizer.tokenize(text) 134 | for word in words: 135 | # PUEBI pronunciation 136 | if word in self.dict: 137 | pron = self.dict[word] 138 | elif len(word) == 1 and word in ABJAD_MAPPING: 139 | pron = ABJAD_MAPPING[word] 140 | elif "e" not in word or not word.isalpha(): 141 | pron = word 142 | elif "e" in word: 143 | pron = self.predictor.predict(word) 144 | 145 | # Replace alofon /e/ with e (temporary) 146 | pron = pron.replace("é", "e") 147 | pron = pron.replace("è", "e") 148 | 149 | # Replace /x/ with /s/ 150 | if pron.startswith("x"): 151 | pron = "s" + pron[1:] 152 | 153 | sylls = self.syllable_splitter.split_syllables(pron) 154 | # Decide where to put the stress 155 | stress_loc = len(sylls) - 1 156 | if len(sylls) > 1 and "ê" in sylls[-2]: 157 | if "ê" in sylls[-1]: 158 | stress_loc = len(sylls) - 2 159 | else: 160 | stress_loc = len(sylls) 161 | 162 | # Apply rules on syllable basis 163 | # All alophone are set to tense by default 164 | # and will be changed to lax if needed 165 | alophone = {"e": "é", "o": "o"} 166 | alophone_map = {"i": "I", "u": "U", "e": "è", "o": "ô"} 167 | for i, syll in enumerate(sylls, start=1): 168 | # Put Syllable stress 169 | if i == stress_loc: 170 | syll = "ˈ" + syll 171 | 172 | # Alophone syllable rules 173 | for v in ["e", "o"]: 174 | # Replace with lax allphone [ɛ, ɔ] if 175 | # in closed final syllables 176 | if v in syll and not syll.endswith(v) and i == len(sylls): 177 | alophone[v] = alophone_map[v] 178 | 179 | # Alophone syllable stress rules 180 | for v in ["i", "u"]: 181 | # Replace with lax allphone [ɪ, ʊ] if 182 | # in the middle of syllable without stress 183 | # and not ends with coda nasal [m, n, ng] (except for final syllable) 184 | if ( 185 | v in syll 186 | and not syll.startswith("ˈ") 187 | and not syll.endswith(v) 188 | and ( 189 | not 
class SyllableSplitter:
    """Rule-based splitter that cuts a word into syllable-like chunks.

    The word is first tokenised into "letter units" (single characters or
    two-character consonant digraphs/clusters), each tagged as consonant
    ("c"), vowel ("v") or other symbol ("s"). Break markers are then inserted
    into that tag string by a fixed, ordered list of regex rules.

    The character "|" is used internally as the break marker, so input that
    already contains "|" will produce extra (possibly empty) chunks.
    """

    # Ordered (pattern, offset) rules. For each match a break is inserted
    # ``offset`` units after the start of the match. Every rule is applied
    # until it no longer matches before the next rule is tried, so the order
    # is significant.
    _BREAK_RULES = (
        # vowel + 2 or more consonant units: break after the first consonant
        (re.compile(r"vc{2,}"), 2),
        # adjacent vowels: break between them
        (re.compile(r"v{2,}"), 1),
        # single consonant between vowels: break before the consonant
        (re.compile(r"vcv"), 1),
        # any unit followed by a symbol: break before the symbol
        (re.compile(r"[cvs]s"), 1),
        # symbol followed by any unit: break after the symbol
        (re.compile(r"s[cvs]"), 1),
    )

    def __init__(self):
        # Consonant units: single letters plus two-letter digraphs.
        self.consonant = {
            "b",
            "c",
            "d",
            "f",
            "g",
            "h",
            "j",
            "k",
            "l",
            "m",
            "n",
            "p",
            "q",
            "r",
            "s",
            "t",
            "v",
            "w",
            "x",
            "y",
            "z",
            "ng",
            "ny",
            "sy",
            "ch",
            "dh",
            "gh",
            "kh",
            "ph",
            "sh",
            "th",
        }
        # Clusters kept as one unit unless a vowel follows them.
        self.double_consonant = {"ll", "ks", "rs", "rt", "nk", "nd"}
        self.vocal = {"a", "e", "ê", "é", "è", "i", "o", "u"}

    def split_letters(self, string):
        """Tokenise ``string`` into letter units and their type tags.

        Args:
            string (str): The word to tokenise.

        Returns:
            tuple[list[str], str]: The units, and a string of the same
            length as the list holding one tag per unit: "c" (consonant),
            "v" (vowel) or "s" (anything else).
        """
        letters = []
        arrange = []
        pos = 0
        end = len(string)

        # Scan by index instead of repeatedly re-slicing the remaining text.
        while pos < end:
            # Up to two characters; only one when at the last character.
            pair = string[pos : pos + 2]

            if pair in self.double_consonant:
                # A cluster followed by a vowel gives up its second letter
                # so that letter can start the next syllable.
                vowel_follows = pos + 2 < end and string[pos + 2] in self.vocal
                unit = pair[0] if vowel_follows else pair
                kind = "c"
            elif pair in self.consonant:
                unit, kind = pair, "c"
            elif pair in self.vocal:
                unit, kind = pair, "v"
            else:
                # No two-character unit here: classify a single character.
                unit = string[pos]
                if unit in self.consonant:
                    kind = "c"
                elif unit in self.vocal:
                    kind = "v"
                else:
                    kind = "s"

            letters.append(unit)
            arrange.append(kind)
            pos += len(unit)

        return letters, "".join(arrange)

    def split_syllables_from_letters(self, letters, arrange):
        """Insert syllable breaks into tokenised letters and split on them.

        Args:
            letters (list[str]): Units as returned by ``split_letters``.
                The list is not modified.
            arrange (str): Tag string with one "c"/"v"/"s" per unit.

        Returns:
            list[str]: The chunks between break markers, in order. Empty
            input yields ``[""]``.
        """
        for pattern, offset in self._BREAK_RULES:
            match = pattern.search(arrange)
            while match:
                cut = match.start() + offset
                # Keep both sequences aligned by inserting at the same index;
                # slicing builds new objects so the caller's list is untouched.
                letters = letters[:cut] + ["|"] + letters[cut:]
                arrange = arrange[:cut] + "|" + arrange[cut:]
                match = pattern.search(arrange)

        return "".join(letters).split("|")

    def split_syllables(self, string):
        """Split ``string`` into syllable-like chunks.

        Args:
            string (str): The word to split.

        Returns:
            list[str]: The chunks, in order; ``[""]`` for an empty string.
        """
        letters, arrange = self.split_letters(string)
        return self.split_syllables_from_letters(letters, arrange)
13 | assert g2p("teh") == "tɛh" 14 | assert g2p("pek") == "pɛʔ" 15 | assert g2p("bebek") == "ˈbɛbɛʔ" 16 | 17 | # Alofon [ə] 18 | assert g2p("tante") == "ˈtantə" 19 | assert g2p("enam") == "əˈnam" 20 | assert g2p("emas") == "əˈmas" 21 | 22 | # Alofon [o] 23 | assert g2p("toko") == "ˈtoko" 24 | assert g2p("roda") == "ˈroda" 25 | assert g2p("sekolah") == "səˈkolah" 26 | 27 | # Alofon [ɔ] 28 | assert g2p("rokok") == "ˈrɔkɔʔ" 29 | assert g2p("pojok") == "ˈpɔdʒɔʔ" 30 | assert g2p("momok") == "ˈmɔmɔʔ" 31 | assert g2p("pohon") == "ˈpɔhɔn" 32 | # assert g2p("positif") == "pɔˈsitɪf" 33 | 34 | # Alofon [i] 35 | assert g2p("gigi") == "ˈɡiɡi" 36 | assert g2p("tali") == "ˈtali" 37 | assert g2p("ini") == "ˈini" 38 | assert g2p("bila") == "ˈbila" 39 | assert g2p("simpang") == "ˈsimpaŋ" 40 | assert g2p("periksa") == "pəˈriʔsa" 41 | 42 | # Alofon [ɪ] 43 | assert g2p("banting") == "ˈbantɪŋ" 44 | assert g2p("salin") == "ˈsalɪn" 45 | assert g2p("parit") == "ˈparɪt" 46 | assert g2p("pilih") == "ˈpilɪh" 47 | assert g2p("yakin") == "ˈjakɪn" 48 | assert g2p("kirim") == "ˈkirɪm" 49 | 50 | # Alofon [u] 51 | assert g2p("upah") == "ˈupah" 52 | assert g2p("tukang") == "ˈtukaŋ" 53 | assert g2p("bantu") == "ˈbantu" 54 | 55 | assert g2p("kumbang") == "ˈkumbaŋ" 56 | assert g2p("tunggu") == "ˈtuŋɡu" 57 | assert g2p("bundel") == "ˈbundəl" 58 | 59 | # Alofon [ʊ] 60 | assert g2p("warung") == "ˈwarʊŋ" 61 | 62 | # Narabahasa syllable seems to be (wrong?) 63 | # du.sta (NB) -> dus.ta (KBBI) 64 | # pu.lsa (NB) -> pul.sa (KBBI) 65 | # assert g2p("dusta") == "ˈdʊsta" 66 | # assert g2p("pulsa") == "ˈpʊlsa" 67 | 68 | 69 | if __name__ == "__main__": 70 | test_g2p_vocal() 71 | --------------------------------------------------------------------------------