├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── g2pk ├── __init__.py ├── english.py ├── g2pk.py ├── idioms.txt ├── numerals.py ├── regular.py ├── rules.txt ├── special.py ├── table.csv └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.xml 6 | .idea 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include g2pk/rules.txt 2 | include g2pk/idioms.txt 3 | include g2pk/table.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![image](https://img.shields.io/pypi/v/g2pk.svg)](https://pypi.org/project/g2pk/) 2 | [![image](https://img.shields.io/pypi/l/g2pk.svg)](https://pypi.org/project/g2pk/) 3 | [![image](https://img.shields.io/pypi/pyversions/g2pk.svg)](https://pypi.org/project/g2pk/) 4 | 5 | # g2pK: g2p module for Korean 6 | 7 | g2p means a task that converts graphemes to phonemes. Hangul, the main script for Korean, is phonetic, but the pronunciation rules are notoriously complicated. 8 | So it is never easy to learn how to read a text in Korean. That's why g2p is necessary in various nlp tasks like TTS. 9 | . There's a open source g2p library for Korean, [KoG2P](https://github.com/scarletcho/KoG2P). It is 10 | simple and works well, but I think we need a better one. Please read through the following section (main features and usage) 11 | to understand the philosophy of g2pK and how to use g2pK. We know it is not perfect in present. 12 | That's one of the reasons your contributions are more than welcome. 13 | 14 | ## Requirements 15 | * python >= 3.6 16 | * jamo 17 | * [python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko) 18 | * konlpy 19 | * nltk 20 | 21 | ## Installation 22 | ``` 23 | pip install g2pk 24 | ``` 25 | 26 | ## Main features & Usage 27 | * Returns text as it is pronounced, keeping punctuations. 28 | ``` 29 | >>> from g2pk import G2p 30 | >>> g2p = G2p() 31 | >>> g2p("어제는 날씨가 맑았는데, 오늘은 흐리다.") 32 | 어제는 날씨가 말간는데, 오느른 흐리다. 33 | ``` 34 | * Determines pronunciation seeing context, thanks to Mecab, a morphological analyzer. 35 | In the following example, note that the first and second 신고 are pronounced differently. 36 | ``` 37 | >>> g2p("신을 신고 얼른 동사무소에 가서 혼인 신고 해라") 38 | 시늘 신꼬 얼른 동사무소에 가서 호닌 신고 해라 39 | ``` 40 | * Returns two types of results, that is, prescriptive (default) and descriptive (with the option `descriptive=True`) pronunciation. 41 | For example, josa 의 is pronounced 의 in principle, but in real life, it is often pronounced 에. 42 | Also, 계 is much more often pronounced 게. 43 | ``` 44 | >>> sent = "나의 친구는 계산이 아주 빠르다" 45 | >>> g2p(sent) 46 | 나의 친구는 계사니 아주 빠르다 47 | >>> g2p(sent, descriptive=True) 48 | 나에 친구는 게사니 아주 빠르다 49 | ``` 50 | * This distinction becomes more obvious if you set `group_vowels=True`. 51 | In contemporary colloquial speech, some vowels are hard to distinguish from each other. 52 | For example, in the example below, the vowel ㅒ is normalized to ㅖ. 53 | ``` 54 | >>> sent = "저는 예전에 그 얘기를 들은 적이 있습니다" 55 | >>> g2p(sent) 56 | 저느 녜저네 그 얘기를 드른 저기 읻씀니다 57 | >>> g2p(sent, group_vowels=True) 58 | 저느 녜저네 그 예기를 드른 저기 읻씀니다 59 | ``` 60 | * By default, it returns the standard Korean script, where letters are assembled to form a syllable. 61 | If you set `to_syl=False`, however, it returns Hangul letters or jamo. This can be useful for many applications like speech synthesis. 62 | \*Depending on the font you are using, the two results below may look the same, but actually they are not. 63 | ``` 64 | >>> sent = "어제는 날씨가 맑았는데, 오늘은 흐리다." 65 | >>> g2p(sent) 66 | 어제는 날씨가 말간는데, 오느른 흐리다. 67 | >>> g2p(sent, to_syl=False) 68 | 어제는 날씨가 말간는데, 오느른 흐리다. 69 | ``` 70 | * English words in alphabets are converted into Hangul. 71 | This is possible due to [cmu pronouncing dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict). 72 | ``` 73 | >>> sent = "그 사람은 좀, old school 같아" 74 | >>> g2p(sent) 75 | 그 사라믄 좀, 올드 스쿨 가타 76 | ``` 77 | * Arabic numbers are spelled out to their context. 78 | Note that the first 12 is pronounced 열두, whereas the second 12 is pronounced 십이. 79 | ``` 80 | >>> sent = "지금 시각은 12시 12분입니다" 81 | >>> g2p(sent) 82 | 지금 시가그 녈두시 시비부님니다 83 | ``` 84 | * It is natural that rules can NOT cover every single case. Add special idioms to `idioms.txt`. 85 | * If you set `verbose=True`, you will see the conversion processes with relevant information. 86 | ``` 87 | >>> sent = "학교에 갔다 와서, 엄마가 해 주신 밥을 먹었다." 88 | >>> g2p(sent, verbose=True) 89 | 학교에 갔다 와서, 엄마가 해 주신 밥을 먹었다. -> 학꾜에 갔다 와서, 엄마가 해 주신 밥을 먹었다. 90 | 제23항 받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다. 91 | -> 국밥[국빱], 깎다[깍따], 넑받이[넉빠지], 삯돈[삭똔] 92 | -> 닭장[닥짱], 칡범[칙뻠], 뻗대다[뻗때다], 옷고름[옫꼬름] 93 | -> 있던[읻떤], 꽂고[꼳꼬], 꽃다발[꼳따발], 낯설다[낟썰다] 94 | -> 밭갈이[받까리], 솥전[솓쩐], 곱돌[곱똘], 덮개[덥깨] 95 | -> 옆집[엽찝], 넓죽하다[넙쭈카다], 읊조리다[읍쪼리다], 값지다[갑찌다] 96 | 학꾜에 갔다 와서, 엄마가 해 주신 밥을 먹었다. -> 학꾜에 갇따 와서, 엄마가 해 주신 밥을 먹얻따. 97 | 제9항 받침 'ㄲ, ㅋ', 'ㅅ, ㅆ, ㅈ, ㅊ, ㅌ', 'ㅍ'은 어말 또는 자음 앞에서 각각 대표음 [ㄱ, ㄷ, ㅂ]으로 발음한다. 98 | -> 닦다[닥따], 키읔[키윽], 키읔과[키윽꽈], 옷[옫] 99 | -> 웃다[욷따], 있다[읻따], 젖[젇], 빚다[빋따] 100 | -> 꽃[꼳], 쫓다[쫃따], 솥[솓], 뱉다[밷따] 101 | -> 앞[압], 덮다[덥따] 102 | 제23항 받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다. 103 | -> 국밥[국빱], 깎다[깍따], 넑받이[넉빠지], 삯돈[삭똔] 104 | -> 닭장[닥짱], 칡범[칙뻠], 뻗대다[뻗때다], 옷고름[옫꼬름] 105 | -> 있던[읻떤], 꽂고[꼳꼬], 꽃다발[꼳따발], 낯설다[낟썰다] 106 | -> 밭갈이[받까리], 솥전[솓쩐], 곱돌[곱똘], 덮개[덥깨] 107 | -> 옆집[엽찝], 넓죽하다[넙쭈카다], 읊조리다[읍쪼리다], 값지다[갑찌다] 108 | 학꾜에 갇따 와서, 엄마가 해 주신 밥을 먹얻따. -> 학꾜에 갇따 와서, 엄마가 해 주신 바블 머걷따. 109 | 제13항 홑받침이나 쌍받침이 모음으로 시작된 조사나 어미, 접미사와 결합되는 경우에는, 제 음가대로 뒤 음절 첫소리로 옮겨 발음한다. 110 | -> 깎아[까까], 옷이[오시], 있어[이써], 낮이[나지] 111 | -> 꽂아[꼬자], 꽃을[꼬츨], 쫓아[쪼차], 밭에[바테] 112 | -> 앞으로[아프로], 덮이다[더피다] 113 | ``` 114 | 115 | 116 | ## References 117 | 118 | If you use our software for research, please cite: 119 | 120 | ``` 121 | @misc{park2019g2pk, 122 | author = {Park, Kyubyong}, 123 | title = {g2pK}, 124 | year = {2019}, 125 | publisher = {GitHub}, 126 | journal = {GitHub repository}, 127 | howpublished = {\url{https://github.com/Kyubyong/g2pk}} 128 | } 129 | ``` 130 | -------------------------------------------------------------------------------- /g2pk/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | r"""g2pK 3 | """ 4 | from __future__ import absolute_import 5 | 6 | from .g2pk import G2p -------------------------------------------------------------------------------- /g2pk/english.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Convert English to Hangul 4 | https://github.com/kyubyong/g2pK 5 | ''' 6 | 7 | import re 8 | 9 | from g2pk.utils import adjust, compose, to_choseong, to_jungseong, to_jongseong, reconstruct 10 | 11 | 12 | def convert_eng(string, cmu): 13 | '''Convert a string such that English words inside are turned into Hangul. 14 | string: input string. 15 | cmu: cmu dict object. 16 | 17 | >>> convert_eng("그 사람 좀 old school이야", cmu) 18 | 그 사람 좀 올드 스쿨이야 19 | ''' 20 | eng_words = set(re.findall("[A-Za-z']+", string)) 21 | for eng_word in eng_words: 22 | word = eng_word.lower() 23 | if word not in cmu: 24 | continue 25 | 26 | arpabets = cmu[word][0] # https://en.wikipedia.org/wiki/ARPABET 27 | phonemes = adjust(arpabets) 28 | ret = "" 29 | for i in range(len(phonemes)): 30 | p = phonemes[i] # phoneme 31 | p_prev = phonemes[i - 1] if i > 0 else "^" 32 | p_next = phonemes[i + 1] if i < len(phonemes) - 1 else "$" 33 | p_next2 = phonemes[i + 1] if i < len(phonemes) - 2 else "$" 34 | 35 | # desginated sets 36 | short_vowels = ("AE", "AH", "AX", "EH", "IH", "IX", "UH") 37 | vowels = "AEIOUY" 38 | consonants = "BCDFGHJKLMNPQRSTVWXZ" 39 | syllable_final_or_consonants = "$BCDFGHJKLMNPQRSTVWXZ" 40 | 41 | # 외래어 표기법 https://ko.dict.naver.com/help.nhn?page=4-1-3-1#dtl_cts 42 | # 1항. 무성 파열음 ([p], [t], [k]) 43 | # 1. 짧은 모음 다음의 어말 무성 파열음([p], [t], [k])은 받침으로 적는다. 44 | # 2. 짧은 모음과 유음·비음([l], [r], [m], [n]) 이외의 자음 사이에 오는 무성 파열음([p], [t], [k])은 받침으로 적는다. 45 | # 3. 위 경우 이외의 어말과 자음 앞의 [p], [t], [k]는 '으'를 붙여 적는다. 46 | 47 | if p in "PTK": 48 | if p_prev[:2] in short_vowels and p_next == "$": # 1 49 | ret += to_jongseong(p) 50 | elif p_prev[:2] in short_vowels and p_next[0] not in "AEIOULRMN": # 2 51 | ret += to_jongseong(p) 52 | elif p_next[0] in "$BCDFGHJKLMNPQRSTVWXYZ": # 3 53 | ret += to_choseong(p) 54 | ret += "ᅳ" 55 | else: 56 | ret += to_choseong(p) 57 | 58 | # 2항. 유성 파열음([b], [d], [g]) 59 | # 어말과 모든 자음 앞에 오는 유성 파열음은 '으'를 붙여 적는다. 60 | elif p in "BDG": 61 | ret += to_choseong(p) 62 | if p_next[0] in syllable_final_or_consonants: 63 | ret += "ᅳ" 64 | 65 | # 3항. 마찰음([s], [z], [f], [v], [θ], [ð], [ʃ], [ʒ]) 66 | # 1. 어말 또는 자음 앞의 [s], [z], [f], [v], [θ], [ð]는 '으'를 붙여 적는다. 67 | # 2. 어말의 [ʃ]는 '시'로 적고, 자음 앞의 [ʃ]는 '슈'로, 모음 앞의 [ʃ]는 뒤따르는 모음에 따라 '샤', '섀', '셔', '셰', '쇼', '슈', '시'로 적는다. 68 | # 3. 어말 또는 자음 앞의 [ʒ]는 '지'로 적고, 모음 앞의 [ʒ]는 'ㅈ'으로 적는다. 69 | elif p in ("S", "Z", "F", "V", "TH", "DH", "SH", "ZH"): 70 | ret += to_choseong(p) 71 | 72 | if p in ("S", "Z", "F", "V", "TH", "DH"): # 1 73 | if p_next[0] in syllable_final_or_consonants: 74 | ret += "ᅳ" 75 | elif p == "SH": # 2 76 | if p_next[0] in "$": 77 | ret += "ᅵ" 78 | elif p_next[0] in consonants: 79 | ret += "ᅲ" 80 | else: 81 | ret += "Y" 82 | elif p == "ZH": # 3 83 | if p_next[0] in syllable_final_or_consonants: 84 | ret += "ᅵ" 85 | 86 | # 4항. 파찰음([ʦ], [ʣ], [ʧ], [ʤ]) 87 | # 1. 어말 또는 자음 앞의 [ʦ], [ʣ]는 '츠', '즈'로 적고, [ʧ], [ʤ]는 '치', '지'로 적는다. 88 | # 2. 모음 앞의 [ʧ], [ʤ]는 'ㅊ', 'ㅈ'으로 적는다. 89 | elif p in ("TS", "DZ", "CH", "JH",): 90 | ret += to_choseong(p) # 2 91 | 92 | if p_next[0] in syllable_final_or_consonants: # 1 93 | if p in ("TS", "DZ"): 94 | ret += "ᅳ" 95 | else: 96 | ret += "ᅵ" 97 | 98 | # 5항. 비음([m], [n], [ŋ]) 99 | # 1. 어말 또는 자음 앞의 비음은 모두 받침으로 적는다. 100 | # 2. 모음과 모음 사이의 [ŋ]은 앞 음절의 받침 'ㆁ'으로 적는다. 101 | elif p in ("M", "N", "NG"): 102 | if p in "MN" and p_next[0] in vowels: 103 | ret += to_choseong(p) 104 | else: 105 | ret += to_jongseong(p) 106 | 107 | # 6항. 유음([l]) 108 | # 1. 어말 또는 자음 앞의 [l]은 받침으로 적는다. 109 | # 2. 어중의 [l]이 모음 앞에 오거나, 모음이 따르지 않는 비음([m], [n]) 앞에 올 때에는 'ㄹㄹ'로 적는다. 110 | # 3. 다만, 비음([m], [n]) 뒤의 [l]은 모음 앞에 오더라도 'ㄹ'로 적는다. 111 | elif p == "L": 112 | if p_prev == "^": # initial 113 | ret += to_choseong(p) 114 | elif p_next[0] in "$BCDFGHJKLPQRSTVWXZ": # 1 115 | ret += to_jongseong(p) 116 | elif p_prev in "MN": # 3 117 | ret += to_choseong(p) 118 | elif p_next[0] in vowels: # 2 119 | ret += "ᆯᄅ" 120 | elif p_next in "MN" and p_next2[0] not in vowels: # 2 121 | ret += "ᆯ르" 122 | 123 | # custom 124 | elif p == "ER": 125 | if p_prev[0] in vowels: 126 | ret += "ᄋ" 127 | ret += to_jungseong(p) 128 | if p_next[0] in vowels: 129 | ret += "ᄅ" 130 | elif p == "R": 131 | if p_next[0] in vowels: 132 | ret += to_choseong(p) 133 | 134 | # 8항. 중모음1) ([ai], [au], [ei], [ɔi], [ou], [auə]) 135 | # 중모음은 각 단모음의 음가를 살려서 적되, [ou]는 '오'로, [auə]는 '아워'로 적는다. 136 | elif p[0] in "AEIOU": 137 | ret += to_jungseong(p) 138 | 139 | else: 140 | ret += to_choseong(p) 141 | 142 | ret = reconstruct(ret) 143 | ret = compose(ret) 144 | ret = re.sub("[\u1100-\u11FF]", "", ret) # remove hangul jamo 145 | string = string.replace(eng_word, ret) 146 | return string 147 | 148 | if __name__ == "__main__": 149 | from nltk.corpus import cmudict 150 | cmu = cmudict.dict() 151 | print(convert_eng("오늘 학교에서 밥을 먹고 집에 와서 game을 했다", cmu)) -------------------------------------------------------------------------------- /g2pk/g2pk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | https://github.com/kyubyong/g2pK 4 | ''' 5 | 6 | import os, re 7 | 8 | import nltk 9 | import mecab 10 | from jamo import h2j 11 | from nltk.corpus import cmudict 12 | 13 | # For further info. about cmu dict, consult http://www.speech.cs.cmu.edu/cgi-bin/cmudict. 14 | try: 15 | nltk.data.find('corpora/cmudict.zip') 16 | except LookupError: 17 | nltk.download('cmudict') 18 | 19 | from g2pk.special import jyeo, ye, consonant_ui, josa_ui, vowel_ui, jamo, rieulgiyeok, rieulbieub, verb_nieun, balb, palatalize, modifying_rieul 20 | from g2pk.regular import link1, link2, link3, link4 21 | from g2pk.utils import annotate, compose, group, gloss, parse_table, get_rule_id2text 22 | from g2pk.english import convert_eng 23 | from g2pk.numerals import convert_num 24 | 25 | 26 | class G2p(object): 27 | def __init__(self): 28 | self.mecab = self.get_mecab() 29 | self.table = parse_table() 30 | 31 | self.cmu = cmudict.dict() # for English 32 | 33 | self.rule2text = get_rule_id2text() # for comments of main rules 34 | self.idioms_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "idioms.txt") 35 | 36 | def get_mecab(self): 37 | try: 38 | return mecab.MeCab() 39 | except Exception as e: 40 | raise Exception( 41 | 'If you want to install mecab, The command is... pip install python-mecab-ko' 42 | ) 43 | 44 | def idioms(self, string, descriptive=False, verbose=False): 45 | '''Process each line in `idioms.txt` 46 | Each line is delimited by "===", 47 | and the left string is replaced by the right one. 48 | inp: input string. 49 | descriptive: not used. 50 | verbose: boolean. 51 | 52 | >>> idioms("지금 mp3 파일을 다운받고 있어요") 53 | 지금 엠피쓰리 파일을 다운받고 있어요 54 | ''' 55 | rule = "from idioms.txt" 56 | out = string 57 | 58 | for line in open(self.idioms_path, 'r', encoding="utf8"): 59 | line = line.split("#")[0].strip() 60 | if "===" in line: 61 | str1, str2 = line.split("===") 62 | out = re.sub(str1, str2, out) 63 | gloss(verbose, out, string, rule) 64 | 65 | return out 66 | 67 | def __call__(self, string, descriptive=False, verbose=False, group_vowels=False, to_syl=True): 68 | '''Main function 69 | string: input string 70 | descriptive: boolean. 71 | verbose: boolean 72 | group_vowels: boolean. If True, the vowels of the identical sound are normalized. 73 | to_syl: boolean. If True, hangul letters or jamo are assembled to form syllables. 74 | 75 | For example, given an input string "나의 친구가 mp3 file 3개를 다운받고 있다", 76 | STEP 1. idioms 77 | -> 나의 친구가 엠피쓰리 file 3개를 다운받고 있다 78 | 79 | STEP 2. English to Hangul 80 | -> 나의 친구가 엠피쓰리 파일 3개를 다운받고 있다 81 | 82 | STEP 3. annotate 83 | -> 나의/J 친구가 엠피쓰리 파일 3개/B를 다운받고 있다 84 | 85 | STEP 4. Spell out arabic numbers 86 | -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다 87 | 88 | STEP 5. decompose 89 | -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다 90 | 91 | STEP 6-9. Hangul 92 | -> 나의 친구가 엠피쓰리 파일 세개를 다운받꼬 읻따 93 | ''' 94 | # 1. idioms 95 | string = self.idioms(string, descriptive, verbose) 96 | 97 | # 2 English to Hangul 98 | string = convert_eng(string, self.cmu) 99 | 100 | # 3. annotate 101 | string = annotate(string, self.mecab) 102 | 103 | # 4. Spell out arabic numbers 104 | string = convert_num(string) 105 | 106 | # 5. decompose 107 | inp = h2j(string) 108 | 109 | # 6. special 110 | for func in (jyeo, ye, consonant_ui, josa_ui, vowel_ui, \ 111 | jamo, rieulgiyeok, rieulbieub, verb_nieun, \ 112 | balb, palatalize, modifying_rieul): 113 | inp = func(inp, descriptive, verbose) 114 | inp = re.sub("/[PJEB]", "", inp) 115 | 116 | # 7. regular table: batchim + onset 117 | for str1, str2, rule_ids in self.table: 118 | _inp = inp 119 | inp = re.sub(str1, str2, inp) 120 | 121 | if len(rule_ids)>0: 122 | rule = "\n".join(self.rule2text.get(rule_id, "") for rule_id in rule_ids) 123 | else: 124 | rule = "" 125 | gloss(verbose, inp, _inp, rule) 126 | 127 | # 8 link 128 | for func in (link1, link2, link3, link4): 129 | inp = func(inp, descriptive, verbose) 130 | 131 | # 9. postprocessing 132 | if group_vowels: 133 | inp = group(inp) 134 | 135 | if to_syl: 136 | inp = compose(inp) 137 | return inp 138 | 139 | if __name__ == "__main__": 140 | g2p = G2p() 141 | g2p("나의 친구가 mp3 file 3개를 다운받고 있다") 142 | -------------------------------------------------------------------------------- /g2pk/idioms.txt: -------------------------------------------------------------------------------- 1 | # Each line should be considered prior to others. 2 | # Comments are preceded by #. 3 | # Each line should look like [str1]===[str2], 4 | # where str2 replaces str1. 5 | # Note that these will be processed through regular expression. 6 | 의견란===의견난 7 | 임진란===임진난 8 | 생산량===생산냥 9 | 결단력===결딴녁 10 | 공권력===공꿘녁 11 | 동원령===동원녕 12 | 상견례===상견녜 13 | 횡단로===횡단노 14 | 이원론===이원논 15 | 입원료===이붠뇨 16 | 구근류===구근뉴 17 | 18 | 갈등===갈뜽 19 | 발동===발똥 20 | 절도===절또 21 | 말살===말쌀 22 | 불소===불쏘 23 | 일시===일씨 24 | 갈증===갈쯩 25 | 물질===물찔 26 | 발전===발쩐 27 | 몰상식===몰쌍식 28 | 불세출===불쎄출 29 | 30 | 문고리===문꼬리 31 | 눈동자===눈똥자 32 | 신바람===신빠람 33 | 산새===산쌔 34 | 손재주===손째주 35 | 길가===길까 36 | 물동이===물똥이 37 | 발바닥===발빠닥 38 | 굴속===굴쏙 39 | 술잔===술짠 40 | 바람결===바람껼 41 | 그믐달===그믐딸 42 | 아침밥===아침빱 43 | 잠자리===잠짜리 44 | 강가===강까 45 | 초승달===초승딸 46 | 등불===등뿔 47 | 창살===창쌀 48 | 강줄기===강쭐기 49 | 50 | 솜이불===솜니불 51 | 홑이불===혼니불 52 | 막일===망닐 53 | 삯일===상닐 54 | 맨입===맨닙 55 | 꽃잎===꼰닙 56 | 내복약===내봉냑 57 | 색연필===생년필 58 | 직행열차===지캥녈차 59 | 늑막염===능망념 60 | 콩엿===콩녇 61 | 담요===담뇨 62 | 눈요기===눈뇨기 63 | 64 | 영업용===영엄뇽 65 | 식용유===시굥뉴 66 | 국민윤리===궁민뉼리 67 | 밤윳===밤뉻 68 | 이죽이죽===이중니죽 69 | 야금야금===야금냐금 70 | 검열===검녈 71 | 욜랑욜랑===욜랑뇰랑 72 | 금융===금늉 73 | 들일===들릴 74 | 솔잎===솔립 75 | 설익다===설릭따 76 | 물약===물략 77 | 불여우===불려우 78 | 서울역===서울력 79 | 물엿===물렫 80 | 휘발유===휘발류 81 | 유들유들===유들류들 82 | 한일===한닐 83 | 옷입다===온닙따 84 | 서른여섯===서른녀섣 85 | 3연대===삼년대 86 | 먹은엿===머근녇 87 | 할일===할릴 88 | 잘입다===잘립따 89 | 스물여섯===스물려섣 90 | 1연대===일련대 91 | 먹을엿===머글렫 92 | 6·25===유기오 93 | 3·1절===사밀쩔 94 | 송별연===송벼련 95 | 등용문===등용문 96 | 97 | 냇가===내까 98 | 샛길===새낄 99 | 빨랫돌===빨래똘 100 | 콧등===코뜽 101 | 깃발===기빨 102 | 대팻밥===대패빱 103 | 햇살===해쌀 104 | 뱃속===배쏙 105 | 뱃전===배쩐 106 | 고갯짓===고개찓 107 | 108 | 콧날===콘날 109 | 아랫니===아랜니 110 | 툇마루===퇸마루 111 | 뱃머리===밴머리 112 | 113 | 베갯잇===베갠닏 114 | 깻잎===깬닙 115 | 나뭇잎===나문닙 116 | 도리깻열===도리깬녈 117 | 뒷윷===뒨뉻 118 | 119 | 할걸===할껄 120 | 할밖에===할빠께 121 | 할세라===할쎄라 122 | 할수록===할쑤록 123 | 할지라도===할찌라도 124 | 할지언정===할찌언정 125 | 할진대===할찐대 126 | 127 | ml===밀리리터 128 | mp3===엠피쓰리 129 | %===퍼센트 130 | jpeg===제이펙 131 | mp4===엠피포 132 | 133 | 1번째===첫번째 134 | 10월===시월 135 | 136 | 137 | -------------------------------------------------------------------------------- /g2pk/numerals.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | https://github.com/kyubyong/g2pK 4 | ''' 5 | 6 | import re 7 | 8 | # This is a list of bound nouns preceded by pure Korean numerals. 9 | BOUND_NOUNS = "군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통" 10 | 11 | 12 | def process_num(num, sino=True): 13 | '''Process a string looking like arabic number. 14 | num: string. Consists of [0-9,]. e.g., 12,345 15 | sino: boolean. If True, sino-Korean numerals, i.e., 일, 이, .. are considered. 16 | Otherwise, pure Korean ones in their modifying forms such as 한, 두, ... are returned. 17 | 18 | >>> process_num("123,456,789", sino=True) 19 | 일억이천삼백사십오만육천칠백팔십구 20 | 21 | >>> process_num("123,456,789", sino=False) 22 | 일억이천삼백사십오만육천칠백여든아홉 23 | ''' 24 | num = re.sub(",", "", num) 25 | 26 | if num == "0": 27 | return "영" 28 | if not sino and num == "20": 29 | return "스무" 30 | 31 | digits = "123456789" 32 | names = "일이삼사오육칠팔구" 33 | digit2name = {d: n for d, n in zip(digits, names)} 34 | 35 | modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉" 36 | decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔" 37 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 38 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 39 | 40 | spelledout = [] 41 | for i, digit in enumerate(num): 42 | i = len(num) - i - 1 43 | if sino: 44 | if i == 0: 45 | name = digit2name.get(digit, "") 46 | elif i == 1: 47 | name = digit2name.get(digit, "") + "십" 48 | name = name.replace("일십", "십") 49 | else: 50 | if i == 0: 51 | name = digit2mod.get(digit, "") 52 | elif i == 1: 53 | name = digit2dec.get(digit, "") 54 | if digit == '0': 55 | if i % 4 == 0: 56 | last_three = spelledout[-min(3, len(spelledout)):] 57 | if "".join(last_three) == "": 58 | spelledout.append("") 59 | continue 60 | else: 61 | spelledout.append("") 62 | continue 63 | if i == 2: 64 | name = digit2name.get(digit, "") + "백" 65 | name = name.replace("일백", "백") 66 | elif i == 3: 67 | name = digit2name.get(digit, "") + "천" 68 | name = name.replace("일천", "천") 69 | elif i == 4: 70 | name = digit2name.get(digit, "") + "만" 71 | name = name.replace("일만", "만") 72 | elif i == 5: 73 | name = digit2name.get(digit, "") + "십" 74 | name = name.replace("일십", "십") 75 | elif i == 6: 76 | name = digit2name.get(digit, "") + "백" 77 | name = name.replace("일백", "백") 78 | elif i == 7: 79 | name = digit2name.get(digit, "") + "천" 80 | name = name.replace("일천", "천") 81 | elif i == 8: 82 | name = digit2name.get(digit, "") + "억" 83 | elif i == 9: 84 | name = digit2name.get(digit, "") + "십" 85 | elif i == 10: 86 | name = digit2name.get(digit, "") + "백" 87 | elif i == 11: 88 | name = digit2name.get(digit, "") + "천" 89 | elif i == 12: 90 | name = digit2name.get(digit, "") + "조" 91 | elif i == 13: 92 | name = digit2name.get(digit, "") + "십" 93 | elif i == 14: 94 | name = digit2name.get(digit, "") + "백" 95 | elif i == 15: 96 | name = digit2name.get(digit, "") + "천" 97 | spelledout.append(name) 98 | 99 | return "".join(elem for elem in spelledout) 100 | 101 | 102 | def convert_num(string): 103 | '''Convert a annotated string such that arabic numerals inside are spelled out. 104 | >>> convert_num("우리 3시/B 10분/B에 만나자.") 105 | 우리 세시/B 십분/B에 만나자. 106 | ''' 107 | global BOUND_NOUNS 108 | 109 | # Bound Nouns 110 | tokens = set(re.findall("([\d][\d,]*)([ㄱ-힣]+)/B", string)) 111 | for token in tokens: 112 | num, bn = token 113 | if bn in BOUND_NOUNS: 114 | spelledout = process_num(num, sino=False) 115 | else: 116 | spelledout = process_num(num, sino=True) 117 | string = string.replace(f"{num}{bn}/B", f"{spelledout}{bn}/B") 118 | 119 | # digit by digit for remaining digits 120 | digits = "0123456789" 121 | names = "영일이삼사오육칠팔구" 122 | for d, n in zip(digits, names): 123 | string = string.replace(d, n) 124 | 125 | return string 126 | 127 | 128 | if __name__ == "__main__": 129 | # test 130 | print(process_num("123,456,789", sino=True)) 131 | print(process_num("123,456,789", sino=False)) 132 | print(convert_num("우리 3시/B 10분/B에 만나자.")) 133 | -------------------------------------------------------------------------------- /g2pk/regular.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | https://github.com/kyubyong/g2pK 4 | ''' 5 | 6 | from g2pk.utils import gloss, get_rule_id2text 7 | rule_id2text = get_rule_id2text() 8 | 9 | 10 | def link1(inp, descriptive=False, verbose=False): 11 | rule = rule_id2text["13"] 12 | out = inp 13 | 14 | pairs = [ ("ᆨᄋ", "ᄀ"), 15 | ("ᆩᄋ", "ᄁ"), 16 | ("ᆫᄋ", "ᄂ"), 17 | ("ᆮᄋ", "ᄃ"), 18 | ("ᆯᄋ", "ᄅ"), 19 | ("ᆷᄋ", "ᄆ"), 20 | ("ᆸᄋ", "ᄇ"), 21 | ("ᆺᄋ", "ᄉ"), 22 | ("ᆻᄋ", "ᄊ"), 23 | ("ᆽᄋ", "ᄌ"), 24 | ("ᆾᄋ", "ᄎ"), 25 | ("ᆿᄋ", "ᄏ"), 26 | ("ᇀᄋ", "ᄐ"), 27 | ("ᇁᄋ", "ᄑ")] 28 | for str1, str2 in pairs: 29 | out = out.replace(str1, str2) 30 | 31 | gloss(verbose, out, inp, rule) 32 | return out 33 | 34 | 35 | def link2(inp, descriptive=False, verbose=False): 36 | rule = rule_id2text["14"] 37 | out = inp 38 | 39 | pairs = [ ("ᆪᄋ", "ᆨᄊ"), 40 | ("ᆬᄋ", "ᆫᄌ"), 41 | ("ᆰᄋ", "ᆯᄀ"), 42 | ("ᆱᄋ", "ᆯᄆ"), 43 | ("ᆲᄋ", "ᆯᄇ"), 44 | ("ᆳᄋ", "ᆯᄊ"), 45 | ("ᆴᄋ", "ᆯᄐ"), 46 | ("ᆵᄋ", "ᆯᄑ"), 47 | ("ᆹᄋ", "ᆸᄊ") ] 48 | for str1, str2 in pairs: 49 | out = out.replace(str1, str2) 50 | 51 | gloss(verbose, out, inp, rule) 52 | return out 53 | 54 | 55 | def link3(inp, descriptive=False, verbose=False): 56 | rule = rule_id2text["15"] 57 | out = inp 58 | 59 | pairs = [ ("ᆨ ᄋ", " ᄀ"), 60 | ("ᆩ ᄋ", " ᄁ"), 61 | ("ᆫ ᄋ", " ᄂ"), 62 | ("ᆮ ᄋ", " ᄃ"), 63 | ("ᆯ ᄋ", " ᄅ"), 64 | ("ᆷ ᄋ", " ᄆ"), 65 | ("ᆸ ᄋ", " ᄇ"), 66 | ("ᆺ ᄋ", " ᄉ"), 67 | ("ᆻ ᄋ", " ᄊ"), 68 | ("ᆽ ᄋ", " ᄌ"), 69 | ("ᆾ ᄋ", " ᄎ"), 70 | ("ᆿ ᄋ", " ᄏ"), 71 | ("ᇀ ᄋ", " ᄐ"), 72 | ("ᇁ ᄋ", " ᄑ"), 73 | 74 | ("ᆪ ᄋ", "ᆨ ᄊ"), 75 | ("ᆬ ᄋ", "ᆫ ᄌ"), 76 | ("ᆰ ᄋ", "ᆯ ᄀ"), 77 | ("ᆱ ᄋ", "ᆯ ᄆ"), 78 | ("ᆲ ᄋ", "ᆯ ᄇ"), 79 | ("ᆳ ᄋ", "ᆯ ᄊ"), 80 | ("ᆴ ᄋ", "ᆯ ᄐ"), 81 | ("ᆵ ᄋ", "ᆯ ᄑ"), 82 | ("ᆹ ᄋ", "ᆸ ᄊ") ] 83 | 84 | for str1, str2 in pairs: 85 | out = out.replace(str1, str2) 86 | 87 | gloss(verbose, out, inp, rule) 88 | return out 89 | 90 | 91 | def link4(inp, descriptive=False, verbose=False): 92 | rule = rule_id2text["12.4"] 93 | 94 | out = inp 95 | 96 | pairs = [ ("ᇂᄋ", "ᄋ"), 97 | ("ᆭᄋ", "ᄂ"), 98 | ("ᆶᄋ", "ᄅ") ] 99 | 100 | for str1, str2 in pairs: 101 | out = out.replace(str1, str2) 102 | 103 | gloss(verbose, out, inp, rule) 104 | return out 105 | 106 | -------------------------------------------------------------------------------- /g2pk/rules.txt: -------------------------------------------------------------------------------- 1 | 5.1 2 | 5항. 다만 1. 용언의 활용형에 나타나는 '져, 쪄, 쳐'는 [저, 쩌, 처]로 발음한다. 3 | -> 가져[가저], 쪄[쩌], 다쳐[다처] 4 | 5 | 5.2 6 | 5항. 다만 2. '예, 례' 이외의 'ㅖ'는 [ㅔ]로도 발음한다. 7 | -> 계집[계집/게집], 계시다[계시다/게시다] 8 | -> 시계[시계/시게](時計), 연계[연계/연게](連繫) 9 | -> 몌별[몌별/메별](袂別), 개폐[개폐/개페](開閉) 10 | -> 혜택[혜택/헤택](惠澤), 지혜[지혜/지헤](智慧) 11 | # 실제로 언중은 예, 녜, 셰, 쎼 이외의 'ㅖ'는 [ㅔ]로 발음한다. by kyubyong 12 | 13 | 5.3 14 | 5항. 다만 3. 자음을 첫소리로 가지고 있는 음절의 'ㅢ'는 [ㅣ]로 발음한다. 15 | -> 늴리리[닐리리], 닁큼[닝큼], 무늬[무니], 띄어쓰기[띠어쓰기], 씌어[씨어] 16 | -> 틔어[티어], 희어[히어], 희떱다[히떱따], 희망[히망], 유희[유히] 17 | 18 | 5.4.1 19 | 다만 4. 단어의 첫음절 이외의 '의'는 [ㅣ]로 발음함도 허용한다. 20 | -> 주의[주의/주이], 협의[혀븨/혀비] 21 | # 실제로 언중은 높은 확률로 단어의 첫음절 이외의 '의'는 [ㅣ]로 발음한다. 22 | 23 | 5.4.2 24 | 다만 4. 조사 '의'는 [ㅔ]로 발음함도 허용한다. 25 | -> 우리의[우리의/우리에], 강의의[강의의/강이에] 26 | # 실제로 언중은 높은 확률로 조사 '의'는 [ㅔ]로 발음한다. 27 | 28 | 9 29 | 제9항 받침 'ㄲ, ㅋ', 'ㅅ, ㅆ, ㅈ, ㅊ, ㅌ', 'ㅍ'은 어말 또는 자음 앞에서 각각 대표음 [ㄱ, ㄷ, ㅂ]으로 발음한다. 30 | -> 닦다[닥따], 키읔[키윽], 키읔과[키윽꽈], 옷[옫] 31 | -> 웃다[욷따], 있다[읻따], 젖[젇], 빚다[빋따] 32 | -> 꽃[꼳], 쫓다[쫃따], 솥[솓], 뱉다[밷따] 33 | -> 앞[압], 덮다[덥따] 34 | 35 | 10 36 | 제10항 겹받침 'ㄳ', 'ㄵ', 'ㄼ, ㄽ, ㄾ', 'ㅄ'은 어말 또는 자음 앞에서 각각 [ㄱ, ㄴ, ㄹ, ㅂ]으로 발음한다. 37 | -> 넋[넉], 넋과[넉꽈], 앉다[안따], 여덟[여덜] 38 | -> 넓다[널따], 외곬[외골], 핥다[할따], 값[갑] 39 | -> 없다[업:따] 40 | 41 | 10.1 42 | 다만, '밟-'은 자음 앞에서 [밥]으로 발음하고, '넓-'은 다음과 같은 경우에 [넙]으로 발음한다. 43 | -> 1) 밟다[밥따], 밟소[밥쏘], 밟지[밥찌], 밟는[밤:는], 밟게[밥께], 밟고[밥꼬] 44 | -> 2) 넓죽하다[넙쭈카다], 넓둥글다[넙뚱글다] 45 | 46 | 11 47 | 제11항 겹받침 'ㄺ, ㄻ, ㄿ'은 어말 또는 자음 앞에서 각각 [ㄱ, ㅁ, ㅂ]으로 발음한다. 48 | -> 닭[닥], 흙과[흑꽈], 맑다[막따], 늙지[늑찌] 49 | -> 삶[삼], 젊다[점따], 읊고[읍꼬], 읊다[읍따] 50 | 51 | 11.1 52 | 다만, 용언의 어간 말음 'ㄺ'은 'ㄱ' 앞에서 [ㄹ]로 발음한다. 53 | -> 맑게[말께], 묽고[물꼬], 읽거나[일꺼나] 54 | 55 | 12 56 | 제12항 받침 'ㅎ'의 발음은 다음과 같다. 57 | 1. 'ㅎ(ㄶ, ㅀ)' 뒤에 'ㄱ, ㄷ, ㅈ'이 결합되는 경우에는, 뒤 음절 첫소리와 합쳐서 [ㅋ, ㅌ, ㅊ]으로 발음한다. 58 | -> 놓고[노코], 좋던[조턴], 쌓지[싸치], 많고[만코] 59 | -> 않던[안턴], 닳지[달치] 60 | [붙임 1] 받침 'ㄱ(ㄺ), ㄷ, ㅂ(ㄼ), ㅈ(ㄵ)'이 뒤 음절 첫소리 'ㅎ'과 결합되는 경우에도, 역시 두 소리를 합쳐서 [ㅋ, ㅌ, ㅍ, ㅊ]으로 발음한다. 61 | -> 각하[가카], 먹히다[머키다], 밟히다[발피다], 맏형[마텽] 62 | -> 좁히다[조피다], 넓히다[널피다], 꽂히다[꼬치다], 앉히다[안치다] 63 | [붙임 2] 규정에 따라 'ㄷ'으로 발음되는 'ㅅ, ㅈ, ㅊ, ㅌ'의 경우에는 이에 준한다. 64 | -> 옷 한 벌[오 탄 벌], 낮 한때[나 탄때], 꽃 한 송이[꼬 탄 송이] 65 | -> 숱하다[수타다] 66 | 2. 'ㅎ(ㄶ, ㅀ)' 뒤에 'ㅅ'이 결합되는 경우에는, 'ㅅ'을 [ㅆ]으로 발음한다. 67 | -> 닿소[다쏘], 많소[만쏘], 싫소[실쏘] 68 | 3. 'ㅎ' 뒤에 'ㄴ'이 결합되는 경우에는, [ㄴ]으로 발음한다. 69 | -> 놓는[논는], 쌓네[싼네] 70 | [붙임] 'ㄶ, ㅀ' 뒤에 'ㄴ'이 결합되는 경우에는, 'ㅎ'을 발음하지 않는다. 71 | -> 않네[안네], 않는[안는], 뚫네[뚤레], 뚫는[뚤른] 72 | * '뚫네[뚤네→뚤레], 뚫는[뚤는→뚤른]'에 대해서는 제20항 참조. 73 | 74 | 12.4 75 | 4. 'ㅎ(ㄶ, ㅀ)' 뒤에 모음으로 시작된 어미나 접미사가 결합되는 경우에는, 'ㅎ'을 발음하지 않는다. 76 | -> 낳은[나은], 놓아[노아], 쌓이다[싸이다], 많아[마나] 77 | -> 않은[아는], 닳아[다라], 싫어도[시러도] 78 | 79 | 13 80 | 제13항 홑받침이나 쌍받침이 모음으로 시작된 조사나 어미, 접미사와 결합되는 경우에는, 제 음가대로 뒤 음절 첫소리로 옮겨 발음한다. 81 | -> 깎아[까까], 옷이[오시], 있어[이써], 낮이[나지] 82 | -> 꽂아[꼬자], 꽃을[꼬츨], 쫓아[쪼차], 밭에[바테] 83 | -> 앞으로[아프로], 덮이다[더피다] 84 | 85 | 14 86 | 제14항 겹받침이 모음으로 시작된 조사나 어미, 접미사와 결합되는 경우에는, 뒤엣것만을 뒤 음절 첫소리로 옮겨 발음한다. (이 경우, 'ㅅ'은 된소리로 발음함.) 87 | -> 넋이[넉씨], 앉아[안자], 닭을[달글], 젊어[절머] 88 | -> 곬이[골씨], 핥아[할타], 읊어[을퍼], 값을[갑쓸] 89 | -> 없어[업써] 90 | 91 | 15 92 | 제15항 받침 뒤에 모음 'ㅏ, ㅓ, ㅗ, ㅜ, ㅟ'들로 시작되는 실질 형태소가 연결되는 경우에는, 대표음으로 바꾸어서 뒤 음절 첫소리로 옮겨 발음한다. 93 | -> 밭 아래[바 다래] 늪 앞[느 밥] 젖어미[저더미] 맛없다[마덥다] 94 | -> 겉옷[거돋] 헛웃음[허두슴] 꽃 위[꼬 뒤] 95 | 다만, '맛있다, 멋있다'는 [마싣따], [머싣따]로도 발음할 수 있다. 96 | [붙임] 겹받침의 경우에는 그 중 하나만을 옮겨 발음한다. 97 | -> 넋 없다[너 겁따] 닭 앞에[다 가페] 값어치[가 버치] 값있는[가빈는] 98 | 99 | 16 100 | 제16항 한글 자모의 이름은 그 받침소리를 연음하되, 'ㄷ, ㅈ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ'의 경우에는 특별히 다음과 같이 발음한다. 101 | -> 디귿이[디그시], 디귿을[디그슬], 디귿에[디그세] 102 | -> 지읒이[지으시], 지읒을[지으슬], 지읒에[지으세] 103 | -> 치읓이[치으시], 치읓을[치으슬], 치읓에[치으세] 104 | -> 키읔이[키으기], 키읔을[키으글], 키읔에[키으게] 105 | -> 티읕이[티으시], 티읕을[티으슬], 티읕에[티으세] 106 | -> 피읖이[피으비], 피읖을[피으블], 피읖에[피으베] 107 | -> 히읗이[히으시], 히읗을[히으슬], 히읗에[히으세] 108 | 109 | 17 110 | 제17항 받침 'ㄷ, ㅌ(ㄾ)'이 조사나 접미사의 모음 'ㅣ'와 결합되는 경우에는, [ㅈ, ㅊ]으로 바꾸어서 뒤 음절 첫소리로 옮겨 발음한다. 111 | -> 곧이듣다[고지듣따], 굳이[구지], 미닫이[미다지] 112 | -> 땀받이[땀바지], 밭이[바치], 벼훑이[벼훌치] 113 | [붙임] 'ㄷ' 뒤에 접미사 '히'가 결합되어 '티'를 이루는 것은 [치]로 발음한다. 114 | -> 굳히다[구치다], 닫히다[다치다], 묻히다[무치다] 115 | 116 | 18 117 | 제18항 받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ, ㅎ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)'은 'ㄴ, ㅁ' 앞에서 [ㅇ, ㄴ, ㅁ]으로 발음한다. 118 | -> 먹는[멍는], 국물[궁물], 깎는[깡는], 키읔만[키응만] 119 | -> 몫몫이[몽목씨], 긁는[긍는], 흙만[흥만], 닫는[단는] 120 | -> 짓는[진:는], 옷맵시[온맵시], 있는[인는], 맞는[만는] 121 | -> 젖멍울[전멍울], 쫓는[쫀는], 꽃망울[꼰망울], 붙는[분는] 122 | -> 놓는[논는], 잡는[잠는], 밥물[밤물], 앞마당[암마당] 123 | -> 밟는[밤:는], 읊는[음는], 없는[엄:는], 값매다[감매다] 124 | [붙임] 두 단어를 이어서 한 마디로 발음하는 경우에도 이와 같다. 125 | -> 책 넣는다[챙 넌는다], 흙 말리다[흥 말리다], 옷 맞추다[온 마추다] 126 | -> 밥 먹는다[밤 멍는다], 값 매기다[감 매기다] 127 | 128 | 19 129 | 제19항 받침 'ㅁ, ㅇ' 뒤에 연결되는 'ㄹ'은 [ㄴ]으로 발음한다. 130 | -> 담력[담:녁], 침략[침냑], 강릉[강능], 항로[항:노], 대통령[대:통녕] 131 | [붙임] 받침 'ㄱ, ㅂ' 뒤에 연결되는 'ㄹ'도 [ㄴ]으로 발음한다. 132 | -> 막론[막논→망논], 백리[백니→뱅니], 협력[협녁→혐녁], 십리[십니→심니] 133 | 134 | 20 135 | 제20항 'ㄴ'은 'ㄹ'의 앞이나 뒤에서 [ㄹ]로 발음한다. 136 | -> 1) 난로[날:로], 신라[실라], 천리[철리], 광한루[광:할루], 대관령[대:괄령] 137 | -> 2) 칼날[칼랄], 물난리[물랄리], 줄넘기[줄럼끼], 할는지[할른지] 138 | [붙임] 첫소리 'ㄴ'이 'ㅀ, ㄾ' 뒤에 연결되는 경우에도 이에 준한다. 139 | -> 닳는[달른], 뚫는[뚤른], 핥네[할레] 140 | 다만, 다음과 같은 단어들은 'ㄹ'을 [ㄴ]으로 발음한다. 141 | -> 의견란[의견난], 임진란[임진난], 생산량[생산냥] 142 | -> 결단력[결딴녁], 공권력[공꿘녁], 동원령[동:원녕] 143 | -> 상견례[상견녜], 횡단로[횡단노], 이원론[이원논] 144 | -> 입원료[이붠뇨], 구근류[구근뉴] 145 | 146 | 21 147 | 제21항 위에서 지적한 이외의 자음 동화는 인정하지 않는다. 148 | -> 감기[감기], 옷감[옫깜], 있고[읻꼬] 149 | -> 꽃길[꼳낄], 젖먹이[전머기], 문법[문뻡] 150 | -> 꽃밭[꼳빧] 151 | 152 | 22 153 | 제22항 다음과 같은 용언의 어미는 [어]로 발음함을 원칙으로 하되, [여]로 발음함도 허용한다. 154 | -> 피어[피어/피여] 되어[되어/되여] 155 | [붙임] '이오, 아니오'도 이에 준하여 [이요, 아니요]로 발음함을 허용한다. 156 | 157 | 23 158 | 제23항 받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다. 159 | -> 국밥[국빱], 깎다[깍따], 넑받이[넉빠지], 삯돈[삭똔] 160 | -> 닭장[닥짱], 칡범[칙뻠], 뻗대다[뻗때다], 옷고름[옫꼬름] 161 | -> 있던[읻떤], 꽂고[꼳꼬], 꽃다발[꼳따발], 낯설다[낟썰다] 162 | -> 밭갈이[받까리], 솥전[솓쩐], 곱돌[곱똘], 덮개[덥깨] 163 | -> 옆집[엽찝], 넓죽하다[넙쭈카다], 읊조리다[읍쪼리다], 값지다[갑찌다] 164 | 165 | 24 166 | 제24항 어간 받침 'ㄴ(ㄵ), ㅁ(ㄻ)' 뒤에 결합되는 어미의 첫소리 'ㄱ, ㄷ, ㅅ, ㅈ'은 된소리로 발음한다. 167 | -> 신고[신꼬], 껴안다[껴안따], 앉고[안꼬], 얹다[언따] 168 | -> 삼고[삼꼬], 더듬지[더듬찌], 닮고[담꼬], 젊지[점찌] 169 | 다만, 피동, 사동의 접미사 '-기-'는 된소리로 발음하지 않는다. 170 | -> 안기다[안기다], 감기다[감기다], 굶기다[굼기다], 옮기다[옴기다] 171 | 172 | 25 173 | 제25항 어간 받침 'ㄼ, ㄾ' 뒤에 결합되는 어미의 첫소리 'ㄱ, ㄷ, ㅅ, ㅈ'은 된소리로 발음한다. 174 | -> 넓게[널께], 핥다[할따], 훑소[훌쏘], 떫지[떨찌] 175 | 176 | 26 177 | 제26항 한자어에서, 'ㄹ' 받침 뒤에 연결되는 'ㄷ, ㅅ, ㅈ'은 된소리로 발음한다. 178 | -> 갈등[갈뜽], 발동[발똥], 절도[절또], 말살[말쌀] 179 | -> 불소[불쏘](弗素), 일시[일씨], 갈증[갈쯩], 물질[물찔] 180 | -> 발전[발쩐], 몰상식[몰쌍식], 불세출[불쎄출] 181 | 다만, 같은 한자가 겹쳐진 단어의 경우에는 된소리로 발음하지 않는다. 182 | -> 허허실실[허허실실](虛虛實實), 절절하다[절절하다](切切- ) 183 | 184 | 27 185 | 제27항 관형사형 '-[으]ㄹ' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다. 186 | -> 할 것을[할 꺼슬], 갈 데가[갈 떼가], 할 바를[할 빠를] 187 | -> 할 수는[할 쑤는], 할 적에[할 쩌게], 갈 곳[갈 꼳] 188 | -> 할 도리[할 또리], 만날 사람[만날 싸람] 189 | 다만, 끊어서 말할 적에는 예사소리로 발음한다. 190 | [붙임] '-(으)ㄹ'로 시작되는 어미의 경우에도 이에 준한다. 191 | -> 할걸[할껄], 할밖에[할빠께], 할세라[할쎄라] 192 | -> 할수록[할쑤록], 할지라도[할찌라도], 할지언정[할찌언정] 193 | -> 할진대[할찐대] 194 | 195 | 28 196 | 제28항 표기상으로는 사이시옷이 없더라도, 관형격 기능을 지니는 사이시옷이 있어야 할(휴지가 성립되는) 합성어의 경우에는, 뒤 단어의 첫소리 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'을 된소리로 발음한다. 197 | -> 문고리[문꼬리], 눈동자[눈똥자], 신바람[신빠람], 산새[산쌔] 198 | -> 손재주[손째주], 길가[길까], 물동이[물똥이], 발바닥[발빠닥] 199 | -> 굴속[굴쏙], 술잔[술짠], 바람결[바람껼], 그믐달[그믐딸] 200 | -> 아침밥[아침빱], 잠자리[잠짜리], 강가[강까], 초승달[초승딸] 201 | -> 등불[등뿔], 창살[창쌀], 강줄기[강쭐기] 202 | 203 | 29 204 | 제29항 합성어 및 파생어에서, 앞 단어나 접두사의 끝이 자음이고 뒤 단어나 접미사의 첫 음절이 '이, 야, 여, 요, 유'인 경우에는, 'ㄴ'소리를 첨가하여 [니, 냐, 녀, 뇨, 뉴]로 발음한다. 205 | -> 솜이불[솜니불], 홑이불[혼니불], 막일[망닐] 206 | -> 삯일[상닐], 맨입[맨닙], 꽃잎[꼰닙] 207 | -> 내복약[내봉냑], 색연필[생년필], 직행열차[지캥녈차] 208 | -> 늑막염[능망념], 콩엿[콩녇], 담요[담뇨] 209 | -> 눈요기[눈뇨기], 영업용[영엄뇽], 식용유[시굥뉴] 210 | -> 국민윤리[궁민뉼리], 밤윳[밤뉻] 211 | 다만, 다음과 같은 말들은 'ㄴ'소리를 첨가하여 발음하되, 표기대로 발음할 수 있다. 212 | -> 이죽이죽[이중니죽/이주기죽], 야금야금[야금냐금/야그먀금] 213 | -> 검열[검녈/거멸], 욜랑욜랑[욜랑뇰랑/욜랑욜랑] 214 | -> 금융[금늉/그뮹] 215 | [붙임 1] 'ㄹ' 받침 뒤에 첨가되는 'ㄴ'소리는 [ㄹ]로 발음한다. 216 | -> 들일[들릴], 솔잎[솔립], 설익다[설릭따] 217 | -> 물약[물략], 불여우[불려우], 서울역[서울력] 218 | -> 물엿[물렫], 휘발유[휘발류], 유들유들[유들류들] 219 | [붙임 2] 두 단어를 이어서 한 마디로 발음하는 경우에도 이에 준한다. 220 | -> 한 일[한 닐], 옷 입다[온 닙따], 서른여섯[서른녀섣] 221 | -> 3연대[삼년대], 먹은 엿[머근 녇] 222 | -> 할 일[할릴], 잘 입다[잘 립따], 스물여섯[스물려섣] 223 | -> 1연대[일련대], 먹을 엿[머글 렫] 224 | 다만, 다음과 같은 단어에서는 'ㄴ(ㄹ)'소리를 첨가하여 발음하지 않는다. 225 | -> 6·25[유기오], 3·1절[사밀쩔], 송별연[송벼련], 등용문[등용문] 226 | 227 | 30 228 | 제30항 사이시옷이 붙는 단어는 다음과 같이 발음한다. 229 | 1. 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'으로 시작되는 단어 앞에 사이시옷이 올 때에는 이들 자음만을 된소리로 발음하는 것을 원칙으로 하되, 사이시옷을 [ㄷ]으로 발음하는 것도 허용한다. 230 | -> 냇가[내까/낻까], 샛길[새낄/샏낄], 빨랫돌[빨래똘/빨랟똘] 231 | -> 콧등[코뜽/콛뜽], 깃발[기빨/긷빨], 대팻밥[대패빱/대팯빱] 232 | -> 햇살[해쌀/핻쌀], 뱃속[배쏙/밷쏙], 뱃전[배쩐/밷쩐] 233 | -> 고갯짓[고개찓/고갣찓] 234 | 2. 사이시옷 뒤에 'ㄴ, ㅁ'이 결합되는 경우에는 [ㄴ]으로 발음한다. 235 | -> 콧날[콘날], 아랫니[아랜니] 236 | -> 툇마루[퇸마루], 뱃머리[밴머리] 237 | 3. 사이시옷 뒤에 '이'소리가 결합되는 경우에는 [ㄴㄴ]으로 발음한다. 238 | -> 베갯잇[베갣닏→베갠닏], 깻잎[깬닙] 239 | -> 나뭇잎[나문닙], 도리깻열[도리깬녈] 240 | -> 뒷윷[뒨뉻] 241 | -------------------------------------------------------------------------------- /g2pk/special.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Special rule for processing Hangul 4 | https://github.com/kyubyong/g2pK 5 | ''' 6 | 7 | import re 8 | 9 | from g2pk.utils import gloss, get_rule_id2text 10 | 11 | rule_id2text = get_rule_id2text() 12 | 13 | 14 | ############################ vowels ############################ 15 | def jyeo(inp, descriptive=False, verbose=False): 16 | rule = rule_id2text["5.1"] 17 | # 일반적인 규칙으로 취급한다 by kyubyong 18 | 19 | out = re.sub("([ᄌᄍᄎ])ᅧ", r"\1ᅥ", inp) 20 | gloss(verbose, out, inp, rule) 21 | return out 22 | 23 | 24 | def ye(inp, descriptive=False, verbose=False): 25 | rule = rule_id2text["5.2"] 26 | # 실제로 언중은 예, 녜, 셰, 쎼 이외의 'ㅖ'는 [ㅔ]로 발음한다. by kyubyong 27 | 28 | if descriptive: 29 | out = re.sub("([ᄀᄁᄃᄄㄹᄆᄇᄈᄌᄍᄎᄏᄐᄑᄒ])ᅨ", r"\1ᅦ", inp) 30 | else: 31 | out = inp 32 | gloss(verbose, out, inp, rule) 33 | return out 34 | 35 | 36 | def consonant_ui(inp, descriptive=False, verbose=False): 37 | rule = rule_id2text["5.3"] 38 | 39 | out = re.sub("([ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄌᄍᄎᄏᄐᄑᄒ])ᅴ", r"\1ᅵ", inp) 40 | gloss(verbose, out, inp, rule) 41 | return out 42 | 43 | 44 | def josa_ui(inp, descriptive=False, verbose=False): 45 | rule = rule_id2text["5.4.2"] 46 | # 실제로 언중은 높은 확률로 조사 '의'는 [ㅔ]로 발음한다. 47 | if descriptive: 48 | out = re.sub("의/J", "에", inp) 49 | else: 50 | out = inp.replace("/J", "") 51 | gloss(verbose, out, inp, rule) 52 | return out 53 | 54 | 55 | def vowel_ui(inp, descriptive=False, verbose=False): 56 | rule = rule_id2text["5.4.1"] 57 | # 실제로 언중은 높은 확률로 단어의 첫음절 이외의 '의'는 [ㅣ]로 발음한다.""" 58 | if descriptive: 59 | out = re.sub("(\Sᄋ)ᅴ", r"\1ᅵ", inp) 60 | else: 61 | out = inp 62 | gloss(verbose, out, inp, rule) 63 | return out 64 | 65 | 66 | def jamo(inp, descriptive=False, verbose=False): 67 | rule = rule_id2text["16"] 68 | out = inp 69 | 70 | out = re.sub("([그])ᆮᄋ", r"\1ᄉ", out) 71 | out = re.sub("([으])[ᆽᆾᇀᇂ]ᄋ", r"\1ᄉ", out) 72 | out = re.sub("([으])[ᆿ]ᄋ", r"\1ᄀ", out) 73 | out = re.sub("([으])[ᇁ]ᄋ", r"\1ᄇ", out) 74 | 75 | gloss(verbose, out, inp, rule) 76 | return out 77 | 78 | 79 | ############################ 어간 받침 ############################ 80 | def rieulgiyeok(inp, descriptive=False, verbose=False): 81 | rule = rule_id2text["11.1"] 82 | 83 | out = inp 84 | out = re.sub("ᆰ/P([ᄀᄁ])", r"ᆯᄁ", out) 85 | 86 | gloss(verbose, out, inp, rule) 87 | return out 88 | 89 | 90 | def rieulbieub(inp, descriptive=False, verbose=False): 91 | rule = rule_id2text["25"] 92 | out = inp 93 | 94 | out = re.sub("([ᆲᆴ])/Pᄀ", r"\1ᄁ", out) 95 | out = re.sub("([ᆲᆴ])/Pᄃ", r"\1ᄄ", out) 96 | out = re.sub("([ᆲᆴ])/Pᄉ", r"\1ᄊ", out) 97 | out = re.sub("([ᆲᆴ])/Pᄌ", r"\1ᄍ", out) 98 | 99 | gloss(verbose, out, inp, rule) 100 | return out 101 | 102 | 103 | def verb_nieun(inp, descriptive=False, verbose=False): 104 | rule = rule_id2text["24"] 105 | out = inp 106 | 107 | pairs = [ ("([ᆫᆷ])/Pᄀ", r"\1ᄁ"), 108 | ("([ᆫᆷ])/Pᄃ", r"\1ᄄ"), 109 | ("([ᆫᆷ])/Pᄉ", r"\1ᄊ"), 110 | ("([ᆫᆷ])/Pᄌ", r"\1ᄍ"), 111 | 112 | ("ᆬ/Pᄀ", "ᆫᄁ"), 113 | ("ᆬ/Pᄃ", "ᆫᄄ"), 114 | ("ᆬ/Pᄉ", "ᆫᄊ"), 115 | ("ᆬ/Pᄌ", "ᆫᄍ"), 116 | 117 | ("ᆱ/Pᄀ", "ᆷᄁ"), 118 | ("ᆱ/Pᄃ", "ᆷᄄ"), 119 | ("ᆱ/Pᄉ", "ᆷᄊ"), 120 | ("ᆱ/Pᄌ", "ᆷᄍ") ] 121 | 122 | for str1, str2 in pairs: 123 | out = re.sub(str1, str2, out) 124 | 125 | gloss(verbose, out, inp, rule) 126 | return out 127 | 128 | 129 | def balb(inp, descriptive=False, verbose=False): 130 | rule = rule_id2text["10.1"] 131 | out = inp 132 | syllable_final_or_consonants = "($|[^ᄋᄒ])" 133 | 134 | # exceptions 135 | out = re.sub(f"(바)ᆲ({syllable_final_or_consonants})", r"\1ᆸ\2", out) 136 | out = re.sub(f"(너)ᆲ([ᄌᄍ]ᅮ|[ᄃᄄ]ᅮ)", r"\1ᆸ\2", out) 137 | gloss(verbose, out, inp, rule) 138 | return out 139 | 140 | 141 | def palatalize(inp, descriptive=False, verbose=False): 142 | rule = rule_id2text["17"] 143 | out = inp 144 | 145 | out = re.sub("ᆮᄋ([ᅵᅧ])", r"ᄌ\1", out) 146 | out = re.sub("ᇀᄋ([ᅵᅧ])", r"ᄎ\1", out) 147 | out = re.sub("ᆴᄋ([ᅵᅧ])", r"ᆯᄎ\1", out) 148 | 149 | out = re.sub("ᆮᄒ([ᅵ])", r"ᄎ\1", out) 150 | 151 | gloss(verbose, out, inp, rule) 152 | return out 153 | 154 | 155 | def modifying_rieul(inp, descriptive=False, verbose=False): 156 | rule = rule_id2text["27"] 157 | out = inp 158 | 159 | pairs = [ ("ᆯ/E ᄀ", r"ᆯ ᄁ"), 160 | ("ᆯ/E ᄃ", r"ᆯ ᄄ"), 161 | ("ᆯ/E ᄇ", r"ᆯ ᄈ"), 162 | ("ᆯ/E ᄉ", r"ᆯ ᄊ"), 163 | ("ᆯ/E ᄌ", r"ᆯ ᄍ"), 164 | 165 | ("ᆯ걸", "ᆯ껄"), 166 | ("ᆯ밖에", "ᆯ빠께"), 167 | ("ᆯ세라", "ᆯ쎄라"), 168 | ("ᆯ수록", "ᆯ쑤록"), 169 | ("ᆯ지라도", "ᆯ찌라도"), 170 | ("ᆯ지언정", "ᆯ찌언정"), 171 | ("ᆯ진대", "ᆯ찐대") ] 172 | 173 | for str1, str2 in pairs: 174 | out = re.sub(str1, str2, out) 175 | 176 | gloss(verbose, out, inp, rule) 177 | return out 178 | -------------------------------------------------------------------------------- /g2pk/table.csv: -------------------------------------------------------------------------------- 1 | ,( ?)ᄒ,( ?)ᄀ,( ?)ᄁ,( ?)ᄂ,( ?)ᄃ,( ?)ᄄ,( ?)ᄅ,( ?)ᄆ,( ?)ᄇ,( ?)ᄈ,( ?)ᄉ,( ?)ᄊ,( ?)ᄌ,( ?)ᄍ,( ?)ᄎ,( ?)ᄏ,( ?)ᄐ,( ?)ᄑ,(\W|$) 2 | ᇂ,\1ᄒ,\1ᄏ(12),\1ᄁ,ᆫ\1ᄂ(12),\1ᄐ(12),\1ᄄ,\1ᄅ,\1ᄆ,\1ᄇ,\1ᄈ,\1ᄊ(12),\1ᄊ,\1ᄎ(12),\1ᄍ,\1ᄎ,\1ᄏ,\1ᄐ,\1ᄑ,ᆮ\1 3 | ᆨ,\1ᄏ(12),ᆨ\1ᄁ(23),,ᆼ\1ᄂ(18),ᆨ\1ᄄ(23),,ᆼ\1ᄂ(19/18),ᆼ\1ᄆ(18),ᆨ\1ᄈ(23),,ᆨ\1ᄊ(23),,ᆨ\1ᄍ(23),,,,,, 4 | ᆩ,\1ᄏ,ᆨ\1ᄁ(9/23),ᆨ\1ᄁ(9),ᆼ\1ᄂ(18),ᆨ\1ᄄ(9/23),ᆨ\1ᄄ(9),ᆼ\1ᄂ,ᆼ\1ᄆ(18),ᆨ\1ᄈ(9/23),ᆨ\1ᄈ(9),ᆨ\1ᄊ(9/23),ᆨ\1ᄊ(9),ᆨ\1ᄍ(9/23),ᆨ\1ᄍ(9),ᆨ\1ᄎ(9),ᆨ\1ᄏ(9),ᆨ\1ᄐ(9),ᆨ\1ᄑ(9),ᆨ\1(9) 5 | ᆪ,\1ᄏ,ᆨ\1ᄁ(9/23),ᆨ\1ᄁ(10),ᆼ\1ᄂ(18),ᆨ\1ᄄ(9/23),ᆨ\1ᄄ(10),ᆼ\1ᄂ,ᆼ\1ᄆ(18),ᆨ\1ᄈ(9/23),ᆨ\1ᄈ(10),ᆨ\1ᄊ(9/23),ᆨ\1ᄊ(10),ᆨ\1ᄍ(9/23),ᆨ\1ᄍ(10),ᆨ\1ᄎ(10),ᆨ\1ᄏ(10),ᆨ\1ᄐ(10),ᆨ\1ᄑ(10),ᆨ\1(10) 6 | ᆫ,,,,,,,ᆯ\1ᄅ(20),,,,,,,,,,,, 7 | ᆬ,ᆫ\1ᄎ(12),ᆫ\1ᄀ(10),ᆫ\1ᄁ(10),ᆫ\1ᄂ(10),ᆫ\1ᄃ(10),ᆫ\1ᄄ(10),ᆯ\1ᄅ(10/20),ᆫ\1ᄆ(10),ᆫ\1ᄇ(10),ᆫ\1ᄈ(10),ᆫ\1ᄉ(10),ᆫ\1ᄊ(10),ᆫ\1ᄌ(10),ᆫ\1ᄍ(10),ᆫ\1ᄎ(10),ᆫ\1ᄏ(10),ᆫ\1ᄐ(10),ᆫ\1ᄑ(10),ᆫ\1(10) 8 | ᆭ,ᆫ\1ᄒ,ᆫ\1ᄏ(12),ᆫ\1ᄁ,ᆫ\1ᄂ(12),ᆫ\1ᄐ(12),ᆫ\1ᄄ,ᆯ\1ᄅ,ᆫ\1ᄆ,ᆫ\1ᄇ,ᆫ\1ᄈ,ᆫ\1ᄊ(12),ᆫ\1ᄊ,ᆫ\1ᄎ(12),ᆫ\1ᄍ,ᆫ\1ᄎ,ᆫ\1ᄏ,ᆫ\1ᄐ,ᆫ\1ᄑ,ᆫ\1 9 | ᆮ,\1ᄐ(12),ᆮ\1ᄁ(23),,ᆫ\1ᄂ(18),ᆮ\1ᄄ(23),,ᆫ\1ᄂ,ᆫ\1ᄆ(18),ᆮ\1ᄈ(23),,ᆮ\1ᄊ(23),,ᆮ\1ᄍ(23),,,,,, 10 | ᆯ,,,,ᆯ\1ᄅ(20),,,,,,,,,,,,,,, 11 | ᆰ,ᆯ\1ᄏ(12),ᆨ\1ᄁ(11/23),ᆨ\1ᄁ(11),ᆼ\1ᄂ(11/18),ᆨ\1ᄄ(11/23),ᆨ\1ᄄ(11),ᆼ\1ᄂ(11/18),ᆼ\1ᄆ(11/18),ᆨ\1ᄈ(11/23),ᆨ\1ᄈ(11),ᆨ\1ᄊ(11/23),ᆨ\1ᄊ(11),ᆨ\1ᄍ(11/23),ᆨ\1ᄍ(11),ᆨ\1ᄎ(11),1),ᆨ\1ᄑ(11),,ᆨ\1(11) 12 | ᆱ,ᆷ\1ᄒ(11),ᆷ\1ᄀ(11),ᆷ\1ᄁ(11),ᆷ\1ᄂ(11),ᆷ\1ᄃ(11),ᆷ\1ᄄ(11),ᆷ\1ᄅ(11),ᆷ\1ᄆ(11),ᆷ\1ᄇ(11),ᆷ\1ᄈ(11),ᆷ\1ᄉ(11),ᆷ\1ᄊ(11),ᆷ\1ᄌ(11),ᆷ\1ᄍ(11),ᆷ\1ᄎ(11),ᆷ\1ᄏ(11),ᆷ\1ᄐ(11),ᆷ\1ᄑ(11),ᆷ\1(11) 13 | ᆲ,ᆯ\1ᄑ(12),ᆯ\1ᄁ(10/23),ᆯ\1ᄁ(10),ᆷ\1ᄂ(18),ᆯ\1ᄄ(10/23),ᆯ\1ᄄ(10),ᆯ\1ᄅ(10),ᆷ\1ᄆ(18),ᆯ\1ᄈ(10/23),ᆯ\1ᄈ(10),ᆯ\1ᄊ(10/23),ᆯ\1ᄊ(10),ᆯ\1ᄍ(10/23),ᆯ\1ᄍ(10),ᆯ\1ᄎ(10),ᆯ\1ᄏ(10)0),,,ᆯ\1(10) 14 | ᆳ,ᆯ\1ᄒ(10),ᆯ\1ᄁ(10/23),ᆯ\1ᄁ(10),ᆯ\1ᄅ(10/20),ᆯ\1ᄄ(10/23),ᆯ\1ᄄ(10),ᆯ\1ᄅ(10),ᆯ\1ᄆ(10),ᆯ\1ᄈ(10/23),ᆯ\1ᄈ(10),ᆯ\1ᄊ(10/23),ᆯ\1ᄊ(10),ᆯ\1ᄍ(10/23),ᆯ\1ᄍ(10),ᆯ\1ᄎ(10),ᆯ\1ᄏ(ᄑ(10),,,ᆯ\1(10) 15 | ᆴ,ᆯ\1ᄒ(10),ᆯ\1ᄀ(10),ᆯ\1ᄁ(10),ᆯ\1ᄅ(10/20),ᆯ\1ᄃ(10),ᆯ\1ᄄ(10),ᆯ\1ᄅ(10),ᆯ\1ᄆ(10),ᆯ\1ᄇ(10),ᆯ\1ᄈ(10),ᆯ\1ᄉ(10),ᆯ\1ᄊ(10),ᆯ\1ᄌ(10),ᆯ\1ᄍ(10),ᆯ\1ᄎ(10),ᆯ\1ᄏ(10),ᆯ\1ᄐ(10),ᆯ\1ᄑ(10),ᆯ\1(10) 16 | ᆵ,ᆸ\1ᄑ(11/12),ᆸ\1ᄁ(11/23),ᆸ\1ᄁ(11),ᆷ\1ᄂ(18),ᆸ\1ᄄ(11/23),ᆸ\1ᄄ(11),ᆷ\1ᄅ(11),ᆷ\1ᄆ(11/18),ᆸ\1ᄈ(11/23),ᆸ\1ᄈ(11),ᆸ\1ᄊ(11/23),ᆸ\1ᄊ(11),ᆸ\1ᄍ(11/23),ᆸ\1ᄍ(11),ᆸ\1ᄎ(11),ᆸ\1ᆸ\1ᄑ,,,ᆸ\1(11) 17 | ᆶ,ᆯ\1ᄒ(10),ᆯ\1ᄏ(12),ᆯ\1ᄁ,ᆯ\1ᄅ(12/20),ᆯ\1ᄐ(12),ᆯ\1ᄄ,ᆯ\1ᄅ,ᆯ\1ᄆ,ᆯ\1ᄇ,ᆯ\1ᄈ,ᆯ\1ᄊ(12),ᆯ\1ᄊ,ᆯ\1ᄎ(12),ᆯ\1ᄍ,ᆯ\1ᄎ,ᆯ\1ᄏ,ᆯ\1ᄐ,ᆯ\1ᄑ,ᆯ 18 | ᆷ,,,,,,,ᆷ\1ᄂ(19),,,,,,,,,,,, 19 | ᆸ,\1ᄑ(12),ᆸ\1ᄁ(23),,ᆷ\1ᄂ(18),ᆸ\1ᄄ(23),,ᆷ\1ᄂ(19/18),ᆷ\1ᄆ(18),ᆸ\1ᄈ(23),,ᆸ\1ᄊ(23),,ᆸ\1ᄍ(23),,,,,, 20 | ᆹ,ᆸ\1ᄑ(10),ᆸ\1ᄁ(10/23),ᆸ\1ᄁ(10),ᆷ\1ᄂ(10/18),ᆸ\1ᄄ(10/23),ᆸ\1ᄄ(10),ᆷ\1ᄂ(10/19/18),ᆷ\1ᄆ(10/18),ᆸ\1ᄈ(10/23),ᆸ\1ᄈ(10),ᆸ\1ᄊ(10/23),ᆸ\1ᄊ(10),ᆸ\1ᄍ(10/23),ᆸ\1ᄍ(10),ᆸ\1ᄎ(1ᄐ(10),ᆸ\1ᄑ,,,ᆸ\1(10) 21 | ᆺ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(9/18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(9/18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9) 22 | ᆻ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(9/18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(9/18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ1ᄑ(9),ᆮ\1(9) 23 | ᆼ,,,,,,,ᆼ\1ᄂ(19),,,,,,,,,,,, 24 | ᆽ,\1ᄎ(12),ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9) 25 | ᆾ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9) 26 | ᆿ,\1ᄏ,ᆨ\1ᄁ(9/23),ᆨ\1ᄁ(9),ᆼ\1ᄂ(18),ᆨ\1ᄄ(9/23),ᆨ\1ᄄ(9),ᆼ\1ᄂ(9/18),ᆼ\1ᄆ(9/18),ᆨ\1ᄈ(9/23),ᆨ\1ᄈ(9),ᆨ\1ᄊ(9/23),ᆨ\1ᄊ(9),ᆨ\1ᄍ(9/23),ᆨ\1ᄍ(9),ᆨ\1ᄎ(9),ᆨ\1ᄏ(9),ᆨ\1ᄐ(9),ᆨ\1ᄑ(9),ᆨ\1(9) 27 | ᇀ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9) 28 | ᇁ,\1ᄑ,ᆸ\1ᄁ(9/23),ᆸ\1ᄁ(9),ᆷ\1ᄂ(18),ᆸ\1ᄄ(9/23),ᆸ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆷ\1ᄆ(18),ᆸ\1ᄈ(9/23),ᆸ\1ᄈ(9),ᆸ\1ᄊ(9/23),ᆸ\1ᄊ(9),ᆸ\1ᄍ(9/23),ᆸ\1ᄍ(9),ᆸ\1ᄎ(9),ᆸ\1ᄏ(9),ᆸ\1ᄐ(9),ᆸ\1ᄑ(9),ᆸ\1(9) -------------------------------------------------------------------------------- /g2pk/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2h 3 | import os 4 | 5 | ############## English ############## 6 | def adjust(arpabets): 7 | '''Modify arpabets so that it fits our processes''' 8 | string = " " + " ".join(arpabets) + " $" 9 | string = re.sub("\d", "", string) 10 | string = string.replace(" T S ", " TS ") 11 | string = string.replace(" D Z ", " DZ ") 12 | string = string.replace(" AW ER ", " AWER ") 13 | string = string.replace(" IH R $", " IH ER ") 14 | string = string.replace(" EH R $", " EH ER ") 15 | string = string.replace(" $", "") 16 | 17 | return string.strip("$ ").split() 18 | 19 | 20 | def to_choseong(arpabet): 21 | '''Arpabet to choseong or onset''' 22 | d = \ 23 | {'B': 'ᄇ', 24 | 'CH': 'ᄎ', 25 | 'D': 'ᄃ', 26 | 'DH': 'ᄃ', 27 | 'DZ': 'ᄌ', 28 | 'F': 'ᄑ', 29 | 'G': 'ᄀ', 30 | 'HH': 'ᄒ', 31 | 'JH': 'ᄌ', 32 | 'K': 'ᄏ', 33 | 'L': 'ᄅ', 34 | 'M': 'ᄆ', 35 | 'N': 'ᄂ', 36 | 'NG': 'ᄋ', 37 | 'P': 'ᄑ', 38 | 'R': 'ᄅ', 39 | 'S': 'ᄉ', 40 | 'SH': 'ᄉ', 41 | 'T': 'ᄐ', 42 | 'TH': 'ᄉ', 43 | 'TS': 'ᄎ', 44 | 'V': 'ᄇ', 45 | 'W': 'W', 46 | 'Y': 'Y', 47 | 'Z': 'ᄌ', 48 | 'ZH': 'ᄌ'} 49 | 50 | return d.get(arpabet, arpabet) 51 | 52 | def to_jungseong(arpabet): 53 | '''Arpabet to jungseong or vowel''' 54 | d = \ 55 | {'AA': 'ᅡ', 56 | 'AE': 'ᅢ', 57 | 'AH': 'ᅥ', 58 | 'AO': 'ᅩ', 59 | 'AW': 'ᅡ우', 60 | 'AWER': "ᅡ워", 61 | 'AY': 'ᅡ이', 62 | 'EH': 'ᅦ', 63 | 'ER': 'ᅥ', 64 | 'EY': 'ᅦ이', 65 | 'IH': 'ᅵ', 66 | 'IY': 'ᅵ', 67 | 'OW': 'ᅩ', 68 | 'OY': 'ᅩ이', 69 | 'UH': 'ᅮ', 70 | 'UW': 'ᅮ'} 71 | return d.get(arpabet, arpabet) 72 | 73 | def to_jongseong(arpabet): 74 | '''Arpabet to jongseong or coda''' 75 | d = \ 76 | {'B': 'ᆸ', 77 | 'CH': 'ᆾ', 78 | 'D': 'ᆮ', 79 | 'DH': 'ᆮ', 80 | 'F': 'ᇁ', 81 | 'G': 'ᆨ', 82 | 'HH': 'ᇂ', 83 | 'JH': 'ᆽ', 84 | 'K': 'ᆨ', 85 | 'L': 'ᆯ', 86 | 'M': 'ᆷ', 87 | 'N': 'ᆫ', 88 | 'NG': 'ᆼ', 89 | 'P': 'ᆸ', 90 | 'R': 'ᆯ', 91 | 'S': 'ᆺ', 92 | 'SH': 'ᆺ', 93 | 'T': 'ᆺ', 94 | 'TH': 'ᆺ', 95 | 'V': 'ᆸ', 96 | 'W': 'ᆼ', 97 | 'Y': 'ᆼ', 98 | 'Z': 'ᆽ', 99 | 'ZH': 'ᆽ'} 100 | 101 | return d.get(arpabet, arpabet) 102 | 103 | 104 | def reconstruct(string): 105 | '''Some postprocessing rules''' 106 | pairs = [("그W", "ᄀW"), 107 | ("흐W", "ᄒW"), 108 | ("크W", "ᄏW"), 109 | ("ᄂYᅥ", "니어"), 110 | ("ᄃYᅥ", "디어"), 111 | ("ᄅYᅥ", "리어"), 112 | ("Yᅵ", "ᅵ"), 113 | ("Yᅡ", "ᅣ"), 114 | ("Yᅢ", "ᅤ"), 115 | ("Yᅥ", "ᅧ"), 116 | ("Yᅦ", "ᅨ"), 117 | ("Yᅩ", "ᅭ"), 118 | ("Yᅮ", "ᅲ"), 119 | ("Wᅡ", "ᅪ"), 120 | ("Wᅢ", "ᅫ"), 121 | ("Wᅥ", "ᅯ"), 122 | ("Wᅩ", "ᅯ"), 123 | ("Wᅮ", "ᅮ"), 124 | ("Wᅦ", "ᅰ"), 125 | ("Wᅵ", "ᅱ"), 126 | ("ᅳᅵ", "ᅴ"), 127 | ("Y", "ᅵ"), 128 | ("W", "ᅮ") 129 | ] 130 | for str1, str2 in pairs: 131 | string = string.replace(str1, str2) 132 | return string 133 | 134 | 135 | ############## Hangul ############## 136 | def parse_table(): 137 | '''Parse the main rule table''' 138 | lines = open(os.path.dirname(os.path.abspath(__file__)) + '/table.csv', 'r', encoding='utf8').read().splitlines() 139 | onsets = lines[0].split(",") 140 | table = [] 141 | for line in lines[1:]: 142 | cols = line.split(",") 143 | coda = cols[0] 144 | for i, onset in enumerate(onsets): 145 | cell = cols[i] 146 | if len(cell)==0: continue 147 | if i==0: 148 | continue 149 | else: 150 | str1 = f"{coda}{onset}" 151 | if "(" in cell: 152 | str2 = cell.split("(")[0] 153 | rule_ids = cell.split("(")[1][:-1].split("/") 154 | else: 155 | str2 = cell 156 | rule_ids = [] 157 | 158 | table.append((str1, str2, rule_ids)) 159 | return table 160 | 161 | 162 | ############## Preprocessing ############## 163 | def annotate(string, mecab): 164 | '''attach pos tags to the given string using Mecab 165 | mecab: mecab object 166 | ''' 167 | tokens = mecab.pos(string) 168 | if string.replace(" ", "") != "".join(token for token, _ in tokens): 169 | return string 170 | blanks = [i for i, char in enumerate(string) if char == " "] 171 | 172 | tag_seq = [] 173 | for token, tag in tokens: 174 | tag = tag.split("+")[-1] 175 | if tag=="NNBC": # bound noun 176 | tag = "B" 177 | else: 178 | tag = tag[0] 179 | tag_seq.append("_" * (len(token) - 1) + tag) 180 | tag_seq = "".join(tag_seq) 181 | 182 | for i in blanks: 183 | tag_seq = tag_seq[:i] + " " + tag_seq[i:] 184 | 185 | annotated = "" 186 | for char, tag in zip(string, tag_seq): 187 | annotated += char 188 | if char == "의" and tag == "J": 189 | annotated += "/J" 190 | elif tag=="E": 191 | if h2j(char)[-1] in "ᆯ": 192 | annotated += "/E" 193 | elif tag == "V": 194 | if h2j(char)[-1] in "ᆫᆬᆷᆱᆰᆲᆴ": 195 | annotated += "/P" 196 | elif tag == "B": # bound noun 197 | annotated += "/B" 198 | 199 | return annotated 200 | 201 | 202 | ############## Postprocessing ############## 203 | def compose(letters): 204 | # insert placeholder 205 | letters = re.sub("(^|[^\u1100-\u1112])([\u1161-\u1175])", r"\1ᄋ\2", letters) 206 | 207 | string = letters # assembled characters 208 | # c+v+c 209 | syls = set(re.findall("[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]", string)) 210 | for syl in syls: 211 | string = string.replace(syl, j2h(*syl)) 212 | 213 | # c+v 214 | syls = set(re.findall("[\u1100-\u1112][\u1161-\u1175]", string)) 215 | for syl in syls: 216 | string = string.replace(syl, j2h(*syl)) 217 | 218 | return string 219 | 220 | 221 | def group(inp): 222 | '''For group_vowels=True 223 | Contemporarily, Korean speakers don't distinguish some vowels. 224 | ''' 225 | inp = inp.replace("ᅢ", "ᅦ") 226 | inp = inp.replace("ᅤ", "ᅨ") 227 | inp = inp.replace("ᅫ", "ᅬ") 228 | inp = inp.replace("ᅰ", "ᅬ") 229 | 230 | return inp 231 | 232 | 233 | def _get_examples(): 234 | '''For internal use''' 235 | text = open('rules.txt', 'r', encoding='utf8').read().splitlines() 236 | examples = [] 237 | for line in text: 238 | if line.startswith("->"): 239 | examples.extend(re.findall("([ㄱ-힣][ ㄱ-힣]*)\[([ㄱ-힣][ ㄱ-힣]*)]", line)) 240 | _examples = [] 241 | for inp, gt in examples: 242 | for each in gt.split("/"): 243 | _examples.append((inp, each)) 244 | 245 | return _examples 246 | 247 | 248 | ############## Utilities ############## 249 | def get_rule_id2text(): 250 | '''for verbose=True''' 251 | rules = open(os.path.dirname(os.path.abspath(__file__)) + '/rules.txt', 'r', encoding='utf8').read().strip().split("\n\n") 252 | rule_id2text = dict() 253 | for rule in rules: 254 | rule_id, texts = rule.splitlines()[0], rule.splitlines()[1:] 255 | rule_id2text[rule_id.strip()] = "\n".join(texts) 256 | return rule_id2text 257 | 258 | 259 | def gloss(verbose, out, inp, rule): 260 | '''displays the process and relevant information''' 261 | if verbose and out != inp and out != re.sub("/[EJPB]", "", inp): 262 | print(compose(inp), "->", compose(out)) 263 | print("\033[1;31m", rule, "\033[0m") 264 | 265 | 266 | 267 | 268 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | REQUIRED_PACKAGES = [ 7 | 'jamo', 8 | 'nltk', 9 | 'konlpy', 10 | 'python-mecab-ko', 11 | ] 12 | 13 | setuptools.setup( 14 | name="g2pK", 15 | version="0.9.3", 16 | author="Kyubyong Park", 17 | author_email="kbpark.linguist@gmail.com", 18 | description="g2pK: g2p module for Korean", 19 | install_requires=REQUIRED_PACKAGES, 20 | license='Apache License 2.0', 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/Kyubyong/g2pK", 24 | packages=setuptools.find_packages(), 25 | package_data={'g2pk': ['g2pk/idioms.txt', 'g2pk/rules.txt', 'g2pk/table.csv']}, 26 | python_requires=">=3.6", 27 | include_package_data=True, 28 | classifiers=[ 29 | 'Development Status :: 5 - Production/Stable', 30 | 'Intended Audience :: Developers', 31 | 'Intended Audience :: Science/Research', 32 | "License :: OSI Approved :: Apache Software License", 33 | "Operating System :: OS Independent", 34 | "Programming Language :: Python :: 3", 35 | 'Programming Language :: Python :: 3.6', 36 | 'Programming Language :: Python :: 3.7', 37 | ], 38 | ) 39 | --------------------------------------------------------------------------------