├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── g2pk
    ├── __init__.py
    ├── english.py
    ├── g2pk.py
    ├── idioms.txt
    ├── numerals.py
    ├── regular.py
    ├── rules.txt
    ├── special.py
    ├── table.csv
    └── utils.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | *.xml
  6 | .idea
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | .hypothesis/
 50 | .pytest_cache/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | db.sqlite3
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # SageMath parsed files
 84 | *.sage.py
 85 | 
 86 | # Environments
 87 | .env
 88 | .venv
 89 | env/
 90 | venv/
 91 | ENV/
 92 | env.bak/
 93 | venv.bak/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include g2pk/rules.txt
2 | include g2pk/idioms.txt
3 | include g2pk/table.csv


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![image](https://img.shields.io/pypi/v/g2pk.svg)](https://pypi.org/project/g2pk/)
  2 | [![image](https://img.shields.io/pypi/l/g2pk.svg)](https://pypi.org/project/g2pk/)
  3 | [![image](https://img.shields.io/pypi/pyversions/g2pk.svg)](https://pypi.org/project/g2pk/)
  4 | 
  5 | # g2pK: g2p module for Korean
  6 | 
  7 | g2p means a task that converts graphemes to phonemes. Hangul, the main script for Korean, is phonetic, but the pronunciation rules are notoriously complicated.
  8 | So it is never easy to learn how to read a text in Korean. That's why g2p is necessary in various NLP tasks like TTS.
  9 | There's an open source g2p library for Korean, [KoG2P](https://github.com/scarletcho/KoG2P). It is 
 10 | simple and works well, but I think we need a better one. Please read through the following section (main features and usage)
 11 | to understand the philosophy of g2pK and how to use g2pK. We know it is not perfect at present. 
 12 | That's one of the reasons your contributions are more than welcome.
 13 | 
 14 | ## Requirements
 15 | * python >= 3.6
 16 | * jamo
 17 | * [python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko)
 18 | * konlpy
 19 | * nltk
 20 | 
 21 | ## Installation
 22 | ```
 23 | pip install g2pk
 24 | ```
 25 | 
 26 | ## Main features & Usage
 27 | * Returns text as it is pronounced, keeping punctuation.
 28 | ```
 29 | >>> from g2pk import G2p
 30 | >>> g2p = G2p()
 31 | >>> g2p("어제는 날씨가 맑았는데, 오늘은 흐리다.")
 32 | 어제는 날씨가 말간는데, 오느른 흐리다.
 33 | ```
 34 | * Determines pronunciation from context, thanks to Mecab, a morphological analyzer.
 35 | In the following example, note that the first and second 신고 are pronounced differently.
 36 | ```
 37 | >>> g2p("신을 신고 얼른 동사무소에 가서 혼인 신고 해라")
 38 | 시늘 신꼬 얼른 동사무소에 가서 호닌 신고 해라
 39 | ```
 40 | * Returns two types of results, that is, prescriptive (default) and descriptive (with the option `descriptive=True`) pronunciation.
 41 | For example, the josa (particle) 의 is pronounced 의 in principle, but in real life, it is often pronounced 에.
 42 | Also, 계 is much more often pronounced 게. 
 43 | ```
 44 | >>> sent = "나의 친구는 계산이 아주 빠르다"
 45 | >>> g2p(sent)
 46 | 나의 친구는 계사니 아주 빠르다
 47 | >>> g2p(sent, descriptive=True)
 48 | 나에 친구는 게사니 아주 빠르다
 49 | ```
 50 | * This distinction becomes more obvious if you set `group_vowels=True`.
 51 | In contemporary colloquial speech, some vowels are hard to distinguish from each other.
 52 | For example, in the example below, the vowel ㅒ is normalized to ㅖ.
 53 | ```
 54 | >>> sent = "저는 예전에 그 얘기를 들은 적이 있습니다"
 55 | >>> g2p(sent)
 56 | 저느 녜저네 그 얘기를 드른 저기 읻씀니다
 57 | >>> g2p(sent, group_vowels=True)
 58 | 저느 녜저네 그 예기를 드른 저기 읻씀니다
 59 | ```
 60 | * By default, it returns the standard Korean script, where letters are assembled to form a syllable.
 61 |  If you set `to_syl=False`, however, it returns Hangul letters or jamo. This can be useful for many applications like speech synthesis.
 62 | \*Depending on the font you are using, the two results below may look the same, but actually they are not.
 63 | ```
 64 | >>> sent = "어제는 날씨가 맑았는데, 오늘은 흐리다."
 65 | >>> g2p(sent)
 66 | 어제는 날씨가 말간는데, 오느른 흐리다.
 67 | >>> g2p(sent, to_syl=False)
 68 | 어제는 날씨가 말간는데, 오느른 흐리다.
 69 | ```
 70 | * English words written in the Latin alphabet are converted into Hangul. 
 71 | This is possible thanks to the [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict).
 72 | ```
 73 | >>> sent = "그 사람은 좀, old school 같아"
 74 | >>> g2p(sent)
 75 | 그 사라믄 좀, 올드 스쿨 가타
 76 | ```
 77 | * Arabic numerals are spelled out according to their context.
 78 |  Note that the first 12 is pronounced 열두, whereas the second 12 is pronounced 십이.
 79 | ```
 80 | >>> sent = "지금 시각은 12시 12분입니다"
 81 | >>> g2p(sent)
 82 | 지금 시가그 녈두시 시비부님니다
 83 | ```
 84 | * It is natural that rules can NOT cover every single case. Add special idioms to `idioms.txt`.
 85 | * If you set `verbose=True`, you will see the conversion processes with relevant information.
 86 | ```
 87 | >>> sent = "학교에 갔다 와서, 엄마가 해 주신 밥을 먹었다."
 88 | >>> g2p(sent, verbose=True)
 89 | 학교에 갔다 와서, 엄마가 해 주신 밥을 먹었다. -> 학꾜에 갔다 와서, 엄마가 해 주신 밥을 먹었다.
 90 |  제23항　받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다.
 91 | -> 국밥[국빱], 깎다[깍따], 넑받이[넉빠지], 삯돈[삭똔]
 92 | -> 닭장[닥짱], 칡범[칙뻠], 뻗대다[뻗때다], 옷고름[옫꼬름]
 93 | -> 있던[읻떤], 꽂고[꼳꼬], 꽃다발[꼳따발], 낯설다[낟썰다]
 94 | -> 밭갈이[받까리], 솥전[솓쩐], 곱돌[곱똘], 덮개[덥깨]
 95 | -> 옆집[엽찝], 넓죽하다[넙쭈카다], 읊조리다[읍쪼리다], 값지다[갑찌다] 
 96 | 학꾜에 갔다 와서, 엄마가 해 주신 밥을 먹었다. -> 학꾜에 갇따 와서, 엄마가 해 주신 밥을 먹얻따.
 97 |  제9항　받침 'ㄲ, ㅋ', 'ㅅ, ㅆ, ㅈ, ㅊ, ㅌ', 'ㅍ'은 어말 또는 자음 앞에서 각각 대표음 [ㄱ, ㄷ, ㅂ]으로 발음한다.
 98 | -> 닦다[닥따], 키읔[키윽], 키읔과[키윽꽈], 옷[옫]
 99 | -> 웃다[욷따], 있다[읻따], 젖[젇], 빚다[빋따]
100 | -> 꽃[꼳], 쫓다[쫃따], 솥[솓], 뱉다[밷따]
101 | -> 앞[압], 덮다[덥따]
102 | 제23항　받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다.
103 | -> 국밥[국빱], 깎다[깍따], 넑받이[넉빠지], 삯돈[삭똔]
104 | -> 닭장[닥짱], 칡범[칙뻠], 뻗대다[뻗때다], 옷고름[옫꼬름]
105 | -> 있던[읻떤], 꽂고[꼳꼬], 꽃다발[꼳따발], 낯설다[낟썰다]
106 | -> 밭갈이[받까리], 솥전[솓쩐], 곱돌[곱똘], 덮개[덥깨]
107 | -> 옆집[엽찝], 넓죽하다[넙쭈카다], 읊조리다[읍쪼리다], 값지다[갑찌다] 
108 | 학꾜에 갇따 와서, 엄마가 해 주신 밥을 먹얻따. -> 학꾜에 갇따 와서, 엄마가 해 주신 바블 머걷따.
109 |  제13항　홑받침이나 쌍받침이 모음으로 시작된 조사나 어미, 접미사와 결합되는 경우에는, 제 음가대로 뒤 음절 첫소리로 옮겨 발음한다.
110 | -> 깎아[까까], 옷이[오시], 있어[이써], 낮이[나지]
111 | -> 꽂아[꼬자], 꽃을[꼬츨], 쫓아[쪼차], 밭에[바테]
112 | -> 앞으로[아프로], 덮이다[더피다] 
113 | ```
114 | 
115 | 
116 | ## References
117 | 
118 | If you use our software for research, please cite:
119 | 
120 | ```
121 | @misc{park2019g2pk,
122 |   author = {Park, Kyubyong},
123 |   title = {g2pK},
124 |   year = {2019},
125 |   publisher = {GitHub},
126 |   journal = {GitHub repository},
127 |   howpublished = {\url{https://github.com/Kyubyong/g2pk}}
128 | }
129 | ```
130 | 


--------------------------------------------------------------------------------
/g2pk/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | r"""g2pK
3 | """
4 | from __future__ import absolute_import
5 | 
6 | from .g2pk import G2p


--------------------------------------------------------------------------------
/g2pk/english.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | '''
  3 | Convert English to Hangul
  4 | https://github.com/kyubyong/g2pK
  5 | '''
  6 | 
  7 | import re
  8 | 
  9 | from g2pk.utils import adjust, compose, to_choseong, to_jungseong, to_jongseong, reconstruct
 10 | 
 11 | 
 12 | def convert_eng(string, cmu):
 13 |     '''Convert a string such that English words inside are turned into Hangul.
 14 |     string: input string.
 15 |     cmu: cmu dict object.
 16 | 
 17 |     >>> convert_eng("그 사람 좀 old school이야", cmu)
 18 |     그 사람 좀 올드 스쿨이야
 19 |     '''
 20 |     eng_words = set(re.findall("[A-Za-z']+", string))
 21 |     for eng_word in eng_words:
 22 |         word = eng_word.lower()
 23 |         if word not in cmu:
 24 |             continue
 25 | 
 26 |         arpabets = cmu[word][0] # https://en.wikipedia.org/wiki/ARPABET
 27 |         phonemes = adjust(arpabets)
 28 |         ret = ""
 29 |         for i in range(len(phonemes)):
 30 |             p = phonemes[i] # phoneme
 31 |             p_prev = phonemes[i - 1] if i > 0 else "^"
 32 |             p_next = phonemes[i + 1] if i < len(phonemes) - 1 else "$"
 33 |             p_next2 = phonemes[i + 1] if i < len(phonemes) - 2 else "$"
 34 | 
 35 |             # desginated sets
 36 |             short_vowels = ("AE", "AH", "AX", "EH", "IH", "IX", "UH")
 37 |             vowels = "AEIOUY"
 38 |             consonants = "BCDFGHJKLMNPQRSTVWXZ"
 39 |             syllable_final_or_consonants = "$BCDFGHJKLMNPQRSTVWXZ"
 40 | 
 41 |             # 외래어 표기법 https://ko.dict.naver.com/help.nhn?page=4-1-3-1#dtl_cts
 42 |             #  1항. 무성 파열음 ([p], [t], [k])
 43 |             # 1. 짧은 모음 다음의 어말 무성 파열음([p], [t], [k])은 받침으로 적는다.
 44 |             # 2. 짧은 모음과 유음·비음([l], [r], [m], [n]) 이외의 자음 사이에 오는 무성 파열음([p], [t], [k])은 받침으로 적는다.
 45 |             # 3. 위 경우 이외의 어말과 자음 앞의 [p], [t], [k]는 '으'를 붙여 적는다.
 46 | 
 47 |             if p in "PTK":
 48 |                 if p_prev[:2] in short_vowels and p_next == "$":  # 1
 49 |                     ret += to_jongseong(p)
 50 |                 elif p_prev[:2] in short_vowels and p_next[0] not in "AEIOULRMN":  # 2
 51 |                     ret += to_jongseong(p)
 52 |                 elif p_next[0] in "$BCDFGHJKLMNPQRSTVWXYZ":  # 3
 53 |                     ret += to_choseong(p)
 54 |                     ret += "ᅳ"
 55 |                 else:
 56 |                     ret += to_choseong(p)
 57 | 
 58 |             # 2항. 유성 파열음([b], [d], [g])
 59 |             # 어말과 모든 자음 앞에 오는 유성 파열음은 '으'를 붙여 적는다.
 60 |             elif p in "BDG":
 61 |                 ret += to_choseong(p)
 62 |                 if p_next[0] in syllable_final_or_consonants:
 63 |                     ret += "ᅳ"
 64 | 
 65 |             # 3항. 마찰음([s], [z], [f], [v], [θ], [ð], [ʃ], [ʒ])
 66 |             # 1. 어말 또는 자음 앞의 [s], [z], [f], [v], [θ], [ð]는 '으'를 붙여 적는다.
 67 |             # 2. 어말의 [ʃ]는 '시'로 적고, 자음 앞의 [ʃ]는 '슈'로, 모음 앞의 [ʃ]는 뒤따르는 모음에 따라 '샤', '섀', '셔', '셰', '쇼', '슈', '시'로 적는다.
 68 |             # 3. 어말 또는 자음 앞의 [ʒ]는 '지'로 적고, 모음 앞의 [ʒ]는 'ㅈ'으로 적는다.
 69 |             elif p in ("S", "Z", "F", "V", "TH", "DH", "SH", "ZH"):
 70 |                 ret += to_choseong(p)
 71 | 
 72 |                 if p in ("S", "Z", "F", "V", "TH", "DH"):  # 1
 73 |                     if p_next[0] in syllable_final_or_consonants:
 74 |                         ret += "ᅳ"
 75 |                 elif p == "SH":  # 2
 76 |                     if p_next[0] in "$":
 77 |                         ret += "ᅵ"
 78 |                     elif p_next[0] in consonants:
 79 |                         ret += "ᅲ"
 80 |                     else:
 81 |                         ret += "Y"
 82 |                 elif p == "ZH":  # 3
 83 |                     if p_next[0] in syllable_final_or_consonants:
 84 |                         ret += "ᅵ"
 85 | 
 86 |             # 4항. 파찰음([ʦ], [ʣ], [ʧ], [ʤ])
 87 |             # 1. 어말 또는 자음 앞의 [ʦ], [ʣ]는 '츠', '즈'로 적고, [ʧ], [ʤ]는 '치', '지'로 적는다.
 88 |             # 2. 모음 앞의 [ʧ], [ʤ]는 'ㅊ', 'ㅈ'으로 적는다.
 89 |             elif p in ("TS", "DZ", "CH", "JH",):
 90 |                 ret += to_choseong(p)  # 2
 91 | 
 92 |                 if p_next[0] in syllable_final_or_consonants:  # 1
 93 |                     if p in ("TS", "DZ"):
 94 |                         ret += "ᅳ"
 95 |                     else:
 96 |                         ret += "ᅵ"
 97 | 
 98 |             # 5항. 비음([m], [n], [ŋ])
 99 |             # 1. 어말 또는 자음 앞의 비음은 모두 받침으로 적는다.
100 |             # 2. 모음과 모음 사이의 [ŋ]은 앞 음절의 받침 'ㆁ'으로 적는다.
101 |             elif p in ("M", "N", "NG"):
102 |                 if p in "MN" and p_next[0] in vowels:
103 |                     ret += to_choseong(p)
104 |                 else:
105 |                     ret += to_jongseong(p)
106 | 
107 |             # 6항. 유음([l])
108 |             # 1. 어말 또는 자음 앞의 [l]은 받침으로 적는다.
109 |             # 2. 어중의 [l]이 모음 앞에 오거나, 모음이 따르지 않는 비음([m], [n]) 앞에 올 때에는 'ㄹㄹ'로 적는다.
110 |             # 3. 다만, 비음([m], [n]) 뒤의 [l]은 모음 앞에 오더라도 'ㄹ'로 적는다.
111 |             elif p == "L":
112 |                 if p_prev == "^":  # initial
113 |                     ret += to_choseong(p)
114 |                 elif p_next[0] in "$BCDFGHJKLPQRSTVWXZ":  # 1
115 |                     ret += to_jongseong(p)
116 |                 elif p_prev in "MN":  # 3
117 |                     ret += to_choseong(p)
118 |                 elif p_next[0] in vowels:  # 2
119 |                     ret += "ᆯᄅ"
120 |                 elif p_next in "MN" and p_next2[0] not in vowels:  # 2
121 |                     ret += "ᆯ르"
122 | 
123 |             # custom
124 |             elif p == "ER":
125 |                 if p_prev[0] in vowels:
126 |                     ret += "ᄋ"
127 |                 ret += to_jungseong(p)
128 |                 if p_next[0] in vowels:
129 |                     ret += "ᄅ"
130 |             elif p == "R":
131 |                 if p_next[0] in vowels:
132 |                     ret += to_choseong(p)
133 | 
134 |             # 8항. 중모음1) ([ai], [au], [ei], [ɔi], [ou], [auə])
135 |             # 중모음은 각 단모음의 음가를 살려서 적되, [ou]는 '오'로, [auə]는 '아워'로 적는다.
136 |             elif p[0] in "AEIOU":
137 |                 ret += to_jungseong(p)
138 | 
139 |             else:
140 |                 ret += to_choseong(p)
141 | 
142 |         ret = reconstruct(ret)
143 |         ret = compose(ret)
144 |         ret = re.sub("[\u1100-\u11FF]", "", ret) # remove hangul jamo
145 |         string = string.replace(eng_word, ret)
146 |     return string
147 | 
if __name__ == "__main__":
    # Manual smoke test: transliterate the English word in a sample sentence
    # using the CMU dictionary shipped with NLTK and print the result.
    from nltk.corpus import cmudict

    sample = "오늘 학교에서 밥을 먹고 집에 와서 game을 했다"
    print(convert_eng(sample, cmudict.dict()))


--------------------------------------------------------------------------------
/g2pk/g2pk.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | '''
  3 | https://github.com/kyubyong/g2pK
  4 | '''
  5 | 
  6 | import os, re
  7 | 
  8 | import nltk
  9 | import mecab
 10 | from jamo import h2j
 11 | from nltk.corpus import cmudict
 12 | 
# The CMU pronouncing dictionary is loaded by G2p for English words.
# For further info. about cmu dict, consult http://www.speech.cs.cmu.edu/cgi-bin/cmudict.
# Runs at import time: look the corpus up in the local NLTK data first and
# download it only when that lookup fails.
try:
    nltk.data.find('corpora/cmudict.zip')
except LookupError:
    nltk.download('cmudict')
 18 | 
 19 | from g2pk.special import jyeo, ye, consonant_ui, josa_ui, vowel_ui, jamo, rieulgiyeok, rieulbieub, verb_nieun, balb, palatalize, modifying_rieul
 20 | from g2pk.regular import link1, link2, link3, link4
 21 | from g2pk.utils import annotate, compose, group, gloss, parse_table, get_rule_id2text
 22 | from g2pk.english import convert_eng
 23 | from g2pk.numerals import convert_num
 24 | 
 25 | 
 26 | class G2p(object):
 27 |     def __init__(self):
 28 |         self.mecab = self.get_mecab()
 29 |         self.table = parse_table()
 30 | 
 31 |         self.cmu = cmudict.dict() # for English
 32 | 
 33 |         self.rule2text = get_rule_id2text() # for comments of main rules
 34 |         self.idioms_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "idioms.txt")
 35 | 
 36 |     def get_mecab(self):
 37 |         try:
 38 |             return mecab.MeCab()
 39 |         except Exception as e:
 40 |             raise Exception(
 41 |                 'If you want to install mecab, The command is... pip install python-mecab-ko'
 42 |             )
 43 | 
 44 |     def idioms(self, string, descriptive=False, verbose=False):
 45 |         '''Process each line in `idioms.txt`
 46 |         Each line is delimited by "===",
 47 |         and the left string is replaced by the right one.
 48 |         inp: input string.
 49 |         descriptive: not used.
 50 |         verbose: boolean.
 51 | 
 52 |         >>> idioms("지금 mp3 파일을 다운받고 있어요")
 53 |         지금 엠피쓰리 파일을 다운받고 있어요
 54 |         '''
 55 |         rule = "from idioms.txt"
 56 |         out = string
 57 | 
 58 |         for line in open(self.idioms_path, 'r', encoding="utf8"):
 59 |             line = line.split("#")[0].strip()
 60 |             if "===" in line:
 61 |                 str1, str2 = line.split("===")
 62 |                 out = re.sub(str1, str2, out)
 63 |         gloss(verbose, out, string, rule)
 64 | 
 65 |         return out
 66 | 
 67 |     def __call__(self, string, descriptive=False, verbose=False, group_vowels=False, to_syl=True):
 68 |         '''Main function
 69 |         string: input string
 70 |         descriptive: boolean.
 71 |         verbose: boolean
 72 |         group_vowels: boolean. If True, the vowels of the identical sound are normalized.
 73 |         to_syl: boolean. If True, hangul letters or jamo are assembled to form syllables.
 74 | 
 75 |         For example, given an input string "나의 친구가 mp3 file 3개를 다운받고 있다",
 76 |         STEP 1. idioms
 77 |         -> 나의 친구가 엠피쓰리 file 3개를 다운받고 있다
 78 | 
 79 |         STEP 2. English to Hangul
 80 |         -> 나의 친구가 엠피쓰리 파일 3개를 다운받고 있다
 81 | 
 82 |         STEP 3. annotate
 83 |         -> 나의/J 친구가 엠피쓰리 파일 3개/B를 다운받고 있다
 84 | 
 85 |         STEP 4. Spell out arabic numbers
 86 |         -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다
 87 | 
 88 |         STEP 5. decompose
 89 |         -> 나의/J 친구가 엠피쓰리 파일 세개/B를 다운받고 있다
 90 | 
 91 |         STEP 6-9. Hangul
 92 |         -> 나의 친구가 엠피쓰리 파일 세개를 다운받꼬 읻따
 93 |         '''
 94 |         # 1. idioms
 95 |         string = self.idioms(string, descriptive, verbose)
 96 | 
 97 |         # 2 English to Hangul
 98 |         string = convert_eng(string, self.cmu)
 99 | 
100 |         # 3. annotate
101 |         string = annotate(string, self.mecab)
102 | 
103 |         # 4. Spell out arabic numbers
104 |         string = convert_num(string)
105 | 
106 |         # 5. decompose
107 |         inp = h2j(string)
108 | 
109 |         # 6. special
110 |         for func in (jyeo, ye, consonant_ui, josa_ui, vowel_ui, \
111 |                      jamo, rieulgiyeok, rieulbieub, verb_nieun, \
112 |                      balb, palatalize, modifying_rieul):
113 |             inp = func(inp, descriptive, verbose)
114 |         inp = re.sub("/[PJEB]", "", inp)
115 | 
116 |         # 7. regular table: batchim + onset
117 |         for str1, str2, rule_ids in self.table:
118 |             _inp = inp
119 |             inp = re.sub(str1, str2, inp)
120 | 
121 |             if len(rule_ids)>0:
122 |                 rule = "\n".join(self.rule2text.get(rule_id, "") for rule_id in rule_ids)
123 |             else:
124 |                 rule = ""
125 |             gloss(verbose, inp, _inp, rule)
126 | 
127 |         # 8 link
128 |         for func in (link1, link2, link3, link4):
129 |             inp = func(inp, descriptive, verbose)
130 | 
131 |         # 9. postprocessing
132 |         if group_vowels:
133 |             inp = group(inp)
134 | 
135 |         if to_syl:
136 |             inp = compose(inp)
137 |         return inp
138 | 
if __name__ == "__main__":
    # Manual demo: print the conversion so that running the module
    # actually shows a result instead of discarding it.
    g2p = G2p()
    print(g2p("나의 친구가 mp3 file 3개를 다운받고 있다"))
142 | 


--------------------------------------------------------------------------------
/g2pk/idioms.txt:
--------------------------------------------------------------------------------
  1 | # The replacements in this file are applied before all other rules.
  2 | # Comments are preceded by #.
  3 | # Each line should look like [str1]===[str2],
  4 | # where str2 replaces str1.
  5 | # Note that these will be processed through regular expression.
  6 | 의견란===의견난
  7 | 임진란===임진난
  8 | 생산량===생산냥
  9 | 결단력===결딴녁
 10 | 공권력===공꿘녁
 11 | 동원령===동원녕
 12 | 상견례===상견녜
 13 | 횡단로===횡단노
 14 | 이원론===이원논
 15 | 입원료===이붠뇨
 16 | 구근류===구근뉴
 17 | 
 18 | 갈등===갈뜽
 19 | 발동===발똥
 20 | 절도===절또
 21 | 말살===말쌀
 22 | 불소===불쏘
 23 | 일시===일씨
 24 | 갈증===갈쯩
 25 | 물질===물찔
 26 | 발전===발쩐
 27 | 몰상식===몰쌍식
 28 | 불세출===불쎄출
 29 | 
 30 | 문고리===문꼬리
 31 | 눈동자===눈똥자
 32 | 신바람===신빠람
 33 | 산새===산쌔
 34 | 손재주===손째주
 35 | 길가===길까
 36 | 물동이===물똥이
 37 | 발바닥===발빠닥
 38 | 굴속===굴쏙
 39 | 술잔===술짠
 40 | 바람결===바람껼
 41 | 그믐달===그믐딸
 42 | 아침밥===아침빱
 43 | 잠자리===잠짜리
 44 | 강가===강까
 45 | 초승달===초승딸
 46 | 등불===등뿔
 47 | 창살===창쌀
 48 | 강줄기===강쭐기
 49 | 
 50 | 솜이불===솜니불
 51 | 홑이불===혼니불
 52 | 막일===망닐
 53 | 삯일===상닐
 54 | 맨입===맨닙
 55 | 꽃잎===꼰닙
 56 | 내복약===내봉냑
 57 | 색연필===생년필
 58 | 직행열차===지캥녈차
 59 | 늑막염===능망념
 60 | 콩엿===콩녇
 61 | 담요===담뇨
 62 | 눈요기===눈뇨기
 63 | 
 64 | 영업용===영엄뇽
 65 | 식용유===시굥뉴
 66 | 국민윤리===궁민뉼리
 67 | 밤윷===밤뉻
 68 | 이죽이죽===이중니죽
 69 | 야금야금===야금냐금
 70 | 검열===검녈
 71 | 욜랑욜랑===욜랑뇰랑
 72 | 금융===금늉
 73 | 들일===들릴
 74 | 솔잎===솔립
 75 | 설익다===설릭따
 76 | 물약===물략
 77 | 불여우===불려우
 78 | 서울역===서울력
 79 | 물엿===물렫
 80 | 휘발유===휘발류
 81 | 유들유들===유들류들
 82 | 한일===한닐
 83 | 옷입다===온닙따
 84 | 서른여섯===서른녀섣
 85 | 3연대===삼년대
 86 | 먹은엿===머근녇
 87 | 할일===할릴
 88 | 잘입다===잘립따
 89 | 스물여섯===스물려섣
 90 | 1연대===일련대
 91 | 먹을엿===머글렫
 92 | 6·25===유기오
 93 | 3·1절===사밀쩔
 94 | 송별연===송벼련
 95 | 등용문===등용문
 96 | 
 97 | 냇가===내까
 98 | 샛길===새낄
 99 | 빨랫돌===빨래똘
100 | 콧등===코뜽
101 | 깃발===기빨
102 | 대팻밥===대패빱
103 | 햇살===해쌀
104 | 뱃속===배쏙
105 | 뱃전===배쩐
106 | 고갯짓===고개찓
107 | 
108 | 콧날===콘날
109 | 아랫니===아랜니
110 | 툇마루===퇸마루
111 | 뱃머리===밴머리
112 | 
113 | 베갯잇===베갠닏
114 | 깻잎===깬닙
115 | 나뭇잎===나문닙
116 | 도리깻열===도리깬녈
117 | 뒷윷===뒨뉻
118 | 
119 | 할걸===할껄
120 | 할밖에===할빠께
121 | 할세라===할쎄라
122 | 할수록===할쑤록
123 | 할지라도===할찌라도
124 | 할지언정===할찌언정
125 | 할진대===할찐대
126 | 
127 | ml===밀리리터
128 | mp3===엠피쓰리
129 | %===퍼센트
130 | jpeg===제이펙
131 | mp4===엠피포
132 | 
133 | 1번째===첫번째
134 | 10월===시월
135 | 
136 | 
137 | 


--------------------------------------------------------------------------------
/g2pk/numerals.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | '''
  3 | https://github.com/kyubyong/g2pK
  4 | '''
  5 | 
  6 | import re
  7 | 
  8 | # This is a list of bound nouns preceded by pure Korean numerals.
  9 | BOUND_NOUNS = "군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통"
 10 | 
 11 | 
 12 | def process_num(num, sino=True):
 13 |     '''Process a string looking like arabic number.
 14 |     num: string. Consists of [0-9,]. e.g., 12,345
 15 |     sino: boolean. If True, sino-Korean numerals, i.e., 일, 이, .. are considered.
 16 |         Otherwise, pure Korean ones in their modifying forms such as 한, 두, ... are returned.
 17 | 
 18 |     >>> process_num("123,456,789", sino=True)
 19 |     일억이천삼백사십오만육천칠백팔십구
 20 | 
 21 |     >>> process_num("123,456,789", sino=False)
 22 |     일억이천삼백사십오만육천칠백여든아홉
 23 |     '''
 24 |     num = re.sub(",", "", num)
 25 | 
 26 |     if num == "0":
 27 |         return "영"
 28 |     if not sino and num == "20":
 29 |         return "스무"
 30 | 
 31 |     digits = "123456789"
 32 |     names = "일이삼사오육칠팔구"
 33 |     digit2name = {d: n for d, n in zip(digits, names)}
 34 | 
 35 |     modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉"
 36 |     decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔"
 37 |     digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
 38 |     digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
 39 | 
 40 |     spelledout = []
 41 |     for i, digit in enumerate(num):
 42 |         i = len(num) - i - 1
 43 |         if sino:
 44 |             if i == 0:
 45 |                 name = digit2name.get(digit, "")
 46 |             elif i == 1:
 47 |                 name = digit2name.get(digit, "") + "십"
 48 |                 name = name.replace("일십", "십")
 49 |         else:
 50 |             if i == 0:
 51 |                 name = digit2mod.get(digit, "")
 52 |             elif i == 1:
 53 |                 name = digit2dec.get(digit, "")
 54 |         if digit == '0':
 55 |             if i % 4 == 0:
 56 |                 last_three = spelledout[-min(3, len(spelledout)):]
 57 |                 if "".join(last_three) == "":
 58 |                     spelledout.append("")
 59 |                     continue
 60 |             else:
 61 |                 spelledout.append("")
 62 |                 continue
 63 |         if i == 2:
 64 |             name = digit2name.get(digit, "") + "백"
 65 |             name = name.replace("일백", "백")
 66 |         elif i == 3:
 67 |             name = digit2name.get(digit, "") + "천"
 68 |             name = name.replace("일천", "천")
 69 |         elif i == 4:
 70 |             name = digit2name.get(digit, "") + "만"
 71 |             name = name.replace("일만", "만")
 72 |         elif i == 5:
 73 |             name = digit2name.get(digit, "") + "십"
 74 |             name = name.replace("일십", "십")
 75 |         elif i == 6:
 76 |             name = digit2name.get(digit, "") + "백"
 77 |             name = name.replace("일백", "백")
 78 |         elif i == 7:
 79 |             name = digit2name.get(digit, "") + "천"
 80 |             name = name.replace("일천", "천")
 81 |         elif i == 8:
 82 |             name = digit2name.get(digit, "") + "억"
 83 |         elif i == 9:
 84 |             name = digit2name.get(digit, "") + "십"
 85 |         elif i == 10:
 86 |             name = digit2name.get(digit, "") + "백"
 87 |         elif i == 11:
 88 |             name = digit2name.get(digit, "") + "천"
 89 |         elif i == 12:
 90 |             name = digit2name.get(digit, "") + "조"
 91 |         elif i == 13:
 92 |             name = digit2name.get(digit, "") + "십"
 93 |         elif i == 14:
 94 |             name = digit2name.get(digit, "") + "백"
 95 |         elif i == 15:
 96 |             name = digit2name.get(digit, "") + "천"
 97 |         spelledout.append(name)
 98 | 
 99 |     return "".join(elem for elem in spelledout)
100 | 
101 | 
def convert_num(string):
    '''Convert an annotated string such that arabic numerals inside are spelled out.

    A number directly followed by a hangul word tagged "/B" is spelled out
    as a whole with `process_num`: with pure Korean numerals when the word
    is one of the entries of BOUND_NOUNS, with sino-Korean numerals
    otherwise. All digits that remain afterwards are read one by one.

    string: annotated input string.

    Returns the string without arabic digits.

    >>> convert_num("우리 3시/B 10분/B에 만나자.")
    우리 세시/B 십분/B에 만나자.
    '''
    # Compare against whole entries. A substring test on the raw string
    # would also accept fragments such as "루" (from 그루) or "리" (from 마리).
    bound_nouns = set(BOUND_NOUNS.split())

    def _spell(match):
        # Rewrite exactly the matched span, so "3개/B" inside "13개/B"
        # is never touched on its own.
        num, bn = match.groups()
        spelledout = process_num(num, sino=bn not in bound_nouns)
        return f"{spelledout}{bn}/B"

    # Bound Nouns
    string = re.sub(r"(\d[\d,]*)([ㄱ-힣]+)/B", _spell, string)

    # digit by digit for remaining digits
    return string.translate(str.maketrans("0123456789", "영일이삼사오육칠팔구"))
126 | 
127 | 
if __name__ == "__main__":
    # Smoke test: the same number in both numeral systems, then a sentence.
    for use_sino in (True, False):
        print(process_num("123,456,789", sino=use_sino))
    print(convert_num("우리 3시/B 10분/B에 만나자."))
133 | 


--------------------------------------------------------------------------------
/g2pk/regular.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | '''
  3 | https://github.com/kyubyong/g2pK
  4 | '''
  5 | 
  6 | from g2pk.utils import gloss, get_rule_id2text
# Lookup indexed by rule id (e.g. "13"); the link functions below fetch
# their entry from it and hand it to gloss().
rule_id2text = get_rule_id2text()
  8 | 
  9 | 
 10 | def link1(inp, descriptive=False, verbose=False):
 11 |     rule = rule_id2text["13"]
 12 |     out = inp
 13 | 
 14 |     pairs = [ ("ᆨᄋ", "ᄀ"),
 15 |               ("ᆩᄋ", "ᄁ"),
 16 |               ("ᆫᄋ", "ᄂ"),
 17 |               ("ᆮᄋ", "ᄃ"),
 18 |               ("ᆯᄋ", "ᄅ"),
 19 |               ("ᆷᄋ", "ᄆ"),
 20 |               ("ᆸᄋ", "ᄇ"),
 21 |               ("ᆺᄋ", "ᄉ"),
 22 |               ("ᆻᄋ", "ᄊ"),
 23 |               ("ᆽᄋ", "ᄌ"),
 24 |               ("ᆾᄋ", "ᄎ"),
 25 |               ("ᆿᄋ", "ᄏ"),
 26 |               ("ᇀᄋ", "ᄐ"),
 27 |               ("ᇁᄋ", "ᄑ")]
 28 |     for str1, str2 in pairs:
 29 |         out = out.replace(str1, str2)
 30 | 
 31 |     gloss(verbose, out, inp, rule)
 32 |     return out
 33 | 
 34 | 
 35 | def link2(inp, descriptive=False, verbose=False):
 36 |     rule = rule_id2text["14"]
 37 |     out = inp
 38 | 
 39 |     pairs = [ ("ᆪᄋ", "ᆨᄊ"),
 40 |               ("ᆬᄋ", "ᆫᄌ"),
 41 |               ("ᆰᄋ", "ᆯᄀ"),
 42 |               ("ᆱᄋ", "ᆯᄆ"),
 43 |               ("ᆲᄋ", "ᆯᄇ"),
 44 |               ("ᆳᄋ", "ᆯᄊ"),
 45 |               ("ᆴᄋ", "ᆯᄐ"),
 46 |               ("ᆵᄋ", "ᆯᄑ"),
 47 |               ("ᆹᄋ", "ᆸᄊ") ]
 48 |     for str1, str2 in pairs:
 49 |         out = out.replace(str1, str2)
 50 | 
 51 |     gloss(verbose, out, inp, rule)
 52 |     return out
 53 | 
 54 | 
 55 | def link3(inp, descriptive=False, verbose=False):
 56 |     rule = rule_id2text["15"]
 57 |     out = inp
 58 | 
 59 |     pairs = [ ("ᆨ ᄋ", " ᄀ"),
 60 |               ("ᆩ ᄋ", " ᄁ"),
 61 |               ("ᆫ ᄋ", " ᄂ"),
 62 |               ("ᆮ ᄋ", " ᄃ"),
 63 |               ("ᆯ ᄋ", " ᄅ"),
 64 |               ("ᆷ ᄋ", " ᄆ"),
 65 |               ("ᆸ ᄋ", " ᄇ"),
 66 |               ("ᆺ ᄋ", " ᄉ"),
 67 |               ("ᆻ ᄋ", " ᄊ"),
 68 |               ("ᆽ ᄋ", " ᄌ"),
 69 |               ("ᆾ ᄋ", " ᄎ"),
 70 |               ("ᆿ ᄋ", " ᄏ"),
 71 |               ("ᇀ ᄋ", " ᄐ"),
 72 |               ("ᇁ ᄋ", " ᄑ"),
 73 | 
 74 |               ("ᆪ ᄋ", "ᆨ ᄊ"),
 75 |               ("ᆬ ᄋ", "ᆫ ᄌ"),
 76 |               ("ᆰ ᄋ", "ᆯ ᄀ"),
 77 |               ("ᆱ ᄋ", "ᆯ ᄆ"),
 78 |               ("ᆲ ᄋ", "ᆯ ᄇ"),
 79 |               ("ᆳ ᄋ", "ᆯ ᄊ"),
 80 |               ("ᆴ ᄋ", "ᆯ ᄐ"),
 81 |               ("ᆵ ᄋ", "ᆯ ᄑ"),
 82 |               ("ᆹ ᄋ", "ᆸ ᄊ") ]
 83 | 
 84 |     for str1, str2 in pairs:
 85 |         out = out.replace(str1, str2)
 86 | 
 87 |     gloss(verbose, out, inp, rule)
 88 |     return out
 89 | 
 90 | 
 91 | def link4(inp, descriptive=False, verbose=False):
 92 |     rule = rule_id2text["12.4"]
 93 | 
 94 |     out = inp
 95 | 
 96 |     pairs = [ ("ᇂᄋ", "ᄋ"),
 97 |               ("ᆭᄋ", "ᄂ"),
 98 |               ("ᆶᄋ", "ᄅ") ]
 99 | 
100 |     for str1, str2 in pairs:
101 |         out = out.replace(str1, str2)
102 | 
103 |     gloss(verbose, out, inp, rule)
104 |     return out
105 | 
106 | 


--------------------------------------------------------------------------------
/g2pk/rules.txt:
--------------------------------------------------------------------------------
  1 | 5.1
  2 | 5항. 다만 1. 용언의 활용형에 나타나는 '져, 쪄, 쳐'는 [저, 쩌, 처]로 발음한다.
  3 | -> 가져[가저], 쪄[쩌], 다쳐[다처]
  4 | 
  5 | 5.2
  6 | 5항. 다만 2. '예, 례' 이외의 'ㅖ'는 [ㅔ]로도 발음한다.
  7 | -> 계집[계집/게집], 계시다[계시다/게시다]
  8 | -> 시계[시계/시게](時計), 연계[연계/연게](連繫)
  9 | -> 몌별[몌별/메별](袂別), 개폐[개폐/개페](開閉)
 10 | -> 혜택[혜택/헤택](惠澤), 지혜[지혜/지헤](智慧)
 11 | # 실제로 언중은 예, 녜, 셰, 쎼 이외의 'ㅖ'는 [ㅔ]로 발음한다. by kyubyong
 12 | 
 13 | 5.3
 14 | 5항. 다만 3. 자음을 첫소리로 가지고 있는 음절의 'ㅢ'는 [ㅣ]로 발음한다.
 15 | -> 늴리리[닐리리], 닁큼[닝큼], 무늬[무니], 띄어쓰기[띠어쓰기], 씌어[씨어]
 16 | -> 틔어[티어], 희어[히어], 희떱다[히떱따], 희망[히망], 유희[유히]
 17 | 
 18 | 5.4.1
 19 | 다만 4. 단어의 첫음절 이외의 '의'는 [ㅣ]로 발음함도 허용한다.
 20 | -> 주의[주의/주이], 협의[혀븨/혀비]
 21 | # 실제로 언중은 높은 확률로 단어의 첫음절 이외의 '의'는 [ㅣ]로 발음한다.
 22 | 
 23 | 5.4.2
 24 | 다만 4. 조사 '의'는 [ㅔ]로 발음함도 허용한다.
 25 | -> 우리의[우리의/우리에], 강의의[강의의/강이에]
 26 | # 실제로 언중은 높은 확률로 조사 '의'는 [ㅔ]로 발음한다.
 27 | 
 28 | 9
 29 | 제9항　받침 'ㄲ, ㅋ', 'ㅅ, ㅆ, ㅈ, ㅊ, ㅌ', 'ㅍ'은 어말 또는 자음 앞에서 각각 대표음 [ㄱ, ㄷ, ㅂ]으로 발음한다.
 30 | -> 닦다[닥따], 키읔[키윽], 키읔과[키윽꽈], 옷[옫]
 31 | -> 웃다[욷따], 있다[읻따], 젖[젇], 빚다[빋따]
 32 | -> 꽃[꼳], 쫓다[쫃따], 솥[솓], 뱉다[밷따]
 33 | -> 앞[압], 덮다[덥따]
 34 | 
 35 | 10
 36 | 제10항　겹받침 'ㄳ', 'ㄵ', 'ㄼ, ㄽ, ㄾ', 'ㅄ'은 어말 또는 자음 앞에서 각각 [ㄱ, ㄴ, ㄹ, ㅂ]으로 발음한다.
 37 | -> 넋[넉], 넋과[넉꽈], 앉다[안따], 여덟[여덜]
 38 | -> 넓다[널따], 외곬[외골], 핥다[할따], 값[갑]
 39 | -> 없다[업:따]
 40 | 
 41 | 10.1
 42 | 다만, '밟-'은 자음 앞에서 [밥]으로 발음하고, '넓-'은 다음과 같은 경우에 [넙]으로 발음한다.
 43 | -> 1) 밟다[밥따], 밟소[밥쏘], 밟지[밥찌], 밟는[밤:는], 밟게[밥께], 밟고[밥꼬]
 44 | -> 2) 넓죽하다[넙쭈카다], 넓둥글다[넙뚱글다]
 45 | 
 46 | 11
 47 | 제11항　겹받침 'ㄺ, ㄻ, ㄿ'은 어말 또는 자음 앞에서 각각 [ㄱ, ㅁ, ㅂ]으로 발음한다.
 48 | -> 닭[닥], 흙과[흑꽈], 맑다[막따], 늙지[늑찌]
 49 | -> 삶[삼], 젊다[점따], 읊고[읍꼬], 읊다[읍따]
 50 | 
 51 | 11.1
 52 | 다만, 용언의 어간 말음 'ㄺ'은 'ㄱ' 앞에서 [ㄹ]로 발음한다.
 53 | -> 맑게[말께], 묽고[물꼬], 읽거나[일꺼나]
 54 | 
 55 | 12
 56 | 제12항　받침 'ㅎ'의 발음은 다음과 같다.
 57 | 1. 'ㅎ(ㄶ, ㅀ)' 뒤에 'ㄱ, ㄷ, ㅈ'이 결합되는 경우에는, 뒤 음절 첫소리와 합쳐서 [ㅋ, ㅌ, ㅊ]으로 발음한다.
 58 | -> 놓고[노코], 좋던[조턴], 쌓지[싸치], 많고[만코]
 59 | -> 않던[안턴], 닳지[달치]
 60 | [붙임 1] 받침 'ㄱ(ㄺ), ㄷ, ㅂ(ㄼ), ㅈ(ㄵ)'이 뒤 음절 첫소리 'ㅎ'과 결합되는 경우에도, 역시 두 소리를 합쳐서 [ㅋ, ㅌ, ㅍ, ㅊ]으로 발음한다.
 61 | -> 각하[가카], 먹히다[머키다], 밟히다[발피다], 맏형[마텽]
 62 | -> 좁히다[조피다], 넓히다[널피다], 꽂히다[꼬치다], 앉히다[안치다]
 63 | [붙임 2] 규정에 따라 'ㄷ'으로 발음되는 'ㅅ, ㅈ, ㅊ, ㅌ'의 경우에는 이에 준한다.
 64 | -> 옷 한 벌[오 탄 벌], 낮 한때[나 탄때], 꽃 한 송이[꼬 탄 송이]
 65 | -> 숱하다[수타다]
 66 | 2. 'ㅎ(ㄶ, ㅀ)' 뒤에 'ㅅ'이 결합되는 경우에는, 'ㅅ'을 [ㅆ]으로 발음한다.
 67 | -> 닿소[다쏘], 많소[만쏘], 싫소[실쏘]
 68 | 3. 'ㅎ' 뒤에 'ㄴ'이 결합되는 경우에는, [ㄴ]으로 발음한다.
 69 | -> 놓는[논는], 쌓네[싼네]
 70 | [붙임] 'ㄶ, ㅀ' 뒤에 'ㄴ'이 결합되는 경우에는, 'ㅎ'을 발음하지 않는다.
 71 | -> 않네[안네], 않는[안는], 뚫네[뚤레], 뚫는[뚤른]
 72 | ＊ '뚫네[뚤네→뚤레], 뚫는[뚤는→뚤른]'에 대해서는 제20항 참조.
 73 | 
 74 | 12.4
 75 | 4. 'ㅎ(ㄶ, ㅀ)' 뒤에 모음으로 시작된 어미나 접미사가 결합되는 경우에는, 'ㅎ'을 발음하지 않는다.
 76 | -> 낳은[나은], 놓아[노아], 쌓이다[싸이다], 많아[마나]
 77 | -> 않은[아는], 닳아[다라], 싫어도[시러도]
 78 | 
 79 | 13
 80 | 제13항　홑받침이나 쌍받침이 모음으로 시작된 조사나 어미, 접미사와 결합되는 경우에는, 제 음가대로 뒤 음절 첫소리로 옮겨 발음한다.
 81 | -> 깎아[까까], 옷이[오시], 있어[이써], 낮이[나지]
 82 | -> 꽂아[꼬자], 꽃을[꼬츨], 쫓아[쪼차], 밭에[바테]
 83 | -> 앞으로[아프로], 덮이다[더피다]
 84 | 
 85 | 14
 86 | 제14항　겹받침이 모음으로 시작된 조사나 어미, 접미사와 결합되는 경우에는, 뒤엣것만을 뒤 음절 첫소리로 옮겨 발음한다. (이 경우, 'ㅅ'은 된소리로 발음함.)
 87 | -> 넋이[넉씨], 앉아[안자], 닭을[달글], 젊어[절머]
 88 | -> 곬이[골씨], 핥아[할타], 읊어[을퍼], 값을[갑쓸]
 89 | -> 없어[업써]
 90 | 
 91 | 15
 92 | 제15항　받침 뒤에 모음 'ㅏ, ㅓ, ㅗ, ㅜ, ㅟ'들로 시작되는 실질 형태소가 연결되는 경우에는, 대표음으로 바꾸어서 뒤 음절 첫소리로 옮겨 발음한다.
 93 | -> 밭 아래[바 다래] 		늪 앞[느 밥] 		젖어미[저더미] 		맛없다[마덥따]
 94 | -> 겉옷[거돋] 		헛웃음[허두슴] 		꽃 위[꼬 뒤]
 95 | 다만, '맛있다, 멋있다'는 [마싣따], [머싣따]로도 발음할 수 있다.
 96 | [붙임] 겹받침의 경우에는 그 중 하나만을 옮겨 발음한다.
 97 | -> 넋 없다[너 겁따] 		닭 앞에[다 가페] 		값어치[가 버치] 		값있는[가빈는]
 98 | 
 99 | 16
100 | 제16항　한글 자모의 이름은 그 받침소리를 연음하되, 'ㄷ, ㅈ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ'의 경우에는 특별히 다음과 같이 발음한다.
101 | -> 디귿이[디그시], 디귿을[디그슬], 디귿에[디그세]
102 | -> 지읒이[지으시], 지읒을[지으슬], 지읒에[지으세]
103 | -> 치읓이[치으시], 치읓을[치으슬], 치읓에[치으세]
104 | -> 키읔이[키으기], 키읔을[키으글], 키읔에[키으게]
105 | -> 티읕이[티으시], 티읕을[티으슬], 티읕에[티으세]
106 | -> 피읖이[피으비], 피읖을[피으블], 피읖에[피으베]
107 | -> 히읗이[히으시], 히읗을[히으슬], 히읗에[히으세]
108 | 
109 | 17
110 | 제17항　받침 'ㄷ, ㅌ(ㄾ)'이 조사나 접미사의 모음 'ㅣ'와 결합되는 경우에는, [ㅈ, ㅊ]으로 바꾸어서 뒤 음절 첫소리로 옮겨 발음한다.
111 | -> 곧이듣다[고지듣따], 굳이[구지], 미닫이[미다지]
112 | -> 땀받이[땀바지], 밭이[바치], 벼훑이[벼훌치]
113 | [붙임] 'ㄷ' 뒤에 접미사 '히'가 결합되어 '티'를 이루는 것은 [치]로 발음한다.
114 | -> 굳히다[구치다], 닫히다[다치다], 묻히다[무치다]
115 | 
116 | 18
117 | 제18항　받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ, ㅎ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)'은 'ㄴ, ㅁ' 앞에서 [ㅇ, ㄴ, ㅁ]으로 발음한다.
118 | -> 먹는[멍는], 국물[궁물], 깎는[깡는], 키읔만[키응만]
119 | -> 몫몫이[몽목씨], 긁는[긍는], 흙만[흥만], 닫는[단는]
120 | -> 짓는[진:는], 옷맵시[온맵시], 있는[인는], 맞는[만는]
121 | -> 젖멍울[전멍울], 쫓는[쫀는], 꽃망울[꼰망울], 붙는[분는]
122 | -> 놓는[논는], 잡는[잠는], 밥물[밤물], 앞마당[암마당]
123 | -> 밟는[밤:는], 읊는[음는], 없는[엄:는], 값매다[감매다]
124 | [붙임] 두 단어를 이어서 한 마디로 발음하는 경우에도 이와 같다.
125 | -> 책 넣는다[챙 넌는다], 흙 말리다[흥 말리다], 옷 맞추다[온 마추다]
126 | -> 밥 먹는다[밤 멍는다], 값 매기다[감 매기다]
127 | 
128 | 19
129 | 제19항　받침 'ㅁ, ㅇ' 뒤에 연결되는 'ㄹ'은 [ㄴ]으로 발음한다.
130 | -> 담력[담:녁], 침략[침냑], 강릉[강능], 항로[항:노], 대통령[대:통녕]
131 | [붙임] 받침 'ㄱ, ㅂ' 뒤에 연결되는 'ㄹ'도 [ㄴ]으로 발음한다.
132 | -> 막론[막논→망논], 백리[백니→뱅니], 협력[협녁→혐녁], 십리[십니→심니]
133 | 
134 | 20
135 | 제20항　'ㄴ'은 'ㄹ'의 앞이나 뒤에서 [ㄹ]로 발음한다.
136 | -> 1) 난로[날:로], 신라[실라], 천리[철리], 광한루[광:할루], 대관령[대:괄령]
137 | -> 2) 칼날[칼랄], 물난리[물랄리], 줄넘기[줄럼끼], 할는지[할른지]
138 | [붙임] 첫소리 'ㄴ'이 'ㅀ, ㄾ' 뒤에 연결되는 경우에도 이에 준한다.
139 | -> 닳는[달른], 뚫는[뚤른], 핥네[할레]
140 | 다만, 다음과 같은 단어들은 'ㄹ'을 [ㄴ]으로 발음한다.
141 | -> 의견란[의견난], 임진란[임진난], 생산량[생산냥]
142 | -> 결단력[결딴녁], 공권력[공꿘녁], 동원령[동:원녕]
143 | -> 상견례[상견녜], 횡단로[횡단노], 이원론[이원논]
144 | -> 입원료[이붠뇨], 구근류[구근뉴]
145 | 
146 | 21
147 | 제21항　위에서 지적한 이외의 자음 동화는 인정하지 않는다.
148 | -> 감기[감기], 옷감[옫깜], 있고[읻꼬]
149 | -> 꽃길[꼳낄],	젖먹이[전머기], 문법[문뻡]
150 | -> 꽃밭[꼳빧]
151 | 
152 | 22
153 | 제22항　다음과 같은 용언의 어미는 [어]로 발음함을 원칙으로 하되, [여]로 발음함도 허용한다.
154 | -> 피어[피어/피여] 		되어[되어/되여]
155 | [붙임] '이오, 아니오'도 이에 준하여 [이요, 아니요]로 발음함을 허용한다.
156 | 
157 | 23
158 | 제23항　받침 'ㄱ(ㄲ, ㅋ, ㄳ, ㄺ), ㄷ(ㅅ, ㅆ, ㅈ, ㅊ, ㅌ), ㅂ(ㅍ, ㄼ, ㄿ, ㅄ)' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다.
159 | -> 국밥[국빱], 깎다[깍따], 넋받이[넉빠지], 삯돈[삭똔]
160 | -> 닭장[닥짱], 칡범[칙뻠], 뻗대다[뻗때다], 옷고름[옫꼬름]
161 | -> 있던[읻떤], 꽂고[꼳꼬], 꽃다발[꼳따발], 낯설다[낟썰다]
162 | -> 밭갈이[받까리], 솥전[솓쩐], 곱돌[곱똘], 덮개[덥깨]
163 | -> 옆집[엽찝], 넓죽하다[넙쭈카다], 읊조리다[읍쪼리다], 값지다[갑찌다]
164 | 
165 | 24
166 | 제24항　어간 받침 'ㄴ(ㄵ), ㅁ(ㄻ)' 뒤에 결합되는 어미의 첫소리 'ㄱ, ㄷ, ㅅ, ㅈ'은 된소리로 발음한다.
167 | -> 신고[신꼬], 껴안다[껴안따], 앉고[안꼬], 얹다[언따]
168 | -> 삼고[삼꼬], 더듬지[더듬찌], 닮고[담꼬], 젊지[점찌]
169 | 다만, 피동, 사동의 접미사 '-기-'는 된소리로 발음하지 않는다.
170 | -> 안기다[안기다], 감기다[감기다], 굶기다[굼기다], 옮기다[옴기다]
171 | 
172 | 25
173 | 제25항　어간 받침 'ㄼ, ㄾ' 뒤에 결합되는 어미의 첫소리 'ㄱ, ㄷ, ㅅ, ㅈ'은 된소리로 발음한다.
174 | -> 넓게[널께], 핥다[할따], 훑소[훌쏘], 떫지[떨찌]
175 | 
176 | 26
177 | 제26항　한자어에서, 'ㄹ' 받침 뒤에 연결되는 'ㄷ, ㅅ, ㅈ'은 된소리로 발음한다.
178 | -> 갈등[갈뜽], 발동[발똥], 절도[절또], 말살[말쌀]
179 | -> 불소[불쏘](弗素), 일시[일씨], 갈증[갈쯩], 물질[물찔]
180 | -> 발전[발쩐], 몰상식[몰쌍식], 불세출[불쎄출]
181 | 다만, 같은 한자가 겹쳐진 단어의 경우에는 된소리로 발음하지 않는다.
182 | -> 허허실실[허허실실](虛虛實實), 절절하다[절절하다](切切-　)
183 | 
184 | 27
185 | 제27항　관형사형 '-[으]ㄹ' 뒤에 연결되는 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'은 된소리로 발음한다.
186 | -> 할 것을[할 꺼슬], 갈 데가[갈 떼가], 할 바를[할 빠를]
187 | -> 할 수는[할 쑤는], 할 적에[할 쩌게], 갈 곳[갈 꼳]
188 | -> 할 도리[할 또리], 만날 사람[만날 싸람]
189 | 다만, 끊어서 말할 적에는 예사소리로 발음한다.
190 | [붙임] '-(으)ㄹ'로 시작되는 어미의 경우에도 이에 준한다.
191 | -> 할걸[할껄], 할밖에[할빠께], 할세라[할쎄라]
192 | -> 할수록[할쑤록], 할지라도[할찌라도], 할지언정[할찌언정]
193 | -> 할진대[할찐대]
194 | 
195 | 28
196 | 제28항　표기상으로는 사이시옷이 없더라도, 관형격 기능을 지니는 사이시옷이 있어야 할(휴지가 성립되는) 합성어의 경우에는, 뒤 단어의 첫소리 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'을 된소리로 발음한다.
197 | -> 문고리[문꼬리], 눈동자[눈똥자], 신바람[신빠람], 산새[산쌔]
198 | -> 손재주[손째주], 길가[길까], 물동이[물똥이], 발바닥[발빠닥]
199 | -> 굴속[굴쏙], 술잔[술짠], 바람결[바람껼], 그믐달[그믐딸]
200 | -> 아침밥[아침빱], 잠자리[잠짜리], 강가[강까], 초승달[초승딸]
201 | -> 등불[등뿔], 창살[창쌀], 강줄기[강쭐기]
202 | 
203 | 29
204 | 제29항　합성어 및 파생어에서, 앞 단어나 접두사의 끝이 자음이고 뒤 단어나 접미사의 첫 음절이 '이, 야, 여, 요, 유'인 경우에는, 'ㄴ'소리를 첨가하여 [니, 냐, 녀, 뇨, 뉴]로 발음한다.
205 | -> 솜이불[솜니불], 홑이불[혼니불], 막일[망닐]
206 | -> 삯일[상닐], 맨입[맨닙], 꽃잎[꼰닙]
207 | -> 내복약[내봉냑], 색연필[생년필], 직행열차[지캥녈차]
208 | -> 늑막염[능망념], 콩엿[콩녇], 담요[담뇨]
209 | -> 눈요기[눈뇨기], 영업용[영엄뇽], 식용유[시굥뉴]
210 | -> 국민윤리[궁민뉼리], 밤윷[밤뉻]
211 | 다만, 다음과 같은 말들은 'ㄴ'소리를 첨가하여 발음하되, 표기대로 발음할 수 있다.
212 | -> 이죽이죽[이중니죽/이주기죽], 야금야금[야금냐금/야그먀금]
213 | -> 검열[검녈/거멸], 욜랑욜랑[욜랑뇰랑/욜랑욜랑]
214 | -> 금융[금늉/그뮹]
215 | [붙임 1] 'ㄹ' 받침 뒤에 첨가되는 'ㄴ'소리는 [ㄹ]로 발음한다.
216 | -> 들일[들릴], 솔잎[솔립], 설익다[설릭따]
217 | -> 물약[물략], 불여우[불려우], 서울역[서울력]
218 | -> 물엿[물렫], 휘발유[휘발류], 유들유들[유들류들]
219 | [붙임 2] 두 단어를 이어서 한 마디로 발음하는 경우에도 이에 준한다.
220 | -> 한 일[한 닐], 옷 입다[온 닙따], 서른여섯[서른녀섣]
221 | -> 3연대[삼년대], 먹은 엿[머근 녇]
222 | -> 할 일[할릴], 잘 입다[잘 립따], 스물여섯[스물려섣]
223 | -> 1연대[일련대], 먹을 엿[머글 렫]
224 | 다만, 다음과 같은 단어에서는 'ㄴ(ㄹ)'소리를 첨가하여 발음하지 않는다.
225 | -> 6·25[유기오], 3·1절[사밀쩔], 송별연[송벼련], 등용문[등용문]
226 | 
227 | 30
228 | 제30항　사이시옷이 붙는 단어는 다음과 같이 발음한다.
229 | 1. 'ㄱ, ㄷ, ㅂ, ㅅ, ㅈ'으로 시작되는 단어 앞에 사이시옷이 올 때에는 이들 자음만을 된소리로 발음하는 것을 원칙으로 하되, 사이시옷을 [ㄷ]으로 발음하는 것도 허용한다.
230 | -> 냇가[내까/낻까], 샛길[새낄/샏낄], 빨랫돌[빨래똘/빨랟똘]
231 | -> 콧등[코뜽/콛뜽], 깃발[기빨/긷빨], 대팻밥[대패빱/대팯빱]
232 | -> 햇살[해쌀/핻쌀], 뱃속[배쏙/밷쏙], 뱃전[배쩐/밷쩐]
233 | -> 고갯짓[고개찓/고갣찓]
234 | 2. 사이시옷 뒤에 'ㄴ, ㅁ'이 결합되는 경우에는 [ㄴ]으로 발음한다.
235 | -> 콧날[콘날], 아랫니[아랜니]
236 | -> 툇마루[퇸마루], 뱃머리[밴머리]
237 | 3. 사이시옷 뒤에 '이'소리가 결합되는 경우에는 [ㄴㄴ]으로 발음한다.
238 | -> 베갯잇[베갣닏→베갠닏], 깻잎[깬닙]
239 | -> 나뭇잎[나문닙], 도리깻열[도리깬녈]
240 | -> 뒷윷[뒨뉻]
241 | 


--------------------------------------------------------------------------------
/g2pk/special.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | '''
  3 | Special rule for processing Hangul
  4 | https://github.com/kyubyong/g2pK
  5 | '''
  6 | 
  7 | import re
  8 | 
  9 | from g2pk.utils import gloss, get_rule_id2text
 10 | 
# Lookup indexed by rule id (e.g. "5.1"); each rule function below fetches
# its entry from it and hands it to gloss().
rule_id2text = get_rule_id2text()
 12 | 
 13 | 
 14 | ############################ vowels ############################
 15 | def jyeo(inp, descriptive=False, verbose=False):
 16 |     rule = rule_id2text["5.1"]
 17 |     # 일반적인 규칙으로 취급한다 by kyubyong
 18 | 
 19 |     out = re.sub("([ᄌᄍᄎ])ᅧ", r"\1ᅥ", inp)
 20 |     gloss(verbose, out, inp, rule)
 21 |     return out
 22 | 
 23 | 
 24 | def ye(inp, descriptive=False, verbose=False):
 25 |     rule = rule_id2text["5.2"]
 26 |     # 실제로 언중은 예, 녜, 셰, 쎼 이외의 'ㅖ'는 [ㅔ]로 발음한다. by kyubyong
 27 | 
 28 |     if descriptive:
 29 |         out = re.sub("([ᄀᄁᄃᄄㄹᄆᄇᄈᄌᄍᄎᄏᄐᄑᄒ])ᅨ", r"\1ᅦ", inp)
 30 |     else:
 31 |         out = inp
 32 |     gloss(verbose, out, inp, rule)
 33 |     return out
 34 | 
 35 | 
 36 | def consonant_ui(inp, descriptive=False, verbose=False):
 37 |     rule = rule_id2text["5.3"]
 38 | 
 39 |     out = re.sub("([ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄌᄍᄎᄏᄐᄑᄒ])ᅴ", r"\1ᅵ", inp)
 40 |     gloss(verbose, out, inp, rule)
 41 |     return out
 42 | 
 43 | 
 44 | def josa_ui(inp, descriptive=False, verbose=False):
 45 |     rule = rule_id2text["5.4.2"]
 46 |     # 실제로 언중은 높은 확률로 조사 '의'는 [ㅔ]로 발음한다.
 47 |     if descriptive:
 48 |         out = re.sub("의/J", "에", inp)
 49 |     else:
 50 |         out = inp.replace("/J", "")
 51 |     gloss(verbose, out, inp, rule)
 52 |     return out
 53 | 
 54 | 
 55 | def vowel_ui(inp, descriptive=False, verbose=False):
 56 |     rule = rule_id2text["5.4.1"]
 57 |     # 실제로 언중은 높은 확률로 단어의 첫음절 이외의 '의'는 [ㅣ]로 발음한다."""
 58 |     if descriptive:
 59 |         out = re.sub("(\Sᄋ)ᅴ", r"\1ᅵ", inp)
 60 |     else:
 61 |         out = inp
 62 |     gloss(verbose, out, inp, rule)
 63 |     return out
 64 | 
 65 | 
 66 | def jamo(inp, descriptive=False, verbose=False):
 67 |     rule = rule_id2text["16"]
 68 |     out = inp
 69 | 
 70 |     out = re.sub("([그])ᆮᄋ", r"\1ᄉ", out)
 71 |     out = re.sub("([으])[ᆽᆾᇀᇂ]ᄋ", r"\1ᄉ", out)
 72 |     out = re.sub("([으])[ᆿ]ᄋ", r"\1ᄀ", out)
 73 |     out = re.sub("([으])[ᇁ]ᄋ", r"\1ᄇ", out)
 74 | 
 75 |     gloss(verbose, out, inp, rule)
 76 |     return out
 77 | 
 78 | 
 79 |     ############################ 어간 받침 ############################
 80 | def rieulgiyeok(inp, descriptive=False, verbose=False):
 81 |     rule = rule_id2text["11.1"]
 82 | 
 83 |     out = inp
 84 |     out = re.sub("ᆰ/P([ᄀᄁ])", r"ᆯᄁ", out)
 85 | 
 86 |     gloss(verbose, out, inp, rule)
 87 |     return out
 88 | 
 89 | 
 90 | def rieulbieub(inp, descriptive=False, verbose=False):
 91 |     rule = rule_id2text["25"]
 92 |     out = inp
 93 | 
 94 |     out = re.sub("([ᆲᆴ])/Pᄀ", r"\1ᄁ", out)
 95 |     out = re.sub("([ᆲᆴ])/Pᄃ", r"\1ᄄ", out)
 96 |     out = re.sub("([ᆲᆴ])/Pᄉ", r"\1ᄊ", out)
 97 |     out = re.sub("([ᆲᆴ])/Pᄌ", r"\1ᄍ", out)
 98 | 
 99 |     gloss(verbose, out, inp, rule)
100 |     return out
101 | 
102 | 
def verb_nieun(inp, descriptive=False, verbose=False):
    '''Tense the onset that follows a "/P"-tagged final ᆫ, ᆷ, ᆬ or ᆱ.

    ᄀ, ᄃ, ᄉ and ᄌ become ᄁ, ᄄ, ᄊ and ᄍ. ᆫ and ᆷ are kept as they are,
    ᆬ is reduced to ᆫ and ᆱ to ᆷ; the "/P" tag is consumed. The change is
    glossed under rule id "24".

    inp: decomposed (jamo) string that may contain "/P" tags.
    descriptive: not used.
    verbose: boolean. Forwarded to `gloss`.
    '''
    rule = rule_id2text["24"]

    # (pattern for the final consonant, what is written back for it)
    finals = (("([ᆫᆷ])", "\\1"), ("ᆬ", "ᆫ"), ("ᆱ", "ᆷ"))
    tensed_onsets = (("ᄀ", "ᄁ"), ("ᄃ", "ᄄ"), ("ᄉ", "ᄊ"), ("ᄌ", "ᄍ"))

    out = inp
    for final, spoken_final in finals:
        for plain, tense in tensed_onsets:
            out = re.sub(final + "/P" + plain, spoken_final + tense, out)

    gloss(verbose, out, inp, rule)
    return out
127 | 
128 | 
def balb(inp, descriptive=False, verbose=False):
    '''Replace the final ᆲ by ᆸ in the two exceptional contexts of rule id "10.1".

    1. After 바, when ᆲ is at the end of the string or followed by any
       character other than ᄋ and ᄒ.
    2. After 너, when ᆲ is followed by ᄌ, ᄍ, ᄃ or ᄄ plus the vowel ᅮ.

    inp: decomposed (jamo) string.
    descriptive: not used.
    verbose: boolean. Forwarded to `gloss`.
    '''
    rule = rule_id2text["10.1"]

    # exceptions
    exceptions = (
        "(바)ᆲ(($|[^ᄋᄒ]))",
        "(너)ᆲ([ᄌᄍ]ᅮ|[ᄃᄄ]ᅮ)",
    )

    out = inp
    for pattern in exceptions:
        out = re.sub(pattern, r"\1ᆸ\2", out)
    gloss(verbose, out, inp, rule)
    return out
139 | 
140 | 
def palatalize(inp, descriptive=False, verbose=False):
    '''Palatalize ᆮ, ᇀ and ᆴ in front of ᅵ or ᅧ.

    Before ᄋ + ᅵ/ᅧ, ᆮ becomes the onset ᄌ, ᇀ becomes ᄎ and ᆴ becomes
    ᆯ + ᄎ. In addition, ᆮ + ᄒ + ᅵ becomes ᄎ + ᅵ. The change is glossed
    under rule id "17".

    inp: decomposed (jamo) string.
    descriptive: not used.
    verbose: boolean. Forwarded to `gloss`.
    '''
    rule = rule_id2text["17"]

    substitutions = (
        ("ᆮᄋ([ᅵᅧ])", r"ᄌ\1"),
        ("ᇀᄋ([ᅵᅧ])", r"ᄎ\1"),
        ("ᆴᄋ([ᅵᅧ])", r"ᆯᄎ\1"),
        ("ᆮᄒ([ᅵ])", r"ᄎ\1"),
    )

    out = inp
    for pattern, replacement in substitutions:
        out = re.sub(pattern, replacement, out)

    gloss(verbose, out, inp, rule)
    return out
153 | 
154 | 
def modifying_rieul(inp, descriptive=False, verbose=False):
    '''Tense the consonant that follows a modifying ᆯ.

    First, an onset ᄀ, ᄃ, ᄇ, ᄉ or ᄌ that follows "ᆯ/E " (tagged ᆯ plus
    a space) is tensed and the "/E" tag is dropped. Then the listed endings
    directly after ᆯ are replaced by their tensed forms. The change is
    glossed under rule id "27".

    inp: decomposed (jamo) string that may contain "/E" tags.
    descriptive: not used.
    verbose: boolean. Forwarded to `gloss`.
    '''
    rule = rule_id2text["27"]

    tensed_onsets = (("ᄀ", "ᄁ"), ("ᄃ", "ᄄ"), ("ᄇ", "ᄈ"), ("ᄉ", "ᄊ"), ("ᄌ", "ᄍ"))
    endings = (
        ("ᆯ걸", "ᆯ껄"),
        ("ᆯ밖에", "ᆯ빠께"),
        ("ᆯ세라", "ᆯ쎄라"),
        ("ᆯ수록", "ᆯ쑤록"),
        ("ᆯ지라도", "ᆯ찌라도"),
        ("ᆯ지언정", "ᆯ찌언정"),
        ("ᆯ진대", "ᆯ찐대"),
    )

    out = inp
    for plain, tense in tensed_onsets:
        out = re.sub("ᆯ/E " + plain, "ᆯ " + tense, out)
    for ending, tensed_ending in endings:
        out = re.sub(ending, tensed_ending, out)

    gloss(verbose, out, inp, rule)
    return out
178 | 


--------------------------------------------------------------------------------
/g2pk/table.csv:
--------------------------------------------------------------------------------
 1 | ﻿,( ?)ᄒ,( ?)ᄀ,( ?)ᄁ,( ?)ᄂ,( ?)ᄃ,( ?)ᄄ,( ?)ᄅ,( ?)ᄆ,( ?)ᄇ,( ?)ᄈ,( ?)ᄉ,( ?)ᄊ,( ?)ᄌ,( ?)ᄍ,( ?)ᄎ,( ?)ᄏ,( ?)ᄐ,( ?)ᄑ,(\W|$)
 2 | ᇂ,\1ᄒ,\1ᄏ(12),\1ᄁ,ᆫ\1ᄂ(12),\1ᄐ(12),\1ᄄ,\1ᄅ,\1ᄆ,\1ᄇ,\1ᄈ,\1ᄊ(12),\1ᄊ,\1ᄎ(12),\1ᄍ,\1ᄎ,\1ᄏ,\1ᄐ,\1ᄑ,ᆮ\1
 3 | ᆨ,\1ᄏ(12),ᆨ\1ᄁ(23),,ᆼ\1ᄂ(18),ᆨ\1ᄄ(23),,ᆼ\1ᄂ(19/18),ᆼ\1ᄆ(18),ᆨ\1ᄈ(23),,ᆨ\1ᄊ(23),,ᆨ\1ᄍ(23),,,,,,
 4 | ᆩ,\1ᄏ,ᆨ\1ᄁ(9/23),ᆨ\1ᄁ(9),ᆼ\1ᄂ(18),ᆨ\1ᄄ(9/23),ᆨ\1ᄄ(9),ᆼ\1ᄂ,ᆼ\1ᄆ(18),ᆨ\1ᄈ(9/23),ᆨ\1ᄈ(9),ᆨ\1ᄊ(9/23),ᆨ\1ᄊ(9),ᆨ\1ᄍ(9/23),ᆨ\1ᄍ(9),ᆨ\1ᄎ(9),ᆨ\1ᄏ(9),ᆨ\1ᄐ(9),ᆨ\1ᄑ(9),ᆨ\1(9)
 5 | ᆪ,\1ᄏ,ᆨ\1ᄁ(9/23),ᆨ\1ᄁ(10),ᆼ\1ᄂ(18),ᆨ\1ᄄ(9/23),ᆨ\1ᄄ(10),ᆼ\1ᄂ,ᆼ\1ᄆ(18),ᆨ\1ᄈ(9/23),ᆨ\1ᄈ(10),ᆨ\1ᄊ(9/23),ᆨ\1ᄊ(10),ᆨ\1ᄍ(9/23),ᆨ\1ᄍ(10),ᆨ\1ᄎ(10),ᆨ\1ᄏ(10),ᆨ\1ᄐ(10),ᆨ\1ᄑ(10),ᆨ\1(10)
 6 | ᆫ,,,,,,,ᆯ\1ᄅ(20),,,,,,,,,,,,
 7 | ᆬ,ᆫ\1ᄎ(12),ᆫ\1ᄀ(10),ᆫ\1ᄁ(10),ᆫ\1ᄂ(10),ᆫ\1ᄃ(10),ᆫ\1ᄄ(10),ᆯ\1ᄅ(10/20),ᆫ\1ᄆ(10),ᆫ\1ᄇ(10),ᆫ\1ᄈ(10),ᆫ\1ᄉ(10),ᆫ\1ᄊ(10),ᆫ\1ᄌ(10),ᆫ\1ᄍ(10),ᆫ\1ᄎ(10),ᆫ\1ᄏ(10),ᆫ\1ᄐ(10),ᆫ\1ᄑ(10),ᆫ\1(10)
 8 | ᆭ,ᆫ\1ᄒ,ᆫ\1ᄏ(12),ᆫ\1ᄁ,ᆫ\1ᄂ(12),ᆫ\1ᄐ(12),ᆫ\1ᄄ,ᆯ\1ᄅ,ᆫ\1ᄆ,ᆫ\1ᄇ,ᆫ\1ᄈ,ᆫ\1ᄊ(12),ᆫ\1ᄊ,ᆫ\1ᄎ(12),ᆫ\1ᄍ,ᆫ\1ᄎ,ᆫ\1ᄏ,ᆫ\1ᄐ,ᆫ\1ᄑ,ᆫ\1
 9 | ᆮ,\1ᄐ(12),ᆮ\1ᄁ(23),,ᆫ\1ᄂ(18),ᆮ\1ᄄ(23),,ᆫ\1ᄂ,ᆫ\1ᄆ(18),ᆮ\1ᄈ(23),,ᆮ\1ᄊ(23),,ᆮ\1ᄍ(23),,,,,,
10 | ᆯ,,,,ᆯ\1ᄅ(20),,,,,,,,,,,,,,,
11 | ᆰ,ᆯ\1ᄏ(12),ᆨ\1ᄁ(11/23),ᆨ\1ᄁ(11),ᆼ\1ᄂ(11/18),ᆨ\1ᄄ(11/23),ᆨ\1ᄄ(11),ᆼ\1ᄂ(11/18),ᆼ\1ᄆ(11/18),ᆨ\1ᄈ(11/23),ᆨ\1ᄈ(11),ᆨ\1ᄊ(11/23),ᆨ\1ᄊ(11),ᆨ\1ᄍ(11/23),ᆨ\1ᄍ(11),ᆨ\1ᄎ(11),1),ᆨ\1ᄑ(11),,ᆨ\1(11)
12 | ᆱ,ᆷ\1ᄒ(11),ᆷ\1ᄀ(11),ᆷ\1ᄁ(11),ᆷ\1ᄂ(11),ᆷ\1ᄃ(11),ᆷ\1ᄄ(11),ᆷ\1ᄅ(11),ᆷ\1ᄆ(11),ᆷ\1ᄇ(11),ᆷ\1ᄈ(11),ᆷ\1ᄉ(11),ᆷ\1ᄊ(11),ᆷ\1ᄌ(11),ᆷ\1ᄍ(11),ᆷ\1ᄎ(11),ᆷ\1ᄏ(11),ᆷ\1ᄐ(11),ᆷ\1ᄑ(11),ᆷ\1(11)
13 | ᆲ,ᆯ\1ᄑ(12),ᆯ\1ᄁ(10/23),ᆯ\1ᄁ(10),ᆷ\1ᄂ(18),ᆯ\1ᄄ(10/23),ᆯ\1ᄄ(10),ᆯ\1ᄅ(10),ᆷ\1ᄆ(18),ᆯ\1ᄈ(10/23),ᆯ\1ᄈ(10),ᆯ\1ᄊ(10/23),ᆯ\1ᄊ(10),ᆯ\1ᄍ(10/23),ᆯ\1ᄍ(10),ᆯ\1ᄎ(10),ᆯ\1ᄏ(10)0),,,ᆯ\1(10)
14 | ᆳ,ᆯ\1ᄒ(10),ᆯ\1ᄁ(10/23),ᆯ\1ᄁ(10),ᆯ\1ᄅ(10/20),ᆯ\1ᄄ(10/23),ᆯ\1ᄄ(10),ᆯ\1ᄅ(10),ᆯ\1ᄆ(10),ᆯ\1ᄈ(10/23),ᆯ\1ᄈ(10),ᆯ\1ᄊ(10/23),ᆯ\1ᄊ(10),ᆯ\1ᄍ(10/23),ᆯ\1ᄍ(10),ᆯ\1ᄎ(10),ᆯ\1ᄏ(ᄑ(10),,,ᆯ\1(10)
15 | ᆴ,ᆯ\1ᄒ(10),ᆯ\1ᄀ(10),ᆯ\1ᄁ(10),ᆯ\1ᄅ(10/20),ᆯ\1ᄃ(10),ᆯ\1ᄄ(10),ᆯ\1ᄅ(10),ᆯ\1ᄆ(10),ᆯ\1ᄇ(10),ᆯ\1ᄈ(10),ᆯ\1ᄉ(10),ᆯ\1ᄊ(10),ᆯ\1ᄌ(10),ᆯ\1ᄍ(10),ᆯ\1ᄎ(10),ᆯ\1ᄏ(10),ᆯ\1ᄐ(10),ᆯ\1ᄑ(10),ᆯ\1(10)
16 | ᆵ,ᆸ\1ᄑ(11/12),ᆸ\1ᄁ(11/23),ᆸ\1ᄁ(11),ᆷ\1ᄂ(18),ᆸ\1ᄄ(11/23),ᆸ\1ᄄ(11),ᆷ\1ᄅ(11),ᆷ\1ᄆ(11/18),ᆸ\1ᄈ(11/23),ᆸ\1ᄈ(11),ᆸ\1ᄊ(11/23),ᆸ\1ᄊ(11),ᆸ\1ᄍ(11/23),ᆸ\1ᄍ(11),ᆸ\1ᄎ(11),ᆸ\1ᆸ\1ᄑ,,,ᆸ\1(11)
17 | ᆶ,ᆯ\1ᄒ(10),ᆯ\1ᄏ(12),ᆯ\1ᄁ,ᆯ\1ᄅ(12/20),ᆯ\1ᄐ(12),ᆯ\1ᄄ,ᆯ\1ᄅ,ᆯ\1ᄆ,ᆯ\1ᄇ,ᆯ\1ᄈ,ᆯ\1ᄊ(12),ᆯ\1ᄊ,ᆯ\1ᄎ(12),ᆯ\1ᄍ,ᆯ\1ᄎ,ᆯ\1ᄏ,ᆯ\1ᄐ,ᆯ\1ᄑ,ᆯ
18 | ᆷ,,,,,,,ᆷ\1ᄂ(19),,,,,,,,,,,,
19 | ᆸ,\1ᄑ(12),ᆸ\1ᄁ(23),,ᆷ\1ᄂ(18),ᆸ\1ᄄ(23),,ᆷ\1ᄂ(19/18),ᆷ\1ᄆ(18),ᆸ\1ᄈ(23),,ᆸ\1ᄊ(23),,ᆸ\1ᄍ(23),,,,,,
20 | ᆹ,ᆸ\1ᄑ(10),ᆸ\1ᄁ(10/23),ᆸ\1ᄁ(10),ᆷ\1ᄂ(10/18),ᆸ\1ᄄ(10/23),ᆸ\1ᄄ(10),ᆷ\1ᄂ(10/19/18),ᆷ\1ᄆ(10/18),ᆸ\1ᄈ(10/23),ᆸ\1ᄈ(10),ᆸ\1ᄊ(10/23),ᆸ\1ᄊ(10),ᆸ\1ᄍ(10/23),ᆸ\1ᄍ(10),ᆸ\1ᄎ(1ᄐ(10),ᆸ\1ᄑ,,,ᆸ\1(10)
21 | ᆺ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(9/18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(9/18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9)
22 | ᆻ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(9/18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(9/18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ1ᄑ(9),ᆮ\1(9)
23 | ᆼ,,,,,,,ᆼ\1ᄂ(19),,,,,,,,,,,,
24 | ᆽ,\1ᄎ(12),ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9)
25 | ᆾ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9)
26 | ᆿ,\1ᄏ,ᆨ\1ᄁ(9/23),ᆨ\1ᄁ(9),ᆼ\1ᄂ(18),ᆨ\1ᄄ(9/23),ᆨ\1ᄄ(9),ᆼ\1ᄂ(9/18),ᆼ\1ᄆ(9/18),ᆨ\1ᄈ(9/23),ᆨ\1ᄈ(9),ᆨ\1ᄊ(9/23),ᆨ\1ᄊ(9),ᆨ\1ᄍ(9/23),ᆨ\1ᄍ(9),ᆨ\1ᄎ(9),ᆨ\1ᄏ(9),ᆨ\1ᄐ(9),ᆨ\1ᄑ(9),ᆨ\1(9)
27 | ᇀ,\1ᄐ,ᆮ\1ᄁ(9/23),ᆮ\1ᄁ(9),ᆫ\1ᄂ(18),ᆮ\1ᄄ(9/23),ᆮ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆫ\1ᄆ(18),ᆮ\1ᄈ(9/23),ᆮ\1ᄈ(9),ᆮ\1ᄊ(9/23),ᆮ\1ᄊ(9),ᆮ\1ᄍ(9/23),ᆮ\1ᄍ(9),ᆮ\1ᄎ(9),ᆮ\1ᄏ(9),ᆮ\1ᄐ(9),ᆮ\1ᄑ(9),ᆮ\1(9)
28 | ᇁ,\1ᄑ,ᆸ\1ᄁ(9/23),ᆸ\1ᄁ(9),ᆷ\1ᄂ(18),ᆸ\1ᄄ(9/23),ᆸ\1ᄄ(9),ᆫ\1ᄂ(9/18),ᆷ\1ᄆ(18),ᆸ\1ᄈ(9/23),ᆸ\1ᄈ(9),ᆸ\1ᄊ(9/23),ᆸ\1ᄊ(9),ᆸ\1ᄍ(9/23),ᆸ\1ᄍ(9),ᆸ\1ᄎ(9),ᆸ\1ᄏ(9),ᆸ\1ᄐ(9),ᆸ\1ᄑ(9),ᆸ\1(9)


--------------------------------------------------------------------------------
/g2pk/utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from jamo import h2j, j2h
  3 | import os
  4 | 
  5 | ############## English ##############
  6 | def adjust(arpabets):
  7 |     '''Modify arpabets so that it fits our processes'''
  8 |     string = " " + " ".join(arpabets) + " $"
  9 |     string = re.sub("\d", "", string)
 10 |     string = string.replace(" T S ", " TS ")
 11 |     string = string.replace(" D Z ", " DZ ")
 12 |     string = string.replace(" AW ER ", " AWER ")
 13 |     string = string.replace(" IH R $", " IH ER ")
 14 |     string = string.replace(" EH R $", " EH ER ")
 15 |     string = string.replace(" $", "")
 16 | 
 17 |     return string.strip("$ ").split()
 18 | 
 19 | 
 20 | def to_choseong(arpabet):
 21 |     '''Arpabet to choseong or onset'''
 22 |     d = \
 23 |         {'B': 'ᄇ',
 24 |          'CH': 'ᄎ',
 25 |          'D': 'ᄃ',
 26 |          'DH': 'ᄃ',
 27 |          'DZ': 'ᄌ',
 28 |          'F': 'ᄑ',
 29 |          'G': 'ᄀ',
 30 |          'HH': 'ᄒ',
 31 |          'JH': 'ᄌ',
 32 |          'K': 'ᄏ',
 33 |          'L': 'ᄅ',
 34 |          'M': 'ᄆ',
 35 |          'N': 'ᄂ',
 36 |          'NG': 'ᄋ',
 37 |          'P': 'ᄑ',
 38 |          'R': 'ᄅ',
 39 |          'S': 'ᄉ',
 40 |          'SH': 'ᄉ',
 41 |          'T': 'ᄐ',
 42 |          'TH': 'ᄉ',
 43 |          'TS': 'ᄎ',
 44 |          'V': 'ᄇ',
 45 |          'W': 'W',
 46 |          'Y': 'Y',
 47 |          'Z': 'ᄌ',
 48 |          'ZH': 'ᄌ'}
 49 | 
 50 |     return d.get(arpabet, arpabet)
 51 | 
 52 | def to_jungseong(arpabet):
 53 |     '''Arpabet to jungseong or vowel'''
 54 |     d = \
 55 |         {'AA': 'ᅡ',
 56 |          'AE': 'ᅢ',
 57 |          'AH': 'ᅥ',
 58 |          'AO': 'ᅩ',
 59 |          'AW': 'ᅡ우',
 60 |          'AWER': "ᅡ워",
 61 |          'AY': 'ᅡ이',
 62 |          'EH': 'ᅦ',
 63 |          'ER': 'ᅥ',
 64 |          'EY': 'ᅦ이',
 65 |          'IH': 'ᅵ',
 66 |          'IY': 'ᅵ',
 67 |          'OW': 'ᅩ',
 68 |          'OY': 'ᅩ이',
 69 |          'UH': 'ᅮ',
 70 |          'UW': 'ᅮ'}
 71 |     return d.get(arpabet, arpabet)
 72 | 
 73 | def to_jongseong(arpabet):
 74 |     '''Arpabet to jongseong or coda'''
 75 |     d = \
 76 |         {'B': 'ᆸ',
 77 |          'CH': 'ᆾ',
 78 |          'D': 'ᆮ',
 79 |          'DH': 'ᆮ',
 80 |          'F': 'ᇁ',
 81 |          'G': 'ᆨ',
 82 |          'HH': 'ᇂ',
 83 |          'JH': 'ᆽ',
 84 |          'K': 'ᆨ',
 85 |          'L': 'ᆯ',
 86 |          'M': 'ᆷ',
 87 |          'N': 'ᆫ',
 88 |          'NG': 'ᆼ',
 89 |          'P': 'ᆸ',
 90 |          'R': 'ᆯ',
 91 |          'S': 'ᆺ',
 92 |          'SH': 'ᆺ',
 93 |          'T': 'ᆺ',
 94 |          'TH': 'ᆺ',
 95 |          'V': 'ᆸ',
 96 |          'W': 'ᆼ',
 97 |          'Y': 'ᆼ',
 98 |          'Z': 'ᆽ',
 99 |          'ZH': 'ᆽ'}
100 | 
101 |     return d.get(arpabet, arpabet)
102 | 
103 | 
104 | def reconstruct(string):
105 |     '''Some postprocessing rules'''
106 |     pairs = [("그W", "ᄀW"),
107 |              ("흐W", "ᄒW"),
108 |              ("크W", "ᄏW"),
109 |              ("ᄂYᅥ", "니어"),
110 |              ("ᄃYᅥ", "디어"),
111 |              ("ᄅYᅥ", "리어"),
112 |              ("Yᅵ", "ᅵ"),
113 |              ("Yᅡ", "ᅣ"),
114 |              ("Yᅢ", "ᅤ"),
115 |              ("Yᅥ", "ᅧ"),
116 |              ("Yᅦ", "ᅨ"),
117 |              ("Yᅩ", "ᅭ"),
118 |              ("Yᅮ", "ᅲ"),
119 |              ("Wᅡ", "ᅪ"),
120 |              ("Wᅢ", "ᅫ"),
121 |              ("Wᅥ", "ᅯ"),
122 |              ("Wᅩ", "ᅯ"),
123 |              ("Wᅮ", "ᅮ"),
124 |              ("Wᅦ", "ᅰ"),
125 |              ("Wᅵ", "ᅱ"),
126 |              ("ᅳᅵ", "ᅴ"),
127 |              ("Y", "ᅵ"),
128 |              ("W", "ᅮ")
129 |              ]
130 |     for str1, str2 in pairs:
131 |         string = string.replace(str1, str2)
132 |     return string
133 | 
134 | 
135 | ############## Hangul ##############
def parse_table():
    '''Parse the main rule table

    Reads table.csv from the directory of this module.  The first row
    lists the onsets, one per column; every later row starts with a coda
    followed by one cell per onset.  A non-empty cell holds a replacement
    string, optionally followed by "(id/id/...)".

    Returns a list of (str1, str2, rule_ids) tuples, where str1 is
    coda + onset, str2 is the replacement string and rule_ids is the
    (possibly empty) list of id strings from the parentheses.  An empty
    file yields an empty list.
    '''
    path = os.path.dirname(os.path.abspath(__file__)) + '/table.csv'
    # Context manager so that the file handle is always closed.
    with open(path, 'r', encoding='utf8') as f:
        lines = f.read().splitlines()
    if not lines:
        return []

    onsets = lines[0].split(",")
    table = []
    for line in lines[1:]:
        cols = line.split(",")
        coda = cols[0]
        # Column 0 is the coda itself.  zip() stops at the shorter side, so
        # blank lines and rows with fewer cells than the header are
        # tolerated instead of raising IndexError.
        for onset, cell in zip(onsets[1:], cols[1:]):
            if not cell:
                continue
            str1 = f"{coda}{onset}"
            if "(" in cell:
                parts = cell.split("(")
                str2 = parts[0]
                # drop the closing ")" before splitting the ids
                rule_ids = parts[1][:-1].split("/")
            else:
                str2 = cell
                rule_ids = []
            table.append((str1, str2, rule_ids))
    return table
160 | 
161 | 
162 | ############## Preprocessing ##############
def annotate(string, mecab):
    '''Attach pos markers to the given string using Mecab

    string: text to annotate
    mecab: mecab object; its pos(string) must return (token, tag) pairs

    Markers are appended right after the character they belong to:
      "/J"  the character is "의" and its tag is J
      "/E"  tag E and the character's last jamo is ᆯ
      "/P"  tag V and the character's last jamo is one of ᆫᆬᆷᆱᆰᆲᆴ
      "/B"  tag B (NNBC, bound noun)
    If the tokens do not add up to the string with its blanks removed,
    the string is returned as it is.
    '''
    morphs = mecab.pos(string)
    surface = "".join(token for token, _ in morphs)
    if surface != string.replace(" ", ""):
        return string

    # One label per character: "_" for every character of a token except
    # its last one, which carries the first letter of the token's tag.
    labels = []
    for token, tag in morphs:
        last = tag.split("+")[-1]
        label = "B" if last == "NNBC" else last[0]  # NNBC: bound noun
        labels.append("_" * (len(token) - 1) + label)
    labels = "".join(labels)

    # Put the blanks back so that the labels line up with the string.
    for pos, char in enumerate(string):
        if char == " ":
            labels = labels[:pos] + " " + labels[pos:]

    pieces = []
    for char, label in zip(string, labels):
        pieces.append(char)
        if label == "J":
            if char == "의":
                pieces.append("/J")
        elif label == "E":
            if h2j(char)[-1] in "ᆯ":
                pieces.append("/E")
        elif label == "V":
            if h2j(char)[-1] in "ᆫᆬᆷᆱᆰᆲᆴ":
                pieces.append("/P")
        elif label == "B":  # bound noun
            pieces.append("/B")

    return "".join(pieces)
200 | 
201 | 
202 | ############## Postprocessing ##############
def compose(letters):
    '''Assemble the jamo in `letters` into syllables with j2h.

    A vowel that is not preceded by an onset first receives the onset ᄋ.
    Onset+vowel+coda sequences are then assembled, followed by the
    remaining onset+vowel sequences; everything else is left as it is.
    '''
    # insert placeholder
    text = re.sub("(^|[^\u1100-\u1112])([\u1161-\u1175])", r"\1ᄋ\2", letters)

    # The three-jamo pattern has to run first, otherwise the two-jamo
    # pattern would strip the onset+vowel away from its coda.
    patterns = (
        "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]",  # c+v+c
        "[\u1100-\u1112][\u1161-\u1175]",  # c+v
    )
    for pattern in patterns:
        for syl in set(re.findall(pattern, text)):
            text = text.replace(syl, j2h(*syl))

    return text
219 | 
220 | 
def group(inp):
    '''For group_vowels=True

    Merges vowels that contemporary Korean speakers don't distinguish:
    ᅢ becomes ᅦ, ᅤ becomes ᅨ, and both ᅫ and ᅰ become ᅬ.
    '''
    merged = inp
    for vowel, target in (("ᅢ", "ᅦ"), ("ᅤ", "ᅨ"), ("ᅫ", "ᅬ"), ("ᅰ", "ᅬ")):
        merged = merged.replace(vowel, target)
    return merged
231 | 
232 | 
def _get_examples():
    '''For internal use

    Collects the examples written as "text[pronunciation]" in rules.txt,
    which is read from the current working directory.  Only lines that
    start with "->" are scanned.  A pronunciation is split on "/" and
    yields one example per part.

    Returns a list of (text, pronunciation) string tuples.
    '''
    # Context manager so that the file handle is always closed.
    with open('rules.txt', 'r', encoding='utf8') as f:
        text = f.read().splitlines()

    # Raw string: "\[" in a plain literal is an invalid escape sequence.
    pattern = re.compile(r"([ㄱ-힣][ ㄱ-힣]*)\[([ㄱ-힣][ ㄱ-힣]*)]")
    examples = []
    for line in text:
        if line.startswith("->"):
            examples.extend(pattern.findall(line))

    return [(inp, each) for inp, gt in examples for each in gt.split("/")]
246 | 
247 | 
248 | ############## Utilities ##############
def get_rule_id2text():
    '''for verbose=True

    Reads rules.txt from the directory of this module.  Rules are
    separated by a blank line; the first line of a rule is its id and the
    remaining lines are its text.

    Returns a dict that maps each stripped rule id to its text, with the
    text lines joined by newlines.  An empty file yields an empty dict.
    '''
    path = os.path.dirname(os.path.abspath(__file__)) + '/rules.txt'
    # Context manager so that the file handle is always closed.
    with open(path, 'r', encoding='utf8') as f:
        rules = f.read().strip().split("\n\n")

    rule_id2text = dict()
    for rule in rules:
        lines = rule.splitlines()
        if not lines:
            # an empty file gives a single empty chunk with no id line
            continue
        rule_id, texts = lines[0], lines[1:]
        rule_id2text[rule_id.strip()] = "\n".join(texts)
    return rule_id2text
257 | 
258 | 
def gloss(verbose, out, inp, rule):
    '''Show one conversion step together with the rule behind it.

    Nothing is printed unless `verbose` is set and `out` differs from
    `inp` by more than the removal of the /E, /J, /P, /B markers.
    '''
    if not verbose:
        return
    if out == inp or out == re.sub("/[EJPB]", "", inp):
        return
    print(compose(inp), "->", compose(out))
    print("\033[1;31m", rule, "\033[0m")
264 | 
265 | 
266 | 
267 | 
268 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", mode="r", encoding="utf-8") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | REQUIRED_PACKAGES = [
 7 |     'jamo',
 8 |     'nltk',
 9 |     'konlpy',
10 |     'python-mecab-ko',
11 | ]
12 | 
13 | setuptools.setup(
14 |     name="g2pK",
15 |     version="0.9.3",
16 |     author="Kyubyong Park",
17 |     author_email="kbpark.linguist@gmail.com",
18 |     description="g2pK: g2p module for Korean",
19 |     install_requires=REQUIRED_PACKAGES,
20 |     license='Apache License 2.0',
21 |     long_description=long_description,
22 |     long_description_content_type="text/markdown",
23 |     url="https://github.com/Kyubyong/g2pK",
24 |     packages=setuptools.find_packages(),
25 |     package_data={'g2pk': ['g2pk/idioms.txt', 'g2pk/rules.txt', 'g2pk/table.csv']},
26 |     python_requires=">=3.6",
27 |     include_package_data=True,
28 |     classifiers=[
29 |         'Development Status :: 5 - Production/Stable',
30 |         'Intended Audience :: Developers',
31 |         'Intended Audience :: Science/Research',
32 |         "License :: OSI Approved :: Apache Software License",
33 |         "Operating System :: OS Independent",
34 |         "Programming Language :: Python :: 3",
35 |         'Programming Language :: Python :: 3.6',
36 |         'Programming Language :: Python :: 3.7',
37 |     ],
38 | )
39 | 


--------------------------------------------------------------------------------