├── .gitignore
├── .projectile
├── .python_version
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── VERSION
├── bin
    ├── distance_matrix.py
    ├── gruut-ipa
    └── speak-ipa
├── default.nix
├── derivation.nix
├── gruut_ipa
    ├── __init__.py
    ├── __main__.py
    ├── accent.py
    ├── constants.py
    ├── data
    │   ├── ar
    │   │   └── phonemes.txt
    │   ├── cs-cz
    │   │   └── phonemes.txt
    │   ├── de-de
    │   │   └── phonemes.txt
    │   ├── el-gr
    │   │   └── phonemes.txt
    │   ├── en-gb
    │   │   └── phonemes.txt
    │   ├── en-us
    │   │   ├── cmudict
    │   │   │   ├── ipa_map.txt
    │   │   │   └── phonemes.txt
    │   │   ├── phonemes.txt
    │   │   └── zamia
    │   │   │   ├── ipa_map.txt
    │   │   │   └── phonemes.txt
    │   ├── es-es
    │   │   └── phonemes.txt
    │   ├── fa
    │   │   └── phonemes.txt
    │   ├── fr-fr
    │   │   └── phonemes.txt
    │   ├── it-it
    │   │   └── phonemes.txt
    │   ├── lb-lb
    │   │   └── phonemes.txt
    │   ├── nl
    │   │   ├── cgn
    │   │   │   ├── ipa_map.txt
    │   │   │   └── phonemes.txt
    │   │   └── phonemes.txt
    │   ├── phoneme_distances.json.gz
    │   ├── pt
    │   │   └── phonemes.txt
    │   ├── ru-ru
    │   │   └── phonemes.txt
    │   ├── sv-se
    │   │   └── phonemes.txt
    │   ├── sw
    │   │   ├── alffa
    │   │   │   ├── ipa_map.txt
    │   │   │   └── phonemes.txt
    │   │   └── phonemes.txt
    │   └── vi-n
    │   │   └── phonemes.txt
    ├── distances.py
    ├── espeak.py
    ├── features.py
    ├── kirshenbaum.py
    ├── phonemes.py
    ├── py.typed
    ├── sampa.py
    └── utils.py
├── img
    ├── ipa.png
    └── ipa.svg
├── mypy.ini
├── pylintrc
├── requirements_dev.txt
├── requirements_test.txt
├── scripts
    ├── check-code.sh
    ├── create-venv.sh
    ├── format-code.sh
    └── run-tests.sh
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── test_accent.py
    ├── test_distances.py
    ├── test_features.py
    ├── test_phone.py
    ├── test_phonemes.py
    └── test_pronunciation.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .mypy_cache/
3 | *.egg-info/
4 | /.venv/
5 | /dist/
6 | /.tox/
7 | 


--------------------------------------------------------------------------------
/.projectile:
--------------------------------------------------------------------------------
1 | -/.venv/
2 | -/__pycache__/
3 | -/gruut_ipa/__pycache__/
4 | -/tests/__pycache__/


--------------------------------------------------------------------------------
/.python_version:
--------------------------------------------------------------------------------
1 | 3.7
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Michael Hansen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements_dev.txt
2 | include LICENSE
3 | include README.md
4 | include VERSION


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL := bash
 2 | 
 3 | .PHONY: check clean reformat dist test install
 4 | 
 5 | all: dist
 6 | 
 7 | install:
 8 | 	scripts/create-venv.sh
 9 | 
10 | check:
11 | 	scripts/check-code.sh
12 | 
13 | reformat:
14 | 	scripts/format-code.sh
15 | 
16 | test:
17 | 	scripts/run-tests.sh
18 | 
19 | dist:
20 | 	python3 setup.py sdist
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Gruut IPA
  2 | 
  3 | Library for manipulating [International Phonetic Alphabet](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet) (IPA) pronunciations.
  4 | 
  5 | Features include:
  6 | 
  7 | * Getting the category and details of a phone, e.g. "open front unrounded vowel" for ɶ
  8 | * Splitting IPA pronunciations into groups of:
  9 |     * Phones (`/ˈt͡ʃuːz/` to `ˈt͡ʃ uː z` )
 10 |     * Phonemes (`/kˈaʊ/` to `k ˈaʊ` for U.S. English)
 11 | * Converting pronunciations between:
 12 |     * IPA
 13 |     * [espeak](https://github.com/espeak-ng/)
 14 |     * [sampa](https://www.phon.ucl.ac.uk/home/sampa/)
 15 |     
 16 | Supported Languages:
 17 | 
 18 | * Arabic (`ar`)
 19 | * Czech (`cs-cz`)
 20 | * German (`de-de`)
 21 | * U.S. English (`en-us`)
 22 | * U.K. English (`en-gb`)
 23 | * Spanish (`es-es`)
 24 | * Persian/Farsi (`fa`)
 25 | * Spanish (`es-es`)
 26 | * Italian (`it-it`)
 27 | * Luxembourgish (`lb-lb`)
 28 | * Dutch (`nl`)
 29 | * Portuguese (`pt`)
 30 | * Russian (`ru-ru`)
 31 | * Swahili (`sw`)
 32 |     
 33 | ## Installing
 34 | 
 35 | ```sh
 36 | $ pip install gruut-ipa
 37 | ```
 38 | 
 39 | ## Dependencies
 40 | 
 41 | * Python 3.6 or higher
 42 | 
 43 | For command-line usage, you may also want:
 44 | 
 45 | * [espeak](https://github.com/espeak-ng/)
 46 | * [jq](https://stedolan.github.io/jq/)
 47 | 
 48 | Install these with:
 49 | 
 50 | ```sh
 51 | $ sudo apt-get install espeak jq
 52 | ```
 53 | 
 54 | ## Phones and Phonemes
 55 | 
 56 | ![IPA phones](img/ipa.png)
 57 | 
 58 | Phones in IPA are composed of different components:
 59 | 
 60 | * Letters
 61 |     * [Non-combining](https://en.wikipedia.org/wiki/Character_(computing)#Terminology) Unicode characters that represent a distinct human sound (phone)
 62 | * Suprasegmentals
 63 |     * [Non-combining](https://en.wikipedia.org/wiki/Character_(computing)#Terminology) Unicode characters that represent language features above individual vowels or consonants
 64 |     * Stress (ˈˌ), elongation (ː), linking/ties (t͡s), and short/long breaks (| ‖) are suprasegmentals
 65 | * Diacritics
 66 |     * [Combining characters](https://en.wikipedia.org/wiki/Combining_character) that provide additional information about a phone's pronunciation, such as [nasalation](https://en.wikipedia.org/wiki/Nasalization)
 67 |     
 68 | See [IPA Chart](https://www.ipachart.com/) for more details.
 69 | 
 70 | ### Phonemes
 71 | 
 72 | While phones represent individual sounds, phonemes are the phonetic units of a language that meaningfully distinguish words. A phoneme may be realized by many different phones. For example, the `/r/` in [Standard German](https://en.wikipedia.org/wiki/Standard_German_phonology) can be realized as a uvular fricative (χ/ʁ), a uvular approximant (ɹ), or a uvular tap or trill (ʀ/r).
 73 | 
 74 | A phoneme may also be composed of multiple phones, such as the [dipthong](https://en.wikipedia.org/wiki/Diphthong) `aʊ` in U.S. English (the "ow" in "cow").
 75 | 
 76 | Supported languages in `gruut-ipa` contain a `phonemes.txt` file in the `gruut_ipa/data` directory. This file has the following format:
 77 | 
 78 | ```text
 79 | <phoneme> <example> [<replace> ...]
 80 | ```
 81 | 
 82 | where `<phoneme>` is a set of IPA letters, like `ɶ` or `aʊ`. The `<example>` is a word whose pronunciation contains the `<phoneme>`. After that, there are one or more optional `<replace>` strings that will be replaced with `<phoneme>`. The German `/r/` example from above might be represented as:
 83 | 
 84 | ```text
 85 | r brot χ ʁ ɹ ʀ
 86 | ```
 87 | 
 88 | Phonemes for a given language come from [phonological analyses](https://en.wikipedia.org/wiki/Template:Language_phonologies) and from [public databases](https://phoible.org/). Ultimately, they are geared towards capturing pronunciations from [Wiktionary](https://www.wiktionary.org/).
 89 |  
 90 | ## Usage
 91 | 
 92 | Print JSON information about phones:
 93 | 
 94 | ```sh
 95 | $ python3 -m gruut_ipa describe "ˈãː" | jq .
 96 | {
 97 |   "text": "ˈãː",
 98 |   "letters": "a",
 99 |   "stress": "primary",
100 |   "height": "open",
101 |   "placement": "front",
102 |   "rounded": false,
103 |   "type": "Vowel",
104 |   "nasalated": true,
105 |   "elongated": true
106 | }
107 | ```
108 | 
109 | Split an IPA pronunciation into phones:
110 | 
111 | ```sh
112 | $ python3 -m gruut_ipa phones "ˈjɛs|ˈt͡ʃuːz aɪpiːeɪ‖"
113 | ˈj ɛ s | ˈt͡ʃ uː z a ɪ p iː e ɪ ‖
114 | ```
115 | 
116 | Group phones into phonemes for a specific language:
117 | 
118 | ```sh
119 | $ python3 -m gruut_ipa phonemes en-us "/dʒʌst ə kaʊ/"
120 | d͡ʒ ʌ s t ə k aʊ
121 | ```
122 | 
123 | Convert between IPA, [espeak](https://github.com/espeak-ng/), and [sampa](https://www.phon.ucl.ac.uk/home/sampa/):
124 | 
125 | ```sh
126 | $ python3 -m gruut_ipa convert ipa espeak "mʊmˈbaɪ"
127 | [[mUm'baI]]
128 | 
129 | $ python3 -m gruut_ipa convert espeak ipa "[[D,Is Iz sVm f@n'EtIk t'Ekst 'InpUt]]"
130 | ðˌɪs ɪz sʌm fɘnˈɛtɪk tˈɛkst ˈɪnpʊt
131 | ```
132 | 
133 | Chain commands together:
134 | 
135 | ```sh
136 | $ python3 -m gruut_ipa convert espeak ipa "[[k'aU]]" | \
137 |     python3 -m gruut_ipa phonemes en-us --keep-stress
138 | k ˈaʊ
139 | ```
140 | 
141 | ### Alternative Phoneme Sets
142 | 
143 | Some languages have multiple phoneme sets available:
144 | 
145 | * U.S. English (`en-us`)
146 |     * CMUDict (`en-us/cmudict`)
147 |     * [Zamia](https://github.com/gooofy/zamia-speech) (`en-us/zamia`)
148 | * Swahili (`sw`)
149 |     * [ALFFA](http://alffa.imag.fr/) (`sw/alffa`)
150 |     
151 | Convert from IPA to alternative phoneme set:
152 | 
153 | ```sh
154 | $ python3 -m gruut_ipa convert ipa en-us/cmudict "h ɛ l ˈoʊ w ˈɚ l d"
155 | HH EH0 L OW1 W ER1 L D
156 | ```
157 | 
158 | Convert from alternative phoneme set to IPA:
159 | 
160 | ```sh
161 | $ python3 -m gruut_ipa convert en-us/cmudict ipa "HH EH0 L OW1 W ER1 L D"
162 | h ɛ l ˈoʊ w ˈɚ l d
163 | ```
164 | 
165 | ## Scripts
166 | 
167 | Use the `speak-ipa` script to have [espeak](https://github.com/espeak-ng/) pronounce IPA. You may need to `apt-get install espeak` first.
168 | 
169 | ```sh
170 | $ echo '/hɛloʊ wɝld/' | bin/speak-ipa en-us -s 60 -w 'hello world.wav'
171 | $ aplay 'hello world.wav'
172 | ```
173 | 
174 | ## Phones
175 | 
176 | Supported IPA phones can be printed with:
177 | 
178 | ```sh
179 | $ python3 -m gruut_ipa print
180 | {"text": "i", "letters": "i", "stress": "none", "height": "close", "placement": "front", "rounded": false, "type": "Vowel", "nasalated": false, "elongated": false, "description": "close front unrounded vowel", "espeak": "i", "sampa": "i"}
181 | {"text": "y", "letters": "y", "stress": "none", "height": "close", "placement": "front", "rounded": true, "type": "Vowel", "nasalated": false, "elongated": false, "description": "close front rounded vowel", "espeak": "y", "sampa": "y"}
182 | ...
183 | ```
184 | 
185 | A nice table can be generated with [jq](https://stedolan.github.io/jq/):
186 | 
187 | ```sh
188 | $ python3 -m gruut_ipa print | \
189 |     jq -r '. | "\(.text)\t\(.espeak)\t\(.sampa)\t\(.description)"'
190 | ```
191 | 
192 | Converted to Markdown:
193 | 
194 | | IPA  | eSpeak | Sampa   | Description                           |
195 | | ---- | -----  | ------- | -----------------------------------   |
196 | | i    | i      | i       | close front unrounded vowel           |
197 | | y    | y      | y       | close front rounded vowel             |
198 | | ɨ    | i"     | 1       | close central unrounded vowel         |
199 | | ʉ    | u"     | }       | close central rounded vowel           |
200 | | ɯ    | u-     | M       | close back unrounded vowel            |
201 | | u    | u      | u       | close back rounded vowel              |
202 | | ɪ    | I      | I       | near-close near-front unrounded vowel |
203 | | ʏ    | I.     | Y       | near-close near-front rounded vowel   |
204 | | ʊ    | U      | U       | near-close near-back rounded vowel    |
205 | | e    | e      | e       | close-mid front unrounded vowel       |
206 | | ø    | Y      | 2       | close-mid front rounded vowel         |
207 | | ɘ    | @      | @\\     | close-mid central unrounded vowel     |
208 | | ɵ    | @.     | 8       | close-mid central rounded vowel       |
209 | | ɤ    | o-     | 7       | close-mid back unrounded vowel        |
210 | | o    | o      | o       | close-mid back rounded vowel          |
211 | | ɛ    | E      | E       | open-mid front unrounded vowel        |
212 | | œ    | W      | 9       | open-mid front rounded vowel          |
213 | | ɜ    | V"     | 3       | open-mid central unrounded vowel      |
214 | | ɞ    | O"     | 3\\     | open-mid central rounded vowel        |
215 | | ʌ    | V      | V       | open-mid back unrounded vowel         |
216 | | ɔ    | O      | O       | open-mid back rounded vowel           |
217 | | æ    | a      | {       | near-open front unrounded vowel       |
218 | | ɐ    | V      | 6       | near-open central unrounded vowel     |
219 | | a    | a      | a       | open front unrounded vowel            |
220 | | ɶ    | W      | &       | open front rounded vowel              |
221 | | ɑ    | A      | A       | open back unrounded vowel             |
222 | | ɒ    | A.     | Q       | open back rounded vowel               |
223 | | m    | m      | m       | voiced bilabial nasal                 |
224 | | ɱ    | M      | F       | voiced labio-dental nasal             |
225 | | n    | n      | n       | voiced alveolar nasal                 |
226 | | ɳ    | n.     | n\`     | voiced retroflex nasal                |
227 | | ŋ    | N      | N       | voiced velar nasal                    |
228 | | ɴ    | n"     | N\\     | voiced uvular nasal                   |
229 | | p    | p      | p       | voiceless bilabial plosive            |
230 | | b    | b      | b       | voiced bilabial plosive               |
231 | | t    | t      | t       | voiceless alveolar plosive            |
232 | | d    | d      | d       | voiced alveolar plosive               |
233 | | ʈ    | t.     | t\`     | voiceless retroflex plosive           |
234 | | ɖ    | d.     | d\`     | voiced retroflex plosive              |
235 | | c    | c      | c       | voiceless palatal plosive             |
236 | | ɟ    | J      | J\\     | voiced palatal plosive                |
237 | | k    | k      | k       | voiceless velar plosive               |
238 | | ɡ    | g      | g       | voiced velar plosive                  |
239 | | g    | g      | g       | voiced velar plosive                  |
240 | | q    | q      | q       | voiceless uvular plosive              |
241 | | ɢ    | G      | G\\     | voiced uvular plosive                 |
242 | | ʡ    |        | >\\     | voiceless pharyngeal plosive          |
243 | | ʔ    | ?      | ?       | voiceless glottal plosive             |
244 | | p͡f   | pf     | pf      | voiceless labio-dental affricate      |
245 | | b͡v   | bv     | bv      | voiced dental affricate               |
246 | | t̪͡s   | ts     | t_ds    | voiceless dental affricate            |
247 | | t͡s   | ts     | ts      | voiceless alveolar affricate          |
248 | | d͡z   | dz     | dz      | voiced alveolar affricate             |
249 | | t͡ʃ   | tS     | tS      | voiceless post-alveolar affricate     |
250 | | d͡ʒ   | dZ     | dZ      | voiced post-alveolar affricate        |
251 | | ʈ͡ʂ   | tS     | ts\`    | voiceless retroflex affricate         |
252 | | ɖ͡ʐ   | dz     | dz\`    | voiced retroflex affricate            |
253 | | t͡ɕ   | tS;    | ts\\    | voiceless palatal affricate           |
254 | | d͡ʑ   | dZ;    | dz\\    | voiced palatal affricate              |
255 | | k͡x   | k      | k_x     | voiceless velar affricate             |
256 | | ɸ    | F      | p\\     | voiceless bilabial fricative          |
257 | | β    | B      | B       | voiced bilabial fricative             |
258 | | f    | f      | f       | voiceless labio-dental fricative      |
259 | | v    | v      | v       | voiced labio-dental fricative         |
260 | | θ    | T      | T       | voiceless dental fricative            |
261 | | ð    | D      | D       | voiced dental fricative               |
262 | | s    | s      | s       | voiceless alveolar fricative          |
263 | | z    | z      | z       | voiced alveolar fricative             |
264 | | ʃ    | S      | S       | voiceless post-alveolar fricative     |
265 | | ʒ    | Z      | Z       | voiced post-alveolar fricative        |
266 | | ʂ    | s.     | s\`     | voiceless retroflex fricative         |
267 | | ʐ    | z.     | z\`     | voiced palatal fricative              |
268 | | ç    | C      | C       | voiceless palatal fricative           |
269 | | x    | x      | x       | voiceless velar fricative             |
270 | | ɣ    | Q      | G       | voiced velar fricative                |
271 | | χ    | X      | X       | voiceless uvular fricative            |
272 | | ʁ    | g"     | R       | voiced uvular fricative               |
273 | | ħ    | H      | X\\     | voiceless pharyngeal fricative        |
274 | | h    | h      | h       | voiceless glottal fricative           |
275 | | ɦ    | h<?>   | h\\     | voiced glottal fricative              |
276 | | w    | w      | w       | voiced bilabial approximant           |
277 | | ʋ    | v#     | v\\     | voiced labio-dental approximant       |
278 | | ɹ    | r      | r\\     | voiced alveolar approximant           |
279 | | ɻ    | r.     | r\\\`   | voiced retroflex approximant          |
280 | | j    | j      | j       | voiced palatal approximant            |
281 | | ɰ    | Q      | M\\     | voiced velar approximant              |
282 | | ⱱ    | ⱱ      | ⱱ       | voiced labio-dental flap              |
283 | | ɾ    | *      | 4       | voiced alveolar flap                  |
284 | | ɽ    | *.     | r\`     | voiced retroflex flap                 |
285 | | ʙ    | b<trl> | B\\     | voiced bilabial trill                 |
286 | | r    | r<trl> | r       | voiced alveolar trill                 |
287 | | ʀ    | r"     | R\\     | voiced uvular trill                   |
288 | | l    | l      | l       | voiced alveolar lateral-approximant   |
289 | | ɫ    | l      | 5       | voiced alveolar lateral-approximant   |
290 | | ɭ    | l.     | l\`     | voiced retroflex lateral-approximant  |
291 | | ʎ    | l^     | L       | voiced palatal lateral-approximant    |
292 | | ʟ    | L      | L\\     | voiced velar lateral-approximant      |
293 | | ə    | @      | @       | schwa                                 |
294 | | ɚ    | 3      | @\`     | r-coloured schwa                      |
295 | | ɝ    | 3      | @\`     | r-coloured schwa                      |
296 | | ɹ̩    | r-     | r\\̩     | voiced alveolar approximant           |
297 | 
298 | If you see anything wrong or missing, please let me know.
299 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.13.0
2 | 


--------------------------------------------------------------------------------
/bin/distance_matrix.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import itertools
  3 | import json
  4 | import sys
  5 | 
  6 | import gruut_ipa
  7 | from gruut_ipa.constants import (
  8 |     IPA,
  9 |     VowelHeight,
 10 |     VowelPlacement,
 11 |     ConsonantType,
 12 |     ConsonantPlace,
 13 |     BreakType,
 14 |     Stress,
 15 | )
 16 | 
 17 | import numpy as np
 18 | import sklearn.metrics
 19 | from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
 20 | 
 21 | 
 22 | def main():
 23 |     """Main entry point"""
 24 | 
 25 |     feature_cols = {
 26 |         "symbol_type": ["phoneme", "break"],
 27 |         "phoneme_type": ["NONE", "vowel", "consonant", "schwa"],
 28 |         "diacritic": ["NONE", "nasalated", "velarized"],
 29 |         "vowel_height": ["NONE"] + [v.value for v in VowelHeight],
 30 |         "vowel_place": ["NONE"] + [v.value for v in VowelPlacement],
 31 |         "vowel_rounded": ["NONE", "rounded", "unrounded"],
 32 |         "consonant_voiced": ["NONE", "voiced", "unvoiced"],
 33 |         "consonant_type": ["NONE"] + [v.value for v in ConsonantType],
 34 |         "consonant_place": ["NONE"] + [v.value for v in ConsonantPlace],
 35 |         "consonant_sounds_like": ["NONE", "r", "l", "g", ""],
 36 |         "break_type": ["NONE"] + [v.value for v in BreakType],
 37 |         "stress": ["NONE"] + [v.value for v in Stress],
 38 |     }
 39 | 
 40 |     ordinal_cols = {
 41 |         "vowel_height": VowelHeight,
 42 |         "vowel_place": VowelPlacement,
 43 |         "consonant_type": ConsonantType,
 44 |         "consonant_place": ConsonantPlace,
 45 |         "break_type": BreakType,
 46 |         "stress": Stress,
 47 |     }
 48 | 
 49 |     feature_keys = {}
 50 |     offset = 0
 51 |     for feature_col, feature_values in feature_cols.items():
 52 |         if feature_col in ordinal_cols:
 53 |             continue
 54 | 
 55 |         feature_keys[feature_col] = slice(offset, offset + len(feature_values))
 56 |         offset += len(feature_values)
 57 | 
 58 |     for feature_col in ordinal_cols:
 59 |         feature_keys[feature_col] = offset
 60 |         offset += 1
 61 | 
 62 |     ordinal_enc = OrdinalEncoder(categories=[feature_cols[col] for col in ordinal_cols])
 63 |     onehot_enc = OneHotEncoder(
 64 |         categories=[
 65 |             feature_cols[col] for col in feature_cols if col not in ordinal_cols
 66 |         ]
 67 |     )
 68 | 
 69 |     symbol_features = {}
 70 | 
 71 |     for break_symbol, break_type in [
 72 |         (IPA.BREAK_WORD, BreakType.WORD),
 73 |         (IPA.BREAK_MINOR, BreakType.MINOR),
 74 |         (IPA.BREAK_MAJOR, BreakType.MAJOR),
 75 |     ]:
 76 |         features = {"symbol_type": "break", "break_type": str(break_type.value)}
 77 |         symbol_features[break_symbol] = features
 78 | 
 79 |     for s in itertools.chain(gruut_ipa.VOWELS, gruut_ipa.CONSONANTS, gruut_ipa.SCHWAS):
 80 |         if s in symbol_features:
 81 |             continue
 82 | 
 83 |         p = gruut_ipa.Phoneme(s)
 84 |         features = {"symbol_type": "phoneme"}
 85 | 
 86 |         if p.vowel:
 87 |             features["phoneme_type"] = "vowel"
 88 |             features["vowel_height"] = p.vowel.height.value
 89 |             features["vowel_place"] = p.vowel.placement.value
 90 |             features["vowel_rounded"] = "rounded" if p.vowel.rounded else "unrounded"
 91 | 
 92 |             if p.nasalated:
 93 |                 features["diacritic"] = "nasalated"
 94 |         elif p.consonant:
 95 |             features["phoneme_type"] = "consonant"
 96 |             features["consonant_voiced"] = (
 97 |                 "voiced" if p.consonant.voiced else "unvoiced"
 98 |             )
 99 |             features["consonant_type"] = p.consonant.type.value
100 |             features["consonant_place"] = p.consonant.place.value
101 |             features["consonant_sounds_like"] = p.consonant.sounds_like.value
102 | 
103 |             if p.consonant.velarized:
104 |                 features["diacritic"] = "velarized"
105 |         elif p.schwa:
106 |             features["phoneme_type"] = "schwa"
107 |             if p.schwa.r_coloured:
108 |                 features["consonant_sounds_like"] = "r"
109 | 
110 |         symbol_features[s] = features
111 | 
112 |     vectors = {}
113 |     for s, features in symbol_features.items():
114 |         onehot_features = []
115 |         ordinal_features = []
116 | 
117 |         assert "symbol_type" in feature_cols
118 | 
119 |         for col in feature_cols:
120 |             if col not in features:
121 |                 features[col] = "NONE"
122 | 
123 |             if col in ordinal_cols:
124 |                 ordinal_features.append(features[col])
125 |             else:
126 |                 onehot_features.append(features[col])
127 | 
128 |         onehot_vector = onehot_enc.fit_transform([onehot_features]).toarray()[0]
129 |         ordinal_vector = ordinal_enc.fit_transform([ordinal_features])[0]
130 | 
131 |         for col_i, (_col_name, col_val) in enumerate(ordinal_cols.items()):
132 |             ordinal_vector[col_i] /= len(col_val)
133 | 
134 |         vectors[s] = np.hstack((onehot_vector, ordinal_vector))
135 | 
136 |     matrix = np.vstack(list(vectors.values()))
137 | 
138 |     w = np.ones(matrix.shape[1])
139 |     w[feature_keys["vowel_place"]] = 0.5
140 |     w[feature_keys["vowel_rounded"]] = 0.5
141 |     w[feature_keys["consonant_place"]] = 0.05
142 |     w[feature_keys["consonant_sounds_like"]] = 0.5
143 | 
144 |     dist = sklearn.metrics.pairwise_distances(matrix, metric="minkowski", p=2, w=w)
145 | 
146 |     symbols = list(vectors.keys())
147 | 
148 |     json.dump(
149 |         {
150 |             "symbols": symbols,
151 |             "columns": list(feature_cols.items()),
152 |             "features": matrix.tolist(),
153 |             "closest": {
154 |                 s: list(symbols[j] for j in dist[i].argsort())[1:]
155 |                 for i, s in enumerate(symbols)
156 |             },
157 |             "distances": dist.tolist(),
158 |         },
159 |         sys.stdout,
160 |         indent=4,
161 |         ensure_ascii=False,
162 |     )
163 | 
164 | 
165 | # -----------------------------------------------------------------------------
166 | 
167 | if __name__ == "__main__":
168 |     main()
169 | 


--------------------------------------------------------------------------------
/bin/gruut-ipa:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Directory of *this* script
 3 | this_dir="$( cd "$( dirname "$0" )" && pwd )"
 4 | src_dir="$(realpath "${this_dir}/..")"
 5 | 
 6 | venv="${src_dir}/.venv"
 7 | if [[ -d "${venv}" ]]; then
 8 |     source "${venv}/bin/activate"
 9 | fi
10 | 
11 | export PYTHONPATH="${src_dir}:${PYTHONPATH}"
12 | python3 -m gruut_ipa "$@"
13 | 


--------------------------------------------------------------------------------
/bin/speak-ipa:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | lang="$1"
 3 | 
 4 | if [[ -z "${lang}" ]]; then
 5 |     echo "Usage: speak-ipa LANGUAGE"
 6 |     exit 1;
 7 | fi
 8 | 
 9 | shift
10 | 
11 | espeak_args=()
12 | voice="${lang}"
13 | while [[ -n "$1" ]]; do
14 |     if [[ "$1" == '-v' ]]; then
15 |         voice=''
16 |     fi
17 | 
18 |     espeak_args+=("$1")
19 |     shift
20 | done
21 | 
22 | if [[ -n "${voice}" ]]; then
23 |     espeak_args+=('-v' "${voice}")
24 | fi
25 | 
26 | # -----------------------------------------------------------------------------
27 | 
28 | # Directory of *this* script
29 | this_dir="$( cd "$( dirname "$0" )" && pwd )"
30 | src_dir="$(realpath "${this_dir}/..")"
31 | 
32 | venv="${src_dir}/.venv"
33 | if [[ -d "${venv}" ]]; then
34 |     source "${venv}/bin/activate"
35 | fi
36 | 
37 | export PYTHONPATH="${src_dir}:${PYTHONPATH}"
38 | python3 -m gruut_ipa convert ipa espeak | \
39 |     tee >(cat >&2) | \
40 |     while read line;
41 |     do espeak "${espeak_args[@]}" "${line}"
42 |     done
43 | 


--------------------------------------------------------------------------------
/default.nix:
--------------------------------------------------------------------------------
1 | { pkgs ? import <nixpkgs> {} }:
2 | pkgs.callPackage ./derivation.nix {}
3 | 


--------------------------------------------------------------------------------
/derivation.nix:
--------------------------------------------------------------------------------
 1 | { lib, nixpkgs ? import <nixpkgs> {}, pythonPkgs ? nixpkgs.pkgs.python38Packages }:
 2 | pythonPkgs.buildPythonPackage rec {
 3 |   name   = "gruut-ipa-${version}";
 4 |   version = "0.10.0";
 5 | 
 6 |   src = pythonPkgs.fetchPypi {
 7 |     inherit version;
 8 |     pname   = "gruut-ipa";
 9 |     sha256 = "1kxrpv4qnzqbgv0vprlsvk0y0p58pl9xxz8sm7z4xxbyd1zamicf";
10 |   };
11 | 
12 |   meta = with lib; {
13 |     homepage    = "https://github.com/rhasspy/gruut-ipa";
14 |     description = "Library for manipulating pronunciations using the International Phonetic Alphabet (IPA)";
15 |     license     = licenses.mit;
16 |     platforms   = platforms.linux;
17 |   };
18 | 
19 |   doCheck = false;
20 | }
21 | 


--------------------------------------------------------------------------------
/gruut_ipa/__init__.py:
--------------------------------------------------------------------------------
 1 | """Classes for dealing with phones and phonemes"""
 2 | from gruut_ipa.accent import GuessedPhonemes, guess_phonemes  # noqa: F401
 3 | from gruut_ipa.constants import (  # noqa: F401
 4 |     CONSONANTS,
 5 |     FEATURE_COLUMNS,
 6 |     FEATURE_EMPTY,
 7 |     FEATURE_KEYS,
 8 |     FEATURE_ORDINAL_COLUMNS,
 9 |     IPA,
10 |     LANG_ALIASES,
11 |     SCHWAS,
12 |     VOWELS,
13 |     Accent,
14 |     Break,
15 |     BreakType,
16 |     Consonant,
17 |     ConsonantPlace,
18 |     ConsonantType,
19 |     Dipthong,
20 |     Intonation,
21 |     PhonemeLength,
22 |     Schwa,
23 |     Stress,
24 |     Vowel,
25 |     VowelHeight,
26 |     VowelPlacement,
27 | )
28 | from gruut_ipa.distances import get_closest  # noqa: F401
29 | from gruut_ipa.espeak import espeak_to_ipa, ipa_to_espeak  # noqa: F401
30 | from gruut_ipa.features import from_vector, string_to_symbol, to_vector  # noqa: F401
31 | from gruut_ipa.phonemes import Phone, Phoneme, Phonemes, Pronunciation  # noqa: F401
32 | from gruut_ipa.sampa import ipa_to_sampa, sampa_to_ipa  # noqa: F401
33 | from gruut_ipa.utils import resolve_lang  # noqa:F401
34 | 


--------------------------------------------------------------------------------
/gruut_ipa/__main__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Command-line interface to gruut-ipa"""
  3 | import argparse
  4 | import itertools
  5 | import json
  6 | import logging
  7 | import os
  8 | import sys
  9 | import typing
 10 | from pathlib import Path
 11 | 
 12 | # -----------------------------------------------------------------------------
 13 | 
 14 | _LOGGER = logging.getLogger("gruut_ipa")
 15 | 
 16 | _DIR = Path(__file__).parent
 17 | _DATA_DIR = _DIR / "data"
 18 | 
 19 | # -----------------------------------------------------------------------------
 20 | 
 21 | 
 22 | def main():
 23 |     """Main entry point"""
 24 |     args = get_args()
 25 | 
 26 |     if args.debug:
 27 |         logging.basicConfig(level=logging.DEBUG)
 28 |     else:
 29 |         logging.basicConfig(level=logging.INFO)
 30 | 
 31 |     _LOGGER.debug(args)
 32 | 
 33 |     # Dispatch to sub-command
 34 |     args.func(args)
 35 | 
 36 | 
 37 | # -----------------------------------------------------------------------------
 38 | 
 39 | 
 40 | def do_print(args):
 41 |     """Print known IPA phones"""
 42 |     from gruut_ipa import CONSONANTS, SCHWAS, VOWELS, Phoneme, Phonemes
 43 |     from gruut_ipa.espeak import ipa_to_espeak
 44 |     from gruut_ipa.sampa import ipa_to_sampa
 45 | 
 46 |     allowed_phonemes: typing.Set[str] = set()
 47 | 
 48 |     if args.language:
 49 |         # Load phonemes using language code
 50 |         phonemes_path = _DATA_DIR / args.language / "phonemes.txt"
 51 | 
 52 |         _LOGGER.debug("Loading phonemes from %s", phonemes_path)
 53 |         with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
 54 |             phonemes = Phonemes.from_text(phonemes_file)
 55 | 
 56 |         allowed_phonemes.update(p.text for p in phonemes)
 57 | 
 58 |     for phone_str in sorted(itertools.chain(VOWELS, CONSONANTS, SCHWAS)):
 59 |         phone = Phoneme(phone_str)
 60 | 
 61 |         if allowed_phonemes and (phone.text not in allowed_phonemes):
 62 |             # Skip phoneme outside language
 63 |             continue
 64 | 
 65 |         description = ""
 66 |         if phone.vowel:
 67 |             v = phone.vowel
 68 |             description = (
 69 |                 v.height.value
 70 |                 + " "
 71 |                 + v.placement.value
 72 |                 + " "
 73 |                 + ("rounded" if v.rounded else "unrounded")
 74 |                 + " vowel"
 75 |             )
 76 |         elif phone.consonant:
 77 |             c = phone.consonant
 78 |             description = (
 79 |                 ("voiced" if c.voiced else "voiceless")
 80 |                 + " "
 81 |                 + c.place.value
 82 |                 + " "
 83 |                 + c.type.value
 84 |             )
 85 |         elif phone.schwa:
 86 |             s = phone.schwa
 87 |             if s.r_coloured:
 88 |                 description = "r-coloured schwa"
 89 |             else:
 90 |                 description = "schwa"
 91 | 
 92 |         phone_dict = phone.to_dict()
 93 |         phone_dict["description"] = description
 94 | 
 95 |         # Add espeak/sampa
 96 |         phone_dict["espeak"] = ipa_to_espeak(phone_str)
 97 |         phone_dict["sampa"] = ipa_to_sampa(phone_str)
 98 | 
 99 |         phone_dict_str = json.dumps(phone_dict, ensure_ascii=False)
100 |         print(phone_dict_str)
101 | 
102 | 
103 | # -----------------------------------------------------------------------------
104 | 
105 | 
106 | def do_describe(args):
107 |     """Describe IPA phones"""
108 |     from gruut_ipa import Phoneme
109 | 
110 |     if args.phone:
111 |         # From arguments
112 |         phones = args.phone
113 |     else:
114 |         # From stdin
115 |         phones = sys.stdin
116 | 
117 |         if os.isatty(sys.stdin.fileno()):
118 |             print("Reading phones from stdin...", file=sys.stderr)
119 | 
120 |     for line in phones:
121 |         line = line.strip()
122 |         if line:
123 |             line_phone = Phoneme(text=line)
124 |             phone_str = json.dumps(line_phone.to_dict(), ensure_ascii=False)
125 |             print(phone_str)
126 |             sys.stdout.flush()
127 | 
128 | 
129 | # -----------------------------------------------------------------------------
130 | 
131 | 
132 | def do_phones(args):
133 |     """Group phones in IPA pronunciation"""
134 |     from gruut_ipa import Pronunciation
135 | 
136 |     if args.pronunciation:
137 |         # From arguments
138 |         pronunciations = args.pronunciation
139 |     else:
140 |         # From stdin
141 |         pronunciations = sys.stdin
142 | 
143 |         if os.isatty(sys.stdin.fileno()):
144 |             print("Reading pronunciations from stdin...", file=sys.stderr)
145 | 
146 |     for line in pronunciations:
147 |         line = line.strip()
148 |         if line:
149 |             line_pron = Pronunciation.from_string(line)
150 |             phones_str = args.separator.join(p.text for p in line_pron if p.text)
151 |             print(phones_str)
152 |             sys.stdout.flush()
153 | 
154 | 
155 | # -----------------------------------------------------------------------------
156 | 
157 | 
158 | def do_phonemes(args):
159 |     """Group phones in IPA pronuncation according to language phonemes"""
160 |     from gruut_ipa import Phonemes
161 | 
162 |     if args.pronunciation:
163 |         # From arguments
164 |         pronunciations = args.pronunciation
165 |     else:
166 |         # From stdin
167 |         pronunciations = sys.stdin
168 | 
169 |         if os.isatty(sys.stdin.fileno()):
170 |             print("Reading pronunciations from stdin...", file=sys.stderr)
171 | 
172 |     if args.phonemes_file:
173 |         # Load phonemes from file
174 |         phonemes_path = Path(args.phonemes_file)
175 |     else:
176 |         # Load phonemes using language code
177 |         phonemes_path = _DATA_DIR / args.language / "phonemes.txt"
178 | 
179 |         # Check language support
180 |         if not phonemes_path.is_file():
181 |             supported_languages = [d.name for d in _DATA_DIR.iterdir() if d.is_dir()]
182 |             _LOGGER.fatal("Unsupported language: %s", args.language)
183 |             _LOGGER.fatal("Supported languages: %s", supported_languages)
184 |             sys.exit(1)
185 | 
186 |     _LOGGER.debug("Loading phonemes from %s", phonemes_path)
187 |     with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
188 |         phonemes = Phonemes.from_text(phonemes_file)
189 | 
190 |     for line in pronunciations:
191 |         line = line.strip()
192 |         if line:
193 |             line_phonemes = phonemes.split(
194 |                 line, keep_stress=args.keep_stress, drop_tones=args.drop_tones
195 |             )
196 |             phonemes_str = args.separator.join(p.text for p in line_phonemes if p.text)
197 |             print(phonemes_str)
198 |             sys.stdout.flush()
199 | 
200 | 
201 | # -----------------------------------------------------------------------------
202 | 
203 | 
204 | def do_convert(args):
205 |     """Convert pronunciations between different representations"""
206 |     from gruut_ipa import Phoneme, Phonemes
207 |     from gruut_ipa.espeak import espeak_to_ipa, ipa_to_espeak
208 |     from gruut_ipa.sampa import ipa_to_sampa, sampa_to_ipa
209 | 
210 |     fixed_src_dest = {"ipa", "espeak", "sampa"}
211 |     src_phonemes: typing.Optional[Phonemes] = None
212 |     dest_phonemes: typing.Optional[Phonemes] = None
213 | 
214 |     if args.src not in fixed_src_dest:
215 |         src_phonemes = Phonemes.from_language(args.src)
216 | 
217 |     if args.dest not in fixed_src_dest:
218 |         dest_phoneme_map = Phonemes.from_language(args.dest).gruut_ipa_map
219 | 
220 |         # ipa -> original phoneme
221 |         dest_phonemes = Phonemes()
222 |         for k, v in dest_phoneme_map.items():
223 |             if v in dest_phonemes.gruut_ipa_map:
224 |                 continue
225 | 
226 |             dest_phonemes.phonemes.append(Phoneme(text=k, is_ipa=False))
227 |             dest_phonemes.ipa_map[v] = k
228 | 
229 |         dest_phonemes.update()
230 | 
231 |     if args.pronunciation:
232 |         # From arguments
233 |         pronunciations = args.pronunciation
234 |     else:
235 |         # From stdin
236 |         pronunciations = sys.stdin
237 | 
238 |         if os.isatty(sys.stdin.fileno()):
239 |             print("Reading pronunciations from stdin...", file=sys.stderr)
240 | 
241 |     for line in pronunciations:
242 |         line = line.strip()
243 |         if line:
244 |             if args.src == "ipa":
245 |                 src_ipa = line
246 |             elif args.src == "espeak":
247 |                 src_ipa = espeak_to_ipa(line)
248 |             elif args.src == "sampa":
249 |                 src_ipa = sampa_to_ipa(line)
250 |             else:
251 |                 assert src_phonemes is not None
252 |                 src_ipa = args.separator.join(
253 |                     src_phonemes.gruut_ipa_map.get(p.text, p.text)
254 |                     for p in src_phonemes.split(line)
255 |                 )
256 | 
257 |             if args.dest == "ipa":
258 |                 dest_pron = src_ipa
259 |             elif args.dest == "espeak":
260 |                 dest_pron = "[[" + ipa_to_espeak(src_ipa) + "]]"
261 |             elif args.dest == "sampa":
262 |                 dest_pron = ipa_to_sampa(src_ipa)
263 |             else:
264 |                 assert dest_phonemes is not None
265 |                 dest_pron = args.separator.join(
266 |                     p.text for p in dest_phonemes.split(src_ipa, is_ipa=False)
267 |                 )
268 | 
269 |             print(dest_pron)
270 |             sys.stdout.flush()
271 | 
272 | 
273 | # -----------------------------------------------------------------------------
274 | 
275 | 
276 | def get_args() -> argparse.Namespace:
277 |     """Parse command-line arguments"""
278 |     parser = argparse.ArgumentParser(prog="gruut_ipa")
279 | 
280 |     # Create subparsers for each sub-command
281 |     sub_parsers = parser.add_subparsers()
282 |     sub_parsers.required = True
283 |     sub_parsers.dest = "command"
284 | 
285 |     # -----
286 |     # print
287 |     # -----
288 |     print_parser = sub_parsers.add_parser("print", help="Print all known IPA phones")
289 |     print_parser.add_argument(
290 |         "--language", help="Only print phones from a specific language or language/set"
291 |     )
292 |     print_parser.set_defaults(func=do_print)
293 | 
294 |     # --------
295 |     # describe
296 |     # --------
297 |     describe_parser = sub_parsers.add_parser("describe", help="Describe IPA phone(s)")
298 |     describe_parser.set_defaults(func=do_describe)
299 |     describe_parser.add_argument(
300 |         "phone", nargs="*", help="IPA phones (read from stdin if not provided)"
301 |     )
302 | 
303 |     # --------
304 |     # phones
305 |     # --------
306 |     phones_parser = sub_parsers.add_parser(
307 |         "phones", help="Group phones in IPA pronunciation"
308 |     )
309 |     phones_parser.set_defaults(func=do_phones)
310 |     phones_parser.add_argument(
311 |         "pronunciation",
312 |         nargs="*",
313 |         help="IPA pronunciations (read from stdin if not provided)",
314 |     )
315 |     phones_parser.add_argument(
316 |         "--separator",
317 |         default=" ",
318 |         help="Separator to add between phones in output (default: space)",
319 |     )
320 | 
321 |     # --------
322 |     # phonemes
323 |     # --------
324 |     phonemes_parser = sub_parsers.add_parser(
325 |         "phonemes",
326 |         help="Group phones in IPA pronunciation according to language phonemes",
327 |     )
328 |     phonemes_parser.set_defaults(func=do_phonemes)
329 |     phonemes_parser.add_argument("language", help="Language code (e.g., en-us)")
330 |     phonemes_parser.add_argument(
331 |         "pronunciation",
332 |         nargs="*",
333 |         help="IPA pronunciations (read from stdin if not provided)",
334 |     )
335 |     phonemes_parser.add_argument(
336 |         "--separator",
337 |         default=" ",
338 |         help="Separator to add between phonemes in output (default: space)",
339 |     )
340 |     phonemes_parser.add_argument(
341 |         "--keep-stress",
342 |         action="store_true",
343 |         help="Keep primary/secondary stress markers",
344 |     )
345 |     phonemes_parser.add_argument(
346 |         "--drop-tones", action="store_true", help="Remove tone numbers/letters"
347 |     )
348 |     phonemes_parser.add_argument(
349 |         "--phonemes-file", help="Load phonemes from file instead of using language code"
350 |     )
351 | 
352 |     # -------
353 |     # convert
354 |     # -------
355 |     convert_parser = sub_parsers.add_parser(
356 |         "convert", help="Convert pronunciations between ipa, espeak, and sampa"
357 |     )
358 |     convert_parser.set_defaults(func=do_convert)
359 |     convert_parser.add_argument(
360 |         "src", help="Source format (language, language/set, ipa, espeak, sampa)"
361 |     )
362 |     convert_parser.add_argument(
363 |         "dest", help="Destination format (language, language/set, ipa, espeak, sampa)"
364 |     )
365 |     convert_parser.add_argument(
366 |         "pronunciation",
367 |         nargs="*",
368 |         help="Pronunciations (read from stdin if not provided)",
369 |     )
370 |     convert_parser.add_argument(
371 |         "--separator", default=" ", help="Separator between phonemes (default: space)"
372 |     )
373 | 
374 |     # Shared arguments
375 |     for sub_parser in [
376 |         print_parser,
377 |         describe_parser,
378 |         phones_parser,
379 |         phonemes_parser,
380 |         convert_parser,
381 |     ]:
382 |         sub_parser.add_argument(
383 |             "--debug", action="store_true", help="Print DEBUG messages to console"
384 |         )
385 | 
386 |     return parser.parse_args()
387 | 
388 | 
389 | # -----------------------------------------------------------------------------
390 | 
391 | 
392 | if __name__ == "__main__":
393 |     main()
394 | 


--------------------------------------------------------------------------------
/gruut_ipa/accent.py:
--------------------------------------------------------------------------------
  1 | """Methods for mapping phonemes from one language to another"""
  2 | import typing
  3 | import unicodedata
  4 | from copy import copy
  5 | from dataclasses import dataclass
  6 | 
  7 | from gruut_ipa.constants import Vowel, VowelHeight, VowelPlacement
  8 | from gruut_ipa.distances import get_closest
  9 | from gruut_ipa.phonemes import Phoneme, Phonemes, Pronunciation
 10 | 
 11 | # ---------------------------------------------------------------------
 12 | 
 13 | R_LIKE = ["ɹ", "ʁ", "r", "ʀ", "ɻ", "ɚ"]
 14 | SCHWA_PREFERRED = ["ə", "ɐ"]
 15 | GS = ["ɡ", "g"]
 16 | PHARY_GLOTTAL = ["ʡ", "ʔ"]
 17 | 
 18 | MATCHING_PHONEMES = typing.List[Phoneme]
 19 | 
 20 | 
 21 | @dataclass
 22 | class GuessedPhonemes:
 23 |     """Result from guess_phonemes"""
 24 | 
 25 |     phonemes: MATCHING_PHONEMES
 26 |     distance: typing.Optional[float] = None
 27 | 
 28 | 
 29 | def guess_phonemes(
 30 |     from_phoneme: typing.Union[str, Phoneme], to_phonemes: Phonemes,
 31 | ) -> GuessedPhonemes:
 32 |     """Get best matching phonemes for a single phoneme"""
 33 |     best_phonemes: MATCHING_PHONEMES = []
 34 |     best_dist: typing.Optional[float] = None
 35 | 
 36 |     from_codepoints: typing.Optional[typing.Set[str]] = None
 37 | 
 38 |     if isinstance(from_phoneme, str):
 39 |         # Parse phoneme
 40 |         from_phoneme = Phoneme(from_phoneme)
 41 | 
 42 |     if from_phoneme.text in GS:
 43 |         # Correctly map two forms of "g"
 44 |         for maybe_g in GS:
 45 |             if maybe_g in to_phonemes:
 46 |                 best_phonemes = [Phoneme(maybe_g)]
 47 |                 best_dist = 0.0
 48 |                 break
 49 | 
 50 |     if (not best_phonemes) and from_phoneme.schwa:
 51 |         if from_phoneme.schwa.r_coloured:
 52 |             # Try r-like
 53 |             for maybe_r_like in R_LIKE:
 54 |                 if maybe_r_like in to_phonemes:
 55 |                     best_phonemes = [Phoneme(maybe_r_like)]
 56 |                     best_dist = 0.0
 57 |                     break
 58 | 
 59 |         if not best_phonemes:
 60 |             for maybe_schwa in SCHWA_PREFERRED:
 61 |                 # Try known schwa preferences
 62 |                 if maybe_schwa in to_phonemes:
 63 |                     best_phonemes = [Phoneme(maybe_schwa)]
 64 |                     best_dist = 0.0
 65 |                     break
 66 | 
 67 |         if not best_phonemes:
 68 |             # Treat as a mid-central vowel
 69 |             from_phoneme = copy(from_phoneme)
 70 |             setattr(
 71 |                 from_phoneme,
 72 |                 "vowel",
 73 |                 Vowel(
 74 |                     ipa="ə",
 75 |                     height=VowelHeight.MID,
 76 |                     placement=VowelPlacement.CENTRAL,
 77 |                     rounded=False,
 78 |                 ),
 79 |             )
 80 | 
 81 |     if (not best_phonemes) and (from_phoneme.text in R_LIKE):
 82 |         # Map r-like consonant
 83 |         for maybe_r in R_LIKE:
 84 |             if maybe_r in to_phonemes:
 85 |                 best_phonemes = [Phoneme(maybe_r)]
 86 |                 best_dist = 0.0
 87 |                 break
 88 | 
 89 |     if (not best_phonemes) and (from_phoneme.text in PHARY_GLOTTAL):
 90 |         # Map or drop
 91 |         for maybe_pg in PHARY_GLOTTAL:
 92 |             if maybe_pg in to_phonemes:
 93 |                 best_phonemes = [Phoneme(maybe_pg)]
 94 |                 best_dist = 0
 95 |                 break
 96 | 
 97 |         if not best_phonemes:
 98 |             # Drop
 99 |             return GuessedPhonemes(phonemes=[])
100 | 
101 |     if best_phonemes:
102 |         return GuessedPhonemes(phonemes=best_phonemes, distance=best_dist)
103 | 
104 |     # Search through target phonemes
105 |     for to_phoneme in to_phonemes:
106 |         if from_phoneme.text == to_phoneme.text:
107 |             # Easy case: exact match
108 |             best_phonemes = [to_phoneme]
109 |             best_dist = 0.0
110 |             break
111 | 
112 |         if from_phoneme.letters == to_phoneme.letters:
113 |             # Match except for elongation, accent
114 |             if from_codepoints is None:
115 |                 from_codepoints = set(unicodedata.normalize("NFD", from_phoneme.text))
116 | 
117 |             # Compute a "distance" based on how many codepoints different between the two phonemes.
118 |             # This should usually be < 1 so that it can be a better match than the vowel/consonant distances.
119 |             to_codepoints = set(unicodedata.normalize("NFD", to_phoneme.text))
120 | 
121 |             # Divide by 10 to ensure this is usually < 1
122 |             dist = abs(len(from_codepoints) - len(to_codepoints)) / 10.0
123 | 
124 |             if (best_dist is None) or (dist < best_dist):
125 |                 best_phonemes = [to_phoneme]
126 |                 best_dist = dist
127 | 
128 |             continue
129 | 
130 |         # if from_phoneme.vowel and to_phoneme.vowel:
131 |         #     # Vowel distance
132 |         #     dist = vowel_distance(from_phoneme.vowel, to_phoneme.vowel)
133 | 
134 |         #     # Extra penalty for not matching elongation
135 |         #     dist += 0.5 if from_phoneme.elongated != to_phoneme.elongated else 0
136 | 
137 |         #     if (best_dist is None) or (dist < best_dist):
138 |         #         best_dist = dist
139 |         #         best_phonemes = [to_phoneme]
140 |         #         continue
141 | 
142 |         # if from_phoneme.consonant and to_phoneme.consonant:
143 |         #     # Consonant distance
144 |         #     dist = consonant_distance(from_phoneme.consonant, to_phoneme.consonant)
145 |         #     # Extra penalty for not matching elongation
146 |         #     dist += 0.5 if from_phoneme.elongated != to_phoneme.elongated else 0
147 | 
148 |         #     if (best_dist is None) or (dist < best_dist):
149 |         #         best_dist = dist
150 |         #         best_phonemes = [to_phoneme]
151 |         #         continue
152 | 
153 |     if len(from_phoneme.letters) > 1:
154 |         # Split apart and match each letter separately
155 |         best_split: MATCHING_PHONEMES = []
156 |         split_phonemes = Pronunciation.from_string(from_phoneme.text, keep_ties=False)
157 |         dist = 1.0
158 | 
159 |         for split_phoneme in split_phonemes:
160 |             guessed = guess_phonemes(split_phoneme.text, to_phonemes)
161 | 
162 |             if (not guessed.phonemes) or (guessed.distance is None):
163 |                 break
164 | 
165 |             dist += guessed.distance
166 |             best_split.extend(guessed.phonemes)
167 | 
168 |         if (best_dist is None) or (dist < best_dist):
169 |             best_phonemes = best_split
170 |             best_dist = dist
171 |     elif best_dist is None:
172 |         closest = get_closest(from_phoneme.text)
173 | 
174 |         if closest:
175 |             for candidate_str in closest:
176 |                 for to_phoneme in to_phonemes:
177 |                     if candidate_str == to_phoneme.text:
178 |                         best_phonemes = [Phoneme(candidate_str)]
179 |                         best_dist = 0.5
180 |                         break
181 | 
182 |                 if best_dist is not None:
183 |                     break
184 | 
185 |     if best_dist is None:
186 |         # Last resort
187 |         for to_phoneme in to_phonemes:
188 |             if from_phoneme.letters[0] == to_phoneme.letters[0]:
189 |                 best_phonemes = [to_phoneme]
190 |                 best_dist = 10.0
191 |                 break
192 | 
193 |     return GuessedPhonemes(phonemes=best_phonemes, distance=best_dist)
194 | 
195 | 
196 | # ---------------------------------------------------------------------
197 | 
198 | # VOWEL_HEIGHT_NUM = {h: i for i, h in enumerate(VowelHeight)}
199 | # VOWEL_PLACE_NUM = {p: i for i, p in enumerate(VowelPlacement)}
200 | 
201 | 
202 | # def vowel_distance(vowel_1: Vowel, vowel_2: Vowel) -> float:
203 | #     """Return a distance measure between two vowels"""
204 | #     dist_height = (
205 | #         abs(VOWEL_HEIGHT_NUM[vowel_1.height] - VOWEL_HEIGHT_NUM[vowel_2.height]) * 2
206 | #     )
207 | #     dist_place = abs(
208 | #         VOWEL_PLACE_NUM[vowel_1.placement] - VOWEL_PLACE_NUM[vowel_2.placement]
209 | #     )
210 | #     dist_rounded = 1 if vowel_1.rounded != vowel_2.rounded else 0
211 | 
212 | #     return dist_height + dist_place + dist_rounded
213 | 
214 | 
215 | # CONSONANT_TYPE_NUM = {t: i for i, t in enumerate(ConsonantType)}
216 | # CONSONANT_PLACE_NUM = {p: i for i, p in enumerate(ConsonantPlace)}
217 | 
218 | 
219 | # def consonant_distance(consonant_1: Consonant, consonant_2: Consonant) -> float:
220 | #     """Return a distance measure between two consonants"""
221 | #     dist_type = abs(
222 | #         CONSONANT_TYPE_NUM[consonant_1.type] - CONSONANT_TYPE_NUM[consonant_2.type]
223 | #     )
224 | #     # dist_type = 1 if consonant_1.type != consonant_2.type else 0
225 | #     dist_place = abs(
226 | #         CONSONANT_PLACE_NUM[consonant_1.place] - CONSONANT_PLACE_NUM[consonant_2.place]
227 | #     )
228 | #     dist_voiced = 1 if consonant_1.voiced != consonant_2.voiced else 0
229 | 
230 | #     return dist_type + dist_place + dist_voiced
231 | 


--------------------------------------------------------------------------------
/gruut_ipa/constants.py:
--------------------------------------------------------------------------------
  1 | """Enums, vowels, and consonants for gruut-ipa"""
  2 | import typing
  3 | import unicodedata
  4 | from dataclasses import dataclass
  5 | from enum import Enum
  6 | from pathlib import Path
  7 | 
  8 | _DIR = Path(__file__).parent
  9 | 
 10 | _DATA_DIR = _DIR / "data"
 11 | 
 12 | LANG_ALIASES = {
 13 |     "ar": "ar",
 14 |     "cs": "cs-cz",
 15 |     "de": "de-de",
 16 |     "en": "en-us",
 17 |     "es": "es-es",
 18 |     "fa": "fa",
 19 |     "fr": "fr-fr",
 20 |     "it": "it-it",
 21 |     "nl": "nl",
 22 |     "pt-br": "pt",
 23 |     "ru": "ru-ru",
 24 |     "sv": "sv-se",
 25 |     "sw": "sw",
 26 | }
 27 | 
 28 | 
 29 | class IPA(str, Enum):
 30 |     """International phonetic alphabet characters"""
 31 | 
 32 |     STRESS_PRIMARY = "\u02C8"  # ˈ
 33 |     STRESS_SECONDARY = "\u02CC"  # ˌ
 34 | 
 35 |     ACCENT_ACUTE = "'"
 36 |     ACCENT_GRAVE = "²"
 37 | 
 38 |     LONG = "\u02D0"  # ː
 39 |     HALF_LONG = "\u02D1"  # eˑ
 40 |     EXTRA_SHORT = "\u0306"  # ə̆
 41 |     NASAL = "\u0303"  # ẽ
 42 |     RAISED = "\u031D"  # r̝
 43 |     TIE_ABOVE = "\u0361"  # ͡
 44 |     TIE_BELOW = "\u035C"  # ͜
 45 | 
 46 |     SYLLABIC = "\u0329"
 47 |     NON_SYLLABIC = "\u032F"
 48 | 
 49 |     BREAK_SYLLABLE = "."
 50 |     BREAK_MINOR = "|"
 51 |     BREAK_MAJOR = "\u2016"  # ‖
 52 |     BREAK_WORD = "#"
 53 | 
 54 |     INTONATION_RISING = "\u2197"  # ↗
 55 |     INTONATION_FALLING = "\u2198"  # ↘
 56 | 
 57 |     TONE_1 = "¹"
 58 |     TONE_2 = "²"
 59 |     TONE_3 = "³"
 60 |     TONE_4 = "⁴"
 61 |     TONE_5 = "⁵"
 62 |     TONE_6 = "⁶"
 63 |     TONE_7 = "⁷"
 64 |     TONE_8 = "⁸"
 65 |     TONE_9 = "⁹"
 66 | 
 67 |     TONE_EXTRA_HIGH = "˥"
 68 |     TONE_HIGH = "˦"
 69 |     TONE_MID = "˧"
 70 |     TONE_LOW = "˨"
 71 |     TONE_EXTRA_LOW = "˩"
 72 | 
 73 |     TONE_GLOTTALIZED = "ˀ"
 74 |     TONE_SHORT = "ʔ"
 75 | 
 76 |     BRACKET_PHONETIC_LEFT = "["
 77 |     BRACKET_PHONETIC_RIGHT = "]"
 78 |     BRACKET_PHONEMIC_LEFT = "/"
 79 |     BRACKET_PHONEMIC_RIGHT = "/"
 80 |     BRACKET_PROSODIC_LEFT = "{"
 81 |     BRACKET_PROSODIC_RIGHT = "}"
 82 |     BRACKET_OPTIONAL_LEFT = "("
 83 |     BRACKET_OPTIONAL_RIGHT = ")"
 84 | 
 85 |     @staticmethod
 86 |     def is_long(codepoint: str) -> bool:
 87 |         """True if elongated symbol"""
 88 |         return codepoint == IPA.LONG
 89 | 
 90 |     @staticmethod
 91 |     def is_nasal(codepoint: str) -> bool:
 92 |         """True if nasalated diacritic"""
 93 |         return codepoint == IPA.NASAL
 94 | 
 95 |     @staticmethod
 96 |     def is_raised(codepoint: str) -> bool:
 97 |         """True if rased diacritic"""
 98 |         return codepoint == IPA.RAISED
 99 | 
100 |     @staticmethod
101 |     def is_stress(codepoint: str) -> bool:
102 |         """True if primary/secondary stress symbol"""
103 |         return codepoint in (IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY)
104 | 
105 |     @staticmethod
106 |     def is_accent(codepoint: str) -> bool:
107 |         """True if accent symbol"""
108 |         return codepoint in {IPA.ACCENT_ACUTE, IPA.ACCENT_GRAVE}
109 | 
110 |     @staticmethod
111 |     def is_tie(codepoint: str) -> bool:
112 |         """True if above/below tie symbol"""
113 |         return codepoint in (IPA.TIE_ABOVE, IPA.TIE_BELOW)
114 | 
115 |     @staticmethod
116 |     def is_bracket(codepoint: str) -> bool:
117 |         """True if any IPA bracket symbol"""
118 |         return codepoint in {
119 |             IPA.BRACKET_PHONETIC_LEFT,
120 |             IPA.BRACKET_PHONETIC_RIGHT,
121 |             IPA.BRACKET_PHONEMIC_LEFT,
122 |             IPA.BRACKET_PHONEMIC_RIGHT,
123 |             IPA.BRACKET_PROSODIC_LEFT,
124 |             IPA.BRACKET_PROSODIC_RIGHT,
125 |             IPA.BRACKET_OPTIONAL_LEFT,
126 |             IPA.BRACKET_OPTIONAL_RIGHT,
127 |         }
128 | 
129 |     @staticmethod
130 |     def is_break(codepoint: str) -> bool:
131 |         """True if any IPA break symbol"""
132 |         return codepoint in {
133 |             IPA.BREAK_SYLLABLE,
134 |             IPA.BREAK_MINOR,
135 |             IPA.BREAK_MAJOR,
136 |             IPA.BREAK_WORD,
137 |         }
138 | 
139 |     @staticmethod
140 |     def is_intonation(codepoint: str) -> bool:
141 |         """True if a rising or falling IPA intonation symbol"""
142 |         return codepoint in {IPA.INTONATION_RISING, IPA.INTONATION_FALLING}
143 | 
144 |     @staticmethod
145 |     def is_tone(codepoint: str) -> bool:
146 |         """True if any IPA tone symbol"""
147 |         return codepoint in {
148 |             IPA.TONE_1,
149 |             IPA.TONE_2,
150 |             IPA.TONE_3,
151 |             IPA.TONE_4,
152 |             IPA.TONE_5,
153 |             IPA.TONE_6,
154 |             IPA.TONE_7,
155 |             IPA.TONE_8,
156 |             IPA.TONE_9,
157 |             IPA.TONE_EXTRA_HIGH,
158 |             IPA.TONE_HIGH,
159 |             IPA.TONE_MID,
160 |             IPA.TONE_LOW,
161 |             IPA.TONE_EXTRA_LOW,
162 |         }
163 | 
164 |     @staticmethod
165 |     def graphemes(codepoints: str) -> typing.List[str]:
166 |         """Split a string into graphemes"""
167 |         codepoints = unicodedata.normalize("NFD", codepoints)
168 | 
169 |         graphemes = []
170 |         grapheme = ""
171 | 
172 |         for c in codepoints:
173 |             if unicodedata.combining(c) > 0:
174 |                 grapheme += c
175 |             elif grapheme:
176 |                 # Next grapheme
177 |                 graphemes.append(unicodedata.normalize("NFC", grapheme))
178 |                 grapheme = c
179 |             else:
180 |                 # Start of grapheme
181 |                 grapheme = c
182 | 
183 |         if grapheme:
184 |             # Final grapheme
185 |             graphemes.append(unicodedata.normalize("NFC", grapheme))
186 | 
187 |         return graphemes
188 | 
189 |     @staticmethod
190 |     def without_stress(codepoints: str, drop_accent: bool = True) -> str:
191 |         """Return string without primary/secondary stress"""
192 |         return "".join(
193 |             c
194 |             for c in codepoints
195 |             if (not IPA.is_stress(c) and (not drop_accent or not IPA.is_accent(c)))
196 |         )
197 | 
198 | 
199 | class Stress(str, Enum):
200 |     """Applied stress"""
201 | 
202 |     SECONDARY = "secondary"
203 |     PRIMARY = "primary"
204 | 
205 | 
206 | class Accent(str, Enum):
207 |     """Applied accent"""
208 | 
209 |     ACUTE = "acute"  # '
210 |     GRAVE = "grave"  # ²
211 | 
212 | 
213 | class BreakType(str, Enum):
214 |     """Type of break"""
215 | 
216 |     WORD = "word"  # '#'
217 |     MINOR = "minor"  # |
218 |     MAJOR = "major"  # ‖
219 | 
220 | 
221 | class PhonemeLength(str, Enum):
222 |     """Spoken length of a phoneme"""
223 | 
224 |     SHORT = "short"  # ˑ
225 |     NORMAL = "normal"
226 |     LONG = "long"  # ː
227 | 
228 | 
229 | # -----------------------------------------------------------------------------
230 | 
231 | 
232 | class VowelHeight(str, Enum):
233 |     """Height of a vowel"""
234 | 
235 |     CLOSE = "close"
236 |     NEAR_CLOSE = "near-close"
237 |     CLOSE_MID = "close-mid"
238 |     MID = "mid"
239 |     OPEN_MID = "open-mid"
240 |     NEAR_OPEN = "near-open"
241 |     OPEN = "open"
242 | 
243 | 
244 | class VowelPlacement(str, Enum):
245 |     """Front/back placement of a vowel"""
246 | 
247 |     FRONT = "front"
248 |     NEAR_FRONT = "near-front"
249 |     CENTRAL = "central"
250 |     NEAR_BACK = "near-back"
251 |     BACK = "back"
252 | 
253 | 
254 | @dataclass
255 | class Vowel:
256 |     """Necessary information for a vowel"""
257 | 
258 |     ipa: str
259 |     height: VowelHeight
260 |     placement: VowelPlacement
261 |     rounded: bool
262 |     nasalated: bool = False
263 |     stress: typing.Optional[Stress] = None
264 |     length: PhonemeLength = PhonemeLength.NORMAL
265 |     alias_of: typing.Optional[str] = None
266 | 
267 | 
268 | # -----------------------------------------------------------------
269 | # Vowels        Front    Near-Front    Central    Near-Back    Back
270 | # -----------------------------------------------------------------
271 | # Close         i/y                    ɨ/ʉ                     ɯ/u
272 | # Near-Close             ɪ/ʏ                      ʊ
273 | # Close-Mid     e/ø                    ɘ/ɵ                     ɤ/o
274 | # Mid                                  ə
275 | # Open-Mid      ɛ/œ                    ɜ/ɞ                     ʌ/ɔ
276 | # Near-Open     æ                      ɐ
277 | # Open          a/ɶ                                            ɑ/ɒ
278 | # -----------------------------------------------------------------
279 | 
280 | 
281 | _VOWELS = [
282 |     Vowel("i", VowelHeight.CLOSE, VowelPlacement.FRONT, False),
283 |     Vowel("y", VowelHeight.CLOSE, VowelPlacement.FRONT, True),
284 |     Vowel("ɨ", VowelHeight.CLOSE, VowelPlacement.CENTRAL, False),
285 |     Vowel("ᵻ", VowelHeight.CLOSE, VowelPlacement.CENTRAL, False, alias_of="ɨ"),
286 |     Vowel("ʉ", VowelHeight.CLOSE, VowelPlacement.CENTRAL, True),
287 |     Vowel("ɯ", VowelHeight.CLOSE, VowelPlacement.BACK, False),
288 |     Vowel("u", VowelHeight.CLOSE, VowelPlacement.BACK, True),
289 |     #
290 |     Vowel("ɪ", VowelHeight.NEAR_CLOSE, VowelPlacement.NEAR_FRONT, False),
291 |     Vowel("ʏ", VowelHeight.NEAR_CLOSE, VowelPlacement.NEAR_FRONT, True),
292 |     Vowel("ʊ", VowelHeight.NEAR_CLOSE, VowelPlacement.NEAR_BACK, True),
293 |     #
294 |     Vowel("e", VowelHeight.CLOSE_MID, VowelPlacement.FRONT, False),
295 |     Vowel("ẽ", VowelHeight.CLOSE_MID, VowelPlacement.FRONT, False, nasalated=True),
296 |     Vowel("ø", VowelHeight.CLOSE_MID, VowelPlacement.FRONT, True),
297 |     Vowel("ɘ", VowelHeight.CLOSE_MID, VowelPlacement.CENTRAL, False),
298 |     Vowel("ɵ", VowelHeight.CLOSE_MID, VowelPlacement.CENTRAL, True),
299 |     Vowel("ɤ", VowelHeight.CLOSE_MID, VowelPlacement.BACK, False),
300 |     Vowel("o", VowelHeight.CLOSE_MID, VowelPlacement.BACK, True),
301 |     #
302 |     # Represented as a schwa too
303 |     Vowel("ə", VowelHeight.MID, VowelPlacement.CENTRAL, False),
304 |     #
305 |     Vowel("ɛ", VowelHeight.OPEN_MID, VowelPlacement.FRONT, False),
306 |     Vowel("œ", VowelHeight.OPEN_MID, VowelPlacement.FRONT, True),
307 |     Vowel("ɜ", VowelHeight.OPEN_MID, VowelPlacement.CENTRAL, False),
308 |     Vowel("ɞ", VowelHeight.OPEN_MID, VowelPlacement.CENTRAL, True),
309 |     Vowel("ʌ", VowelHeight.OPEN_MID, VowelPlacement.BACK, False),
310 |     Vowel("ɔ", VowelHeight.OPEN_MID, VowelPlacement.BACK, True),
311 |     Vowel("ɔ̃", VowelHeight.OPEN_MID, VowelPlacement.BACK, True, nasalated=True),
312 |     #
313 |     Vowel("æ", VowelHeight.NEAR_OPEN, VowelPlacement.FRONT, False),
314 |     Vowel("ɐ", VowelHeight.NEAR_OPEN, VowelPlacement.CENTRAL, False),
315 |     #
316 |     Vowel("a", VowelHeight.OPEN, VowelPlacement.FRONT, False),
317 |     Vowel("ã", VowelHeight.OPEN, VowelPlacement.FRONT, False, nasalated=True),
318 |     Vowel("ɶ", VowelHeight.OPEN, VowelPlacement.FRONT, True),
319 |     Vowel("ɑ", VowelHeight.OPEN, VowelPlacement.BACK, False),
320 |     Vowel("ɒ", VowelHeight.OPEN, VowelPlacement.BACK, True),
321 | ]
322 | 
323 | VOWELS = {v.ipa: v for v in _VOWELS}
324 | 
325 | # -----------------------------------------------------------------------------
326 | 
327 | 
328 | @dataclass
329 | class Dipthong:
330 |     """Combination of two vowels"""
331 | 
332 |     vowel1: Vowel
333 |     vowel2: Vowel
334 | 
335 | 
336 | # -----------------------------------------------------------------------------
337 | 
338 | 
339 | @dataclass
340 | class Schwa:
341 |     """Vowel-like sound"""
342 | 
343 |     ipa: str
344 |     r_coloured: bool
345 |     length: PhonemeLength = PhonemeLength.NORMAL
346 |     alias_of: typing.Optional[str] = None
347 | 
348 | 
349 | _SCHWAS = [Schwa("ə", False), Schwa("ɚ", True), Schwa("ɝ", True, alias_of="ɚ")]
350 | 
351 | SCHWAS = {s.ipa: s for s in _SCHWAS}
352 | 
353 | # -----------------------------------------------------------------------------
354 | 
355 | 
356 | class ConsonantType(str, Enum):
357 |     """Type of a consonant"""
358 | 
359 |     NASAL = "nasal"
360 |     PLOSIVE = "plosive"
361 |     AFFRICATE = "affricate"
362 |     FRICATIVE = "fricative"
363 |     APPROXIMANT = "approximant"
364 |     FLAP = "flap"
365 |     TRILL = "trill"
366 |     LATERAL_APPROXIMANT = "lateral-approximant"
367 | 
368 | 
369 | class ConsonantPlace(str, Enum):
370 |     """Place of articulation"""
371 | 
372 |     BILABIAL = "bilabial"
373 |     LABIO_DENTAL = "labio-dental"
374 |     DENTAL = "dental"
375 |     ALVEOLAR = "alveolar"
376 |     POST_ALVEOLAR = "post-alveolar"
377 |     RETROFLEX = "retroflex"
378 |     PALATAL = "palatal"
379 |     VELAR = "velar"
380 |     UVULAR = "uvular"
381 |     PHARYNGEAL = "pharyngeal"
382 |     GLOTTAL = "glottal"
383 | 
384 | 
385 | class ConsonantSoundsLike(str, Enum):
386 |     """Class of sounds this consonant is similar to"""
387 | 
388 |     NONE = ""
389 |     R = "r"
390 |     G = "g"
391 |     L = "l"
392 | 
393 | 
394 | @dataclass
395 | class Consonant:
396 |     """Necessary information for a consonant"""
397 | 
398 |     ipa: str
399 |     type: ConsonantType
400 |     place: ConsonantPlace
401 |     voiced: bool
402 |     velarized: bool = False
403 |     sounds_like: ConsonantSoundsLike = ConsonantSoundsLike.NONE
404 |     length: PhonemeLength = PhonemeLength.NORMAL
405 |     alias_of: typing.Optional[str] = None
406 | 
407 | 
408 | # --------------------------------------------------------------------------------------------------------------------------------------------
409 | # Type         Bilabial    Labiodental    Dental    Alveolar    Postalveolar    Retroflex  Palatal    Velar    Uvular    Pharyngeal    Glottal
410 | # --------------------------------------------------------------------------------------------------------------------------------------------
411 | # Nasal        m           ɱ                        n                           ɳ          ɲ          ŋ        ɴ
412 | # Plosive      p/b                                  t/d                         ʈ/ɖ        c/ɟ        k/ɡ      q/ɢ       ʡ             ʔ
413 | # Affricate                p͡f/b͡v          t̪͡s̪/b͡v̪     t͡s/d͡z       t͡ʃ/d͡ʒ           ʈ͡ʂ/ɖ͡ʐ      t͡ɕ/d͡ʑ      k͡x
414 | # Fricative    ɸ/β         f/v            θ/ð       s/z         ʃ/ʒ             ʂ/ʐ        ç/ʝ        x/ɣ      χ/ʁ       ħ             h ɦ
415 | # Approximant  w           ʋ                        ɹ                           ɻ          j          ɰ
416 | # Flap                     ⱱ                        ɾ                           ɽ
417 | # Trill        ʙ                                    r                                                          ʀ
418 | # Lateral App                                       l                           ɭ          ʎ          ʟ
419 | # --------------------------------------------------------------------------------------------------------------------------------------------
420 | 
421 | _CONSONANTS = [
422 |     Consonant("m", ConsonantType.NASAL, ConsonantPlace.BILABIAL, True),
423 |     Consonant("ɱ", ConsonantType.NASAL, ConsonantPlace.LABIO_DENTAL, True),
424 |     Consonant("n", ConsonantType.NASAL, ConsonantPlace.ALVEOLAR, True),
425 |     Consonant("ɳ", ConsonantType.NASAL, ConsonantPlace.RETROFLEX, True),
426 |     Consonant("ɲ", ConsonantType.NASAL, ConsonantPlace.PALATAL, True),
427 |     Consonant("ŋ", ConsonantType.NASAL, ConsonantPlace.VELAR, True),
428 |     Consonant("ɴ", ConsonantType.NASAL, ConsonantPlace.UVULAR, True),
429 |     #
430 |     Consonant("p", ConsonantType.PLOSIVE, ConsonantPlace.BILABIAL, False),
431 |     Consonant("b", ConsonantType.PLOSIVE, ConsonantPlace.BILABIAL, True),
432 |     Consonant("t", ConsonantType.PLOSIVE, ConsonantPlace.ALVEOLAR, False),
433 |     Consonant("d", ConsonantType.PLOSIVE, ConsonantPlace.ALVEOLAR, True),
434 |     Consonant("ʈ", ConsonantType.PLOSIVE, ConsonantPlace.RETROFLEX, False),
435 |     Consonant("ɖ", ConsonantType.PLOSIVE, ConsonantPlace.RETROFLEX, True),
436 |     Consonant("c", ConsonantType.PLOSIVE, ConsonantPlace.PALATAL, False),
437 |     Consonant("ɟ", ConsonantType.PLOSIVE, ConsonantPlace.PALATAL, True),
438 |     Consonant("k", ConsonantType.PLOSIVE, ConsonantPlace.VELAR, False),
439 |     Consonant(
440 |         "ɡ",
441 |         ConsonantType.PLOSIVE,
442 |         ConsonantPlace.VELAR,
443 |         True,
444 |         sounds_like=ConsonantSoundsLike.G,
445 |     ),
446 |     Consonant(
447 |         "g",
448 |         ConsonantType.PLOSIVE,
449 |         ConsonantPlace.VELAR,
450 |         True,
451 |         sounds_like=ConsonantSoundsLike.G,
452 |         alias_of="ɡ",
453 |     ),
454 |     Consonant(
455 |         "q",
456 |         ConsonantType.PLOSIVE,
457 |         ConsonantPlace.UVULAR,
458 |         False,
459 |         sounds_like=ConsonantSoundsLike.G,
460 |     ),
461 |     Consonant(
462 |         "ɢ",
463 |         ConsonantType.PLOSIVE,
464 |         ConsonantPlace.UVULAR,
465 |         True,
466 |         sounds_like=ConsonantSoundsLike.G,
467 |     ),
468 |     Consonant("ʡ", ConsonantType.PLOSIVE, ConsonantPlace.PHARYNGEAL, False),
469 |     Consonant("ʔ", ConsonantType.PLOSIVE, ConsonantPlace.GLOTTAL, False),
470 |     #
471 |     Consonant("p͡f", ConsonantType.AFFRICATE, ConsonantPlace.LABIO_DENTAL, False),
472 |     Consonant("b͡v", ConsonantType.AFFRICATE, ConsonantPlace.LABIO_DENTAL, True),
473 |     Consonant("t̪͡s", ConsonantType.AFFRICATE, ConsonantPlace.DENTAL, False),
474 |     Consonant("b͡v", ConsonantType.AFFRICATE, ConsonantPlace.DENTAL, True),
475 |     Consonant("t͡s", ConsonantType.AFFRICATE, ConsonantPlace.ALVEOLAR, False),
476 |     Consonant("d͡z", ConsonantType.AFFRICATE, ConsonantPlace.ALVEOLAR, True),
477 |     Consonant("t͡ʃ", ConsonantType.AFFRICATE, ConsonantPlace.POST_ALVEOLAR, False),
478 |     Consonant("d͡ʒ", ConsonantType.AFFRICATE, ConsonantPlace.POST_ALVEOLAR, True),
479 |     Consonant("ʈ͡ʂ", ConsonantType.AFFRICATE, ConsonantPlace.RETROFLEX, False),
480 |     Consonant("ɖ͡ʐ", ConsonantType.AFFRICATE, ConsonantPlace.RETROFLEX, True),
481 |     Consonant("t͡ɕ", ConsonantType.AFFRICATE, ConsonantPlace.PALATAL, False),
482 |     Consonant("d͡ʑ", ConsonantType.AFFRICATE, ConsonantPlace.PALATAL, True),
483 |     Consonant("k͡x", ConsonantType.AFFRICATE, ConsonantPlace.VELAR, False),
484 |     #
485 |     Consonant("ɸ", ConsonantType.FRICATIVE, ConsonantPlace.BILABIAL, False),
486 |     Consonant("β", ConsonantType.FRICATIVE, ConsonantPlace.BILABIAL, True),
487 |     Consonant("f", ConsonantType.FRICATIVE, ConsonantPlace.LABIO_DENTAL, False),
488 |     Consonant("v", ConsonantType.FRICATIVE, ConsonantPlace.LABIO_DENTAL, True),
489 |     Consonant("θ", ConsonantType.FRICATIVE, ConsonantPlace.DENTAL, False),
490 |     Consonant("ð", ConsonantType.FRICATIVE, ConsonantPlace.DENTAL, True),
491 |     Consonant("s", ConsonantType.FRICATIVE, ConsonantPlace.ALVEOLAR, False),
492 |     Consonant("z", ConsonantType.FRICATIVE, ConsonantPlace.ALVEOLAR, True),
493 |     Consonant("ʃ", ConsonantType.FRICATIVE, ConsonantPlace.POST_ALVEOLAR, False),
494 |     Consonant("ʒ", ConsonantType.FRICATIVE, ConsonantPlace.POST_ALVEOLAR, True),
495 |     Consonant("ʂ", ConsonantType.FRICATIVE, ConsonantPlace.RETROFLEX, False),
496 |     Consonant("ʐ", ConsonantType.FRICATIVE, ConsonantPlace.RETROFLEX, True),
497 |     Consonant("ç", ConsonantType.FRICATIVE, ConsonantPlace.PALATAL, False),
498 |     Consonant(
499 |         "ʝ", ConsonantType.FRICATIVE, ConsonantPlace.PALATAL, False, alias_of="ç"
500 |     ),
501 |     Consonant("ʐ", ConsonantType.FRICATIVE, ConsonantPlace.PALATAL, True),
502 |     Consonant("x", ConsonantType.FRICATIVE, ConsonantPlace.VELAR, False),
503 |     Consonant("ɣ", ConsonantType.FRICATIVE, ConsonantPlace.VELAR, True),
504 |     Consonant("χ", ConsonantType.FRICATIVE, ConsonantPlace.UVULAR, False),
505 |     Consonant(
506 |         "ʁ",
507 |         ConsonantType.FRICATIVE,
508 |         ConsonantPlace.UVULAR,
509 |         True,
510 |         sounds_like=ConsonantSoundsLike.R,
511 |     ),
512 |     Consonant("ħ", ConsonantType.FRICATIVE, ConsonantPlace.PHARYNGEAL, False),
513 |     Consonant("h", ConsonantType.FRICATIVE, ConsonantPlace.GLOTTAL, False),
514 |     Consonant("ɦ", ConsonantType.FRICATIVE, ConsonantPlace.GLOTTAL, True),
515 |     #
516 |     Consonant("w", ConsonantType.APPROXIMANT, ConsonantPlace.BILABIAL, True),
517 |     Consonant("ʋ", ConsonantType.APPROXIMANT, ConsonantPlace.LABIO_DENTAL, True),
518 |     Consonant(
519 |         "ɹ",
520 |         ConsonantType.APPROXIMANT,
521 |         ConsonantPlace.ALVEOLAR,
522 |         True,
523 |         sounds_like=ConsonantSoundsLike.R,
524 |     ),
525 |     Consonant(
526 |         "ɻ",
527 |         ConsonantType.APPROXIMANT,
528 |         ConsonantPlace.RETROFLEX,
529 |         True,
530 |         sounds_like=ConsonantSoundsLike.R,
531 |     ),
532 |     Consonant("j", ConsonantType.APPROXIMANT, ConsonantPlace.PALATAL, True),
533 |     Consonant("ɰ", ConsonantType.APPROXIMANT, ConsonantPlace.VELAR, True),
534 |     #
535 |     Consonant("ⱱ", ConsonantType.FLAP, ConsonantPlace.LABIO_DENTAL, True),
536 |     Consonant(
537 |         "ɾ",
538 |         ConsonantType.FLAP,
539 |         ConsonantPlace.ALVEOLAR,
540 |         True,
541 |         sounds_like=ConsonantSoundsLike.R,
542 |     ),
543 |     Consonant(
544 |         "ɽ",
545 |         ConsonantType.FLAP,
546 |         ConsonantPlace.RETROFLEX,
547 |         True,
548 |         sounds_like=ConsonantSoundsLike.R,
549 |     ),
550 |     #
551 |     Consonant("ʙ", ConsonantType.TRILL, ConsonantPlace.BILABIAL, True),
552 |     Consonant(
553 |         "r",
554 |         ConsonantType.TRILL,
555 |         ConsonantPlace.ALVEOLAR,
556 |         True,
557 |         sounds_like=ConsonantSoundsLike.R,
558 |     ),
559 |     Consonant(
560 |         "ʀ",
561 |         ConsonantType.TRILL,
562 |         ConsonantPlace.UVULAR,
563 |         True,
564 |         sounds_like=ConsonantSoundsLike.R,
565 |     ),
566 |     #
567 |     Consonant(
568 |         "l",
569 |         ConsonantType.LATERAL_APPROXIMANT,
570 |         ConsonantPlace.ALVEOLAR,
571 |         True,
572 |         sounds_like=ConsonantSoundsLike.L,
573 |     ),
574 |     Consonant(
575 |         "ɫ",
576 |         ConsonantType.LATERAL_APPROXIMANT,
577 |         ConsonantPlace.ALVEOLAR,
578 |         True,
579 |         velarized=True,
580 |         sounds_like=ConsonantSoundsLike.L,
581 |     ),
582 |     Consonant(
583 |         "ɭ",
584 |         ConsonantType.LATERAL_APPROXIMANT,
585 |         ConsonantPlace.RETROFLEX,
586 |         True,
587 |         sounds_like=ConsonantSoundsLike.L,
588 |     ),
589 |     Consonant("ʎ", ConsonantType.LATERAL_APPROXIMANT, ConsonantPlace.PALATAL, True),
590 |     Consonant(
591 |         "ʟ",
592 |         ConsonantType.LATERAL_APPROXIMANT,
593 |         ConsonantPlace.VELAR,
594 |         True,
595 |         sounds_like=ConsonantSoundsLike.L,
596 |     ),
597 | ]
598 | 
599 | CONSONANTS = {c.ipa: c for c in _CONSONANTS}
600 | 
601 | # -----------------------------------------------------------------------------
602 | 
603 | 
604 | @dataclass
605 | class Break:
606 |     """IPA break/boundary"""
607 | 
608 |     type: BreakType
609 |     text: str = ""
610 | 
611 |     def __post_init__(self):
612 |         if self.type == BreakType.MINOR:
613 |             self.text = IPA.BREAK_MINOR
614 |         elif self.type == BreakType.MAJOR:
615 |             self.text = IPA.BREAK_MAJOR
616 |         elif self.type == BreakType.WORD:
617 |             self.text = IPA.BREAK_WORD
618 |         else:
619 |             raise ValueError(f"Unrecognized break type: {type}")
620 | 
621 |     @staticmethod
622 |     def from_string(break_str: str) -> "Break":
623 |         """Parse break from string"""
624 |         if break_str == IPA.BREAK_MINOR:
625 |             break_type = BreakType.MINOR
626 |         elif break_str == IPA.BREAK_MAJOR:
627 |             break_type = BreakType.MAJOR
628 |         elif break_str == IPA.BREAK_WORD:
629 |             break_type = BreakType.WORD
630 |         else:
631 |             raise ValueError(f"Unrecognized break type: {break_str}")
632 | 
633 |         return Break(break_type)
634 | 
635 | 
636 | class Intonation:
637 |     """IPA rising/falling intonation"""
638 | 
639 |     def __init__(self, rising: bool):
640 |         self.rising = rising
641 | 
642 |         if self.rising:
643 |             self.text = IPA.INTONATION_RISING
644 |         else:
645 |             self.text = IPA.INTONATION_FALLING
646 | 
647 |     def __repr__(self) -> str:
648 |         return self.text
649 | 
650 |     @staticmethod
651 |     def from_string(intonation_str: str) -> "Intonation":
652 |         """Parse intonation from string"""
653 |         if intonation_str == IPA.INTONATION_RISING:
654 |             rising = True
655 |         elif intonation_str == IPA.INTONATION_FALLING:
656 |             rising = False
657 |         else:
658 |             raise ValueError(f"Unrecognized intonation type: {intonation_str}")
659 | 
660 |         return Intonation(rising)
661 | 
662 | 
663 | # -----------------------------------------------------------------------------
664 | 
665 | 
666 | FEATURE_EMPTY = "NONE"
667 | 
668 | FEATURE_COLUMNS: typing.Dict[str, typing.List[str]] = {
669 |     "symbol_type": ["phoneme", "break"],
670 |     "phoneme_type": [FEATURE_EMPTY, "vowel", "consonant", "schwa"],
671 |     "break_type": [FEATURE_EMPTY] + [v.value for v in BreakType],
672 |     "diacritic": [FEATURE_EMPTY, "nasalated", "velarized"],
673 |     "vowel_height": [FEATURE_EMPTY] + [v.value for v in VowelHeight],
674 |     "vowel_place": [FEATURE_EMPTY] + [v.value for v in VowelPlacement],
675 |     "vowel_rounded": [FEATURE_EMPTY, "rounded", "unrounded"],
676 |     "vowel_stress": [FEATURE_EMPTY] + [v.value for v in Stress],
677 |     "consonant_voiced": [FEATURE_EMPTY, "voiced", "unvoiced"],
678 |     "consonant_type": [FEATURE_EMPTY] + [v.value for v in ConsonantType],
679 |     "consonant_place": [FEATURE_EMPTY] + [v.value for v in ConsonantPlace],
680 |     "consonant_sounds_like": [FEATURE_EMPTY, "r", "l", "g", ""],
681 |     "phoneme_length": [FEATURE_EMPTY] + [v.value for v in PhonemeLength],
682 | }
683 | 
684 | FEATURE_ORDINAL_COLUMNS: typing.Set[str] = {
685 |     "vowel_height",
686 |     "vowel_place",
687 |     "vowel_stress",
688 |     "consonant_type",
689 |     "consonant_place",
690 |     "break_type",
691 |     "phoneme_length",
692 | }
693 | 
694 | 
695 | def _make_feature_keys() -> typing.Mapping[str, typing.Union[int, slice]]:
696 |     """Create mapping from feature column name to vector index (ordinal) or slice (one-hot)"""
697 |     feature_keys: typing.Dict[str, typing.Union[int, slice]] = {}
698 |     offset = 0
699 |     for feature_col, feature_values in FEATURE_COLUMNS.items():
700 |         if feature_col in FEATURE_ORDINAL_COLUMNS:
701 |             feature_keys[feature_col] = offset
702 |             offset += 1
703 |         else:
704 |             feature_keys[feature_col] = slice(offset, offset + len(feature_values))
705 |             offset += len(feature_values)
706 | 
707 |     return feature_keys
708 | 
709 | 
710 | FEATURE_KEYS = _make_feature_keys()
711 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/ar/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Arabic_phonology
 2 | 
 3 | # Vowels
 4 | i [] ɪ e e̞ ɛ
 5 | u [] ʊ o o̞ ɔ
 6 | iː
 7 | uː
 8 | a
 9 | aː
10 | 
11 | # Consonants
12 | m
13 | n
14 | t
15 | tˤ
16 | tˤˤ
17 | k
18 | q
19 | ʔ
20 | b
21 | d
22 | dˤ
23 | dˤˤ
24 | d͡ʒ [] dʒ dʒʒ
25 | f
26 | θ
27 | s
28 | sˤ
29 | sˤˤ
30 | ʃ
31 | x
32 | ħ
33 | h
34 | ð
35 | z
36 | ðˤ
37 | ðˤˤ
38 | ɣ
39 | ʕ
40 | l
41 | ɫ
42 | j
43 | w
44 | r
45 | 
46 | # Dipthongs
47 | aw
48 | aj
49 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/cs-cz/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Czech_phonology
 2 | 
 3 | # Short vowels
 4 | a p[a]k æ ɑ
 5 | ɛ z[e] ,e[^u̯] ə
 6 | ɪ tad[y] ,i[^ː] y ɪ̯
 7 | u tom[u] ʊ ʌ ʊ
 8 | o ah[o]j ɔ ᴐ ɒ
 9 | 
10 | # Long vowels
11 | ɛː žádn[é] eː
12 | iː ř[í]kala ɪː
13 | aː n[á]s ɑː
14 | uː p[ů]jdu ʊː ɐ
15 | oː žener[ó]zní ɔː
16 | 
17 | # Dipthongs
18 | au̯ [au]gur aʊ̯ au̯
19 | eu̯ [eu]ro ɛʊ̯ ɛu̯
20 | ou̯ form[ou] ɔʊ̯ ɔu̯
21 | 
22 | # Consonants
23 | r p[r]o ɹ ɾ rː r̩
24 | r̝ řád r̝̊
25 | t [t]oho
26 | k pa[k]
27 | l pů[l] l̩
28 | s je[s]tli
29 | n d[n]eska ŋ nː n̩
30 | v [v]lastně
31 | m [m]u ɱ m̩
32 | p [p]ůl
33 | d [d]alší
34 | j aho[j] ʲ
35 | b [b]yla
36 | ɲ o[n]i
37 | z předcho[z]í
38 | t͡s do[c]ela
39 | t͡ʃ [č]lověk
40 | ʃ ne[ž]
41 | ɦ dlou[h]o h
42 | f dou[f]ám
43 | c [t]ím
44 | x by[ch]om
45 | ɡ ně[k]do g
46 | d͡ʒ min[dž]a
47 | ʒ mů[ž]ete
48 | ɟ li[d]i
49 | ʔ [u]vedl ˀ
50 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/de-de/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # Vowels
 2 | a d[a]s ɑ
 3 | aː j[a]hr ɑː
 4 | ɛ w[e]nn
 5 | ə ein[e] ɘ
 6 | ɐ od[er] ɐ̯
 7 | ɛː k[ä]se
 8 | eː g[e]gen e
 9 | ɪ w[i]rd ĭ
10 | iː d[ie] i
11 | ɔ d[o]ch
12 | oː w[o] o ɔu ɔʊ̯
13 | œ k[ö]nnen
14 | øː l[ös]en
15 | ʊ m[u]ss
16 | uː g[u]t u
17 | ʏ m[ü]cke
18 | yː f[ü]r y ʏː œ̃ː 
19 | 
20 | # Nasal Vowels from Loanwords
21 | ãː restaur[ant] ã ɑ̃
22 | õː sais[on]
23 | ɛ̃ː cous[in]
24 | 
25 | # Diphthongs
26 | ɔʏ̯ [eu]le ø
27 | aɪ̯ h[ei]m a͜i
28 | aʊ̯ h[au]s a͡ʊ a͜u
29 | 
30 | # Plosives
31 | p o[b]st
32 | b [b]itte
33 | t nich[t]
34 | d [d]er
35 | k [k]ann
36 | g [g]eht ɡ
37 | ʔ be[a]mter
38 | 
39 | # Nasal Consonants
40 | m [m]it m̩
41 | n ei[n] ɱ n̩
42 | ŋ la[ng] ŋ̩
43 | 
44 | # Fricatives
45 | f [v]on
46 | v [w]as
47 | s au[s]
48 | z [s]ie
49 | ʃ [sch]on
50 | ʒ [g]enie
51 | ç mi[ch] c
52 | x bu[ch]
53 | χ ba[ch]
54 | ʁ da[r]auf ɽ r ɾ ʀ
55 | h [h]ut
56 | 
57 | # Approximants
58 | j [j]a
59 | 
60 | # Lateral Approximants
61 | l a[l]s l̩
62 | 
63 | # Affricates
64 | p͡f [pf]erd pf
65 | t͡s [z]eit  ​t͡s t͜s
66 | t͡ʃ deu[tsch] ʧ
67 | d͡ʒ [dsch]ungel
68 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/el-gr/phonemes.txt:
--------------------------------------------------------------------------------
 1 | a π[α]ς
 2 | b [μ]πεσ
 3 | d [τ]ζιμ
 4 | δ [δ]εις
 5 | e π[ε]ς
 6 | f α[φ]ού
 7 | g [γ]κέι
 8 | ɣ αρ[γ]ά
 9 | x ει[χ]α
10 | ç ει[χ]ε
11 | i π[ει]ς
12 | ʝ [γ]ειά
13 | k [κ]ανω
14 | d͡z [δ]ικέ
15 | l α[λλ]η ʎ
16 | m [μ]αζι
17 | n ε[ν]ασ ɲ
18 | ŋ μά[γ]κα
19 | o π[ω]ς
20 | p [π]ήρα
21 | r κ[ρ]ισ ɾ ɾ̠ ɹ
22 | s βγε[σ] s̠
23 | t κα[τ]ω
24 | θ ηρ[θ]α
25 | t͡s ts ματσ
26 | u π[ου]
27 | v [β]άζω
28 | z [ζ]εισ z̠
29 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/en-gb/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # Normal vowels
 2 | ɒ f[a]ther
 3 | æ c[a]t a
 4 | e b[e]d ɛ
 5 | ɪ s[i]t
 6 | ɔ l[aw] 
 7 | ʊ p[u]t
 8 | ʌ r[u]n ɐ
 9 | 
10 | # Elognated vowels
11 | iː s[ee] i
12 | ɑː n[o]t
13 | uː s[oo]n u
14 | ɔː n[or]th
15 | ɜː n[ur]se
16 | 
17 | # Schwas
18 | ə [a]llow
19 | 
20 | # Dipthongs
21 | eɪ r[ai]se e eɪ̯
22 | aɪ r[i]ce aɪ̯
23 | əʊ kn[ow]
24 | ɔɪ n[oi]se
25 | aʊ h[ou]se
26 | 
27 | # Stops
28 | p [p]in
29 | b [b]ut
30 | t [t]on
31 | d [d]ot
32 | k [c]at
33 | ɡ [g]ive g
34 | 
35 | # Affricatives
36 | t͡ʃ [ch]in tʃ
37 | d͡ʒ [g]in dʒ
38 | 
39 | # Fricatives
40 | f [f]in
41 | v [v]im
42 | θ [th]in
43 | ð [th]is
44 | s [s]et
45 | z [z]ing
46 | ʃ [s]ure
47 | ʒ mea[sure]
48 | h [h]am
49 | 
50 | # Other consonants
51 | l [l]ong l̩ ɫ ʟ̩ l̩
52 | m [m]ock m̩
53 | n [kn]ock n̩
54 | ŋ thi[ng]
55 | ɹ [wr]ong r ɾ
56 | w [w]asp
57 | j [y]acht
58 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/en-us/cmudict/ipa_map.txt:
--------------------------------------------------------------------------------
 1 | AA ɑ
 2 | AA0 ɑ
 3 | AA1 ˈɑ
 4 | AA2 ˌɑ
 5 | AE æ
 6 | AE0 æ
 7 | AE1 ˈæ
 8 | AE2 ˌæ
 9 | AH ʌ
10 | AH0 ʌ
11 | AH1 ˈʌ
12 | AH2 ˌʌ
13 | AO ɔ
14 | AO0 ɔ
15 | AO1 ˈɔ
16 | AO2 ˌɔ
17 | AW aʊ
18 | AW0 aʊ
19 | AW1 ˈaʊ
20 | AW2 ˌaʊ
21 | AY aɪ
22 | AY0 aɪ
23 | AY1 ˈaɪ
24 | AY2 ˌaɪ
25 | B b
26 | CH t͡ʃ
27 | D d
28 | DH ð
29 | EH ɛ
30 | EH0 ɛ
31 | EH1 ˈɛ
32 | EH2 ˌɛ
33 | ER ɚ
34 | ER0 ɚ
35 | ER1 ˈɚ
36 | ER2 ˌɚ
37 | EY eɪ
38 | EY0 eɪ
39 | EY1 ˈeɪ
40 | EY2 ˌeɪ
41 | F f
42 | G ɡ
43 | HH h
44 | IH ɪ
45 | IH0 ɪ
46 | IH1 ˈɪ
47 | IH2 ˌɪ
48 | IY i
49 | IY0 i
50 | IY1 ˈi
51 | IY2 ˌi
52 | JH d͡ʒ
53 | K k
54 | L l
55 | M m
56 | N n
57 | NG ŋ
58 | OW oʊ
59 | OW0 oʊ
60 | OW1 ˈoʊ
61 | OW2 ˌoʊ
62 | OY ɔɪ
63 | OY0 ɔɪ
64 | OY1 ˈɔɪ
65 | OY2 ˌɔɪ
66 | P p
67 | R ɹ
68 | S s
69 | SH ʃ
70 | T t
71 | TH θ
72 | UH ʊ
73 | UH0 ʊ
74 | UH1 ˈʊ
75 | UH2 ˌʊ
76 | UW u
77 | UW0 u
78 | UW1 ˈu
79 | UW2 ˌu
80 | V v
81 | W w
82 | Y j
83 | Z z
84 | ZH ʒ
85 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/en-us/cmudict/phonemes.txt:
--------------------------------------------------------------------------------
 1 | AA [au]nt
 2 | AA0
 3 | AA1
 4 | AA2
 5 | AE [a]lan
 6 | AE0
 7 | AE1
 8 | AE2
 9 | AH b[u]n
10 | AH0
11 | AH1
12 | AH2
13 | AO also
14 | AO0
15 | AO1
16 | AO2
17 | AW d[ow]n
18 | AW0
19 | AW1
20 | AW2
21 | AY b[i]ke
22 | AY0
23 | AY1
24 | AY2
25 | B a[b]le
26 | CH ea[ch]
27 | D an[d]y
28 | DH [th]an
29 | EH b[e]ll
30 | EH0
31 | EH1
32 | EH2
33 | ER b[ir]d
34 | ER0
35 | ER1
36 | ER2
37 | EY aw[ay]
38 | EY0
39 | EY1
40 | EY2
41 | F [f]ace
42 | G ba[g]s
43 | HH [h]alf
44 | IH s[i]t
45 | IH0
46 | IH1
47 | IH2
48 | IY army
49 | IY0
50 | IY1
51 | IY2
52 | JH e[dge]
53 | K as[k]
54 | L a[l]ex
55 | M bo[mb]
56 | N ame[n]
57 | NG ba[ng]
58 | OW bl[ow]
59 | OW0
60 | OW1
61 | OW2
62 | OY j[oi]n
63 | OY0
64 | OY1
65 | OY2
66 | P cam[p]
67 | R a[r]ea
68 | S art[s]
69 | SH bu[sh]
70 | T an[t]i
71 | TH ba[th]
72 | UH b[oo]k
73 | UH0
74 | UH1
75 | UH2
76 | UW bl[ue]
77 | UW0
78 | UW1
79 | UW2
80 | V da[v]e
81 | W [wh]at
82 | Y c[u]te
83 | Z arm[s]
84 | ZH u[s]ual
85 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/en-us/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # Normal vowels
 2 | ɑ f[a]ther aː ɑː
 3 | æ c[a]t
 4 | ɛ b[e]d ɜ ɜː
 5 | i cit[y] iː
 6 | ɪ s[i]t ɨ
 7 | ɔ l[aw] ɔː ɒ
 8 | ʊ p[u]t ʊ̯
 9 | ʌ r[u]n
10 | u s[oo]n uː
11 | 
12 | # Schwas
13 | ə [a]llow
14 | ɚ corn[er] ɝː ɝ
15 | 
16 | # Dipthongs
17 | eɪ r[ai]se e eɪ̯
18 | aɪ r[i]ce aɪ̯
19 | oʊ kn[ow] o
20 | ɔɪ n[oi]se
21 | aʊ h[ou]se
22 | 
23 | # Stops
24 | p [p]in
25 | b [b]ut
26 | t [t]on
27 | d [d]ot
28 | k [c]at
29 | ɡ [g]ive g
30 | 
31 | # Affricatives
32 | t͡ʃ [ch]in tʃ
33 | d͡ʒ [g]in dʒ
34 | 
35 | # Fricatives
36 | f [f]in
37 | v [v]im
38 | θ [th]in
39 | ð [th]is
40 | s [s]et
41 | z [z]ing
42 | ʃ [s]ure
43 | ʒ mea[sure]
44 | h [h]am
45 | 
46 | # Other consonants
47 | l [l]ong l̩ ɫ ʟ̩ l̩
48 | m [m]ock m̩
49 | n [kn]ock n̩
50 | ŋ thi[ng]
51 | ɹ [wr]ong r ɾ
52 | w [w]asp
53 | j [y]acht
54 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/en-us/zamia/ipa_map.txt:
--------------------------------------------------------------------------------
 1 | 3 ɚ
 2 | A ɑ
 3 | D ð
 4 | E ɛ
 5 | I ɪ
 6 | N ŋ
 7 | O ɔ
 8 | OI ɔɪ
 9 | S ʃ
10 | T θ
11 | U ʊ
12 | V ʌ
13 | Z ʒ
14 | aI aɪ
15 | aU aʊ
16 | b b
17 | d d
18 | dZ d͡ʒ
19 | e e
20 | f f
21 | g ɡ
22 | h h
23 | i i
24 | j j
25 | k k
26 | l l
27 | m m
28 | n n
29 | o o
30 | p p
31 | pf pf
32 | r ɹ
33 | s s
34 | t t
35 | tS t͡ʃ
36 | ts ts
37 | u u
38 | v v
39 | w w
40 | z z
41 | { æ
42 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/en-us/zamia/phonemes.txt:
--------------------------------------------------------------------------------
 1 | 3 b[i]rd
 2 | A [a]rts
 3 | D [th]an
 4 | E b[ea]r
 5 | I [a]ges
 6 | N ba[ng]
 7 | O [a]lso
 8 | OI b[oy]s
 9 | S bu[sh]
10 | T ba[th]
11 | U b[oa]t
12 | V ab[le]
13 | Z u[s]ual
14 | aI b[i]ke
15 | aU d[ow]n
16 | b [b]a[b]e
17 | d an[d]y
18 | dZ ed[ge]
19 | e [a]men
20 | f [f]ace
21 | g ba[g]s
22 | h [h]ad
23 | i ar[e]a
24 | j c[u]te
25 | k ba[ck]
26 | l a[l]ex
27 | m ar[m]y
28 | n ala[n]
29 | o bl[o]w
30 | p cam[p]
31 | pf [pff]t
32 | r a[r]en
33 | s ask[s]
34 | t an[t]i
35 | tS ea[ch]
36 | ts ge[ts]
37 | u bl[ue]
38 | v da[ve]
39 | w a[w]ay
40 | z arm[s]
41 | { [a]dam
42 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/es-es/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Spanish_phonology
 2 | 
 3 | # Vowels
 4 | a p[a]so
 5 | e p[e]so
 6 | i p[i]so
 7 | o p[o]so
 8 | u p[u]jo
 9 | 
10 | # Dipthongs
11 | ai [ai]re
12 | au p[au]sa
13 | ei r[ey]
14 | eu n[eu]tro
15 | oi h[oy]
16 | ou b[ou]
17 | 
18 | ja hacia
19 | je t[ie]rra
20 | jo rad[io]
21 | ju v[iu]da
22 | wa c[ua]dro
23 | we f[ue]go
24 | wi b[ui]tre
25 | wo c[uo]ta
26 | 
27 | # Consonants
28 | b [b]ueno β
29 | d [d]os ð
30 | g [g]racias ɣ
31 | m [m]ucho
32 | n [n]os ŋ
33 | ɲ ni[ñ]o
34 | p [p]or
35 | t [t]u
36 | t͡ʃ mu[ch]a tʃ
37 | k [q]ué
38 | f per[f]ecto
39 | θ gra[c]ias
40 | s [s]í z
41 | ʝ grac[i]as j
42 | x [j]efe
43 | l e[l]
44 | ʎ [ll]ega
45 | ɾ ot[r]a
46 | r co[rr]ecto
47 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/fa/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Persian_phonology
 2 | 
 3 | # Vowels
 4 | æ نه
 5 | ɒː تا
 6 | e̞ که e eː
 7 | iː شیر í î i
 8 | o تو oː
 9 | uː زود ʊ
10 | 
11 | # Consonants
12 | b برادر
13 | p پدر
14 | t تا
15 | d دوست
16 | t͡ʃ چوب tʃ
17 | d͡ʒ جوان dʒ
18 | k کشور
19 | g گروه ɡ
20 | ʔ معنا
21 | f فشار
22 | v ویژه
23 | s سایه ŝ
24 | z آزاد
25 | ʃ شاه
26 | ʒ ژاله
27 | x خانه χ
28 | ɢ قلم ɣ q
29 | h هفت
30 | m مادر
31 | n نان
32 | l لب
33 | ɾ ایران r ʁ
34 | j یا
35 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/fr-fr/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # Vowels
 2 | i s[i]
 3 | y s[u]
 4 | u s[ous]
 5 | e f[ée]
 6 | ø c[eux]
 7 | o s[ot]
 8 | ə c[e]
 9 | ɛ f[ait]
10 | œ s[œu]r
11 | ɔ s[o]rt
12 | a s[a] ɑ
13 | 
14 | # Nasalated vowels
15 | ɔ̃ s[on]
16 | ɛ̃ br[in]
17 | ɑ̃ s[ans]
18 | œ̃ br[un]
19 | 
20 | # Semi-vowels
21 | j [h]ier
22 | ɥ pl[u]ie
23 | w [ou]i
24 | 
25 | # Consonants
26 | m [m]ou
27 | n [n]ous
28 | ŋ ku[ng]-fu
29 | p [p]ou
30 | t [t]out
31 | k [c]ou
32 | b [b]oue
33 | d [d]oux
34 | ɡ [g]oût
35 | f [f]ou
36 | s [s]ous
37 | ʃ [ch]ou
38 | v [v]ous
39 | z [z]ou
40 | ʒ [j]oue
41 | ʁ [r]ou
42 | l [l]oup
43 | 
44 | # From loan words
45 | ɲ [gn]ouf
46 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/it-it/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Italian_phonology
 2 | 
 3 | # Vowels
 4 | ɛ ad[e]sso ɛː ə
 5 | ɔ c[o]lla ɔː ɔ̃
 6 | a senz[a] aː à ɑ̃ ɒ æ ä ɑ
 7 | e p[e]rò eː
 8 | i cos[ì] iː ɪ I ì ɪː
 9 | o un[o] oː ʊ
10 | u t[u]tte uː u̯
11 | 
12 | # Consonants
13 | ɲ o[gn]uno
14 | ʎ consi[gli]o
15 | ʃ u[s]cita
16 | b [b]ella
17 | d [d]ue
18 | d͡ʒ [g]ente dʒ
19 | d͡z [z]ona dz
20 | f [f]orte
21 | ɡ [g]rave g
22 | j d[i]eci
23 | k [q]uesto c
24 | l so[l]tanto
25 | m al[m]eno
26 | n a[n]cora
27 | ɱ i[n]fame
28 | ŋ a[n]che
29 | p [p]erò
30 | r pe[r]ò ʁ ɹ
31 | s [s]ia
32 | t quan[t]o
33 | t͡ʃ per[ci]ò tʃ
34 | t͡s for[z]a ts
35 | v a[v]anti
36 | w q[u]attro
37 | z pae[s]e
38 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/lb-lb/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # Luxembourgish phonemes
 2 | # Luxembourgish vowels (monophtongs)
 3 | ɑ k[a]pp
 4 | aː k[a]p
 5 | ɛː st[ä]ren
 6 | e m[é]ck
 7 | æ h[e]ll
 8 | eː k[ee]ss
 9 | ə n[e]t
10 | ɐ kann[er]
11 | i m[i]dd
12 | iː l[ii]cht
13 | o spr[o]ch
14 | oː spr[oo]ch
15 | u g[u]tt
16 | uː d[uu]scht
17 | // Monophtongs from loanwoards
18 | y conj[u]gaisoun
19 | y: s[ü]den
20 | ãː restaur[ant]
21 | õː sais[on]
22 | ɛ̃ː cous[in]
23 | œː interi[eu]r
24 | // Luxembourgish diphtongs
25 | æːɪ z[äi]t
26 | ɑʊ [au]to
27 | æːʊ r[au]m
28 | ɑɪ l[ei]t
29 | ɜɪ fr[éi]
30 | oɪ [eu]ro
31 | iə h[ie]n
32 | əʊ sch[ou]l
33 | uə b[ue]dem
34 | // Consonants
35 | # Nasals
36 | m [m]a[mm]
37 | n ma[nn]
38 | ŋ ke[ng]
39 | # Plosives
40 | p [p]aken
41 | b [b]aken
42 | t blu[tt]
43 | d [d]äiwel
44 | k [k]eess
45 | g [g]eess
46 | // Affricates
47 | ʦ schwä[tz]en
48 | dʒ bu[dg]et
49 | # Fricatives
50 | f [f]ësch
51 | v [v]akanz
52 | w sch[w]aarz
53 | s taa[ss]
54 | z [s]ummer
55 | ʃ bii[sch]t
56 | ʒ pro[j]et
57 | X ku[ch]
58 | ɕ lii[ch]t
59 | ʁ ku[g]el
60 | ʑ spi[g]el
61 | h [h]ei
62 | # Approximants
63 | l [l]oft
64 | j [j]o
65 | // Trills
66 | ʀ [r]ou
67 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/nl/cgn/ipa_map.txt:
--------------------------------------------------------------------------------
 1 | @ ə
 2 | A ɑ
 3 | AU ɑu
 4 | E ɛ
 5 | E2 ɛː
 6 | EI ɛi
 7 | EU ø
 8 | G ɣ
 9 | I ɪ
10 | N ŋ
11 | O ɔ
12 | S ʃ
13 | U ʏ
14 | UI œy
15 | Z ʒ
16 | a a
17 | b b
18 | d d
19 | e e
20 | f f
21 | g ɡ
22 | h h
23 | i i
24 | j j
25 | k k
26 | l l
27 | m m
28 | n n
29 | o o
30 | p p
31 | r ɹ
32 | s s
33 | t t
34 | u u
35 | v v
36 | w w
37 | x x
38 | y y
39 | z z
40 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/nl/cgn/phonemes.txt:
--------------------------------------------------------------------------------
 1 | @
 2 | A
 3 | AU
 4 | E
 5 | E2
 6 | EI
 7 | EU
 8 | G
 9 | I
10 | N
11 | O
12 | S
13 | U
14 | UI
15 | Z
16 | a
17 | b
18 | d
19 | e
20 | f
21 | g
22 | h
23 | i
24 | j
25 | k
26 | l
27 | m
28 | n
29 | o
30 | p
31 | r
32 | s
33 | t
34 | u
35 | v
36 | w
37 | x
38 | y
39 | z
40 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/nl/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # dutch (nl)
 2 | # phoneme example [homophone] [homophone]...
 3 | 
 4 | # sources:
 5 | # https://nl.wiktionary.org/wiki/wikiwoordenboek:standaardweergave_uitspraak_nederlands
 6 | # https://en.wikipedia.org/wiki/dutch_phonology
 7 | 
 8 | # normal vowels
 9 | a h[aa]r
10 | ɑ k[a]n ɑ̃
11 | e m[ee]r
12 | ɛ p[e]t æ
13 | i b[i]et i̯
14 | ɪ k[i]p ɪ̯ ɪː
15 | o [oo]g o̝
16 | ɔ [o]s ʌ ʊ̯
17 | y [uu]r y̯
18 | ʏ [u]k ʏ̯
19 | ø [eu]ro ɵ
20 | u [oe]ver u̯
21 | ə g[e]makkelijk
22 | 
23 | # elongated vowels
24 | aː j[a]
25 | ɑː f[a]rm
26 | eː v[ee]r
27 | ɛː [oe]ver
28 | iː anal[y]se
29 | ɔː r[o]ze oː øː
30 | yː centrif[u]ge
31 | uː c[oo]l
32 | œː man[oeu]vre œ
33 | ʏː res[ea]rch
34 | oː b[oo]t
35 | 
36 | # dipthongs
37 | ɛi [ij]s
38 | ɑu [au]gurk
39 | œy [ui]l ʌʊ
40 | 
41 | # plosives (stops)
42 | p [p]as
43 | b [b]ij
44 | t [t]ien
45 | d [d]en
46 | c [tj]alk
47 | ʔ na[-]apen
48 | k [k]at
49 | ɡ [g]oal g
50 | 
51 | # fricatives
52 | f [f]oto
53 | v [v]ier
54 | s [s]ok
55 | z [z]es
56 | x [ch]emie
57 | ɣ [g]at χ
58 | h [h]eer ɦ
59 | ʃ [sj]aal
60 | ʒ [j]am
61 | 
62 | # other consonants
63 | m [m]an
64 | n [n]ul ɲ
65 | ŋ to[ng]
66 | ɱ i[n]fuus
67 | l [l]ip ɫ
68 | ɹ [r]ol r
69 | j [j]as ʲ
70 | w [w]iel ʋ β w̞ β̞
71 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/phoneme_distances.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rhasspy/gruut-ipa/ec9ae6ce7cca0103d9a563fcde3352f805a8e27e/gruut_ipa/data/phoneme_distances.json.gz


--------------------------------------------------------------------------------
/gruut_ipa/data/pt/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Portuguese_phonology
 2 | 
 3 | # Vowels
 4 | a f[a]lo
 5 | i v[i]nte ɨ
 6 | ĩ f[im]
 7 | u j[u]s ʊ
 8 | ũ [u]m
 9 | e al[e]x
10 | ẽ [e]mprego
11 | o am[o]r
12 | õ b[o]ns
13 | ɛ b[e]lo
14 | ɐ f[a]lo
15 | ɐ̃ c[a]ma
16 | ɔ pi[o]r
17 | 
18 | # Consonants
19 | p [p]ai
20 | b [b]arco
21 | t [t]enho
22 | d [d]oce
23 | k [c]om
24 | ɡ [g]rande g
25 | f [f]alo
26 | v [v]erde
27 | s [c]éu
28 | z ca[s]a
29 | ʃ [ch]apéu
30 | ʒ [j]óia
31 | m [m]ar
32 | n [n]ada
33 | ɲ vi[nh]o
34 | l [l]anche
35 | ʎ taba[lh]o
36 | ɾ ca[r]o
37 | ʁ [r]ua
38 | ɹ agi[r]
39 | w ág[u]a
40 | j aqu[i]
41 | 
42 | # Dipthongs
43 | aj p[ai]
44 | ɐj pl[ai]na
45 | ej r[ei]
46 | ɛj gel[ei]a
47 | oj d[oi]s
48 | ɔj d[ói]
49 | uj f[ui]
50 | ɐw s[au]dade
51 | ew s[eu]
52 | ɛw c[éu]
53 | iw v[iu]
54 | ow [ou]ro
55 | 
56 | # Nasalated dipthongs
57 | ɐ̃j̃ m[ãe]
58 | ẽj̃ b[em]
59 | õj̃ p[õe]
60 | ũj̃ m[ui]to uj̃
61 | ɐ̃w̃ fal[am]
62 | õw̃ b[om]
63 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/ru-ru/phonemes.txt:
--------------------------------------------------------------------------------
 1 | a [а]лло
 2 | aː бр[а]т
 3 | b [б]лин
 4 | bʲ се[б]е
 5 | d во[д]у
 6 | dʲ буд[ь]
 7 | e ваш[е]
 8 | eː ве[д]ь
 9 | f [ф]ото
10 | fʲ ко[ф]е
11 | ɡ [г]лаз
12 | ɡʲ дру[г]ие
13 | x все[х]
14 | xʲ взма[х]е
15 | i ваш[и]
16 | iː в[и]жу
17 | j все[й]
18 | k [к]атя
19 | kʲ [к]ино
20 | l бы[л]а
21 | lʲ г[л]еб
22 | m идё[м]
23 | mʲ [м]еня
24 | n же[н]а
25 | nʲ де[нь]
26 | o дел[о]
27 | oː дв[о]е
28 | p [п]апа
29 | pʲ [п]ить
30 | r ве[р]а
31 | rʲ в[р]яд
32 | s е[с]ли
33 | ɕː жен[щ]ин
34 | ʂ ва[ш]а
35 | sʲ вес[ь]
36 | t идё[т]
37 | tʲ бы[ть]
38 | t͡s ли[ц]о
39 | t͡ɕ вра[ч]
40 | u ваш[у]
41 | uː буд[у]
42 | v [в]зял
43 | vʲ [в]ещи
44 | ɨ сил[ы]
45 | ɨː в[ы]ше
46 | z вни[з]
47 | ʐ да[ж]е
48 | zʲ в[з]яла
49 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/sv-se/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Swedish_phonology
 2 | 
 3 | # Vowels
 4 | ɪ s[i]ll
 5 | ʏ s[y]ll
 6 | ʊ b[o]tt
 7 | ʉː f[u]l
 8 | ɑː m[a]t
 9 | a m[a]tt ä å
10 | eː h[e]l e
11 | ɛ h[e]tta
12 | ɛː h[ä]l
13 | iː s[i]l i
14 | oː m[å]l o ö
15 | ɵ f[u]ll
16 | øː n[ö]t
17 | ɔ m[o]ll
18 | œ n[ö]tt
19 | uː b[o]t u
20 | yː s[y]l
21 | 
22 | # Consonants
23 | ɡ [g]od g gː gː
24 | ɕ [kj]ol ʂ ɕː ʂː
25 | ɧ [sj]ok ɧː
26 | b [b]ok bː
27 | d [d]op ɖ dː ɖː
28 | f [f]ot fː
29 | h [h]ot hː
30 | j [j]ord jː
31 | k [k]on kː
32 | l [l]ov lː ɭ ɭː
33 | m [m]od mː
34 | n [n]od nː ɳ ɳː
35 | ŋ lå[ng] ŋː
36 | p [p]ol pː
37 | r [r]ov rː
38 | s [s]ot sː
39 | t [t]ok ʈ tː ʈː
40 | v [v]åt vː
41 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/sw/alffa/ipa_map.txt:
--------------------------------------------------------------------------------
 1 | BB ᵐɓ
 2 | CC t͡ʃ
 3 | DD ⁿɗ
 4 | GG ᵑg
 5 | JJ ⁿɗ͡ʒ
 6 | LL ð
 7 | NN _
 8 | RR ɣ
 9 | SS ʃ
10 | TT θ
11 | VV ᶬv
12 | XX x
13 | ZZ ⁿz
14 | a ɑ
15 | b ɓ
16 | d ɗ
17 | e ɛ
18 | f f
19 | g ɠ
20 | h h
21 | i i
22 | j ʄ
23 | k k
24 | l l
25 | m m
26 | n n
27 | o ɔ
28 | p p
29 | r ɾ
30 | s s
31 | t t
32 | u u
33 | v v
34 | w w
35 | y j
36 | z z
37 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/sw/alffa/phonemes.txt:
--------------------------------------------------------------------------------
 1 | BB
 2 | CC
 3 | DD
 4 | GG
 5 | JJ
 6 | LL
 7 | NN
 8 | RR
 9 | SS
10 | TT
11 | VV
12 | XX
13 | ZZ
14 | a
15 | b
16 | d
17 | e
18 | f
19 | g
20 | h
21 | i
22 | j
23 | k
24 | l
25 | m
26 | n
27 | o
28 | p
29 | r
30 | s
31 | t
32 | u
33 | v
34 | w
35 | y
36 | z
37 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/sw/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Help:IPA/Swahili
 2 | 
 3 | # Vowels
 4 | ɑ b[a]b[a] a
 5 | ɛ nd[e]g[e] e
 6 | i K[i]swah[i]l[i]
 7 | ɔ mt[o]t[o] o
 8 | u [u]h[u]r[u]
 9 | 
10 | # Consonants
11 | ɓ [b]a[b]a b
12 | ɗ [d]ola d
13 | ð [dh]ambi
14 | ʄ ma[j]f j
15 | f [f]isi
16 | ɠ [g]ani g g
17 | ɣ [gh]ali
18 | h u[h]uru
19 | j [y]e[y]e y
20 | k [k]itabu
21 | l [l]akini
22 | m da[m]u m m̩
23 | ᵐɓ [mb]ali ᵐb
24 | ᶬv [mv]inyo
25 | n [n]i[n]i n̩
26 | ⁿɗ muhi[nd]i
27 | ᵑg [ng]oma ᵑg
28 | ⁿɗ͡ʒ  i[nj]ili ɗʒ 
29 | ⁿz kwa[nz]a
30 | p kika[p]u
31 | ɾ [r]afiki r
32 | s [s]i[s]i
33 | ʃ [sh]amba
34 | t mo[t]o
35 | t͡ʃ [ch]umba tʃ
36 | θ [th]ela[th]ini
37 | v [v]itabu
38 | w [w]atu
39 | x subul[kh]eri
40 | z ma[z]iwa
41 | 
42 | # http://alffa.imag.fr/
43 | # IPA ALFFA
44 | # ᵐɓ BB
45 | # t͡ʃ CC
46 | # ⁿɗ DD
47 | # ᵑg GG
48 | # ⁿɗ͡ʒ JJ
49 | # ð LL
50 | # _ NN
51 | # ɣ RR
52 | # ʃ SS
53 | # θ TT
54 | # ᶬv VV
55 | # x XX
56 | # ⁿz ZZ
57 | # ɑ a
58 | # ɓ b
59 | # ɗ d
60 | # ɛ e
61 | # f f
62 | # ɠ g
63 | # h h
64 | # i i
65 | # ʄ j
66 | # k k
67 | # l l
68 | # m m
69 | # n n
70 | # ɔ o
71 | # p p
72 | # ɾ r
73 | # s s
74 | # t t
75 | # u u
76 | # v v
77 | # w w
78 | # j y
79 | # z z
80 | 


--------------------------------------------------------------------------------
/gruut_ipa/data/vi-n/phonemes.txt:
--------------------------------------------------------------------------------
 1 | # https://en.wikipedia.org/wiki/Vietnamese_phonology
 2 | 
 3 | # Vowels
 4 | ɨ ? ɯ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
 5 | a ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
 6 | ă ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
 7 | e ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
 8 | ə ? ɤ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
 9 | ɛ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
10 | ə̆ ? ɤ̆ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
11 | i ? y ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
12 | o ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
13 | ɔ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
14 | u ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
15 | 
16 | # Dipthongs and tripthongs
17 | aj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
18 | ăj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
19 | aw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
20 | ăw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
21 | ɨə̯ ? ɯə ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
22 | əj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
23 | ɨə̯j ? ɯə̯j ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
24 | ə̆j ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
25 | ew ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
26 | ɛw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
27 | ɨə̯w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
28 | ə̆w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
29 | iə̯ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
30 | iə̯w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
31 | iw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
32 | ɨj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
33 | oj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
34 | ɔj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
35 | uə̯ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
36 | uə̯j ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
37 | uj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
38 | ɨw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
39 | 
40 | # Consonants
41 | ɲ ?
42 | ɣ ?
43 | b ? ɓ
44 | c [ch]ẻ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
45 | d ? ɗ
46 | f ?
47 | h ?
48 | j ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
49 | k ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
50 | k͡p ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
51 | l ?
52 | m ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
53 | n ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
54 | ŋ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
55 | ŋ͡m ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
56 | ɹ [r]a
57 | s [x]inh ʂ
58 | t ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
59 | tʰ ?
60 | c [tr]ẻ
61 | v [v]ợ
62 | w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥
63 | x ?
64 | z [d]a
65 | p chấ[p]
66 | 


--------------------------------------------------------------------------------
/gruut_ipa/distances.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Functions for comparing phonemes by a distance metric"""
 3 | import gzip
 4 | import itertools
 5 | import json
 6 | import sys
 7 | import threading
 8 | import typing
 9 | 
10 | import numpy as np
11 | 
12 | from gruut_ipa.constants import (
13 |     _CONSONANTS,
14 |     _DATA_DIR,
15 |     _SCHWAS,
16 |     _VOWELS,
17 |     FEATURE_KEYS,
18 |     Consonant,
19 |     Schwa,
20 |     Vowel,
21 | )
22 | from gruut_ipa.features import to_vector
23 | 
24 | _CLOSEST_TYPE = typing.Mapping[str, typing.Sequence[str]]
25 | _CLOSEST: typing.Optional[_CLOSEST_TYPE] = None
26 | 
27 | 
28 | def create_closest(
29 |     symbols: typing.Optional[
30 |         typing.Iterable[typing.Union[Vowel, Consonant, Schwa]]
31 |     ] = None
32 | ) -> _CLOSEST_TYPE:
33 |     """Create mapping from each IPA symbol to a list of other IPA symbols reverse ordered by feature distance"""
34 |     import sklearn.metrics
35 | 
36 |     if not symbols:
37 |         symbols = itertools.chain(_VOWELS, _CONSONANTS, _SCHWAS,)
38 | 
39 |     symbol_list = list(symbols)
40 |     vectors = {}
41 |     for symbol in symbol_list:
42 |         if symbol.ipa in vectors:
43 |             continue
44 | 
45 |         vectors[symbol.ipa] = to_vector(symbol)
46 | 
47 |     matrix = np.vstack(list(vectors.values()))
48 | 
49 |     w = np.ones(matrix.shape[1])
50 | 
51 |     # Adjust feature weights
52 |     w[FEATURE_KEYS["vowel_place"]] = 0.5
53 |     w[FEATURE_KEYS["vowel_height"]] = 1
54 |     w[FEATURE_KEYS["vowel_rounded"]] = 0.01
55 | 
56 |     w[FEATURE_KEYS["consonant_place"]] = 0.15
57 |     w[FEATURE_KEYS["consonant_voiced"]] = 0.5
58 |     w[FEATURE_KEYS["consonant_sounds_like"]] = 0.5
59 | 
60 |     dist = sklearn.metrics.pairwise_distances(matrix, metric="minkowski", p=2, w=w)
61 | 
62 |     dist_symbols = list(vectors.keys())
63 |     closest = {
64 |         s: [dist_symbols[j] for j in dist[i].argsort() if s != dist_symbols[j]]
65 |         for i, s in enumerate(dist_symbols)
66 |     }
67 | 
68 |     return closest
69 | 
70 | 
71 | _CLOSEST_LOCK = threading.Lock()
72 | 
73 | 
74 | def get_closest(ipa: str) -> typing.Optional[typing.Sequence[str]]:
75 |     """Get a list of IPA symbols that are closest, ordered by increasing distance."""
76 |     global _CLOSEST
77 | 
78 |     with _CLOSEST_LOCK:
79 |         if _CLOSEST is None:
80 |             closest_path = _DATA_DIR / "phoneme_distances.json.gz"
81 |             with gzip.open(closest_path, "r") as closest_file:
82 |                 _CLOSEST = json.load(closest_file)
83 | 
84 |     assert _CLOSEST is not None
85 | 
86 |     return _CLOSEST.get(ipa)
87 | 
88 | 
89 | # -----------------------------------------------------------------------------
90 | 
91 | if __name__ == "__main__":
92 |     # {
93 |     #   "<symbol>": ["<closest symbol>", "<next closest symbol>", ...],
94 |     #   ...
95 |     # }
96 |     json.dump(create_closest(), sys.stdout, indent=4, ensure_ascii=False)
97 | 


--------------------------------------------------------------------------------
/gruut_ipa/espeak.py:
--------------------------------------------------------------------------------
  1 | """Mapping between IPA and Espeak"""
  2 | import re
  3 | import unicodedata
  4 | 
  5 | # http://espeak.sourceforge.net/phonemes.html
  6 | 
  7 | 
  8 | def ipa_to_espeak(ipa: str, keep_whitespace: bool = True) -> str:
  9 |     """Convert IPA string to eSpeak phonemes"""
 10 |     ipa_codepoints = unicodedata.normalize("NFD", ipa)
 11 | 
 12 |     return IPA_PATTERN.sub(
 13 |         lambda match: IPA_TO_ESPEAK.get(match.group(1), ""), ipa_codepoints
 14 |     )
 15 | 
 16 | 
 17 | def espeak_to_ipa(espeak: str) -> str:
 18 |     """Convert eSpeak phonemes to IPA phones"""
 19 |     # Remove brackets
 20 |     espeak_codepoints = "".join(
 21 |         c for c in unicodedata.normalize("NFD", espeak) if c not in {"[", "]"}
 22 |     )
 23 | 
 24 |     return ESPEAK_PATTERN.sub(
 25 |         lambda match: ESPEAK_TO_IPA.get(match.group(1), ""), espeak_codepoints
 26 |     )
 27 | 
 28 | 
 29 | # -----------------------------------------------------------------------------
 30 | 
 31 | IPA_TO_ESPEAK = {
 32 |     "\u00e6": "a",
 33 |     "\u0061": "a",
 34 |     "\u0251": "A",
 35 |     "\u0252": "A.",
 36 |     "\u028c": "V",
 37 |     "\u0250": "V",
 38 |     "\u0062": "b",
 39 |     "\u0253": "b`",
 40 |     "\u0299": "b<trl>",
 41 |     "\u03b2": "B",
 42 |     "\u0063": "c",
 43 |     "\u00e7": "C",
 44 |     "\u0063\u0327": "C",
 45 |     "\u0255": "S;",
 46 |     "\u0064": "d",
 47 |     "\u0257": "d`",
 48 |     "\u0256": "d.",
 49 |     "\u00f0": "D",
 50 |     "\u0065": "e",
 51 |     "\u0259": "@",
 52 |     "\u025a": "3",
 53 |     "\u0258": "@",
 54 |     "\u025b": "E",
 55 |     "\u025c": 'V"',
 56 |     "\u025d": "3",
 57 |     "\u025e": 'O"',
 58 |     "\u0066": "f",
 59 |     "\u0261": "g",
 60 |     "\u0067": "g",
 61 |     "\u0260": "g`",
 62 |     "\u0262": "G",
 63 |     "\u029b": "G`",
 64 |     "\u0263": "Q",
 65 |     "\u02e0": "~",
 66 |     "\u0264": "o-",
 67 |     "\u0068": "h",
 68 |     "\u02b0": "<h>",
 69 |     "\u0127": "H",
 70 |     "\u0266": "h<?>",
 71 |     "\u0267": "",
 72 |     "\u0265": "j<rnd>",
 73 |     "\u029c": "",
 74 |     "\u0069": "i",
 75 |     "\u0268": 'i"',
 76 |     "\u026a": "I",
 77 |     "\u006a": "j",
 78 |     "\u02b2": ";",
 79 |     "\u029d": "C<vcd>",
 80 |     "\u025f": "J",
 81 |     "\u0284": "J`",
 82 |     "\u006b": "k",
 83 |     "\u006c": "l",
 84 |     "\u026b": "l",
 85 |     "\u026c": "s<lat>",
 86 |     "\u026d": "l.",
 87 |     "\u026e": "z<lat>",
 88 |     "\u029f": "L",
 89 |     "\u006d": "m",
 90 |     "\u0271": "M",
 91 |     "\u026f": "u-",
 92 |     "\u0270": "Q",
 93 |     "\u006e": "n",
 94 |     "\u0272": "n^",
 95 |     "\u014b": "N",
 96 |     "\u0273": "n.",
 97 |     "\u0274": 'n"',
 98 |     "\u006f": "o",
 99 |     "\u0298": "p!",
100 |     "\u0275": "@.",
101 |     "\u00f8": "Y",
102 |     "\u0153": "W",
103 |     "\u0276": "W",
104 |     "\u0254": "O",
105 |     "\u0070": "p",
106 |     "\u0278": "F",
107 |     "\u0071": "q",
108 |     "\u0072": "r<trl>",
109 |     "\u027e": "R",
110 |     "\u027c": "",
111 |     "\u027d": "*.",
112 |     "\u0279": "r",
113 |     "\u027b": "r.",
114 |     "\u027a": "*<lat>",
115 |     "\u0280": 'r"',
116 |     "\u0281": "r",
117 |     "\u0073": "s",
118 |     "\u0282": "s.",
119 |     "\u0283": "S",
120 |     "\u0074": "t",
121 |     "\u0288": "t.",
122 |     "\u03b8": "T",
123 |     "\u0075": "u",
124 |     "\u0289": 'u"',
125 |     "\u028a": "U",
126 |     "\u0076": "v",
127 |     "\u028b": "v#",
128 |     "\u0077": "w",
129 |     "\u02b7": "<w>",
130 |     "\u028d": "w<vls>",
131 |     "\u0078": "x",
132 |     "\u03c7": "X",
133 |     "\u0079": "y",
134 |     "\u028e": "l^",
135 |     "\u028f": "I.",
136 |     "\u007a": "z",
137 |     "\u0291": "Z;",
138 |     "\u0290": "z.",
139 |     "\u0292": "Z",
140 |     "\u0294": "?",
141 |     "\u02a1": "",
142 |     "\u0295": "H<vcd>",
143 |     "\u02a2": "",
144 |     "\u02e4": "<H>",
145 |     "\u01c3": "c!",
146 |     "\u01c0": "t!",
147 |     "\u01c2": "c!",
148 |     "\u01c1": "l!",
149 |     "\u0320": "",
150 |     "\u032a": "",
151 |     "\u033a": "",
152 |     "\u031f": "",
153 |     "\u031d": "",
154 |     "\u031e": "",
155 |     "\u02c8": "'",
156 |     "\u02cc": ",",
157 |     "\u0329": "-",
158 |     "\u031a": "<o>",
159 |     "\u002e": "",
160 |     "\u02d1": "",
161 |     "\u0308": "",
162 |     "\u0324": "<?>",
163 |     "\u02d0": ":",
164 |     "\u02bc": "`",
165 |     "\u0325": "<o>",
166 |     "\u030a": "",
167 |     "\u031c": "",
168 |     "\u0339": "",
169 |     "\u0303": "~",
170 |     "\u0334": "~",
171 |     "\u0330": "",
172 |     "\u032c": "",
173 |     "\u0306": "",
174 |     "\u032f": "",
175 |     "\u033d": "",
176 |     "\u02de": "<r>",
177 |     "\u033b": "",
178 |     "\u0318": "",
179 |     "\u0319": "",
180 |     "\u033c": "",
181 |     "\u2197": "",
182 |     "\u2191": "",
183 |     "\u2198": "",
184 |     "\u2193": "",
185 |     #
186 |     # Ties
187 |     "\u0361": "",
188 |     "\u035C": "",
189 |     #
190 |     # Tied symbols
191 |     "\u0288\u0361\u0282": "tS",
192 |     "\u0256\u0361\u0290": "dz",
193 |     #
194 |     # Breaks
195 |     "|": "_::",
196 |     "\u2016": "_::_::",
197 |     "#": "",
198 | }
199 | 
200 | ESPEAK_TO_IPA = {v: k for k, v in IPA_TO_ESPEAK.items() if v}
201 | 
202 | # Regex disjunction in descending length order
203 | ESPEAK_PATTERN = re.compile(
204 |     "({})".format(
205 |         "|".join(re.escape(espeak) for espeak in sorted(ESPEAK_TO_IPA, reverse=True))
206 |     )
207 | )
208 | 
209 | IPA_PATTERN = re.compile(
210 |     "({})".format(
211 |         "|".join(re.escape(ipa) for ipa in sorted(IPA_TO_ESPEAK, key=len, reverse=True))
212 |     )
213 | )
214 | 


--------------------------------------------------------------------------------
/gruut_ipa/features.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Functions for converting IPA symbols to and from feature vectors."""
  3 | import dataclasses
  4 | import typing
  5 | 
  6 | from gruut_ipa.constants import (
  7 |     CONSONANTS,
  8 |     FEATURE_COLUMNS,
  9 |     FEATURE_EMPTY,
 10 |     FEATURE_KEYS,
 11 |     FEATURE_ORDINAL_COLUMNS,
 12 |     IPA,
 13 |     SCHWAS,
 14 |     VOWELS,
 15 |     Break,
 16 |     BreakType,
 17 |     Consonant,
 18 |     ConsonantPlace,
 19 |     ConsonantType,
 20 |     PhonemeLength,
 21 |     Schwa,
 22 |     Stress,
 23 |     Vowel,
 24 |     VowelHeight,
 25 |     VowelPlacement,
 26 | )
 27 | 
 28 | 
 29 | def to_vector(
 30 |     symbol: typing.Union[Vowel, Consonant, Schwa, Break]
 31 | ) -> typing.Sequence[float]:
 32 |     """Converts a symbol into a feature vector"""
 33 |     features: typing.Dict[str, str] = {}
 34 | 
 35 |     if isinstance(symbol, Vowel):
 36 |         features["symbol_type"] = "phoneme"
 37 |         features["phoneme_type"] = "vowel"
 38 |         features["vowel_height"] = symbol.height.value
 39 |         features["vowel_place"] = symbol.placement.value
 40 |         features["vowel_rounded"] = "rounded" if symbol.rounded else "unrounded"
 41 |         features["phoneme_length"] = symbol.length.value
 42 | 
 43 |         if symbol.nasalated:
 44 |             features["diacritic"] = "nasalated"
 45 | 
 46 |         if symbol.stress is not None:
 47 |             features["vowel_stress"] = symbol.stress.value
 48 | 
 49 |     elif isinstance(symbol, Consonant):
 50 |         features["symbol_type"] = "phoneme"
 51 |         features["phoneme_type"] = "consonant"
 52 |         features["consonant_voiced"] = "voiced" if symbol.voiced else "unvoiced"
 53 |         features["consonant_type"] = symbol.type.value
 54 |         features["consonant_place"] = symbol.place.value
 55 |         features["consonant_sounds_like"] = symbol.sounds_like.value
 56 |         features["phoneme_length"] = symbol.length.value
 57 | 
 58 |         if symbol.velarized:
 59 |             features["diacritic"] = "velarized"
 60 | 
 61 |     elif isinstance(symbol, Schwa):
 62 |         features["symbol_type"] = "phoneme"
 63 |         features["phoneme_type"] = "schwa"
 64 |         features["phoneme_length"] = symbol.length.value
 65 | 
 66 |         if symbol.r_coloured:
 67 |             features["consonant_sounds_like"] = "r"
 68 | 
 69 |     elif isinstance(symbol, Break):
 70 |         features["symbol_type"] = "break"
 71 |         features["break_type"] = symbol.type.value
 72 |     else:
 73 |         # Unsupported symbol type
 74 |         raise ValueError(symbol)
 75 | 
 76 |     return features_to_vector(features)
 77 | 
 78 | 
 79 | def from_vector(
 80 |     vector: typing.Sequence[float],
 81 | ) -> typing.Union[Vowel, Consonant, Schwa, Break]:
 82 |     """Converts a feature vector back into a symbol"""
 83 |     features = vector_to_features(vector)
 84 |     if features["symbol_type"] == "break":
 85 |         break_type = BreakType(features["break_type"])
 86 |         return Break(break_type)
 87 | 
 88 |     if features["symbol_type"] == "phoneme":
 89 |         if features["phoneme_type"] == "vowel":
 90 |             height = VowelHeight(features["vowel_height"])
 91 |             placement = VowelPlacement(features["vowel_place"])
 92 |             rounded = features["vowel_rounded"] == "rounded"
 93 |             nasalated = features["diacritic"] == "nasalated"
 94 |             length = PhonemeLength(features["phoneme_length"])
 95 | 
 96 |             stress: typing.Optional[Stress] = None
 97 |             stress_val = features["vowel_stress"]
 98 |             if stress_val != FEATURE_EMPTY:
 99 |                 stress = Stress(stress_val)
100 | 
101 |             for vowel in VOWELS.values():
102 |                 if (
103 |                     (vowel.height == height)
104 |                     and (vowel.placement == placement)
105 |                     and (vowel.rounded == rounded)
106 |                     and (vowel.nasalated == nasalated)
107 |                 ):
108 |                     if (stress is None) and (length == PhonemeLength.NORMAL):
109 |                         # Don't need to make a copy
110 |                         return vowel
111 | 
112 |                     return dataclasses.replace(vowel, stress=stress)
113 | 
114 |             raise ValueError(f"Unknown vowel: {features}")
115 | 
116 |         if features["phoneme_type"] == "consonant":
117 |             c_type = ConsonantType(features["consonant_type"])
118 |             place = ConsonantPlace(features["consonant_place"])
119 |             voiced = features["consonant_voiced"] == "voiced"
120 |             velarized = features["diacritic"] == "velarized"
121 |             length = PhonemeLength(features["phoneme_length"])
122 | 
123 |             for consonant in CONSONANTS.values():
124 |                 if (
125 |                     (consonant.type == c_type)
126 |                     and (consonant.place == place)
127 |                     and (consonant.voiced == voiced)
128 |                     and (consonant.velarized == velarized)
129 |                 ):
130 |                     if length == PhonemeLength.NORMAL:
131 |                         # Don't need to make a copy
132 |                         return consonant
133 | 
134 |                     return dataclasses.replace(consonant, length=length)
135 | 
136 |             raise ValueError(f"Unknown vowel: {features}")
137 | 
138 |         if features["phoneme_type"] == "schwa":
139 |             r_coloured = features["consonant_sounds_like"] == "r"
140 |             length = PhonemeLength(features["phoneme_length"])
141 | 
142 |             for schwa in SCHWAS.values():
143 |                 if schwa.r_coloured == r_coloured:
144 |                     if length == PhonemeLength.NORMAL:
145 |                         # Don't need to make a copy
146 |                         return schwa
147 | 
148 |                     return dataclasses.replace(schwa, length=length)
149 | 
150 |             raise ValueError(f"Unknown vowel: {features}")
151 | 
152 |         # Unsupported phoneme type
153 |         raise ValueError(f"Unknown phoneme type: {features}")
154 | 
155 |     # Unsupported symbol type
156 |     raise ValueError(f"Unknown symbol type: {features}")
157 | 
158 | 
159 | def string_to_symbol(symbol_str: str) -> typing.Union[Vowel, Consonant, Schwa, Break]:
160 |     """Get gruut IPA object for IPA symbol"""
161 |     if not symbol_str:
162 |         raise ValueError("Empty symbol")
163 | 
164 |     # Check break first
165 |     if symbol_str == IPA.BREAK_WORD:
166 |         return Break(BreakType.WORD)
167 | 
168 |     if symbol_str == IPA.BREAK_MINOR:
169 |         return Break(BreakType.MINOR)
170 | 
171 |     if symbol_str == IPA.BREAK_MAJOR:
172 |         return Break(BreakType.MAJOR)
173 | 
174 |     # Strip stress
175 |     maybe_stress: typing.Optional[Stress] = None
176 |     if symbol_str[0] == IPA.STRESS_PRIMARY:
177 |         maybe_stress = Stress.PRIMARY
178 |         symbol_str = symbol_str[1:]
179 |     elif symbol_str[0] == IPA.STRESS_SECONDARY:
180 |         maybe_stress = Stress.SECONDARY
181 |         symbol_str = symbol_str[1:]
182 | 
183 |     if not symbol_str:
184 |         raise ValueError("No letters")
185 | 
186 |     # Strip length
187 |     length = PhonemeLength.NORMAL
188 |     if symbol_str[-1] == IPA.HALF_LONG:
189 |         length = PhonemeLength.SHORT
190 |         symbol_str = symbol_str[:-1]
191 |     elif symbol_str[-1] == IPA.LONG:
192 |         length = PhonemeLength.LONG
193 |         symbol_str = symbol_str[:-1]
194 | 
195 |     if not symbol_str:
196 |         raise ValueError("No letters")
197 | 
198 |     # Look up
199 |     maybe_vowel = VOWELS.get(symbol_str)
200 |     if maybe_vowel is not None:
201 |         return dataclasses.replace(maybe_vowel, stress=maybe_stress, length=length)
202 | 
203 |     maybe_consonant = CONSONANTS.get(symbol_str)
204 |     if maybe_consonant is not None:
205 |         return dataclasses.replace(maybe_consonant, length=length)
206 | 
207 |     maybe_schwa = SCHWAS.get(symbol_str)
208 |     if maybe_schwa is not None:
209 |         return dataclasses.replace(maybe_schwa, length=length)
210 | 
211 |     raise ValueError(f"Unsupported symbol type: {symbol_str}")
212 | 
213 | 
214 | def features_to_vector(features: typing.Mapping[str, str]) -> typing.Sequence[float]:
215 |     """Create phoneme feature vector from mapping"""
216 |     vector: typing.List[float] = []
217 | 
218 |     for col, values in FEATURE_COLUMNS.items():
219 |         value = features.get(col, FEATURE_EMPTY)
220 | 
221 |         if col in FEATURE_ORDINAL_COLUMNS:
222 |             # Single value normalized by number of possible values
223 |             vector.append(values.index(value) / len(values))
224 |         else:
225 |             # One-hot vector
226 |             for v in values:
227 |                 vector.append(1.0 if (v == value) else 0.0)
228 | 
229 |     return vector
230 | 
231 | 
232 | def vector_to_features(vector: typing.Sequence[float]) -> typing.Mapping[str, str]:
233 |     """Create mapping from phoneme feature vector"""
234 |     features: typing.Dict[str, str] = {}
235 | 
236 |     for col_name, values in FEATURE_COLUMNS.items():
237 |         col_key = FEATURE_KEYS[col_name]
238 |         if col_name in FEATURE_ORDINAL_COLUMNS:
239 |             # Single value normalized by number of possible values
240 |             assert isinstance(col_key, int)
241 |             val_idx = int(vector[col_key] * len(values))
242 |         else:
243 |             # One-hot vector
244 |             assert isinstance(col_key, slice)
245 |             if 1.0 not in vector[col_key]:
246 |                 assert False, (col_name, col_key, vector[col_key])
247 |             val_idx = vector[col_key].index(1.0)
248 | 
249 |         features[col_name] = values[val_idx]
250 | 
251 |     return features
252 | 


--------------------------------------------------------------------------------
/gruut_ipa/kirshenbaum.py:
--------------------------------------------------------------------------------
  1 | """Mapping between IPA and Kirshenbaum"""
  2 | 
  3 | # http://www.blahedo.org/ascii-ipa.html
  4 | 
  5 | IPA_TO_KIRSHENBAUM = {
  6 |     "\u0061": "a",
  7 |     "\u0250": "",
  8 |     "\u0251": "A",
  9 |     "\u0252": "A.",
 10 |     "\u00e6": "&",
 11 |     "\u028c": "V",
 12 |     "\u0062": "b",
 13 |     "\u0253": "b`",
 14 |     "\u0299": "b<trl>",
 15 |     "\u03b2": "B",
 16 |     "\u0063": "c",
 17 |     "\u00e7": "C",
 18 |     "\u0255": "",
 19 |     "\u0064": "d",
 20 |     "\u0257": "d`",
 21 |     "\u0256": "d.",
 22 |     "\u00f0": "D",
 23 |     "\u0065": "e",
 24 |     "\u0259": "@",
 25 |     "\u025a": "R",
 26 |     "\u0258": "@",
 27 |     "\u025b": "E",
 28 |     "\u025c": 'V"',
 29 |     "\u025d": "R<umd>",
 30 |     "\u025e": 'O"',
 31 |     "\u0066": "f",
 32 |     "\u0261": "g",
 33 |     "\u0260": "g`",
 34 |     "\u0262": "G",
 35 |     "\u029b": "G`",
 36 |     "\u0263": "Q",
 37 |     "\u02e0": "~",
 38 |     "\u0264": "o-",
 39 |     "\u0068": "h",
 40 |     "\u02b0": "<h>",
 41 |     "\u0127": "H",
 42 |     "\u0266": "h<?>",
 43 |     "\u0267": "",
 44 |     "\u0265": "j<rnd>",
 45 |     "\u029c": "",
 46 |     "\u0069": "i",
 47 |     "\u0268": 'i"',
 48 |     "\u026a": "I",
 49 |     "\u006a": "j",
 50 |     "\u02b2": ";",
 51 |     "\u029d": "C<vcd>",
 52 |     "\u025f": "J",
 53 |     "\u0284": "J`",
 54 |     "\u006b": "k",
 55 |     "\u006c": "l",
 56 |     "\u026b": "",
 57 |     "\u026c": "s<lat>",
 58 |     "\u026d": "l.",
 59 |     "\u026e": "z<lat>",
 60 |     "\u029f": "L",
 61 |     "\u006d": "m",
 62 |     "\u0271": "M",
 63 |     "\u026f": "u-",
 64 |     "\u0270": "j<vel>",
 65 |     "\u006e": "n",
 66 |     "\u0272": "n^",
 67 |     "\u014b": "N",
 68 |     "\u0273": "n.",
 69 |     "\u0274": 'n"',
 70 |     "\u006f": "o",
 71 |     "\u0298": "p!",
 72 |     "\u0275": "@.",
 73 |     "\u00f8": "Y",
 74 |     "\u0153": "&.",
 75 |     "\u0276": "W",
 76 |     "\u0254": "O",
 77 |     "\u0070": "p",
 78 |     "\u0278": "F",
 79 |     "\u0071": "q",
 80 |     "\u0072": "r<trl>",
 81 |     "\u027e": "*",
 82 |     "\u027c": "",
 83 |     "\u027d": "*.",
 84 |     "\u0279": "r",
 85 |     "\u027b": "r.",
 86 |     "\u027a": "*<lat>",
 87 |     "\u0280": 'r"',
 88 |     "\u0281": 'g"',
 89 |     "\u0073": "s",
 90 |     "\u0282": "s.",
 91 |     "\u0283": "S",
 92 |     "\u0074": "t",
 93 |     "\u0288": "t.",
 94 |     "\u03b8": "T",
 95 |     "\u0075": "u",
 96 |     "\u0289": 'u"',
 97 |     "\u028a": "U",
 98 |     "\u0076": "v",
 99 |     "\u028b": "r<lbd>",
100 |     "\u0077": "w",
101 |     "\u02b7": "<w>",
102 |     "\u028d": "w<vls>",
103 |     "\u0078": "x",
104 |     "\u03c7": "X",
105 |     "\u0079": "y",
106 |     "\u028e": "l^",
107 |     "\u028f": "I.",
108 |     "\u007a": "z",
109 |     "\u0291": "",
110 |     "\u0290": "z.",
111 |     "\u0292": "Z",
112 |     "\u0294": "?",
113 |     "\u02a1": "",
114 |     "\u0295": "H<vcd>",
115 |     "\u02a2": "",
116 |     "\u02e4": "<H>",
117 |     "\u01c3": "c!",
118 |     "\u01c0": "t!",
119 |     "\u01c2": "c!",
120 |     "\u01c1": "l!",
121 |     "\u0320": "",
122 |     "\u032a": "",
123 |     "\u033a": "",
124 |     "\u031f": "",
125 |     "\u031d": "",
126 |     "\u031e": "",
127 |     "\u02c8": "'",
128 |     "\u02cc": ",",
129 |     "\u0329": "-",
130 |     "\u031a": "<o>",
131 |     "\u002e": "",
132 |     "\u02d1": "",
133 |     "\u0308": "",
134 |     "\u0324": "<?>",
135 |     "\u02d0": ":",
136 |     "\u02bc": "`",
137 |     "\u0325": "<o>",
138 |     "\u030a": "",
139 |     "\u031c": "",
140 |     "\u0339": "",
141 |     "\u0303": "~",
142 |     "\u0334": "~",
143 |     "\u0330": "",
144 |     "\u032c": "",
145 |     "\u0306": "",
146 |     "\u032f": "",
147 |     "\u033d": "",
148 |     "\u02de": "<r>",
149 |     "\u033b": "",
150 |     "\u0318": "",
151 |     "\u0319": "",
152 |     "\u033c": "",
153 |     "\u2197": "",
154 |     "\u2191": "",
155 |     "\u2198": "",
156 |     "\u2193": "",
157 |     "\u030f": "1",
158 |     "\u0300": "2",
159 |     "\u0304": "3",
160 |     "\u0301": "4",
161 |     "\u030b": "5",
162 | }
163 | 


--------------------------------------------------------------------------------
/gruut_ipa/phonemes.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Functions for manipulating phones/phonemes"""
  3 | import logging
  4 | import re
  5 | import typing
  6 | import unicodedata
  7 | from collections import defaultdict
  8 | 
  9 | from gruut_ipa.constants import (  # noqa: F401
 10 |     _DATA_DIR,
 11 |     _DIR,
 12 |     CONSONANTS,
 13 |     FEATURE_COLUMNS,
 14 |     FEATURE_EMPTY,
 15 |     FEATURE_KEYS,
 16 |     FEATURE_ORDINAL_COLUMNS,
 17 |     IPA,
 18 |     LANG_ALIASES,
 19 |     SCHWAS,
 20 |     VOWELS,
 21 |     Accent,
 22 |     Break,
 23 |     BreakType,
 24 |     Consonant,
 25 |     ConsonantPlace,
 26 |     ConsonantType,
 27 |     Dipthong,
 28 |     Intonation,
 29 |     PhonemeLength,
 30 |     Schwa,
 31 |     Stress,
 32 |     Vowel,
 33 |     VowelHeight,
 34 |     VowelPlacement,
 35 | )
 36 | from gruut_ipa.utils import resolve_lang
 37 | 
 38 | _LOGGER = logging.getLogger("gruut_ipa")
 39 | 
 40 | # -----------------------------------------------------------------------------
 41 | 
 42 | 
 43 | class Phone:
 44 |     """Single IPA phone with diacritics and suprasegmentals"""
 45 | 
 46 |     def __init__(
 47 |         self,
 48 |         letters: str,
 49 |         stress: typing.Optional[Stress] = None,
 50 |         accents: typing.Optional[typing.Iterable[Accent]] = None,
 51 |         is_long: bool = False,
 52 |         nasal: typing.Optional[typing.Set[int]] = None,
 53 |         raised: typing.Optional[typing.Set[int]] = None,
 54 |         diacritics: typing.Optional[typing.Dict[int, typing.Set[str]]] = None,
 55 |         suprasegmentals: typing.Optional[typing.Set[str]] = None,
 56 |         tone: str = "",
 57 |     ):
 58 |         self.letters: str = unicodedata.normalize("NFC", letters)
 59 |         self.stress = stress
 60 |         self.accents: typing.List[Accent] = list(accents or [])
 61 |         self.is_long: bool = is_long
 62 | 
 63 |         self.nasal: typing.Set[int] = nasal or set()
 64 |         self.is_nasal = bool(self.nasal)
 65 | 
 66 |         self.raised: typing.Set[int] = raised or set()
 67 |         self.is_raised = bool(self.raised)
 68 | 
 69 |         self.tone: str = tone
 70 | 
 71 |         self.diacritics: typing.Dict[int, typing.Set[str]] = diacritics or defaultdict(
 72 |             set
 73 |         )
 74 |         self.suprasegmentals: typing.Set[str] = suprasegmentals or set()
 75 | 
 76 |         # Decompose suprasegmentals and diacritics
 77 |         if self.stress == Stress.PRIMARY:
 78 |             self.suprasegmentals.add(IPA.STRESS_PRIMARY)
 79 |         elif self.stress == Stress.SECONDARY:
 80 |             self.suprasegmentals.add(IPA.STRESS_SECONDARY)
 81 | 
 82 |         if Accent.ACUTE in self.accents:
 83 |             self.suprasegmentals.add(IPA.ACCENT_ACUTE)
 84 | 
 85 |         if Accent.GRAVE in self.accents:
 86 |             self.suprasegmentals.add(IPA.ACCENT_GRAVE)
 87 | 
 88 |         if self.is_long:
 89 |             self.suprasegmentals.add(IPA.LONG)
 90 | 
 91 |         # Nasal
 92 |         for letter_index in self.nasal:
 93 |             letter_diacritics = self.diacritics.get(letter_index)
 94 |             if letter_diacritics is None:
 95 |                 letter_diacritics = set()
 96 |                 self.diacritics[letter_index] = letter_diacritics
 97 | 
 98 |             letter_diacritics.add(IPA.NASAL)
 99 | 
100 |         # Raised
101 |         for letter_index in self.raised:
102 |             letter_diacritics = self.diacritics.get(letter_index)
103 |             if letter_diacritics is None:
104 |                 letter_diacritics = set()
105 |                 self.diacritics[letter_index] = letter_diacritics
106 | 
107 |             letter_diacritics.add(IPA.RAISED)
108 | 
109 |         self._text: str = ""
110 | 
111 |         self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
112 |         self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
113 |         self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)
114 | 
115 |     @property
116 |     def text(self) -> str:
117 |         """Get textual representation of phone (NFC normalized)"""
118 |         if self._text:
119 |             return self._text
120 | 
121 |         # Pre-letter suprasegmentals
122 |         for accent in self.accents:
123 |             if accent == Accent.ACUTE:
124 |                 self._text += IPA.ACCENT_ACUTE
125 |             elif accent == Accent.GRAVE:
126 |                 self._text += IPA.ACCENT_GRAVE
127 | 
128 |         if self.stress == Stress.PRIMARY:
129 |             self._text += IPA.STRESS_PRIMARY
130 |         elif self.stress == Stress.SECONDARY:
131 |             self._text += IPA.STRESS_SECONDARY
132 | 
133 |         # Letters and diacritics
134 |         for letter_index, letter in enumerate(self.letters):
135 |             self._text += letter
136 | 
137 |             # Diacritics
138 |             for diacritic in self.diacritics.get(letter_index, []):
139 |                 self._text += diacritic
140 | 
141 |         # Tone
142 |         if self.tone:
143 |             self._text += self.tone
144 | 
145 |         # Post-letter suprasegmentals
146 |         if self.is_long:
147 |             self._text += IPA.LONG
148 | 
149 |         # Re-normalize and combine
150 |         self._text = unicodedata.normalize("NFC", self._text)
151 | 
152 |         return self._text
153 | 
154 |     @property
155 |     def is_vowel(self) -> bool:
156 |         """True if phone is a vowel"""
157 |         return self.vowel is not None
158 | 
159 |     @property
160 |     def is_consonant(self) -> bool:
161 |         """True if phone is a consonant"""
162 |         return self.consonant is not None
163 | 
164 |     @property
165 |     def is_schwa(self) -> bool:
166 |         """True if phone is a schwa"""
167 |         return self.schwa is not None
168 | 
169 |     def __repr__(self) -> str:
170 |         return self.text
171 | 
172 |     @staticmethod
173 |     def from_string(phone_str: str) -> "Phone":
174 |         """Parse phone from string"""
175 |         # Decompose into base and combining characters
176 |         codepoints = unicodedata.normalize("NFD", phone_str)
177 |         kwargs: typing.Dict[str, typing.Any] = {
178 |             "letters": "",
179 |             "diacritics": defaultdict(set),
180 |             "tone": "",
181 |             "accents": [],
182 |             "nasal": set(),
183 |             "raised": set(),
184 |         }
185 | 
186 |         in_tone = False
187 |         new_letter = False
188 |         letter_index = 0
189 | 
190 |         for c in codepoints:
191 |             # Check for stress
192 |             if (c == IPA.ACCENT_ACUTE) and not in_tone:
193 |                 kwargs["accents"].append(Accent.ACUTE)
194 |             elif (c == IPA.ACCENT_GRAVE) and not in_tone:
195 |                 kwargs["accents"].append(Accent.GRAVE)
196 |             elif c == IPA.STRESS_PRIMARY:
197 |                 kwargs["stress"] = Stress.PRIMARY
198 |             elif c == IPA.STRESS_SECONDARY:
199 |                 kwargs["stress"] = Stress.SECONDARY
200 |             elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
201 |                 # Interpret as part of tone
202 |                 kwargs["tone"] += c
203 |             elif IPA.is_long(c):
204 |                 # Check for elongation
205 |                 kwargs["is_long"] = True
206 |             elif IPA.is_nasal(c):
207 |                 # Check for nasalation
208 |                 kwargs["nasal"].add(letter_index)
209 |             elif IPA.is_raised(c):
210 |                 # Check for raised articulation
211 |                 kwargs["raised"].add(letter_index)
212 |             elif IPA.is_bracket(c) or IPA.is_break(c):
213 |                 # Skip brackets/syllable breaks
214 |                 pass
215 |             elif IPA.is_tie(c):
216 |                 # Keep ties in letters
217 |                 kwargs["letters"] += c
218 |                 letter_index += 1
219 |             elif IPA.is_tone(c):
220 |                 # Tone numbers/letters
221 |                 kwargs["tone"] += c
222 |                 in_tone = True
223 |             elif unicodedata.combining(c) > 0:
224 |                 # Stow some diacritics that we don't do anything with
225 |                 kwargs["diacritics"][letter_index].add(c)
226 |             else:
227 |                 # Include all other characters in letters
228 |                 kwargs["letters"] += c
229 |                 if new_letter:
230 |                     letter_index += 1
231 | 
232 |                 new_letter = True
233 | 
234 |         return Phone(**kwargs)
235 | 
236 | 
237 | # -----------------------------------------------------------------------------
238 | 
239 | 
240 | class Pronunciation:
241 |     """Collection of phones and breaks for some unit of text (word, sentence, etc.)"""
242 | 
243 |     def __init__(
244 |         self, phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]]
245 |     ):
246 |         self.phones_and_others = phones_and_others
247 | 
248 |         self.phones: typing.List[Phone] = []
249 |         self.breaks: typing.List[Break] = []
250 |         self.intonations: typing.List[Intonation] = []
251 | 
252 |         # Decompose into phones, breaks, and intonations
253 |         for p in self.phones_and_others:
254 |             if isinstance(p, Phone):
255 |                 self.phones.append(p)
256 |             elif isinstance(p, Break):
257 |                 self.breaks.append(p)
258 |             elif isinstance(p, Intonation):
259 |                 self.intonations.append(p)
260 | 
261 |         self._text = ""
262 | 
263 |     @property
264 |     def text(self) -> str:
265 |         """Get text representation of pronunciation (NFC normalized)"""
266 |         if not self._text:
267 |             self._text = "".join(p.text for p in self.phones_and_others)
268 | 
269 |         return self._text
270 | 
271 |     def __repr__(self) -> str:
272 |         return self.text
273 | 
274 |     def __iter__(self):
275 |         return iter(self.phones_and_others)
276 | 
277 |     def __getitem__(self, idx):
278 |         return self.phones_and_others[idx]
279 | 
280 |     @staticmethod
281 |     def from_string(
282 |         pron_str: str,
283 |         keep_stress: bool = True,
284 |         keep_accents: typing.Optional[bool] = None,
285 |         drop_tones: bool = False,
286 |         keep_ties: bool = True,
287 |     ) -> "Pronunciation":
288 |         """Split an IPA pronunciation into phones.
289 | 
290 |         Stress/accent markers bind to the next non-combining codepoint (e.g., ˈa).
291 |         Elongation markers bind to the previous non-combining codepoint (e.g., aː).
292 |         Ties join two non-combining sequences (e.g. t͡ʃ).
293 | 
294 |         Whitespace and brackets are skipped.
295 | 
296 |         Returns list of phones.
297 |         """
298 |         if keep_accents is None:
299 |             keep_accents = keep_stress
300 | 
301 |         clusters = []
302 |         cluster = ""
303 |         stress = ""
304 |         is_stress = False
305 |         accents = ""
306 |         is_accent = False
307 |         tone = ""
308 |         in_tone = False
309 |         skip_next_cluster = False
310 | 
311 |         codepoints = unicodedata.normalize("NFD", pron_str)
312 | 
313 |         for codepoint in codepoints:
314 |             new_cluster = False
315 |             is_stress = False
316 |             is_accent = False
317 | 
318 |             if (
319 |                 codepoint.isspace()
320 |                 or IPA.is_bracket(codepoint)
321 |                 or (codepoint in {IPA.BREAK_SYLLABLE})
322 |             ):
323 |                 # Skip whitespace, brackets, and syllable breaks
324 |                 continue
325 | 
326 |             if IPA.is_break(codepoint) or IPA.is_intonation(codepoint):
327 |                 # Keep minor/major/word breaks and intonation markers
328 |                 new_cluster = True
329 | 
330 |             if IPA.is_accent(codepoint) and not in_tone:
331 |                 is_accent = True
332 |                 if cluster:
333 |                     new_cluster = True
334 |                     skip_next_cluster = True
335 |             elif IPA.is_stress(codepoint):
336 |                 is_stress = True
337 |                 if cluster:
338 |                     new_cluster = True
339 |                     skip_next_cluster = True
340 |             elif in_tone and (codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
341 |                 # Interpret as part of tone
342 |                 if not drop_tones:
343 |                     tone += codepoint
344 | 
345 |                 continue
346 |             elif IPA.is_long(codepoint):
347 |                 # Add to current cluster
348 |                 pass
349 |             elif IPA.is_tie(codepoint):
350 |                 if keep_ties:
351 |                     # Add next non-combining to current cluster
352 |                     skip_next_cluster = True
353 |                 else:
354 |                     # Ignore ties
355 |                     continue
356 |             elif IPA.is_tone(codepoint):
357 |                 # Add to end of current cluster
358 |                 if not drop_tones:
359 |                     tone += codepoint
360 | 
361 |                 in_tone = True
362 |                 continue
363 |             elif unicodedata.combining(codepoint) == 0:
364 |                 # Non-combining character
365 |                 if skip_next_cluster:
366 |                     # Add to current cluster
367 |                     skip_next_cluster = False
368 |                 elif cluster:
369 |                     # Start a new cluster
370 |                     new_cluster = True
371 | 
372 |             if new_cluster and cluster:
373 |                 clusters.append(accents + stress + cluster + tone)
374 |                 accents = ""
375 |                 stress = ""
376 |                 cluster = ""
377 |                 tone = ""
378 | 
379 |             if is_accent:
380 |                 if keep_accents:
381 |                     accents += codepoint
382 |             elif is_stress:
383 |                 if keep_stress:
384 |                     stress += codepoint
385 |             else:
386 |                 cluster += codepoint
387 | 
388 |         if cluster:
389 |             clusters.append(accents + stress + cluster + tone)
390 | 
391 |         phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]] = []
392 |         for cluster in clusters:
393 |             if IPA.is_break(cluster):
394 |                 phones_and_others.append(Break.from_string(cluster))
395 |             elif IPA.is_intonation(cluster):
396 |                 phones_and_others.append(Intonation.from_string(cluster))
397 |             else:
398 |                 phones_and_others.append(Phone.from_string(cluster))
399 | 
400 |         return Pronunciation(phones_and_others)
401 | 
402 | 
403 | # -----------------------------------------------------------------------------
404 | 
405 | 
406 | class Phoneme:
407 |     """Phoneme composed of international phonetic alphabet symbols"""
408 | 
409 |     def __init__(
410 |         self,
411 |         text: str,
412 |         example: str = "",
413 |         unknown: bool = False,
414 |         tones: typing.Optional[typing.Iterable[str]] = None,
415 |         is_ipa: bool = True,
416 |     ):
417 |         self._text = ""
418 |         self._text_compare = ""
419 |         self.example = example
420 |         self.unknown = unknown
421 | 
422 |         # List of allowable tones for phoneme
423 |         self.tones = list(tones or [])
424 | 
425 |         self.stress: typing.Optional[Stress] = None
426 |         self.accents: typing.List[Accent] = []
427 |         self.elongated: bool = False
428 |         self.nasalated: typing.Set[int] = set()
429 |         self.raised: typing.Set[int] = set()
430 |         self._extra_combining: typing.Dict[int, typing.List[str]] = defaultdict(list)
431 | 
432 |         # Decompose into base and combining characters
433 |         codepoints = unicodedata.normalize("NFD", text)
434 |         self.letters = ""
435 |         self.tone = ""
436 | 
437 |         if is_ipa:
438 |             in_tone = False
439 |             letter_index = 0
440 |             new_letter = False
441 | 
442 |             for c in codepoints:
443 |                 # Check for stress
444 |                 if (c == IPA.ACCENT_ACUTE) and (not in_tone):
445 |                     self.accents.append(Accent.ACUTE)
446 |                 elif (c == IPA.ACCENT_GRAVE) and (not in_tone):
447 |                     self.accents.append(Accent.GRAVE)
448 |                 elif c == IPA.STRESS_PRIMARY:
449 |                     self.stress = Stress.PRIMARY
450 |                 elif c == IPA.STRESS_SECONDARY:
451 |                     self.stress = Stress.SECONDARY
452 |                 elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
453 |                     # Interpret as part of tone
454 |                     self.tone += c
455 |                 elif IPA.is_long(c):
456 |                     # Check for elongation
457 |                     self.elongated = True
458 |                 elif IPA.is_nasal(c):
459 |                     # Check for nasalation
460 |                     self.nasalated.add(letter_index)
461 |                 elif IPA.is_raised(c):
462 |                     # Check for raised articulation
463 |                     self.raised.add(letter_index)
464 |                 elif IPA.is_bracket(c) or IPA.is_break(c):
465 |                     # Skip brackets/syllable breaks
466 |                     pass
467 |                 elif IPA.is_tone(c):
468 |                     # Keep tone separate
469 |                     self.tone += c
470 |                     in_tone = True
471 |                 elif c in {IPA.SYLLABIC, IPA.NON_SYLLABIC, IPA.EXTRA_SHORT}:
472 |                     # Stow some diacritics that we don't do anything with
473 |                     self._extra_combining[letter_index].append(c)
474 |                 else:
475 |                     # Include all other characters in base
476 |                     self.letters += c
477 | 
478 |                     if new_letter:
479 |                         letter_index += 1
480 | 
481 |                     new_letter = True
482 |         else:
483 |             self.letters = text
484 | 
485 |         # Re-normalize and combine letters
486 |         self.letters = unicodedata.normalize("NFC", self.letters)
487 |         self.letters_graphemes = IPA.graphemes(self.letters)
488 | 
489 |         # Categorize
490 |         self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
491 |         self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
492 |         self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)
493 |         self.dipthong: typing.Optional[Dipthong] = None
494 | 
495 |         if (
496 |             (not self.vowel)
497 |             and (not self.consonant)
498 |             and (not self.schwa)
499 |             and (len(self.letters) == 2)
500 |         ):
501 |             # Check if dipthong (two vowels)
502 |             vowel1 = VOWELS.get(self.letters[0])
503 |             vowel2 = VOWELS.get(self.letters[1])
504 |             if vowel1 and vowel2:
505 |                 self.dipthong = Dipthong(vowel1, vowel2)
506 | 
507 |     @property
508 |     def text(self) -> str:
509 |         """Return letters with stress and elongation (NFC normalized)"""
510 |         if self._text:
511 |             return self._text
512 | 
513 |         for accent in self.accents:
514 |             if accent == Accent.ACUTE:
515 |                 self._text += IPA.ACCENT_ACUTE
516 |             elif accent == Accent.GRAVE:
517 |                 self._text += IPA.ACCENT_GRAVE
518 | 
519 |         if self.stress == Stress.PRIMARY:
520 |             self._text += IPA.STRESS_PRIMARY
521 |         elif self.stress == Stress.SECONDARY:
522 |             self._text += IPA.STRESS_SECONDARY
523 | 
524 |         for letter_index, letter in enumerate(self.letters):
525 |             self._text += letter
526 | 
527 |             if letter_index in self.nasalated:
528 |                 self._text += IPA.NASAL
529 | 
530 |             if letter_index in self.raised:
531 |                 self._text += IPA.RAISED
532 | 
533 |             for c in self._extra_combining[letter_index]:
534 |                 self._text += c
535 | 
536 |         if self.tone:
537 |             self._text += self.tone
538 | 
539 |         if self.elongated:
540 |             self._text += IPA.LONG
541 | 
542 |         # Re-normalize and combine
543 |         self._text = unicodedata.normalize("NFC", self._text)
544 | 
545 |         return self._text
546 | 
547 |     @property
548 |     def text_compare(self) -> str:
549 |         """Return letters and elongation with no stress/tones (NFC normalized)"""
550 |         if self._text_compare:
551 |             return self._text_compare
552 | 
553 |         for letter_index, letter in enumerate(self.letters):
554 |             self._text_compare += letter
555 | 
556 |             if letter_index in self.nasalated:
557 |                 self._text_compare += IPA.NASAL
558 | 
559 |             if letter_index in self.raised:
560 |                 self._text_compare += IPA.RAISED
561 | 
562 |             for c in self._extra_combining[letter_index]:
563 |                 self._text_compare += c
564 | 
565 |         if self.elongated:
566 |             self._text_compare += IPA.LONG
567 | 
568 |         # Re-normalize and combine
569 |         self._text_compare = unicodedata.normalize("NFC", self._text_compare)
570 | 
571 |         return self._text_compare
572 | 
573 |     def copy(self) -> "Phoneme":
574 |         """Create a copy of this phonemes"""
575 |         return Phoneme(text=self.text, example=self.example, unknown=self.unknown)
576 | 
577 |     def __repr__(self) -> str:
578 |         """Return symbol with stress and elongation."""
579 |         return self.text
580 | 
581 |     def to_dict(self) -> typing.Dict[str, typing.Any]:
582 |         """Return properties of phoneme as a dict"""
583 |         type_name = "Phoneme"
584 |         props: typing.Dict[str, typing.Any] = {
585 |             "text": repr(self),
586 |             "letters": self.letters,
587 |             "tone": self.tone,
588 |             "tones": self.tones,
589 |         }
590 | 
591 |         if self.unknown:
592 |             props["unknown"] = True
593 | 
594 |         if self.example:
595 |             props["example"] = self.example
596 | 
597 |         props["accents"] = [a.value for a in self.accents]
598 |         props["stress"] = self.stress.value if self.stress is not None else ""
599 | 
600 |         if self.vowel:
601 |             type_name = "Vowel"
602 |             props["height"] = self.vowel.height.value
603 |             props["placement"] = self.vowel.placement.value
604 |             props["rounded"] = self.vowel.rounded
605 |         elif self.consonant:
606 |             type_name = "Consonant"
607 |             props["type"] = self.consonant.type.value
608 |             props["place"] = self.consonant.place.value
609 |             props["voiced"] = self.consonant.voiced
610 |         elif self.dipthong:
611 |             type_name = "Dipthong"
612 |         elif self.schwa:
613 |             type_name = "Schwa"
614 |             props["r_coloured"] = self.schwa.r_coloured
615 | 
616 |         props["type"] = type_name
617 | 
618 |         props["nasalated"] = list(self.nasalated)
619 |         props["raised"] = list(self.raised)
620 |         props["elongated"] = self.elongated
621 | 
622 |         return props
623 | 
624 |     def to_string(self) -> str:
625 |         """Return descriptive string of phoneme"""
626 |         props = self.to_dict()
627 |         type_name = props.get("type", "Phoneme")
628 | 
629 |         prop_strs = [f"{k}={v}" for k, v in props.items()]
630 | 
631 |         return f"{type_name}(" + ", ".join(prop_strs) + ")"
632 | 
633 | 
634 | # -----------------------------------------------------------------------------
635 | 
636 | 
637 | class Phonemes:
638 |     """Set of phonemes and allophones for a language"""
639 | 
640 |     COMMENT_STR = "#"
641 | 
642 |     def __init__(self, phonemes=None, ipa_map=None):
643 |         self.phonemes = phonemes or []
644 |         self.ipa_map = ipa_map or {}
645 | 
646 |         # Regex for replacing IPA
647 |         self._ipa_map_regex = None
648 | 
649 |         # Phonemes sorted by descreasing length
650 |         self._phonemes_sorted = None
651 | 
652 |         # Map from original phoneme to gruut IPA
653 |         self.gruut_ipa_map: typing.Dict[str, str] = {}
654 | 
655 |         self.phoneme_texts: typing.Set[str] = {}
656 |         self.update()
657 | 
658 |     def __iter__(self):
659 |         return iter(self.phonemes)
660 | 
661 |     def __len__(self):
662 |         return len(self.phonemes)
663 | 
664 |     def __getitem__(self, key):
665 |         return self.phonemes[key]
666 | 
667 |     def __contains__(self, item):
668 |         if isinstance(item, str):
669 |             # Compare IPA text
670 |             return item in self.phoneme_texts
671 | 
672 |         return item in self.phonemes
673 | 
674 |     @staticmethod
675 |     def from_language(language: str) -> "Phonemes":
676 |         """Load phonemes for a given language"""
677 |         language = resolve_lang(language)
678 | 
679 |         # Load phonemes themselves
680 |         phonemes_path = _DATA_DIR / language / "phonemes.txt"
681 |         with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
682 |             phonemes = Phonemes.from_text(phonemes_file)
683 | 
684 |         # Try to load optional map from original phoneme to gruut IPA
685 |         gruut_ipa_map: typing.Optional[typing.Dict[str, str]] = None
686 |         map_path = _DATA_DIR / language / "ipa_map.txt"
687 |         if map_path.is_file():
688 |             gruut_ipa_map = {}
689 |             with open(map_path, "r", encoding="utf-8") as map_file:
690 |                 for line in map_file:
691 |                     line = line.strip()
692 |                     if not line:
693 |                         continue
694 | 
695 |                     from_phoneme, to_ipa = line.split(maxsplit=1)
696 |                     gruut_ipa_map[from_phoneme] = to_ipa
697 | 
698 |         if gruut_ipa_map:
699 |             phonemes.gruut_ipa_map = gruut_ipa_map
700 | 
701 |         return phonemes
702 | 
703 |     @staticmethod
704 |     def from_text(text_file) -> "Phonemes":
705 |         """Load text file with phonemes, examples, and allophones"""
706 |         lang = Phonemes()
707 | 
708 |         for line in text_file:
709 |             # Remove comments
710 |             line, *_ = line.split(Phonemes.COMMENT_STR, maxsplit=1)
711 |             line = line.strip()
712 |             if line:
713 |                 # phoneme [example] [allophone] [allophone] ! [tone] [tone]...
714 |                 parts = line.split()
715 |                 phoneme_ipa = parts[0]
716 |                 example = ""
717 | 
718 |                 if len(parts) > 1:
719 |                     example = parts[1]
720 | 
721 |                 tones = []
722 |                 if len(parts) > 2:
723 |                     in_tone = False
724 | 
725 |                     # Map allophone back to phoneme
726 |                     for part in parts[2:]:
727 |                         if part == "!":
728 |                             # Begin possible tones for this phoneme
729 |                             in_tone = True
730 |                         elif in_tone:
731 |                             tones.append(part)
732 |                         else:
733 |                             lang.ipa_map[part] = phoneme_ipa
734 | 
735 |                 lang.phonemes.append(
736 |                     Phoneme(text=phoneme_ipa, example=example, tones=tones)
737 |                 )
738 | 
739 |         lang.update()
740 | 
741 |         return lang
742 | 
743 |     def update(self):
744 |         """Call after modifying phonemes or IPA map to re-sort"""
745 |         # Create single regex that will be used to replace IPA.
746 |         # The final regex is of the form (AAA|BB|C) where each case is in
747 |         # decreasing length order.
748 |         #
749 |         # If the replacement is not a substring of any phonemes, then the
750 |         # replacement is straightforward.
751 |         #
752 |         # If it is a substring of some phoneme, however, we need to be careful.
753 |         # For example, naively replacing "e" with "eɪ" in the string "beɪ" will
754 |         # produce "beeɪ" when we want it to be "beɪ".
755 |         #
756 |         # So the substring case becomes "e(?!ɪ)" which uses a negative lookahead
757 |         # to avoid the problem.
758 |         cases = []
759 |         for match_text in sorted(self.ipa_map.keys(), key=len, reverse=True):
760 |             if match_text.startswith(","):
761 |                 # Raw regex
762 |                 cases.append(match_text[1:])
763 |                 continue
764 | 
765 |             # Check against all of the phonemes
766 |             case_added = False
767 |             for phoneme in self.phonemes:
768 |                 num_extra = len(phoneme.text) - len(match_text)
769 |                 if (num_extra > 0) and phoneme.text.startswith(match_text):
770 |                     # Use negative lookahead to avoid replacing part of a valid
771 |                     # phoneme.
772 |                     cases.append(
773 |                         "{}(?!{})".format(
774 |                             re.escape(match_text[:num_extra]),
775 |                             re.escape(phoneme.text[num_extra:]),
776 |                         )
777 |                     )
778 | 
779 |                     case_added = True
780 |                     break
781 | 
782 |             if not case_added:
783 |                 # No substring problem
784 |                 cases.append(re.escape(match_text))
785 | 
786 |         ipa_map_regex_str = "({})".format("|".join(cases))
787 |         self._ipa_map_regex = re.compile(ipa_map_regex_str)
788 | 
789 |         # Split phonemes and sort by reverse length
790 |         split_phonemes = [
791 |             ([pb.text for pb in Pronunciation.from_string(p.text)], p)
792 |             for p in self.phonemes
793 |         ]
794 | 
795 |         self._phonemes_sorted = sorted(
796 |             split_phonemes, key=lambda kp: len(kp[0]), reverse=True
797 |         )
798 | 
799 |         # Update IPA texts set for phonemes
800 |         self.phoneme_texts = set(p.text for p in self.phonemes)
801 | 
802 |     def split(
803 |         self,
804 |         pron_str: typing.Union[str, Pronunciation],
805 |         keep_stress: bool = True,
806 |         keep_accents: typing.Optional[bool] = None,
807 |         drop_tones: bool = False,
808 |         is_ipa: bool = True,
809 |     ) -> typing.List[Phoneme]:
810 |         """Split an IPA pronunciation into phonemes"""
811 |         if not self._ipa_map_regex:
812 |             self.update()
813 | 
814 |         if keep_accents is None:
815 |             keep_accents = keep_stress
816 | 
817 |         word_phonemes: typing.List[Phoneme] = []
818 | 
819 |         if self.ipa_map:
820 |             if isinstance(pron_str, Pronunciation):
821 |                 pron_str = "".join(p.text for p in pron_str)
822 | 
823 |             def handle_replace(match):
824 |                 text = match.group(1)
825 |                 return self.ipa_map.get(text, text)
826 | 
827 |             pron_str = self._ipa_map_regex.sub(handle_replace, pron_str)
828 | 
829 |         # Get text for IPA phones
830 |         if isinstance(pron_str, Pronunciation):
831 |             # Use supplied pronunication
832 |             ipas = [pb.text for pb in pron_str]
833 |         elif is_ipa:
834 |             # Split string into pronunciation
835 |             pron = Pronunciation.from_string(
836 |                 pron_str,
837 |                 keep_stress=keep_stress,
838 |                 keep_accents=keep_accents,
839 |                 drop_tones=drop_tones,
840 |             )
841 |             ipas = [pb.text for pb in pron]
842 |         else:
843 |             ipas = IPA.graphemes(pron_str)
844 | 
845 |         # Keep stress and tones separate to make phoneme comparisons easier
846 |         ipa_stress: typing.Dict[int, str] = defaultdict(str)
847 |         ipa_tones: typing.Dict[int, str] = defaultdict(str)
848 | 
849 |         if is_ipa:
850 |             in_tone = False
851 |             for ipa_idx, ipa in enumerate(ipas):
852 |                 if ipa:
853 |                     keep_ipa = ""
854 |                     for codepoint in ipa:
855 |                         if IPA.is_accent(codepoint) and (not in_tone):
856 |                             if keep_accents:
857 |                                 ipa_stress[ipa_idx] += codepoint
858 |                         elif IPA.is_stress(codepoint):
859 |                             if keep_stress:
860 |                                 ipa_stress[ipa_idx] += codepoint
861 |                         elif in_tone and (
862 |                             codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
863 |                         ):
864 |                             # Interpret as part of time
865 |                             if not drop_tones:
866 |                                 ipa_tones[ipa_idx] += codepoint
867 |                         elif IPA.is_tone(codepoint):
868 |                             if not drop_tones:
869 |                                 ipa_tones[ipa_idx] += codepoint
870 | 
871 |                             in_tone = True
872 |                         else:
873 |                             keep_ipa += codepoint
874 | 
875 |                     ipas[ipa_idx] = keep_ipa
876 | 
877 |         num_ipas: int = len(ipas)
878 | 
879 |         # ---------------------------------------------------------------------
880 | 
881 |         # pylint: disable=consider-using-enumerate
882 |         for ipa_idx in range(len(ipas)):
883 |             ipa = ipas[ipa_idx]
884 |             if ipa is None:
885 |                 # Skip replaced piece
886 |                 continue
887 | 
888 |             phoneme_match = False
889 |             for phoneme_ipas, phoneme in self._phonemes_sorted:
890 |                 if ipa_idx <= (num_ipas - len(phoneme_ipas)):
891 |                     phoneme_match = True
892 |                     phoneme_stress = ""
893 |                     phoneme_tones = ""
894 | 
895 |                     # Look forward into sequence
896 |                     for phoneme_idx in range(len(phoneme_ipas)):
897 |                         phoneme_stress += ipa_stress[ipa_idx + phoneme_idx]
898 |                         phoneme_tones += ipa_tones[ipa_idx + phoneme_idx]
899 | 
900 |                         if phoneme_ipas[phoneme_idx] != ipas[ipa_idx + phoneme_idx]:
901 |                             phoneme_match = False
902 |                             break
903 | 
904 |                     if phoneme_match:
905 |                         # Successful match
906 |                         if phoneme_stress or phoneme_tones:
907 |                             # Create a copy of the phoneme with applied stress/tones
908 |                             phoneme = Phoneme(
909 |                                 text=(phoneme_stress + phoneme.text + phoneme_tones),
910 |                                 example=phoneme.example,
911 |                             )
912 | 
913 |                         word_phonemes.append(phoneme)
914 | 
915 |                         # Patch ipas to skip replaced pieces
916 |                         for phoneme_idx in range(1, len(phoneme_ipas)):
917 |                             ipas[ipa_idx + phoneme_idx] = None
918 | 
919 |                         break
920 | 
921 |             if not phoneme_match:
922 |                 # Add unknown phoneme
923 |                 word_phonemes.append(Phoneme(text=ipa, unknown=True))
924 | 
925 |         return word_phonemes
926 | 


--------------------------------------------------------------------------------
/gruut_ipa/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rhasspy/gruut-ipa/ec9ae6ce7cca0103d9a563fcde3352f805a8e27e/gruut_ipa/py.typed


--------------------------------------------------------------------------------
/gruut_ipa/sampa.py:
--------------------------------------------------------------------------------
  1 | """Mapping between IPA and Sampa"""
  2 | import re
  3 | import unicodedata
  4 | 
  5 | # http://www.blahedo.org/ascii-ipa.html
  6 | 
  7 | 
  8 | def ipa_to_sampa(ipa: str) -> str:
  9 |     """Convert IPA string to sampa phonemes"""
 10 |     ipa_codepoints = unicodedata.normalize("NFD", ipa)
 11 | 
 12 |     return IPA_PATTERN.sub(
 13 |         lambda match: IPA_TO_SAMPA.get(match.group(1), ""), ipa_codepoints
 14 |     )
 15 | 
 16 | 
 17 | def sampa_to_ipa(sampa: str) -> str:
 18 |     """Convert sampa phonemes to IPA phones"""
 19 |     sampa_codepoints = unicodedata.normalize("NFD", sampa)
 20 | 
 21 |     return SAMPA_PATTERN.sub(
 22 |         lambda match: SAMPA_TO_IPA.get(match.group(1), ""), sampa_codepoints
 23 |     )
 24 | 
 25 | 
 26 | # -----------------------------------------------------------------------------
 27 | 
 28 | IPA_TO_SAMPA = {
 29 |     "\u0061": "a",
 30 |     "\u0250": "6",
 31 |     "\u0251": "A",
 32 |     "\u0252": "Q",
 33 |     "\u00e6": "{",
 34 |     "\u028c": "V",
 35 |     "\u0062": "b",
 36 |     "\u0253": "",
 37 |     "\u0299": "B\\",
 38 |     "\u03b2": "B",
 39 |     "\u0063": "c",
 40 |     "\u00e7": "C",
 41 |     "\u0063\u0327": "C",
 42 |     "\u0255": "s\\",
 43 |     "\u0064": "d",
 44 |     "\u0257": "",
 45 |     "\u0256": "d`",
 46 |     "\u00f0": "D",
 47 |     "\u0065": "e",
 48 |     "\u0259": "@",
 49 |     "\u025a": "@`",
 50 |     "\u0258": "@\\",
 51 |     "\u025b": "E",
 52 |     "\u025c": "3",
 53 |     "\u025d": "@`",
 54 |     "\u025e": "3\\",
 55 |     "\u0066": "f",
 56 |     "\u0261": "g",
 57 |     "\u0067": "g",
 58 |     "\u0260": "",
 59 |     "\u0262": "G\\",
 60 |     "\u029b": "G\\_<",
 61 |     "\u0263": "G",
 62 |     "\u02e0": "_G",
 63 |     "\u0264": "7",
 64 |     "\u0068": "h",
 65 |     "\u02b0": "_h",
 66 |     "\u0127": "X\\",
 67 |     "\u0266": "h\\",
 68 |     "\u0267": "x\\",
 69 |     "\u0265": "H",
 70 |     "\u029c": "H\\",
 71 |     "\u0069": "i",
 72 |     "\u0268": "1",
 73 |     "\u026a": "I",
 74 |     "\u006a": "j",
 75 |     "\u02b2": "', _j",
 76 |     "\u029d": "j\\",
 77 |     "\u025f": "J\\",
 78 |     "\u0284": "J\\_<",
 79 |     "\u006b": "k",
 80 |     "\u006c": "l",
 81 |     "\u026b": "5",
 82 |     "\u026c": "K",
 83 |     "\u026d": "l`",
 84 |     "\u026e": "K\\",
 85 |     "\u029f": "L\\",
 86 |     "\u006d": "m",
 87 |     "\u0271": "F",
 88 |     "\u026f": "M",
 89 |     "\u0270": "M\\",
 90 |     "\u006e": "n",
 91 |     "\u0272": "J",
 92 |     "\u014b": "N",
 93 |     "\u0273": "n`",
 94 |     "\u0274": "N\\",
 95 |     "\u006f": "o",
 96 |     "\u0298": "O\\",
 97 |     "\u0275": "8",
 98 |     "\u00f8": "2",
 99 |     "\u0153": "9",
100 |     "\u0276": "&",
101 |     "\u0254": "O",
102 |     "\u0070": "p",
103 |     "\u0278": "p\\",
104 |     "\u0071": "q",
105 |     "\u0072": "r",
106 |     "\u027e": "4",
107 |     "\u027c": "",
108 |     "\u027d": "r`",
109 |     "\u0279": "r\\",
110 |     "\u027b": "r\\`",
111 |     "\u027a": "l\\",
112 |     "\u0280": "R\\",
113 |     "\u0281": "R",
114 |     "\u0073": "s",
115 |     "\u0282": "s`",
116 |     "\u0283": "S",
117 |     "\u0074": "t",
118 |     "\u0288": "t`",
119 |     "\u03b8": "T",
120 |     "\u0075": "u",
121 |     "\u0289": "}",
122 |     "\u028a": "U",
123 |     "\u0076": "v",
124 |     "\u028b": "v\\",
125 |     "\u0077": "w",
126 |     "\u02b7": "_w",
127 |     "\u028d": "W",
128 |     "\u0078": "x",
129 |     "\u03c7": "X",
130 |     "\u0079": "y",
131 |     "\u028e": "L",
132 |     "\u028f": "Y",
133 |     "\u007a": "z",
134 |     "\u0291": "z\\",
135 |     "\u0290": "z`",
136 |     "\u0292": "Z",
137 |     "\u0294": "?",
138 |     "\u02a1": ">\\",
139 |     "\u0295": "?\\",
140 |     "\u02a2": "<\\",
141 |     "\u02e4": "_?\\",
142 |     "\u01c3": "!\\",
143 |     "\u01c0": "|\\",
144 |     "\u01c1": "|\\|\\",
145 |     "\u0320": "_-",
146 |     "\u032a": "_d",
147 |     "\u033a": "_a",
148 |     "\u031f": "_+",
149 |     "\u031d": "_r",
150 |     "\u031e": "_o",
151 |     "\u02c8": '"',
152 |     "\u02cc": "%",
153 |     "\u031a": "_}",
154 |     "\u002e": "",
155 |     "\u02d1": ":\\",
156 |     "\u0308": '_"',
157 |     "\u0324": "_t",
158 |     "\u02d0": ":",
159 |     "\u02bc": "",
160 |     "\u0325": "_0",
161 |     "\u030a": "",
162 |     "\u031c": "_c",
163 |     "\u0339": "_O",
164 |     "\u0303": "~, _~",
165 |     "\u0334": "_e",
166 |     "\u0330": "_k",
167 |     "\u032c": "_v",
168 |     "\u0306": "_X",
169 |     "\u032f": "_^",
170 |     "\u033d": "",
171 |     "\u02de": "`",
172 |     "\u033b": "_m",
173 |     "\u0318": "_A",
174 |     "\u0319": "_q",
175 |     "\u033c": "_N",
176 |     "\u2197": "<R>",
177 |     "\u2191": "^",
178 |     "\u2198": "",
179 |     "\u2193": "!",
180 |     "\u030f": "_B",
181 |     "\u0300": "_L",
182 |     "\u0304": "_M",
183 |     "\u0301": "_H",
184 |     "\u030b": "_T",
185 |     #
186 |     # Ties
187 |     "\u0361": "",
188 |     "\u035C": "",
189 |     #
190 |     # Tied symbols
191 |     "\u0288\u0361\u0282": "ts`",
192 |     "\u0256\u0361\u0290": "dz`",
193 |     "\u006b\u0361\u0078": "k_x",
194 |     #
195 |     # Breaks
196 |     "|": "",
197 |     "\u2016": "",
198 |     "#": "",
199 | }
200 | 
201 | SAMPA_TO_IPA = {v: k for k, v in IPA_TO_SAMPA.items() if v}
202 | 
203 | # Regex disjunctions in descending length order
204 | SAMPA_PATTERN = re.compile(
205 |     "({})".format(
206 |         "|".join(
207 |             re.escape(sampa) for sampa in sorted(SAMPA_TO_IPA, key=len, reverse=True)
208 |         )
209 |     )
210 | )
211 | 
212 | IPA_PATTERN = re.compile(
213 |     "({})".format(
214 |         "|".join(re.escape(ipa) for ipa in sorted(IPA_TO_SAMPA, key=len, reverse=True))
215 |     )
216 | )
217 | 


--------------------------------------------------------------------------------
/gruut_ipa/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Utility methods"""
 3 | 
 4 | from gruut_ipa.constants import LANG_ALIASES
 5 | 
 6 | 
 7 | def resolve_lang(lang: str) -> str:
 8 |     """Resolve language with known aliases"""
 9 |     if "/" in lang:
10 |         lang, rest = lang.split("/", maxsplit=1)
11 |         lang = LANG_ALIASES.get(lang, lang)
12 |         return f"{lang}/{rest}"
13 | 
14 |     return LANG_ALIASES.get(lang, lang)
15 | 


--------------------------------------------------------------------------------
/img/ipa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rhasspy/gruut-ipa/ec9ae6ce7cca0103d9a563fcde3352f805a8e27e/img/ipa.png


--------------------------------------------------------------------------------
/img/ipa.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!-- Created with Inkscape (http://www.inkscape.org/) -->
  3 | 
  4 | <svg
  5 |    xmlns:dc="http://purl.org/dc/elements/1.1/"
  6 |    xmlns:cc="http://creativecommons.org/ns#"
  7 |    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  8 |    xmlns:svg="http://www.w3.org/2000/svg"
  9 |    xmlns="http://www.w3.org/2000/svg"
 10 |    xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
 11 |    xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
 12 |    width="168.8185mm"
 13 |    height="86.899605mm"
 14 |    viewBox="0 0 168.8185 86.899605"
 15 |    version="1.1"
 16 |    id="svg8"
 17 |    inkscape:version="0.92.3 (2405546, 2018-03-11)"
 18 |    sodipodi:docname="ipa.svg"
 19 |    inkscape:export-filename="./ipa.png"
 20 |    inkscape:export-xdpi="90"
 21 |    inkscape:export-ydpi="90">
 22 |   <defs
 23 |      id="defs2" />
 24 |   <sodipodi:namedview
 25 |      id="base"
 26 |      pagecolor="#ffffff"
 27 |      bordercolor="#666666"
 28 |      borderopacity="1.0"
 29 |      inkscape:pageopacity="1"
 30 |      inkscape:pageshadow="2"
 31 |      inkscape:zoom="0.98994949"
 32 |      inkscape:cx="292.80513"
 33 |      inkscape:cy="96.308912"
 34 |      inkscape:document-units="mm"
 35 |      inkscape:current-layer="layer1"
 36 |      showgrid="false"
 37 |      inkscape:window-width="1440"
 38 |      inkscape:window-height="832"
 39 |      inkscape:window-x="0"
 40 |      inkscape:window-y="208"
 41 |      inkscape:window-maximized="1"
 42 |      inkscape:snap-global="false"
 43 |      fit-margin-top="5"
 44 |      fit-margin-left="5"
 45 |      fit-margin-right="5"
 46 |      fit-margin-bottom="5" />
 47 |   <metadata
 48 |      id="metadata5">
 49 |     <rdf:RDF>
 50 |       <cc:Work
 51 |          rdf:about="">
 52 |         <dc:format>image/svg+xml</dc:format>
 53 |         <dc:type
 54 |            rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
 55 |         <dc:title></dc:title>
 56 |       </cc:Work>
 57 |     </rdf:RDF>
 58 |   </metadata>
 59 |   <g
 60 |      inkscape:label="Layer 1"
 61 |      inkscape:groupmode="layer"
 62 |      id="layer1"
 63 |      transform="translate(15.832719,-17.488019)">
 64 |     <text
 65 |        xml:space="preserve"
 66 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
 67 |        x="53.402088"
 68 |        y="44.204948"
 69 |        id="text829"><tspan
 70 |          sodipodi:role="line"
 71 |          id="tspan827"
 72 |          x="53.402088"
 73 |          y="44.204948"
 74 |          style="font-size:8.81944466px;fill:#aa0000;fill-opacity:1;stroke-width:0.26458332">stress</tspan></text>
 75 |     <text
 76 |        id="text837"
 77 |        y="99.262741"
 78 |        x="55.327374"
 79 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
 80 |        xml:space="preserve"><tspan
 81 |          style="font-weight:bold;font-size:8.81944466px;fill:#000000;fill-opacity:1;stroke-width:0.26458332"
 82 |          y="99.262741"
 83 |          x="55.327374"
 84 |          id="tspan835"
 85 |          sodipodi:role="line">letters</tspan></text>
 86 |     <text
 87 |        xml:space="preserve"
 88 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#008000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
 89 |        x="26.701443"
 90 |        y="44.315655"
 91 |        id="text848"><tspan
 92 |          sodipodi:role="line"
 93 |          id="tspan846"
 94 |          x="26.701443"
 95 |          y="44.315655"
 96 |          style="font-size:8.81944466px;fill:#008000;fill-opacity:1;stroke-width:0.26458332">long</tspan></text>
 97 |     <text
 98 |        xml:space="preserve"
 99 |        style="font-style:normal;font-weight:normal;font-size:6.9378953px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
100 |        x="78.971069"
101 |        y="26.880665"
102 |        id="text852"><tspan
103 |          sodipodi:role="line"
104 |          id="tspan850"
105 |          x="78.971069"
106 |          y="26.880665"
107 |          style="font-size:5.78157997px;fill:#000000;fill-opacity:1;stroke-width:0.26458332">suprasegmentals</tspan></text>
108 |     <text
109 |        id="text858"
110 |        y="44.315655"
111 |        x="90.201439"
112 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#008000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
113 |        xml:space="preserve"><tspan
114 |          style="font-size:8.81944466px;fill:#0000ff;fill-opacity:1;stroke-width:0.26458332"
115 |          y="44.315655"
116 |          x="90.201439"
117 |          id="tspan856"
118 |          sodipodi:role="line">tie</tspan></text>
119 |     <text
120 |        id="text911"
121 |        y="67.015236"
122 |        x="-11.319339"
123 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa00d4;fill-opacity:1;stroke:none;stroke-width:0.26458332"
124 |        xml:space="preserve"><tspan
125 |          style="font-size:8.81944466px;fill:#aa00d4;fill-opacity:1;stroke-width:0.26458332"
126 |          y="67.015236"
127 |          x="-11.319339"
128 |          id="tspan909"
129 |          sodipodi:role="line">diacritics</tspan></text>
130 |     <text
131 |        xml:space="preserve"
132 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
133 |        x="106.77828"
134 |        y="66.739136"
135 |        id="text915"><tspan
136 |          sodipodi:role="line"
137 |          id="tspan913"
138 |          x="106.77828"
139 |          y="66.739136"
140 |          style="font-size:17.63888931px;fill:#d45500;stroke-width:0.26458332">‖</tspan></text>
141 |     <text
142 |        id="text919"
143 |        y="66.739136"
144 |        x="66.145836"
145 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
146 |        xml:space="preserve"><tspan
147 |          style="font-size:17.63888931px;fill:#d45500;stroke-width:0.26458332"
148 |          y="66.739136"
149 |          x="66.145836"
150 |          id="tspan917"
151 |          sodipodi:role="line">|</tspan></text>
152 |     <text
153 |        xml:space="preserve"
154 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#d45500;fill-opacity:1;stroke:none;stroke-width:0.26458332"
155 |        x="118.82735"
156 |        y="65.367554"
157 |        id="text941"><tspan
158 |          sodipodi:role="line"
159 |          id="tspan939"
160 |          x="118.82735"
161 |          y="65.367554"
162 |          style="font-size:8.81944466px;fill:#d45500;fill-opacity:1;stroke-width:0.26458332">breaks</tspan></text>
163 |     <g
164 |        id="g976">
165 |       <g
166 |          id="g937">
167 |         <path
168 |            inkscape:connector-curvature="0"
169 |            id="path923"
170 |            style="font-style:normal;font-weight:normal;font-size:17.63888931px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa00d4;fill-opacity:1;stroke:none;stroke-width:0.26458332"
171 |            d="m 49.631636,71.279705 h -1.016304 q -0.09474,-0.645955 -0.50815,-0.973239 -0.422025,-0.318672 -1.171333,-0.318672 -0.740695,0 -1.154107,0.318672 -0.413411,0.318671 -0.525377,0.973239 h -1.016303 q 0.08613,-1.231621 0.766534,-1.851739 0.680405,-0.620117 1.929253,-0.620117 1.248847,0 1.929254,0.620117 0.680406,0.620118 0.766533,1.851739 z" />
172 |         <path
173 |            inkscape:connector-curvature="0"
174 |            id="path839"
175 |            style="font-style:normal;font-weight:normal;font-size:medium;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
176 |            d="m 39.95953,53.198247 v 4.78007 h -1.171333 v -4.78007 z" />
177 |         <path
178 |            inkscape:connector-curvature="0"
179 |            id="path841"
180 |            style="font-style:normal;font-weight:normal;font-size:17.63888931px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
181 |            d="m 47.848799,61.733471 c -1.280427,0 -2.167539,0.146417 -2.661336,0.43925 -0.493797,0.292833 -0.740696,0.792372 -0.740696,1.498616 0,0.562699 0.183738,1.010562 0.551215,1.343588 0.373219,0.327284 0.878499,0.490926 1.515842,0.490926 0.878499,0 1.581873,-0.310059 2.110121,-0.930176 0.53399,-0.625859 0.800985,-1.455552 0.800985,-2.489081 v -0.353123 z m 3.160875,-0.654568 v 5.50354 H 49.42493 v -1.464165 c -0.361735,0.585666 -0.812468,1.019174 -1.3522,1.300523 -0.539731,0.275608 -1.200041,0.413412 -1.98093,0.413412 -0.987594,0 -1.774224,-0.275608 -2.35989,-0.826823 -0.579925,-0.556957 -0.869887,-1.300524 -0.869887,-2.2307 0,-1.085205 0.361735,-1.903415 1.085205,-2.45463 0.729212,-0.551215 1.814417,-0.826823 3.255615,-0.826823 h 2.222087 v -0.155029 c 0,-0.729212 -0.241157,-1.291911 -0.72347,-1.688097 -0.476571,-0.401928 -1.148365,-0.602892 -2.015381,-0.602892 -0.551215,0 -1.088076,0.06603 -1.610582,0.198093 -0.522506,0.132062 -1.024916,0.330155 -1.507229,0.594279 v -1.464166 c 0.579924,-0.223931 1.142623,-0.390444 1.688096,-0.499539 0.545474,-0.114836 1.076593,-0.172254 1.593357,-0.172254 1.395264,0 2.437405,0.361735 3.126424,1.085205 0.689019,0.72347 1.033529,1.820159 1.033529,3.290066 z"
182 |            sodipodi:nodetypes="ssscscscssccccscscscscsscccsss" />
183 |         <path
184 |            inkscape:connector-curvature="0"
185 |            id="path843"
186 |            style="font-style:normal;font-weight:normal;font-size:17.63888931px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#008000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
187 |            d="M 55.109338,60.303757 53.57627,57.461553 h 3.074748 z m 0,3.436482 1.54168,2.842204 H 53.57627 Z" />
188 |         <path
189 |            d="M 46.884172,54.782991 46.393246,54.30929 c -0.12632,-0.114837 -0.238286,-0.198093 -0.335897,-0.249769 -0.09187,-0.05742 -0.175126,-0.08613 -0.249769,-0.08613 -0.218189,0 -0.378961,0.106224 -0.482314,0.318672 -0.103353,0.206705 -0.160771,0.545473 -0.172254,1.016303 h -1.076593 c 0.01149,-0.775147 0.163645,-1.372297 0.456475,-1.79145 0.292833,-0.424895 0.700503,-0.637342 1.223009,-0.637342 0.218189,0 0.419153,0.04019 0.602892,0.120578 0.183739,0.08038 0.381832,0.215318 0.594279,0.404799 l 0.490926,0.4737 c 0.12632,0.114837 0.235415,0.200964 0.327284,0.258382 0.09761,0.05168 0.183739,0.07752 0.258382,0.07752 0.21819,0 0.378961,-0.103353 0.482314,-0.310058 0.103353,-0.212448 0.160771,-0.554087 0.172255,-1.024916 h 1.076592 c -0.01149,0.775146 -0.163645,1.375167 -0.456475,1.800062 -0.292833,0.419153 -0.700503,0.62873 -1.223009,0.62873 -0.218189,0 -0.419153,-0.04019 -0.602892,-0.120578 -0.183739,-0.08039 -0.381832,-0.215319 -0.594279,-0.404799 z"
190 |            style="font-style:normal;font-weight:normal;font-size:17.63888931px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa00d4;fill-opacity:1;stroke:none;stroke-width:0.26458332"
191 |            id="path895"
192 |            inkscape:connector-curvature="0"
193 |            sodipodi:nodetypes="cccsccccsscccsccccsccc" />
194 |       </g>
195 |       <rect
196 |          y="49.511932"
197 |          x="34.679428"
198 |          height="25.135416"
199 |          width="26.080357"
200 |          id="rect943"
201 |          style="opacity:1;fill:none;fill-opacity:1;stroke:#808080;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000002, 3.00000004;stroke-dashoffset:0;stroke-opacity:1" />
202 |     </g>
203 |     <text
204 |        xml:space="preserve"
205 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#808080;fill-opacity:1;stroke:none;stroke-width:0.26458332"
206 |        x="36.447384"
207 |        y="83.23674"
208 |        id="text947"><tspan
209 |          sodipodi:role="line"
210 |          id="tspan945"
211 |          x="36.447384"
212 |          y="83.23674"
213 |          style="font-weight:normal;font-size:7.0555563px;fill:#808080;fill-opacity:1;stroke-width:0.26458332">phone</tspan></text>
214 |     <g
215 |        id="g967"
216 |        transform="translate(0,0.27532578)">
217 |       <g
218 |          transform="translate(-1.4637451e-6,0.32787461)"
219 |          id="g907">
220 |         <path
221 |            inkscape:connector-curvature="0"
222 |            id="path879"
223 |            style="font-style:normal;font-weight:normal;font-size:17.63888931px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#aa0000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
224 |            d="m 82.748559,65.662231 v 4.78007 h -1.171333 v -4.78007 z" />
225 |         <path
226 |            inkscape:connector-curvature="0"
227 |            id="path881"
228 |            style="font-style:normal;font-weight:normal;font-size:medium;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
229 |            d="m 87.821462,55.447523 v 2.738851 h 3.264228 v 1.231622 h -3.264228 v 5.236545 q 0,1.179945 0.318672,1.515842 0.327284,0.335897 1.317749,0.335897 h 1.627807 v 1.326362 h -1.627807 q -1.834514,0 -2.532145,-0.680407 -0.697632,-0.689019 -0.697632,-2.497694 v -5.236545 h -1.16272 v -1.231622 h 1.16272 v -2.738851 z" />
230 |         <path
231 |            inkscape:connector-curvature="0"
232 |            id="path883"
233 |            style="font-style:normal;font-weight:normal;font-size:medium;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
234 |            d="m 91.507714,51.451212 q 1.929254,0 3.884345,0.508152 1.955092,0.508151 3.979086,1.524455 v 1.085205 q -1.972317,-0.895725 -3.927409,-1.343588 -1.946479,-0.447862 -3.936022,-0.447862 -1.989543,0 -3.927409,0.447862 -1.937866,0.43925 -3.918796,1.343588 v -1.085205 q 2.032606,-1.024916 3.979085,-1.524455 1.946479,-0.508152 3.86712,-0.508152 z" />
235 |         <path
236 |            inkscape:connector-curvature="0"
237 |            id="path885"
238 |            style="font-style:normal;font-weight:normal;font-size:medium;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
239 |            d="M 93.152747,68.004896 V 57.51458 q 0,-1.610582 0.749309,-2.342665 0.749308,-0.740695 2.377116,-0.740695 h 1.498616 v 1.317749 h -1.515842 q -0.852661,0 -1.188558,0.344509 -0.327284,0.34451 -0.327284,1.240235 v 10.671183 q 0,1.843127 -0.706245,2.669949 -0.697631,0.826823 -2.256537,0.826823 H 91.18043 v -1.343587 h 0.422024 q 0.904338,0 1.223009,-0.422024 0.327284,-0.413412 0.327284,-1.731161 z" />
240 |       </g>
241 |       <rect
242 |          style="opacity:1;fill:none;fill-opacity:1;stroke:#808080;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000002, 3.00000004;stroke-dashoffset:0;stroke-opacity:1"
243 |          id="rect955"
244 |          width="26.080357"
245 |          height="25.135416"
246 |          x="77.434006"
247 |          y="49.236607" />
248 |     </g>
249 |     <text
250 |        id="text959"
251 |        y="83.23674"
252 |        x="79.29657"
253 |        style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#808080;fill-opacity:1;stroke:none;stroke-width:0.26458332"
254 |        xml:space="preserve"><tspan
255 |          style="font-weight:normal;font-size:7.0555563px;fill:#808080;fill-opacity:1;stroke-width:0.26458332"
256 |          y="83.23674"
257 |          x="79.29657"
258 |          id="tspan957"
259 |          sodipodi:role="line">phone</tspan></text>
260 |     <g
261 |        id="g1003">
262 |       <path
263 |          sodipodi:nodetypes="cc"
264 |          inkscape:connector-curvature="0"
265 |          id="path951"
266 |          d="m 46.302083,42.433036 c 3.481566,4.173142 6.344667,8.552438 8.315475,13.229166"
267 |          style="fill:none;stroke:#008000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
268 |       <circle
269 |          r="0.51971728"
270 |          cy="42.404682"
271 |          cx="46.160339"
272 |          id="path978"
273 |          style="opacity:1;fill:#008000;fill-opacity:1;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000002, 3.00000004;stroke-dashoffset:0;stroke-opacity:1" />
274 |     </g>
275 |     <g
276 |        id="g997">
277 |       <path
278 |          sodipodi:nodetypes="ccc"
279 |          inkscape:connector-curvature="0"
280 |          id="path949"
281 |          d="M 43.089284,69.269345 C 37.989022,69.232713 33.159055,68.621703 30.049107,64.355654 36.72403,64.057372 42.130625,62.33222 42.805804,55.284226"
282 |          style="fill:none;stroke:#aa00d4;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
283 |       <circle
284 |          r="0.51971728"
285 |          cy="64.354729"
286 |          cx="30.122358"
287 |          id="path978-3"
288 |          style="opacity:1;fill:#aa00d4;fill-opacity:1;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000004, 3.00000007;stroke-dashoffset:0;stroke-opacity:1" />
289 |     </g>
290 |     <g
291 |        id="g1009">
292 |       <path
293 |          sodipodi:nodetypes="ccc"
294 |          inkscape:connector-curvature="0"
295 |          id="path953"
296 |          d="M 82.115326,65.017112 C 82.452169,59.139505 83.70347,50.989522 77.107142,46.30729 50.520406,45.820661 40.654623,48.093801 39.970982,51.88244"
297 |          style="fill:none;stroke:#aa0000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
298 |       <circle
299 |          style="opacity:1;fill:#aa0000;fill-opacity:1;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000002, 3.00000004;stroke-dashoffset:0;stroke-opacity:1"
300 |          id="circle999"
301 |          cx="77.154388"
302 |          cy="46.278938"
303 |          r="0.51971728" />
304 |     </g>
305 |     <g
306 |        id="g1017">
307 |       <circle
308 |          r="0.51971728"
309 |          cy="46.089951"
310 |          cx="94.872025"
311 |          id="circle1005"
312 |          style="opacity:1;fill:#0000ff;fill-opacity:1;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000002, 3.00000004;stroke-dashoffset:0;stroke-opacity:1" />
313 |       <path
314 |          sodipodi:nodetypes="cc"
315 |          inkscape:connector-curvature="0"
316 |          id="path1013"
317 |          d="m 93.407366,51.504463 1.464657,-5.433406"
318 |          style="fill:none;stroke:#0000ff;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
319 |     </g>
320 |     <g
321 |        id="g1023"
322 |        transform="rotate(-75,66.391788,52.665449)">
323 |       <path
324 |          style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
325 |          d="M 54.377221,76.712287 C 48.569466,75.495643 57.545425,60.326501 30.049107,64.355654 36.72403,64.057372 45.188892,49.28433 48.14113,42.702302"
326 |          id="path1019"
327 |          inkscape:connector-curvature="0"
328 |          sodipodi:nodetypes="ccc" />
329 |       <circle
330 |          style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000004, 3.00000007;stroke-dashoffset:0;stroke-opacity:1"
331 |          id="circle1021"
332 |          cx="30.122358"
333 |          cy="64.354729"
334 |          r="0.51971728" />
335 |     </g>
336 |     <g
337 |        transform="matrix(0.25881905,0.96592583,0.96592583,-0.25881905,54.063108,54.891173)"
338 |        id="g1029">
339 |       <path
340 |          sodipodi:nodetypes="ccc"
341 |          inkscape:connector-curvature="0"
342 |          id="path1025"
343 |          d="M 21.497327,9.8626888 C 44.147946,5.3250536 58.172014,60.158607 30.675696,64.18776 31.293179,60.06128 29.848968,58.067045 28.480353,55.994627"
344 |          style="fill:none;stroke:#d45500;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
345 |       <circle
346 |          r="0.51971728"
347 |          cy="64.207985"
348 |          cx="30.670004"
349 |          id="circle1027"
350 |          style="opacity:1;fill:#d45500;fill-opacity:1;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000004, 3.00000007;stroke-dashoffset:0;stroke-opacity:1" />
351 |     </g>
352 |     <path
353 |        style="fill:none;stroke:#000000;stroke-width:0.26499999;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:0.26499999, 0.52999997;stroke-dashoffset:0;stroke-opacity:1"
354 |        d="M 40.254465,36.385415 103.8962,30.007069 75.078064,38.044386"
355 |        id="path1031"
356 |        inkscape:connector-curvature="0"
357 |        sodipodi:nodetypes="ccc" />
358 |     <path
359 |        style="fill:none;stroke:#000000;stroke-width:0.26499999;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:0.26499999, 0.52999997;stroke-dashoffset:0;stroke-opacity:1"
360 |        d="m 97.152472,37.548105 6.914608,-7.525469 20.07962,24.831172"
361 |        id="path1033"
362 |        inkscape:connector-curvature="0"
363 |        sodipodi:nodetypes="ccc" />
364 |     <circle
365 |        r="0.51971728"
366 |        cy="29.880222"
367 |        cx="103.89566"
368 |        id="circle1005-6"
369 |        style="opacity:1;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.75;stroke-linecap:round;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:1.50000004, 3.00000007;stroke-dashoffset:0;stroke-opacity:1" />
370 |   </g>
371 | </svg>
372 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | 
2 | [mypy]
3 | 
4 | [mypy-setuptools.*]
5 | ignore_missing_imports = True
6 | 
7 | [mypy-sklearn.*]
8 | ignore_missing_imports = True
9 | 


--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
 1 | [MESSAGES CONTROL]
 2 | disable=
 3 |   format,
 4 |   abstract-class-little-used,
 5 |   abstract-method,
 6 |   cyclic-import,
 7 |   duplicate-code,
 8 |   global-statement,
 9 |   import-outside-toplevel,
10 |   inconsistent-return-statements,
11 |   locally-disabled,
12 |   not-context-manager,
13 |   redefined-variable-type,
14 |   too-few-public-methods,
15 |   too-many-arguments,
16 |   too-many-branches,
17 |   too-many-instance-attributes,
18 |   too-many-lines,
19 |   too-many-locals,
20 |   too-many-public-methods,
21 |   too-many-return-statements,
22 |   too-many-statements,
23 |   too-many-boolean-expressions,
24 |   unnecessary-pass,
25 |   unused-argument,
26 |   broad-except,
27 |   too-many-nested-blocks,
28 |   invalid-name,
29 |   unused-import
30 | 
31 | [FORMAT]
32 | expected-line-ending-format=LF


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | black==19.10b0
2 | coverage==5.0.4
3 | flake8==3.7.9
4 | mypy==0.910
5 | pylint==2.10.2
6 | pytest==5.4.1
7 | pytest-cov==2.8.1
8 | 


--------------------------------------------------------------------------------
/requirements_test.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | 


--------------------------------------------------------------------------------
/scripts/check-code.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Directory of *this* script
 5 | this_dir="$( cd "$( dirname "$0" )" && pwd )"
 6 | src_dir="$(realpath "${this_dir}/..")"
 7 | 
 8 | venv="${src_dir}/.venv"
 9 | if [[ -d "${venv}" ]]; then
10 |     source "${venv}/bin/activate"
11 | fi
12 | 
13 | python_files=("${src_dir}/gruut_ipa/"*.py "${src_dir}/setup.py")
14 | python_files+=("${src_dir}/tests/"*.py)
15 | 
16 | # -----------------------------------------------------------------------------
17 | 
18 | flake8 "${python_files[@]}"
19 | pylint "${python_files[@]}"
20 | mypy "${python_files[@]}"
21 | black --check "${python_files[@]}"
22 | isort --check-only "${python_files[@]}"
23 | 
24 | # -----------------------------------------------------------------------------
25 | 
26 | echo "OK"
27 | 


--------------------------------------------------------------------------------
/scripts/create-venv.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | if [[ -z "${PIP_INSTALL}" ]]; then
 5 |     PIP_INSTALL='install'
 6 | fi
 7 | 
 8 | # Directory of *this* script
 9 | this_dir="$( cd "$( dirname "$0" )" && pwd )"
10 | src_dir="$(realpath "${this_dir}/..")"
11 | 
12 | # -----------------------------------------------------------------------------
13 | 
14 | venv="${src_dir}/.venv"
15 | 
16 | # -----------------------------------------------------------------------------
17 | 
18 | : "${PYTHON=python3}"
19 | 
20 | python_version="$(${PYTHON} --version)"
21 | 
22 | # Create virtual environment
23 | echo "Creating virtual environment at ${venv} (${python_version})"
24 | rm -rf "${venv}"
25 | "${PYTHON}" -m venv "${venv}"
26 | source "${venv}/bin/activate"
27 | 
28 | # Install Python dependencies
29 | echo "Installing Python dependencies"
30 | pip3 ${PIP_INSTALL} --upgrade pip
31 | pip3 ${PIP_INSTALL} --upgrade wheel setuptools
32 | 
33 | pip3 ${PIP_INSTALL} "${src_dir}"
34 | 
35 | # Development dependencies
36 | if [[ -f requirements_dev.txt ]]; then
37 |     pip3 ${PIP_INSTALL} -r requirements_dev.txt || echo "Failed to install development dependencies" >&2
38 | fi
39 | 
40 | # -----------------------------------------------------------------------------
41 | 
42 | echo "OK"
43 | 


--------------------------------------------------------------------------------
/scripts/format-code.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Directory of *this* script
 5 | this_dir="$( cd "$( dirname "$0" )" && pwd )"
 6 | src_dir="$(realpath "${this_dir}/..")"
 7 | 
 8 | venv="${src_dir}/.venv"
 9 | if [[ -d "${venv}" ]]; then
10 |     source "${venv}/bin/activate"
11 | fi
12 | 
13 | python_files=("${src_dir}/gruut_ipa/"*.py "${src_dir}/setup.py")
14 | python_files+=("${src_dir}/tests/"*.py)
15 | 
16 | # -----------------------------------------------------------------------------
17 | 
18 | black "${python_files[@]}"
19 | isort "${python_files[@]}"
20 | 
21 | # -----------------------------------------------------------------------------
22 | 
23 | echo "OK"
24 | 


--------------------------------------------------------------------------------
/scripts/run-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # Directory of *this* script
 5 | this_dir="$( cd "$( dirname "$0" )" && pwd )"
 6 | src_dir="$(realpath "${this_dir}/..")"
 7 | 
 8 | venv="${src_dir}/.venv"
 9 | if [[ -d "${venv}" ]]; then
10 |     source "${venv}/bin/activate"
11 | fi
12 | 
13 | python_files=("${src_dir}/tests/"*.py)
14 | 
15 | # -----------------------------------------------------------------------------
16 | 
17 | python3 -m unittest "${python_files[@]}"
18 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | # To work with Black
 3 | max-line-length = 88
 4 | # E501: line too long
 5 | # W503: Line break occurred before a binary operator
 6 | # E203: Whitespace before ':'
 7 | # D202 No blank lines allowed after function docstring
 8 | # W504 line break after binary operator
 9 | ignore =
10 |     E501,
11 |     W503,
12 |     E203,
13 |     D202,
14 |     W504
15 | 
16 | # F401 import unused
17 | per-file-ignores =
18 |     dodo.py:F401
19 | 
20 | [isort]
21 | multi_line_output = 3
22 | include_trailing_comma=True
23 | force_grid_wrap=0
24 | use_parentheses=True
25 | line_length=88
26 | indent = "    "


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | """Setup file for gruut-ipa"""
 2 | from pathlib import Path
 3 | 
 4 | import setuptools
 5 | 
 6 | this_dir = Path(__file__).parent
 7 | 
 8 | # -----------------------------------------------------------------------------
 9 | 
10 | # Load README in as long description
11 | long_description: str = ""
12 | readme_path = this_dir / "README.md"
13 | if readme_path.is_file():
14 |     long_description = readme_path.read_text(encoding="UTF-8")
15 | 
16 | requirements = []
17 | requirements_path = this_dir / "requirements.txt"
18 | if requirements_path.is_file():
19 |     with open(requirements_path, "r", encoding="utf-8") as requirements_file:
20 |         requirements = requirements_file.read().splitlines()
21 | 
22 | version_path = this_dir / "VERSION"
23 | with open(version_path, "r", encoding="utf-8") as version_file:
24 |     version = version_file.read().strip()
25 | 
26 | # -----------------------------------------------------------------------------
27 | 
28 | module_dir = this_dir / "gruut_ipa"
29 | data_dir = module_dir / "data"
30 | data_files = [str(f.relative_to(module_dir)) for f in data_dir.rglob("*")]
31 | 
32 | setuptools.setup(
33 |     name="gruut-ipa",
34 |     description="Library for manipulating pronunciations using the International Phonetic Alphabet (IPA)",
35 |     version=version,
36 |     author="Michael Hansen",
37 |     author_email="mike@rhasspy.org",
38 |     url="https://github.com/rhasspy/gruut-ipa",
39 |     packages=setuptools.find_packages(),
40 |     package_data={"gruut_ipa": data_files + ["py.typed"]},
41 |     install_requires=requirements,
42 |     extras_require={':python_version<"3.7"': "dataclasses"},
43 |     entry_points={"console_scripts": ["gruut-ipa = gruut_ipa.__main__:main"]},
44 |     classifiers=[
45 |         "Programming Language :: Python :: 3",
46 |         "Programming Language :: Python :: 3.6",
47 |         "Programming Language :: Python :: 3.7",
48 |         "Programming Language :: Python :: 3.8",
49 |         "Programming Language :: Python :: 3.9",
50 |         "License :: OSI Approved :: MIT License",
51 |     ],
52 |     long_description=long_description,
53 |     long_description_content_type="text/markdown",
54 |     python_requires=">=3.6",
55 | )
56 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rhasspy/gruut-ipa/ec9ae6ce7cca0103d9a563fcde3352f805a8e27e/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_accent.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Tests for phoneme conversion between languages"""
 3 | import unittest
 4 | 
 5 | from gruut_ipa import Phonemes
 6 | from gruut_ipa.accent import guess_phonemes
 7 | 
 8 | 
 9 | class AccentTestCase(unittest.TestCase):
10 |     """Test cases for phoneme conversion between languages"""
11 | 
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         cls.de_phonemes = Phonemes.from_language("de-de")
15 | 
16 |     def test_exact(self):
17 |         """Test exact match"""
18 |         guessed = guess_phonemes("k", self.de_phonemes)
19 | 
20 |         self.assertEqual(len(guessed.phonemes), 1)
21 |         self.assertEqual(guessed.phonemes[0].text, "k")
22 | 
23 |     def test_letters(self):
24 |         """Test matching letters"""
25 |         guessed = guess_phonemes("ɐ̯ː", self.de_phonemes)
26 | 
27 |         self.assertEqual(len(guessed.phonemes), 1)
28 |         self.assertEqual(guessed.phonemes[0].text, "ɐ")
29 | 
30 |     def test_close_vowel(self):
31 |         """Test nearby vowel"""
32 |         guessed = guess_phonemes("ɑ", self.de_phonemes)
33 | 
34 |         self.assertEqual(len(guessed.phonemes), 1)
35 | 
36 |         # Placement is more important that height
37 |         self.assertEqual(guessed.phonemes[0].text, "ɐ")
38 | 
39 |     def test_close_consonant(self):
40 |         """Test nearby consonant"""
41 |         guessed = guess_phonemes("ð", self.de_phonemes)
42 | 
43 |         self.assertEqual(len(guessed.phonemes), 1)
44 | 
45 |         # Should match a nearby voiced consonant
46 |         self.assertIn(guessed.phonemes[0].text, {"v", "z"})
47 | 
48 |     def test_dipthong_letters_match(self):
49 |         """Test dipthong (two vowels) with matching letters"""
50 |         guessed = guess_phonemes("aʊ", self.de_phonemes)
51 | 
52 |         self.assertEqual(len(guessed.phonemes), 1)
53 |         self.assertEqual(guessed.phonemes[0].text, "aʊ̯")
54 | 
55 |     def test_dipthong_split(self):
56 |         """Test dipthong (two vowels) split into two phonemes"""
57 |         guessed = guess_phonemes("oʊ", self.de_phonemes)
58 | 
59 |         self.assertEqual(len(guessed.phonemes), 2)
60 |         self.assertEqual(guessed.phonemes[0].text, "oː")
61 |         self.assertEqual(guessed.phonemes[1].text, "ʊ")
62 | 
63 |     def test_g(self):
64 |         """Test ɡ/g mapping"""
65 |         from gruut_ipa.accent import GS
66 | 
67 |         for g in GS:
68 |             guessed = guess_phonemes(g, self.de_phonemes)
69 | 
70 |             self.assertEqual(len(guessed.phonemes), 1)
71 |             self.assertIn(guessed.phonemes[0].text, GS)
72 | 
73 |     def test_r(self):
74 |         """Test r-like mapping"""
75 |         from gruut_ipa.accent import R_LIKE
76 | 
77 |         for r in R_LIKE:
78 |             guessed = guess_phonemes(r, self.de_phonemes)
79 | 
80 |             self.assertEqual(len(guessed.phonemes), 1)
81 |             self.assertIn(guessed.phonemes[0].text, R_LIKE)
82 | 
83 |     def test_schwa(self):
84 |         """Test schwa mapping"""
85 |         from gruut_ipa.accent import R_LIKE, SCHWA_PREFERRED
86 |         from gruut_ipa.constants import SCHWAS
87 | 
88 |         for s in SCHWAS:
89 |             guessed = guess_phonemes(s, self.de_phonemes)
90 | 
91 |             self.assertEqual(len(guessed.phonemes), 1)
92 |             self.assertIn(guessed.phonemes[0].text, SCHWA_PREFERRED + R_LIKE)
93 | 
94 | 
95 | # -----------------------------------------------------------------------------
96 | 
97 | if __name__ == "__main__":
98 |     unittest.main()
99 | 


--------------------------------------------------------------------------------
/tests/test_distances.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Tests for phoneme distances"""
 3 | import unittest
 4 | 
 5 | from gruut_ipa import get_closest
 6 | 
 7 | 
 8 | class DistancesTestCase(unittest.TestCase):
 9 |     """Test cases for phoneme distances"""
10 | 
11 |     def test_vowels(self):
12 |         """Test distances for vowels"""
13 |         self.assertEqual(get_closest("p")[0], "t")
14 | 
15 |     def test_consonants(self):
16 |         """Test distances for consonants"""
17 |         self.assertEqual(get_closest("ɑ")[0], "ɒ")
18 |         self.assertEqual(get_closest("ʝ")[0], "ç")
19 | 
20 |     def test_schwas(self):
21 |         """Test distances for schwas"""
22 |         self.assertEqual(get_closest("ɝ")[0], "ɚ")
23 | 
24 | 
25 | # -----------------------------------------------------------------------------
26 | 
27 | if __name__ == "__main__":
28 |     unittest.main()
29 | 


--------------------------------------------------------------------------------
/tests/test_features.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Tests for phoneme features"""
 3 | import dataclasses
 4 | import unittest
 5 | 
 6 | from gruut_ipa import (
 7 |     CONSONANTS,
 8 |     SCHWAS,
 9 |     VOWELS,
10 |     Break,
11 |     BreakType,
12 |     PhonemeLength,
13 |     Stress,
14 |     from_vector,
15 |     string_to_symbol,
16 |     to_vector,
17 | )
18 | 
19 | 
20 | class FeaturesTestCase(unittest.TestCase):
21 |     """Test cases for phoneme features"""
22 | 
23 |     def test_vowels(self):
24 |         """Test to/from feature vector for vowels"""
25 |         for vowel in VOWELS.values():
26 |             if vowel.alias_of:
27 |                 continue
28 | 
29 |             feat_vec = to_vector(vowel)
30 |             self.assertEqual(vowel, from_vector(feat_vec))
31 | 
32 |             # Test with stress
33 |             for stress in Stress:
34 |                 vowel_stressed = dataclasses.replace(vowel, stress=stress)
35 |                 feat_vec = to_vector(vowel_stressed)
36 |                 self.assertEqual(vowel_stressed, from_vector(feat_vec))
37 | 
38 |     def test_consonants(self):
39 |         """Test to/from feature vector for consonants"""
40 |         for consonant in CONSONANTS.values():
41 |             if consonant.alias_of:
42 |                 continue
43 | 
44 |             feat_vec = to_vector(consonant)
45 |             self.assertEqual(consonant, from_vector(feat_vec))
46 | 
47 |     def test_schwas(self):
48 |         """Test to/from feature vector for schwas"""
49 |         for schwa in SCHWAS.values():
50 |             if schwa.alias_of:
51 |                 continue
52 | 
53 |             feat_vec = to_vector(schwa)
54 |             self.assertEqual(schwa, from_vector(feat_vec))
55 | 
56 |     def test_breaks(self):
57 |         """Test to/from feature vector for breaks"""
58 |         for break_type in BreakType:
59 |             ipa_break = Break(break_type)
60 |             feat_vec = to_vector(ipa_break)
61 |             self.assertEqual(ipa_break, from_vector(feat_vec))
62 | 
63 |     def test_string_to_symbol(self):
64 |         """Test symbol parsing"""
65 |         self.assertEqual(
66 |             string_to_symbol("ˈãː"),
67 |             dataclasses.replace(
68 |                 VOWELS["ã"], stress=Stress.PRIMARY, length=PhonemeLength.LONG
69 |             ),
70 |         )
71 | 
72 |         self.assertEqual(
73 |             string_to_symbol("ɫː"),
74 |             dataclasses.replace(CONSONANTS["ɫ"], length=PhonemeLength.LONG),
75 |         )
76 | 
77 |         self.assertEqual(
78 |             string_to_symbol("ɚː"),
79 |             dataclasses.replace(SCHWAS["ɚ"], length=PhonemeLength.LONG),
80 |         )
81 | 
82 | 
83 | # -----------------------------------------------------------------------------
84 | 
85 | if __name__ == "__main__":
86 |     unittest.main()
87 | 


--------------------------------------------------------------------------------
/tests/test_phone.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Tests for Phone class"""
 3 | import unicodedata
 4 | import unittest
 5 | 
 6 | from gruut_ipa import IPA, Phone, Stress, VowelHeight, VowelPlacement
 7 | 
 8 | 
 9 | class PhoneTestCase(unittest.TestCase):
10 |     """Test cases for Phone class"""
11 | 
12 |     def test_from_string(self):
13 |         """Test Phone.from_string"""
14 |         # ˈãː
15 |         codepoints = [IPA.STRESS_PRIMARY, "a", IPA.NASAL, IPA.LONG]
16 |         ipa = "".join(codepoints)
17 | 
18 |         phone = Phone.from_string(ipa)
19 | 
20 |         # Important: text is NFC normalized, so combining characters are
21 |         # elimiated if possible.
22 |         self.assertEqual(phone.text, unicodedata.normalize("NFC", "ˈãː"))
23 | 
24 |         self.assertEqual(phone.letters, "a")
25 |         self.assertEqual(phone.diacritics[0], {IPA.NASAL})
26 |         self.assertEqual(phone.suprasegmentals, {IPA.STRESS_PRIMARY, IPA.LONG})
27 | 
28 |         self.assertEqual(phone.stress, Stress.PRIMARY)
29 |         self.assertTrue(phone.is_nasal)
30 |         self.assertTrue(phone.is_long)
31 | 
32 |         self.assertTrue(phone.is_vowel)
33 |         self.assertEqual(phone.vowel.height, VowelHeight.OPEN)
34 |         self.assertEqual(phone.vowel.placement, VowelPlacement.FRONT)
35 | 
36 | 
37 | # -----------------------------------------------------------------------------
38 | 
39 | if __name__ == "__main__":
40 |     unittest.main()
41 | 


--------------------------------------------------------------------------------
/tests/test_phonemes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Tests for Phonemes class"""
 3 | import unittest
 4 | 
 5 | from gruut_ipa import Phonemes
 6 | 
 7 | 
 8 | class PhonemesTestCase(unittest.TestCase):
 9 |     """Test cases for Phonemes class"""
10 | 
11 |     def test_split(self):
12 |         """Test Phonemes.from_string"""
13 |         # "Just a cow."
14 |         pron_str = "/dʒʌst ə kˈaʊ/"
15 | 
16 |         lang_phonemes = Phonemes.from_language("en-us")
17 |         pron_phonemes = lang_phonemes.split(pron_str, keep_stress=True)
18 | 
19 |         # Ensure "d ʒ" -> "d͡ʒ" and "a ʊ" -> "aʊ"
20 |         phoneme_strs = [p.text for p in pron_phonemes]
21 |         self.assertEqual(phoneme_strs, ["d͡ʒ", "ʌ", "s", "t", "ə", "k", "ˈaʊ"])
22 | 
23 |     def test_split_substring(self):
24 |         """Test Phonemes.split with a substring replacement"""
25 |         pron_str = "/viːtɛt͡ʃnaː/"
26 | 
27 |         lang_phonemes = Phonemes.from_language("cs-cz")
28 |         pron_phonemes = lang_phonemes.split(pron_str, keep_stress=False)
29 | 
30 |         # Ensure iː doesn't get transformed into ɪː
31 |         phoneme_strs = [p.text for p in pron_phonemes]
32 |         self.assertEqual(phoneme_strs, ["v", "iː", "t", "ɛ", "t͡ʃ", "n", "aː"])
33 | 
34 |     def test_split_diacritics(self):
35 |         """Test Phonemes.split with a diacritic substring replacement"""
36 |         pron_str = "/ɑɑ̃/"
37 | 
38 |         lang_phonemes = Phonemes.from_language("fr-fr")
39 |         pron_phonemes = lang_phonemes.split(pron_str, keep_stress=False)
40 | 
41 |         # Ensure first ɑ is transformed into a, but not the second
42 |         phoneme_strs = [p.text for p in pron_phonemes]
43 |         self.assertEqual(phoneme_strs, ["a", "ɑ̃"])
44 | 
45 |     def test_dipthong(self):
46 |         """Test Phonemes.from_string with a dipthong"""
47 |         # ampliam
48 |         pron_str = "/ɐ̃pliɐ̃w̃/"
49 | 
50 |         lang_phonemes = Phonemes.from_language("pt")
51 |         pron_phonemes = lang_phonemes.split(pron_str)
52 | 
53 |         # Ensure "ɐ̃" and "ɐ̃w̃" are kept
54 |         phoneme_strs = [p.text for p in pron_phonemes]
55 |         self.assertEqual(phoneme_strs, ["ɐ̃", "p", "l", "i", "ɐ̃w̃"])
56 | 
57 |     def test_split_dipthong(self):
58 |         """Test Phonemes.split with a dipthong"""
59 |         pron_str = "/neu̯rt͡ʃɪtou̯/"
60 | 
61 |         lang_phonemes = Phonemes.from_language("cs-cz")
62 |         pron_phonemes = lang_phonemes.split(pron_str, keep_stress=False)
63 | 
64 |         # Ensure eu̯ ends up as eu̯
65 |         phoneme_strs = [p.text for p in pron_phonemes]
66 |         self.assertEqual(phoneme_strs, ["n", "eu̯", "r", "t͡ʃ", "ɪ", "t", "ou̯"])
67 | 
68 |     def test_tones(self):
69 |         """Test Phonemes.split with tones"""
70 |         # á khôi
71 |         pron_str = "/a˨˦xoj˧˧/"
72 | 
73 |         lang_phonemes = Phonemes.from_language("vi-n")
74 |         pron_phonemes = lang_phonemes.split(pron_str)
75 | 
76 |         # Ensure tones are kept
77 |         phoneme_strs = [p.text for p in pron_phonemes]
78 |         self.assertEqual(phoneme_strs, ["a˨˦", "x", "oj˧˧"])
79 | 
80 | 
81 | # -----------------------------------------------------------------------------
82 | 
83 | if __name__ == "__main__":
84 |     unittest.main()
85 | 


--------------------------------------------------------------------------------
/tests/test_pronunciation.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Tests for Pronunciation class"""
 3 | import unittest
 4 | 
 5 | from gruut_ipa import IPA, Pronunciation
 6 | 
 7 | 
 8 | class PronunciationTestCase(unittest.TestCase):
 9 |     """Test cases for Pronunciation class"""
10 | 
11 |     def test_from_string(self):
12 |         """Test Pronuncation.from_string"""
13 |         # "Yes, choose IPA."
14 |         pron_str = "↗ˈjɛs|ˈt͡ʃuːz#↘aɪpiːeɪ‖"
15 | 
16 |         pron = Pronunciation.from_string(pron_str, keep_stress=False)
17 | 
18 |         phone_strs = [p.text for p in pron.phones]
19 |         self.assertEqual(
20 |             phone_strs, ["j", "ɛ", "s", "t͡ʃ", "uː", "z", "a", "ɪ", "p", "iː", "e", "ɪ"]
21 |         )
22 | 
23 |         phone_strs = [p.text for p in pron]
24 |         self.assertEqual(
25 |             phone_strs,
26 |             [
27 |                 IPA.INTONATION_RISING,
28 |                 "j",
29 |                 "ɛ",
30 |                 "s",
31 |                 IPA.BREAK_MINOR,
32 |                 "t͡ʃ",
33 |                 "uː",
34 |                 "z",
35 |                 IPA.BREAK_WORD,
36 |                 IPA.INTONATION_FALLING,
37 |                 "a",
38 |                 "ɪ",
39 |                 "p",
40 |                 "iː",
41 |                 "e",
42 |                 "ɪ",
43 |                 IPA.BREAK_MAJOR,
44 |             ],
45 |         )
46 | 
47 |     def test_diacritics(self):
48 |         """Test Pronuncation.from_string with extra diacritics"""
49 |         pron_str = "ɔʊ̯"
50 |         pron = Pronunciation.from_string(pron_str)
51 | 
52 |         self.assertEqual(pron.text, pron_str)
53 | 
54 |     def test_tones(self):
55 |         """Test Pronuncation.from_string with tone numbers"""
56 |         pron_str = "/hwiən˧˨ ziəw˨ˀ˩ʔ/"
57 |         pron = Pronunciation.from_string(pron_str)
58 | 
59 |         phone_strs = [p.text for p in pron]
60 |         self.assertEqual(
61 |             phone_strs, ["h", "w", "i", "ə", "n˧˨", "z", "i", "ə", "w˨ˀ˩ʔ"]
62 |         )
63 | 
64 |     def test_accents(self):
65 |         """Test Pronuncation.from_string with accents"""
66 |         pron_str = "/²'alːdɑːglɪg/"
67 |         pron = Pronunciation.from_string(pron_str)
68 | 
69 |         phone_strs = [p.text for p in pron]
70 |         self.assertEqual(phone_strs, ["²'a", "lː", "d", "ɑː", "g", "l", "ɪ", "g"])
71 | 
72 | 
73 | # -----------------------------------------------------------------------------
74 | 
75 | if __name__ == "__main__":
76 |     unittest.main()
77 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py36,py37,py38,py39
3 | 
4 | [testenv]
5 | deps = -r{toxinidir}/requirements_test.txt
6 | commands =
7 |     pytest
8 | 


--------------------------------------------------------------------------------