├── .gitignore ├── .projectile ├── .python_version ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── VERSION ├── bin ├── distance_matrix.py ├── gruut-ipa └── speak-ipa ├── default.nix ├── derivation.nix ├── gruut_ipa ├── __init__.py ├── __main__.py ├── accent.py ├── constants.py ├── data │ ├── ar │ │ └── phonemes.txt │ ├── cs-cz │ │ └── phonemes.txt │ ├── de-de │ │ └── phonemes.txt │ ├── el-gr │ │ └── phonemes.txt │ ├── en-gb │ │ └── phonemes.txt │ ├── en-us │ │ ├── cmudict │ │ │ ├── ipa_map.txt │ │ │ └── phonemes.txt │ │ ├── phonemes.txt │ │ └── zamia │ │ │ ├── ipa_map.txt │ │ │ └── phonemes.txt │ ├── es-es │ │ └── phonemes.txt │ ├── fa │ │ └── phonemes.txt │ ├── fr-fr │ │ └── phonemes.txt │ ├── it-it │ │ └── phonemes.txt │ ├── lb-lb │ │ └── phonemes.txt │ ├── nl │ │ ├── cgn │ │ │ ├── ipa_map.txt │ │ │ └── phonemes.txt │ │ └── phonemes.txt │ ├── phoneme_distances.json.gz │ ├── pt │ │ └── phonemes.txt │ ├── ru-ru │ │ └── phonemes.txt │ ├── sv-se │ │ └── phonemes.txt │ ├── sw │ │ ├── alffa │ │ │ ├── ipa_map.txt │ │ │ └── phonemes.txt │ │ └── phonemes.txt │ └── vi-n │ │ └── phonemes.txt ├── distances.py ├── espeak.py ├── features.py ├── kirshenbaum.py ├── phonemes.py ├── py.typed ├── sampa.py └── utils.py ├── img ├── ipa.png └── ipa.svg ├── mypy.ini ├── pylintrc ├── requirements_dev.txt ├── requirements_test.txt ├── scripts ├── check-code.sh ├── create-venv.sh ├── format-code.sh └── run-tests.sh ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_accent.py ├── test_distances.py ├── test_features.py ├── test_phone.py ├── test_phonemes.py └── test_pronunciation.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .mypy_cache/ 3 | *.egg-info/ 4 | /.venv/ 5 | /dist/ 6 | /.tox/ 7 | -------------------------------------------------------------------------------- /.projectile: -------------------------------------------------------------------------------- 1 | -/.venv/ 2 | 
-/__pycache__/ 3 | -/gruut_ipa/__pycache__/ 4 | -/tests/__pycache__/ -------------------------------------------------------------------------------- /.python_version: -------------------------------------------------------------------------------- 1 | 3.7 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Michael Hansen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements_dev.txt 2 | include LICENSE 3 | include README.md 4 | include VERSION -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := bash 2 | 3 | .PHONY: check clean reformat dist test install 4 | 5 | all: dist 6 | 7 | install: 8 | scripts/create-venv.sh 9 | 10 | check: 11 | scripts/check-code.sh 12 | 13 | reformat: 14 | scripts/format-code.sh 15 | 16 | test: 17 | scripts/run-tests.sh 18 | 19 | dist: 20 | python3 setup.py sdist 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gruut IPA 2 | 3 | Library for manipulating [International Phonetic Alphabet](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet) (IPA) pronunciations. 4 | 5 | Features include: 6 | 7 | * Getting the category and details of a phone, e.g. "open front unrounded vowel" for ɶ 8 | * Splitting IPA pronunciations into groups of: 9 | * Phones (`/ˈt͡ʃuːz/` to `ˈt͡ʃ uː z` ) 10 | * Phonemes (`/kˈaʊ/` to `k ˈaʊ` for U.S. English) 11 | * Converting pronunciations between: 12 | * IPA 13 | * [espeak](https://github.com/espeak-ng/) 14 | * [sampa](https://www.phon.ucl.ac.uk/home/sampa/) 15 | 16 | Supported Languages: 17 | 18 | * Arabic (`ar`) 19 | * Czech (`cs-cz`) 20 | * German (`de-de`) 21 | * U.S. English (`en-us`) 22 | * U.K. 
English (`en-gb`) 23 | * Spanish (`es-es`) 24 | * Persian/Farsi (`fa`) 25 | * Spanish (`es-es`) 26 | * Italian (`it-it`) 27 | * Luxembourgish (`lb-lb`) 28 | * Dutch (`nl`) 29 | * Portuguese (`pt`) 30 | * Russian (`ru-ru`) 31 | * Swahili (`sw`) 32 | 33 | ## Installing 34 | 35 | ```sh 36 | $ pip install gruut-ipa 37 | ``` 38 | 39 | ## Dependencies 40 | 41 | * Python 3.6 or higher 42 | 43 | For command-line usage, you may also want: 44 | 45 | * [espeak](https://github.com/espeak-ng/) 46 | * [jq](https://stedolan.github.io/jq/) 47 | 48 | Install these with: 49 | 50 | ```sh 51 | $ sudo apt-get install espeak jq 52 | ``` 53 | 54 | ## Phones and Phonemes 55 | 56 | ![IPA phones](img/ipa.png) 57 | 58 | Phones in IPA are composed of different components: 59 | 60 | * Letters 61 | * [Non-combining](https://en.wikipedia.org/wiki/Character_(computing)#Terminology) Unicode characters that represent a distinct human sound (phone) 62 | * Suprasegmentals 63 | * [Non-combining](https://en.wikipedia.org/wiki/Character_(computing)#Terminology) Unicode characters that represent language features above individual vowels or consonants 64 | * Stress (ˈˌ), elongation (ː), linking/ties (t͡s), and short/long breaks (| ‖) are suprasegmentals 65 | * Diacritics 66 | * [Combining characters](https://en.wikipedia.org/wiki/Combining_character) that provide additional information about a phone's pronunciation, such as [nasalation](https://en.wikipedia.org/wiki/Nasalization) 67 | 68 | See [IPA Chart](https://www.ipachart.com/) for more details. 69 | 70 | ### Phonemes 71 | 72 | While phones represent individual sounds, phonemes are the phonetic units of a language that meaningfully distinguish words. A phoneme may be realized by many different phones. For example, the `/r/` in [Standard German](https://en.wikipedia.org/wiki/Standard_German_phonology) can be realized as a uvular fricative (χ/ʁ), a uvular approximant (ɹ), or a uvular tap or trill (ʀ/r). 
73 | 74 | A phoneme may also be composed of multiple phones, such as the [dipthong](https://en.wikipedia.org/wiki/Diphthong) `aʊ` in U.S. English (the "ow" in "cow"). 75 | 76 | Supported languages in `gruut-ipa` contain a `phonemes.txt` file in the `gruut_ipa/data` directory. This file has the following format: 77 | 78 | ```text 79 | [ ...] 80 | ``` 81 | 82 | where `` is a set of IPA letters, like `ɶ` or `aʊ`. The `` is a word whose pronunciation contains the ``. After that, there are one or more optional `` strings that will be replaced with ``. The German `/r/` example from above might be represented as: 83 | 84 | ```text 85 | r brot χ ʁ ɹ ʀ 86 | ``` 87 | 88 | Phonemes for a given language come from [phonological analyses](https://en.wikipedia.org/wiki/Template:Language_phonologies) and from [public databases](https://phoible.org/). Ultimately, they are geared towards capturing pronunciations from [Wiktionary](https://www.wiktionary.org/). 89 | 90 | ## Usage 91 | 92 | Print JSON information about phones: 93 | 94 | ```sh 95 | $ python3 -m gruut_ipa describe "ˈãː" | jq . 
96 | { 97 | "text": "ˈãː", 98 | "letters": "a", 99 | "stress": "primary", 100 | "height": "open", 101 | "placement": "front", 102 | "rounded": false, 103 | "type": "Vowel", 104 | "nasalated": true, 105 | "elongated": true 106 | } 107 | ``` 108 | 109 | Split an IPA pronunciation into phones: 110 | 111 | ```sh 112 | $ python3 -m gruut_ipa phones "ˈjɛs|ˈt͡ʃuːz aɪpiːeɪ‖" 113 | ˈj ɛ s | ˈt͡ʃ uː z a ɪ p iː e ɪ ‖ 114 | ``` 115 | 116 | Group phones into phonemes for a specific language: 117 | 118 | ```sh 119 | $ python3 -m gruut_ipa phonemes en-us "/dʒʌst ə kaʊ/" 120 | d͡ʒ ʌ s t ə k aʊ 121 | ``` 122 | 123 | Convert between IPA, [espeak](https://github.com/espeak-ng/), and [sampa](https://www.phon.ucl.ac.uk/home/sampa/): 124 | 125 | ```sh 126 | $ python3 -m gruut_ipa convert ipa espeak "mʊmˈbaɪ" 127 | [[mUm'baI]] 128 | 129 | $ python3 -m gruut_ipa convert espeak ipa "[[D,Is Iz sVm f@n'EtIk t'Ekst 'InpUt]]" 130 | ðˌɪs ɪz sʌm fɘnˈɛtɪk tˈɛkst ˈɪnpʊt 131 | ``` 132 | 133 | Chain commands together: 134 | 135 | ```sh 136 | $ python3 -m gruut_ipa convert espeak ipa "[[k'aU]]" | \ 137 | python3 -m gruut_ipa phonemes en-us --keep-stress 138 | k ˈaʊ 139 | ``` 140 | 141 | ### Alternative Phoneme Sets 142 | 143 | Some languages have multiple phoneme sets available: 144 | 145 | * U.S. English (`en-us`) 146 | * CMUDict (`en-us/cmudict`) 147 | * [Zamia](https://github.com/gooofy/zamia-speech) (`en-us/zamia`) 148 | * Swahili (`sw`) 149 | * [ALFFA](http://alffa.imag.fr/) (`sw/alffa`) 150 | 151 | Convert from IPA to alternative phoneme set: 152 | 153 | ```sh 154 | $ python3 -m gruut_ipa convert ipa en-us/cmudict "h ɛ l ˈoʊ w ˈɚ l d" 155 | HH EH0 L OW1 W ER1 L D 156 | ``` 157 | 158 | Convert from alternative phoneme set to IPA: 159 | 160 | ```sh 161 | $ python3 -m gruut_ipa convert en-us/cmudict ipa "HH EH0 L OW1 W ER1 L D" 162 | h ɛ l ˈoʊ w ˈɚ l d 163 | ``` 164 | 165 | ## Scripts 166 | 167 | Use the `speak-ipa` script to have [espeak](https://github.com/espeak-ng/) pronounce IPA. 
You may need to `apt-get install espeak` first. 168 | 169 | ```sh 170 | $ echo '/hɛloʊ wɝld/' | bin/speak-ipa en-us -s 60 -w 'hello world.wav' 171 | $ aplay 'hello world.wav' 172 | ``` 173 | 174 | ## Phones 175 | 176 | Supported IPA phones can be printed with: 177 | 178 | ```sh 179 | $ python3 -m gruut_ipa print 180 | {"text": "i", "letters": "i", "stress": "none", "height": "close", "placement": "front", "rounded": false, "type": "Vowel", "nasalated": false, "elongated": false, "description": "close front unrounded vowel", "espeak": "i", "sampa": "i"} 181 | {"text": "y", "letters": "y", "stress": "none", "height": "close", "placement": "front", "rounded": true, "type": "Vowel", "nasalated": false, "elongated": false, "description": "close front rounded vowel", "espeak": "y", "sampa": "y"} 182 | ... 183 | ``` 184 | 185 | A nice table can be generated with [jq](https://stedolan.github.io/jq/): 186 | 187 | ```sh 188 | $ python3 -m gruut_ipa print | \ 189 | jq -r '. | "\(.text)\t\(.espeak)\t\(.sampa)\t\(.description)"' 190 | ``` 191 | 192 | Converted to Markdown: 193 | 194 | | IPA | eSpeak | Sampa | Description | 195 | | ---- | ----- | ------- | ----------------------------------- | 196 | | i | i | i | close front unrounded vowel | 197 | | y | y | y | close front rounded vowel | 198 | | ɨ | i" | 1 | close central unrounded vowel | 199 | | ʉ | u" | } | close central rounded vowel | 200 | | ɯ | u- | M | close back unrounded vowel | 201 | | u | u | u | close back rounded vowel | 202 | | ɪ | I | I | near-close near-front unrounded vowel | 203 | | ʏ | I. | Y | near-close near-front rounded vowel | 204 | | ʊ | U | U | near-close near-back rounded vowel | 205 | | e | e | e | close-mid front unrounded vowel | 206 | | ø | Y | 2 | close-mid front rounded vowel | 207 | | ɘ | @ | @\\ | close-mid central unrounded vowel | 208 | | ɵ | @. 
| 8 | close-mid central rounded vowel | 209 | | ɤ | o- | 7 | close-mid back unrounded vowel | 210 | | o | o | o | close-mid back rounded vowel | 211 | | ɛ | E | E | open-mid front unrounded vowel | 212 | | œ | W | 9 | open-mid front rounded vowel | 213 | | ɜ | V" | 3 | open-mid central unrounded vowel | 214 | | ɞ | O" | 3\\ | open-mid central rounded vowel | 215 | | ʌ | V | V | open-mid back unrounded vowel | 216 | | ɔ | O | O | open-mid back rounded vowel | 217 | | æ | a | { | near-open front unrounded vowel | 218 | | ɐ | V | 6 | near-open central unrounded vowel | 219 | | a | a | a | open front unrounded vowel | 220 | | ɶ | W | & | open front rounded vowel | 221 | | ɑ | A | A | open back unrounded vowel | 222 | | ɒ | A. | Q | open back rounded vowel | 223 | | m | m | m | voiced bilabial nasal | 224 | | ɱ | M | F | voiced labio-dental nasal | 225 | | n | n | n | voiced alveolar nasal | 226 | | ɳ | n. | n\` | voiced retroflex nasal | 227 | | ŋ | N | N | voiced velar nasal | 228 | | ɴ | n" | N\\ | voiced uvular nasal | 229 | | p | p | p | voiceless bilabial plosive | 230 | | b | b | b | voiced bilabial plosive | 231 | | t | t | t | voiceless alveolar plosive | 232 | | d | d | d | voiced alveolar plosive | 233 | | ʈ | t. | t\` | voiceless retroflex plosive | 234 | | ɖ | d. | d\` | voiced retroflex plosive | 235 | | c | c | c | voiceless palatal plosive | 236 | | ɟ | J | J\\ | voiced palatal plosive | 237 | | k | k | k | voiceless velar plosive | 238 | | ɡ | g | g | voiced velar plosive | 239 | | g | g | g | voiced velar plosive | 240 | | q | q | q | voiceless uvular plosive | 241 | | ɢ | G | G\\ | voiced uvular plosive | 242 | | ʡ | | >\\ | voiceless pharyngeal plosive | 243 | | ʔ | ? | ? 
| voiceless glottal plosive | 244 | | p͡f | pf | pf | voiceless labio-dental affricate | 245 | | b͡v | bv | bv | voiced dental affricate | 246 | | t̪͡s | ts | t_ds | voiceless dental affricate | 247 | | t͡s | ts | ts | voiceless alveolar affricate | 248 | | d͡z | dz | dz | voiced alveolar affricate | 249 | | t͡ʃ | tS | tS | voiceless post-alveolar affricate | 250 | | d͡ʒ | dZ | dZ | voiced post-alveolar affricate | 251 | | ʈ͡ʂ | tS | ts\` | voiceless retroflex affricate | 252 | | ɖ͡ʐ | dz | dz\` | voiced retroflex affricate | 253 | | t͡ɕ | tS; | ts\\ | voiceless palatal affricate | 254 | | d͡ʑ | dZ; | dz\\ | voiced palatal affricate | 255 | | k͡x | k | k_x | voiceless velar affricate | 256 | | ɸ | F | p\\ | voiceless bilabial fricative | 257 | | β | B | B | voiced bilabial fricative | 258 | | f | f | f | voiceless labio-dental fricative | 259 | | v | v | v | voiced labio-dental fricative | 260 | | θ | T | T | voiceless dental fricative | 261 | | ð | D | D | voiced dental fricative | 262 | | s | s | s | voiceless alveolar fricative | 263 | | z | z | z | voiced alveolar fricative | 264 | | ʃ | S | S | voiceless post-alveolar fricative | 265 | | ʒ | Z | Z | voiced post-alveolar fricative | 266 | | ʂ | s. | s\` | voiceless retroflex fricative | 267 | | ʐ | z. | z\` | voiced palatal fricative | 268 | | ç | C | C | voiceless palatal fricative | 269 | | x | x | x | voiceless velar fricative | 270 | | ɣ | Q | G | voiced velar fricative | 271 | | χ | X | X | voiceless uvular fricative | 272 | | ʁ | g" | R | voiced uvular fricative | 273 | | ħ | H | X\\ | voiceless pharyngeal fricative | 274 | | h | h | h | voiceless glottal fricative | 275 | | ɦ | h | h\\ | voiced glottal fricative | 276 | | w | w | w | voiced bilabial approximant | 277 | | ʋ | v# | v\\ | voiced labio-dental approximant | 278 | | ɹ | r | r\\ | voiced alveolar approximant | 279 | | ɻ | r. 
def main():
    """Compute a pairwise distance matrix over all known IPA symbols.

    Each phoneme/break symbol is encoded as a feature vector (one-hot
    groups for unordered categorical features, a single scaled value for
    ordered ones), and a weighted Minkowski distance matrix is computed
    over the vectors.  The result is dumped as JSON to stdout: symbol
    list, feature columns, raw feature matrix, per-symbol nearest-first
    ranking, and the full distance matrix.
    """
    # Categorical feature columns and their allowed values.  "NONE" marks
    # "feature not applicable to this symbol".
    feature_cols = {
        "symbol_type": ["phoneme", "break"],
        "phoneme_type": ["NONE", "vowel", "consonant", "schwa"],
        "diacritic": ["NONE", "nasalated", "velarized"],
        "vowel_height": ["NONE"] + [v.value for v in VowelHeight],
        "vowel_place": ["NONE"] + [v.value for v in VowelPlacement],
        "vowel_rounded": ["NONE", "rounded", "unrounded"],
        "consonant_voiced": ["NONE", "voiced", "unvoiced"],
        "consonant_type": ["NONE"] + [v.value for v in ConsonantType],
        "consonant_place": ["NONE"] + [v.value for v in ConsonantPlace],
        "consonant_sounds_like": ["NONE", "r", "l", "g", ""],
        "break_type": ["NONE"] + [v.value for v in BreakType],
        "stress": ["NONE"] + [v.value for v in Stress],
    }

    # Columns whose values are ordered: encoded as one scaled number
    # instead of a one-hot group.
    ordinal_cols = {
        "vowel_height": VowelHeight,
        "vowel_place": VowelPlacement,
        "consonant_type": ConsonantType,
        "consonant_place": ConsonantPlace,
        "break_type": BreakType,
        "stress": Stress,
    }

    # Map each feature column to its slice (one-hot) or index (ordinal) in
    # the final feature vector: one-hot groups first, then ordinal columns.
    feature_keys = {}
    offset = 0
    for feature_col, feature_values in feature_cols.items():
        if feature_col in ordinal_cols:
            continue

        feature_keys[feature_col] = slice(offset, offset + len(feature_values))
        offset += len(feature_values)

    for feature_col in ordinal_cols:
        feature_keys[feature_col] = offset
        offset += 1

    ordinal_enc = OrdinalEncoder(categories=[feature_cols[col] for col in ordinal_cols])
    onehot_enc = OneHotEncoder(
        categories=[
            feature_cols[col] for col in feature_cols if col not in ordinal_cols
        ]
    )

    # Collect features for every known symbol: breaks first, then phonemes.
    symbol_features = {}

    for break_symbol, break_type in [
        (IPA.BREAK_WORD, BreakType.WORD),
        (IPA.BREAK_MINOR, BreakType.MINOR),
        (IPA.BREAK_MAJOR, BreakType.MAJOR),
    ]:
        symbol_features[break_symbol] = {
            "symbol_type": "break",
            "break_type": str(break_type.value),
        }

    for s in itertools.chain(gruut_ipa.VOWELS, gruut_ipa.CONSONANTS, gruut_ipa.SCHWAS):
        if s in symbol_features:
            continue

        p = gruut_ipa.Phoneme(s)
        features = {"symbol_type": "phoneme"}

        if p.vowel:
            features["phoneme_type"] = "vowel"
            features["vowel_height"] = p.vowel.height.value
            features["vowel_place"] = p.vowel.placement.value
            features["vowel_rounded"] = "rounded" if p.vowel.rounded else "unrounded"

            if p.nasalated:
                features["diacritic"] = "nasalated"
        elif p.consonant:
            features["phoneme_type"] = "consonant"
            features["consonant_voiced"] = (
                "voiced" if p.consonant.voiced else "unvoiced"
            )
            features["consonant_type"] = p.consonant.type.value
            features["consonant_place"] = p.consonant.place.value
            features["consonant_sounds_like"] = p.consonant.sounds_like.value

            if p.consonant.velarized:
                features["diacritic"] = "velarized"
        elif p.schwa:
            features["phoneme_type"] = "schwa"
            if p.schwa.r_coloured:
                features["consonant_sounds_like"] = "r"

        symbol_features[s] = features

    # Build one row per symbol, then encode all rows in a single batch.
    # (Previously fit_transform was re-run once per symbol, refitting both
    # encoders on every iteration; with fixed `categories` the output is
    # identical, so fit once over all rows instead.)
    symbols = list(symbol_features.keys())
    onehot_rows = []
    ordinal_rows = []
    for s in symbols:
        features = symbol_features[s]
        for col in feature_cols:
            if col not in features:
                features[col] = "NONE"

        # Ordinal members of feature_cols appear in the same order as the
        # ordinal_cols dict, matching the encoders' `categories` order.
        ordinal_rows.append([features[col] for col in ordinal_cols])
        onehot_rows.append(
            [features[col] for col in feature_cols if col not in ordinal_cols]
        )

    onehot_matrix = onehot_enc.fit_transform(onehot_rows).toarray()
    ordinal_matrix = ordinal_enc.fit_transform(ordinal_rows)

    # Scale each ordinal column into [0, 1) by its number of enum members.
    for col_i, col_val in enumerate(ordinal_cols.values()):
        ordinal_matrix[:, col_i] /= len(col_val)

    matrix = np.hstack((onehot_matrix, ordinal_matrix))

    # Per-feature weights for the distance metric: de-emphasize vowel
    # placement/rounding and (especially) consonant place.
    w = np.ones(matrix.shape[1])
    w[feature_keys["vowel_place"]] = 0.5
    w[feature_keys["vowel_rounded"]] = 0.5
    w[feature_keys["consonant_place"]] = 0.05
    w[feature_keys["consonant_sounds_like"]] = 0.5

    dist = sklearn.metrics.pairwise_distances(matrix, metric="minkowski", p=2, w=w)

    json.dump(
        {
            "symbols": symbols,
            "columns": list(feature_cols.items()),
            "features": matrix.tolist(),
            "closest": {
                # argsort()[0] is the symbol itself (distance 0); drop it.
                s: list(symbols[j] for j in dist[i].argsort())[1:]
                for i, s in enumerate(symbols)
            },
            "distances": dist.tolist(),
        },
        sys.stdout,
        indent=4,
        ensure_ascii=False,
    )
| export PYTHONPATH="${src_dir}:${PYTHONPATH}" 38 | python3 -m gruut_ipa convert ipa espeak | \ 39 | tee >(cat >&2) | \ 40 | while read line; 41 | do espeak "${espeak_args[@]}" "${line}" 42 | done 43 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { pkgs ? import {} }: 2 | pkgs.callPackage ./derivation.nix {} 3 | -------------------------------------------------------------------------------- /derivation.nix: -------------------------------------------------------------------------------- 1 | { lib, nixpkgs ? import {}, pythonPkgs ? nixpkgs.pkgs.python38Packages }: 2 | pythonPkgs.buildPythonPackage rec { 3 | name = "gruut-ipa-${version}"; 4 | version = "0.10.0"; 5 | 6 | src = pythonPkgs.fetchPypi { 7 | inherit version; 8 | pname = "gruut-ipa"; 9 | sha256 = "1kxrpv4qnzqbgv0vprlsvk0y0p58pl9xxz8sm7z4xxbyd1zamicf"; 10 | }; 11 | 12 | meta = with lib; { 13 | homepage = "https://github.com/rhasspy/gruut-ipa"; 14 | description = "Library for manipulating pronunciations using the International Phonetic Alphabet (IPA)"; 15 | license = licenses.mit; 16 | platforms = platforms.linux; 17 | }; 18 | 19 | doCheck = false; 20 | } 21 | -------------------------------------------------------------------------------- /gruut_ipa/__init__.py: -------------------------------------------------------------------------------- 1 | """Classes for dealing with phones and phonemes""" 2 | from gruut_ipa.accent import GuessedPhonemes, guess_phonemes # noqa: F401 3 | from gruut_ipa.constants import ( # noqa: F401 4 | CONSONANTS, 5 | FEATURE_COLUMNS, 6 | FEATURE_EMPTY, 7 | FEATURE_KEYS, 8 | FEATURE_ORDINAL_COLUMNS, 9 | IPA, 10 | LANG_ALIASES, 11 | SCHWAS, 12 | VOWELS, 13 | Accent, 14 | Break, 15 | BreakType, 16 | Consonant, 17 | ConsonantPlace, 18 | ConsonantType, 19 | Dipthong, 20 | Intonation, 21 | PhonemeLength, 22 | Schwa, 23 | Stress, 24 | Vowel, 25 | 
def do_print(args):
    """Print JSON descriptions of all known IPA phones, one per line.

    When ``args.language`` is set, output is restricted to phones in that
    language's phoneme set (``data/<language>/phonemes.txt``).
    """
    from gruut_ipa import CONSONANTS, SCHWAS, VOWELS, Phoneme, Phonemes
    from gruut_ipa.espeak import ipa_to_espeak
    from gruut_ipa.sampa import ipa_to_sampa

    allowed_phonemes: typing.Set[str] = set()

    if args.language:
        # Load the phoneme set for the requested language (or language/set)
        phonemes_path = _DATA_DIR / args.language / "phonemes.txt"

        _LOGGER.debug("Loading phonemes from %s", phonemes_path)
        with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
            lang_phonemes = Phonemes.from_text(phonemes_file)

        allowed_phonemes.update(p.text for p in lang_phonemes)

    for phone_text in sorted(itertools.chain(VOWELS, CONSONANTS, SCHWAS)):
        phone = Phoneme(phone_text)

        if allowed_phonemes and (phone.text not in allowed_phonemes):
            # Not part of the requested language's phoneme set
            continue

        # Human-readable description, e.g. "open front unrounded vowel"
        if phone.vowel:
            vowel = phone.vowel
            description = " ".join(
                [
                    vowel.height.value,
                    vowel.placement.value,
                    "rounded" if vowel.rounded else "unrounded",
                    "vowel",
                ]
            )
        elif phone.consonant:
            consonant = phone.consonant
            description = " ".join(
                [
                    "voiced" if consonant.voiced else "voiceless",
                    consonant.place.value,
                    consonant.type.value,
                ]
            )
        elif phone.schwa:
            description = "r-coloured schwa" if phone.schwa.r_coloured else "schwa"
        else:
            description = ""

        record = phone.to_dict()
        record["description"] = description

        # Include the equivalent espeak/sampa representations
        record["espeak"] = ipa_to_espeak(phone_text)
        record["sampa"] = ipa_to_sampa(phone_text)

        print(json.dumps(record, ensure_ascii=False))
def do_phones(args):
    """Split IPA pronunciation(s) into individual phones and print them.

    Pronunciations come from ``args.pronunciation`` when given, otherwise
    they are read line by line from stdin.
    """
    from gruut_ipa import Pronunciation

    if args.pronunciation:
        # From arguments
        lines = args.pronunciation
    else:
        # From stdin
        lines = sys.stdin

        if os.isatty(sys.stdin.fileno()):
            print("Reading pronunciations from stdin...", file=sys.stderr)

    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        pron = Pronunciation.from_string(raw_line)
        print(args.separator.join(p.text for p in pron if p.text))
        sys.stdout.flush()
as phonemes_file: 188 | phonemes = Phonemes.from_text(phonemes_file) 189 | 190 | for line in pronunciations: 191 | line = line.strip() 192 | if line: 193 | line_phonemes = phonemes.split( 194 | line, keep_stress=args.keep_stress, drop_tones=args.drop_tones 195 | ) 196 | phonemes_str = args.separator.join(p.text for p in line_phonemes if p.text) 197 | print(phonemes_str) 198 | sys.stdout.flush() 199 | 200 | 201 | # ----------------------------------------------------------------------------- 202 | 203 | 204 | def do_convert(args): 205 | """Convert pronunciations between different representations""" 206 | from gruut_ipa import Phoneme, Phonemes 207 | from gruut_ipa.espeak import espeak_to_ipa, ipa_to_espeak 208 | from gruut_ipa.sampa import ipa_to_sampa, sampa_to_ipa 209 | 210 | fixed_src_dest = {"ipa", "espeak", "sampa"} 211 | src_phonemes: typing.Optional[Phonemes] = None 212 | dest_phonemes: typing.Optional[Phonemes] = None 213 | 214 | if args.src not in fixed_src_dest: 215 | src_phonemes = Phonemes.from_language(args.src) 216 | 217 | if args.dest not in fixed_src_dest: 218 | dest_phoneme_map = Phonemes.from_language(args.dest).gruut_ipa_map 219 | 220 | # ipa -> original phoneme 221 | dest_phonemes = Phonemes() 222 | for k, v in dest_phoneme_map.items(): 223 | if v in dest_phonemes.gruut_ipa_map: 224 | continue 225 | 226 | dest_phonemes.phonemes.append(Phoneme(text=k, is_ipa=False)) 227 | dest_phonemes.ipa_map[v] = k 228 | 229 | dest_phonemes.update() 230 | 231 | if args.pronunciation: 232 | # From arguments 233 | pronunciations = args.pronunciation 234 | else: 235 | # From stdin 236 | pronunciations = sys.stdin 237 | 238 | if os.isatty(sys.stdin.fileno()): 239 | print("Reading pronunciations from stdin...", file=sys.stderr) 240 | 241 | for line in pronunciations: 242 | line = line.strip() 243 | if line: 244 | if args.src == "ipa": 245 | src_ipa = line 246 | elif args.src == "espeak": 247 | src_ipa = espeak_to_ipa(line) 248 | elif args.src == "sampa": 249 | 
def do_convert(args):
    """Convert pronunciations between different representations.

    Sources/destinations may be the fixed formats (ipa, espeak, sampa) or a
    language/phoneme-set code; language codes are mapped through their
    gruut-ipa phoneme inventories.
    """
    from gruut_ipa import Phoneme, Phonemes
    from gruut_ipa.espeak import espeak_to_ipa, ipa_to_espeak
    from gruut_ipa.sampa import ipa_to_sampa, sampa_to_ipa

    special_formats = {"ipa", "espeak", "sampa"}

    # Source phoneme inventory (only needed for a language code)
    src_phonemes: typing.Optional[Phonemes] = None
    if args.src not in special_formats:
        src_phonemes = Phonemes.from_language(args.src)

    # Destination inventory, inverted so it maps ipa -> original phoneme
    dest_phonemes: typing.Optional[Phonemes] = None
    if args.dest not in special_formats:
        dest_phoneme_map = Phonemes.from_language(args.dest).gruut_ipa_map

        dest_phonemes = Phonemes()
        for orig_text, ipa_text in dest_phoneme_map.items():
            if ipa_text in dest_phonemes.gruut_ipa_map:
                # First mapping for this IPA string wins
                continue

            dest_phonemes.phonemes.append(Phoneme(text=orig_text, is_ipa=False))
            dest_phonemes.ipa_map[ipa_text] = orig_text

        dest_phonemes.update()

    def line_to_ipa(text: str) -> str:
        """Convert one input line to IPA according to args.src."""
        if args.src == "ipa":
            return text

        if args.src == "espeak":
            return espeak_to_ipa(text)

        if args.src == "sampa":
            return sampa_to_ipa(text)

        assert src_phonemes is not None
        return args.separator.join(
            src_phonemes.gruut_ipa_map.get(p.text, p.text)
            for p in src_phonemes.split(text)
        )

    def ipa_to_dest(ipa: str) -> str:
        """Convert an IPA string to the destination format (args.dest)."""
        if args.dest == "ipa":
            return ipa

        if args.dest == "espeak":
            return "[[" + ipa_to_espeak(ipa) + "]]"

        if args.dest == "sampa":
            return ipa_to_sampa(ipa)

        assert dest_phonemes is not None
        return args.separator.join(
            p.text for p in dest_phonemes.split(ipa, is_ipa=False)
        )

    if args.pronunciation:
        # From arguments
        pronunciations = args.pronunciation
    else:
        # From stdin
        pronunciations = sys.stdin

        if os.isatty(sys.stdin.fileno()):
            print("Reading pronunciations from stdin...", file=sys.stderr)

    for raw_line in pronunciations:
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        print(ipa_to_dest(line_to_ipa(raw_line)))
        sys.stdout.flush()
def get_args() -> argparse.Namespace:
    """Parse command-line arguments for the gruut_ipa CLI."""
    parser = argparse.ArgumentParser(prog="gruut_ipa")

    # One sub-parser per sub-command; each registers its handler via
    # set_defaults(func=...)
    sub_parsers = parser.add_subparsers(dest="command")
    sub_parsers.required = True

    # print
    print_parser = sub_parsers.add_parser("print", help="Print all known IPA phones")
    print_parser.add_argument(
        "--language", help="Only print phones from a specific language or language/set"
    )
    print_parser.set_defaults(func=do_print)

    # describe
    describe_parser = sub_parsers.add_parser("describe", help="Describe IPA phone(s)")
    describe_parser.set_defaults(func=do_describe)
    describe_parser.add_argument(
        "phone", nargs="*", help="IPA phones (read from stdin if not provided)"
    )

    # phones
    phones_parser = sub_parsers.add_parser(
        "phones", help="Group phones in IPA pronunciation"
    )
    phones_parser.set_defaults(func=do_phones)
    phones_parser.add_argument(
        "pronunciation",
        nargs="*",
        help="IPA pronunciations (read from stdin if not provided)",
    )
    phones_parser.add_argument(
        "--separator",
        default=" ",
        help="Separator to add between phones in output (default: space)",
    )

    # phonemes
    phonemes_parser = sub_parsers.add_parser(
        "phonemes",
        help="Group phones in IPA pronunciation according to language phonemes",
    )
    phonemes_parser.set_defaults(func=do_phonemes)
    phonemes_parser.add_argument("language", help="Language code (e.g., en-us)")
    phonemes_parser.add_argument(
        "pronunciation",
        nargs="*",
        help="IPA pronunciations (read from stdin if not provided)",
    )
    phonemes_parser.add_argument(
        "--separator",
        default=" ",
        help="Separator to add between phonemes in output (default: space)",
    )
    phonemes_parser.add_argument(
        "--keep-stress",
        action="store_true",
        help="Keep primary/secondary stress markers",
    )
    phonemes_parser.add_argument(
        "--drop-tones", action="store_true", help="Remove tone numbers/letters"
    )
    phonemes_parser.add_argument(
        "--phonemes-file", help="Load phonemes from file instead of using language code"
    )

    # convert
    convert_parser = sub_parsers.add_parser(
        "convert", help="Convert pronunciations between ipa, espeak, and sampa"
    )
    convert_parser.set_defaults(func=do_convert)
    convert_parser.add_argument(
        "src", help="Source format (language, language/set, ipa, espeak, sampa)"
    )
    convert_parser.add_argument(
        "dest", help="Destination format (language, language/set, ipa, espeak, sampa)"
    )
    convert_parser.add_argument(
        "pronunciation",
        nargs="*",
        help="Pronunciations (read from stdin if not provided)",
    )
    convert_parser.add_argument(
        "--separator", default=" ", help="Separator between phonemes (default: space)"
    )

    # --debug is shared by every sub-command
    for sub_parser in (
        print_parser,
        describe_parser,
        phones_parser,
        phonemes_parser,
        convert_parser,
    ):
        sub_parser.add_argument(
            "--debug", action="store_true", help="Print DEBUG messages to console"
        )

    return parser.parse_args()
between phonemes (default: space)" 372 | ) 373 | 374 | # Shared arguments 375 | for sub_parser in [ 376 | print_parser, 377 | describe_parser, 378 | phones_parser, 379 | phonemes_parser, 380 | convert_parser, 381 | ]: 382 | sub_parser.add_argument( 383 | "--debug", action="store_true", help="Print DEBUG messages to console" 384 | ) 385 | 386 | return parser.parse_args() 387 | 388 | 389 | # ----------------------------------------------------------------------------- 390 | 391 | 392 | if __name__ == "__main__": 393 | main() 394 | -------------------------------------------------------------------------------- /gruut_ipa/accent.py: -------------------------------------------------------------------------------- 1 | """Methods for mapping phonemes from one language to another""" 2 | import typing 3 | import unicodedata 4 | from copy import copy 5 | from dataclasses import dataclass 6 | 7 | from gruut_ipa.constants import Vowel, VowelHeight, VowelPlacement 8 | from gruut_ipa.distances import get_closest 9 | from gruut_ipa.phonemes import Phoneme, Phonemes, Pronunciation 10 | 11 | # --------------------------------------------------------------------- 12 | 13 | R_LIKE = ["ɹ", "ʁ", "r", "ʀ", "ɻ", "ɚ"] 14 | SCHWA_PREFERRED = ["ə", "ɐ"] 15 | GS = ["ɡ", "g"] 16 | PHARY_GLOTTAL = ["ʡ", "ʔ"] 17 | 18 | MATCHING_PHONEMES = typing.List[Phoneme] 19 | 20 | 21 | @dataclass 22 | class GuessedPhonemes: 23 | """Result from guess_phonemes""" 24 | 25 | phonemes: MATCHING_PHONEMES 26 | distance: typing.Optional[float] = None 27 | 28 | 29 | def guess_phonemes( 30 | from_phoneme: typing.Union[str, Phoneme], to_phonemes: Phonemes, 31 | ) -> GuessedPhonemes: 32 | """Get best matching phonemes for a single phoneme""" 33 | best_phonemes: MATCHING_PHONEMES = [] 34 | best_dist: typing.Optional[float] = None 35 | 36 | from_codepoints: typing.Optional[typing.Set[str]] = None 37 | 38 | if isinstance(from_phoneme, str): 39 | # Parse phoneme 40 | from_phoneme = Phoneme(from_phoneme) 41 | 42 | if 
def guess_phonemes(
    from_phoneme: typing.Union[str, Phoneme], to_phonemes: Phonemes,
) -> GuessedPhonemes:
    """Get best matching phonemes for a single phoneme.

    Tries a chain of special-case mappings first ("g" variants, schwas,
    r-like sounds, pharyngeal/glottal stops).  Failing those, searches
    ``to_phonemes`` for an exact or near-exact match, then falls back to
    splitting multi-letter phonemes and matching each piece recursively,
    or to a precomputed phoneme-distance table for single letters.

    Returns a GuessedPhonemes whose ``phonemes`` list may be empty
    (phoneme should be dropped) and whose ``distance`` is None when no
    match was found at all.
    """
    # Best candidate(s) found so far and their distance (None = none yet;
    # lower distance = better match)
    best_phonemes: MATCHING_PHONEMES = []
    best_dist: typing.Optional[float] = None

    # Lazily-computed set of NFD codepoints of from_phoneme.text
    from_codepoints: typing.Optional[typing.Set[str]] = None

    if isinstance(from_phoneme, str):
        # Parse phoneme
        from_phoneme = Phoneme(from_phoneme)

    if from_phoneme.text in GS:
        # Correctly map two forms of "g"
        for maybe_g in GS:
            if maybe_g in to_phonemes:
                best_phonemes = [Phoneme(maybe_g)]
                best_dist = 0.0
                break

    if (not best_phonemes) and from_phoneme.schwa:
        if from_phoneme.schwa.r_coloured:
            # Try r-like
            for maybe_r_like in R_LIKE:
                if maybe_r_like in to_phonemes:
                    best_phonemes = [Phoneme(maybe_r_like)]
                    best_dist = 0.0
                    break

        if not best_phonemes:
            for maybe_schwa in SCHWA_PREFERRED:
                # Try known schwa preferences
                if maybe_schwa in to_phonemes:
                    best_phonemes = [Phoneme(maybe_schwa)]
                    best_dist = 0.0
                    break

        if not best_phonemes:
            # Treat as a mid-central vowel for the general search below.
            # Copy first so the caller's phoneme object is not mutated.
            from_phoneme = copy(from_phoneme)
            setattr(
                from_phoneme,
                "vowel",
                Vowel(
                    ipa="ə",
                    height=VowelHeight.MID,
                    placement=VowelPlacement.CENTRAL,
                    rounded=False,
                ),
            )

    if (not best_phonemes) and (from_phoneme.text in R_LIKE):
        # Map r-like consonant
        for maybe_r in R_LIKE:
            if maybe_r in to_phonemes:
                best_phonemes = [Phoneme(maybe_r)]
                best_dist = 0.0
                break

    if (not best_phonemes) and (from_phoneme.text in PHARY_GLOTTAL):
        # Map or drop
        for maybe_pg in PHARY_GLOTTAL:
            if maybe_pg in to_phonemes:
                best_phonemes = [Phoneme(maybe_pg)]
                best_dist = 0
                break

        if not best_phonemes:
            # Drop: no distance reported, empty phoneme list
            return GuessedPhonemes(phonemes=[])

    if best_phonemes:
        # A special-case mapping above succeeded
        return GuessedPhonemes(phonemes=best_phonemes, distance=best_dist)

    # Search through target phonemes
    for to_phoneme in to_phonemes:
        if from_phoneme.text == to_phoneme.text:
            # Easy case: exact match
            best_phonemes = [to_phoneme]
            best_dist = 0.0
            break

        if from_phoneme.letters == to_phoneme.letters:
            # Match except for elongation, accent
            if from_codepoints is None:
                from_codepoints = set(unicodedata.normalize("NFD", from_phoneme.text))

            # Compute a "distance" based on how many codepoints different between the two phonemes.
            # This should usually be < 1 so that it can be a better match than the vowel/consonant distances.
            to_codepoints = set(unicodedata.normalize("NFD", to_phoneme.text))

            # Divide by 10 to ensure this is usually < 1
            dist = abs(len(from_codepoints) - len(to_codepoints)) / 10.0

            if (best_dist is None) or (dist < best_dist):
                best_phonemes = [to_phoneme]
                best_dist = dist

            continue

    if len(from_phoneme.letters) > 1:
        # Split apart and match each letter separately
        best_split: MATCHING_PHONEMES = []
        split_phonemes = Pronunciation.from_string(from_phoneme.text, keep_ties=False)
        dist = 1.0  # starts at 1.0: splitting itself carries a penalty

        for split_phoneme in split_phonemes:
            guessed = guess_phonemes(split_phoneme.text, to_phonemes)

            if (not guessed.phonemes) or (guessed.distance is None):
                # NOTE(review): a failed piece only stops the loop; the
                # partial split accumulated so far may still be used below
                break

            dist += guessed.distance
            best_split.extend(guessed.phonemes)

        if (best_dist is None) or (dist < best_dist):
            best_phonemes = best_split
            best_dist = dist
    elif best_dist is None:
        # Single letter with no match yet: consult precomputed
        # phoneme-distance table for the closest known phonemes
        closest = get_closest(from_phoneme.text)

        if closest:
            for candidate_str in closest:
                for to_phoneme in to_phonemes:
                    if candidate_str == to_phoneme.text:
                        best_phonemes = [Phoneme(candidate_str)]
                        best_dist = 0.5
                        break

                if best_dist is not None:
                    break

    if best_dist is None:
        # Last resort: first target phoneme sharing the same initial
        # letter, with a large penalty
        for to_phoneme in to_phonemes:
            if from_phoneme.letters[0] == to_phoneme.letters[0]:
                best_phonemes = [to_phoneme]
                best_dist = 10.0
                break

    return GuessedPhonemes(phonemes=best_phonemes, distance=best_dist)
dist_place = abs( 226 | # CONSONANT_PLACE_NUM[consonant_1.place] - CONSONANT_PLACE_NUM[consonant_2.place] 227 | # ) 228 | # dist_voiced = 1 if consonant_1.voiced != consonant_2.voiced else 0 229 | 230 | # return dist_type + dist_place + dist_voiced 231 | -------------------------------------------------------------------------------- /gruut_ipa/constants.py: -------------------------------------------------------------------------------- 1 | """Enums, vowels, and consonants for gruut-ipa""" 2 | import typing 3 | import unicodedata 4 | from dataclasses import dataclass 5 | from enum import Enum 6 | from pathlib import Path 7 | 8 | _DIR = Path(__file__).parent 9 | 10 | _DATA_DIR = _DIR / "data" 11 | 12 | LANG_ALIASES = { 13 | "ar": "ar", 14 | "cs": "cs-cz", 15 | "de": "de-de", 16 | "en": "en-us", 17 | "es": "es-es", 18 | "fa": "fa", 19 | "fr": "fr-fr", 20 | "it": "it-it", 21 | "nl": "nl", 22 | "pt-br": "pt", 23 | "ru": "ru-ru", 24 | "sv": "sv-se", 25 | "sw": "sw", 26 | } 27 | 28 | 29 | class IPA(str, Enum): 30 | """International phonetic alphabet characters""" 31 | 32 | STRESS_PRIMARY = "\u02C8" # ˈ 33 | STRESS_SECONDARY = "\u02CC" # ˌ 34 | 35 | ACCENT_ACUTE = "'" 36 | ACCENT_GRAVE = "²" 37 | 38 | LONG = "\u02D0" # ː 39 | HALF_LONG = "\u02D1" # eˑ 40 | EXTRA_SHORT = "\u0306" # ə̆ 41 | NASAL = "\u0303" # ẽ 42 | RAISED = "\u031D" # r̝ 43 | TIE_ABOVE = "\u0361" # ͡ 44 | TIE_BELOW = "\u035C" # ͜ 45 | 46 | SYLLABIC = "\u0329" 47 | NON_SYLLABIC = "\u032F" 48 | 49 | BREAK_SYLLABLE = "." 
50 | BREAK_MINOR = "|" 51 | BREAK_MAJOR = "\u2016" # ‖ 52 | BREAK_WORD = "#" 53 | 54 | INTONATION_RISING = "\u2197" # ↗ 55 | INTONATION_FALLING = "\u2198" # ↘ 56 | 57 | TONE_1 = "¹" 58 | TONE_2 = "²" 59 | TONE_3 = "³" 60 | TONE_4 = "⁴" 61 | TONE_5 = "⁵" 62 | TONE_6 = "⁶" 63 | TONE_7 = "⁷" 64 | TONE_8 = "⁸" 65 | TONE_9 = "⁹" 66 | 67 | TONE_EXTRA_HIGH = "˥" 68 | TONE_HIGH = "˦" 69 | TONE_MID = "˧" 70 | TONE_LOW = "˨" 71 | TONE_EXTRA_LOW = "˩" 72 | 73 | TONE_GLOTTALIZED = "ˀ" 74 | TONE_SHORT = "ʔ" 75 | 76 | BRACKET_PHONETIC_LEFT = "[" 77 | BRACKET_PHONETIC_RIGHT = "]" 78 | BRACKET_PHONEMIC_LEFT = "/" 79 | BRACKET_PHONEMIC_RIGHT = "/" 80 | BRACKET_PROSODIC_LEFT = "{" 81 | BRACKET_PROSODIC_RIGHT = "}" 82 | BRACKET_OPTIONAL_LEFT = "(" 83 | BRACKET_OPTIONAL_RIGHT = ")" 84 | 85 | @staticmethod 86 | def is_long(codepoint: str) -> bool: 87 | """True if elongated symbol""" 88 | return codepoint == IPA.LONG 89 | 90 | @staticmethod 91 | def is_nasal(codepoint: str) -> bool: 92 | """True if nasalated diacritic""" 93 | return codepoint == IPA.NASAL 94 | 95 | @staticmethod 96 | def is_raised(codepoint: str) -> bool: 97 | """True if rased diacritic""" 98 | return codepoint == IPA.RAISED 99 | 100 | @staticmethod 101 | def is_stress(codepoint: str) -> bool: 102 | """True if primary/secondary stress symbol""" 103 | return codepoint in (IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY) 104 | 105 | @staticmethod 106 | def is_accent(codepoint: str) -> bool: 107 | """True if accent symbol""" 108 | return codepoint in {IPA.ACCENT_ACUTE, IPA.ACCENT_GRAVE} 109 | 110 | @staticmethod 111 | def is_tie(codepoint: str) -> bool: 112 | """True if above/below tie symbol""" 113 | return codepoint in (IPA.TIE_ABOVE, IPA.TIE_BELOW) 114 | 115 | @staticmethod 116 | def is_bracket(codepoint: str) -> bool: 117 | """True if any IPA bracket symbol""" 118 | return codepoint in { 119 | IPA.BRACKET_PHONETIC_LEFT, 120 | IPA.BRACKET_PHONETIC_RIGHT, 121 | IPA.BRACKET_PHONEMIC_LEFT, 122 | IPA.BRACKET_PHONEMIC_RIGHT, 123 | 
IPA.BRACKET_PROSODIC_LEFT, 124 | IPA.BRACKET_PROSODIC_RIGHT, 125 | IPA.BRACKET_OPTIONAL_LEFT, 126 | IPA.BRACKET_OPTIONAL_RIGHT, 127 | } 128 | 129 | @staticmethod 130 | def is_break(codepoint: str) -> bool: 131 | """True if any IPA break symbol""" 132 | return codepoint in { 133 | IPA.BREAK_SYLLABLE, 134 | IPA.BREAK_MINOR, 135 | IPA.BREAK_MAJOR, 136 | IPA.BREAK_WORD, 137 | } 138 | 139 | @staticmethod 140 | def is_intonation(codepoint: str) -> bool: 141 | """True if a rising or falling IPA intonation symbol""" 142 | return codepoint in {IPA.INTONATION_RISING, IPA.INTONATION_FALLING} 143 | 144 | @staticmethod 145 | def is_tone(codepoint: str) -> bool: 146 | """True if any IPA tone symbol""" 147 | return codepoint in { 148 | IPA.TONE_1, 149 | IPA.TONE_2, 150 | IPA.TONE_3, 151 | IPA.TONE_4, 152 | IPA.TONE_5, 153 | IPA.TONE_6, 154 | IPA.TONE_7, 155 | IPA.TONE_8, 156 | IPA.TONE_9, 157 | IPA.TONE_EXTRA_HIGH, 158 | IPA.TONE_HIGH, 159 | IPA.TONE_MID, 160 | IPA.TONE_LOW, 161 | IPA.TONE_EXTRA_LOW, 162 | } 163 | 164 | @staticmethod 165 | def graphemes(codepoints: str) -> typing.List[str]: 166 | """Split a string into graphemes""" 167 | codepoints = unicodedata.normalize("NFD", codepoints) 168 | 169 | graphemes = [] 170 | grapheme = "" 171 | 172 | for c in codepoints: 173 | if unicodedata.combining(c) > 0: 174 | grapheme += c 175 | elif grapheme: 176 | # Next grapheme 177 | graphemes.append(unicodedata.normalize("NFC", grapheme)) 178 | grapheme = c 179 | else: 180 | # Start of grapheme 181 | grapheme = c 182 | 183 | if grapheme: 184 | # Final grapheme 185 | graphemes.append(unicodedata.normalize("NFC", grapheme)) 186 | 187 | return graphemes 188 | 189 | @staticmethod 190 | def without_stress(codepoints: str, drop_accent: bool = True) -> str: 191 | """Return string without primary/secondary stress""" 192 | return "".join( 193 | c 194 | for c in codepoints 195 | if (not IPA.is_stress(c) and (not drop_accent or not IPA.is_accent(c))) 196 | ) 197 | 198 | 199 | class Stress(str, 
Enum): 200 | """Applied stress""" 201 | 202 | SECONDARY = "secondary" 203 | PRIMARY = "primary" 204 | 205 | 206 | class Accent(str, Enum): 207 | """Applied accent""" 208 | 209 | ACUTE = "acute" # ' 210 | GRAVE = "grave" # ² 211 | 212 | 213 | class BreakType(str, Enum): 214 | """Type of break""" 215 | 216 | WORD = "word" # '#' 217 | MINOR = "minor" # | 218 | MAJOR = "major" # ‖ 219 | 220 | 221 | class PhonemeLength(str, Enum): 222 | """Spoken length of a phoneme""" 223 | 224 | SHORT = "short" # ˑ 225 | NORMAL = "normal" 226 | LONG = "long" # ː 227 | 228 | 229 | # ----------------------------------------------------------------------------- 230 | 231 | 232 | class VowelHeight(str, Enum): 233 | """Height of a vowel""" 234 | 235 | CLOSE = "close" 236 | NEAR_CLOSE = "near-close" 237 | CLOSE_MID = "close-mid" 238 | MID = "mid" 239 | OPEN_MID = "open-mid" 240 | NEAR_OPEN = "near-open" 241 | OPEN = "open" 242 | 243 | 244 | class VowelPlacement(str, Enum): 245 | """Front/back placement of a vowel""" 246 | 247 | FRONT = "front" 248 | NEAR_FRONT = "near-front" 249 | CENTRAL = "central" 250 | NEAR_BACK = "near-back" 251 | BACK = "back" 252 | 253 | 254 | @dataclass 255 | class Vowel: 256 | """Necessary information for a vowel""" 257 | 258 | ipa: str 259 | height: VowelHeight 260 | placement: VowelPlacement 261 | rounded: bool 262 | nasalated: bool = False 263 | stress: typing.Optional[Stress] = None 264 | length: PhonemeLength = PhonemeLength.NORMAL 265 | alias_of: typing.Optional[str] = None 266 | 267 | 268 | # ----------------------------------------------------------------- 269 | # Vowels Front Near-Front Central Near-Back Back 270 | # ----------------------------------------------------------------- 271 | # Close i/y ɨ/ʉ ɯ/u 272 | # Near-Close ɪ/ʏ ʊ 273 | # Close-Mid e/ø ɘ/ɵ ɤ/o 274 | # Mid ə 275 | # Open-Mid ɛ/œ ɜ/ɞ ʌ/ɔ 276 | # Near-Open æ ɐ 277 | # Open a/ɶ ɑ/ɒ 278 | # ----------------------------------------------------------------- 279 | 280 | 281 | _VOWELS = [ 282 | 
Vowel("i", VowelHeight.CLOSE, VowelPlacement.FRONT, False), 283 | Vowel("y", VowelHeight.CLOSE, VowelPlacement.FRONT, True), 284 | Vowel("ɨ", VowelHeight.CLOSE, VowelPlacement.CENTRAL, False), 285 | Vowel("ᵻ", VowelHeight.CLOSE, VowelPlacement.CENTRAL, False, alias_of="ɨ"), 286 | Vowel("ʉ", VowelHeight.CLOSE, VowelPlacement.CENTRAL, True), 287 | Vowel("ɯ", VowelHeight.CLOSE, VowelPlacement.BACK, False), 288 | Vowel("u", VowelHeight.CLOSE, VowelPlacement.BACK, True), 289 | # 290 | Vowel("ɪ", VowelHeight.NEAR_CLOSE, VowelPlacement.NEAR_FRONT, False), 291 | Vowel("ʏ", VowelHeight.NEAR_CLOSE, VowelPlacement.NEAR_FRONT, True), 292 | Vowel("ʊ", VowelHeight.NEAR_CLOSE, VowelPlacement.NEAR_BACK, True), 293 | # 294 | Vowel("e", VowelHeight.CLOSE_MID, VowelPlacement.FRONT, False), 295 | Vowel("ẽ", VowelHeight.CLOSE_MID, VowelPlacement.FRONT, False, nasalated=True), 296 | Vowel("ø", VowelHeight.CLOSE_MID, VowelPlacement.FRONT, True), 297 | Vowel("ɘ", VowelHeight.CLOSE_MID, VowelPlacement.CENTRAL, False), 298 | Vowel("ɵ", VowelHeight.CLOSE_MID, VowelPlacement.CENTRAL, True), 299 | Vowel("ɤ", VowelHeight.CLOSE_MID, VowelPlacement.BACK, False), 300 | Vowel("o", VowelHeight.CLOSE_MID, VowelPlacement.BACK, True), 301 | # 302 | # Represented as a schwa too 303 | Vowel("ə", VowelHeight.MID, VowelPlacement.CENTRAL, False), 304 | # 305 | Vowel("ɛ", VowelHeight.OPEN_MID, VowelPlacement.FRONT, False), 306 | Vowel("œ", VowelHeight.OPEN_MID, VowelPlacement.FRONT, True), 307 | Vowel("ɜ", VowelHeight.OPEN_MID, VowelPlacement.CENTRAL, False), 308 | Vowel("ɞ", VowelHeight.OPEN_MID, VowelPlacement.CENTRAL, True), 309 | Vowel("ʌ", VowelHeight.OPEN_MID, VowelPlacement.BACK, False), 310 | Vowel("ɔ", VowelHeight.OPEN_MID, VowelPlacement.BACK, True), 311 | Vowel("ɔ̃", VowelHeight.OPEN_MID, VowelPlacement.BACK, True, nasalated=True), 312 | # 313 | Vowel("æ", VowelHeight.NEAR_OPEN, VowelPlacement.FRONT, False), 314 | Vowel("ɐ", VowelHeight.NEAR_OPEN, VowelPlacement.CENTRAL, False), 315 | # 316 | 
Vowel("a", VowelHeight.OPEN, VowelPlacement.FRONT, False), 317 | Vowel("ã", VowelHeight.OPEN, VowelPlacement.FRONT, False, nasalated=True), 318 | Vowel("ɶ", VowelHeight.OPEN, VowelPlacement.FRONT, True), 319 | Vowel("ɑ", VowelHeight.OPEN, VowelPlacement.BACK, False), 320 | Vowel("ɒ", VowelHeight.OPEN, VowelPlacement.BACK, True), 321 | ] 322 | 323 | VOWELS = {v.ipa: v for v in _VOWELS} 324 | 325 | # ----------------------------------------------------------------------------- 326 | 327 | 328 | @dataclass 329 | class Dipthong: 330 | """Combination of two vowels""" 331 | 332 | vowel1: Vowel 333 | vowel2: Vowel 334 | 335 | 336 | # ----------------------------------------------------------------------------- 337 | 338 | 339 | @dataclass 340 | class Schwa: 341 | """Vowel-like sound""" 342 | 343 | ipa: str 344 | r_coloured: bool 345 | length: PhonemeLength = PhonemeLength.NORMAL 346 | alias_of: typing.Optional[str] = None 347 | 348 | 349 | _SCHWAS = [Schwa("ə", False), Schwa("ɚ", True), Schwa("ɝ", True, alias_of="ɚ")] 350 | 351 | SCHWAS = {s.ipa: s for s in _SCHWAS} 352 | 353 | # ----------------------------------------------------------------------------- 354 | 355 | 356 | class ConsonantType(str, Enum): 357 | """Type of a consonant""" 358 | 359 | NASAL = "nasal" 360 | PLOSIVE = "plosive" 361 | AFFRICATE = "affricate" 362 | FRICATIVE = "fricative" 363 | APPROXIMANT = "approximant" 364 | FLAP = "flap" 365 | TRILL = "trill" 366 | LATERAL_APPROXIMANT = "lateral-approximant" 367 | 368 | 369 | class ConsonantPlace(str, Enum): 370 | """Place of articulation""" 371 | 372 | BILABIAL = "bilabial" 373 | LABIO_DENTAL = "labio-dental" 374 | DENTAL = "dental" 375 | ALVEOLAR = "alveolar" 376 | POST_ALVEOLAR = "post-alveolar" 377 | RETROFLEX = "retroflex" 378 | PALATAL = "palatal" 379 | VELAR = "velar" 380 | UVULAR = "uvular" 381 | PHARYNGEAL = "pharyngeal" 382 | GLOTTAL = "glottal" 383 | 384 | 385 | class ConsonantSoundsLike(str, Enum): 386 | """Class of sounds this consonant is 
similar to""" 387 | 388 | NONE = "" 389 | R = "r" 390 | G = "g" 391 | L = "l" 392 | 393 | 394 | @dataclass 395 | class Consonant: 396 | """Necessary information for a consonant""" 397 | 398 | ipa: str 399 | type: ConsonantType 400 | place: ConsonantPlace 401 | voiced: bool 402 | velarized: bool = False 403 | sounds_like: ConsonantSoundsLike = ConsonantSoundsLike.NONE 404 | length: PhonemeLength = PhonemeLength.NORMAL 405 | alias_of: typing.Optional[str] = None 406 | 407 | 408 | # -------------------------------------------------------------------------------------------------------------------------------------------- 409 | # Type Bilabial Labiodental Dental Alveolar Postalveolar Retroflex Palatal Velar Uvular Pharyngeal Glottal 410 | # -------------------------------------------------------------------------------------------------------------------------------------------- 411 | # Nasal m ɱ n ɳ ɲ ŋ ɴ 412 | # Plosive p/b t/d ʈ/ɖ c/ɟ k/ɡ q/ɢ ʡ ʔ 413 | # Affricate p͡f/b͡v t̪͡s̪/b͡v̪ t͡s/d͡z t͡ʃ/d͡ʒ ʈ͡ʂ/ɖ͡ʐ t͡ɕ/d͡ʑ k͡x 414 | # Fricative ɸ/β f/v θ/ð s/z ʃ/ʒ ʂ/ʐ ç/ʝ x/ɣ χ/ʁ ħ h ɦ 415 | # Approximant w ʋ ɹ ɻ j ɰ 416 | # Flap ⱱ ɾ ɽ 417 | # Trill ʙ r ʀ 418 | # Lateral App l ɭ ʎ ʟ 419 | # -------------------------------------------------------------------------------------------------------------------------------------------- 420 | 421 | _CONSONANTS = [ 422 | Consonant("m", ConsonantType.NASAL, ConsonantPlace.BILABIAL, True), 423 | Consonant("ɱ", ConsonantType.NASAL, ConsonantPlace.LABIO_DENTAL, True), 424 | Consonant("n", ConsonantType.NASAL, ConsonantPlace.ALVEOLAR, True), 425 | Consonant("ɳ", ConsonantType.NASAL, ConsonantPlace.RETROFLEX, True), 426 | Consonant("ɲ", ConsonantType.NASAL, ConsonantPlace.PALATAL, True), 427 | Consonant("ŋ", ConsonantType.NASAL, ConsonantPlace.VELAR, True), 428 | Consonant("ɴ", ConsonantType.NASAL, ConsonantPlace.UVULAR, True), 429 | # 430 | Consonant("p", ConsonantType.PLOSIVE, ConsonantPlace.BILABIAL, False), 431 | Consonant("b", 
ConsonantType.PLOSIVE, ConsonantPlace.BILABIAL, True), 432 | Consonant("t", ConsonantType.PLOSIVE, ConsonantPlace.ALVEOLAR, False), 433 | Consonant("d", ConsonantType.PLOSIVE, ConsonantPlace.ALVEOLAR, True), 434 | Consonant("ʈ", ConsonantType.PLOSIVE, ConsonantPlace.RETROFLEX, False), 435 | Consonant("ɖ", ConsonantType.PLOSIVE, ConsonantPlace.RETROFLEX, True), 436 | Consonant("c", ConsonantType.PLOSIVE, ConsonantPlace.PALATAL, False), 437 | Consonant("ɟ", ConsonantType.PLOSIVE, ConsonantPlace.PALATAL, True), 438 | Consonant("k", ConsonantType.PLOSIVE, ConsonantPlace.VELAR, False), 439 | Consonant( 440 | "ɡ", 441 | ConsonantType.PLOSIVE, 442 | ConsonantPlace.VELAR, 443 | True, 444 | sounds_like=ConsonantSoundsLike.G, 445 | ), 446 | Consonant( 447 | "g", 448 | ConsonantType.PLOSIVE, 449 | ConsonantPlace.VELAR, 450 | True, 451 | sounds_like=ConsonantSoundsLike.G, 452 | alias_of="ɡ", 453 | ), 454 | Consonant( 455 | "q", 456 | ConsonantType.PLOSIVE, 457 | ConsonantPlace.UVULAR, 458 | False, 459 | sounds_like=ConsonantSoundsLike.G, 460 | ), 461 | Consonant( 462 | "ɢ", 463 | ConsonantType.PLOSIVE, 464 | ConsonantPlace.UVULAR, 465 | True, 466 | sounds_like=ConsonantSoundsLike.G, 467 | ), 468 | Consonant("ʡ", ConsonantType.PLOSIVE, ConsonantPlace.PHARYNGEAL, False), 469 | Consonant("ʔ", ConsonantType.PLOSIVE, ConsonantPlace.GLOTTAL, False), 470 | # 471 | Consonant("p͡f", ConsonantType.AFFRICATE, ConsonantPlace.LABIO_DENTAL, False), 472 | Consonant("b͡v", ConsonantType.AFFRICATE, ConsonantPlace.LABIO_DENTAL, True), 473 | Consonant("t̪͡s", ConsonantType.AFFRICATE, ConsonantPlace.DENTAL, False), 474 | Consonant("b͡v", ConsonantType.AFFRICATE, ConsonantPlace.DENTAL, True), 475 | Consonant("t͡s", ConsonantType.AFFRICATE, ConsonantPlace.ALVEOLAR, False), 476 | Consonant("d͡z", ConsonantType.AFFRICATE, ConsonantPlace.ALVEOLAR, True), 477 | Consonant("t͡ʃ", ConsonantType.AFFRICATE, ConsonantPlace.POST_ALVEOLAR, False), 478 | Consonant("d͡ʒ", ConsonantType.AFFRICATE, 
ConsonantPlace.POST_ALVEOLAR, True), 479 | Consonant("ʈ͡ʂ", ConsonantType.AFFRICATE, ConsonantPlace.RETROFLEX, False), 480 | Consonant("ɖ͡ʐ", ConsonantType.AFFRICATE, ConsonantPlace.RETROFLEX, True), 481 | Consonant("t͡ɕ", ConsonantType.AFFRICATE, ConsonantPlace.PALATAL, False), 482 | Consonant("d͡ʑ", ConsonantType.AFFRICATE, ConsonantPlace.PALATAL, True), 483 | Consonant("k͡x", ConsonantType.AFFRICATE, ConsonantPlace.VELAR, False), 484 | # 485 | Consonant("ɸ", ConsonantType.FRICATIVE, ConsonantPlace.BILABIAL, False), 486 | Consonant("β", ConsonantType.FRICATIVE, ConsonantPlace.BILABIAL, True), 487 | Consonant("f", ConsonantType.FRICATIVE, ConsonantPlace.LABIO_DENTAL, False), 488 | Consonant("v", ConsonantType.FRICATIVE, ConsonantPlace.LABIO_DENTAL, True), 489 | Consonant("θ", ConsonantType.FRICATIVE, ConsonantPlace.DENTAL, False), 490 | Consonant("ð", ConsonantType.FRICATIVE, ConsonantPlace.DENTAL, True), 491 | Consonant("s", ConsonantType.FRICATIVE, ConsonantPlace.ALVEOLAR, False), 492 | Consonant("z", ConsonantType.FRICATIVE, ConsonantPlace.ALVEOLAR, True), 493 | Consonant("ʃ", ConsonantType.FRICATIVE, ConsonantPlace.POST_ALVEOLAR, False), 494 | Consonant("ʒ", ConsonantType.FRICATIVE, ConsonantPlace.POST_ALVEOLAR, True), 495 | Consonant("ʂ", ConsonantType.FRICATIVE, ConsonantPlace.RETROFLEX, False), 496 | Consonant("ʐ", ConsonantType.FRICATIVE, ConsonantPlace.RETROFLEX, True), 497 | Consonant("ç", ConsonantType.FRICATIVE, ConsonantPlace.PALATAL, False), 498 | Consonant( 499 | "ʝ", ConsonantType.FRICATIVE, ConsonantPlace.PALATAL, False, alias_of="ç" 500 | ), 501 | Consonant("ʐ", ConsonantType.FRICATIVE, ConsonantPlace.PALATAL, True), 502 | Consonant("x", ConsonantType.FRICATIVE, ConsonantPlace.VELAR, False), 503 | Consonant("ɣ", ConsonantType.FRICATIVE, ConsonantPlace.VELAR, True), 504 | Consonant("χ", ConsonantType.FRICATIVE, ConsonantPlace.UVULAR, False), 505 | Consonant( 506 | "ʁ", 507 | ConsonantType.FRICATIVE, 508 | ConsonantPlace.UVULAR, 509 | True, 510 | 
sounds_like=ConsonantSoundsLike.R, 511 | ), 512 | Consonant("ħ", ConsonantType.FRICATIVE, ConsonantPlace.PHARYNGEAL, False), 513 | Consonant("h", ConsonantType.FRICATIVE, ConsonantPlace.GLOTTAL, False), 514 | Consonant("ɦ", ConsonantType.FRICATIVE, ConsonantPlace.GLOTTAL, True), 515 | # 516 | Consonant("w", ConsonantType.APPROXIMANT, ConsonantPlace.BILABIAL, True), 517 | Consonant("ʋ", ConsonantType.APPROXIMANT, ConsonantPlace.LABIO_DENTAL, True), 518 | Consonant( 519 | "ɹ", 520 | ConsonantType.APPROXIMANT, 521 | ConsonantPlace.ALVEOLAR, 522 | True, 523 | sounds_like=ConsonantSoundsLike.R, 524 | ), 525 | Consonant( 526 | "ɻ", 527 | ConsonantType.APPROXIMANT, 528 | ConsonantPlace.RETROFLEX, 529 | True, 530 | sounds_like=ConsonantSoundsLike.R, 531 | ), 532 | Consonant("j", ConsonantType.APPROXIMANT, ConsonantPlace.PALATAL, True), 533 | Consonant("ɰ", ConsonantType.APPROXIMANT, ConsonantPlace.VELAR, True), 534 | # 535 | Consonant("ⱱ", ConsonantType.FLAP, ConsonantPlace.LABIO_DENTAL, True), 536 | Consonant( 537 | "ɾ", 538 | ConsonantType.FLAP, 539 | ConsonantPlace.ALVEOLAR, 540 | True, 541 | sounds_like=ConsonantSoundsLike.R, 542 | ), 543 | Consonant( 544 | "ɽ", 545 | ConsonantType.FLAP, 546 | ConsonantPlace.RETROFLEX, 547 | True, 548 | sounds_like=ConsonantSoundsLike.R, 549 | ), 550 | # 551 | Consonant("ʙ", ConsonantType.TRILL, ConsonantPlace.BILABIAL, True), 552 | Consonant( 553 | "r", 554 | ConsonantType.TRILL, 555 | ConsonantPlace.ALVEOLAR, 556 | True, 557 | sounds_like=ConsonantSoundsLike.R, 558 | ), 559 | Consonant( 560 | "ʀ", 561 | ConsonantType.TRILL, 562 | ConsonantPlace.UVULAR, 563 | True, 564 | sounds_like=ConsonantSoundsLike.R, 565 | ), 566 | # 567 | Consonant( 568 | "l", 569 | ConsonantType.LATERAL_APPROXIMANT, 570 | ConsonantPlace.ALVEOLAR, 571 | True, 572 | sounds_like=ConsonantSoundsLike.L, 573 | ), 574 | Consonant( 575 | "ɫ", 576 | ConsonantType.LATERAL_APPROXIMANT, 577 | ConsonantPlace.ALVEOLAR, 578 | True, 579 | velarized=True, 580 | 
@dataclass
class Break:
    """IPA break/boundary"""

    # Kind of break (word/minor/major)
    type: BreakType
    # IPA text for the break; always overwritten in __post_init__
    text: str = ""

    def __post_init__(self):
        if self.type == BreakType.MINOR:
            self.text = IPA.BREAK_MINOR
        elif self.type == BreakType.MAJOR:
            self.text = IPA.BREAK_MAJOR
        elif self.type == BreakType.WORD:
            self.text = IPA.BREAK_WORD
        else:
            # Bug fix: previously interpolated the *builtin* ``type``
            # (rendering "<class 'type'>") instead of the field value
            raise ValueError(f"Unrecognized break type: {self.type}")

    @staticmethod
    def from_string(break_str: str) -> "Break":
        """Parse break from string"""
        if break_str == IPA.BREAK_MINOR:
            break_type = BreakType.MINOR
        elif break_str == IPA.BREAK_MAJOR:
            break_type = BreakType.MAJOR
        elif break_str == IPA.BREAK_WORD:
            break_type = BreakType.WORD
        else:
            raise ValueError(f"Unrecognized break type: {break_str}")

        return Break(break_type)
def _make_feature_keys() -> typing.Mapping[str, typing.Union[int, slice]]:
    """Create mapping from feature column name to vector index (ordinal) or slice (one-hot)"""
    keys: typing.Dict[str, typing.Union[int, slice]] = {}
    position = 0

    for column_name, column_values in FEATURE_COLUMNS.items():
        if column_name in FEATURE_ORDINAL_COLUMNS:
            # Ordinal feature: occupies a single scalar slot in the vector
            keys[column_name] = position
            position += 1
            continue

        # One-hot feature: occupies one slot per possible value
        width = len(column_values)
        keys[column_name] = slice(position, position + width)
        position += width

    return keys
-------------------------------------------------------------------------------- /gruut_ipa/data/de-de/phonemes.txt: -------------------------------------------------------------------------------- 1 | # Vowels 2 | a d[a]s ɑ 3 | aː j[a]hr ɑː 4 | ɛ w[e]nn 5 | ə ein[e] ɘ 6 | ɐ od[er] ɐ̯ 7 | ɛː k[ä]se 8 | eː g[e]gen e 9 | ɪ w[i]rd ĭ 10 | iː d[ie] i 11 | ɔ d[o]ch 12 | oː w[o] o ɔu ɔʊ̯ 13 | œ k[ö]nnen 14 | øː l[ös]en 15 | ʊ m[u]ss 16 | uː g[u]t u 17 | ʏ m[ü]cke 18 | yː f[ü]r y ʏː œ̃ː 19 | 20 | # Nasal Vowels from Loanwords 21 | ãː restaur[ant] ã ɑ̃ 22 | õː sais[on] 23 | ɛ̃ː cous[in] 24 | 25 | # Diphthongs 26 | ɔʏ̯ [eu]le ø 27 | aɪ̯ h[ei]m a͜i 28 | aʊ̯ h[au]s a͡ʊ a͜u 29 | 30 | # Plosives 31 | p o[b]st 32 | b [b]itte 33 | t nich[t] 34 | d [d]er 35 | k [k]ann 36 | g [g]eht ɡ 37 | ʔ be[a]mter 38 | 39 | # Nasal Consonants 40 | m [m]it m̩ 41 | n ei[n] ɱ n̩ 42 | ŋ la[ng] ŋ̩ 43 | 44 | # Fricatives 45 | f [v]on 46 | v [w]as 47 | s au[s] 48 | z [s]ie 49 | ʃ [sch]on 50 | ʒ [g]enie 51 | ç mi[ch] c 52 | x bu[ch] 53 | χ ba[ch] 54 | ʁ da[r]auf ɽ r ɾ ʀ 55 | h [h]ut 56 | 57 | # Approximants 58 | j [j]a 59 | 60 | # Lateral Approximants 61 | l a[l]s l̩ 62 | 63 | # Affricates 64 | p͡f [pf]erd pf 65 | t͡s [z]eit ​t͡s t͜s 66 | t͡ʃ deu[tsch] ʧ 67 | d͡ʒ [dsch]ungel 68 | -------------------------------------------------------------------------------- /gruut_ipa/data/el-gr/phonemes.txt: -------------------------------------------------------------------------------- 1 | a π[α]ς 2 | b [μ]πεσ 3 | d [τ]ζιμ 4 | δ [δ]εις 5 | e π[ε]ς 6 | f α[φ]ού 7 | g [γ]κέι 8 | ɣ αρ[γ]ά 9 | x ει[χ]α 10 | ç ει[χ]ε 11 | i π[ει]ς 12 | ʝ [γ]ειά 13 | k [κ]ανω 14 | d͡z [δ]ικέ 15 | l α[λλ]η ʎ 16 | m [μ]αζι 17 | n ε[ν]ασ ɲ 18 | ŋ μά[γ]κα 19 | o π[ω]ς 20 | p [π]ήρα 21 | r κ[ρ]ισ ɾ ɾ̠ ɹ 22 | s βγε[σ] s̠ 23 | t κα[τ]ω 24 | θ ηρ[θ]α 25 | t͡s ts ματσ 26 | u π[ου] 27 | v [β]άζω 28 | z [ζ]εισ z̠ 29 | -------------------------------------------------------------------------------- /gruut_ipa/data/en-gb/phonemes.txt: 
-------------------------------------------------------------------------------- 1 | # Normal vowels 2 | ɒ f[a]ther 3 | æ c[a]t a 4 | e b[e]d ɛ 5 | ɪ s[i]t 6 | ɔ l[aw] 7 | ʊ p[u]t 8 | ʌ r[u]n ɐ 9 | 10 | # Elognated vowels 11 | iː s[ee] i 12 | ɑː n[o]t 13 | uː s[oo]n u 14 | ɔː n[or]th 15 | ɜː n[ur]se 16 | 17 | # Schwas 18 | ə [a]llow 19 | 20 | # Dipthongs 21 | eɪ r[ai]se e eɪ̯ 22 | aɪ r[i]ce aɪ̯ 23 | əʊ kn[ow] 24 | ɔɪ n[oi]se 25 | aʊ h[ou]se 26 | 27 | # Stops 28 | p [p]in 29 | b [b]ut 30 | t [t]on 31 | d [d]ot 32 | k [c]at 33 | ɡ [g]ive g 34 | 35 | # Affricatives 36 | t͡ʃ [ch]in tʃ 37 | d͡ʒ [g]in dʒ 38 | 39 | # Fricatives 40 | f [f]in 41 | v [v]im 42 | θ [th]in 43 | ð [th]is 44 | s [s]et 45 | z [z]ing 46 | ʃ [s]ure 47 | ʒ mea[sure] 48 | h [h]am 49 | 50 | # Other consonants 51 | l [l]ong l̩ ɫ ʟ̩ l̩ 52 | m [m]ock m̩ 53 | n [kn]ock n̩ 54 | ŋ thi[ng] 55 | ɹ [wr]ong r ɾ 56 | w [w]asp 57 | j [y]acht 58 | -------------------------------------------------------------------------------- /gruut_ipa/data/en-us/cmudict/ipa_map.txt: -------------------------------------------------------------------------------- 1 | AA ɑ 2 | AA0 ɑ 3 | AA1 ˈɑ 4 | AA2 ˌɑ 5 | AE æ 6 | AE0 æ 7 | AE1 ˈæ 8 | AE2 ˌæ 9 | AH ʌ 10 | AH0 ʌ 11 | AH1 ˈʌ 12 | AH2 ˌʌ 13 | AO ɔ 14 | AO0 ɔ 15 | AO1 ˈɔ 16 | AO2 ˌɔ 17 | AW aʊ 18 | AW0 aʊ 19 | AW1 ˈaʊ 20 | AW2 ˌaʊ 21 | AY aɪ 22 | AY0 aɪ 23 | AY1 ˈaɪ 24 | AY2 ˌaɪ 25 | B b 26 | CH t͡ʃ 27 | D d 28 | DH ð 29 | EH ɛ 30 | EH0 ɛ 31 | EH1 ˈɛ 32 | EH2 ˌɛ 33 | ER ɚ 34 | ER0 ɚ 35 | ER1 ˈɚ 36 | ER2 ˌɚ 37 | EY eɪ 38 | EY0 eɪ 39 | EY1 ˈeɪ 40 | EY2 ˌeɪ 41 | F f 42 | G ɡ 43 | HH h 44 | IH ɪ 45 | IH0 ɪ 46 | IH1 ˈɪ 47 | IH2 ˌɪ 48 | IY i 49 | IY0 i 50 | IY1 ˈi 51 | IY2 ˌi 52 | JH d͡ʒ 53 | K k 54 | L l 55 | M m 56 | N n 57 | NG ŋ 58 | OW oʊ 59 | OW0 oʊ 60 | OW1 ˈoʊ 61 | OW2 ˌoʊ 62 | OY ɔɪ 63 | OY0 ɔɪ 64 | OY1 ˈɔɪ 65 | OY2 ˌɔɪ 66 | P p 67 | R ɹ 68 | S s 69 | SH ʃ 70 | T t 71 | TH θ 72 | UH ʊ 73 | UH0 ʊ 74 | UH1 ˈʊ 75 | UH2 ˌʊ 76 | UW u 77 | UW0 u 78 | UW1 ˈu 79 | UW2 ˌu 80 | V v 81 | W 
w 82 | Y j 83 | Z z 84 | ZH ʒ 85 | -------------------------------------------------------------------------------- /gruut_ipa/data/en-us/cmudict/phonemes.txt: -------------------------------------------------------------------------------- 1 | AA [au]nt 2 | AA0 3 | AA1 4 | AA2 5 | AE [a]lan 6 | AE0 7 | AE1 8 | AE2 9 | AH b[u]n 10 | AH0 11 | AH1 12 | AH2 13 | AO also 14 | AO0 15 | AO1 16 | AO2 17 | AW d[ow]n 18 | AW0 19 | AW1 20 | AW2 21 | AY b[i]ke 22 | AY0 23 | AY1 24 | AY2 25 | B a[b]le 26 | CH ea[ch] 27 | D an[d]y 28 | DH [th]an 29 | EH b[e]ll 30 | EH0 31 | EH1 32 | EH2 33 | ER b[ir]d 34 | ER0 35 | ER1 36 | ER2 37 | EY aw[ay] 38 | EY0 39 | EY1 40 | EY2 41 | F [f]ace 42 | G ba[g]s 43 | HH [h]alf 44 | IH s[i]t 45 | IH0 46 | IH1 47 | IH2 48 | IY army 49 | IY0 50 | IY1 51 | IY2 52 | JH e[dge] 53 | K as[k] 54 | L a[l]ex 55 | M bo[mb] 56 | N ame[n] 57 | NG ba[ng] 58 | OW bl[ow] 59 | OW0 60 | OW1 61 | OW2 62 | OY j[oi]n 63 | OY0 64 | OY1 65 | OY2 66 | P cam[p] 67 | R a[r]ea 68 | S art[s] 69 | SH bu[sh] 70 | T an[t]i 71 | TH ba[th] 72 | UH b[oo]k 73 | UH0 74 | UH1 75 | UH2 76 | UW bl[ue] 77 | UW0 78 | UW1 79 | UW2 80 | V da[v]e 81 | W [wh]at 82 | Y c[u]te 83 | Z arm[s] 84 | ZH u[s]ual 85 | -------------------------------------------------------------------------------- /gruut_ipa/data/en-us/phonemes.txt: -------------------------------------------------------------------------------- 1 | # Normal vowels 2 | ɑ f[a]ther aː ɑː 3 | æ c[a]t 4 | ɛ b[e]d ɜ ɜː 5 | i cit[y] iː 6 | ɪ s[i]t ɨ 7 | ɔ l[aw] ɔː ɒ 8 | ʊ p[u]t ʊ̯ 9 | ʌ r[u]n 10 | u s[oo]n uː 11 | 12 | # Schwas 13 | ə [a]llow 14 | ɚ corn[er] ɝː ɝ 15 | 16 | # Dipthongs 17 | eɪ r[ai]se e eɪ̯ 18 | aɪ r[i]ce aɪ̯ 19 | oʊ kn[ow] o 20 | ɔɪ n[oi]se 21 | aʊ h[ou]se 22 | 23 | # Stops 24 | p [p]in 25 | b [b]ut 26 | t [t]on 27 | d [d]ot 28 | k [c]at 29 | ɡ [g]ive g 30 | 31 | # Affricatives 32 | t͡ʃ [ch]in tʃ 33 | d͡ʒ [g]in dʒ 34 | 35 | # Fricatives 36 | f [f]in 37 | v [v]im 38 | θ [th]in 39 | ð [th]is 40 | s [s]et 41 | z [z]ing 42 
| ʃ [s]ure 43 | ʒ mea[sure] 44 | h [h]am 45 | 46 | # Other consonants 47 | l [l]ong l̩ ɫ ʟ̩ l̩ 48 | m [m]ock m̩ 49 | n [kn]ock n̩ 50 | ŋ thi[ng] 51 | ɹ [wr]ong r ɾ 52 | w [w]asp 53 | j [y]acht 54 | -------------------------------------------------------------------------------- /gruut_ipa/data/en-us/zamia/ipa_map.txt: -------------------------------------------------------------------------------- 1 | 3 ɚ 2 | A ɑ 3 | D ð 4 | E ɛ 5 | I ɪ 6 | N ŋ 7 | O ɔ 8 | OI ɔɪ 9 | S ʃ 10 | T θ 11 | U ʊ 12 | V ʌ 13 | Z ʒ 14 | aI aɪ 15 | aU aʊ 16 | b b 17 | d d 18 | dZ d͡ʒ 19 | e e 20 | f f 21 | g ɡ 22 | h h 23 | i i 24 | j j 25 | k k 26 | l l 27 | m m 28 | n n 29 | o o 30 | p p 31 | pf pf 32 | r ɹ 33 | s s 34 | t t 35 | tS t͡ʃ 36 | ts ts 37 | u u 38 | v v 39 | w w 40 | z z 41 | { æ 42 | -------------------------------------------------------------------------------- /gruut_ipa/data/en-us/zamia/phonemes.txt: -------------------------------------------------------------------------------- 1 | 3 b[i]rd 2 | A [a]rts 3 | D [th]an 4 | E b[ea]r 5 | I [a]ges 6 | N ba[ng] 7 | O [a]lso 8 | OI b[oy]s 9 | S bu[sh] 10 | T ba[th] 11 | U b[oa]t 12 | V ab[le] 13 | Z u[s]ual 14 | aI b[i]ke 15 | aU d[ow]n 16 | b [b]a[b]e 17 | d an[d]y 18 | dZ ed[ge] 19 | e [a]men 20 | f [f]ace 21 | g ba[g]s 22 | h [h]ad 23 | i ar[e]a 24 | j c[u]te 25 | k ba[ck] 26 | l a[l]ex 27 | m ar[m]y 28 | n ala[n] 29 | o bl[o]w 30 | p cam[p] 31 | pf [pff]t 32 | r a[r]en 33 | s ask[s] 34 | t an[t]i 35 | tS ea[ch] 36 | ts ge[ts] 37 | u bl[ue] 38 | v da[ve] 39 | w a[w]ay 40 | z arm[s] 41 | { [a]dam 42 | -------------------------------------------------------------------------------- /gruut_ipa/data/es-es/phonemes.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/Spanish_phonology 2 | 3 | # Vowels 4 | a p[a]so 5 | e p[e]so 6 | i p[i]so 7 | o p[o]so 8 | u p[u]jo 9 | 10 | # Dipthongs 11 | ai [ai]re 12 | au p[au]sa 13 | ei r[ey] 14 | eu n[eu]tro 15 | oi h[oy] 16 
| ou b[ou] 17 | 18 | ja hacia 19 | je t[ie]rra 20 | jo rad[io] 21 | ju v[iu]da 22 | wa c[ua]dro 23 | we f[ue]go 24 | wi b[ui]tre 25 | wo c[uo]ta 26 | 27 | # Consonants 28 | b [b]ueno β 29 | d [d]os ð 30 | g [g]racias ɣ 31 | m [m]ucho 32 | n [n]os ŋ 33 | ɲ ni[ñ]o 34 | p [p]or 35 | t [t]u 36 | t͡ʃ mu[ch]a tʃ 37 | k [q]ué 38 | f per[f]ecto 39 | θ gra[c]ias 40 | s [s]í z 41 | ʝ grac[i]as j 42 | x [j]efe 43 | l e[l] 44 | ʎ [ll]ega 45 | ɾ ot[r]a 46 | r co[rr]ecto 47 | -------------------------------------------------------------------------------- /gruut_ipa/data/fa/phonemes.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/Persian_phonology 2 | 3 | # Vowels 4 | æ نه 5 | ɒː تا 6 | e̞ که e eː 7 | iː شیر í î i 8 | o تو oː 9 | uː زود ʊ 10 | 11 | # Consonants 12 | b برادر 13 | p پدر 14 | t تا 15 | d دوست 16 | t͡ʃ چوب tʃ 17 | d͡ʒ جوان dʒ 18 | k کشور 19 | g گروه ɡ 20 | ʔ معنا 21 | f فشار 22 | v ویژه 23 | s سایه ŝ 24 | z آزاد 25 | ʃ شاه 26 | ʒ ژاله 27 | x خانه χ 28 | ɢ قلم ɣ q 29 | h هفت 30 | m مادر 31 | n نان 32 | l لب 33 | ɾ ایران r ʁ 34 | j یا 35 | -------------------------------------------------------------------------------- /gruut_ipa/data/fr-fr/phonemes.txt: -------------------------------------------------------------------------------- 1 | # Vowels 2 | i s[i] 3 | y s[u] 4 | u s[ous] 5 | e f[ée] 6 | ø c[eux] 7 | o s[ot] 8 | ə c[e] 9 | ɛ f[ait] 10 | œ s[œu]r 11 | ɔ s[o]rt 12 | a s[a] ɑ 13 | 14 | # Nasalated vowels 15 | ɔ̃ s[on] 16 | ɛ̃ br[in] 17 | ɑ̃ s[ans] 18 | œ̃ br[un] 19 | 20 | # Semi-vowels 21 | j [h]ier 22 | ɥ pl[u]ie 23 | w [ou]i 24 | 25 | # Consonants 26 | m [m]ou 27 | n [n]ous 28 | ŋ ku[ng]-fu 29 | p [p]ou 30 | t [t]out 31 | k [c]ou 32 | b [b]oue 33 | d [d]oux 34 | ɡ [g]oût 35 | f [f]ou 36 | s [s]ous 37 | ʃ [ch]ou 38 | v [v]ous 39 | z [z]ou 40 | ʒ [j]oue 41 | ʁ [r]ou 42 | l [l]oup 43 | 44 | # From loan words 45 | ɲ [gn]ouf 46 | 
-------------------------------------------------------------------------------- /gruut_ipa/data/it-it/phonemes.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/Italian_phonology 2 | 3 | # Vowels 4 | ɛ ad[e]sso ɛː ə 5 | ɔ c[o]lla ɔː ɔ̃ 6 | a senz[a] aː à ɑ̃ ɒ æ ä ɑ 7 | e p[e]rò eː 8 | i cos[ì] iː ɪ I ì ɪː 9 | o un[o] oː ʊ 10 | u t[u]tte uː u̯ 11 | 12 | # Consonants 13 | ɲ o[gn]uno 14 | ʎ consi[gli]o 15 | ʃ u[s]cita 16 | b [b]ella 17 | d [d]ue 18 | d͡ʒ [g]ente dʒ 19 | d͡z [z]ona dz 20 | f [f]orte 21 | ɡ [g]rave g 22 | j d[i]eci 23 | k [q]uesto c 24 | l so[l]tanto 25 | m al[m]eno 26 | n a[n]cora 27 | ɱ i[n]fame 28 | ŋ a[n]che 29 | p [p]erò 30 | r pe[r]ò ʁ ɹ 31 | s [s]ia 32 | t quan[t]o 33 | t͡ʃ per[ci]ò tʃ 34 | t͡s for[z]a ts 35 | v a[v]anti 36 | w q[u]attro 37 | z pae[s]e 38 | -------------------------------------------------------------------------------- /gruut_ipa/data/lb-lb/phonemes.txt: -------------------------------------------------------------------------------- 1 | # Luxembourgish phonemes 2 | # Luxembourgish vowels (monophtongs) 3 | ɑ k[a]pp 4 | aː k[a]p 5 | ɛː st[ä]ren 6 | e m[é]ck 7 | æ h[e]ll 8 | eː k[ee]ss 9 | ə n[e]t 10 | ɐ kann[er] 11 | i m[i]dd 12 | iː l[ii]cht 13 | o spr[o]ch 14 | oː spr[oo]ch 15 | u g[u]tt 16 | uː d[uu]scht 17 | // Monophtongs from loanwoards 18 | y conj[u]gaisoun 19 | y: s[ü]den 20 | ãː restaur[ant] 21 | õː sais[on] 22 | ɛ̃ː cous[in] 23 | œː interi[eu]r 24 | // Luxembourgish diphtongs 25 | æːɪ z[äi]t 26 | ɑʊ [au]to 27 | æːʊ r[au]m 28 | ɑɪ l[ei]t 29 | ɜɪ fr[éi] 30 | oɪ [eu]ro 31 | iə h[ie]n 32 | əʊ sch[ou]l 33 | uə b[ue]dem 34 | // Consonants 35 | # Nasals 36 | m [m]a[mm] 37 | n ma[nn] 38 | ŋ ke[ng] 39 | # Plosives 40 | p [p]aken 41 | b [b]aken 42 | t blu[tt] 43 | d [d]äiwel 44 | k [k]eess 45 | g [g]eess 46 | // Affricates 47 | ʦ schwä[tz]en 48 | dʒ bu[dg]et 49 | # Fricatives 50 | f [f]ësch 51 | v [v]akanz 52 | w sch[w]aarz 53 | s taa[ss] 54 | z [s]ummer 55 
| ʃ bii[sch]t 56 | ʒ pro[j]et 57 | X ku[ch] 58 | ɕ lii[ch]t 59 | ʁ ku[g]el 60 | ʑ spi[g]el 61 | h [h]ei 62 | # Approximants 63 | l [l]oft 64 | j [j]o 65 | // Trills 66 | ʀ [r]ou 67 | -------------------------------------------------------------------------------- /gruut_ipa/data/nl/cgn/ipa_map.txt: -------------------------------------------------------------------------------- 1 | @ ə 2 | A ɑ 3 | AU ɑu 4 | E ɛ 5 | E2 ɛː 6 | EI ɛi 7 | EU ø 8 | G ɣ 9 | I ɪ 10 | N ŋ 11 | O ɔ 12 | S ʃ 13 | U ʏ 14 | UI œy 15 | Z ʒ 16 | a a 17 | b b 18 | d d 19 | e e 20 | f f 21 | g ɡ 22 | h h 23 | i i 24 | j j 25 | k k 26 | l l 27 | m m 28 | n n 29 | o o 30 | p p 31 | r ɹ 32 | s s 33 | t t 34 | u u 35 | v v 36 | w w 37 | x x 38 | y y 39 | z z 40 | -------------------------------------------------------------------------------- /gruut_ipa/data/nl/cgn/phonemes.txt: -------------------------------------------------------------------------------- 1 | @ 2 | A 3 | AU 4 | E 5 | E2 6 | EI 7 | EU 8 | G 9 | I 10 | N 11 | O 12 | S 13 | U 14 | UI 15 | Z 16 | a 17 | b 18 | d 19 | e 20 | f 21 | g 22 | h 23 | i 24 | j 25 | k 26 | l 27 | m 28 | n 29 | o 30 | p 31 | r 32 | s 33 | t 34 | u 35 | v 36 | w 37 | x 38 | y 39 | z 40 | -------------------------------------------------------------------------------- /gruut_ipa/data/nl/phonemes.txt: -------------------------------------------------------------------------------- 1 | # dutch (nl) 2 | # phoneme example [homophone] [homophone]... 
3 | 4 | # sources: 5 | # https://nl.wiktionary.org/wiki/wikiwoordenboek:standaardweergave_uitspraak_nederlands 6 | # https://en.wikipedia.org/wiki/dutch_phonology 7 | 8 | # normal vowels 9 | a h[aa]r 10 | ɑ k[a]n ɑ̃ 11 | e m[ee]r 12 | ɛ p[e]t æ 13 | i b[i]et i̯ 14 | ɪ k[i]p ɪ̯ ɪː 15 | o [oo]g o̝ 16 | ɔ [o]s ʌ ʊ̯ 17 | y [uu]r y̯ 18 | ʏ [u]k ʏ̯ 19 | ø [eu]ro ɵ 20 | u [oe]ver u̯ 21 | ə g[e]makkelijk 22 | 23 | # elongated vowels 24 | aː j[a] 25 | ɑː f[a]rm 26 | eː v[ee]r 27 | ɛː [oe]ver 28 | iː anal[y]se 29 | ɔː r[o]ze oː øː 30 | yː centrif[u]ge 31 | uː c[oo]l 32 | œː man[oeu]vre œ 33 | ʏː res[ea]rch 34 | oː b[oo]t 35 | 36 | # dipthongs 37 | ɛi [ij]s 38 | ɑu [au]gurk 39 | œy [ui]l ʌʊ 40 | 41 | # plosives (stops) 42 | p [p]as 43 | b [b]ij 44 | t [t]ien 45 | d [d]en 46 | c [tj]alk 47 | ʔ na[-]apen 48 | k [k]at 49 | ɡ [g]oal g 50 | 51 | # fricatives 52 | f [f]oto 53 | v [v]ier 54 | s [s]ok 55 | z [z]es 56 | x [ch]emie 57 | ɣ [g]at χ 58 | h [h]eer ɦ 59 | ʃ [sj]aal 60 | ʒ [j]am 61 | 62 | # other consonants 63 | m [m]an 64 | n [n]ul ɲ 65 | ŋ to[ng] 66 | ɱ i[n]fuus 67 | l [l]ip ɫ 68 | ɹ [r]ol r 69 | j [j]as ʲ 70 | w [w]iel ʋ β w̞ β̞ 71 | -------------------------------------------------------------------------------- /gruut_ipa/data/phoneme_distances.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhasspy/gruut-ipa/ec9ae6ce7cca0103d9a563fcde3352f805a8e27e/gruut_ipa/data/phoneme_distances.json.gz -------------------------------------------------------------------------------- /gruut_ipa/data/pt/phonemes.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/Portuguese_phonology 2 | 3 | # Vowels 4 | a f[a]lo 5 | i v[i]nte ɨ 6 | ĩ f[im] 7 | u j[u]s ʊ 8 | ũ [u]m 9 | e al[e]x 10 | ẽ [e]mprego 11 | o am[o]r 12 | õ b[o]ns 13 | ɛ b[e]lo 14 | ɐ f[a]lo 15 | ɐ̃ c[a]ma 16 | ɔ pi[o]r 17 | 18 | # Consonants 19 | p [p]ai 20 | b [b]arco 21 | t [t]enho 
22 | d [d]oce 23 | k [c]om 24 | ɡ [g]rande g 25 | f [f]alo 26 | v [v]erde 27 | s [c]éu 28 | z ca[s]a 29 | ʃ [ch]apéu 30 | ʒ [j]óia 31 | m [m]ar 32 | n [n]ada 33 | ɲ vi[nh]o 34 | l [l]anche 35 | ʎ taba[lh]o 36 | ɾ ca[r]o 37 | ʁ [r]ua 38 | ɹ agi[r] 39 | w ág[u]a 40 | j aqu[i] 41 | 42 | # Dipthongs 43 | aj p[ai] 44 | ɐj pl[ai]na 45 | ej r[ei] 46 | ɛj gel[ei]a 47 | oj d[oi]s 48 | ɔj d[ói] 49 | uj f[ui] 50 | ɐw s[au]dade 51 | ew s[eu] 52 | ɛw c[éu] 53 | iw v[iu] 54 | ow [ou]ro 55 | 56 | # Nasalated dipthongs 57 | ɐ̃j̃ m[ãe] 58 | ẽj̃ b[em] 59 | õj̃ p[õe] 60 | ũj̃ m[ui]to uj̃ 61 | ɐ̃w̃ fal[am] 62 | õw̃ b[om] 63 | -------------------------------------------------------------------------------- /gruut_ipa/data/ru-ru/phonemes.txt: -------------------------------------------------------------------------------- 1 | a [а]лло 2 | aː бр[а]т 3 | b [б]лин 4 | bʲ се[б]е 5 | d во[д]у 6 | dʲ буд[ь] 7 | e ваш[е] 8 | eː ве[д]ь 9 | f [ф]ото 10 | fʲ ко[ф]е 11 | ɡ [г]лаз 12 | ɡʲ дру[г]ие 13 | x все[х] 14 | xʲ взма[х]е 15 | i ваш[и] 16 | iː в[и]жу 17 | j все[й] 18 | k [к]атя 19 | kʲ [к]ино 20 | l бы[л]а 21 | lʲ г[л]еб 22 | m идё[м] 23 | mʲ [м]еня 24 | n же[н]а 25 | nʲ де[нь] 26 | o дел[о] 27 | oː дв[о]е 28 | p [п]апа 29 | pʲ [п]ить 30 | r ве[р]а 31 | rʲ в[р]яд 32 | s е[с]ли 33 | ɕː жен[щ]ин 34 | ʂ ва[ш]а 35 | sʲ вес[ь] 36 | t идё[т] 37 | tʲ бы[ть] 38 | t͡s ли[ц]о 39 | t͡ɕ вра[ч] 40 | u ваш[у] 41 | uː буд[у] 42 | v [в]зял 43 | vʲ [в]ещи 44 | ɨ сил[ы] 45 | ɨː в[ы]ше 46 | z вни[з] 47 | ʐ да[ж]е 48 | zʲ в[з]яла 49 | -------------------------------------------------------------------------------- /gruut_ipa/data/sv-se/phonemes.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/Swedish_phonology 2 | 3 | # Vowels 4 | ɪ s[i]ll 5 | ʏ s[y]ll 6 | ʊ b[o]tt 7 | ʉː f[u]l 8 | ɑː m[a]t 9 | a m[a]tt ä å 10 | eː h[e]l e 11 | ɛ h[e]tta 12 | ɛː h[ä]l 13 | iː s[i]l i 14 | oː m[å]l o ö 15 | ɵ f[u]ll 16 | øː n[ö]t 17 | ɔ m[o]ll 18 | œ n[ö]tt 
19 | uː b[o]t u 20 | yː s[y]l 21 | 22 | # Consonants 23 | ɡ [g]od g gː gː 24 | ɕ [kj]ol ʂ ɕː ʂː 25 | ɧ [sj]ok ɧː 26 | b [b]ok bː 27 | d [d]op ɖ dː ɖː 28 | f [f]ot fː 29 | h [h]ot hː 30 | j [j]ord jː 31 | k [k]on kː 32 | l [l]ov lː ɭ ɭː 33 | m [m]od mː 34 | n [n]od nː ɳ ɳː 35 | ŋ lå[ng] ŋː 36 | p [p]ol pː 37 | r [r]ov rː 38 | s [s]ot sː 39 | t [t]ok ʈ tː ʈː 40 | v [v]åt vː 41 | -------------------------------------------------------------------------------- /gruut_ipa/data/sw/alffa/ipa_map.txt: -------------------------------------------------------------------------------- 1 | BB ᵐɓ 2 | CC t͡ʃ 3 | DD ⁿɗ 4 | GG ᵑg 5 | JJ ⁿɗ͡ʒ 6 | LL ð 7 | NN _ 8 | RR ɣ 9 | SS ʃ 10 | TT θ 11 | VV ᶬv 12 | XX x 13 | ZZ ⁿz 14 | a ɑ 15 | b ɓ 16 | d ɗ 17 | e ɛ 18 | f f 19 | g ɠ 20 | h h 21 | i i 22 | j ʄ 23 | k k 24 | l l 25 | m m 26 | n n 27 | o ɔ 28 | p p 29 | r ɾ 30 | s s 31 | t t 32 | u u 33 | v v 34 | w w 35 | y j 36 | z z 37 | -------------------------------------------------------------------------------- /gruut_ipa/data/sw/alffa/phonemes.txt: -------------------------------------------------------------------------------- 1 | BB 2 | CC 3 | DD 4 | GG 5 | JJ 6 | LL 7 | NN 8 | RR 9 | SS 10 | TT 11 | VV 12 | XX 13 | ZZ 14 | a 15 | b 16 | d 17 | e 18 | f 19 | g 20 | h 21 | i 22 | j 23 | k 24 | l 25 | m 26 | n 27 | o 28 | p 29 | r 30 | s 31 | t 32 | u 33 | v 34 | w 35 | y 36 | z 37 | -------------------------------------------------------------------------------- /gruut_ipa/data/sw/phonemes.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/Help:IPA/Swahili 2 | 3 | # Vowels 4 | ɑ b[a]b[a] a 5 | ɛ nd[e]g[e] e 6 | i K[i]swah[i]l[i] 7 | ɔ mt[o]t[o] o 8 | u [u]h[u]r[u] 9 | 10 | # Consonants 11 | ɓ [b]a[b]a b 12 | ɗ [d]ola d 13 | ð [dh]ambi 14 | ʄ ma[j]f j 15 | f [f]isi 16 | ɠ [g]ani g g 17 | ɣ [gh]ali 18 | h u[h]uru 19 | j [y]e[y]e y 20 | k [k]itabu 21 | l [l]akini 22 | m da[m]u m m̩ 23 | ᵐɓ [mb]ali ᵐb 24 | ᶬv [mv]inyo 
25 | n [n]i[n]i n̩ 26 | ⁿɗ muhi[nd]i 27 | ᵑg [ng]oma ᵑg 28 | ⁿɗ͡ʒ i[nj]ili ɗʒ 29 | ⁿz kwa[nz]a 30 | p kika[p]u 31 | ɾ [r]afiki r 32 | s [s]i[s]i 33 | ʃ [sh]amba 34 | t mo[t]o 35 | t͡ʃ [ch]umba tʃ 36 | θ [th]ela[th]ini 37 | v [v]itabu 38 | w [w]atu 39 | x subul[kh]eri 40 | z ma[z]iwa 41 | 42 | # http://alffa.imag.fr/ 43 | # IPA ALFFA 44 | # ᵐɓ BB 45 | # t͡ʃ CC 46 | # ⁿɗ DD 47 | # ᵑg GG 48 | # ⁿɗ͡ʒ JJ 49 | # ð LL 50 | # _ NN 51 | # ɣ RR 52 | # ʃ SS 53 | # θ TT 54 | # ᶬv VV 55 | # x XX 56 | # ⁿz ZZ 57 | # ɑ a 58 | # ɓ b 59 | # ɗ d 60 | # ɛ e 61 | # f f 62 | # ɠ g 63 | # h h 64 | # i i 65 | # ʄ j 66 | # k k 67 | # l l 68 | # m m 69 | # n n 70 | # ɔ o 71 | # p p 72 | # ɾ r 73 | # s s 74 | # t t 75 | # u u 76 | # v v 77 | # w w 78 | # j y 79 | # z z 80 | -------------------------------------------------------------------------------- /gruut_ipa/data/vi-n/phonemes.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/Vietnamese_phonology 2 | 3 | # Vowels 4 | ɨ ? ɯ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 5 | a ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 6 | ă ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 7 | e ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 8 | ə ? ɤ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 9 | ɛ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 10 | ə̆ ? ɤ̆ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 11 | i ? y ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 12 | o ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 13 | ɔ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 14 | u ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 15 | 16 | # Dipthongs and tripthongs 17 | aj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 18 | ăj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 19 | aw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 20 | ăw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 21 | ɨə̯ ? ɯə ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 22 | əj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 23 | ɨə̯j ? ɯə̯j ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 24 | ə̆j ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 25 | ew ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 26 | ɛw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 27 | ɨə̯w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 28 | ə̆w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 29 | iə̯ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 30 | iə̯w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 31 | iw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 32 | ɨj ? ! 
˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 33 | oj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 34 | ɔj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 35 | uə̯ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 36 | uə̯j ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 37 | uj ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 38 | ɨw ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 39 | 40 | # Consonants 41 | ɲ ? 42 | ɣ ? 43 | b ? ɓ 44 | c [ch]ẻ ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 45 | d ? ɗ 46 | f ? 47 | h ? 48 | j ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 49 | k ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 50 | k͡p ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 51 | l ? 52 | m ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 53 | n ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 54 | ŋ ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 55 | ŋ͡m ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 56 | ɹ [r]a 57 | s [x]inh ʂ 58 | t ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 59 | tʰ ? 60 | c [tr]ẻ 61 | v [v]ợ 62 | w ? ! ˧˧ ˧˨ ˨˦ ˨ˀ˩ʔ ˧˩˨ ˧ˀ˥ 63 | x ? 64 | z [d]a 65 | p chấ[p] 66 | -------------------------------------------------------------------------------- /gruut_ipa/distances.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Functions for comparing phonemes by a distance metric""" 3 | import gzip 4 | import itertools 5 | import json 6 | import sys 7 | import threading 8 | import typing 9 | 10 | import numpy as np 11 | 12 | from gruut_ipa.constants import ( 13 | _CONSONANTS, 14 | _DATA_DIR, 15 | _SCHWAS, 16 | _VOWELS, 17 | FEATURE_KEYS, 18 | Consonant, 19 | Schwa, 20 | Vowel, 21 | ) 22 | from gruut_ipa.features import to_vector 23 | 24 | _CLOSEST_TYPE = typing.Mapping[str, typing.Sequence[str]] 25 | _CLOSEST: typing.Optional[_CLOSEST_TYPE] = None 26 | 27 | 28 | def create_closest( 29 | symbols: typing.Optional[ 30 | typing.Iterable[typing.Union[Vowel, Consonant, Schwa]] 31 | ] = None 32 | ) -> _CLOSEST_TYPE: 33 | """Create mapping from each IPA symbol to a list of other IPA symbols reverse ordered by feature distance""" 34 | import sklearn.metrics 35 | 36 | if not symbols: 37 | symbols = itertools.chain(_VOWELS, _CONSONANTS, _SCHWAS,) 38 | 39 | symbol_list = list(symbols) 40 | vectors = {} 41 | for symbol in symbol_list: 42 
def get_closest(ipa: str) -> typing.Optional[typing.Sequence[str]]:
    """Get a list of IPA symbols that are closest, ordered by increasing distance.

    Args:
        ipa: IPA symbol to look up

    Returns:
        Sequence of IPA symbols ordered by increasing feature distance,
        or None if ``ipa`` is not in the precomputed table.
    """
    global _CLOSEST

    with _CLOSEST_LOCK:
        # Lazily load the precomputed distance table from the gzipped JSON
        # shipped with the package; the lock makes the one-time load safe
        # when called from multiple threads.
        if _CLOSEST is None:
            closest_path = _DATA_DIR / "phoneme_distances.json.gz"
            with gzip.open(closest_path, "r") as closest_file:
                _CLOSEST = json.load(closest_file)

    assert _CLOSEST is not None

    return _CLOSEST.get(ipa)
def ipa_to_espeak(ipa: str, keep_whitespace: bool = True) -> str:
    """Convert IPA string to eSpeak phonemes.

    Args:
        ipa: string of IPA symbols (decomposed with NFD before mapping)
        keep_whitespace: if False, whitespace is removed from the result

    Returns:
        eSpeak phoneme string; IPA symbols without a mapping are dropped
    """
    # Normalize to NFD so combining diacritics match the mapping's codepoints
    ipa_codepoints = unicodedata.normalize("NFD", ipa)

    espeak = IPA_PATTERN.sub(
        lambda match: IPA_TO_ESPEAK.get(match.group(1), ""), ipa_codepoints
    )

    if not keep_whitespace:
        # BUG FIX: keep_whitespace was previously declared but never used;
        # honor it by stripping all whitespace from the converted string.
        espeak = "".join(espeak.split())

    return espeak
"", 72 | "\u0265": "j", 73 | "\u029c": "", 74 | "\u0069": "i", 75 | "\u0268": 'i"', 76 | "\u026a": "I", 77 | "\u006a": "j", 78 | "\u02b2": ";", 79 | "\u029d": "C", 80 | "\u025f": "J", 81 | "\u0284": "J`", 82 | "\u006b": "k", 83 | "\u006c": "l", 84 | "\u026b": "l", 85 | "\u026c": "s", 86 | "\u026d": "l.", 87 | "\u026e": "z", 88 | "\u029f": "L", 89 | "\u006d": "m", 90 | "\u0271": "M", 91 | "\u026f": "u-", 92 | "\u0270": "Q", 93 | "\u006e": "n", 94 | "\u0272": "n^", 95 | "\u014b": "N", 96 | "\u0273": "n.", 97 | "\u0274": 'n"', 98 | "\u006f": "o", 99 | "\u0298": "p!", 100 | "\u0275": "@.", 101 | "\u00f8": "Y", 102 | "\u0153": "W", 103 | "\u0276": "W", 104 | "\u0254": "O", 105 | "\u0070": "p", 106 | "\u0278": "F", 107 | "\u0071": "q", 108 | "\u0072": "r", 109 | "\u027e": "R", 110 | "\u027c": "", 111 | "\u027d": "*.", 112 | "\u0279": "r", 113 | "\u027b": "r.", 114 | "\u027a": "*", 115 | "\u0280": 'r"', 116 | "\u0281": "r", 117 | "\u0073": "s", 118 | "\u0282": "s.", 119 | "\u0283": "S", 120 | "\u0074": "t", 121 | "\u0288": "t.", 122 | "\u03b8": "T", 123 | "\u0075": "u", 124 | "\u0289": 'u"', 125 | "\u028a": "U", 126 | "\u0076": "v", 127 | "\u028b": "v#", 128 | "\u0077": "w", 129 | "\u02b7": "", 130 | "\u028d": "w", 131 | "\u0078": "x", 132 | "\u03c7": "X", 133 | "\u0079": "y", 134 | "\u028e": "l^", 135 | "\u028f": "I.", 136 | "\u007a": "z", 137 | "\u0291": "Z;", 138 | "\u0290": "z.", 139 | "\u0292": "Z", 140 | "\u0294": "?", 141 | "\u02a1": "", 142 | "\u0295": "H", 143 | "\u02a2": "", 144 | "\u02e4": "", 145 | "\u01c3": "c!", 146 | "\u01c0": "t!", 147 | "\u01c2": "c!", 148 | "\u01c1": "l!", 149 | "\u0320": "", 150 | "\u032a": "", 151 | "\u033a": "", 152 | "\u031f": "", 153 | "\u031d": "", 154 | "\u031e": "", 155 | "\u02c8": "'", 156 | "\u02cc": ",", 157 | "\u0329": "-", 158 | "\u031a": "", 159 | "\u002e": "", 160 | "\u02d1": "", 161 | "\u0308": "", 162 | "\u0324": "", 163 | "\u02d0": ":", 164 | "\u02bc": "`", 165 | "\u0325": "", 166 | "\u030a": "", 167 | "\u031c": "", 168 
| "\u0339": "", 169 | "\u0303": "~", 170 | "\u0334": "~", 171 | "\u0330": "", 172 | "\u032c": "", 173 | "\u0306": "", 174 | "\u032f": "", 175 | "\u033d": "", 176 | "\u02de": "", 177 | "\u033b": "", 178 | "\u0318": "", 179 | "\u0319": "", 180 | "\u033c": "", 181 | "\u2197": "", 182 | "\u2191": "", 183 | "\u2198": "", 184 | "\u2193": "", 185 | # 186 | # Ties 187 | "\u0361": "", 188 | "\u035C": "", 189 | # 190 | # Tied symbols 191 | "\u0288\u0361\u0282": "tS", 192 | "\u0256\u0361\u0290": "dz", 193 | # 194 | # Breaks 195 | "|": "_::", 196 | "\u2016": "_::_::", 197 | "#": "", 198 | } 199 | 200 | ESPEAK_TO_IPA = {v: k for k, v in IPA_TO_ESPEAK.items() if v} 201 | 202 | # Regex disjunction in descending length order 203 | ESPEAK_PATTERN = re.compile( 204 | "({})".format( 205 | "|".join(re.escape(espeak) for espeak in sorted(ESPEAK_TO_IPA, reverse=True)) 206 | ) 207 | ) 208 | 209 | IPA_PATTERN = re.compile( 210 | "({})".format( 211 | "|".join(re.escape(ipa) for ipa in sorted(IPA_TO_ESPEAK, key=len, reverse=True)) 212 | ) 213 | ) 214 | -------------------------------------------------------------------------------- /gruut_ipa/features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Functions for converting IPA symbols to and from feature vectors.""" 3 | import dataclasses 4 | import typing 5 | 6 | from gruut_ipa.constants import ( 7 | CONSONANTS, 8 | FEATURE_COLUMNS, 9 | FEATURE_EMPTY, 10 | FEATURE_KEYS, 11 | FEATURE_ORDINAL_COLUMNS, 12 | IPA, 13 | SCHWAS, 14 | VOWELS, 15 | Break, 16 | BreakType, 17 | Consonant, 18 | ConsonantPlace, 19 | ConsonantType, 20 | PhonemeLength, 21 | Schwa, 22 | Stress, 23 | Vowel, 24 | VowelHeight, 25 | VowelPlacement, 26 | ) 27 | 28 | 29 | def to_vector( 30 | symbol: typing.Union[Vowel, Consonant, Schwa, Break] 31 | ) -> typing.Sequence[float]: 32 | """Converts a symbol into a feature vector""" 33 | features: typing.Dict[str, str] = {} 34 | 35 | if isinstance(symbol, Vowel): 
36 | features["symbol_type"] = "phoneme" 37 | features["phoneme_type"] = "vowel" 38 | features["vowel_height"] = symbol.height.value 39 | features["vowel_place"] = symbol.placement.value 40 | features["vowel_rounded"] = "rounded" if symbol.rounded else "unrounded" 41 | features["phoneme_length"] = symbol.length.value 42 | 43 | if symbol.nasalated: 44 | features["diacritic"] = "nasalated" 45 | 46 | if symbol.stress is not None: 47 | features["vowel_stress"] = symbol.stress.value 48 | 49 | elif isinstance(symbol, Consonant): 50 | features["symbol_type"] = "phoneme" 51 | features["phoneme_type"] = "consonant" 52 | features["consonant_voiced"] = "voiced" if symbol.voiced else "unvoiced" 53 | features["consonant_type"] = symbol.type.value 54 | features["consonant_place"] = symbol.place.value 55 | features["consonant_sounds_like"] = symbol.sounds_like.value 56 | features["phoneme_length"] = symbol.length.value 57 | 58 | if symbol.velarized: 59 | features["diacritic"] = "velarized" 60 | 61 | elif isinstance(symbol, Schwa): 62 | features["symbol_type"] = "phoneme" 63 | features["phoneme_type"] = "schwa" 64 | features["phoneme_length"] = symbol.length.value 65 | 66 | if symbol.r_coloured: 67 | features["consonant_sounds_like"] = "r" 68 | 69 | elif isinstance(symbol, Break): 70 | features["symbol_type"] = "break" 71 | features["break_type"] = symbol.type.value 72 | else: 73 | # Unsupported symbol type 74 | raise ValueError(symbol) 75 | 76 | return features_to_vector(features) 77 | 78 | 79 | def from_vector( 80 | vector: typing.Sequence[float], 81 | ) -> typing.Union[Vowel, Consonant, Schwa, Break]: 82 | """Converts a feature vector back into a symbol""" 83 | features = vector_to_features(vector) 84 | if features["symbol_type"] == "break": 85 | break_type = BreakType(features["break_type"]) 86 | return Break(break_type) 87 | 88 | if features["symbol_type"] == "phoneme": 89 | if features["phoneme_type"] == "vowel": 90 | height = VowelHeight(features["vowel_height"]) 91 | 
placement = VowelPlacement(features["vowel_place"]) 92 | rounded = features["vowel_rounded"] == "rounded" 93 | nasalated = features["diacritic"] == "nasalated" 94 | length = PhonemeLength(features["phoneme_length"]) 95 | 96 | stress: typing.Optional[Stress] = None 97 | stress_val = features["vowel_stress"] 98 | if stress_val != FEATURE_EMPTY: 99 | stress = Stress(stress_val) 100 | 101 | for vowel in VOWELS.values(): 102 | if ( 103 | (vowel.height == height) 104 | and (vowel.placement == placement) 105 | and (vowel.rounded == rounded) 106 | and (vowel.nasalated == nasalated) 107 | ): 108 | if (stress is None) and (length == PhonemeLength.NORMAL): 109 | # Don't need to make a copy 110 | return vowel 111 | 112 | return dataclasses.replace(vowel, stress=stress) 113 | 114 | raise ValueError(f"Unknown vowel: {features}") 115 | 116 | if features["phoneme_type"] == "consonant": 117 | c_type = ConsonantType(features["consonant_type"]) 118 | place = ConsonantPlace(features["consonant_place"]) 119 | voiced = features["consonant_voiced"] == "voiced" 120 | velarized = features["diacritic"] == "velarized" 121 | length = PhonemeLength(features["phoneme_length"]) 122 | 123 | for consonant in CONSONANTS.values(): 124 | if ( 125 | (consonant.type == c_type) 126 | and (consonant.place == place) 127 | and (consonant.voiced == voiced) 128 | and (consonant.velarized == velarized) 129 | ): 130 | if length == PhonemeLength.NORMAL: 131 | # Don't need to make a copy 132 | return consonant 133 | 134 | return dataclasses.replace(consonant, length=length) 135 | 136 | raise ValueError(f"Unknown vowel: {features}") 137 | 138 | if features["phoneme_type"] == "schwa": 139 | r_coloured = features["consonant_sounds_like"] == "r" 140 | length = PhonemeLength(features["phoneme_length"]) 141 | 142 | for schwa in SCHWAS.values(): 143 | if schwa.r_coloured == r_coloured: 144 | if length == PhonemeLength.NORMAL: 145 | # Don't need to make a copy 146 | return schwa 147 | 148 | return 
dataclasses.replace(schwa, length=length) 149 | 150 | raise ValueError(f"Unknown vowel: {features}") 151 | 152 | # Unsupported phoneme type 153 | raise ValueError(f"Unknown phoneme type: {features}") 154 | 155 | # Unsupported symbol type 156 | raise ValueError(f"Unknown symbol type: {features}") 157 | 158 | 159 | def string_to_symbol(symbol_str: str) -> typing.Union[Vowel, Consonant, Schwa, Break]: 160 | """Get gruut IPA object for IPA symbol""" 161 | if not symbol_str: 162 | raise ValueError("Empty symbol") 163 | 164 | # Check break first 165 | if symbol_str == IPA.BREAK_WORD: 166 | return Break(BreakType.WORD) 167 | 168 | if symbol_str == IPA.BREAK_MINOR: 169 | return Break(BreakType.MINOR) 170 | 171 | if symbol_str == IPA.BREAK_MAJOR: 172 | return Break(BreakType.MAJOR) 173 | 174 | # Strip stress 175 | maybe_stress: typing.Optional[Stress] = None 176 | if symbol_str[0] == IPA.STRESS_PRIMARY: 177 | maybe_stress = Stress.PRIMARY 178 | symbol_str = symbol_str[1:] 179 | elif symbol_str[0] == IPA.STRESS_SECONDARY: 180 | maybe_stress = Stress.SECONDARY 181 | symbol_str = symbol_str[1:] 182 | 183 | if not symbol_str: 184 | raise ValueError("No letters") 185 | 186 | # Strip length 187 | length = PhonemeLength.NORMAL 188 | if symbol_str[-1] == IPA.HALF_LONG: 189 | length = PhonemeLength.SHORT 190 | symbol_str = symbol_str[:-1] 191 | elif symbol_str[-1] == IPA.LONG: 192 | length = PhonemeLength.LONG 193 | symbol_str = symbol_str[:-1] 194 | 195 | if not symbol_str: 196 | raise ValueError("No letters") 197 | 198 | # Look up 199 | maybe_vowel = VOWELS.get(symbol_str) 200 | if maybe_vowel is not None: 201 | return dataclasses.replace(maybe_vowel, stress=maybe_stress, length=length) 202 | 203 | maybe_consonant = CONSONANTS.get(symbol_str) 204 | if maybe_consonant is not None: 205 | return dataclasses.replace(maybe_consonant, length=length) 206 | 207 | maybe_schwa = SCHWAS.get(symbol_str) 208 | if maybe_schwa is not None: 209 | return dataclasses.replace(maybe_schwa, 
length=length) 210 | 211 | raise ValueError(f"Unsupported symbol type: {symbol_str}") 212 | 213 | 214 | def features_to_vector(features: typing.Mapping[str, str]) -> typing.Sequence[float]: 215 | """Create phoneme feature vector from mapping""" 216 | vector: typing.List[float] = [] 217 | 218 | for col, values in FEATURE_COLUMNS.items(): 219 | value = features.get(col, FEATURE_EMPTY) 220 | 221 | if col in FEATURE_ORDINAL_COLUMNS: 222 | # Single value normalized by number of possible values 223 | vector.append(values.index(value) / len(values)) 224 | else: 225 | # One-hot vector 226 | for v in values: 227 | vector.append(1.0 if (v == value) else 0.0) 228 | 229 | return vector 230 | 231 | 232 | def vector_to_features(vector: typing.Sequence[float]) -> typing.Mapping[str, str]: 233 | """Create mapping from phoneme feature vector""" 234 | features: typing.Dict[str, str] = {} 235 | 236 | for col_name, values in FEATURE_COLUMNS.items(): 237 | col_key = FEATURE_KEYS[col_name] 238 | if col_name in FEATURE_ORDINAL_COLUMNS: 239 | # Single value normalized by number of possible values 240 | assert isinstance(col_key, int) 241 | val_idx = int(vector[col_key] * len(values)) 242 | else: 243 | # One-hot vector 244 | assert isinstance(col_key, slice) 245 | if 1.0 not in vector[col_key]: 246 | assert False, (col_name, col_key, vector[col_key]) 247 | val_idx = vector[col_key].index(1.0) 248 | 249 | features[col_name] = values[val_idx] 250 | 251 | return features 252 | -------------------------------------------------------------------------------- /gruut_ipa/kirshenbaum.py: -------------------------------------------------------------------------------- 1 | """Mapping between IPA and Kirshenbaum""" 2 | 3 | # http://www.blahedo.org/ascii-ipa.html 4 | 5 | IPA_TO_KIRSHENBAUM = { 6 | "\u0061": "a", 7 | "\u0250": "", 8 | "\u0251": "A", 9 | "\u0252": "A.", 10 | "\u00e6": "&", 11 | "\u028c": "V", 12 | "\u0062": "b", 13 | "\u0253": "b`", 14 | "\u0299": "b", 15 | "\u03b2": "B", 16 | "\u0063": 
"c", 17 | "\u00e7": "C", 18 | "\u0255": "", 19 | "\u0064": "d", 20 | "\u0257": "d`", 21 | "\u0256": "d.", 22 | "\u00f0": "D", 23 | "\u0065": "e", 24 | "\u0259": "@", 25 | "\u025a": "R", 26 | "\u0258": "@", 27 | "\u025b": "E", 28 | "\u025c": 'V"', 29 | "\u025d": "R", 30 | "\u025e": 'O"', 31 | "\u0066": "f", 32 | "\u0261": "g", 33 | "\u0260": "g`", 34 | "\u0262": "G", 35 | "\u029b": "G`", 36 | "\u0263": "Q", 37 | "\u02e0": "~", 38 | "\u0264": "o-", 39 | "\u0068": "h", 40 | "\u02b0": "", 41 | "\u0127": "H", 42 | "\u0266": "h", 43 | "\u0267": "", 44 | "\u0265": "j", 45 | "\u029c": "", 46 | "\u0069": "i", 47 | "\u0268": 'i"', 48 | "\u026a": "I", 49 | "\u006a": "j", 50 | "\u02b2": ";", 51 | "\u029d": "C", 52 | "\u025f": "J", 53 | "\u0284": "J`", 54 | "\u006b": "k", 55 | "\u006c": "l", 56 | "\u026b": "", 57 | "\u026c": "s", 58 | "\u026d": "l.", 59 | "\u026e": "z", 60 | "\u029f": "L", 61 | "\u006d": "m", 62 | "\u0271": "M", 63 | "\u026f": "u-", 64 | "\u0270": "j", 65 | "\u006e": "n", 66 | "\u0272": "n^", 67 | "\u014b": "N", 68 | "\u0273": "n.", 69 | "\u0274": 'n"', 70 | "\u006f": "o", 71 | "\u0298": "p!", 72 | "\u0275": "@.", 73 | "\u00f8": "Y", 74 | "\u0153": "&.", 75 | "\u0276": "W", 76 | "\u0254": "O", 77 | "\u0070": "p", 78 | "\u0278": "F", 79 | "\u0071": "q", 80 | "\u0072": "r", 81 | "\u027e": "*", 82 | "\u027c": "", 83 | "\u027d": "*.", 84 | "\u0279": "r", 85 | "\u027b": "r.", 86 | "\u027a": "*", 87 | "\u0280": 'r"', 88 | "\u0281": 'g"', 89 | "\u0073": "s", 90 | "\u0282": "s.", 91 | "\u0283": "S", 92 | "\u0074": "t", 93 | "\u0288": "t.", 94 | "\u03b8": "T", 95 | "\u0075": "u", 96 | "\u0289": 'u"', 97 | "\u028a": "U", 98 | "\u0076": "v", 99 | "\u028b": "r", 100 | "\u0077": "w", 101 | "\u02b7": "", 102 | "\u028d": "w", 103 | "\u0078": "x", 104 | "\u03c7": "X", 105 | "\u0079": "y", 106 | "\u028e": "l^", 107 | "\u028f": "I.", 108 | "\u007a": "z", 109 | "\u0291": "", 110 | "\u0290": "z.", 111 | "\u0292": "Z", 112 | "\u0294": "?", 113 | "\u02a1": "", 114 | "\u0295": "H", 
115 | "\u02a2": "", 116 | "\u02e4": "", 117 | "\u01c3": "c!", 118 | "\u01c0": "t!", 119 | "\u01c2": "c!", 120 | "\u01c1": "l!", 121 | "\u0320": "", 122 | "\u032a": "", 123 | "\u033a": "", 124 | "\u031f": "", 125 | "\u031d": "", 126 | "\u031e": "", 127 | "\u02c8": "'", 128 | "\u02cc": ",", 129 | "\u0329": "-", 130 | "\u031a": "", 131 | "\u002e": "", 132 | "\u02d1": "", 133 | "\u0308": "", 134 | "\u0324": "", 135 | "\u02d0": ":", 136 | "\u02bc": "`", 137 | "\u0325": "", 138 | "\u030a": "", 139 | "\u031c": "", 140 | "\u0339": "", 141 | "\u0303": "~", 142 | "\u0334": "~", 143 | "\u0330": "", 144 | "\u032c": "", 145 | "\u0306": "", 146 | "\u032f": "", 147 | "\u033d": "", 148 | "\u02de": "", 149 | "\u033b": "", 150 | "\u0318": "", 151 | "\u0319": "", 152 | "\u033c": "", 153 | "\u2197": "", 154 | "\u2191": "", 155 | "\u2198": "", 156 | "\u2193": "", 157 | "\u030f": "1", 158 | "\u0300": "2", 159 | "\u0304": "3", 160 | "\u0301": "4", 161 | "\u030b": "5", 162 | } 163 | -------------------------------------------------------------------------------- /gruut_ipa/phonemes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Functions for manipulating phones/phonemes""" 3 | import logging 4 | import re 5 | import typing 6 | import unicodedata 7 | from collections import defaultdict 8 | 9 | from gruut_ipa.constants import ( # noqa: F401 10 | _DATA_DIR, 11 | _DIR, 12 | CONSONANTS, 13 | FEATURE_COLUMNS, 14 | FEATURE_EMPTY, 15 | FEATURE_KEYS, 16 | FEATURE_ORDINAL_COLUMNS, 17 | IPA, 18 | LANG_ALIASES, 19 | SCHWAS, 20 | VOWELS, 21 | Accent, 22 | Break, 23 | BreakType, 24 | Consonant, 25 | ConsonantPlace, 26 | ConsonantType, 27 | Dipthong, 28 | Intonation, 29 | PhonemeLength, 30 | Schwa, 31 | Stress, 32 | Vowel, 33 | VowelHeight, 34 | VowelPlacement, 35 | ) 36 | from gruut_ipa.utils import resolve_lang 37 | 38 | _LOGGER = logging.getLogger("gruut_ipa") 39 | 40 | # 
----------------------------------------------------------------------------- 41 | 42 | 43 | class Phone: 44 | """Single IPA phone with diacritics and suprasegmentals""" 45 | 46 | def __init__( 47 | self, 48 | letters: str, 49 | stress: typing.Optional[Stress] = None, 50 | accents: typing.Optional[typing.Iterable[Accent]] = None, 51 | is_long: bool = False, 52 | nasal: typing.Optional[typing.Set[int]] = None, 53 | raised: typing.Optional[typing.Set[int]] = None, 54 | diacritics: typing.Optional[typing.Dict[int, typing.Set[str]]] = None, 55 | suprasegmentals: typing.Optional[typing.Set[str]] = None, 56 | tone: str = "", 57 | ): 58 | self.letters: str = unicodedata.normalize("NFC", letters) 59 | self.stress = stress 60 | self.accents: typing.List[Accent] = list(accents or []) 61 | self.is_long: bool = is_long 62 | 63 | self.nasal: typing.Set[int] = nasal or set() 64 | self.is_nasal = bool(self.nasal) 65 | 66 | self.raised: typing.Set[int] = raised or set() 67 | self.is_raised = bool(self.raised) 68 | 69 | self.tone: str = tone 70 | 71 | self.diacritics: typing.Dict[int, typing.Set[str]] = diacritics or defaultdict( 72 | set 73 | ) 74 | self.suprasegmentals: typing.Set[str] = suprasegmentals or set() 75 | 76 | # Decompose suprasegmentals and diacritics 77 | if self.stress == Stress.PRIMARY: 78 | self.suprasegmentals.add(IPA.STRESS_PRIMARY) 79 | elif self.stress == Stress.SECONDARY: 80 | self.suprasegmentals.add(IPA.STRESS_SECONDARY) 81 | 82 | if Accent.ACUTE in self.accents: 83 | self.suprasegmentals.add(IPA.ACCENT_ACUTE) 84 | 85 | if Accent.GRAVE in self.accents: 86 | self.suprasegmentals.add(IPA.ACCENT_GRAVE) 87 | 88 | if self.is_long: 89 | self.suprasegmentals.add(IPA.LONG) 90 | 91 | # Nasal 92 | for letter_index in self.nasal: 93 | letter_diacritics = self.diacritics.get(letter_index) 94 | if letter_diacritics is None: 95 | letter_diacritics = set() 96 | self.diacritics[letter_index] = letter_diacritics 97 | 98 | letter_diacritics.add(IPA.NASAL) 99 | 100 | # 
Raised 101 | for letter_index in self.raised: 102 | letter_diacritics = self.diacritics.get(letter_index) 103 | if letter_diacritics is None: 104 | letter_diacritics = set() 105 | self.diacritics[letter_index] = letter_diacritics 106 | 107 | letter_diacritics.add(IPA.RAISED) 108 | 109 | self._text: str = "" 110 | 111 | self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters) 112 | self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters) 113 | self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters) 114 | 115 | @property 116 | def text(self) -> str: 117 | """Get textual representation of phone (NFC normalized)""" 118 | if self._text: 119 | return self._text 120 | 121 | # Pre-letter suprasegmentals 122 | for accent in self.accents: 123 | if accent == Accent.ACUTE: 124 | self._text += IPA.ACCENT_ACUTE 125 | elif accent == Accent.GRAVE: 126 | self._text += IPA.ACCENT_GRAVE 127 | 128 | if self.stress == Stress.PRIMARY: 129 | self._text += IPA.STRESS_PRIMARY 130 | elif self.stress == Stress.SECONDARY: 131 | self._text += IPA.STRESS_SECONDARY 132 | 133 | # Letters and diacritics 134 | for letter_index, letter in enumerate(self.letters): 135 | self._text += letter 136 | 137 | # Diacritics 138 | for diacritic in self.diacritics.get(letter_index, []): 139 | self._text += diacritic 140 | 141 | # Tone 142 | if self.tone: 143 | self._text += self.tone 144 | 145 | # Post-letter suprasegmentals 146 | if self.is_long: 147 | self._text += IPA.LONG 148 | 149 | # Re-normalize and combine 150 | self._text = unicodedata.normalize("NFC", self._text) 151 | 152 | return self._text 153 | 154 | @property 155 | def is_vowel(self) -> bool: 156 | """True if phone is a vowel""" 157 | return self.vowel is not None 158 | 159 | @property 160 | def is_consonant(self) -> bool: 161 | """True if phone is a consonant""" 162 | return self.consonant is not None 163 | 164 | @property 165 | def is_schwa(self) -> bool: 166 | """True if phone is a schwa""" 167 | return self.schwa 
is not None 168 | 169 | def __repr__(self) -> str: 170 | return self.text 171 | 172 | @staticmethod 173 | def from_string(phone_str: str) -> "Phone": 174 | """Parse phone from string""" 175 | # Decompose into base and combining characters 176 | codepoints = unicodedata.normalize("NFD", phone_str) 177 | kwargs: typing.Dict[str, typing.Any] = { 178 | "letters": "", 179 | "diacritics": defaultdict(set), 180 | "tone": "", 181 | "accents": [], 182 | "nasal": set(), 183 | "raised": set(), 184 | } 185 | 186 | in_tone = False 187 | new_letter = False 188 | letter_index = 0 189 | 190 | for c in codepoints: 191 | # Check for stress 192 | if (c == IPA.ACCENT_ACUTE) and not in_tone: 193 | kwargs["accents"].append(Accent.ACUTE) 194 | elif (c == IPA.ACCENT_GRAVE) and not in_tone: 195 | kwargs["accents"].append(Accent.GRAVE) 196 | elif c == IPA.STRESS_PRIMARY: 197 | kwargs["stress"] = Stress.PRIMARY 198 | elif c == IPA.STRESS_SECONDARY: 199 | kwargs["stress"] = Stress.SECONDARY 200 | elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}): 201 | # Interpret as part of tone 202 | kwargs["tone"] += c 203 | elif IPA.is_long(c): 204 | # Check for elongation 205 | kwargs["is_long"] = True 206 | elif IPA.is_nasal(c): 207 | # Check for nasalation 208 | kwargs["nasal"].add(letter_index) 209 | elif IPA.is_raised(c): 210 | # Check for raised articulation 211 | kwargs["raised"].add(letter_index) 212 | elif IPA.is_bracket(c) or IPA.is_break(c): 213 | # Skip brackets/syllable breaks 214 | pass 215 | elif IPA.is_tie(c): 216 | # Keep ties in letters 217 | kwargs["letters"] += c 218 | letter_index += 1 219 | elif IPA.is_tone(c): 220 | # Tone numbers/letters 221 | kwargs["tone"] += c 222 | in_tone = True 223 | elif unicodedata.combining(c) > 0: 224 | # Stow some diacritics that we don't do anything with 225 | kwargs["diacritics"][letter_index].add(c) 226 | else: 227 | # Include all other characters in letters 228 | kwargs["letters"] += c 229 | if new_letter: 230 | letter_index += 1 231 | 
232 | new_letter = True 233 | 234 | return Phone(**kwargs) 235 | 236 | 237 | # ----------------------------------------------------------------------------- 238 | 239 | 240 | class Pronunciation: 241 | """Collection of phones and breaks for some unit of text (word, sentence, etc.)""" 242 | 243 | def __init__( 244 | self, phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]] 245 | ): 246 | self.phones_and_others = phones_and_others 247 | 248 | self.phones: typing.List[Phone] = [] 249 | self.breaks: typing.List[Break] = [] 250 | self.intonations: typing.List[Intonation] = [] 251 | 252 | # Decompose into phones, breaks, and intonations 253 | for p in self.phones_and_others: 254 | if isinstance(p, Phone): 255 | self.phones.append(p) 256 | elif isinstance(p, Break): 257 | self.breaks.append(p) 258 | elif isinstance(p, Intonation): 259 | self.intonations.append(p) 260 | 261 | self._text = "" 262 | 263 | @property 264 | def text(self) -> str: 265 | """Get text representation of pronunciation (NFC normalized)""" 266 | if not self._text: 267 | self._text = "".join(p.text for p in self.phones_and_others) 268 | 269 | return self._text 270 | 271 | def __repr__(self) -> str: 272 | return self.text 273 | 274 | def __iter__(self): 275 | return iter(self.phones_and_others) 276 | 277 | def __getitem__(self, idx): 278 | return self.phones_and_others[idx] 279 | 280 | @staticmethod 281 | def from_string( 282 | pron_str: str, 283 | keep_stress: bool = True, 284 | keep_accents: typing.Optional[bool] = None, 285 | drop_tones: bool = False, 286 | keep_ties: bool = True, 287 | ) -> "Pronunciation": 288 | """Split an IPA pronunciation into phones. 289 | 290 | Stress/accent markers bind to the next non-combining codepoint (e.g., ˈa). 291 | Elongation markers bind to the previous non-combining codepoint (e.g., aː). 292 | Ties join two non-combining sequences (e.g. t͡ʃ). 293 | 294 | Whitespace and brackets are skipped. 295 | 296 | Returns list of phones. 
297 | """ 298 | if keep_accents is None: 299 | keep_accents = keep_stress 300 | 301 | clusters = [] 302 | cluster = "" 303 | stress = "" 304 | is_stress = False 305 | accents = "" 306 | is_accent = False 307 | tone = "" 308 | in_tone = False 309 | skip_next_cluster = False 310 | 311 | codepoints = unicodedata.normalize("NFD", pron_str) 312 | 313 | for codepoint in codepoints: 314 | new_cluster = False 315 | is_stress = False 316 | is_accent = False 317 | 318 | if ( 319 | codepoint.isspace() 320 | or IPA.is_bracket(codepoint) 321 | or (codepoint in {IPA.BREAK_SYLLABLE}) 322 | ): 323 | # Skip whitespace, brackets, and syllable breaks 324 | continue 325 | 326 | if IPA.is_break(codepoint) or IPA.is_intonation(codepoint): 327 | # Keep minor/major/word breaks and intonation markers 328 | new_cluster = True 329 | 330 | if IPA.is_accent(codepoint) and not in_tone: 331 | is_accent = True 332 | if cluster: 333 | new_cluster = True 334 | skip_next_cluster = True 335 | elif IPA.is_stress(codepoint): 336 | is_stress = True 337 | if cluster: 338 | new_cluster = True 339 | skip_next_cluster = True 340 | elif in_tone and (codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}): 341 | # Interpret as part of tone 342 | if not drop_tones: 343 | tone += codepoint 344 | 345 | continue 346 | elif IPA.is_long(codepoint): 347 | # Add to current cluster 348 | pass 349 | elif IPA.is_tie(codepoint): 350 | if keep_ties: 351 | # Add next non-combining to current cluster 352 | skip_next_cluster = True 353 | else: 354 | # Ignore ties 355 | continue 356 | elif IPA.is_tone(codepoint): 357 | # Add to end of current cluster 358 | if not drop_tones: 359 | tone += codepoint 360 | 361 | in_tone = True 362 | continue 363 | elif unicodedata.combining(codepoint) == 0: 364 | # Non-combining character 365 | if skip_next_cluster: 366 | # Add to current cluster 367 | skip_next_cluster = False 368 | elif cluster: 369 | # Start a new cluster 370 | new_cluster = True 371 | 372 | if new_cluster and cluster: 373 | 
clusters.append(accents + stress + cluster + tone) 374 | accents = "" 375 | stress = "" 376 | cluster = "" 377 | tone = "" 378 | 379 | if is_accent: 380 | if keep_accents: 381 | accents += codepoint 382 | elif is_stress: 383 | if keep_stress: 384 | stress += codepoint 385 | else: 386 | cluster += codepoint 387 | 388 | if cluster: 389 | clusters.append(accents + stress + cluster + tone) 390 | 391 | phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]] = [] 392 | for cluster in clusters: 393 | if IPA.is_break(cluster): 394 | phones_and_others.append(Break.from_string(cluster)) 395 | elif IPA.is_intonation(cluster): 396 | phones_and_others.append(Intonation.from_string(cluster)) 397 | else: 398 | phones_and_others.append(Phone.from_string(cluster)) 399 | 400 | return Pronunciation(phones_and_others) 401 | 402 | 403 | # ----------------------------------------------------------------------------- 404 | 405 | 406 | class Phoneme: 407 | """Phoneme composed of international phonetic alphabet symbols""" 408 | 409 | def __init__( 410 | self, 411 | text: str, 412 | example: str = "", 413 | unknown: bool = False, 414 | tones: typing.Optional[typing.Iterable[str]] = None, 415 | is_ipa: bool = True, 416 | ): 417 | self._text = "" 418 | self._text_compare = "" 419 | self.example = example 420 | self.unknown = unknown 421 | 422 | # List of allowable tones for phoneme 423 | self.tones = list(tones or []) 424 | 425 | self.stress: typing.Optional[Stress] = None 426 | self.accents: typing.List[Accent] = [] 427 | self.elongated: bool = False 428 | self.nasalated: typing.Set[int] = set() 429 | self.raised: typing.Set[int] = set() 430 | self._extra_combining: typing.Dict[int, typing.List[str]] = defaultdict(list) 431 | 432 | # Decompose into base and combining characters 433 | codepoints = unicodedata.normalize("NFD", text) 434 | self.letters = "" 435 | self.tone = "" 436 | 437 | if is_ipa: 438 | in_tone = False 439 | letter_index = 0 440 | new_letter = False 441 | 442 
| for c in codepoints: 443 | # Check for stress 444 | if (c == IPA.ACCENT_ACUTE) and (not in_tone): 445 | self.accents.append(Accent.ACUTE) 446 | elif (c == IPA.ACCENT_GRAVE) and (not in_tone): 447 | self.accents.append(Accent.GRAVE) 448 | elif c == IPA.STRESS_PRIMARY: 449 | self.stress = Stress.PRIMARY 450 | elif c == IPA.STRESS_SECONDARY: 451 | self.stress = Stress.SECONDARY 452 | elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}): 453 | # Interpret as part of tone 454 | self.tone += c 455 | elif IPA.is_long(c): 456 | # Check for elongation 457 | self.elongated = True 458 | elif IPA.is_nasal(c): 459 | # Check for nasalation 460 | self.nasalated.add(letter_index) 461 | elif IPA.is_raised(c): 462 | # Check for raised articulation 463 | self.raised.add(letter_index) 464 | elif IPA.is_bracket(c) or IPA.is_break(c): 465 | # Skip brackets/syllable breaks 466 | pass 467 | elif IPA.is_tone(c): 468 | # Keep tone separate 469 | self.tone += c 470 | in_tone = True 471 | elif c in {IPA.SYLLABIC, IPA.NON_SYLLABIC, IPA.EXTRA_SHORT}: 472 | # Stow some diacritics that we don't do anything with 473 | self._extra_combining[letter_index].append(c) 474 | else: 475 | # Include all other characters in base 476 | self.letters += c 477 | 478 | if new_letter: 479 | letter_index += 1 480 | 481 | new_letter = True 482 | else: 483 | self.letters = text 484 | 485 | # Re-normalize and combine letters 486 | self.letters = unicodedata.normalize("NFC", self.letters) 487 | self.letters_graphemes = IPA.graphemes(self.letters) 488 | 489 | # Categorize 490 | self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters) 491 | self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters) 492 | self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters) 493 | self.dipthong: typing.Optional[Dipthong] = None 494 | 495 | if ( 496 | (not self.vowel) 497 | and (not self.consonant) 498 | and (not self.schwa) 499 | and (len(self.letters) == 2) 500 | ): 501 | # Check if dipthong (two 
vowels) 502 | vowel1 = VOWELS.get(self.letters[0]) 503 | vowel2 = VOWELS.get(self.letters[1]) 504 | if vowel1 and vowel2: 505 | self.dipthong = Dipthong(vowel1, vowel2) 506 | 507 | @property 508 | def text(self) -> str: 509 | """Return letters with stress and elongation (NFC normalized)""" 510 | if self._text: 511 | return self._text 512 | 513 | for accent in self.accents: 514 | if accent == Accent.ACUTE: 515 | self._text += IPA.ACCENT_ACUTE 516 | elif accent == Accent.GRAVE: 517 | self._text += IPA.ACCENT_GRAVE 518 | 519 | if self.stress == Stress.PRIMARY: 520 | self._text += IPA.STRESS_PRIMARY 521 | elif self.stress == Stress.SECONDARY: 522 | self._text += IPA.STRESS_SECONDARY 523 | 524 | for letter_index, letter in enumerate(self.letters): 525 | self._text += letter 526 | 527 | if letter_index in self.nasalated: 528 | self._text += IPA.NASAL 529 | 530 | if letter_index in self.raised: 531 | self._text += IPA.RAISED 532 | 533 | for c in self._extra_combining[letter_index]: 534 | self._text += c 535 | 536 | if self.tone: 537 | self._text += self.tone 538 | 539 | if self.elongated: 540 | self._text += IPA.LONG 541 | 542 | # Re-normalize and combine 543 | self._text = unicodedata.normalize("NFC", self._text) 544 | 545 | return self._text 546 | 547 | @property 548 | def text_compare(self) -> str: 549 | """Return letters and elongation with no stress/tones (NFC normalized)""" 550 | if self._text_compare: 551 | return self._text_compare 552 | 553 | for letter_index, letter in enumerate(self.letters): 554 | self._text_compare += letter 555 | 556 | if letter_index in self.nasalated: 557 | self._text_compare += IPA.NASAL 558 | 559 | if letter_index in self.raised: 560 | self._text_compare += IPA.RAISED 561 | 562 | for c in self._extra_combining[letter_index]: 563 | self._text_compare += c 564 | 565 | if self.elongated: 566 | self._text_compare += IPA.LONG 567 | 568 | # Re-normalize and combine 569 | self._text_compare = unicodedata.normalize("NFC", self._text_compare) 570 
| 571 | return self._text_compare 572 | 573 | def copy(self) -> "Phoneme": 574 | """Create a copy of this phonemes""" 575 | return Phoneme(text=self.text, example=self.example, unknown=self.unknown) 576 | 577 | def __repr__(self) -> str: 578 | """Return symbol with stress and elongation.""" 579 | return self.text 580 | 581 | def to_dict(self) -> typing.Dict[str, typing.Any]: 582 | """Return properties of phoneme as a dict""" 583 | type_name = "Phoneme" 584 | props: typing.Dict[str, typing.Any] = { 585 | "text": repr(self), 586 | "letters": self.letters, 587 | "tone": self.tone, 588 | "tones": self.tones, 589 | } 590 | 591 | if self.unknown: 592 | props["unknown"] = True 593 | 594 | if self.example: 595 | props["example"] = self.example 596 | 597 | props["accents"] = [a.value for a in self.accents] 598 | props["stress"] = self.stress.value if self.stress is not None else "" 599 | 600 | if self.vowel: 601 | type_name = "Vowel" 602 | props["height"] = self.vowel.height.value 603 | props["placement"] = self.vowel.placement.value 604 | props["rounded"] = self.vowel.rounded 605 | elif self.consonant: 606 | type_name = "Consonant" 607 | props["type"] = self.consonant.type.value 608 | props["place"] = self.consonant.place.value 609 | props["voiced"] = self.consonant.voiced 610 | elif self.dipthong: 611 | type_name = "Dipthong" 612 | elif self.schwa: 613 | type_name = "Schwa" 614 | props["r_coloured"] = self.schwa.r_coloured 615 | 616 | props["type"] = type_name 617 | 618 | props["nasalated"] = list(self.nasalated) 619 | props["raised"] = list(self.raised) 620 | props["elongated"] = self.elongated 621 | 622 | return props 623 | 624 | def to_string(self) -> str: 625 | """Return descriptive string of phoneme""" 626 | props = self.to_dict() 627 | type_name = props.get("type", "Phoneme") 628 | 629 | prop_strs = [f"{k}={v}" for k, v in props.items()] 630 | 631 | return f"{type_name}(" + ", ".join(prop_strs) + ")" 632 | 633 | 634 | # 
class Phonemes:
    """Set of phonemes and allophones for a language.

    Phonemes are normally loaded with :meth:`from_language` (from the bundled
    ``data/<lang>/phonemes.txt``) or :meth:`from_text`.  Each phoneme carries
    an IPA text form, an optional example word, and optional tones.

    ``ipa_map`` maps allophone IPA strings (or raw regexes, prefixed with a
    comma) to canonical phoneme IPA; it is applied during :meth:`split`.
    """

    # Everything after this string on a phonemes.txt line is ignored
    COMMENT_STR = "#"

    def __init__(self, phonemes=None, ipa_map=None):
        # List of Phoneme objects for this language
        self.phonemes = phonemes or []

        # Map from allophone IPA (or ",raw-regex") to canonical phoneme IPA
        self.ipa_map = ipa_map or {}

        # Compiled regex for replacing IPA (built in update())
        self._ipa_map_regex = None

        # Phonemes, split into component IPA phones, sorted by decreasing length
        self._phonemes_sorted = None

        # Map from original phoneme to gruut IPA
        self.gruut_ipa_map: typing.Dict[str, str] = {}

        # Set of phoneme IPA strings, used for fast string __contains__ checks.
        # NOTE: was previously initialized as a dict literal ({}) despite the
        # Set annotation; use an actual (empty) set.
        self.phoneme_texts: typing.Set[str] = set()

        self.update()

    def __iter__(self):
        """Iterate over Phoneme objects"""
        return iter(self.phonemes)

    def __len__(self):
        """Number of phonemes"""
        return len(self.phonemes)

    def __getitem__(self, key):
        """Get phoneme by index"""
        return self.phonemes[key]

    def __contains__(self, item):
        """True if item is one of this language's phonemes.

        Strings are compared against phoneme IPA text; anything else is
        compared against the Phoneme objects themselves.
        """
        if isinstance(item, str):
            # Compare IPA text
            return item in self.phoneme_texts

        return item in self.phonemes

    @staticmethod
    def from_language(language: str) -> "Phonemes":
        """Load phonemes for a given language.

        Reads ``phonemes.txt`` for the (alias-resolved) language and, if
        present, the optional ``ipa_map.txt`` mapping original phonemes to
        gruut IPA.
        """
        language = resolve_lang(language)

        # Load phonemes themselves
        phonemes_path = _DATA_DIR / language / "phonemes.txt"
        with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
            phonemes = Phonemes.from_text(phonemes_file)

        # Try to load optional map from original phoneme to gruut IPA
        gruut_ipa_map: typing.Optional[typing.Dict[str, str]] = None
        map_path = _DATA_DIR / language / "ipa_map.txt"
        if map_path.is_file():
            gruut_ipa_map = {}
            with open(map_path, "r", encoding="utf-8") as map_file:
                for line in map_file:
                    line = line.strip()
                    if not line:
                        continue

                    from_phoneme, to_ipa = line.split(maxsplit=1)
                    gruut_ipa_map[from_phoneme] = to_ipa

        if gruut_ipa_map:
            phonemes.gruut_ipa_map = gruut_ipa_map

        return phonemes

    @staticmethod
    def from_text(text_file) -> "Phonemes":
        """Load text file with phonemes, examples, and allophones.

        Line format:
            phoneme [example] [allophone] [allophone] ... ! [tone] [tone] ...
        """
        lang = Phonemes()

        for line in text_file:
            # Remove comments
            line, *_ = line.split(Phonemes.COMMENT_STR, maxsplit=1)
            line = line.strip()
            if line:
                # phoneme [example] [allophone] [allophone] ! [tone] [tone]...
                parts = line.split()
                phoneme_ipa = parts[0]
                example = ""

                if len(parts) > 1:
                    example = parts[1]

                tones = []
                if len(parts) > 2:
                    in_tone = False

                    # Map allophone back to phoneme
                    for part in parts[2:]:
                        if part == "!":
                            # Begin possible tones for this phoneme
                            in_tone = True
                        elif in_tone:
                            tones.append(part)
                        else:
                            lang.ipa_map[part] = phoneme_ipa

                lang.phonemes.append(
                    Phoneme(text=phoneme_ipa, example=example, tones=tones)
                )

        lang.update()

        return lang

    def update(self):
        """Call after modifying phonemes or IPA map to re-sort"""
        # Create single regex that will be used to replace IPA.
        # The final regex is of the form (AAA|BB|C) where each case is in
        # decreasing length order.
        #
        # If the replacement key is not a prefix of any phoneme, then the
        # replacement is straightforward.
        #
        # If it is a prefix of some phoneme, however, we need to be careful.
        # For example, naively replacing "e" with "eɪ" in the string "beɪ" will
        # produce "beeɪ" when we want it to be "beɪ".
        #
        # So the prefix case becomes "e(?!ɪ)" which uses a negative lookahead
        # to avoid the problem.
        cases = []
        for match_text in sorted(self.ipa_map.keys(), key=len, reverse=True):
            if match_text.startswith(","):
                # Raw regex
                cases.append(match_text[1:])
                continue

            # Collect the suffixes of every phoneme that match_text is a
            # proper prefix of.  (The previous implementation split at
            # num_extra = len(phoneme) - len(match_text) instead of at
            # len(match_text), which produced truncated/incorrect lookaheads
            # unless the phoneme happened to be exactly twice the length of
            # the match text, and it only guarded against the first
            # conflicting phoneme.)
            conflict_suffixes = [
                phoneme.text[len(match_text):]
                for phoneme in self.phonemes
                if (len(phoneme.text) > len(match_text))
                and phoneme.text.startswith(match_text)
            ]

            if conflict_suffixes:
                # Use negative lookahead(s) to avoid replacing part of a valid
                # phoneme.
                cases.append(
                    re.escape(match_text)
                    + "".join(
                        "(?!{})".format(re.escape(suffix))
                        for suffix in conflict_suffixes
                    )
                )
            else:
                # No prefix problem
                cases.append(re.escape(match_text))

        ipa_map_regex_str = "({})".format("|".join(cases))
        self._ipa_map_regex = re.compile(ipa_map_regex_str)

        # Split phonemes and sort by reverse length
        split_phonemes = [
            ([pb.text for pb in Pronunciation.from_string(p.text)], p)
            for p in self.phonemes
        ]

        self._phonemes_sorted = sorted(
            split_phonemes, key=lambda kp: len(kp[0]), reverse=True
        )

        # Update IPA texts set for phonemes
        self.phoneme_texts = set(p.text for p in self.phonemes)

    def split(
        self,
        pron_str: typing.Union[str, Pronunciation],
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        is_ipa: bool = True,
    ) -> typing.List[Phoneme]:
        """Split an IPA pronunciation into this language's phonemes.

        Args:
            pron_str: IPA string or an existing Pronunciation
            keep_stress: keep stress markers on matched phonemes
            keep_accents: keep accent markers (defaults to keep_stress)
            drop_tones: remove tone markers instead of keeping them
            is_ipa: when False, treat pron_str as plain graphemes

        Returns:
            List of matched Phoneme objects; unmatched pieces become
            Phoneme(unknown=True).
        """
        if not self._ipa_map_regex:
            self.update()

        if keep_accents is None:
            keep_accents = keep_stress

        word_phonemes: typing.List[Phoneme] = []

        if self.ipa_map:
            if isinstance(pron_str, Pronunciation):
                pron_str = "".join(p.text for p in pron_str)

            def handle_replace(match):
                text = match.group(1)
                return self.ipa_map.get(text, text)

            pron_str = self._ipa_map_regex.sub(handle_replace, pron_str)

        # Get text for IPA phones
        if isinstance(pron_str, Pronunciation):
            # Use supplied pronunciation
            ipas = [pb.text for pb in pron_str]
        elif is_ipa:
            # Split string into pronunciation
            pron = Pronunciation.from_string(
                pron_str,
                keep_stress=keep_stress,
                keep_accents=keep_accents,
                drop_tones=drop_tones,
            )
            ipas = [pb.text for pb in pron]
        else:
            ipas = IPA.graphemes(pron_str)

        # Keep stress and tones separate to make phoneme comparisons easier
        ipa_stress: typing.Dict[int, str] = defaultdict(str)
        ipa_tones: typing.Dict[int, str] = defaultdict(str)

        if is_ipa:
            in_tone = False
            for ipa_idx, ipa in enumerate(ipas):
                if ipa:
                    keep_ipa = ""
                    for codepoint in ipa:
                        if IPA.is_accent(codepoint) and (not in_tone):
                            if keep_accents:
                                ipa_stress[ipa_idx] += codepoint
                        elif IPA.is_stress(codepoint):
                            if keep_stress:
                                ipa_stress[ipa_idx] += codepoint
                        elif in_tone and (
                            codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
                        ):
                            # Interpret as part of the tone
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint
                        elif IPA.is_tone(codepoint):
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint

                            in_tone = True
                        else:
                            keep_ipa += codepoint

                    ipas[ipa_idx] = keep_ipa

        num_ipas: int = len(ipas)

        # ---------------------------------------------------------------------

        # pylint: disable=consider-using-enumerate
        for ipa_idx in range(len(ipas)):
            ipa = ipas[ipa_idx]
            if ipa is None:
                # Skip replaced piece
                continue

            phoneme_match = False
            for phoneme_ipas, phoneme in self._phonemes_sorted:
                if ipa_idx <= (num_ipas - len(phoneme_ipas)):
                    phoneme_match = True
                    phoneme_stress = ""
                    phoneme_tones = ""

                    # Look forward into sequence
                    for phoneme_idx in range(len(phoneme_ipas)):
                        phoneme_stress += ipa_stress[ipa_idx + phoneme_idx]
                        phoneme_tones += ipa_tones[ipa_idx + phoneme_idx]

                        if phoneme_ipas[phoneme_idx] != ipas[ipa_idx + phoneme_idx]:
                            phoneme_match = False
                            break

                    if phoneme_match:
                        # Successful match
                        if phoneme_stress or phoneme_tones:
                            # Create a copy of the phoneme with applied stress/tones
                            phoneme = Phoneme(
                                text=(phoneme_stress + phoneme.text + phoneme_tones),
                                example=phoneme.example,
                            )

                        word_phonemes.append(phoneme)

                        # Patch ipas to skip replaced pieces
                        for phoneme_idx in range(1, len(phoneme_ipas)):
                            ipas[ipa_idx + phoneme_idx] = None

                        break

            if not phoneme_match:
                # Add unknown phoneme
                word_phonemes.append(Phoneme(text=ipa, unknown=True))

        return word_phonemes
phones""" 19 | sampa_codepoints = unicodedata.normalize("NFD", sampa) 20 | 21 | return SAMPA_PATTERN.sub( 22 | lambda match: SAMPA_TO_IPA.get(match.group(1), ""), sampa_codepoints 23 | ) 24 | 25 | 26 | # ----------------------------------------------------------------------------- 27 | 28 | IPA_TO_SAMPA = { 29 | "\u0061": "a", 30 | "\u0250": "6", 31 | "\u0251": "A", 32 | "\u0252": "Q", 33 | "\u00e6": "{", 34 | "\u028c": "V", 35 | "\u0062": "b", 36 | "\u0253": "", 37 | "\u0299": "B\\", 38 | "\u03b2": "B", 39 | "\u0063": "c", 40 | "\u00e7": "C", 41 | "\u0063\u0327": "C", 42 | "\u0255": "s\\", 43 | "\u0064": "d", 44 | "\u0257": "", 45 | "\u0256": "d`", 46 | "\u00f0": "D", 47 | "\u0065": "e", 48 | "\u0259": "@", 49 | "\u025a": "@`", 50 | "\u0258": "@\\", 51 | "\u025b": "E", 52 | "\u025c": "3", 53 | "\u025d": "@`", 54 | "\u025e": "3\\", 55 | "\u0066": "f", 56 | "\u0261": "g", 57 | "\u0067": "g", 58 | "\u0260": "", 59 | "\u0262": "G\\", 60 | "\u029b": "G\\_<", 61 | "\u0263": "G", 62 | "\u02e0": "_G", 63 | "\u0264": "7", 64 | "\u0068": "h", 65 | "\u02b0": "_h", 66 | "\u0127": "X\\", 67 | "\u0266": "h\\", 68 | "\u0267": "x\\", 69 | "\u0265": "H", 70 | "\u029c": "H\\", 71 | "\u0069": "i", 72 | "\u0268": "1", 73 | "\u026a": "I", 74 | "\u006a": "j", 75 | "\u02b2": "', _j", 76 | "\u029d": "j\\", 77 | "\u025f": "J\\", 78 | "\u0284": "J\\_<", 79 | "\u006b": "k", 80 | "\u006c": "l", 81 | "\u026b": "5", 82 | "\u026c": "K", 83 | "\u026d": "l`", 84 | "\u026e": "K\\", 85 | "\u029f": "L\\", 86 | "\u006d": "m", 87 | "\u0271": "F", 88 | "\u026f": "M", 89 | "\u0270": "M\\", 90 | "\u006e": "n", 91 | "\u0272": "J", 92 | "\u014b": "N", 93 | "\u0273": "n`", 94 | "\u0274": "N\\", 95 | "\u006f": "o", 96 | "\u0298": "O\\", 97 | "\u0275": "8", 98 | "\u00f8": "2", 99 | "\u0153": "9", 100 | "\u0276": "&", 101 | "\u0254": "O", 102 | "\u0070": "p", 103 | "\u0278": "p\\", 104 | "\u0071": "q", 105 | "\u0072": "r", 106 | "\u027e": "4", 107 | "\u027c": "", 108 | "\u027d": "r`", 109 | "\u0279": "r\\", 
110 | "\u027b": "r\\`", 111 | "\u027a": "l\\", 112 | "\u0280": "R\\", 113 | "\u0281": "R", 114 | "\u0073": "s", 115 | "\u0282": "s`", 116 | "\u0283": "S", 117 | "\u0074": "t", 118 | "\u0288": "t`", 119 | "\u03b8": "T", 120 | "\u0075": "u", 121 | "\u0289": "}", 122 | "\u028a": "U", 123 | "\u0076": "v", 124 | "\u028b": "v\\", 125 | "\u0077": "w", 126 | "\u02b7": "_w", 127 | "\u028d": "W", 128 | "\u0078": "x", 129 | "\u03c7": "X", 130 | "\u0079": "y", 131 | "\u028e": "L", 132 | "\u028f": "Y", 133 | "\u007a": "z", 134 | "\u0291": "z\\", 135 | "\u0290": "z`", 136 | "\u0292": "Z", 137 | "\u0294": "?", 138 | "\u02a1": ">\\", 139 | "\u0295": "?\\", 140 | "\u02a2": "<\\", 141 | "\u02e4": "_?\\", 142 | "\u01c3": "!\\", 143 | "\u01c0": "|\\", 144 | "\u01c1": "|\\|\\", 145 | "\u0320": "_-", 146 | "\u032a": "_d", 147 | "\u033a": "_a", 148 | "\u031f": "_+", 149 | "\u031d": "_r", 150 | "\u031e": "_o", 151 | "\u02c8": '"', 152 | "\u02cc": "%", 153 | "\u031a": "_}", 154 | "\u002e": "", 155 | "\u02d1": ":\\", 156 | "\u0308": '_"', 157 | "\u0324": "_t", 158 | "\u02d0": ":", 159 | "\u02bc": "", 160 | "\u0325": "_0", 161 | "\u030a": "", 162 | "\u031c": "_c", 163 | "\u0339": "_O", 164 | "\u0303": "~, _~", 165 | "\u0334": "_e", 166 | "\u0330": "_k", 167 | "\u032c": "_v", 168 | "\u0306": "_X", 169 | "\u032f": "_^", 170 | "\u033d": "", 171 | "\u02de": "`", 172 | "\u033b": "_m", 173 | "\u0318": "_A", 174 | "\u0319": "_q", 175 | "\u033c": "_N", 176 | "\u2197": "", 177 | "\u2191": "^", 178 | "\u2198": "", 179 | "\u2193": "!", 180 | "\u030f": "_B", 181 | "\u0300": "_L", 182 | "\u0304": "_M", 183 | "\u0301": "_H", 184 | "\u030b": "_T", 185 | # 186 | # Ties 187 | "\u0361": "", 188 | "\u035C": "", 189 | # 190 | # Tied symbols 191 | "\u0288\u0361\u0282": "ts`", 192 | "\u0256\u0361\u0290": "dz`", 193 | "\u006b\u0361\u0078": "k_x", 194 | # 195 | # Breaks 196 | "|": "", 197 | "\u2016": "", 198 | "#": "", 199 | } 200 | 201 | SAMPA_TO_IPA = {v: k for k, v in IPA_TO_SAMPA.items() if v} 202 | 203 | # 
def resolve_lang(lang: str) -> str:
    """Resolve language with known aliases.

    A language of the form "<lang>/<rest>" has only its "<lang>" portion
    resolved against the alias table; the remainder is reattached unchanged.
    """
    base, slash, remainder = lang.partition("/")
    resolved = LANG_ALIASES.get(base, base)

    if slash:
        return f"{resolved}/{remainder}"

    return resolved
-------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | 2 | [mypy] 3 | 4 | [mypy-setuptools.*] 5 | ignore_missing_imports = True 6 | 7 | [mypy-sklearn.*] 8 | ignore_missing_imports = True 9 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable= 3 | format, 4 | abstract-class-little-used, 5 | abstract-method, 6 | cyclic-import, 7 | duplicate-code, 8 | global-statement, 9 | import-outside-toplevel, 10 | inconsistent-return-statements, 11 | locally-disabled, 12 | not-context-manager, 13 | redefined-variable-type, 14 | too-few-public-methods, 15 | too-many-arguments, 16 | too-many-branches, 17 | too-many-instance-attributes, 18 | too-many-lines, 19 | too-many-locals, 20 | too-many-public-methods, 21 | too-many-return-statements, 22 | too-many-statements, 23 | too-many-boolean-expressions, 24 | unnecessary-pass, 25 | unused-argument, 26 | broad-except, 27 | too-many-nested-blocks, 28 | invalid-name, 29 | unused-import 30 | 31 | [FORMAT] 32 | expected-line-ending-format=LF -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | black==19.10b0 2 | coverage==5.0.4 3 | flake8==3.7.9 4 | mypy==0.910 5 | pylint==2.10.2 6 | pytest==5.4.1 7 | pytest-cov==2.8.1 8 | -------------------------------------------------------------------------------- /requirements_test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /scripts/check-code.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set 
-e 3 | 4 | # Directory of *this* script 5 | this_dir="$( cd "$( dirname "$0" )" && pwd )" 6 | src_dir="$(realpath "${this_dir}/..")" 7 | 8 | venv="${src_dir}/.venv" 9 | if [[ -d "${venv}" ]]; then 10 | source "${venv}/bin/activate" 11 | fi 12 | 13 | python_files=("${src_dir}/gruut_ipa/"*.py "${src_dir}/setup.py") 14 | python_files+=("${src_dir}/tests/"*.py) 15 | 16 | # ----------------------------------------------------------------------------- 17 | 18 | flake8 "${python_files[@]}" 19 | pylint "${python_files[@]}" 20 | mypy "${python_files[@]}" 21 | black --check "${python_files[@]}" 22 | isort --check-only "${python_files[@]}" 23 | 24 | # ----------------------------------------------------------------------------- 25 | 26 | echo "OK" 27 | -------------------------------------------------------------------------------- /scripts/create-venv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | if [[ -z "${PIP_INSTALL}" ]]; then 5 | PIP_INSTALL='install' 6 | fi 7 | 8 | # Directory of *this* script 9 | this_dir="$( cd "$( dirname "$0" )" && pwd )" 10 | src_dir="$(realpath "${this_dir}/..")" 11 | 12 | # ----------------------------------------------------------------------------- 13 | 14 | venv="${src_dir}/.venv" 15 | 16 | # ----------------------------------------------------------------------------- 17 | 18 | : "${PYTHON=python3}" 19 | 20 | python_version="$(${PYTHON} --version)" 21 | 22 | # Create virtual environment 23 | echo "Creating virtual environment at ${venv} (${python_version})" 24 | rm -rf "${venv}" 25 | "${PYTHON}" -m venv "${venv}" 26 | source "${venv}/bin/activate" 27 | 28 | # Install Python dependencies 29 | echo "Installing Python dependencies" 30 | pip3 ${PIP_INSTALL} --upgrade pip 31 | pip3 ${PIP_INSTALL} --upgrade wheel setuptools 32 | 33 | pip3 ${PIP_INSTALL} "${src_dir}" 34 | 35 | # Development dependencies 36 | if [[ -f requirements_dev.txt ]]; then 37 | pip3 
${PIP_INSTALL} -r requirements_dev.txt || echo "Failed to install development dependencies" >&2 38 | fi 39 | 40 | # ----------------------------------------------------------------------------- 41 | 42 | echo "OK" 43 | -------------------------------------------------------------------------------- /scripts/format-code.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Directory of *this* script 5 | this_dir="$( cd "$( dirname "$0" )" && pwd )" 6 | src_dir="$(realpath "${this_dir}/..")" 7 | 8 | venv="${src_dir}/.venv" 9 | if [[ -d "${venv}" ]]; then 10 | source "${venv}/bin/activate" 11 | fi 12 | 13 | python_files=("${src_dir}/gruut_ipa/"*.py "${src_dir}/setup.py") 14 | python_files+=("${src_dir}/tests/"*.py) 15 | 16 | # ----------------------------------------------------------------------------- 17 | 18 | black "${python_files[@]}" 19 | isort "${python_files[@]}" 20 | 21 | # ----------------------------------------------------------------------------- 22 | 23 | echo "OK" 24 | -------------------------------------------------------------------------------- /scripts/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Directory of *this* script 5 | this_dir="$( cd "$( dirname "$0" )" && pwd )" 6 | src_dir="$(realpath "${this_dir}/..")" 7 | 8 | venv="${src_dir}/.venv" 9 | if [[ -d "${venv}" ]]; then 10 | source "${venv}/bin/activate" 11 | fi 12 | 13 | python_files=("${src_dir}/tests/"*.py) 14 | 15 | # ----------------------------------------------------------------------------- 16 | 17 | python3 -m unittest "${python_files[@]}" 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # To work with Black 3 | max-line-length = 88 4 | # E501: line too long 5 | # W503: 
Line break occurred before a binary operator 6 | # E203: Whitespace before ':' 7 | # D202 No blank lines allowed after function docstring 8 | # W504 line break after binary operator 9 | ignore = 10 | E501, 11 | W503, 12 | E203, 13 | D202, 14 | W504 15 | 16 | # F401 import unused 17 | per-file-ignores = 18 | dodo.py:F401 19 | 20 | [isort] 21 | multi_line_output = 3 22 | include_trailing_comma=True 23 | force_grid_wrap=0 24 | use_parentheses=True 25 | line_length=88 26 | indent = " " -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup file for gruut-ipa""" 2 | from pathlib import Path 3 | 4 | import setuptools 5 | 6 | this_dir = Path(__file__).parent 7 | 8 | # ----------------------------------------------------------------------------- 9 | 10 | # Load README in as long description 11 | long_description: str = "" 12 | readme_path = this_dir / "README.md" 13 | if readme_path.is_file(): 14 | long_description = readme_path.read_text(encoding="UTF-8") 15 | 16 | requirements = [] 17 | requirements_path = this_dir / "requirements.txt" 18 | if requirements_path.is_file(): 19 | with open(requirements_path, "r", encoding="utf-8") as requirements_file: 20 | requirements = requirements_file.read().splitlines() 21 | 22 | version_path = this_dir / "VERSION" 23 | with open(version_path, "r", encoding="utf-8") as version_file: 24 | version = version_file.read().strip() 25 | 26 | # ----------------------------------------------------------------------------- 27 | 28 | module_dir = this_dir / "gruut_ipa" 29 | data_dir = module_dir / "data" 30 | data_files = [str(f.relative_to(module_dir)) for f in data_dir.rglob("*")] 31 | 32 | setuptools.setup( 33 | name="gruut-ipa", 34 | description="Library for manipulating pronunciations using the International Phonetic Alphabet (IPA)", 35 | version=version, 36 | author="Michael Hansen", 37 | 
author_email="mike@rhasspy.org", 38 | url="https://github.com/rhasspy/gruut-ipa", 39 | packages=setuptools.find_packages(), 40 | package_data={"gruut_ipa": data_files + ["py.typed"]}, 41 | install_requires=requirements, 42 | extras_require={':python_version<"3.7"': "dataclasses"}, 43 | entry_points={"console_scripts": ["gruut-ipa = gruut_ipa.__main__:main"]}, 44 | classifiers=[ 45 | "Programming Language :: Python :: 3", 46 | "Programming Language :: Python :: 3.6", 47 | "Programming Language :: Python :: 3.7", 48 | "Programming Language :: Python :: 3.8", 49 | "Programming Language :: Python :: 3.9", 50 | "License :: OSI Approved :: MIT License", 51 | ], 52 | long_description=long_description, 53 | long_description_content_type="text/markdown", 54 | python_requires=">=3.6", 55 | ) 56 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhasspy/gruut-ipa/ec9ae6ce7cca0103d9a563fcde3352f805a8e27e/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_accent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Tests for phoneme conversion between languages""" 3 | import unittest 4 | 5 | from gruut_ipa import Phonemes 6 | from gruut_ipa.accent import guess_phonemes 7 | 8 | 9 | class AccentTestCase(unittest.TestCase): 10 | """Test cases for phoneme conversion between languages""" 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.de_phonemes = Phonemes.from_language("de-de") 15 | 16 | def test_exact(self): 17 | """Test exact match""" 18 | guessed = guess_phonemes("k", self.de_phonemes) 19 | 20 | self.assertEqual(len(guessed.phonemes), 1) 21 | self.assertEqual(guessed.phonemes[0].text, "k") 22 | 23 | def test_letters(self): 24 | """Test matching letters""" 25 | guessed = 
guess_phonemes("ɐ̯ː", self.de_phonemes) 26 | 27 | self.assertEqual(len(guessed.phonemes), 1) 28 | self.assertEqual(guessed.phonemes[0].text, "ɐ") 29 | 30 | def test_close_vowel(self): 31 | """Test nearby vowel""" 32 | guessed = guess_phonemes("ɑ", self.de_phonemes) 33 | 34 | self.assertEqual(len(guessed.phonemes), 1) 35 | 36 | # Placement is more important that height 37 | self.assertEqual(guessed.phonemes[0].text, "ɐ") 38 | 39 | def test_close_consonant(self): 40 | """Test nearby consonant""" 41 | guessed = guess_phonemes("ð", self.de_phonemes) 42 | 43 | self.assertEqual(len(guessed.phonemes), 1) 44 | 45 | # Should match a nearby voiced consonant 46 | self.assertIn(guessed.phonemes[0].text, {"v", "z"}) 47 | 48 | def test_dipthong_letters_match(self): 49 | """Test dipthong (two vowels) with matching letters""" 50 | guessed = guess_phonemes("aʊ", self.de_phonemes) 51 | 52 | self.assertEqual(len(guessed.phonemes), 1) 53 | self.assertEqual(guessed.phonemes[0].text, "aʊ̯") 54 | 55 | def test_dipthong_split(self): 56 | """Test dipthong (two vowels) split into two phonemes""" 57 | guessed = guess_phonemes("oʊ", self.de_phonemes) 58 | 59 | self.assertEqual(len(guessed.phonemes), 2) 60 | self.assertEqual(guessed.phonemes[0].text, "oː") 61 | self.assertEqual(guessed.phonemes[1].text, "ʊ") 62 | 63 | def test_g(self): 64 | """Test ɡ/g mapping""" 65 | from gruut_ipa.accent import GS 66 | 67 | for g in GS: 68 | guessed = guess_phonemes(g, self.de_phonemes) 69 | 70 | self.assertEqual(len(guessed.phonemes), 1) 71 | self.assertIn(guessed.phonemes[0].text, GS) 72 | 73 | def test_r(self): 74 | """Test r-like mapping""" 75 | from gruut_ipa.accent import R_LIKE 76 | 77 | for r in R_LIKE: 78 | guessed = guess_phonemes(r, self.de_phonemes) 79 | 80 | self.assertEqual(len(guessed.phonemes), 1) 81 | self.assertIn(guessed.phonemes[0].text, R_LIKE) 82 | 83 | def test_schwa(self): 84 | """Test schwa mapping""" 85 | from gruut_ipa.accent import R_LIKE, SCHWA_PREFERRED 86 | from 
class DistancesTestCase(unittest.TestCase):
    """Test cases for phoneme distances"""

    # Fix: the vowel and consonant assertions were swapped between methods.
    # "ɑ" (open back unrounded vowel) belongs in the vowel test; "p"
    # (voiceless bilabial plosive) and "ʝ" (voiced palatal fricative) are
    # consonants.

    def test_vowels(self):
        """Test distances for vowels"""
        self.assertEqual(get_closest("ɑ")[0], "ɒ")

    def test_consonants(self):
        """Test distances for consonants"""
        self.assertEqual(get_closest("p")[0], "t")
        self.assertEqual(get_closest("ʝ")[0], "ç")

    def test_schwas(self):
        """Test distances for schwas"""
        self.assertEqual(get_closest("ɝ")[0], "ɚ")
test_vowels(self): 24 | """Test to/from feature vector for vowels""" 25 | for vowel in VOWELS.values(): 26 | if vowel.alias_of: 27 | continue 28 | 29 | feat_vec = to_vector(vowel) 30 | self.assertEqual(vowel, from_vector(feat_vec)) 31 | 32 | # Test with stress 33 | for stress in Stress: 34 | vowel_stressed = dataclasses.replace(vowel, stress=stress) 35 | feat_vec = to_vector(vowel_stressed) 36 | self.assertEqual(vowel_stressed, from_vector(feat_vec)) 37 | 38 | def test_consonants(self): 39 | """Test to/from feature vector for consonants""" 40 | for consonant in CONSONANTS.values(): 41 | if consonant.alias_of: 42 | continue 43 | 44 | feat_vec = to_vector(consonant) 45 | self.assertEqual(consonant, from_vector(feat_vec)) 46 | 47 | def test_schwas(self): 48 | """Test to/from feature vector for schwas""" 49 | for schwa in SCHWAS.values(): 50 | if schwa.alias_of: 51 | continue 52 | 53 | feat_vec = to_vector(schwa) 54 | self.assertEqual(schwa, from_vector(feat_vec)) 55 | 56 | def test_breaks(self): 57 | """Test to/from feature vector for breaks""" 58 | for break_type in BreakType: 59 | ipa_break = Break(break_type) 60 | feat_vec = to_vector(ipa_break) 61 | self.assertEqual(ipa_break, from_vector(feat_vec)) 62 | 63 | def test_string_to_symbol(self): 64 | """Test symbol parsing""" 65 | self.assertEqual( 66 | string_to_symbol("ˈãː"), 67 | dataclasses.replace( 68 | VOWELS["ã"], stress=Stress.PRIMARY, length=PhonemeLength.LONG 69 | ), 70 | ) 71 | 72 | self.assertEqual( 73 | string_to_symbol("ɫː"), 74 | dataclasses.replace(CONSONANTS["ɫ"], length=PhonemeLength.LONG), 75 | ) 76 | 77 | self.assertEqual( 78 | string_to_symbol("ɚː"), 79 | dataclasses.replace(SCHWAS["ɚ"], length=PhonemeLength.LONG), 80 | ) 81 | 82 | 83 | # ----------------------------------------------------------------------------- 84 | 85 | if __name__ == "__main__": 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/test_phone.py: 
class PhoneTestCase(unittest.TestCase):
    """Test cases for Phone class"""

    def test_from_string(self):
        """Test Phone.from_string"""
        # Assemble ˈãː: primary stress + "a" + nasal diacritic + long marker
        ipa_str = IPA.STRESS_PRIMARY + "a" + IPA.NASAL + IPA.LONG

        phone = Phone.from_string(ipa_str)

        # Important: text is NFC normalized, so combining characters are
        # eliminated if possible.
        expected_text = unicodedata.normalize("NFC", "ˈãː")
        self.assertEqual(phone.text, expected_text)

        # Decomposed structure
        self.assertEqual(phone.letters, "a")
        self.assertEqual(phone.diacritics[0], {IPA.NASAL})
        self.assertEqual(phone.suprasegmentals, {IPA.STRESS_PRIMARY, IPA.LONG})

        # Derived properties
        self.assertEqual(phone.stress, Stress.PRIMARY)
        self.assertTrue(phone.is_nasal)
        self.assertTrue(phone.is_long)

        # Vowel classification
        self.assertTrue(phone.is_vowel)
        self.assertEqual(phone.vowel.height, VowelHeight.OPEN)
        self.assertEqual(phone.vowel.placement, VowelPlacement.FRONT)
14 | pron_str = "/dʒʌst ə kˈaʊ/" 15 | 16 | lang_phonemes = Phonemes.from_language("en-us") 17 | pron_phonemes = lang_phonemes.split(pron_str, keep_stress=True) 18 | 19 | # Ensure "d ʒ" -> "d͡ʒ" and "a ʊ" -> "aʊ" 20 | phoneme_strs = [p.text for p in pron_phonemes] 21 | self.assertEqual(phoneme_strs, ["d͡ʒ", "ʌ", "s", "t", "ə", "k", "ˈaʊ"]) 22 | 23 | def test_split_substring(self): 24 | """Test Phonemes.split with a substring replacement""" 25 | pron_str = "/viːtɛt͡ʃnaː/" 26 | 27 | lang_phonemes = Phonemes.from_language("cs-cz") 28 | pron_phonemes = lang_phonemes.split(pron_str, keep_stress=False) 29 | 30 | # Ensure iː doesn't get transformed into ɪː 31 | phoneme_strs = [p.text for p in pron_phonemes] 32 | self.assertEqual(phoneme_strs, ["v", "iː", "t", "ɛ", "t͡ʃ", "n", "aː"]) 33 | 34 | def test_split_diacritics(self): 35 | """Test Phonemes.split with a diacritic substring replacement""" 36 | pron_str = "/ɑɑ̃/" 37 | 38 | lang_phonemes = Phonemes.from_language("fr-fr") 39 | pron_phonemes = lang_phonemes.split(pron_str, keep_stress=False) 40 | 41 | # Ensure first ɑ is transformed into a, but not the second 42 | phoneme_strs = [p.text for p in pron_phonemes] 43 | self.assertEqual(phoneme_strs, ["a", "ɑ̃"]) 44 | 45 | def test_dipthong(self): 46 | """Test Phonemes.from_string with a dipthong""" 47 | # ampliam 48 | pron_str = "/ɐ̃pliɐ̃w̃/" 49 | 50 | lang_phonemes = Phonemes.from_language("pt") 51 | pron_phonemes = lang_phonemes.split(pron_str) 52 | 53 | # Ensure "ɐ̃" and "ɐ̃w̃" are kept 54 | phoneme_strs = [p.text for p in pron_phonemes] 55 | self.assertEqual(phoneme_strs, ["ɐ̃", "p", "l", "i", "ɐ̃w̃"]) 56 | 57 | def test_split_dipthong(self): 58 | """Test Phonemes.split with a dipthong""" 59 | pron_str = "/neu̯rt͡ʃɪtou̯/" 60 | 61 | lang_phonemes = Phonemes.from_language("cs-cz") 62 | pron_phonemes = lang_phonemes.split(pron_str, keep_stress=False) 63 | 64 | # Ensure eu̯ ends up as eu̯ 65 | phoneme_strs = [p.text for p in pron_phonemes] 66 | self.assertEqual(phoneme_strs, 
["n", "eu̯", "r", "t͡ʃ", "ɪ", "t", "ou̯"]) 67 | 68 | def test_tones(self): 69 | """Test Phonemes.split with tones""" 70 | # á khôi 71 | pron_str = "/a˨˦xoj˧˧/" 72 | 73 | lang_phonemes = Phonemes.from_language("vi-n") 74 | pron_phonemes = lang_phonemes.split(pron_str) 75 | 76 | # Ensure tones are kept 77 | phoneme_strs = [p.text for p in pron_phonemes] 78 | self.assertEqual(phoneme_strs, ["a˨˦", "x", "oj˧˧"]) 79 | 80 | 81 | # ----------------------------------------------------------------------------- 82 | 83 | if __name__ == "__main__": 84 | unittest.main() 85 | -------------------------------------------------------------------------------- /tests/test_pronunciation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Tests for Pronunciation class""" 3 | import unittest 4 | 5 | from gruut_ipa import IPA, Pronunciation 6 | 7 | 8 | class PronunciationTestCase(unittest.TestCase): 9 | """Test cases for Pronunciation class""" 10 | 11 | def test_from_string(self): 12 | """Test Pronuncation.from_string""" 13 | # "Yes, choose IPA." 
14 | pron_str = "↗ˈjɛs|ˈt͡ʃuːz#↘aɪpiːeɪ‖" 15 | 16 | pron = Pronunciation.from_string(pron_str, keep_stress=False) 17 | 18 | phone_strs = [p.text for p in pron.phones] 19 | self.assertEqual( 20 | phone_strs, ["j", "ɛ", "s", "t͡ʃ", "uː", "z", "a", "ɪ", "p", "iː", "e", "ɪ"] 21 | ) 22 | 23 | phone_strs = [p.text for p in pron] 24 | self.assertEqual( 25 | phone_strs, 26 | [ 27 | IPA.INTONATION_RISING, 28 | "j", 29 | "ɛ", 30 | "s", 31 | IPA.BREAK_MINOR, 32 | "t͡ʃ", 33 | "uː", 34 | "z", 35 | IPA.BREAK_WORD, 36 | IPA.INTONATION_FALLING, 37 | "a", 38 | "ɪ", 39 | "p", 40 | "iː", 41 | "e", 42 | "ɪ", 43 | IPA.BREAK_MAJOR, 44 | ], 45 | ) 46 | 47 | def test_diacritics(self): 48 | """Test Pronuncation.from_string with extra diacritics""" 49 | pron_str = "ɔʊ̯" 50 | pron = Pronunciation.from_string(pron_str) 51 | 52 | self.assertEqual(pron.text, pron_str) 53 | 54 | def test_tones(self): 55 | """Test Pronuncation.from_string with tone numbers""" 56 | pron_str = "/hwiən˧˨ ziəw˨ˀ˩ʔ/" 57 | pron = Pronunciation.from_string(pron_str) 58 | 59 | phone_strs = [p.text for p in pron] 60 | self.assertEqual( 61 | phone_strs, ["h", "w", "i", "ə", "n˧˨", "z", "i", "ə", "w˨ˀ˩ʔ"] 62 | ) 63 | 64 | def test_accents(self): 65 | """Test Pronuncation.from_string with accents""" 66 | pron_str = "/²'alːdɑːglɪg/" 67 | pron = Pronunciation.from_string(pron_str) 68 | 69 | phone_strs = [p.text for p in pron] 70 | self.assertEqual(phone_strs, ["²'a", "lː", "d", "ɑː", "g", "l", "ɪ", "g"]) 71 | 72 | 73 | # ----------------------------------------------------------------------------- 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36,py37,py38,py39 3 | 4 | [testenv] 5 | deps = -r{toxinidir}/requirements_test.txt 6 | commands = 7 | pytest 8 | 
--------------------------------------------------------------------------------