├── .gitignore ├── README.md ├── __init__.py ├── manual.pdf ├── syllabify.py └── wcm.py /.gitignore: -------------------------------------------------------------------------------- 1 | manual/* 2 | eval/* 3 | *.py[co] 4 | .DS_Store 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Port for Python 3 2 | 3 | syllabify.py is a Python module for syllabifying ARPABET transcriptions; 4 | the method used is informed by subtle details of English phonology. 5 | 6 | * See `manual.pdf` for usage 7 | * See `syllabify.py` for the license and API 8 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kylebgorman/syllabify/d816db784436e9de87ec1ef9bd11b8e229853710/__init__.py -------------------------------------------------------------------------------- /manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kylebgorman/syllabify/d816db784436e9de87ec1ef9bd11b8e229853710/manual.pdf -------------------------------------------------------------------------------- /syllabify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2012-2013 Kyle Gorman 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the 6 | # "Software"), to deal in the Software without restriction, including 7 | # without limitation the rights to use, copy, modify, merge, publish, 8 | # distribute, sublicense, and/or sell copies of the Software, and to 9 | # permit persons to whom the Software is furnished to do so, subject to 10 | # the following conditions: 11 | # 12 | # The
above copyright notice and this permission notice shall be included 13 | # in all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | # 23 | # syllabify.py: prosodic parsing of ARPABET entries 24 | 25 | from itertools import chain 26 | 27 | ## constants 28 | SLAX = {'IH1', 'IH2', 'EH1', 'EH2', 'AE1', 'AE2', 'AH1', 'AH2', 29 | 'UH1', 'UH2',} 30 | VOWELS = {'IY1', 'IY2', 'IY0', 'EY1', 'EY2', 'EY0', 'AA1', 'AA2', 'AA0', 31 | 'ER1', 'ER2', 'ER0', 'AW1', 'AW2', 'AW0', 'AO1', 'AO2', 'AO0', 32 | 'AY1', 'AY2', 'AY0', 'OW1', 'OW2', 'OW0', 'OY1', 'OY2', 'OY0', 33 | 'IH0', 'EH0', 'AE0', 'AH0', 'UH0', 'UW1', 'UW2', 'UW0', 'UW', 34 | 'IY', 'EY', 'AA', 'ER', 'AW', 'AO', 'AY', 'OW', 'OY', 35 | 'UH', 'IH', 'EH', 'AE', 'AH', 'UH',} | SLAX 36 | 37 | ## licit medial onsets 38 | 39 | O2 = {('P', 'R'), ('T', 'R'), ('K', 'R'), ('B', 'R'), ('D', 'R'), 40 | ('G', 'R'), ('F', 'R'), ('TH', 'R'), 41 | ('P', 'L'), ('K', 'L'), ('B', 'L'), ('G', 'L'), 42 | ('F', 'L'), ('S', 'L'), 43 | ('K', 'W'), ('G', 'W'), ('S', 'W'), 44 | ('S', 'P'), ('S', 'T'), ('S', 'K'), 45 | ('HH', 'Y'), # "clerihew" 46 | ('R', 'W'),} 47 | O3 = {('S', 'T', 'R'), ('S', 'K', 'L'), ('T', 'R', 'W')} # "octroi" 48 | 49 | # This does not represent anything like a complete list of onsets, but 50 | # merely those that need to be maximized in medial position. 
51 | 52 | def syllabify(pron, alaska_rule=True): 53 | """ 54 | Syllabifies a CMU dictionary (ARPABET) word string 55 | 56 | # Alaska rule: 57 | >>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska 58 | '-AH0-.L-AE1-S.K-AH0-' 59 | >>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska 60 | '-AH0-.L-AE1-.S K-AH0-' 61 | 62 | # huge medial onsets: 63 | >>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel 64 | 'M-IH1-N.S T R-AH0-L' 65 | >>> pprint(syllabify('AA1 K T R W AA0 R'.split())) # octroi 66 | '-AA1-K.T R W-AA0-R' 67 | 68 | # destressing 69 | >>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split()))) 70 | 'M-IH-.L-AH-.T-EH-.R-IY-' 71 | 72 | # normal treatment of 'j': 73 | >>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu 74 | 'M-EH1-N.Y-UW0-' 75 | >>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel 76 | 'S P-AE1-N.Y-AH0-L' 77 | >>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon 78 | 'K-AE1-N.Y-AH0-N' 79 | >>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet 80 | 'M-IH0-N.Y-UW2-.-EH1-T' 81 | >>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior 82 | 'JH-UW1-N.Y-ER0-' 83 | >>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew 84 | 'K L-EH-.R-IH-.HH Y-UW-' 85 | 86 | # nuclear treatment of 'j' 87 | >>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue 88 | 'R-EH1-S.K-Y UW0-' 89 | >>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute 90 | 'T R-IH1-B.Y-UW0-T' 91 | >>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula 92 | 'N-EH1-B.Y-AH0-.L-AH0-' 93 | >>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula 94 | 'S P-AE1-.CH-UH0-.L-AH0-' 95 | >>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen 96 | '-AH0-K.Y-UW1-.M-AH0-N' 97 | >>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent 98 | 'S-AH1-K.Y-AH0-.L-IH0-N T' 99 | >>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula 100 | 'F-AO1 R-M.Y-AH0-.L-AH0-' 101 | >>> pprint(syllabify('V AE1 
L Y UW0'.split())) # value 102 | 'V-AE1-L.Y-UW0-' 103 | 104 | # everything else 105 | >>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic 106 | 'N-AO0-.S T-AE1-L.JH-IH0-K' 107 | >>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen 108 | 'CH-ER1-CH.M-AH0-N' 109 | >>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate 110 | 'K-AA1-M.P-AH0-N.S-EY2-T' 111 | >>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE 112 | '-IH0-N.S-EH1-N S' 113 | >>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense 114 | '-IH1-N.S-EH2-N S' 115 | >>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend 116 | '-AH0-.S-EH1-N D' 117 | >>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate 118 | 'R-OW1-.T-EY2-T' 119 | >>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist 120 | '-AA1 R-.T-AH0-S T' 121 | >>> pprint(syllabify('AE1 K T ER0'.split())) # actor 122 | '-AE1-K.T-ER0-' 123 | >>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster 124 | 'P L-AE1-S.T-ER0-' 125 | >>> pprint(syllabify('B AH1 T ER0'.split())) # butter 126 | 'B-AH1-.T-ER0-' 127 | >>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel 128 | 'K-AE1-.M-AH0-L' 129 | >>> pprint(syllabify('AH1 P ER0'.split())) # upper 130 | '-AH1-.P-ER0-' 131 | >>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon 132 | 'B-AH0-.L-UW1-N' 133 | >>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim 134 | 'P R-OW0-.K L-EY1-M' 135 | >>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane 136 | '-IH0-N.S-EY1-N' 137 | >>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude 138 | '-IH0-K.S K L-UW1-D' 139 | """ 140 | ## main pass 141 | mypron = list(pron) 142 | nuclei = [] 143 | onsets = [] 144 | i = -1 145 | for (j, seg) in enumerate(mypron): 146 | if seg in VOWELS: 147 | nuclei.append([seg]) 148 | onsets.append(mypron[i + 1:j]) # actually interludes, r.n. 
149 | i = j 150 | codas = [mypron[i + 1:]] 151 | ## resolve disputes and compute coda 152 | for i in range(1, len(onsets)): 153 | coda = [] 154 | # boundary cases 155 | if len(onsets[i]) > 1 and onsets[i][0] == 'R': 156 | nuclei[i - 1].append(onsets[i].pop(0)) 157 | if len(onsets[i]) > 2 and onsets[i][-1] == 'Y': 158 | nuclei[i].insert(0, onsets[i].pop()) 159 | if len(onsets[i]) > 1 and alaska_rule and nuclei[i-1][-1] in SLAX \ 160 | and onsets[i][0] == 'S': 161 | coda.append(onsets[i].pop(0)) 162 | # onset maximization 163 | depth = 1 164 | if len(onsets[i]) > 1: 165 | if tuple(onsets[i][-2:]) in O2: 166 | depth = 3 if tuple(onsets[i][-3:]) in O3 else 2 167 | for j in range(len(onsets[i]) - depth): 168 | coda.append(onsets[i].pop(0)) 169 | # store coda 170 | codas.insert(i - 1, coda) 171 | 172 | ## verify that all segments are included in the ouput 173 | output = list(zip(onsets, nuclei, codas)) # in Python3 zip is a generator 174 | flat_output = list(chain.from_iterable(chain.from_iterable(output))) 175 | if flat_output != mypron: 176 | raise ValueError(f"could not syllabify {mypron}, got {flat_output}") 177 | return output 178 | 179 | 180 | def pprint(syllab): 181 | """ 182 | Pretty-print a syllabification 183 | """ 184 | return '.'.join('-'.join(' '.join(p) for p in syl) for syl in syllab) 185 | 186 | 187 | def destress(syllab): 188 | """ 189 | Generate a syllabification with nuclear stress information removed 190 | """ 191 | syls = [] 192 | for (onset, nucleus, coda) in syllab: 193 | nuke = [p[:-1] if p[-1] in {'0', '1', '2'} else p for p in nucleus] 194 | syls.append((onset, nuke, coda)) 195 | return syls 196 | 197 | 198 | if __name__ == '__main__': 199 | import doctest 200 | doctest.testmod() 201 | -------------------------------------------------------------------------------- /wcm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from syllabify import syllabify 4 | 5 | ## constants 6 | 
DORSALS = {'K', 'G', 'NG'} 7 | LIQUIDS = {'L', 'R'} 8 | VOICED_AF = {'V', 'DH', 'Z', 'ZH'} 9 | AF = {'F', 'TH', 'S', 'SH', 'CH'} | VOICED_AF 10 | 11 | 12 | def wcm(phonemes, *sylab): 13 | """ 14 | The "Word Complexity Measure", as proposed in: 15 | 16 | C. Stoel-Gammon. 2010. The Word Complexity Measure: Description and 17 | application to developmental phonology and disorders. Clinical 18 | Linguistics and Phonetics 24(4-5): 271-282. 19 | """ 20 | syls = syllabify(phonemes) 21 | # begin scoring 22 | score = 0 23 | ## Word patterns 24 | # (1) Productions with more than two syllables receive 1 point 25 | if len(syls) > 2: 26 | score += 1 27 | # FIXME 28 | # (2) Productions with stress on any syllable but the first receive 29 | # 1 point [this rule is stupid --KG] 30 | if len(syls) > 1 and not syls[0][1][-1].endswith('1'): 31 | score += 1 32 | # FIXME 33 | ## Syllable structures 34 | # (1) Productions with a word-final consonant receive 1 point 35 | if syls[-1][2] != []: 36 | score += 1 37 | # (2) Productions with a syllable cluster (defined as a sequence of 38 | # two or more consonants within a syllable) receive one point for 39 | # each cluster: 40 | for syl in syls: 41 | if len(syl[0]) > 1: 42 | score += 1 43 | if len(syl[2]) > 1: 44 | score += 1 45 | ## Sound classes 46 | # (1) Productions with a velar consonant receive 1 point for each 47 | # velar 48 | for syl in syls: 49 | score += sum(ph in DORSALS for ph in (syl[0] + syl[2])) 50 | # (2) Productions with a liquid, a syllabic liquid, or a rhotic vowel 51 | # receive 1 point for each liquid, syllabic liquid, and rhotic vowel 52 | for syl in syls: 53 | score += sum(ph in LIQUIDS for ph in (syl[0] + syl[2])) 54 | score += sum(len(ph) > 1 and ph[1] == 'R' for ph in syl[1]) 55 | # (3) Productions with a fricative or affricate receive 1 point for 56 | # each fricative and affricate 57 | score += sum(ph in AF for ph in (syl[0] + syl[2])) 58 | # (4) Productions with a voiced fricative or affricate receive 1 point 59 
| # for each fricative and affricate (in addition to the point received 60 | # for #3) 61 | for syl in syls: 62 | score += sum(ph in VOICED_AF for ph in (syl[0] + syl[2])) 63 | # and we're done 64 | return score 65 | --------------------------------------------------------------------------------