├── .gitignore ├── README.md ├── __init__.py ├── manual.pdf ├── syllabify.py └── wcm.py /.gitignore: -------------------------------------------------------------------------------- 1 | manual/* 2 | eval/* 3 | *.py[co] 4 | .DS_Store 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Port for Python 3 2 | 3 | syllabify.py is a Python module for syllabifying ARPABET transcriptions; 4 | the method used is informed by subtle details of English phonology. 5 | 6 | * See `manual.pdf` for usage 7 | * See `syllabify.py` for the license and API 8 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kylebgorman/syllabify/d816db784436e9de87ec1ef9bd11b8e229853710/__init__.py -------------------------------------------------------------------------------- /manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kylebgorman/syllabify/d816db784436e9de87ec1ef9bd11b8e229853710/manual.pdf -------------------------------------------------------------------------------- /syllabify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2012-2013 Kyle Gorman 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the 6 | # "Software"), to deal in the Software without restriction, including 7 | # without limitation the rights to use, copy, modify, merge, publish, 8 | # distribute, sublicense, and/or sell copies of the Software, and to 9 | # permit persons to whom the Software is furnished to do so, subject to 10 | # the following conditions: 11 | # 12 | # The
above copyright notice and this permission notice shall be included 13 | # in all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | # 23 | # syllabify.py: prosodic parsing of ARPABET entries 24 | 25 | from itertools import chain 26 | 27 | ## constants 28 | SLAX = {'IH1', 'IH2', 'EH1', 'EH2', 'AE1', 'AE2', 'AH1', 'AH2', 29 | 'UH1', 'UH2',} 30 | VOWELS = {'IY1', 'IY2', 'IY0', 'EY1', 'EY2', 'EY0', 'AA1', 'AA2', 'AA0', 31 | 'ER1', 'ER2', 'ER0', 'AW1', 'AW2', 'AW0', 'AO1', 'AO2', 'AO0', 32 | 'AY1', 'AY2', 'AY0', 'OW1', 'OW2', 'OW0', 'OY1', 'OY2', 'OY0', 33 | 'IH0', 'EH0', 'AE0', 'AH0', 'UH0', 'UW1', 'UW2', 'UW0', 'UW', 34 | 'IY', 'EY', 'AA', 'ER', 'AW', 'AO', 'AY', 'OW', 'OY', 35 | 'UH', 'IH', 'EH', 'AE', 'AH', 'UH',} | SLAX 36 | 37 | ## licit medial onsets 38 | 39 | O2 = {('P', 'R'), ('T', 'R'), ('K', 'R'), ('B', 'R'), ('D', 'R'), 40 | ('G', 'R'), ('F', 'R'), ('TH', 'R'), 41 | ('P', 'L'), ('K', 'L'), ('B', 'L'), ('G', 'L'), 42 | ('F', 'L'), ('S', 'L'), 43 | ('K', 'W'), ('G', 'W'), ('S', 'W'), 44 | ('S', 'P'), ('S', 'T'), ('S', 'K'), 45 | ('HH', 'Y'), # "clerihew" 46 | ('R', 'W'),} 47 | O3 = {('S', 'T', 'R'), ('S', 'K', 'L'), ('T', 'R', 'W')} # "octroi" 48 | 49 | # This does not represent anything like a complete list of onsets, but 50 | # merely those that need to be maximized in medial position. 
51 | 52 | def syllabify(pron, alaska_rule=True): 53 | """ 54 | Syllabifies a CMU dictionary (ARPABET) word string 55 | 56 | # Alaska rule: 57 | >>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska 58 | '-AH0-.L-AE1-S.K-AH0-' 59 | >>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska 60 | '-AH0-.L-AE1-.S K-AH0-' 61 | 62 | # huge medial onsets: 63 | >>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel 64 | 'M-IH1-N.S T R-AH0-L' 65 | >>> pprint(syllabify('AA1 K T R W AA0 R'.split())) # octroi 66 | '-AA1-K.T R W-AA0-R' 67 | 68 | # destressing 69 | >>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split()))) 70 | 'M-IH-.L-AH-.T-EH-.R-IY-' 71 | 72 | # normal treatment of 'j': 73 | >>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu 74 | 'M-EH1-N.Y-UW0-' 75 | >>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel 76 | 'S P-AE1-N.Y-AH0-L' 77 | >>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon 78 | 'K-AE1-N.Y-AH0-N' 79 | >>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet 80 | 'M-IH0-N.Y-UW2-.-EH1-T' 81 | >>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior 82 | 'JH-UW1-N.Y-ER0-' 83 | >>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew 84 | 'K L-EH-.R-IH-.HH Y-UW-' 85 | 86 | # nuclear treatment of 'j' 87 | >>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue 88 | 'R-EH1-S.K-Y UW0-' 89 | >>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute 90 | 'T R-IH1-B.Y-UW0-T' 91 | >>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula 92 | 'N-EH1-B.Y-AH0-.L-AH0-' 93 | >>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula 94 | 'S P-AE1-.CH-UH0-.L-AH0-' 95 | >>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen 96 | '-AH0-K.Y-UW1-.M-AH0-N' 97 | >>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent 98 | 'S-AH1-K.Y-AH0-.L-IH0-N T' 99 | >>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula 100 | 'F-AO1 R-M.Y-AH0-.L-AH0-' 101 | >>> pprint(syllabify('V AE1 
L Y UW0'.split())) # value 102 | 'V-AE1-L.Y-UW0-' 103 | 104 | # everything else 105 | >>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic 106 | 'N-AO0-.S T-AE1-L.JH-IH0-K' 107 | >>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen 108 | 'CH-ER1-CH.M-AH0-N' 109 | >>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate 110 | 'K-AA1-M.P-AH0-N.S-EY2-T' 111 | >>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE 112 | '-IH0-N.S-EH1-N S' 113 | >>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense 114 | '-IH1-N.S-EH2-N S' 115 | >>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend 116 | '-AH0-.S-EH1-N D' 117 | >>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate 118 | 'R-OW1-.T-EY2-T' 119 | >>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist 120 | '-AA1 R-.T-AH0-S T' 121 | >>> pprint(syllabify('AE1 K T ER0'.split())) # actor 122 | '-AE1-K.T-ER0-' 123 | >>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster 124 | 'P L-AE1-S.T-ER0-' 125 | >>> pprint(syllabify('B AH1 T ER0'.split())) # butter 126 | 'B-AH1-.T-ER0-' 127 | >>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel 128 | 'K-AE1-.M-AH0-L' 129 | >>> pprint(syllabify('AH1 P ER0'.split())) # upper 130 | '-AH1-.P-ER0-' 131 | >>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon 132 | 'B-AH0-.L-UW1-N' 133 | >>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim 134 | 'P R-OW0-.K L-EY1-M' 135 | >>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane 136 | '-IH0-N.S-EY1-N' 137 | >>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude 138 | '-IH0-K.S K L-UW1-D' 139 | """ 140 | ## main pass 141 | mypron = list(pron) 142 | nuclei = [] 143 | onsets = [] 144 | i = -1 145 | for (j, seg) in enumerate(mypron): 146 | if seg in VOWELS: 147 | nuclei.append([seg]) 148 | onsets.append(mypron[i + 1:j]) # actually interludes, r.n. 
149 | i = j 150 | codas = [mypron[i + 1:]] 151 | ## resolve disputes and compute coda 152 | for i in range(1, len(onsets)): 153 | coda = [] 154 | # boundary cases 155 | if len(onsets[i]) > 1 and onsets[i][0] == 'R': 156 | nuclei[i - 1].append(onsets[i].pop(0)) 157 | if len(onsets[i]) > 2 and onsets[i][-1] == 'Y': 158 | nuclei[i].insert(0, onsets[i].pop()) 159 | if len(onsets[i]) > 1 and alaska_rule and nuclei[i-1][-1] in SLAX \ 160 | and onsets[i][0] == 'S': 161 | coda.append(onsets[i].pop(0)) 162 | # onset maximization 163 | depth = 1 164 | if len(onsets[i]) > 1: 165 | if tuple(onsets[i][-2:]) in O2: 166 | depth = 3 if tuple(onsets[i][-3:]) in O3 else 2 167 | for j in range(len(onsets[i]) - depth): 168 | coda.append(onsets[i].pop(0)) 169 | # store coda 170 | codas.insert(i - 1, coda) 171 | 172 | ## verify that all segments are included in the ouput 173 | output = list(zip(onsets, nuclei, codas)) # in Python3 zip is a generator 174 | flat_output = list(chain.from_iterable(chain.from_iterable(output))) 175 | if flat_output != mypron: 176 | raise ValueError(f"could not syllabify {mypron}, got {flat_output}") 177 | return output 178 | 179 | 180 | def pprint(syllab): 181 | """ 182 | Pretty-print a syllabification 183 | """ 184 | return '.'.join('-'.join(' '.join(p) for p in syl) for syl in syllab) 185 | 186 | 187 | def destress(syllab): 188 | """ 189 | Generate a syllabification with nuclear stress information removed 190 | """ 191 | syls = [] 192 | for (onset, nucleus, coda) in syllab: 193 | nuke = [p[:-1] if p[-1] in {'0', '1', '2'} else p for p in nucleus] 194 | syls.append((onset, nuke, coda)) 195 | return syls 196 | 197 | 198 | if __name__ == '__main__': 199 | import doctest 200 | doctest.testmod() 201 | -------------------------------------------------------------------------------- /wcm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from syllabify import syllabify 4 | 5 | ## constants 6 | 
DORSALS = {'K', 'G', 'NG'} 7 | LIQUIDS = {'L', 'R'} 8 | VOICED_AF = {'V', 'DH', 'Z', 'ZH'} 9 | AF = {'F', 'TH', 'S', 'SH', 'CH'} | VOICED_AF 10 | 11 | 12 | def wcm(phonemes, *sylab): 13 | """ 14 | The "Word Complexity Measure", as proposed in: 15 | 16 | C. Stoel-Gammon. 2010. The Word Complexity Measure: Description and 17 | application to developmental phonology and disorders. Clinical 18 | Linguistics and Phonetics 24(4-5): 271-282. 19 | """ 20 | syls = syllabify(phonemes) 21 | # begin scoring 22 | score = 0 23 | ## Word patterns 24 | # (1) Productions with more than two syllables receive 1 point 25 | if len(syls) > 2: 26 | score += 1 27 | # FIXME 28 | # (2) Productions with stress on any syllable but the first receive 29 | # 1 point [this rule is stupid --KG] 30 | if len(syls) > 1 and not syls[0][1][-1].endswith('1'): 31 | score += 1 32 | # FIXME 33 | ## Syllable structures 34 | # (1) Productions with a word-final consonant receive 1 point 35 | if syls[-1][2] != []: 36 | score += 1 37 | # (2) Productions with a syllable cluster (defined as a sequence of 38 | # two or more consonants within a syllable) receive one point for 39 | # each cluster: 40 | for syl in syls: 41 | if len(syl[0]) > 1: 42 | score += 1 43 | if len(syl[2]) > 1: 44 | score += 1 45 | ## Sound classes 46 | # (1) Productions with a velar consonant receive 1 point for each 47 | # velar 48 | for syl in syls: 49 | score += sum(ph in DORSALS for ph in (syl[0] + syl[2])) 50 | # (2) Productions with a liquid, a syllabic liquid, or a rhotic vowel 51 | # receive 1 point for each liquid, syllabic liquid, and rhotic vowel 52 | for syl in syls: 53 | score += sum(ph in LIQUIDS for ph in (syl[0] + syl[2])) 54 | score += sum(len(ph) > 1 and ph[1] == 'R' for ph in syl[1]) 55 | # (3) Productions with a fricative or affricate receive 1 point for 56 | # each fricative and affricate 57 | score += sum(ph in AF for ph in (syl[0] + syl[2])) 58 | # (4) Productions with a voiced fricative or affricate receive 1 point 59 
| # for each fricative and affricate (in addition to the point received 60 | # for #3) 61 | for syl in syls: 62 | score += sum(ph in VOICED_AF for ph in (syl[0] + syl[2])) 63 | # and we're done 64 | return score 65 | --------------------------------------------------------------------------------