├── LICENSE
├── Language.py
├── Phoneme.py
├── README.txt
├── Syllable.py
├── Text.py
├── Word.py
├── config.txt
├── ipa.py
├── lib
    ├── __init__.py
    ├── en
    │   └── english.tsv
    └── fi
    │   ├── __init__.py
    │   ├── compound.txt
    │   ├── config.txt
    │   ├── finnish_annotator.py
    │   ├── finnish_functions.py
    │   ├── finnish_sonority.py
    │   ├── finnish_stress.py
    │   ├── finnish_syllables.py
    │   ├── finnish_weight.py
    │   ├── function_words.txt
    │   ├── initial.txt
    │   ├── presyllabified.txt
    │   ├── suffix.txt
    │   └── unstressed.txt
├── prosodic.py
└── tools.py


/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     {description}
294 |     Copyright (C) {year}  {fullname}
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   {signature of Ty Coon}, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 
341 | 


--------------------------------------------------------------------------------
/Language.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | # This method of selecting the language is ugly. I'll try to improve it in the future. (2/23/2013)
 4 | 
 5 | import codecs, os
 6 | import lib.fi.finnish_annotator
 7 | 
 8 | class English(object):
 9 | 	vowels = set([u'ɑ', u'a', u'æ', u'ə', u'ʌ', u'ɔ', u'ɜ', u'ɛ', u'e', u'ɪ', u'i', u'o', u'ʊ', u'u'])
10 | 	name = "English"
11 | 	
12 | 	def __init__(self):
13 | 		self.annotation_dict = {}
14 | 		cmu = codecs.open(os.path.join('lib', 'en', 'english.tsv'), 'r', 'utf-8')
15 | 		raw = cmu.read()
16 | 		words = raw.split('\n')
17 | 		for word in words:
18 | 			text, transcription = word.split('\t')
19 | 			if not text.lower() in self.annotation_dict:
20 | 				self.annotation_dict[text.lower()] = transcription
21 | 		
22 | 	def annotate_word(self, text):
23 | 		if type(text) != type(u''):
24 | 			text = unicode(text, 'utf-8').lower()
25 | 		if not text in self.annotation_dict:
26 | 			return None
27 | 		return self.annotation_dict[text]
28 | 
29 | class Finnish(object):
30 | 	vowels = set([u'i', u'e', u'æ', u'y', u'ø', u'ɑ', u'u', u'o'])
31 | 	name = "Finnish"
32 | 	
33 | 	def __init__(self):
34 | 		pass
35 | 	
36 | 	def annotate_word(self, text):
37 | 		return lib.fi.finnish_annotator.ipa_annotation(text)
38 | 
39 | language_dict = {'English': English(), 'Finnish': Finnish()}
40 | 
41 | def set_language(language):
42 | 	if language in language_dict:
43 | 		global selected_language
44 | 		selected_language = language_dict[language]
45 | 	else:
46 | 		print "ERROR: invalid language choice"
47 | 
48 | set_language("English")
49 | 
50 | try:
51 | 	f = open('config.txt', 'r')
52 | 	for line in f.read().split('\n'):
53 | 		if len(line) == 0 or line[0] == '#': # empty or comment
54 | 			continue
55 | 		configuration = line.split('=')
56 | 		if len(configuration) != 2:
57 | 			continue
58 | 		if configuration[0] == 'language':
59 | 			set_language(configuration[1].strip())
60 | except IOError:
61 | 	print "ERROR: configuration file not found"


--------------------------------------------------------------------------------
/Phoneme.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | from ipa import *
 4 | 
 5 | class Phoneme(object):
 6 | 	def __init__(self, text):
 7 | 		if type(text) != type(u''):
 8 | 			text = unicode(text, 'utf-8')
 9 | 		self.text = text
10 | 		self.broken = False
11 | 		if self.text not in ipa_to_features:
12 | 			self.broken = True
13 | 			self.features = {}
14 | 		else:
15 | 			self.features = ipa_to_features[self.text]
16 | 	
17 | 	def __repr__(self):
18 | 		return self.text.encode('utf-8')
19 | 	
20 | 	def __getitem__(self, key):
21 | 		if key in abbreviations:
22 | 			key = abbreviations[key]
23 | 		if key in self.features:
24 | 			return self.features[key]
25 | 		raise Exception("Phoneme '" + self.text + "' does not have feature '" + key + "'")


--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
 1 | Prosodic 1.0b (February 23, 2013)
 2 | ---------------------------------
 3 | 
 4 | Prosodic is a piece of software designed to aid research in phonology. This version of Prosodic can convert text in English or Finnish to an IPA transcription, including syllable boundaries and stress. Prosodic also allows users to convert a phoneme to its feature set, and to get a list of phonemes in a given natural class.
 5 | 
 6 | Examples of how to use the script:
 7 | 
 8 | python prosodic.pyc
 9 | This launches the command line interface to Prosodic.
10 | 
11 | python prosodic.py [infile]
12 | This converts the text of infile to IPA and prints the result.
13 | 
14 | python prosodic.py [infile] [outfile]
15 | This converts the text of infile to IPA and stores the result in outfile.
16 | 
17 | Prosodic currently supports English and Finnish. The language setting can be changed in config.txt
18 | 
19 | English transcription is done using the CMU pronouncing dictionary. Note that English R-colored vowels are treated as a vowel followed by an /r/ (in contrast to the CMU dictionary). Finnish transcription is done through an original script, with syllable splitting based on Karlsson 1985 and stress based on Anttila 2008 (though simplified). Finnish long vowels are currently represented as doubled short vowels, rather than single phonemes with a length feature.
20 | 
21 | Prosodic may also be used within a Python script, offering objects that encapsulate texts, words, syllables, and phonemes. To use Prosodic within a script, just type "from prosodic import *" from within Python.
22 | 
23 | To change the language, type Language.set_language("English")
24 | 
25 | Prosodic offers the following hierarchy of objects based on containment: Text > Word > Syllable > Phoneme. Any object on the hierarchy contains a list of elements within it. For example, if text is a Text object, text.words is a list of Word objects representing the words within the text, and text.phonemes is a list of Phoneme objects representing the phonemes within the text. The syllable object also has fields for the onset, nucleus, and coda, which are lists of phonemes. The features of a phoneme may be gotten from the feature field, or by direct indexing through brackets. For example, both Phoneme('p').features['labial'] and Phoneme('p')['labial'] will return True. A positive feature value is represented as True, a negative feature value is represented as False, and a zero feature value is represented as None. The feature system is taken from Hayes 2008.
26 | 
27 | This branch of Prosodic (the "b" branch) was written by Josh Falk, but it is significantly based on earlier versions of Prosodic consisting of joint work by Ryan Heuser, Josh Falk, and Arto Anttila.
28 | 
29 | Feedback or suggestions are appreciated (falk1729@gmail.com). Search functionality and metrical parsing should be coming soon.
30 | 
31 | References:
32 | Anttila, Arto. 2008. Word stress in Finnish.
33 | 
34 | Hayes, Bruce. 2009. Introductory phonology. West Sussex, U.K.: Wiley-Blackwell.
35 | 
36 | Karlsson, Fred. 1985. Automatic hyphenation of Finnish. In Computational Morphosyntax, Report on Research 1981-84, vol. 13 of Publications of the Department of General Linguistics, University of Helsinki.


--------------------------------------------------------------------------------
/Syllable.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | from Phoneme import Phoneme
 4 | import Language
 5 | 
 6 | def split_syllable(body):
 7 | 	vowels = Language.selected_language.vowels
 8 | 	onset = []
 9 | 	i = 0
10 | 	while i < len(body):
11 | 		if body[i] in vowels:
12 | 			break
13 | 		onset.append(Phoneme(body[i]))
14 | 		i += 1
15 | 	nucleus = []
16 | 	while i < len(body):
17 | 		if body[i] not in vowels:
18 | 			break
19 | 		nucleus.append(Phoneme(body[i]))
20 | 		i += 1
21 | 	coda = [Phoneme(cons) for cons in body[i:]]
22 | 	return (onset, nucleus, coda)
23 | 
24 | class Syllable(object):
25 | 	def __init__(self, body):
26 | 		content = body
27 | 		if body[0] == u'ˈ':
28 | 			self.stress = 'P'
29 | 			content = body[1:]
30 | 		elif body[0] == u'ˌ':
31 | 			self.stress = 'S'
32 | 			content = body[1:]
33 | 		else:
34 | 			self.stress = 'U'
35 | 		self.body = body
36 | 		self.onset, self.nucleus, self.coda = split_syllable(content)
37 | 		self.rhyme = self.nucleus + self.coda
38 | 		self.phonemes = self.onset + self.rhyme
39 | 	
40 | 	def __repr__(self):
41 | 		return self.body.encode('utf-8')


--------------------------------------------------------------------------------
/Text.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | from tools import separate_words
 4 | from Word import Word
 5 | 
 6 | class Text(object):
 7 | 	def __init__(self, text):
 8 | 		if type(text) != type(u''):
 9 | 			text = unicode(text, 'utf-8')
10 | 		self.text = text
11 | 		words = separate_words(text)
12 | 		self.words = [Word(word) for word in words]
13 | 		self.syllables = sum([word.syllables for word in self.words], []) # flatten the list of phonemes
14 | 		self.phonemes = sum([word.phonemes for word in self.words], []) # flatten the list of phonemes
15 | 	
16 | 	def __repr__(self):
17 | 		return ' '.join([word.transcription for word in self.words]).encode('utf-8')


--------------------------------------------------------------------------------
/Word.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | 
 3 | import Language
 4 | from Syllable import Syllable
 5 | 
 6 | def word(text):
 7 | 	return Word(text)
 8 | 
 9 | class Word(object):
10 | 	def __init__(self, text):
11 | 		self.broken = False
12 | 		text = text.strip()
13 | 		if type(text) != type(u''):
14 | 			text = unicode(text, 'utf-8')
15 | 		self.text = text
16 | 		self.transcription = Language.selected_language.annotate_word(self.text.lower())
17 | 		if self.transcription == None:
18 | 			self.broken = True
19 | 			self.transcription = "[" + self.text + "]" # only kept if broken
20 | 			self.syllables = []
21 | 		else:
22 | 			self.syllables = [Syllable(syll) for syll in self.transcription.split('.')]
23 | 		self.phonemes = sum([syll.phonemes for syll in self.syllables], []) # flatten the list of phonemes
24 | 	
25 | 	def __repr__(self):
26 | 		return self.transcription.encode('utf-8')
27 | 	
28 | 	def __getitem__(self, index):
29 | 		return self.phonemes[index]


--------------------------------------------------------------------------------
/config.txt:
--------------------------------------------------------------------------------
1 | # currently supported languages: English, Finnish
2 | language=English


--------------------------------------------------------------------------------
/ipa.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | 
  3 | import re
  4 | 
  5 | # Feature system taken from Hayes 2008 (pp. 95-98)
  6 | 
  7 | ipa_to_features = {}
  8 | abbreviations = {'cons': 'consonantal', 'son': 'sonorant', 'cont': 'continuant', 'del.rel': 'delayed.release', 'approx': 'approximant', 'nas': 'nasal', 'voi': 'voice', 'lab': 'labial', 'rnd': 'round', 'cor': 'coronal', 'ant': 'anterior', 'dist': 'distributed', 'strid': 'strident', 'lat': 'lateral', 'dor': 'dorsal'}
  9 | 
 10 | ##############
 11 | # Consonants #
 12 | ##############
 13 | consonant_features = ['consonantal', 'sonorant', 'continuant', 'delayed.release', 'approximant', 'tap', 'trill', 'nasal', 'voice', 'spread.gl', 'constr.gl', 'labial', 'round', 'labiodental', 'coronal', 'anterior', 'distributed', 'strident', 'lateral', 'dorsal', 'high', 'low', 'front', 'back', 'tense']
 14 | 
 15 | # bilabial
 16 | ipa_to_features[u'p'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 17 | ipa_to_features[u'b'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 18 | ipa_to_features[u'ɸ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 19 | ipa_to_features[u'β'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 20 | ipa_to_features[u'm'] = {'consonantal': True, 'sonorant': True, 'continuant': False, 'delayed.release': None, 'approximant': False, 'tap': False, 'trill': False, 'nasal': True, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 21 | ipa_to_features[u'ʙ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': True, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 22 | # labiodental
 23 | ipa_to_features[u'p͡f'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': True, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 24 | ipa_to_features[u'f'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': True, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 25 | ipa_to_features[u'v'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': True, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 26 | ipa_to_features[u'ɱ'] = {'consonantal': True, 'sonorant': True, 'continuant': False, 'delayed.release': None, 'approximant': False, 'tap': False, 'trill': False, 'nasal': True, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': True, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 27 | ipa_to_features[u'ʋ'] = {'consonantal': False, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': True, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 28 | # dental
 29 | ipa_to_features[u't̪'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 30 | ipa_to_features[u'd̪'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 31 | ipa_to_features[u'θ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 32 | ipa_to_features[u'ð'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 33 | # alveolar
 34 | ipa_to_features[u't'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 35 | ipa_to_features[u'd'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 36 | ipa_to_features[u'ʦ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 37 | ipa_to_features[u'ʣ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 38 | ipa_to_features[u't͡s'] = ipa_to_features[u'ʦ']
 39 | ipa_to_features[u'd͡z'] = ipa_to_features[u'ʣ']
 40 | ipa_to_features[u't͡ɬ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': True, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 41 | ipa_to_features[u's'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 42 | ipa_to_features[u'z'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 43 | ipa_to_features[u'n'] = {'consonantal': True, 'sonorant': True, 'continuant': False, 'delayed.release': None, 'approximant': False, 'tap': False, 'trill': False, 'nasal': True, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 44 | ipa_to_features[u'l'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': True, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 45 | ipa_to_features[u'ɬ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': True, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 46 | ipa_to_features[u'ɮ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': True, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 47 | ipa_to_features[u'ɾ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': True, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 48 | ipa_to_features[u'ɺ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': True, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': True, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 49 | ipa_to_features[u'r'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': True, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 50 | # palato-alveolar
 51 | ipa_to_features[u'ʧ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 52 | ipa_to_features[u'ʤ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 53 | ipa_to_features[u't͡ʃ'] = ipa_to_features[u'ʧ']
 54 | ipa_to_features[u'd͡ʒ'] = ipa_to_features[u'ʤ']
 55 | ipa_to_features[u'ʃ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 56 | ipa_to_features[u'ʒ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 57 | ipa_to_features[u'ɹ'] = {'consonantal': False, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 58 | # retroflex
 59 | ipa_to_features[u'ʈ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 60 | ipa_to_features[u'ɖ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 61 | ipa_to_features[u'ʂ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 62 | ipa_to_features[u'ʐ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': True, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 63 | ipa_to_features[u'ɳ'] = {'consonantal': True, 'sonorant': True, 'continuant': False, 'delayed.release': None, 'approximant': False, 'tap': False, 'trill': False, 'nasal': True, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 64 | ipa_to_features[u'ɭ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': False, 'lateral': True, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 65 | ipa_to_features[u'ɽ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': True, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 66 | ipa_to_features[u'ɻ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': False, 'strident': False, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 67 | # fronted velar
 68 | ipa_to_features[u'k̟̟'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
 69 | ipa_to_features[u'g̟'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
 70 | ipa_to_features[u'x̟'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
 71 | ipa_to_features[u'j'] = {'consonantal': False, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': True}
 72 | # velar
 73 | ipa_to_features[u'k'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None}
 74 | ipa_to_features[u'g'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None}
 75 | ipa_to_features[u'ŋ'] = {'consonantal': True, 'sonorant': True, 'continuant': False, 'delayed.release': None, 'approximant': False, 'tap': False, 'trill': False, 'nasal': True, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None}
 76 | ipa_to_features[u'k͡x'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None}
 77 | ipa_to_features[u'x'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None}
 78 | ipa_to_features[u'ɣ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None} # note: incorrectly +lateral in Hayes
 79 | ipa_to_features[u'ʟ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': True, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None} # note: incorrectly -lateral in Hayes
 80 | # back velar
 81 | ipa_to_features[u'k̠'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': False, 'back': True, 'tense': None} # note: incorrectly +lateral in Hayes
 82 | ipa_to_features[u'g̠'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': False, 'back': True, 'tense': None}
 83 | ipa_to_features[u'x̠'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': False, 'back': True, 'tense': None}
 84 | ipa_to_features[u'ɣ̠'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': False, 'back': True, 'tense': None}
 85 | # uvular
 86 | ipa_to_features[u'q'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': False, 'front': False, 'back': True, 'tense': None}
 87 | ipa_to_features[u'ɢ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': False, 'front': False, 'back': True, 'tense': None}
 88 | ipa_to_features[u'χ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': False, 'front': False, 'back': True, 'tense': None}
 89 | ipa_to_features[u'ʁ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': False, 'front': False, 'back': True, 'tense': None}
 90 | ipa_to_features[u'ɴ'] = {'consonantal': True, 'sonorant': True, 'continuant': False, 'delayed.release': None, 'approximant': False, 'tap': False, 'trill': False, 'nasal': True, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': False, 'front': False, 'back': True, 'tense': None}
 91 | ipa_to_features[u'ʀ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': True, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': False, 'front': False, 'back': True, 'tense': None}
 92 | # pharyngeal
 93 | ipa_to_features[u'ħ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': True, 'front': False, 'back': True, 'tense': None}
 94 | ipa_to_features[u'ʕ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': False, 'low': True, 'front': False, 'back': True, 'tense': None} # note: incorrectly -delayed.release in Hayes
 95 | # glottal
 96 | ipa_to_features[u'ʔ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': True, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 97 | ipa_to_features[u'h'] = {'consonantal': False, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': True, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 98 | ipa_to_features[u'ɦ'] = {'consonantal': False, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': True, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': False, 'high': None, 'low': None, 'front': None, 'back': None, 'tense': None}
 99 | # labial back velar
100 | ipa_to_features[u'w'] = {'consonantal': False, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': True, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': False, 'back': True, 'tense': True}
101 | ipa_to_features[u'ʍ'] = {'consonantal': False, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': True, 'constr.gl': False, 'labial': True, 'round': True, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': False, 'back': True, 'tense': True}
102 | # labial velar
103 | ipa_to_features[u'k͡p'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None}
104 | ipa_to_features[u'g͡b'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': False, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': None, 'back': None, 'tense': None}
105 | # labial front velar
106 | ipa_to_features[u'ɥ'] = {'consonantal': False, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': True, 'round': True, 'labiodental': False, 'coronal': False, 'anterior': None, 'distributed': None, 'strident': None, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': True}
107 | # alveolopalatal
108 | ipa_to_features[u'ʨ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
109 | ipa_to_features[u'ʥ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
110 | ipa_to_features[u't͡ɕ'] = ipa_to_features[u'ʨ']
111 | ipa_to_features[u'd͡ʑ'] = ipa_to_features[u'ʥ']
112 | ipa_to_features[u'ɕ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
113 | ipa_to_features[u'ʑ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': True, 'distributed': True, 'strident': True, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
114 | # palatal
115 | ipa_to_features[u'c'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
116 | ipa_to_features[u'ɟ'] = {'consonantal': True, 'sonorant': False, 'continuant': False, 'delayed.release': False, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
117 | ipa_to_features[u'ç'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': False, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
118 | ipa_to_features[u'ʝ'] = {'consonantal': True, 'sonorant': False, 'continuant': True, 'delayed.release': True, 'approximant': False, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
119 | ipa_to_features[u'ɲ'] = {'consonantal': True, 'sonorant': True, 'continuant': False, 'delayed.release': None, 'approximant': False, 'tap': False, 'trill': False, 'nasal': True, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': False, 'lateral': False, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
120 | ipa_to_features[u'ʎ'] = {'consonantal': True, 'sonorant': True, 'continuant': True, 'delayed.release': None, 'approximant': True, 'tap': False, 'trill': False, 'nasal': False, 'voice': True, 'spread.gl': False, 'constr.gl': False, 'labial': False, 'round': False, 'labiodental': False, 'coronal': True, 'anterior': False, 'distributed': True, 'strident': False, 'lateral': True, 'dorsal': True, 'high': True, 'low': False, 'front': True, 'back': False, 'tense': None}
121 | 
122 | ##########
123 | # Vowels #
124 | ##########
125 | vowel_features = ['high', 'low', 'tense', 'front', 'back', 'round']
126 | 
127 | # high tense
128 | ipa_to_features[u'i'] = {'high': True, 'low': False, 'tense': True, 'front': True, 'back': False, 'round': False}
129 | ipa_to_features[u'y'] = {'high': True, 'low': False, 'tense': True, 'front': True, 'back': False, 'round': True}
130 | ipa_to_features[u'ɨ'] = {'high': True, 'low': False, 'tense': True, 'front': False, 'back': False, 'round': False}
131 | ipa_to_features[u'ʉ'] = {'high': True, 'low': False, 'tense': True, 'front': False, 'back': False, 'round': True}
132 | ipa_to_features[u'ɯ'] = {'high': True, 'low': False, 'tense': True, 'front': False, 'back': True, 'round': False}
133 | ipa_to_features[u'u'] = {'high': True, 'low': False, 'tense': True, 'front': False, 'back': True, 'round': True}
134 | # high lax
135 | ipa_to_features[u'ɪ'] = {'high': True, 'low': False, 'tense': False, 'front': True, 'back': False, 'round': False}
136 | ipa_to_features[u'ʏ'] = {'high': True, 'low': False, 'tense': False, 'front': True, 'back': False, 'round': True}
137 | ipa_to_features[u'ʊ'] = {'high': True, 'low': False, 'tense': False, 'front': False, 'back': True, 'round': True}
138 | # mid tense
139 | ipa_to_features[u'e'] = {'high': False, 'low': False, 'tense': True, 'front': True, 'back': False, 'round': False}
140 | ipa_to_features[u'ø'] = {'high': False, 'low': False, 'tense': True, 'front': True, 'back': False, 'round': True}
141 | ipa_to_features[u'ɘ'] = {'high': False, 'low': False, 'tense': True, 'front': False, 'back': False, 'round': False}
142 | ipa_to_features[u'ɜ'] = {'high': False, 'low': False, 'tense': True, 'front': False, 'back': False, 'round': False} # not in Hayes, but given the same representation as 'ɘ'
143 | ipa_to_features[u'ɵ'] = {'high': False, 'low': False, 'tense': True, 'front': False, 'back': False, 'round': True}
144 | ipa_to_features[u'ɤ'] = {'high': False, 'low': False, 'tense': True, 'front': False, 'back': True, 'round': False}
145 | ipa_to_features[u'o'] = {'high': False, 'low': False, 'tense': True, 'front': False, 'back': True, 'round': True}
146 | # mid lax
147 | ipa_to_features[u'ɛ'] = {'high': False, 'low': False, 'tense': False, 'front': True, 'back': False, 'round': False}
148 | ipa_to_features[u'œ'] = {'high': False, 'low': False, 'tense': False, 'front': True, 'back': False, 'round': True}
149 | ipa_to_features[u'ə'] = {'high': False, 'low': False, 'tense': False, 'front': False, 'back': False, 'round': False}
150 | ipa_to_features[u'ɞ'] = {'high': False, 'low': False, 'tense': False, 'front': False, 'back': False, 'round': True}
151 | ipa_to_features[u'ʌ'] = {'high': False, 'low': False, 'tense': False, 'front': False, 'back': True, 'round': False}
152 | ipa_to_features[u'ɔ'] = {'high': False, 'low': False, 'tense': False, 'front': False, 'back': True, 'round': True}
153 | # low
154 | ipa_to_features[u'æ'] = {'high': False, 'low': True, 'tense': None, 'front': True, 'back': False, 'round': False}
155 | ipa_to_features[u'ɶ'] = {'high': False, 'low': True, 'tense': None, 'front': True, 'back': False, 'round': True}
156 | ipa_to_features[u'a'] = {'high': False, 'low': True, 'tense': None, 'front': False, 'back': False, 'round': False}
157 | ipa_to_features[u'ɑ'] = {'high': False, 'low': True, 'tense': None, 'front': False, 'back': True, 'round': False}
158 | ipa_to_features[u'ɒ'] = {'high': False, 'low': True, 'tense': None, 'front': False, 'back': True, 'round': True}
159 | 
160 | all_features = set(consonant_features + vowel_features)
161 | 
162 | value_map = {'+': True, '-': False, '0': None}
163 | reverse_value_map = {True: '+', False: '-', None: '0'}
164 | 
165 | # Split a combined feature and value into separate parts
166 | # E.g. "+sonorant" -> ("sonorant", True)
167 | def split_feature_and_value(feature):
168 | 	if len(feature) < 1:
169 | 		return None
170 | 	value = feature[0]
171 | 	if value not in value_map:
172 | 		return None
173 | 	value = value_map[value]
174 | 	feature = feature[1:]
175 | 	if feature not in all_features:
176 | 		return None
177 | 	return (feature, value)
178 | 
179 | # combine a feature and a value into one representation
180 | # E.g. ("sonorant", True) -> "+sonorant"
181 | def combine_feature_and_value(feature, value):
182 | 	if value not in reverse_value_map:
183 | 		return None
184 | 	return reverse_value_map[value] + feature
185 | 
186 | # Note: this may need to change if the feature system is modified
187 | def is_vowel(phoneme):
188 | 	return 'consonantal' not in ipa_to_features[phoneme]
189 | 
190 | def is_consonant(phoneme):
191 | 	return not is_vowel(phoneme)
192 | 
193 | def phoneme_feature_string(phoneme):
194 | 	if phoneme not in ipa_to_features:
195 | 		return None
196 | 	features = vowel_features if is_vowel(phoneme) else consonant_features
197 | 	feature_value_pairs = [(feature, ipa_to_features[phoneme][feature]) for feature in features]
198 | 	return ", ".join([combine_feature_and_value(feature, value) for feature, value in feature_value_pairs if value != None])
199 | 
200 | def get_natural_class(features):
201 | 	features = re.findall('\s*([^\s]+)\s*', features, re.UNICODE)
202 | 	result = set(ipa_to_features.keys())
203 | 	for feature in features:
204 | 		matches = set()
205 | 		split = split_feature_and_value(feature)
206 | 		if split == None:
207 | 			print "Invalid feature: " + feature
208 | 			return set()
209 | 		feature, value = split
210 | 		for phoneme in ipa_to_features:
211 | 			feature_values = ipa_to_features[phoneme]
212 | 			if feature in feature_values and feature_values[feature] == value:
213 | 				matches.add(phoneme)		
214 | 		result = result.intersection(matches)
215 | 	return result


--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsfalk/prosodic1b/42417e40264a42c0e4af0b1e4a3dc0f0bb207849/lib/__init__.py


--------------------------------------------------------------------------------
/lib/fi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsfalk/prosodic1b/42417e40264a42c0e4af0b1e4a3dc0f0bb207849/lib/fi/__init__.py


--------------------------------------------------------------------------------
/lib/fi/compound.txt:
--------------------------------------------------------------------------------
1 | -


--------------------------------------------------------------------------------
/lib/fi/config.txt:
--------------------------------------------------------------------------------
1 | presyllabified.txt
2 | initial.txt
3 | suffix.txt
4 | compound.txt


--------------------------------------------------------------------------------
/lib/fi/finnish_annotator.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | 
  3 | from finnish_functions import *
  4 | from finnish_syllables import initialize_presyllabified, make_syllables, potential_word_index_to_end
  5 | from finnish_weight import make_weights
  6 | from finnish_sonority import make_sonorities
  7 | from finnish_stress import make_stresses
  8 | 
  9 | from copy import deepcopy
 10 | 
 11 | import os, sys, re
 12 | 
 13 | # location in list of user files for each file
 14 | PRESYLL = 0
 15 | INITIAL = 1
 16 | SUFFIX = 2
 17 | COMPOUND = 3
 18 | FUNCTION = 4
 19 | 
 20 | dirself = os.path.join(os.getcwd(), 'lib', 'fi')
 21 | user_files = [os.path.join(dirself,'presyllabified.txt'), os.path.join(dirself,'initial.txt'), os.path.join(dirself,'suffix.txt'), os.path.join(dirself,'compound.txt'), os.path.join(dirself,'function_words.txt')] # default values, in case user input is ill-formed or unavailable
 22 | config_file = os.path.join(dirself,'config.txt')
 23 | 
 24 | initial_compounds = []
 25 | suffixes = []
 26 | function_words = []
 27 | compound_dict = {}
 28 | 
 29 | # initialize list l with words from filename, a file with words on individual lines
 30 | def initialize_list(l, filename):
 31 |     try:
 32 |         f = open(filename, 'r')
 33 |         entries = f.read()
 34 |         f.close()
 35 |         entries = re.findall('[\r\n]*([^\r\n]+)[\r\n]*', entries, re.UNICODE)
 36 |         for entry in entries:
 37 |             l.append(unicode(entry.lower(), 'utf-8'))
 38 |     except IOError:        
 39 |         print "Error: File not found."
 40 | 
 41 | # initialize dict with entries, where key is entry from entries in lowercase without separator, and value is list of words in entry split at separator
 42 | def initialize_dict(dict, entries, separator):
 43 |     for entry in entries:
 44 |         entry = entry.lower()
 45 |         hyphen_free = entry.replace(separator, '')
 46 |         words = entry.split(separator)
 47 |         dict[hyphen_free] = words
 48 | 
 49 | # initialize a dictionary from a file
 50 | # the first line of the file is the separator character
 51 | # the remaining lines are words with separations marked by the separator character
 52 | def initialize_dict_from_file(dict, filename):    
 53 |     try:        
 54 |         f = open(filename, 'r')
 55 |         entries = f.readlines()
 56 |         f.close()
 57 |         
 58 |         for i in range(len(entries)):
 59 |             if entries[i][-1] == '\n' or entries[i][-1] == '\r':
 60 |                 # remove final newline character
 61 |                 entries[i] = entries[i][:-1]
 62 |         
 63 |         separator = entries[0]
 64 |         entries = [unicode(entry, 'utf-8') for entry in entries[1:]]
 65 |         initialize_dict(dict, entries, separator)        
 66 |     except IOError:        
 67 |         print "Error: File not found."
 68 | 
 69 | # initialize configuration
 70 | def initialize_config():
 71 |     try:
 72 |         f = open(config_file, 'r')
 73 |         entries = f.readlines()
 74 |         f.close()
 75 |         
 76 |         if len(entries) != len(user_files):
 77 |             return
 78 |          
 79 |         for i in range(len(user_files)-1): # last word does not end in newline
 80 |             entries[i] = entries[i][:-1]
 81 |         for i in range(len(user_files)):
 82 |             if os.path.isfile(entries[i]):
 83 |                 user_files[i] = entries[i]
 84 |     except IOError:
 85 |         print "Error: Config file not found."
 86 |     
 87 |     initialize_presyllabified(user_files[PRESYLL])
 88 |     initialize_list(initial_compounds, user_files[INITIAL])
 89 |     initialize_list(suffixes, user_files[SUFFIX])
 90 |     initialize_dict_from_file(compound_dict, user_files[COMPOUND])
 91 |     initialize_list(function_words, user_files[FUNCTION])
 92 | 
 93 | initialize_config()
 94 | 
 95 | def is_lexical(word):
 96 |     word = word.lower()
 97 |     if not potential_word_index_to_end(word, 0):
 98 |         return False
 99 |     function_words = []
100 |     initialize_list(function_words, user_files[FUNCTION])
101 |     return not word in function_words
102 | 
103 | # a class representing an annotation
104 | # the constructor assumes that the word contains no compounds
105 | class Annotation:
106 |     def __init__(self, word):
107 |         if type(word) != unicode:
108 |             word = unicode(word, 'utf-8')
109 |         self.word = word
110 |         self.syllables = make_syllables(word)
111 |         self.split_sylls = [split_syllable(syll) for syll in self.syllables]
112 |         self.weights = make_weights(self.split_sylls)
113 |         self.sonorities = make_sonorities(self.split_sylls)
114 |         self.stresses = make_stresses(self.weights)
115 |         self.lexical = is_lexical(word)
116 |         self.str = annotation_string(self)[1:-1]
117 |     
118 |     def __repr__(self):
119 |         return self.str.encode('utf-8')
120 |     
121 |     def join(self, annotation):
122 |         self.word += annotation.word
123 |         self.syllables += annotation.syllables
124 |         self.weights += annotation.weights
125 |         self.sonorities += annotation.sonorities
126 |         self.split_sylls += annotation.split_sylls
127 |         
128 |         # only concatenate stresses if there is something to concatenate
129 |         if len(annotation.stresses[0]) > 0:
130 |             total_stresses = []       
131 |             for i in range(len(self.stresses)):
132 |                 for j in range(len(annotation.stresses)):
133 |                     total_stresses += [deepcopy(self.stresses[i])]
134 |                     total_stresses[-1] += [Stress.secondary]
135 |                     # replace initial (primary) stress of annotation with secondary stress
136 |                     total_stresses[-1] += annotation.stresses[j][1:]
137 |             self.stresses = total_stresses
138 |         
139 |         self.str = annotation_string(self)[1:-1]
140 | 
141 | # if the final word in the list of words starts with a word in the list of compound-initial words, split the word and apply the function again
142 | # (i.e., split off all initial words in initial_compounds)
143 | def split_initial_compounds(words):
144 |     for word in initial_compounds:
145 |         if words[-1].lower().startswith(word):
146 |             return split_initial_compounds(words[:-1] + [words[-1][:len(word)]] + [words[-1][len(word):]])
147 |     return words
148 | 
149 | # if the final word in the list of words ends with a suffix in suffixes, split the word at the suffix
150 | def split_suffix(words):
151 |     for suffix in suffixes:
152 |         if words[-1].lower().endswith(suffix):
153 |             boundary = len(words[-1]) - len(suffix)
154 |             return words[:-1] + [words[-1][:-len(suffix)]] + [words[-1][-len(suffix):]]
155 |     return words
156 | 
157 | # split each word in words apart if it appears in the dictionary of compounds
158 | def split_preannotated_compounds(words):
159 |     result = []
160 |     for i in range(len(words)):
161 |         if words[i].lower() in compound_dict:
162 |            result += compound_dict[words[i].lower()]
163 |         else:
164 |             result += [words[i]]
165 |     return result
166 |             
167 |     
168 | ORTHOGRAPHIC_COMPOUND_MARKER = '-' # the symbol in Finnish orthography marking compound boundaries
169 | 
170 | # combine subminimal morphs (CVC*)
171 | def attach_subminimal_morphs(morphs):
172 | 	# if first morph is subminimal, attach to the right
173 | 	if not potential_word_index_to_end(morphs[0], 0) and len(morphs) > 1:
174 | 		morphs[1] = morphs[0] + morphs[1]
175 | 		del morphs[0]
176 | 	# for all other morphs, attach to the left if subminimal
177 | 	i = 1
178 | 	while i < len(morphs):
179 | 		if not potential_word_index_to_end(morphs[i], 0):
180 | 			morphs[i-1] += morphs[i]
181 | 			del morphs[i]
182 | 		else:
183 | 			i += 1
184 | 	return morphs
185 | 
186 | # make an annotation for a word
187 | def make_annotation(word):
188 |     words = [word]
189 |     words = split_initial_compounds(words)
190 |     words = words[:-1] + words[-1].split(ORTHOGRAPHIC_COMPOUND_MARKER)
191 |     words = split_suffix(words)
192 |     words = split_preannotated_compounds(words)
193 |     words = attach_subminimal_morphs(words)
194 |     annotations = [Annotation(word) for word in words]
195 | 
196 |     for i in range(1, len(annotations)):
197 |         annotations[0].join(annotations[i])
198 |     return annotations[0]
199 | 
200 | # print a representation of an annotation for a word
201 | def print_annotation(word_annotation):
202 |     print annotation_string(word_annotation)
203 |     print pattern_string(word_annotation)
204 |     print
205 | 
206 | # return the string annotation of a word (in IPA)
207 | def ipa_annotation(word):
208 |     return to_ipa(annotation_string(make_annotation(word)))[1:-1]
209 | 
210 | def annotation_string(word_annotation):
211 |     result = ''
212 |     for i in range(len(word_annotation.stresses)):
213 |         result += SYLLABLE_SEPARATOR
214 |         for j in range(len(word_annotation.syllables)):
215 |             # mark stresses
216 |             if word_annotation.stresses[i][j] == Stress.primary:
217 |                 result += u'ˈ'
218 |             elif word_annotation.stresses[i][j] == Stress.secondary:
219 |                 result += u'ˌ'
220 |             # add syllable content and separator
221 |             result += word_annotation.syllables[j] + SYLLABLE_SEPARATOR
222 |         result += '\n'
223 |         break # TEMPORARY, RETURN ONLY ONE ANNOTATION
224 |     return result[:-1] # remove final newline
225 | 
226 | # return a string representing the weight pattern
227 | # e.g. the weights for ".´ny.ky.`en.nus.te." are represented 'LLHHL'
228 | def syll_pattern(weights):    
229 |     result = ''    
230 |     for w in weights:
231 |         result += Weight.dict[w]            
232 |     return result
233 | 
234 | # return a string representing the stress pattern
235 | # e.g. the stresses for ".´ny.ky.`en.nus.te." are represented 'PUSUU'
236 | def stress_pattern(stresses):    
237 |     result = ''    
238 |     for i in range(len(stresses)):
239 |         for s in stresses[i]:
240 |             result += Stress.dict[s]
241 |         result += ', '            
242 |     return result[:-2] # remove last comma and space
243 | 
244 | # return a string representing the sonority pattern
245 | # e.g. the sonority for taloiden is represented 'AAI'
246 | def sonority_pattern(sonorities):
247 |     result = ''
248 |     for s in sonorities:
249 |         result += s
250 |     return result
251 | 
252 | # print a representation of the weights and stresses
253 | def pattern_string(word_annotation):
254 |     return 'Weight: ' + syll_pattern(word_annotation.weights) + '   Stress: ' + stress_pattern(word_annotation.stresses) + '   Sonority: ' + sonority_pattern(word_annotation.sonorities)


--------------------------------------------------------------------------------
/lib/fi/finnish_functions.py:
--------------------------------------------------------------------------------
 1 | ﻿# coding=utf-8
 2 | # symbol to demarcate syllable boundaries; should be one character
 3 | SYLLABLE_SEPARATOR = u'.'
 4 | 
 5 | # consonant clusters that should be kept together, following Karlsson 1985: (4)
 6 | # note: as suggested by Anttila (p.c.), the consonants are split if there is only one vowel to the left or right of the consonants
 7 | # this works because Karlsson's rule is designed to handle compounds, but Finnish words must contain at least two vocalic moras
 8 | CLUSTERS = set([u'bl', u'br', u'dr', u'fl', u'fr', u'gl', u'gr', u'kl', u'kr', u'kv', u'pl', u'pr', u'cl', u'qv', u'schm'])
 9 | CLUSTER_LENGTHS = set(len(cluster) for cluster in CLUSTERS)
10 | 
11 | # sets of Finnish vowels, diphthongs, and consonants
12 | VOWELS = set([u'i', u'e', u'ä', u'y', u'ö', u'a', u'u', u'o'])
13 | DIPHTHONGS = set([u'ai', u'ei', u'oi', u'äi', u'öi', u'au', u'eu', u'ou', u'ey', u'äy', u'öy', u'ui', u'yi', u'iu', u'iy', u'ie', u'uo', u'yö'])
14 | CONSONANTS = set([u'b', u'c', u'd', u'f', u'g', u'h', u'j', u'k', u'l', u'm', u'n', u'p', u'q', u'r', u's', u't', u'v', u'w', u'x', u'z', u"'"]) # ' included for purposes of words like vaa'an
15 | 
16 | # following Anttila 2008 on Finnish stress (p. 5)
17 | SON_HIGH = set([u'i', u'e', u'u', u'y'])
18 | SON_LOW = set([u'a', u'ä', u'o', u'ö'])
19 | 
20 | def is_vowel(ch):
21 |     return ch in VOWELS
22 | 
23 | def is_consonant(ch):
24 |     return ch in CONSONANTS
25 | 
26 | def is_cluster(ch):
27 |     return ch in CLUSTERS
28 | 
29 | def is_diphthong(chars):
30 |     return chars in DIPHTHONGS
31 | 
32 | def is_long(chars):
33 |     return chars[0] == chars[1] # no error checking
34 | 
35 | # in a split syllable, the onset is the 0th element, the nucleus is the 1st, and the coda is the 2nd
36 | class Syllable:
37 |     onset = 0
38 |     nucleus = 1
39 |     coda = 2
40 | 
41 | # syllable weights in increasing order of weight, for purposes of deciding which syllable to stress in a sequence of two syllables
42 | class Weight:
43 |     CV = 0 # (C)V
44 |     CVC = 1 # (C)VC+
45 |     CVV = 2 # (C)VV+C*
46 |     
47 |     dict = {CV:'L', CVC:'H', CVV:'H'}
48 | 
49 | # return true if weight is greater than the weight of a light syllable
50 | def is_heavy(weight):
51 |     return weight > Weight.CV
52 | 
53 | # return true if weight1 is greater than weight2
54 | def is_heavier(weight1, weight2):
55 |     return weight1 > weight2
56 | 
57 | # modelled after CMU Pronouncing Dictionary
58 | class Stress:
59 |     none = 0
60 |     primary = 1
61 |     secondary = 2
62 |     dict = {none:'U', primary:'P', secondary:'S'}
63 | 
64 | # given a single syllable, split it into a list of its onset, nucleus, and coda
65 | def split_syllable(syllable):
66 |     result = []
67 |     i = 0
68 |     while i < len(syllable) and is_consonant(syllable[i].lower()):
69 |         i += 1
70 |     nucleus_start = i
71 |     result += [syllable[0:nucleus_start]] # store onset (composed of consonants)
72 |     while i < len(syllable) and is_vowel(syllable[i].lower()):
73 |         i += 1
74 |     coda_start = i  
75 |     result += [syllable[nucleus_start:coda_start]] # store nucleus (composed of vowels)
76 |     result += [syllable[coda_start:]] # store coda (composed of consonants)
77 |     return result
78 | 
79 | def to_ipa(word):
80 | 	return word.replace(u'a', u'ɑ').replace(u'ä', u'æ').replace(u'ö', u'ø')


--------------------------------------------------------------------------------
/lib/fi/finnish_sonority.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from finnish_functions import *
 3 | 
 4 | # return the sonority of a syllable
 5 | def get_sonority(vowel):
 6 |     if len(vowel) == 0:
 7 |         return '?' # no vowel in this "syllable"
 8 |     return vowel[0].upper()
 9 | 
10 | def make_sonorities(split_sylls):
11 |     return [get_sonority(syll[Syllable.nucleus]) for syll in split_sylls]


--------------------------------------------------------------------------------
/lib/fi/finnish_stress.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from finnish_functions import *
 3 | 
 4 | from copy import deepcopy
 5 | 
 6 | stress_dict = {} # map between words and their hand-annotated for stress
 7 | 
 8 | # given a list of weights, and a list of equal length of stresses, store added stress markings in stresses
 9 | def make_stresses(weights):
10 |     stresses = []
11 |     if len(weights) == 1 and not is_heavier(weights[0], Weight.CVC):
12 |         return [[Stress.none]]
13 |     if len(weights) > 0:
14 |         stresses += [Stress.primary]     
15 |     for i in range(1, len(weights)):
16 |         stresses += [Stress.none]
17 |     
18 |     stress_parity = 0 # currently stressing odd syllables, located at even indices
19 |     
20 |     # first syllable is always stressed, and following syllable is never stressed, so start with third syllable
21 |     i = 2
22 |     while i < len(weights) - 1:
23 |         # if at a syllable to potentially be stressed
24 |         if i % 2 == stress_parity:
25 |             # shift stress forward one if following syllable is already stressed (to avoid clash), or if the following syllable is non-final and heavier
26 |             if stresses[i+1] != Stress.none or (is_heavier(weights[i+1], weights[i]) and i+1 < len(weights) - 1):
27 |                 stresses[i+1] = Stress.secondary
28 |                 i += 1
29 |                 stress_parity = (stress_parity + 1) % 2 # swap which parity to stress on, since stress assignment continues from stressed syllable
30 |             else:
31 |                 stresses[i] = Stress.secondary
32 |         i += 2 # can ignore syllable after the one just stressed, since it won't be stressed to avoid clash
33 |     stresses = [stresses]
34 |     
35 |     # optionally stress a final heavy where appropriate, and if the preceding syllable is light and stressed make its stress optional
36 |     if len(weights) > 1 and is_heavy(weights[-1]):
37 |         if stresses[0][-2] == Stress.none:
38 |             stresses += deepcopy(stresses)
39 |             stresses[1][-1] = Stress.secondary
40 |         elif stresses[0][-2] == Stress.secondary and not is_heavy(weights[-2]):
41 |             stresses += deepcopy(stresses)
42 |             stresses[1][-1] = Stress.secondary
43 |             stresses[1][-2] = Stress.none
44 |     return stresses
45 | 


--------------------------------------------------------------------------------
/lib/fi/finnish_syllables.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | from finnish_functions import *
  3 | 
  4 | # -*- coding: utf-8 -*-
  5 | 
  6 | # note: initial consonant cluster not listed in inseparable clusters will be split up (e.g., traffic -> .t.raf.fic.)
  7 | 
  8 | # for a word w, syllable boundaries are represented by a list l of length len(w)+1
  9 | # l[i] = 1 iff w[i] should be preceded by a syllable boundary, else l[i] = 0
 10 | # thus, the first and last elements of l are always 1 (since words are surrounded by syllable boundaries)
 11 | 
 12 | # fill dict with key-value pairs where key is an entry from entries with the hyphens removed,
 13 | # value is a list representing syllable boundaries as described above
 14 | def initialize_dict(dict, entries, separator):
 15 |     for entry in entries:
 16 |         hyphen_free = entry.replace(separator, '').lower()
 17 |         boundary_list = [1]
 18 |         i = 1
 19 |         while i < len(entry):
 20 |             if entry[i] == separator:
 21 |                 boundary_list += [1]
 22 |                 i += 1
 23 |             else:
 24 |                 boundary_list += [0]
 25 |             i += 1
 26 |         dict[hyphen_free] = boundary_list + [1]
 27 | 
 28 | # initialize a dictionary from a file
 29 | # the first line of the file is the separator character
 30 | # the remaining lines are words with separations marked by the separator character
 31 | def initialize_dict_from_file(dict, filename):  
 32 |     try:  
 33 |         f = open(filename, 'r')
 34 |         entries = f.readlines()
 35 |         f.close()
 36 |         for i in range(len(entries)-1):
 37 |             entries[i] = entries[i][:-1] # remove final newline character
 38 |         separator = entries[0]
 39 |         entries = entries[1:]
 40 |         initialize_dict(dict, entries, separator)
 41 |     except IOError:
 42 |         print "Error: File not found."
 43 | 
 44 | pre_sep_dict = {} # map between words that have been hand-annotated and their annotations
 45 | 
 46 | # initialize the presyllabified words from a file of format described above
 47 | def initialize_presyllabified(filename):
 48 |     initialize_dict_from_file(pre_sep_dict, filename)
 49 | 
 50 | vowel_seq_dict = {} # map between sequences of three and for vowels and their syllabifications [modelled after Karlsson 1985: (2b), but using Karlsson 1982 T-5/T-7 to deal with 'ieu', 'uoi']
 51 | VOWEL_SEQUENCES = ['ai-oi', 'ai-ui', 'au-oi', 'eu-oi', 'ie-oi', 'ie-ui', 'oi-oi', 'oi-ui', 'uo-ui', 'yö-yi', 'a-ei', 'a-oi', 'e-ai', 'e-oi', 'e-äi', 'e-öi', 'i-ai', 'i-au',
 52 |                    'i-oi', 'i-äi', 'i-öi', 'o-ai', 'u-ai', 'u-ei', 'u-oi', 'y-ei', 'y-äi', 'ä-yi', 'ä-öi', 'ai-a', 'ai-e', 'ai-o', 'ai-u', 'au-a', 'au-e', 'eu-a', 'ie-a', 'ie-o', 'ie-u', 'ie-y',
 53 |                    'i-o-a', 'i-o-e', 'i-ö-e', 'i-ö-ä', 'iu-a', 'iu-e', 'iu-o', 'oi-a', 'oi-e', 'oi-o', 'oi-u', 'ou-e', 'ou-o', 'u-e-a', 'ui-e', 'uo-a', 'uo-u', 'y-e-ä', 'yö-e', 'äi-e']
 54 | initialize_dict(vowel_seq_dict, VOWEL_SEQUENCES, '-')
 55 | 
 56 | # return the index of the start of the first long vowel in chars; -1 if absent
 57 | def locate_long(chars):
 58 |     for i in range(len(chars)-1):
 59 |         if is_long(chars[i:i+2]):
 60 |             return i
 61 |     return -1
 62 | 
 63 | # diphthongs and long vowels should not be split
 64 | def is_inseparable_vowels(chars):
 65 |     return is_diphthong(chars) or is_long(chars)
 66 | 
 67 | # return true if chars is an inseparable cluster or a lone consonant
 68 | def consonantal_onset(chars):
 69 |     return is_cluster(chars) or is_consonant(chars)
 70 | 
 71 | # applied Karlsson (3c); only checks for 'ien', since others are handled by vowel splitting rules
 72 | # word-final 'ien' will be syllabified 'i-en', unless following a 't'
 73 | def apply_3c(word, boundary_list):
 74 |     sequence = 'ien'
 75 |     seq_len = len(sequence)
 76 |     if len(word) > seq_len:
 77 |         if word[-seq_len:] == sequence and word[-(seq_len+1)] != 't':
 78 |             boundary_list[-3] = 1 # last entry is for word-final syllable boundary
 79 | 
 80 | # Karlsson 1982: T-4 applies to diphthongs ending in 'u' and 'y'
 81 | t4_final_v = ['u', 'y']
 82 | t4_diphthongs = set(vv for vv in DIPHTHONGS if vv[-1] in t4_final_v)
 83 | 
 84 | # apply rule T-4 from Karlsson 1982 to two vowels, assuming the word is already syllabified
 85 | def apply_t4(word, boundary_list):
 86 |     for i in range(3, len(word)): # check for rule application at syllable boundary (including word end); first possible boundary at index 3 (VVC-)
 87 |         if boundary_list[i] == 1:
 88 |             # if syllable ends in a T-4 diphthong followed by a consonant, introduce split in former diphthong
 89 |             if is_consonant(word[i-1]) and word[i-3:i-1] in t4_diphthongs:
 90 |                     boundary_list[i-2] = 1
 91 |     return word
 92 | 
 93 | # return vowels with syllable boundaries for appropriate separations
 94 | def separate_vowels(vowels, boundary_list, start):
 95 |     v_len = len(vowels)    
 96 |     if v_len == 2 and not is_inseparable_vowels(vowels):          
 97 |         boundary_list[start+1] = 1 # insert boundary before the second vowel      
 98 |     elif v_len > 2:
 99 |         if vowels in vowel_seq_dict:
100 |             # store information from vowel sequence dictionary; ignore first entry, as the dictionary does not know if a syllable boundary precedes the vowel sequence
101 |             boundary_list[start+1:start+v_len] = vowel_seq_dict[vowels][1:-1] # ignore initial syllable separator and first vowel
102 |         else:
103 |             # first look for long vowels, following Karlsson 1985: (2a)
104 |             boundary = locate_long(vowels)
105 |             if boundary != -1:
106 |                 # if long vowel starts the sequence, separation should precede the third vowel; otherwise it should procede the location of the long vowel
107 |                 if boundary == 0:
108 |                     boundary = 2
109 |                     separate_vowels(vowels[boundary:], boundary_list, start+boundary) # syllabify vowels following long vowel
110 |                 else:
111 |                     separate_vowels(vowels[:boundary], boundary_list, start) # syllabify vowels preceding long vowel            
112 |                 boundary_list[start + boundary] = 1 # split vowel from long vowel
113 |             else: # if no such sequence, simply separate all separable VV sequences
114 |                 for i in range(len(vowels)-1):
115 |                     if not is_inseparable_vowels(vowels[i:i+2]):
116 |                         boundary_list[start + (i + 1)] = 1 # insert boundary before the second vowel
117 | 
118 | # return the syllabification of word, preserving capitalization; syllable boundaries are placed at the start and end of the word
119 | def make_syllables(word):    
120 |     entry = word.lower()
121 |     boundary_list = [1]
122 |     
123 |     if entry in pre_sep_dict: # introduces annotations, but will still be syllabified so that only partial annotations are required
124 |         boundary_list = pre_sep_dict[entry]
125 |     else:
126 |         for i in range(1, len(entry)):
127 |             boundary_list += [0 if entry[i] != "'" else 1]
128 |         boundary_list += [1]
129 | 
130 |     make_splits(entry + SYLLABLE_SEPARATOR, boundary_list) # syllable separator added to ensure that final vowel sequence is syllabified
131 |     syllables = introduce_splits(word, boundary_list) 
132 |     return syllables
133 | 
134 | # return a string with the syllable boundaries represented in syllabified_word but the capitalization represented in original_word
135 | def introduce_splits(word, boundary_list):
136 |     result = []
137 |     start = 0
138 |     end = 0
139 |     while end < len(word):
140 |         end += 1
141 |         if boundary_list[end] == 1:            
142 |             if word[start] == "'":
143 |                 result += [word[start+1:end]] # do not start a syllable with '                
144 |             else:
145 |                 result += [word[start:end]]                
146 |             start = end        
147 |     return result
148 | 
149 | 
150 | # account for Karlsson 1985: (4); certain consonants should be clusters
151 | # stored in order: test clusters first, then the basic CV-rule
152 | onset_lengths = [cluster_length for cluster_length in CLUSTER_LENGTHS]
153 | onset_lengths += [1]
154 | 
155 | # return true if word has at least two moras starting from index to the end
156 | def potential_word_index_to_end(word, index):
157 | 	vowel_count = 0
158 | 	for ch in word[index:]:
159 | 		if is_vowel(ch):
160 | 			vowel_count += 1
161 | 			if vowel_count > 1:
162 | 				return True
163 | 	return False
164 | 
165 | # store syllable boundaries in boundary_list
166 | def make_splits(word, boundary_list):
167 |     # stores the location of the start and end of the longest vowel sequence encountered so far
168 |     v_seq_start = 0
169 |     v_seq_end = 0    
170 |     # number of vowels encountered to avoid splitting initial consonant clusters
171 |     # also decides whether to split a cluster as if it were a compound boundary
172 |     # (words must have at least two moras)
173 |     vowel_count = 0    
174 |     for i in range(len(word)):        
175 |         if is_vowel(word[i]): # continuing or starting vowel sequence            
176 |             vowel_count += 1
177 |             v_seq_end += 1            
178 |             # deals with initial consonant clusters
179 |             if vowel_count == 1:
180 |                 continue
181 |             # potential application of CV-rule [Karlsson 1985: (1)]
182 |             if v_seq_end - v_seq_start == 1:                
183 |                 # test possible onsets
184 |                 for onset_length in onset_lengths:
185 |                     cluster_start = i - onset_length
186 |                     # if encounter a good boundary, only insert separator if not already present; break regardless so that basic CV won't apply if appropriate cluster exists
187 |                     if cluster_start >= 0 and consonantal_onset(word[cluster_start:i]):                        
188 |                         # do not split as if a compound if the preceding word would have less than 2 moras
189 |                         if onset_length > 1:
190 |                             if vowel_count < 3: # an extra vowel has been added, so 2 moras plus 1
191 |                                 continue                        
192 |                         no_syllable_break = True                        
193 |                         for h_index in range(cluster_start, i):                            
194 |                             if boundary_list[h_index] == 1:
195 |                                 no_syllable_break = False                        
196 |                         if no_syllable_break:
197 |                             if onset_length > 1:
198 |                                 # do not split as if cluster if no word to the right
199 |                                 if not potential_word_index_to_end(word, i):
200 |                                 	continue
201 |                                 vowel_count = 1 # split as if cluster, starting new word having encountered one vowel
202 |                             boundary_list[cluster_start] = 1                            
203 |                         break                    
204 |         else: # vowel sequence interrupted; if there is a sequence to be split, deal with it            
205 |             if v_seq_end - v_seq_start > 1:                
206 |                 separate_vowels(word[v_seq_start:v_seq_end], boundary_list, v_seq_start)
207 |             v_seq_start = v_seq_end = i+1 # vowel sequence (if any) starts after current index
208 |     apply_3c(word[:-1], boundary_list) # chop off final syllable separator
209 |     apply_t4(word, boundary_list)
210 | 


--------------------------------------------------------------------------------
/lib/fi/finnish_weight.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from finnish_functions import *
 3 | 
 4 | # return the syllable weight of a single syllable
 5 | def syll_weight(syll_split):
 6 |     
 7 |     if len(syll_split[Syllable.nucleus]) > 1: # if the nucleus is long, heaviest
 8 |         return Weight.CVV
 9 | 
10 |     elif len(syll_split[Syllable.coda]) > 0: # if a coda is present, heavy
11 |         return Weight.CVC
12 | 
13 |     else:
14 |         return Weight.CV # light
15 | 
16 | # given a list of syllables, store their weights in weights
17 | def make_weights(syllables):
18 | 
19 |     weights = []
20 |     
21 |     for syll in syllables:
22 |         weights += [syll_weight(syll)]
23 | 
24 |     return weights
25 | 


--------------------------------------------------------------------------------
/lib/fi/function_words.txt:
--------------------------------------------------------------------------------
 1 | minä
 2 | minun
 3 | mun
 4 | mua
 5 | sinä
 6 | sinun
 7 | sun
 8 | sua
 9 | hän
10 | hänen
11 | häntä
12 | se
13 | sen
14 | sitä
15 | me
16 | meidän
17 | meitä
18 | te
19 | teidän
20 | teitä
21 | he
22 | heidän
23 | heitä
24 | ne
25 | niiden
26 | niitä
27 | läpi
28 | ohi
29 | yli


--------------------------------------------------------------------------------
/lib/fi/initial.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsfalk/prosodic1b/42417e40264a42c0e4af0b1e4a3dc0f0bb207849/lib/fi/initial.txt


--------------------------------------------------------------------------------
/lib/fi/presyllabified.txt:
--------------------------------------------------------------------------------
1 | -


--------------------------------------------------------------------------------
/lib/fi/suffix.txt:
--------------------------------------------------------------------------------
 1 | logi
 2 | login
 3 | logia
 4 | logina
 5 | logiksi
 6 | logissa
 7 | logista
 8 | logiin
 9 | logilla
10 | logilta
11 | logille
12 | logeitta
13 | logein
14 | logit
15 | logien
16 | logeja
17 | logeina
18 | logeiksi
19 | logeissa
20 | logeista
21 | logeihin
22 | logeilla
23 | logeilta
24 | logeille


--------------------------------------------------------------------------------
/lib/fi/unstressed.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsfalk/prosodic1b/42417e40264a42c0e4af0b1e4a3dc0f0bb207849/lib/fi/unstressed.txt


--------------------------------------------------------------------------------
/prosodic.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | 
  3 | import codecs, sys
  4 | import Language
  5 | from Text import Text
  6 | from Word import Word
  7 | from Syllable import Syllable
  8 | from Phoneme import Phoneme
  9 | from ipa import *
 10 | 
 11 | def print_intro():
 12 | 	print "#################"
 13 | 	print "# Prosodic 1.0b #"
 14 | 	print "#################"
 15 | 	print
 16 | 
 17 | def print_main_instructions():
 18 | 	print
 19 | 	print "Please type one of the following commands:"
 20 | 	print "/annotation: convert text to IPA"
 21 | 	print "/ipa: work with IPA features"
 22 | 	print "/exit or /quit: exit the program"
 23 | 	print
 24 | 
 25 | def print_annotation_instructions():
 26 | 	print
 27 | 	print "Current language: " + Language.selected_language.name 
 28 | 	print "Please type one of the following commands:"
 29 | 	print "/file [filename]: convert a file to IPA and dispaly the output"
 30 | 	print "/file [input] [output]: convert a file to IPA and store the output"
 31 | 	print "/language [lang]: change the language for annotation (" + ", ".join(Language.language_dict.keys()) + ")"
 32 | 	print "/main: return to the main menu"
 33 | 	print "or enter " + Language.selected_language.name + " text to display an IPA transcription"
 34 | 	print
 35 | 
 36 | def print_ipa_instructions():
 37 | 	print
 38 | 	print "Please type one of the following commands:"
 39 | 	print "/class [features]: display all phonemes belonging to a natural class"
 40 | 	print "(e.g.: /class +labial -voice)"
 41 | 	print "/features: display a list of all phonological features in the system"
 42 | 	print "/main: return to the main menu"
 43 | 	print "or enter a phoneme to display its feature values"
 44 | 	print
 45 | 
 46 | def process_files(filenames):
 47 | 	if len(filenames) < 1 or len(filenames) > 2:
 48 | 		print "ERROR: enter one or two filenames"
 49 | 		return
 50 | 	infile = filenames[0]
 51 | 	try:
 52 | 		input = codecs.open(infile, 'r', 'utf-8')
 53 | 		raw = input.read()
 54 | 		input.close()
 55 | 		text = Text(raw)
 56 | 		if len(filenames) == 1:
 57 | 			print text
 58 | 		elif len(filenames) == 2:
 59 | 			outfile = filenames[1]
 60 | 			output = codecs.open(outfile, 'w', 'utf-8')
 61 | 			output.write(unicode(str(text), 'utf-8'))
 62 | 			output.close()
 63 | 	except IOError:
 64 | 		print "ERROR: could not open input file (" + infile + ")"
 65 | 
 66 | def run_annotation():
 67 | 	print_annotation_instructions()
 68 | 	while True:
 69 | 		print ">>",
 70 | 		input = unicode(raw_input(), 'utf-8')
 71 | 		if input.startswith("/help"):
 72 | 			print_annotation_instructions()
 73 | 			continue
 74 | 		elif input == "/main":
 75 | 			return
 76 | 		elif input == "/quit" or input == "/exit":
 77 | 			sys.exit()
 78 | 		elif input.startswith("/language"):
 79 | 			language = input[len("/language"):].strip() # remove the command
 80 | 			if language in Language.language_dict:
 81 | 				Language.selected_language = Language.language_dict[language]
 82 | 				print "Language set to " + language
 83 | 			else:
 84 | 				print "ERROR: invalid language choice"
 85 | 		elif input.startswith("/file "):
 86 | 			files = input.split(' ')
 87 | 			process_files(files[1:]) # remove the command
 88 | 		else:
 89 | 			text = Text(input)
 90 | 			print text
 91 | 		print
 92 | 
 93 | def run_ipa():
 94 | 	print_ipa_instructions()
 95 | 	while True:
 96 | 		print ">>",
 97 | 		input = unicode(raw_input(), 'utf-8')
 98 | 		if input.startswith("/help"):
 99 | 			print_ipa_instructions()
100 | 			continue
101 | 		elif input == "/main":
102 | 			return
103 | 		elif input == "/quit" or input == "/exit":
104 | 			sys.exit()
105 | 		elif input.startswith("/class"):
106 | 			input = input[len("/class"):] # remove the command
107 | 			natural_class = get_natural_class(input)
108 | 			print ", ".join(natural_class)
109 | 		elif input == "/features":
110 | 			print "Vowel features:", ", ".join(vowel_features)
111 | 			print "Consonant features:", ", ".join(consonant_features)
112 | 		elif input in ipa_to_features:
113 | 			print phoneme_feature_string(input)
114 | 		else:
115 | 			print "ERROR: invalid command or unknown phoneme"
116 | 		print
117 | 
118 | def run_program():
119 | 	print_intro()
120 | 	while True:
121 | 		print_main_instructions()
122 | 		print ">>",
123 | 		input = raw_input()
124 | 		if input == "/annotation":
125 | 			run_annotation()
126 | 		elif input == "/ipa":
127 | 			run_ipa()
128 | 		elif input == "/quit" or input == "/exit":
129 | 			return
130 | 		else:
131 | 			print "ERROR: invalid command"
132 | 
133 | def print_command_instructions():
134 | 	print "Usage: python prosodic.py [infile] [outfile]"
135 | 
136 | if __name__ == "__main__":
137 | 	if len(sys.argv) == 1:
138 | 		run_program()
139 | 	elif len(sys.argv) <= 3: # if given one or two filenames
140 | 		process_files(sys.argv[1:])
141 | 	else:
142 | 		print_command_instructions()


--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | def separate_words(text):
4 | 	words = re.findall('\s*([^\s\d]+)\s*', text, re.UNICODE) # find sequences of letters and punctuation, separated by whitespace
5 | 	words = [''.join(re.findall("[\w']", word, re.UNICODE)) for word in words] # eliminate punctuation, except apostrophe
6 | 	return words


--------------------------------------------------------------------------------