├── LICENSE
├── README.md
├── convert_label.py
└── convertwav_to_16khz.praat


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Syuparn
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # TextGridConverter
 2 | 
 3 | convert .lab files to .TextGrid files, which can be used in Praat
 4 | 
 5 | ## Description
 6 | 
 7 | TextGridConverter converts .lab files generated by Julius segmentation kit to .TextGrid files,
 8 | which can be used in Praat. With this code and Julius segmentation kit, you can get phonetic
 9 | labels of .wav files for Praat automatically.
10 | 
11 | ## Features
12 | 
13 | - convert all .lab files in a directory (recursively)
14 | - method to change segmentation unit (phonemes to moras)
15 | 
16 | ## Requirement
17 | 
18 | - Python 3.6 or later
19 | 
20 | ## Usage
21 | 
22 | 1. To convert .lab files in directory dir/to/path:
23 | 
24 | ```
25 | python convert_label.py dir/to/path
26 | ```
27 | 
28 | 2. Choose whether the segmentation unit is phoneme or mora
29 | 
30 | ```
31 | change segmentation unit to mora? (default:phoneme) y/n:
32 | ```
33 | 
34 | 3. This converts all .lab files in dir/to/path to .TextGrid files (recursively)
35 | 
36 | ## Installation
37 | 
38 |     $ git clone https://github.com/Syuparn/TextGridConverter
39 | 
40 | ## Anything Else
41 | 
42 | Before using Julius segmentation kit, you should resample .wav files to 16kHz.
43 | convertwav_to_16khz.praat will help you to resample. This converts all wav files
44 | in a directory.
45 | 
46 | ## Author
47 | 
48 | syuparn
49 | 
50 | ## License
51 | 
52 | [MIT](http://b4b4r07.mit-license.org)


--------------------------------------------------------------------------------
/convert_label.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import sys
  4 | 
  5 | 
  6 | class ExtentionException(Exception):
  7 |     pass
  8 | 
  9 | class EmptyLabelException(Exception):
 10 |     pass
 11 | 
 12 | 
 13 | class Segment:
 14 |     """
 15 |     a unit of speech (i.e. phoneme, mora)
 16 |     """
 17 |     def __init__(self, tStart, tEnd, label):
 18 |         self.tStart = tStart
 19 |         self.tEnd = tEnd
 20 |         self.label = label
 21 | 
 22 |     def __add__(self, other):
 23 |         return Segment(self.tStart, other.tEnd, self.label + other.label)
 24 | 
 25 |     def can_follow(self, other):
 26 |         """
 27 |         return True if Segment self can follow Segment other in one mora,
 28 |         otherwise return False
 29 |         example: (other, self)
 30 |              True: ('s', 'a'), ('sh', 'i'), ('ky', 'o:'), ('t', 's')
 31 |              False: ('a', 'q'), ('a', 's'), ('u', 'e'), ('s', 'ha')
 32 |         """
 33 |         vowels = ['a', 'i', 'u', 'e', 'o', 'a:', 'i:', 'u:', 'e:', 'o:']
 34 |         consonants = ['w', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j',
 35 |                       'k', 'z', 'c', 'b', 'n', 'm']
 36 |         only_consonants = lambda x: all([c in consonants for c in x])
 37 |         if only_consonants(other.label) and self.label in vowels:
 38 |             return True
 39 |         if only_consonants(other.label) and only_consonants(self.label):
 40 |             return True
 41 |         return False
 42 | 
 43 |     def to_textgrid_lines(self, segmentIndex):
 44 |         label = '' if self.label in ['silB', 'silE'] else self.label
 45 |         return [f'        intervals [{segmentIndex}]:',
 46 |                 f'            xmin = {self.tStart} ',
 47 |                 f'            xmax = {self.tEnd} ',
 48 |                 f'            text = "{label}" ']
 49 | 
 50 | 
 51 | def read_lab(filename):
 52 |     """
 53 |     read label file (.lab) generated by Julius segmentation kit and 
 54 |     return SegmentationLabel object
 55 |     """
 56 |     try:
 57 |         if not re.search(r'\.lab$', filename):
 58 |             raise ExtentionException("read_lab supports only .lab")
 59 |     except ExtentionException as e:
 60 |         print(e)
 61 |         return None
 62 |         
 63 |     with open(filename, 'r') as f:
 64 |         labeldata = [line.split() for line in f if line != '']
 65 |         segments = [Segment(tStart=float(line[0]), tEnd=float(line[1]), 
 66 |                             label=line[2])
 67 |                     for line in labeldata]
 68 |         return SegmentationLabel(segments)
 69 | 
 70 | 
 71 | class SegmentationLabel:
 72 |     """
 73 |     list of segments
 74 |     """
 75 |     def __init__(self, segments, separatedByMora=False):
 76 |         self.segments = segments
 77 |         self.separatedByMora = separatedByMora
 78 | 
 79 |     def by_moras(self):
 80 |         """
 81 |         return new SegmentationLabel object whose segment are moras 
 82 |         """
 83 |         if self.separatedByMora == True:
 84 |             return self
 85 | 
 86 |         moraSegments = []
 87 |         curMoraSegment = None
 88 |         for segment in self.segments:
 89 |             if curMoraSegment is None:
 90 |                 curMoraSegment = segment
 91 |             elif segment.can_follow(curMoraSegment):
 92 |                 curMoraSegment += segment
 93 |             else:
 94 |                 moraSegments.append(curMoraSegment)
 95 |                 curMoraSegment = segment
 96 |         if curMoraSegment:
 97 |             moraSegments.append(curMoraSegment)
 98 |         return SegmentationLabel(moraSegments, separatedByMora=True)
 99 | 
100 |     def _textgrid_headers(self):
101 |         segmentKind = 'mora' if self.separatedByMora else 'phoneme'
102 |         return ['File type = "ooTextFile"',
103 |                 'Object class = "TextGrid"',
104 |                 ' ',
105 |                 'xmin = 0 ',
106 |                f'xmax = {self.segments[-1].tEnd} ',
107 |                 'tiers? <exists> ',
108 |                 'size = 1 ',
109 |                 'item []: ',
110 |                 '    item [1]: ',
111 |                 '        class = "IntervalTier" ',
112 |                f'        name = "{segmentKind}" ',
113 |                 '        xmin = 0 ',
114 |                f'        xmax = {self.segments[-1].tEnd} ',
115 |                f'        intervals: size = {len(self.segments)} ']
116 | 
117 |     def to_textgrid(self, textgridFileName):
118 |         """
119 |         save to .TextGrid file, which is available for Praat
120 |         """
121 |         try:
122 |             if not self.segments:
123 |                 raise EmptyLabelException(f'warning: no label data found in '
124 |                                           f'{textgridFileName}')
125 |         except EmptyLabelException as e:
126 |             print(e)
127 |             return
128 | 
129 |         textgridLines = self._textgrid_headers()
130 |         for i, segment in enumerate(self.segments):
131 |             textgridLines.extend(segment.to_textgrid_lines(i + 1))
132 |         with open(textgridFileName, 'w') as f:
133 |             f.write('\n'.join(textgridLines))
134 | 
135 | 
def parse_mora_answer(answer):
    """
    interpret the answer to the "change segmentation unit to mora?" prompt

    return True for 'y'/'Y', False for 'n'/'N' or an empty answer
    (phoneme is the advertised default), None for anything else
    """
    answer = answer.strip()
    if answer in ('y', 'Y'):
        return True
    if answer in ('', 'n', 'N'):
        return False
    return None


def main(argv=None):
    """
    convert every .lab file below a directory to a .TextGrid file

    argv has the shape of sys.argv (used when argv is None): argv[1], if
    present, is the directory to search recursively, otherwise the
    current directory is used. The segmentation unit is asked
    interactively. Each .TextGrid file is written next to its .lab file.
    """
    args = sys.argv if argv is None else argv
    mainDirectory = args[1] if len(args) >= 2 else os.curdir

    # keep asking until the answer is understood
    choosesMora = None
    while choosesMora is None:
        choosesMora = parse_mora_answer(
            input('change segmentation unit to mora?'
                  ' (default:phoneme) y/n:'))

    for dirPath, _dirNames, fileNames in os.walk(mainDirectory):
        for labFileName in fileNames:
            if not labFileName.endswith('.lab'):
                continue
            label = read_lab(os.path.join(dirPath, labFileName))
            # read_lab reports unsupported files by returning None
            if label is None:
                continue
            if choosesMora:
                label = label.by_moras()
            textgridFileName = labFileName[:-len('.lab')] + '.TextGrid'
            label.to_textgrid(os.path.join(dirPath, textgridFileName))


if __name__ == '__main__':
    main()
158 | 


--------------------------------------------------------------------------------
/convertwav_to_16khz.praat:
--------------------------------------------------------------------------------
## Change the sampling rate of .wav files to 16 kHz.
## Every .wav file in the chosen directory and in the directories listed
## below it is converted to mono, resampled and saved back to the path it
## was read from.

# start from an empty Info window; the path of each converted file is
# appended to it
clearinfo

form directory_path
	text directory
endform

# arguments: directory entered in the form, target sampling rate,
# nesting depth of this first call
@convertAllWav: directory$, 16000, 1
10 | 
11 | 
## Call convertWav recursively: convert the .wav files in .directory$,
## then do the same for every directory listed inside it.
##   .directory$    directory to process
##   .resampleRate  sampling rate handed on to convertWav
##   .nestDepth     depth of this call (the script starts with 1)
## NOTE(review): the state of each level is stored under the index
## '.nestDepth', presumably because the variables of a procedure are shared
## by all of its recursive calls - confirm against the Praat manual.
## NOTE(review): the Strings objects created here are never removed.

procedure convertAllWav: .directory$, .resampleRate, .nestDepth
	# remember the directory of this level under its depth index
	.directory$['.nestDepth'] = .directory$

	# convert wav files in current directory
	@convertWav: .directory$['.nestDepth'], .resampleRate
	
	# search inner directories
	.listname$ = "dir" + .directory$['.nestDepth']
	.strings['.nestDepth'] = Create Strings as directory list: .listname$, .directory$['.nestDepth']
	.numDirectories['.nestDepth'] = Get number of strings

	for .i['.nestDepth'] to .numDirectories['.nestDepth']
		# re-select this level's list; the nested call changes the selection
		selectObject: .strings['.nestDepth']
		.innerDirectoryName$ = Get string: .i['.nestDepth']
		.curDirectory$ = .directory$['.nestDepth'] + "/" + .innerDirectoryName$
		@convertAllWav: .curDirectory$, .resampleRate, .nestDepth + 1
	endfor
	# presumably undoes the "+ 1" of the call that is returning, so that the
	# caller's loop addresses its own depth index again - verify
	.nestDepth -= 1
endproc
33 | 
34 | 
## Resample the .wav files in .directory$ to .resampleRate.
## Each file is read, converted to mono, resampled and saved back to the
## path it was read from, so the original file is overwritten.
##   .directory$    directory whose *.wav files are converted
##                  (inner directories are not visited)
##   .resampleRate  new sampling frequency in Hz

procedure convertWav: .directory$, .resampleRate
	# make list object, which contains .wav file names
	.listName$ = "wav" + .directory$
	.strings = Create Strings as file list: .listName$, .directory$ + "/*.wav"
	.numFiles = Get number of strings

	for .i to .numFiles
		selectObject: .strings
		.fileName$ = Get string: .i
		.path$ = .directory$ + "/" + .fileName$
		.sound = Read from file: .path$

		# convert to mono
		.mono = Convert to mono

		# resample wav file (50 is the precision argument of Resample)
		.resampled = Resample: .resampleRate, 50
		appendInfoLine: .path$
		nowarn Save as WAV file: .path$

		# remove the objects made for this file so that they do not pile up
		# in the object list while a whole directory tree is converted
		removeObject: .sound, .mono, .resampled
	endfor

	# the file list is not needed any more
	removeObject: .strings
endproc
57 | 


--------------------------------------------------------------------------------