├── LICENSE ├── README.md ├── convert_label.py └── convertwav_to_16khz.praat /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Syuparn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TextGridConverter 2 | 3 | convert .lab files to .TextGrid files, which can be used in Praat 4 | 5 | ## Description 6 | 7 | TextGridConverter converts .lab files generated by Julius segmentation kit to .TextGrid files, 8 | which can be used in Praat. With this code and Julius segmentation kit, you can get phonetic 9 | labels of .wav files for Praat automatically. 10 | 11 | ## Features 12 | 13 | - convert all .lab files in a directory (recursively) 14 | - method to change segmantation unit (phonemes to moras) 15 | 16 | ## Requirement 17 | 18 | - Python 3.6 or more 19 | 20 | ## Usage 21 | 22 | 1. To convert .lab files in directory dir/to/path: 23 | 24 | ``` 25 | python convert_label.py dir/to/path 26 | ``` 27 | 28 | 2. Choose weather separate unit (phoneme or mora) is used 29 | 30 | ``` 31 | change segmentation unit to mora? (default:phoneme) y/n: 32 | ``` 33 | 34 | 3. This converts all .lab files in dir/to/path to .TextGrid files (recursively) 35 | 36 | ## Installation 37 | 38 | $ git clone https://github.com/Syuparn/TextGridConverter 39 | 40 | ## Anything Else 41 | 42 | Before using Julius segmentation kit, you should resample .wav files to 16kHz. 43 | convertwav_to_16khz.praat will help you to resample. This converts all wav files 44 | in a directory. 45 | 46 | ## Author 47 | 48 | syuparn 49 | 50 | ## License 51 | 52 | [MIT](http://b4b4r07.mit-license.org) -------------------------------------------------------------------------------- /convert_label.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | 5 | 6 | class ExtentionException(Exception): 7 | pass 8 | 9 | class EmptyLabelException(Exception): 10 | pass 11 | 12 | 13 | class Segment: 14 | """ 15 | a unit of speech (i.e. phoneme, mora) 16 | """ 17 | def __init__(self, tStart, tEnd, label): 18 | self.tStart = tStart 19 | self.tEnd = tEnd 20 | self.label = label 21 | 22 | def __add__(self, other): 23 | return Segment(self.tStart, other.tEnd, self.label + other.label) 24 | 25 | def can_follow(self, other): 26 | """ 27 | return True if Segment self can follow Segment other in one mora, 28 | otherwise return False 29 | example: (other, self) 30 | True: ('s', 'a'), ('sh', 'i'), ('ky', 'o:'), ('t', 's') 31 | False: ('a', 'q'), ('a', 's'), ('u', 'e'), ('s', 'ha') 32 | """ 33 | vowels = ['a', 'i', 'u', 'e', 'o', 'a:', 'i:', 'u:', 'e:', 'o:'] 34 | consonants = ['w', 'r', 't', 'y', 'p', 's', 'd', 'f', 'g', 'h', 'j', 35 | 'k', 'z', 'c', 'b', 'n', 'm'] 36 | only_consonants = lambda x: all([c in consonants for c in x]) 37 | if only_consonants(other.label) and self.label in vowels: 38 | return True 39 | if only_consonants(other.label) and only_consonants(self.label): 40 | return True 41 | return False 42 | 43 | def to_textgrid_lines(self, segmentIndex): 44 | label = '' if self.label in ['silB', 'silE'] else self.label 45 | return [f' intervals [{segmentIndex}]:', 46 | f' xmin = {self.tStart} ', 47 | f' xmax = {self.tEnd} ', 48 | f' text = "{label}" '] 49 | 50 | 51 | def read_lab(filename): 52 | """ 53 | read label file (.lab) generated by Julius segmentation kit and 54 | return SegmentationLabel object 55 | """ 56 | try: 57 | if not re.search(r'\.lab$', filename): 58 | raise ExtentionException("read_lab supports only .lab") 59 | except ExtentionException as e: 60 | print(e) 61 | return None 62 | 63 | with open(filename, 'r') as f: 64 | labeldata = [line.split() for line in f if line != ''] 65 | segments = [Segment(tStart=float(line[0]), tEnd=float(line[1]), 66 | label=line[2]) 67 | for line in labeldata] 68 | return SegmentationLabel(segments) 69 | 70 | 71 | class SegmentationLabel: 72 | """ 73 | list of segments 74 | """ 75 | def __init__(self, segments, separatedByMora=False): 76 | self.segments = segments 77 | self.separatedByMora = separatedByMora 78 | 79 | def by_moras(self): 80 | """ 81 | return new SegmentationLabel object whose segment are moras 82 | """ 83 | if self.separatedByMora == True: 84 | return self 85 | 86 | moraSegments = [] 87 | curMoraSegment = None 88 | for segment in self.segments: 89 | if curMoraSegment is None: 90 | curMoraSegment = segment 91 | elif segment.can_follow(curMoraSegment): 92 | curMoraSegment += segment 93 | else: 94 | moraSegments.append(curMoraSegment) 95 | curMoraSegment = segment 96 | if curMoraSegment: 97 | moraSegments.append(curMoraSegment) 98 | return SegmentationLabel(moraSegments, separatedByMora=True) 99 | 100 | def _textgrid_headers(self): 101 | segmentKind = 'mora' if self.separatedByMora else 'phoneme' 102 | return ['File type = "ooTextFile"', 103 | 'Object class = "TextGrid"', 104 | ' ', 105 | 'xmin = 0 ', 106 | f'xmax = {self.segments[-1].tEnd} ', 107 | 'tiers? ', 108 | 'size = 1 ', 109 | 'item []: ', 110 | ' item [1]: ', 111 | ' class = "IntervalTier" ', 112 | f' name = "{segmentKind}" ', 113 | ' xmin = 0 ', 114 | f' xmax = {self.segments[-1].tEnd} ', 115 | f' intervals: size = {len(self.segments)} '] 116 | 117 | def to_textgrid(self, textgridFileName): 118 | """ 119 | save to .TextGrid file, which is available for Praat 120 | """ 121 | try: 122 | if not self.segments: 123 | raise EmptyLabelException(f'warning: no label data found in ' 124 | f'{textgridFileName}') 125 | except EmptyLabelException as e: 126 | print(e) 127 | return 128 | 129 | textgridLines = self._textgrid_headers() 130 | for i, segment in enumerate(self.segments): 131 | textgridLines.extend(segment.to_textgrid_lines(i + 1)) 132 | with open(textgridFileName, 'w') as f: 133 | f.write('\n'.join(textgridLines)) 134 | 135 | 136 | if __name__ == '__main__': 137 | args = sys.argv 138 | if len(args) >= 2: 139 | mainDirectory = args[1] 140 | else: 141 | mainDirectory = os.curdir 142 | 143 | answer = None 144 | while not answer in ['y', 'Y', 'n', 'N']: 145 | answer = input('change segmentation unit to mora?'\ 146 | ' (default:phoneme) y/n:') 147 | choosesMora = answer in ['y', 'Y'] 148 | 149 | for dirPath, dirNames, fileNames in os.walk(mainDirectory): 150 | labFileNames = [n for n in fileNames if re.search(r'\.lab$', n)] 151 | 152 | for labFileName in labFileNames: 153 | label = read_lab(os.path.join(dirPath, labFileName)) 154 | if choosesMora: 155 | label = label.by_moras() 156 | textgridFileName = re.sub(r"\.lab$", ".TextGrid", labFileName) 157 | label.to_textgrid(os.path.join(dirPath, textgridFileName)) 158 | -------------------------------------------------------------------------------- /convertwav_to_16khz.praat: -------------------------------------------------------------------------------- 1 | ##change sampling rate of .wav to 16kHz 2 | 3 | clearinfo 4 | 5 | form directory_path 6 | text directory 7 | endform 8 | 9 | @convertAllWav: directory$, 16000, 1 10 | 11 | 12 | ##call convertWav recersively 13 | 14 | procedure convertAllWav: .directory$, .resampleRate, .nestDepth 15 | .directory$['.nestDepth'] = .directory$ 16 | 17 | # convert wav files in current directory 18 | @convertWav: .directory$['.nestDepth'], .resampleRate 19 | 20 | #search inner directories 21 | .listname$ = "dir" + .directory$['.nestDepth'] 22 | .strings['.nestDepth'] = Create Strings as directory list: .listname$, .directory$['.nestDepth'] 23 | .numDirectories['.nestDepth'] = Get number of strings 24 | 25 | for .i['.nestDepth'] to .numDirectories['.nestDepth'] 26 | selectObject: .strings['.nestDepth'] 27 | .innerDirectoryName$ = Get string: .i['.nestDepth'] 28 | .curDirectory$ = .directory$['.nestDepth'] + "/" + .innerDirectoryName$ 29 | @convertAllWav: .curDirectory$, .resampleRate, .nestDepth + 1 30 | endfor 31 | .nestDepth -= 1 32 | endproc 33 | 34 | 35 | ##resample .wav files in .directory$ by .resampleRate 36 | 37 | procedure convertWav: .directory$, .resampleRate 38 | #make list object, which contains .wav file names 39 | .listName$ = "wav" + .directory$ 40 | .strings = Create Strings as file list: .listName$, .directory$ + "/*.wav" 41 | .numFiles = Get number of strings 42 | 43 | for .i to .numFiles 44 | selectObject: .strings 45 | .fileName$ = Get string: .i 46 | Read from file: .directory$ + "/" + .fileName$ 47 | 48 | #convert to monoral 49 | do("Convert to mono") 50 | 51 | #resample wav file 52 | Resample: .resampleRate, 50 53 | appendInfoLine: .directory$ + "/" + .fileName$ 54 | nowarn Save as WAV file: .directory$ + "/" + .fileName$ 55 | endfor 56 | endproc 57 | --------------------------------------------------------------------------------