├── .gitignore ├── parse.sh ├── data └── test.textgrid ├── result └── test.json ├── parse_textgrid.py └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | -------------------------------------------------------------------------------- /parse.sh: -------------------------------------------------------------------------------- 1 | python parse_textgrid.py --input ./data/test.textgrid --output ./result/test.json 2 | -------------------------------------------------------------------------------- /data/test.textgrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 2045.144149659864 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "utterances" 12 | xmin = 0 13 | xmax = 2045.144149659864 14 | intervals: size = 5 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 2041.4217474125382 18 | text = "" 19 | intervals [2]: 20 | xmin = 2041.4217474125382 21 | xmax = 2041.968276643991 22 | text = "this" 23 | intervals [3]: 24 | xmin = 2041.968276643991 25 | xmax = 2042.5281632653062 26 | text = "is" 27 | intervals [4]: 28 | xmin = 2042.5281632653062 29 | xmax = 2044.0487352585324 30 | text = "a" 31 | intervals [5]: 32 | xmin = 2044.0487352585324 33 | xmax = 2045.144149659864 34 | text = "demo" 35 | item [2]: 36 | class = "IntervalTier" 37 | name = "phones" 38 | xmin = 0 39 | xmax = 2045.144149659864 40 | intervals: size = 12 41 | intervals [1]: 42 | xmin = 0 43 | xmax = 2041.4217474125382 44 | text = "刚才我们看到短片当中介绍的年轻人啊" 45 | intervals [2]: 46 | xmin = 2041.4217474125382 47 | xmax = 2041.5438290324326 48 | text = "就是我们今天的嘉宾" 49 | intervals [3]: 50 | xmin = 2041.5438290324326 51 | xmax = 2041.7321032910372 52 | text = "嗯很多人已经介绍过他了" 53 | intervals [4]: 54 | xmin = 2041.7321032910372 55 | xmax = 2041.968276643991 56 | text = "什么天才呀" 57 | intervals [5]: 58 | xmin 
= 2041.968276643991 59 | xmax = 2042.232189031843 60 | text = "曾经是神童之类这样的词语" 61 | intervals [6]: 62 | xmin = 2042.232189031843 63 | xmax = 2042.5281632653062 64 | text = "嗯他的确是一个非常有才华的青年演奏家" 65 | intervals [7]: 66 | xmin = 2042.5281632653062 67 | xmax = 2044.0487352585324 68 | text = "如今已经是个青年人了" 69 | intervals [8]: 70 | xmin = 2044.0487352585324 71 | xmax = 2044.2487352585324 72 | text = "曾经是个很可爱的小神童" 73 | intervals [9]: 74 | xmin = 2044.2487352585324 75 | xmax = 2044.3102321849011 76 | text = "这样我们掌声有请李传韵" 77 | intervals [10]: 78 | xmin = 2044.3102321849011 79 | xmax = 2044.5748932104329 80 | text = "主持人,主持人好" 81 | intervals [11]: 82 | xmin = 2044.5748932104329 83 | xmax = 2044.8329108578437 84 | text = "我必须要先说一些,李传韵特别特别紧张。" 85 | intervals [12]: 86 | xmin = 2044.8329108578437 87 | xmax = 2045.144149659864 88 | text = "嗯" 89 | -------------------------------------------------------------------------------- /result/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "file_type": "ooTextFile", 3 | "xmin": "0", 4 | "xmax": "2045.144149659864", 5 | "size": 2, 6 | "tiers": [ 7 | { 8 | "idx": "1", 9 | "class": "IntervalTier", 10 | "name": "utterances", 11 | "xmin": "0", 12 | "xmax": "2045.144149659864", 13 | "size": "5", 14 | "items": [ 15 | { 16 | "idx": "1", 17 | "xmin": "0", 18 | "xmax": "2041.4217474125382", 19 | "text": "" 20 | }, 21 | { 22 | "idx": "2", 23 | "xmin": "2041.4217474125382", 24 | "xmax": "2041.968276643991", 25 | "text": "this" 26 | }, 27 | { 28 | "idx": "3", 29 | "xmin": "2041.968276643991", 30 | "xmax": "2042.5281632653062", 31 | "text": "is" 32 | }, 33 | { 34 | "idx": "4", 35 | "xmin": "2042.5281632653062", 36 | "xmax": "2044.0487352585324", 37 | "text": "a" 38 | }, 39 | { 40 | "idx": "5", 41 | "xmin": "2044.0487352585324", 42 | "xmax": "2045.144149659864", 43 | "text": "demo" 44 | } 45 | ] 46 | }, 47 | { 48 | "idx": "2", 49 | "class": "IntervalTier", 50 | "name": "phones", 51 | "xmin": "0", 52 | 
"""textgrid-parser: parse and convert ``.textgrid`` files into ``.json`` files.

Usage (see ``parse.sh``; adjust the input and output paths as needed)::

    python parse_textgrid.py --input ./data/test.textgrid --output ./result/test.json

Only tiers whose class is ``IntervalTier`` are supported.
"""
import argparse
import json
import re
from collections import OrderedDict


def parse_args():
    """Parse the ``--input`` and ``--output`` command-line options."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="input path of textgrid")
    parser.add_argument("--output", help="output path of json")
    return parser.parse_args()


def remove_empty_lines(text):
    """Strip every line and drop those that are empty after stripping.

    Parameters
    ----------
    text : non-empty list of lines (``bytes`` or ``str``)

    Returns
    -------
    list : stripped, non-empty lines in their original order and type

    """
    assert len(text) > 0
    assert isinstance(text, list)
    # Filter on truthiness: this drops b"" as well as "", and removes *every*
    # blank line.  list.remove("") only removed the first one and never
    # matched bytes lines read from a file opened in binary mode.
    return [line for line in (raw.strip() for raw in text) if line]


class TextGrid(object):
    """Sequential parser for the lines of a ``.textgrid`` file.

    The constructor consumes ``text`` (a list of stripped, non-empty lines,
    ``bytes`` or ``str``) from top to bottom and fills ``file_type``,
    ``xmin``, ``xmax``, ``size`` and ``tier_list``.  Numeric fields other
    than the top-level ``size`` are kept as the strings found in the file.
    """

    def __init__(self, text):
        self.text = text
        self.line_count = 0  # index of the next line of self.text to parse
        self.tier_list = []
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """Match the current line against ``pattern`` and advance the cursor.

        Parameters
        ----------
        pattern : regex to extract pattern
        inc : increment of line count after extraction

        Returns
        -------
        group : extracted info (first capture group of ``pattern``)

        Raises
        ------
        ValueError : the current line does not match, or the file ended early

        """
        if self.line_count >= len(self.text):
            # A truncated file is a format error too, not a bare IndexError.
            raise ValueError(
                "File format error at line %d:%s"
                % (self.line_count, "<unexpected end of file>")
            )
        line = self.text[self.line_count]
        if isinstance(line, bytes):
            # Lines read in binary mode are decoded here, once per line.
            line = line.decode("utf-8")
        match = re.match(pattern, line)
        if match is None:
            raise ValueError(
                "File format error at line %d:%s" % (self.line_count, line)
            )
        self.line_count += inc
        return match.group(1)

    def _get_type(self):
        # inc=2 also skips the line that follows the file type
        # (the 'Object class = ...' line in the sample file).
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        # inc=2 also skips the 'tiers?' line that follows xmax.
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        # inc=2 also skips the 'item []:' line that follows the size.
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _parse_interval(self):
        """Parse one ``intervals [n]`` entry into an ordered dict."""
        item = OrderedDict()
        item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
        item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
        item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
        item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
        return item

    def _parse_tier(self):
        """Parse one ``item [n]`` tier, including all of its intervals."""
        tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
        tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
        if tier_class != "IntervalTier":
            raise NotImplementedError("Only IntervalTier class is supported currently")
        tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
        tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
        tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
        tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
        item_list = [self._parse_interval() for _ in range(int(tier_size))]

        tier = OrderedDict()
        tier["idx"] = tier_idx
        tier["class"] = tier_class
        tier["name"] = tier_name
        tier["xmin"] = tier_xmin
        tier["xmax"] = tier_xmax
        tier["size"] = tier_size  # kept as the string found in the file
        tier["items"] = item_list
        return tier

    def _get_item_list(self):
        """Only supports IntervalTier currently"""
        for _ in range(self.size):
            self.tier_list.append(self._parse_tier())

    def toJson(self):
        """Return the parsed content as UTF-8 encoded, indented JSON bytes."""
        _json = OrderedDict()
        _json["file_type"] = self.file_type
        _json["xmin"] = self.xmin
        _json["xmax"] = self.xmax
        _json["size"] = self.size
        _json["tiers"] = self.tier_list
        return json.dumps(_json, ensure_ascii=False, indent=2).encode("utf-8")


def main():
    """Read the ``--input`` textgrid file and write it as JSON to ``--output``."""
    args = parse_args()
    with open(args.input, "rb") as f:
        text = f.readlines()
    if not text:
        raise IOError("input textgrid file can't be empty")
    textgrid = TextGrid(remove_empty_lines(text))
    with open(args.output, "wb") as f:
        f.write(textgrid.toJson())


if __name__ == "__main__":
    main()
``` 9 | File type = "ooTextFile" 10 | Object class = "TextGrid" 11 | 12 | xmin = 0 13 | xmax = 2045.144149659864 14 | tiers? 15 | size = 2 16 | item []: 17 | item [1]: 18 | class = "IntervalTier" 19 | name = "utterances" 20 | xmin = 0 21 | xmax = 2045.144149659864 22 | intervals: size = 5 23 | intervals [1]: 24 | xmin = 0 25 | xmax = 2041.4217474125382 26 | text = "" 27 | intervals [2]: 28 | xmin = 2041.4217474125382 29 | xmax = 2041.968276643991 30 | text = "this" 31 | intervals [3]: 32 | xmin = 2041.968276643991 33 | xmax = 2042.5281632653062 34 | text = "is" 35 | intervals [4]: 36 | xmin = 2042.5281632653062 37 | xmax = 2044.0487352585324 38 | text = "a" 39 | intervals [5]: 40 | xmin = 2044.0487352585324 41 | xmax = 2045.144149659864 42 | text = "demo" 43 | item [2]: 44 | class = "IntervalTier" 45 | name = "phones" 46 | xmin = 0 47 | xmax = 2045.144149659864 48 | intervals: size = 12 49 | intervals [1]: 50 | xmin = 0 51 | xmax = 2041.4217474125382 52 | text = "刚才我们看到短片当中介绍的年轻人啊" 53 | intervals [2]: 54 | xmin = 2041.4217474125382 55 | xmax = 2041.5438290324326 56 | text = "就是我们今天的嘉宾" 57 | intervals [3]: 58 | xmin = 2041.5438290324326 59 | xmax = 2041.7321032910372 60 | text = "嗯很多人已经介绍过他了" 61 | intervals [4]: 62 | xmin = 2041.7321032910372 63 | xmax = 2041.968276643991 64 | text = "什么天才呀" 65 | intervals [5]: 66 | xmin = 2041.968276643991 67 | xmax = 2042.232189031843 68 | text = "曾经是神童之类这样的词语" 69 | intervals [6]: 70 | xmin = 2042.232189031843 71 | xmax = 2042.5281632653062 72 | text = "嗯他的确是一个非常有才华的青年演奏家" 73 | intervals [7]: 74 | xmin = 2042.5281632653062 75 | xmax = 2044.0487352585324 76 | text = "如今已经是个青年人了" 77 | intervals [8]: 78 | xmin = 2044.0487352585324 79 | xmax = 2044.2487352585324 80 | text = "曾经是个很可爱的小神童" 81 | intervals [9]: 82 | xmin = 2044.2487352585324 83 | xmax = 2044.3102321849011 84 | text = "这样我们掌声有请李传韵" 85 | intervals [10]: 86 | xmin = 2044.3102321849011 87 | xmax = 2044.5748932104329 88 | text = "主持人,主持人好" 89 | intervals [11]: 90 | xmin = 
2044.5748932104329 91 | xmax = 2044.8329108578437 92 | text = "我必须要先说一些,李传韵特别特别紧张。" 93 | intervals [12]: 94 | xmin = 2044.8329108578437 95 | xmax = 2045.144149659864 96 | text = "嗯" 97 | ``` 98 | 99 | Sample output: 100 | ``` 101 | { 102 | "file_type": "ooTextFile", 103 | "xmin": "0", 104 | "xmax": "2045.144149659864", 105 | "size": 2, 106 | "tiers": [ 107 | { 108 | "idx": "1", 109 | "class": "IntervalTier", 110 | "name": "utterances", 111 | "xmin": "0", 112 | "xmax": "2045.144149659864", 113 | "size": "5", 114 | "items": [ 115 | { 116 | "idx": "1", 117 | "xmin": "0", 118 | "xmax": "2041.4217474125382", 119 | "text": "" 120 | }, 121 | { 122 | "idx": "2", 123 | "xmin": "2041.4217474125382", 124 | "xmax": "2041.968276643991", 125 | "text": "this" 126 | }, 127 | { 128 | "idx": "3", 129 | "xmin": "2041.968276643991", 130 | "xmax": "2042.5281632653062", 131 | "text": "is" 132 | }, 133 | { 134 | "idx": "4", 135 | "xmin": "2042.5281632653062", 136 | "xmax": "2044.0487352585324", 137 | "text": "a" 138 | }, 139 | { 140 | "idx": "5", 141 | "xmin": "2044.0487352585324", 142 | "xmax": "2045.144149659864", 143 | "text": "demo" 144 | } 145 | ] 146 | }, 147 | { 148 | "idx": "2", 149 | "class": "IntervalTier", 150 | "name": "phones", 151 | "xmin": "0", 152 | "xmax": "2045.144149659864", 153 | "size": "12", 154 | "items": [ 155 | { 156 | "idx": "1", 157 | "xmin": "0", 158 | "xmax": "2041.4217474125382", 159 | "text": "刚才我们看到短片当中介绍的年轻人啊" 160 | }, 161 | { 162 | "idx": "2", 163 | "xmin": "2041.4217474125382", 164 | "xmax": "2041.5438290324326", 165 | "text": "就是我们今天的嘉宾" 166 | }, 167 | { 168 | "idx": "3", 169 | "xmin": "2041.5438290324326", 170 | "xmax": "2041.7321032910372", 171 | "text": "嗯很多人已经介绍过他了" 172 | }, 173 | { 174 | "idx": "4", 175 | "xmin": "2041.7321032910372", 176 | "xmax": "2041.968276643991", 177 | "text": "什么天才呀" 178 | }, 179 | { 180 | "idx": "5", 181 | "xmin": "2041.968276643991", 182 | "xmax": "2042.232189031843", 183 | "text": "曾经是神童之类这样的词语" 184 | }, 185 | { 186 | 
"idx": "6", 187 | "xmin": "2042.232189031843", 188 | "xmax": "2042.5281632653062", 189 | "text": "嗯他的确是一个非常有才华的青年演奏家" 190 | }, 191 | { 192 | "idx": "7", 193 | "xmin": "2042.5281632653062", 194 | "xmax": "2044.0487352585324", 195 | "text": "如今已经是个青年人了" 196 | }, 197 | { 198 | "idx": "8", 199 | "xmin": "2044.0487352585324", 200 | "xmax": "2044.2487352585324", 201 | "text": "曾经是个很可爱的小神童" 202 | }, 203 | { 204 | "idx": "9", 205 | "xmin": "2044.2487352585324", 206 | "xmax": "2044.3102321849011", 207 | "text": "这样我们掌声有请李传韵" 208 | }, 209 | { 210 | "idx": "10", 211 | "xmin": "2044.3102321849011", 212 | "xmax": "2044.5748932104329", 213 | "text": "主持人,主持人好" 214 | }, 215 | { 216 | "idx": "11", 217 | "xmin": "2044.5748932104329", 218 | "xmax": "2044.8329108578437", 219 | "text": "我必须要先说一些,李传韵特别特别紧张。" 220 | }, 221 | { 222 | "idx": "12", 223 | "xmin": "2044.8329108578437", 224 | "xmax": "2045.144149659864", 225 | "text": "嗯" 226 | } 227 | ] 228 | } 229 | ] 230 | } 231 | ``` --------------------------------------------------------------------------------