├── .gitignore
├── parse.sh
├── data
    └── test.textgrid
├── result
    └── test.json
├── parse_textgrid.py
└── readme.md


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | 


--------------------------------------------------------------------------------
/parse.sh:
--------------------------------------------------------------------------------
1 | python parse_textgrid.py --input ./data/test.textgrid --output ./result/test.json
2 | 


--------------------------------------------------------------------------------
/data/test.textgrid:
--------------------------------------------------------------------------------
 1 | File type = "ooTextFile"
 2 | Object class = "TextGrid"
 3 | 
 4 | xmin = 0 
 5 | xmax = 2045.144149659864
 6 | tiers? <exists> 
 7 | size = 2 
 8 | item []: 
 9 |     item [1]:
10 |         class = "IntervalTier" 
11 |         name = "utterances" 
12 |         xmin = 0 
13 |         xmax = 2045.144149659864 
14 |         intervals: size = 5 
15 |         intervals [1]:
16 |             xmin = 0 
17 |             xmax = 2041.4217474125382 
18 |             text = "" 
19 |         intervals [2]:
20 |             xmin = 2041.4217474125382 
21 |             xmax = 2041.968276643991 
22 |             text = "this" 
23 |         intervals [3]:
24 |             xmin = 2041.968276643991 
25 |             xmax = 2042.5281632653062 
26 |             text = "is" 
27 |         intervals [4]:
28 |             xmin = 2042.5281632653062 
29 |             xmax = 2044.0487352585324 
30 |             text = "a" 
31 |         intervals [5]:
32 |             xmin = 2044.0487352585324 
33 |             xmax = 2045.144149659864 
34 |             text = "demo" 
35 |     item [2]:
36 |         class = "IntervalTier" 
37 |         name = "phones" 
38 |         xmin = 0 
39 |         xmax = 2045.144149659864
40 |         intervals: size = 12
41 |         intervals [1]:
42 |             xmin = 0 
43 |             xmax = 2041.4217474125382 
44 |             text = "刚才我们看到短片当中介绍的年轻人啊" 
45 |         intervals [2]:
46 |             xmin = 2041.4217474125382 
47 |             xmax = 2041.5438290324326 
48 |             text = "就是我们今天的嘉宾"
49 |         intervals [3]:
50 |             xmin = 2041.5438290324326
51 |             xmax = 2041.7321032910372
52 |             text = "嗯很多人已经介绍过他了"
53 |         intervals [4]:
54 |             xmin = 2041.7321032910372            
55 |             xmax = 2041.968276643991 
56 |             text = "什么天才呀" 
57 |         intervals [5]:
58 |             xmin = 2041.968276643991 
59 |             xmax = 2042.232189031843
60 |             text = "曾经是神童之类这样的词语"
61 |         intervals [6]:
62 |             xmin = 2042.232189031843
63 |             xmax = 2042.5281632653062 
64 |             text = "嗯他的确是一个非常有才华的青年演奏家" 
65 |         intervals [7]:
66 |             xmin = 2042.5281632653062 
67 |             xmax = 2044.0487352585324 
68 |             text = "如今已经是个青年人了" 
69 |         intervals [8]:
70 |             xmin = 2044.0487352585324 
71 |             xmax = 2044.2487352585324
72 |             text = "曾经是个很可爱的小神童"
73 |         intervals [9]:
74 |             xmin = 2044.2487352585324
75 |             xmax = 2044.3102321849011
76 |             text = "这样我们掌声有请李传韵"
77 |         intervals [10]:
78 |             xmin = 2044.3102321849011
79 |             xmax = 2044.5748932104329
80 |             text = "主持人，主持人好"
81 |         intervals [11]:
82 |             xmin = 2044.5748932104329
83 |             xmax = 2044.8329108578437
84 |             text = "我必须要先说一些，李传韵特别特别紧张。"
85 |         intervals [12]:
86 |             xmin = 2044.8329108578437
87 |             xmax = 2045.144149659864 
88 |             text = "嗯" 
89 | 


--------------------------------------------------------------------------------
/result/test.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "file_type": "ooTextFile", 
  3 |   "xmin": "0", 
  4 |   "xmax": "2045.144149659864", 
  5 |   "size": 2, 
  6 |   "tiers": [
  7 |     {
  8 |       "idx": "1", 
  9 |       "class": "IntervalTier", 
 10 |       "name": "utterances", 
 11 |       "xmin": "0", 
 12 |       "xmax": "2045.144149659864", 
 13 |       "size": "5", 
 14 |       "items": [
 15 |         {
 16 |           "idx": "1", 
 17 |           "xmin": "0", 
 18 |           "xmax": "2041.4217474125382", 
 19 |           "text": ""
 20 |         }, 
 21 |         {
 22 |           "idx": "2", 
 23 |           "xmin": "2041.4217474125382", 
 24 |           "xmax": "2041.968276643991", 
 25 |           "text": "this"
 26 |         }, 
 27 |         {
 28 |           "idx": "3", 
 29 |           "xmin": "2041.968276643991", 
 30 |           "xmax": "2042.5281632653062", 
 31 |           "text": "is"
 32 |         }, 
 33 |         {
 34 |           "idx": "4", 
 35 |           "xmin": "2042.5281632653062", 
 36 |           "xmax": "2044.0487352585324", 
 37 |           "text": "a"
 38 |         }, 
 39 |         {
 40 |           "idx": "5", 
 41 |           "xmin": "2044.0487352585324", 
 42 |           "xmax": "2045.144149659864", 
 43 |           "text": "demo"
 44 |         }
 45 |       ]
 46 |     }, 
 47 |     {
 48 |       "idx": "2", 
 49 |       "class": "IntervalTier", 
 50 |       "name": "phones", 
 51 |       "xmin": "0", 
 52 |       "xmax": "2045.144149659864", 
 53 |       "size": "12", 
 54 |       "items": [
 55 |         {
 56 |           "idx": "1", 
 57 |           "xmin": "0", 
 58 |           "xmax": "2041.4217474125382", 
 59 |           "text": "刚才我们看到短片当中介绍的年轻人啊"
 60 |         }, 
 61 |         {
 62 |           "idx": "2", 
 63 |           "xmin": "2041.4217474125382", 
 64 |           "xmax": "2041.5438290324326", 
 65 |           "text": "就是我们今天的嘉宾"
 66 |         }, 
 67 |         {
 68 |           "idx": "3", 
 69 |           "xmin": "2041.5438290324326", 
 70 |           "xmax": "2041.7321032910372", 
 71 |           "text": "嗯很多人已经介绍过他了"
 72 |         }, 
 73 |         {
 74 |           "idx": "4", 
 75 |           "xmin": "2041.7321032910372", 
 76 |           "xmax": "2041.968276643991", 
 77 |           "text": "什么天才呀"
 78 |         }, 
 79 |         {
 80 |           "idx": "5", 
 81 |           "xmin": "2041.968276643991", 
 82 |           "xmax": "2042.232189031843", 
 83 |           "text": "曾经是神童之类这样的词语"
 84 |         }, 
 85 |         {
 86 |           "idx": "6", 
 87 |           "xmin": "2042.232189031843", 
 88 |           "xmax": "2042.5281632653062", 
 89 |           "text": "嗯他的确是一个非常有才华的青年演奏家"
 90 |         }, 
 91 |         {
 92 |           "idx": "7", 
 93 |           "xmin": "2042.5281632653062", 
 94 |           "xmax": "2044.0487352585324", 
 95 |           "text": "如今已经是个青年人了"
 96 |         }, 
 97 |         {
 98 |           "idx": "8", 
 99 |           "xmin": "2044.0487352585324", 
100 |           "xmax": "2044.2487352585324", 
101 |           "text": "曾经是个很可爱的小神童"
102 |         }, 
103 |         {
104 |           "idx": "9", 
105 |           "xmin": "2044.2487352585324", 
106 |           "xmax": "2044.3102321849011", 
107 |           "text": "这样我们掌声有请李传韵"
108 |         }, 
109 |         {
110 |           "idx": "10", 
111 |           "xmin": "2044.3102321849011", 
112 |           "xmax": "2044.5748932104329", 
113 |           "text": "主持人，主持人好"
114 |         }, 
115 |         {
116 |           "idx": "11", 
117 |           "xmin": "2044.5748932104329", 
118 |           "xmax": "2044.8329108578437", 
119 |           "text": "我必须要先说一些，李传韵特别特别紧张。"
120 |         }, 
121 |         {
122 |           "idx": "12", 
123 |           "xmin": "2044.8329108578437", 
124 |           "xmax": "2045.144149659864", 
125 |           "text": "嗯"
126 |         }
127 |       ]
128 |     }
129 |   ]
130 | }


--------------------------------------------------------------------------------
/parse_textgrid.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import argparse
  3 | import json
  4 | from collections import OrderedDict
  5 | 
  6 | def parse_args():
  7 |     parser = argparse.ArgumentParser()
  8 |     parser.add_argument("--input", help="input path of textgrid")
  9 |     parser.add_argument("--output", help="output path of json")
 10 |     return parser.parse_args()
 11 | 
 12 | def remove_empty_lines(text):
 13 |     """remove empty lines"""
 14 |     assert(len(text)>0)
 15 |     assert(isinstance(text, list))
 16 |     text = [t.strip() for t in text]
 17 |     if "" in text:
 18 |         text.remove("")
 19 |     return text
 20 | 
 21 | class TextGrid(object):
 22 |     def __init__(self, text):
 23 |         self.text = text
 24 |         self.line_count = 0
 25 |         self._get_type()
 26 |         self._get_time_intval()
 27 |         self._get_size()
 28 |         self.tier_list = []
 29 |         self._get_item_list()
 30 | 
 31 |     def _extract_pattern(self, pattern, inc):
 32 |         """
 33 | 
 34 |         Parameters
 35 |         ----------
 36 |         pattern : regex to extract pattern
 37 |         inc : increment of line count after extraction
 38 | 
 39 |         Returns
 40 |         -------
 41 |         group : extracted info
 42 | 
 43 |         """
 44 |         try:
 45 |             group = re.match(pattern, self.text[self.line_count].decode("utf-8")).group(1)
 46 |             self.line_count += inc
 47 |         except AttributeError:
 48 |             raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
 49 |         return group
 50 | 
 51 |     def _get_type(self):
 52 |         self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
 53 | 
 54 |     def _get_time_intval(self):
 55 |         self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
 56 |         self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
 57 | 
 58 |     def _get_size(self):
 59 |         self.size = int(self._extract_pattern(r"size = (.*)", 2))
 60 | 
 61 |     def _get_item_list(self):
 62 |         """Only supports IntervalTier currently"""
 63 |         for itemIdx in range(1, self.size + 1):
 64 |             tier = OrderedDict()
 65 |             item_list = []
 66 |             tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
 67 |             tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
 68 |             if tier_class != "IntervalTier":
 69 |                 raise NotImplementedError("Only IntervalTier class is supported currently")
 70 |             tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
 71 |             tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
 72 |             tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
 73 |             tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
 74 |             for i in range(int(tier_size)):
 75 |                 item = OrderedDict()
 76 |                 item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
 77 |                 item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
 78 |                 item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
 79 |                 item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
 80 |                 item_list.append(item)
 81 |             tier["idx"] = tier_idx
 82 |             tier["class"] = tier_class
 83 |             tier["name"] = tier_name
 84 |             tier["xmin"] = tier_xmin
 85 |             tier["xmax"] = tier_xmax
 86 |             tier["size"] = tier_size
 87 |             tier["items"] = item_list
 88 |             self.tier_list.append(tier)
 89 | 
 90 | 
 91 |     def toJson(self):
 92 |         _json = OrderedDict()
 93 |         _json["file_type"] = self.file_type
 94 |         _json["xmin"] = self.xmin
 95 |         _json["xmax"] = self.xmax
 96 |         _json["size"] = self.size
 97 |         _json["tiers"] = self.tier_list
 98 |         return json.dumps(_json, ensure_ascii=False, indent=2).encode("utf-8")
 99 | 
if __name__=="__main__":
    # Script entry point: read the textgrid named by --input, parse it and
    # write its JSON rendering to the path named by --output.
    cli_args = parse_args()
    src_path, dst_path = cli_args.input, cli_args.output
    # Binary mode: the lines are handed to the parser as raw bytes.
    with open(src_path, "rb") as src:
        raw_lines = src.readlines()
    if not raw_lines:
        raise IOError("input textgrid file can't be empty")
    grid = TextGrid(remove_empty_lines(raw_lines))
    # toJson() returns encoded bytes, hence the binary write.
    with open(dst_path, "wb") as dst:
        dst.write(grid.toJson())
113 | 
114 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # textgrid-parser
  2 | 
This repository parses `.textgrid` files and converts them into `.json` files. To run the code, set the input and output file paths in `parse.sh`:
`python parse_textgrid.py --input ./data/test.textgrid --output ./result/test.json`
  6 | 
  7 | Sample input:
  8 | ```
  9 | File type = "ooTextFile"
 10 | Object class = "TextGrid"
 11 | 
 12 | xmin = 0 
 13 | xmax = 2045.144149659864
 14 | tiers? <exists> 
 15 | size = 2 
 16 | item []: 
 17 |     item [1]:
 18 |         class = "IntervalTier" 
 19 |         name = "utterances" 
 20 |         xmin = 0 
 21 |         xmax = 2045.144149659864 
 22 |         intervals: size = 5 
 23 |         intervals [1]:
 24 |             xmin = 0 
 25 |             xmax = 2041.4217474125382 
 26 |             text = "" 
 27 |         intervals [2]:
 28 |             xmin = 2041.4217474125382 
 29 |             xmax = 2041.968276643991 
 30 |             text = "this" 
 31 |         intervals [3]:
 32 |             xmin = 2041.968276643991 
 33 |             xmax = 2042.5281632653062 
 34 |             text = "is" 
 35 |         intervals [4]:
 36 |             xmin = 2042.5281632653062 
 37 |             xmax = 2044.0487352585324 
 38 |             text = "a" 
 39 |         intervals [5]:
 40 |             xmin = 2044.0487352585324 
 41 |             xmax = 2045.144149659864 
 42 |             text = "demo" 
 43 |     item [2]:
 44 |         class = "IntervalTier" 
 45 |         name = "phones" 
 46 |         xmin = 0 
 47 |         xmax = 2045.144149659864
 48 |         intervals: size = 12
 49 |         intervals [1]:
 50 |             xmin = 0 
 51 |             xmax = 2041.4217474125382 
 52 |             text = "刚才我们看到短片当中介绍的年轻人啊" 
 53 |         intervals [2]:
 54 |             xmin = 2041.4217474125382 
 55 |             xmax = 2041.5438290324326 
 56 |             text = "就是我们今天的嘉宾"
 57 |         intervals [3]:
 58 |             xmin = 2041.5438290324326
 59 |             xmax = 2041.7321032910372
 60 |             text = "嗯很多人已经介绍过他了"
 61 |         intervals [4]:
 62 |             xmin = 2041.7321032910372            
 63 |             xmax = 2041.968276643991 
 64 |             text = "什么天才呀" 
 65 |         intervals [5]:
 66 |             xmin = 2041.968276643991 
 67 |             xmax = 2042.232189031843
 68 |             text = "曾经是神童之类这样的词语"
 69 |         intervals [6]:
 70 |             xmin = 2042.232189031843
 71 |             xmax = 2042.5281632653062 
 72 |             text = "嗯他的确是一个非常有才华的青年演奏家" 
 73 |         intervals [7]:
 74 |             xmin = 2042.5281632653062 
 75 |             xmax = 2044.0487352585324 
 76 |             text = "如今已经是个青年人了" 
 77 |         intervals [8]:
 78 |             xmin = 2044.0487352585324 
 79 |             xmax = 2044.2487352585324
 80 |             text = "曾经是个很可爱的小神童"
 81 |         intervals [9]:
 82 |             xmin = 2044.2487352585324
 83 |             xmax = 2044.3102321849011
 84 |             text = "这样我们掌声有请李传韵"
 85 |         intervals [10]:
 86 |             xmin = 2044.3102321849011
 87 |             xmax = 2044.5748932104329
 88 |             text = "主持人，主持人好"
 89 |         intervals [11]:
 90 |             xmin = 2044.5748932104329
 91 |             xmax = 2044.8329108578437
 92 |             text = "我必须要先说一些，李传韵特别特别紧张。"
 93 |         intervals [12]:
 94 |             xmin = 2044.8329108578437
 95 |             xmax = 2045.144149659864 
 96 |             text = "嗯" 
 97 | ```
 98 | 
 99 | Sample output:
100 | ```
101 | {
102 |   "file_type": "ooTextFile", 
103 |   "xmin": "0", 
104 |   "xmax": "2045.144149659864", 
105 |   "size": 2, 
106 |   "tiers": [
107 |     {
108 |       "idx": "1", 
109 |       "class": "IntervalTier", 
110 |       "name": "utterances", 
111 |       "xmin": "0", 
112 |       "xmax": "2045.144149659864", 
113 |       "size": "5", 
114 |       "items": [
115 |         {
116 |           "idx": "1", 
117 |           "xmin": "0", 
118 |           "xmax": "2041.4217474125382", 
119 |           "text": ""
120 |         }, 
121 |         {
122 |           "idx": "2", 
123 |           "xmin": "2041.4217474125382", 
124 |           "xmax": "2041.968276643991", 
125 |           "text": "this"
126 |         }, 
127 |         {
128 |           "idx": "3", 
129 |           "xmin": "2041.968276643991", 
130 |           "xmax": "2042.5281632653062", 
131 |           "text": "is"
132 |         }, 
133 |         {
134 |           "idx": "4", 
135 |           "xmin": "2042.5281632653062", 
136 |           "xmax": "2044.0487352585324", 
137 |           "text": "a"
138 |         }, 
139 |         {
140 |           "idx": "5", 
141 |           "xmin": "2044.0487352585324", 
142 |           "xmax": "2045.144149659864", 
143 |           "text": "demo"
144 |         }
145 |       ]
146 |     }, 
147 |     {
148 |       "idx": "2", 
149 |       "class": "IntervalTier", 
150 |       "name": "phones", 
151 |       "xmin": "0", 
152 |       "xmax": "2045.144149659864", 
153 |       "size": "12", 
154 |       "items": [
155 |         {
156 |           "idx": "1", 
157 |           "xmin": "0", 
158 |           "xmax": "2041.4217474125382", 
159 |           "text": "刚才我们看到短片当中介绍的年轻人啊"
160 |         }, 
161 |         {
162 |           "idx": "2", 
163 |           "xmin": "2041.4217474125382", 
164 |           "xmax": "2041.5438290324326", 
165 |           "text": "就是我们今天的嘉宾"
166 |         }, 
167 |         {
168 |           "idx": "3", 
169 |           "xmin": "2041.5438290324326", 
170 |           "xmax": "2041.7321032910372", 
171 |           "text": "嗯很多人已经介绍过他了"
172 |         }, 
173 |         {
174 |           "idx": "4", 
175 |           "xmin": "2041.7321032910372", 
176 |           "xmax": "2041.968276643991", 
177 |           "text": "什么天才呀"
178 |         }, 
179 |         {
180 |           "idx": "5", 
181 |           "xmin": "2041.968276643991", 
182 |           "xmax": "2042.232189031843", 
183 |           "text": "曾经是神童之类这样的词语"
184 |         }, 
185 |         {
186 |           "idx": "6", 
187 |           "xmin": "2042.232189031843", 
188 |           "xmax": "2042.5281632653062", 
189 |           "text": "嗯他的确是一个非常有才华的青年演奏家"
190 |         }, 
191 |         {
192 |           "idx": "7", 
193 |           "xmin": "2042.5281632653062", 
194 |           "xmax": "2044.0487352585324", 
195 |           "text": "如今已经是个青年人了"
196 |         }, 
197 |         {
198 |           "idx": "8", 
199 |           "xmin": "2044.0487352585324", 
200 |           "xmax": "2044.2487352585324", 
201 |           "text": "曾经是个很可爱的小神童"
202 |         }, 
203 |         {
204 |           "idx": "9", 
205 |           "xmin": "2044.2487352585324", 
206 |           "xmax": "2044.3102321849011", 
207 |           "text": "这样我们掌声有请李传韵"
208 |         }, 
209 |         {
210 |           "idx": "10", 
211 |           "xmin": "2044.3102321849011", 
212 |           "xmax": "2044.5748932104329", 
213 |           "text": "主持人，主持人好"
214 |         }, 
215 |         {
216 |           "idx": "11", 
217 |           "xmin": "2044.5748932104329", 
218 |           "xmax": "2044.8329108578437", 
219 |           "text": "我必须要先说一些，李传韵特别特别紧张。"
220 |         }, 
221 |         {
222 |           "idx": "12", 
223 |           "xmin": "2044.8329108578437", 
224 |           "xmax": "2045.144149659864", 
225 |           "text": "嗯"
226 |         }
227 |       ]
228 |     }
229 |   ]
230 | }
231 | ```


--------------------------------------------------------------------------------