sil\n')
125 |
126 | for l in lexicon:
127 | f.write(l)
128 | f.write('\n')
129 |
130 |
if __name__ == '__main__':

    # 'train': organize the lexicon with repetition,
    # any other value ('test'): organize the lexicon without repetition.
    train_test = 'test'

    lexicon = []

    if train_test == 'train':

        # Accumulate the (special tier -> phoneme tier) lexicon over every recording.
        for rec in recordings_train + recordings_test:
            (data_path, sub_folder, textgrid_folder,
             wav_folder, filename, line_tier, longsyllable_tier, syllable_tier,
             phoneme_tier, special_tier, special_class_tier, roletype) = parse_recordings(rec)

            textgrid_dir = os.path.join(path_root, data_path, textgrid_folder, sub_folder)

            lexicon = collectLexicon(path_textgrid=textgrid_dir,
                                     recording=filename,
                                     tier0=special_tier,
                                     tier1=phoneme_tier,
                                     lexicon=lexicon)

        # Drop duplicated entries (the resulting order is not defined).
        lexicon = list(set(lexicon))

        lexicon_organized, dict_lexicon_organized = organizeRepetition(lexicon, repetition=True)

        writeLexicon(path_lang, lexicon_organized, repetition=True)

        with open(os.path.join(path_lang, "dict_lexicon_repetition.json"), "w") as write_file:
            json.dump(dict_lexicon_organized, write_file)
    else:
        lexicon_special = []
        lexicon_syllable_special = []

        # Three lexicons are accumulated in parallel over every recording:
        # syllable tier, special tier, and the combined syllable/special one.
        for rec in recordings_train + recordings_test:
            (data_path, sub_folder, textgrid_folder,
             wav_folder, filename, line_tier, longsyllable_tier, syllable_tier,
             phoneme_tier, special_tier, special_class_tier, roletype) = parse_recordings(rec)

            textgrid_dir = os.path.join(path_root, data_path, textgrid_folder, sub_folder)

            lexicon = collectLexicon(path_textgrid=textgrid_dir,
                                     recording=filename,
                                     tier0=syllable_tier,
                                     tier1=phoneme_tier,
                                     lexicon=lexicon)

            lexicon_special = collectLexicon(path_textgrid=textgrid_dir,
                                             recording=filename,
                                             tier0=special_tier,
                                             tier1=phoneme_tier,
                                             lexicon=lexicon_special)

            lexicon_syllable_special = collect_lexicon_syllable_special(path_textgrid=textgrid_dir,
                                                                        recording=filename,
                                                                        syllable_tier=syllable_tier,
                                                                        special_tier=special_tier,
                                                                        phoneme_tier=phoneme_tier,
                                                                        lexicon=lexicon_syllable_special)

        # Drop duplicated entries from each lexicon.
        lexicon = list(set(lexicon))
        lexicon_special = list(set(lexicon_special))
        lexicon_syllable_special = list(set(lexicon_syllable_special))

        # Build one unit per special pronunciation: the pronunciation string
        # followed by the first token of every syllable/special entry whose
        # remaining tokens spell exactly that pronunciation.
        lexicon_remove_rep = [
            [pron_special] + [word_entry.split(' ')[0]
                              for word_entry in lexicon_syllable_special
                              if pron_special == ' '.join(word_entry.split(' ')[1:])]
            for pron_special in lexicon_special
        ]

        lexicon_organized, dict_lexicon_organized = organizeRepetition(lexicon, repetition=False)

        # dict_lexicon_organized_syllable_special, {SPECIAL: [[phn0 phn1 phn2], [SYL0 SYL1]]}
        lexicon_organized_syllable_special, dict_lexicon_organized_syllable_special = \
            organize_repetition_syllable_special(lexicon_remove_rep, repetition=True)

        with open(os.path.join(path_lang, "dict_lexicon_repetition_syllable_special.json"), "w") as write_file:
            json.dump(dict_lexicon_organized_syllable_special, write_file)
--------------------------------------------------------------------------------
/kaldi_alignment/srcPy/parse_decoded_pronunciation.py:
--------------------------------------------------------------------------------
1 | """
2 | write the decoded text for test set
3 | """
4 | import json
5 | from kaldi_alignment.srcPy.filePath import *
6 |
7 |
def open_decoded_pronunciation(filename):
    """
    Read a decoded-pronunciation text file and return its rows.

    Every tab character is removed from each row; the trailing newline of a
    row is kept.

    :param filename: path of the text file to read
    :return: list holding one cleaned string per line of the file
    """
    with open(filename) as file:
        # NOTE(review): once the first replace has removed every tab, the
        # second replace ('\t\n') can never match. The __main__ code of this
        # module later splits each row on '\t', which presumably expects the
        # tabs to survive -- confirm the intended clean-up before changing it.
        return [row.replace('\t', '').replace('\t\n', '') for row in file]
14 |
15 |
def parse_lexicon_to_list(lexicon):
    """
    Load a lexicon text file into a list of [word, phonemes] pairs.

    Each row is split on single spaces: the first token is the word entry and
    the remaining tokens form its phoneme list.

    :param lexicon: path of the lexicon file
    :return: list of [first_token, [remaining tokens]] items, one per row
    """
    entries = []
    with open(lexicon) as file:
        for raw_row in file:
            # str.split always yields at least one token, so the unpack is safe
            # even for an empty row (which gives ['', []]).
            word, *phones = raw_row.replace('\n', '').split(' ')
            entries.append([word, phones])
    return entries
23 |
24 |
def lexicon_finder(dict_lexicon_organized, pho_list):
    """
    Look up the key of the organized lexicon whose value equals pho_list.

    :param dict_lexicon_organized: mapping from word entry to its value
    :param pho_list: value to search for (compared with ==)
    :return: the first key, in dictionary order, whose value equals pho_list
    :raises ValueError: when no value of the mapping equals pho_list
    """
    not_found = object()
    matching_keys = (syl_organized
                     for syl_organized, dict_pho_list in dict_lexicon_organized.items()
                     if pho_list == dict_pho_list)
    found = next(matching_keys, not_found)
    if found is not_found:
        raise ValueError("Not found word entry for {}".format(pho_list))
    return found
34 |
35 |
if __name__ == "__main__":

    # NOTE(review): machine-specific absolute paths; they only resolve on the
    # original author's checkout.
    path_test_ali = "/Users/ronggong/PycharmProjects/mispronunciation-detection/kaldi_alignment/exp/mono_test_ali/"
    path_lang_test = "/Users/ronggong/PycharmProjects/mispronunciation-detection/kaldi_alignment/data/dict_test"
    filename_decoded_pronunciation = os.path.join(path_test_ali, "pron_perutt_nowb.txt")

    # Loaded but not used further down in this script.
    list_lexicon = parse_lexicon_to_list(os.path.join(path_lang_test, 'lexicon.txt'))

    with open(os.path.join(path_lang, "dict_lexicon_repetition_syllable_special.json"), "r") as read_file:
        dict_lexicon = json.load(read_file)

    utts = open_decoded_pronunciation(filename_decoded_pronunciation)

    with open(os.path.join(path_test_ali, 'text_decoded'), "w") as f:
        for utt in utts:
            # First tab-separated field is kept as-is, the others are decoded units.
            utt_id, *decoded_units = utt.split('\t')
            utt_organized = [utt_id]

            for pho_list in decoded_units:
                if pho_list == 'SIL sil':
                    continue

                # First token is the syllable, the remainder its decoded pronunciation.
                syl, *pron_decoded = pho_list.split(' ')

                # gather all the special pronunciations listed for the syllable
                list_syllable_special_unit = [[special, pron_syl[0]]
                                              for special, pron_syl in dict_lexicon.items()
                                              if syl in pron_syl[1]]

                # keep the first special entry whose pronunciation matches,
                # written without its digits; nothing is written when none match
                for special, pron in list_syllable_special_unit:
                    if pron_decoded == pron:
                        utt_organized.append(''.join(ch for ch in special if not ch.isdigit()))
                        break

            f.write(' '.join(utt_organized) + '\n')
72 |
--------------------------------------------------------------------------------
/kaldi_alignment/srcPy/textgridParser.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 |
4 | import kaldi_alignment.srcPy.textgrid as tgp
5 |
6 |
def textGrid2WordList(textgrid_file, whichTier = 'pinyin', utf16 = True):
    '''
    Parse one tier of a TextGrid file into a python list of tokens.

    :param textgrid_file: path of the TextGrid file
    :param whichTier: name of the tier to extract ('pinyin' by default);
        tier names are compared after removing every '.' from them
    :param utf16: when True the file is loaded with TextGrid.loadUTF16,
        otherwise with TextGrid.load
    :return: (beginTsAndWordList, isTierFound) where each list item is
        [float(field 0), float(field 1), field 2] of a transcript entry
        (presumably begin time, end time and label), and isTierFound tells
        whether a tier with the requested name exists
    :raises Exception: when textgrid_file is not an existing file
    '''
    if not os.path.isfile(textgrid_file):
        raise Exception("file {} not found".format(textgrid_file))

    # pick the loader matching the requested encoding and load the object
    load_textgrid = tgp.TextGrid.loadUTF16 if utf16 else tgp.TextGrid.load
    par_obj = load_textgrid(textgrid_file)

    beginTsAndWordList = []
    isTierFound = False

    # iterate over the existing tiers and keep only the one(s) requested
    for tier in tgp.TextGrid._find_tiers(par_obj):
        if tier.tier_name().replace('.', '') != whichTier:
            continue
        isTierFound = True
        beginTsAndWordList.extend(
            [float(entry[0]), float(entry[1]), entry[2]]
            for entry in tier.make_simple_transcript()
        )

    if not isTierFound:
        print('Missing tier {1} in file {0}' .format(textgrid_file, whichTier))

    return beginTsAndWordList, isTierFound
38 |
39 |
def line2WordList(line, entireWordList):
    '''
    Collect the words of entireWordList that belong to a line.

    Collection starts at the first word whose start time equals the line's
    start time and stops right after the first word whose end time equals
    the line's end time (that word is always included). Times are compared
    with exact equality.

    :param line: line tuple [startTime, endTime, string]
    :param entireWordList: entire word list, items shaped like line
    :return: nested wordList
    '''
    collected = []
    inside_line = False
    for word in entireWordList:
        # the ending of the line: keep this word and stop
        if word[1] == line[1]:
            collected.append(word)
            break
        # the beginning of the line: start collecting from this word on
        if not inside_line and word[0] == line[0]:
            inside_line = True
        if inside_line:
            collected.append(word)

    return collected
61 |
62 |
def wordListsParseByLines(entireLine, entireWordList):
    '''
    Find the word list of each non-empty line, cutting the word list
    according to the line boundaries.

    A line whose text is empty or contains only spaces is skipped and not
    counted.

    :param entireLine: entire lines in line tier, items [start, end, text]
    :param entireWordList: entire word lists in pinyin tier
    :return:
        nestedWordLists: [[line0, wordList0], [line1, wordList1], ...]
        numLines: sum of number of lines
        numWords: sum of number of words
    '''
    nestedWordLists = []
    numLines = numWords = 0

    for line in entireLine:
        if not line[2].replace(" ", ""):
            continue  # blank line, nothing to collect

        numLines += 1
        # every word returned for the line is kept, including empty ones
        words_in_line = list(line2WordList(line, entireWordList))
        numWords += len(words_in_line)
        nestedWordLists.append([line, words_in_line])

    return nestedWordLists, numLines, numWords
93 |
94 |
def syllableTextgridExtraction(textgrid_path, recording, tier0, tier1):
    '''
    Extract the parent-tier (syllable) intervals of a recording's TextGrid and
    nest the child-tier (phoneme) intervals under them.

    The file read is <textgrid_path>/<recording>.TextGrid; both arguments are
    also printed to stdout.

    :param textgrid_path: directory holding the TextGrid file
    :param recording: recording name, without the '.TextGrid' extension
    :param tier0: parent tier
    :param tier1: child tier which should be covered by parent tier
    :return:
        nestedPhonemeLists, element[0] - syllable, element[1] - a list containing the phoneme of the syllable
        numSyllables, number of non-empty parent intervals
        numPhonemes, number of child intervals nested under them
    '''
    print(textgrid_path, recording)
    textgrid_file = os.path.join(textgrid_path, recording+'.TextGrid')

    # the "tier found" flag returned by the parser is ignored for both tiers
    parent_intervals = textGrid2WordList(textgrid_file, whichTier=tier0)[0]
    child_intervals = textGrid2WordList(textgrid_file, whichTier=tier1)[0]

    # group the child intervals under the parent intervals
    return wordListsParseByLines(parent_intervals, child_intervals)
117 |
118 |
--------------------------------------------------------------------------------
/neural_net/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/__init__.py
--------------------------------------------------------------------------------
/neural_net/combine_feature_label.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | from neural_net.file_path import *
4 |
5 |
def combine_feature_label(dict_positive, dict_negative):
    """
    Combine positive and negative features and labels into two sequences.

    The features of every key of dict_positive come first (label 1), followed
    by the features of every key of dict_negative (label 0), each in
    dictionary order.

    :param dict_positive: mapping whose values are sequences of positive features
    :param dict_negative: mapping whose values are sequences of negative features
    :return: (X, y) with X the flat list of features and y a numpy array of
        the matching 1/0 labels
    """
    X = []
    labels = []
    for label, feature_dict in ((1, dict_positive), (0, dict_negative)):
        for features in feature_dict.values():
            X.extend(features)
            labels.extend([label] * len(features))

    return X, np.array(labels)
24 |
25 |
if __name__ == "__main__":

    def _load_pickle(path):
        """Return the object stored in the pickle file at path."""
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use it on feature files produced by this project.
        with open(path, "rb") as f:
            return pickle.load(f)

    feature_special_pos = _load_pickle(dict_special_positive)
    feature_special_neg = _load_pickle(dict_special_negative)
    feature_jianzi_pos = _load_pickle(dict_jianzi_positive)
    feature_jianzi_neg = _load_pickle(dict_jianzi_negative)

    X_special, y_special = combine_feature_label(dict_positive=feature_special_pos,
                                                 dict_negative=feature_special_neg)

    X_jianzi, y_jianzi = combine_feature_label(dict_positive=feature_jianzi_pos,
                                               dict_negative=feature_jianzi_neg)

    # number of positive samples and total number of samples, per task
    for task_labels in (y_special, y_jianzi):
        print(np.count_nonzero(task_labels), len(task_labels))
--------------------------------------------------------------------------------
/neural_net/data/mispronunciation_filelist_test.csv:
--------------------------------------------------------------------------------
1 | part3,20171211SongRuoXuan,daxp-Fei_shi_wo-Hua_tian_cuo-dxjky,student01,,
2 | part3,20171211SongRuoXuan,daxp-Fei_shi_wo-Hua_tian_cuo-dxjky,student02,,
3 | part3,20171211SongRuoXuan,daxp-Fei_shi_wo-Hua_tian_cuo-dxjky,student03,,
4 | part3,20171211SongRuoXuan,daxp-Fei_shi_wo-Hua_tian_cuo-dxjky,student04,,
5 | part3,20171211SongRuoXuan,daxp-Meng_ting_de-Mu_gui_ying_gua_shuai-dxjky,student01,,
6 | part3,20171211SongRuoXuan,daxp-Meng_ting_de-Mu_gui_ying_gua_shuai-dxjky,student02,,
7 | part3,20171211SongRuoXuan,daxp-Meng_ting_de-Mu_gui_ying_gua_shuai-dxjky,student03,,
8 | part3,20171211SongRuoXuan,daxp-Meng_ting_de-Mu_gui_ying_gua_shuai-dxjky,student04,,
9 | part3,20171211SongRuoXuan,daxp-Meng_ting_de-Mu_gui_ying_gua_shuai-dxjky,student05,,
10 | part3,20171211SongRuoXuan,daxp-Meng_ting_de-Mu_gui_ying_gua_shuai-dxjky,student06,,
11 | part3,20171211SongRuoXuan,daxp-Meng_ting_de-Mu_gui_ying_gua_shuai-dxjky,student07,,
12 | part3,20171214SongRuoXuan,daeh-Yang_yu_huan-Tai_zhen_wai_zhuan-nanluo,student_01,,
13 | part3,20171214SongRuoXuan,daeh-Yang_yu_huan-Tai_zhen_wai_zhuan-nanluo,student_02,,
14 | part3,20171214SongRuoXuan,daeh-Yang_yu_huan-Tai_zhen_wai_zhuan-nanluo,student_03,,
15 | part3,20171214SongRuoXuan,daeh-Yang_yu_huan-Tai_zhen_wai_zhuan-nanluo,student_04,,
16 | part3,20171214SongRuoXuan,daeh-Yang_yu_huan-Tai_zhen_wai_zhuan-nanluo,student_05,,
17 | part3,20171214SongRuoXuan,daxp-Quan_jun_wang-Ba_wang_bie_ji-nanluo,student_02,,
18 | part3,20171215SongRuoXuan,daxp-Jiao_zhang_sheng-Xi_shi-qianmen,student_01,,
19 | part3,20171215SongRuoXuan,daxp-Jiao_zhang_sheng-Xi_shi-qianmen,student_02,,
20 | part3,20171215SongRuoXuan,daxp-Jiao_zhang_sheng-Xi_shi-qianmen,student_03,,
21 | part3,20171215SongRuoXuan,daxp-Jiao_zhang_sheng-Xi_shi-qianmen,student_04,,
22 | part3,20171215SongRuoXuan,daxp-Jiao_zhang_sheng-Xi_shi-qianmen,student_05,,
23 | part3,20171215SongRuoXuan,daxp-Jiao_zhang_sheng-Xi_shi-qianmen,student_06,,
24 | part3,20171217TianHao,lsxp-Jiang_shen_er-San_jia_dian-sizhu,student_01,,
25 | part3,20171217TianHao,lsxp-Jiang_shen_er-San_jia_dian-sizhu,student_02,,
26 | part3,20171217TianHao,lsxp-Jiang_shen_er-San_jia_dian-sizhu,student_04_mentougou,,
27 | part3,20171217TianHao,lsxp-Wei_guo_jia-Hong_yang_dong-sizhu,student_01,,
28 | part3,20171217TianHao,lsxp-Wei_guo_jia-Hong_yang_dong-sizhu,student_02,,
29 | part3,20171217TianHao,lsxp-Wei_guo_jia-Hong_yang_dong-sizhu,student_03,,
30 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,student_01_dxjky,,
31 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,student_01_sizhu,,
32 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,student_02_dxjky,,
33 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,student_03_dxjky,,
34 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,student_04_dxjky,,
35 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,student_05_dxjky,,
36 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,student_06_mentougou,,
--------------------------------------------------------------------------------
/neural_net/data/mispronunciation_filelist_train.csv:
--------------------------------------------------------------------------------
1 | part1,,danAll,dafeh-Bi_yun_tian-Xi_xiang_ji01-qm,,
2 | part1,,danAll,danbz-Bei_jiu_chan-Chun_gui_men01-qm,,
3 | part1,,danAll,danbz-Kan_dai_wang-Ba_wang_bie_ji01-qm,,
4 | part1,,danAll,daspd-Hai_dao_bing-Gui_fei_zui_jiu02-qm,,
5 | part1,,danAll,daxp-Chun_qiu_ting-Suo_lin_nang01-qm,,
6 | part1,,danAll,daxp-Jiao_Zhang_sheng-Hong_niang01-qm,,
7 | part1,,danAll,daxp-Jiao_Zhang_sheng-Hong_niang04-qm,,
8 | part1,,danAll,daxp-Meng_ting_de-Mu_Gui_ying_gua_shuai02-qm,,
9 | part1,,danAll,daxp-Meng_ting_de-Mu_Gui_ying_gua_shuai04-qm,,
10 | part1,,danAll,daxp-Zhe_cai_shi-Suo_lin_nang01-qm,,
11 | part1,,laosheng,lseh-Tan_Yang_jia-Hong_yang_dong-qm,,
12 | part1,,laosheng,lseh-Wei_guo_jia-Hong_yang_dong02-qm,,
13 | part1,,laosheng,lseh-Wo_ben_shi-Qiong_lin_yan-qm,,
14 | part1,,laosheng,lseh-Yi_lun_ming-Wen_zhao_guan-qm,,
15 | part1,,laosheng,lseh-Zi_na_ri-Hong_yang_dong-qm,,
16 | part1,,laosheng,lsxp-Guo_liao_yi-Wen_zhao_guan02-qm,,
17 | part1,,laosheng,lsxp-Huai_nan_wang-Huai_he_ying02-qm,,
18 | part1,,laosheng,lsxp-Jiang_shen_er-San_jia_dian02-qm,,
19 | part1,,laosheng,lsxp-Qian_bai_wan-Si_lang_tang_mu01-qm,,
20 | part1,,laosheng,lsxp-Quan_qian_sui-Gan_lu_si-qm,,
21 | part1,,laosheng,lsxp-Shi_ye_shuo-Ding_jun_shan-qm,,
22 | part1,,laosheng,lsxp-Wo_ben_shi-Kong_cheng_ji-qm,,
23 | part1,,laosheng,lsxp-Wo_zheng_zai-Kong_cheng_ji04-qm,,
24 | part1,,laosheng,lsxp-Xi_ri_you-Zhu_lian_zhai-qm,,
25 | part1,,danAll,daeh-Yang_Yu_huan-Tai_zhen_wai_zhuan-lon,,
26 | part1,,danAll,daspd-Du_shou_kong-Wang_jiang_ting-upf,,
27 | part1,,danAll,daspd-Hai_dao_bing-Gui_fei_zui_jiu01-lon,,
28 | part1,,danAll,daxp-Guan_Shi_yin-Tian_nv_san_hua-lon,,
29 | part1,,danAll,daxp-Meng_ting_de-Mu_Gui_ying_gua_shuai01-upf,,
30 | part1,,laosheng,lseh-Wei_guo_jia-Hong_yang_dong01-lon,,
31 | part1,,laosheng,lsxp-Huai_nan_wang-Huai_he_ying01-lon,,
32 | part1,,laosheng,lsxp-Jiang_shen_er-San_jia_dian01-1-upf,,
33 | part1,,laosheng,lsxp-Jiang_shen_er-San_jia_dian01-2-upf,,
34 | part1,,laosheng,lsxp-Wo_zheng_zai-Kong_cheng_ji01-upf,,
35 | part2,,20170327LiaoJiaNi,lsxp-Yi_ma_li-Wu_jia_po-nacta,,
36 | part2,,20170327LiaoJiaNi,lseh-Niang_zi_bu-Sou_gu_jiu-nacta,,
37 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,teacher,,
38 | part3,20171211SongRuoXuan,daxp-Fei_shi_wo-Hua_tian_cuo-dxjky,teacher,,
39 | part3,20171211SongRuoXuan,daxp-Wo_jia_di-Hong_deng_ji-dxjky,teacher,,
40 | part3,20171214SongRuoXuan,daeh-Yang_yu_huan-Tai_zhen_wai_zhuan-nanluo,teacher,,
41 | part3,20171214SongRuoXuan,danbz-Kan_dai_wang-Ba_wang_bie_ji-nanluo,teacher,,
42 | part3,20171214SongRuoXuan,daspd-Hai_dao_bing-Gui_fei_zui_jiu-nanluo,teacher,,
43 | part3,20171214SongRuoXuan,daxp-Quan_jun_wang-Ba_wang_bie_ji-nanluo,teacher,,
44 | part3,20171217TianHao,lseh-Wo_men_shi-Zhi_qu-sizhu,teacher,,
45 | part3,20171217TianHao,lsxp-Jiang_shen_er-San_jia_dian-sizhu,teacher,,
46 | part3,20171217TianHao,lsxp-Lin_xing_he_ma-Hong_deng_ji-sizhu,teacher,,
47 | part3,20171217TianHao,lsxp-Wei_guo_jia-Hong_yang_dong-sizhu,teacher,,
48 | part3,2017121718SongRuoXuan,lsxp-Zhe_yi_feng-Ding_jun_shan-dxjky-sizhu,teacher,,
49 | part3,20171217TianHao,lsxp-Ti_lan_xiao_mai-Hong_deng_ji-sizhu_mentougou,teacher,,
50 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,student01,,
51 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,student02_first_half,,
52 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,student02,,
53 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,student03,,
54 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,student04,,
55 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,student05,,
56 | part3,20171211SongRuoXuan,daxp_Qing_zao_qi_lai-Mai_shui-dxjky,student06,,
57 | part3,20171211SongRuoXuan,daxp-Wo_jia_di-Hong_deng_ji-dxjky,student01,,
58 | part3,20171211SongRuoXuan,daxp-Wo_jia_di-Hong_deng_ji-dxjky,student02,,
59 | part3,20171211SongRuoXuan,daxp-Wo_jia_di-Hong_deng_ji-dxjky,student03,,
60 | part3,20171211SongRuoXuan,daxp-Wo_jia_di-Hong_deng_ji-dxjky,student04,,
61 | part3,20171211SongRuoXuan,daxp-Wo_jia_di-Hong_deng_ji-dxjky,student05,,
62 | part3,20171211SongRuoXuan,daxp-Wo_jia_di-Hong_deng_ji-dxjky,student06,,
63 | part3,20171214SongRuoXuan,danbz-Kan_dai_wang-Ba_wang_bie_ji-nanluo,student_01,,
64 | part3,20171214SongRuoXuan,danbz-Kan_dai_wang-Ba_wang_bie_ji-nanluo,student_02,,
65 | part3,20171214SongRuoXuan,danbz-Kan_dai_wang-Ba_wang_bie_ji-nanluo,student_03,,
66 | part3,20171214SongRuoXuan,daspd-Hai_dao_bing-Gui_fei_zui_jiu-nanluo,student_01,,
67 | part3,20171214SongRuoXuan,daxp-Quan_jun_wang-Ba_wang_bie_ji-nanluo,student_01,,
68 | part3,20171214SongRuoXuan,daxp-Quan_jun_wang-Ba_wang_bie_ji-nanluo,student_03,,
69 | part3,20171217TianHao,lseh-Wo_men_shi-Zhi_qu-sizhu,student_01,,
70 | part3,20171217TianHao,lseh-Wo_men_shi-Zhi_qu-sizhu,student_02,,
71 | part3,20171217TianHao,lsxp-Lin_xing_he_ma-Hong_deng_ji-sizhu,student_01,,
72 | part3,20171217TianHao,lsxp-Lin_xing_he_ma-Hong_deng_ji-sizhu,student_02,,
73 | part3,20171217TianHao,lsxp-Ti_lan_xiao_mai-Hong_deng_ji-sizhu_mentougou,student_01_sizhu,,
--------------------------------------------------------------------------------
/neural_net/data/normal_jianzi.json:
--------------------------------------------------------------------------------
1 | ["xiu", "jue", "xian", "xiang", "zheng", "xin", "qiu", "qie", "chu", "qing", "ji", "xiao", "qian", "xi"]
--------------------------------------------------------------------------------
/neural_net/data/normal_special.json:
--------------------------------------------------------------------------------
1 | ["na", "ai", "ri", "zhi", "ru", "shuo", "zhuang", "ming", "bei", "peng", "nei", "lai", "bai", "chang", "he", "cheng", "lei", "neng", "zhu", "jing", "shi", "lv", "me", "ding", "sheng", "wu", "xing", "qing", "hai", "shu", "ting", "ping", "meng", "ge", "quan", "an", "zheng", "bing", "ying", "e", "wo", "jie", "chu", "ke", "ceng", "que", "ji", "mao", "zei", "chun", "ling", "yuan", "fei", "ning", "deng", "zeng", "mai", "xie", "zhan", "zhao", "feng", "huai", "luo", "zhe"]
--------------------------------------------------------------------------------
/neural_net/file_path.py:
--------------------------------------------------------------------------------
1 | import os
2 | from neural_net.utils.csv_preprocessing import open_csv_recordings
3 |
# Directory containing this module; every repository-relative path below is
# built from it so the package works from any checkout location.
dir_path = os.path.dirname(os.path.realpath(__file__))

# NOTE(review): machine-specific location of the audio/annotation datasets;
# it lives outside the repository and has to be adapted on another machine.
path_root = '/Users/ronggong/Documents_using/MTG_document/Jingju_arias/'

# Dataset folder names, joined to path_root by the callers.
path_nacta = 'jingju_a_cappella_singing_dataset'
path_nacta2017 = 'jingju_a_cappella_singing_dataset_extended_nacta2017'
path_primary = 'primary_school_recording'

# Train / test recording lists, read from the CSV file lists at import time.
recordings_train = open_csv_recordings(os.path.join(dir_path, "data/mispronunciation_filelist_train.csv"))
recordings_test = open_csv_recordings(os.path.join(dir_path, "data/mispronunciation_filelist_test.csv"))

# JSON syllable lists for the "special" and "jianzi" tasks.
filename_normal_special = os.path.join(dir_path, "data/normal_special.json")
filename_normal_jianzi = os.path.join(dir_path, "data/normal_jianzi.json")

# Pickled positive / negative feature dictionaries for both tasks.
dict_special_positive = os.path.join(dir_path, "data/special_positive.pkl")
dict_special_negative = os.path.join(dir_path, "data/special_negative.pkl")
dict_jianzi_positive = os.path.join(dir_path, "data/jianzi_positive.pkl")
dict_jianzi_negative = os.path.join(dir_path, "data/jianzi_negative.pkl")

joint_cnn_model_path = os.path.join(dir_path, 'model', 'segmentation')

# Model files currently in use; alternative models are kept commented out.
filename_special_model = os.path.join(dir_path, "model", "special_model_prod_True_True_0.5.h5")
filename_jianzi_model = os.path.join(dir_path, "model", "jianzi_model_prod_True_True_0.5.h5")
# filename_jianzi_model = os.path.join(dir_path, "model", "jianzi_model_prod_feedforward_True_0.5.h5")

# filename_special_model = os.path.join(dir_path, "model", "special_model_prod_tcn_0.05.h5")
# filename_jianzi_model = os.path.join(dir_path, "model", "jianzi_model_prod_tcn_0.05.h5")

filename_result_decoded_mispronunciaiton = os.path.join(dir_path, "results", "text_decoded_special_True_True_0.5")
# filename_result_decoded_mispronunciaiton = os.path.join(dir_path, "results", "text_decoded_special_feedforward_True_0.5")

# Figures folder inside this package. It used to be a hard-coded absolute path
# into one particular checkout (<repo>/neural_net/figs/jianzi); deriving it
# from dir_path yields the same folder there and a valid one everywhere else.
path_figs_jianzi = os.path.join(dir_path, "figs", "jianzi")
36 |
37 |
def getRecordings(wav_path):
    """
    List the extension-less names of all files found below wav_path.

    The directory tree is walked recursively; files named '.DS_Store' are
    ignored. Names are returned in os.walk order and may repeat when the same
    base name exists in several folders or with several extensions.

    :param wav_path: root directory to scan
    :return: list of file names without their extension
    """
    stems = (os.path.splitext(name)[0]
             for _, _, names in os.walk(wav_path)
             for name in names)
    return [stem for stem in stems if stem != '.DS_Store']
47 |
48 |
def parse_recordings(rec):
    """
    Translate one row of a recording file list into dataset locations and
    TextGrid tier names.

    rec[0] selects the dataset: "part1", "part2", or anything else (handled
    as the primary-school recordings). The role-type is derived from the first
    two characters of rec[3] for part1/part2 and of rec[2] otherwise:
    'da' -> 'Dan', 'ls' -> 'Laosheng'.

    :param rec: row of the file list, indexable with at least 4 fields
    :return: tuple (data_path, sub_folder, textgrid_folder, wav_folder,
        filename, line_tier, longsyllable_tier, syllable_tier, phoneme_tier,
        special_tier, special_class_tier, roletype)
    :raises ValueError: when the role-type prefix is neither 'da' nor 'ls'
    """
    def _roletype(name):
        # Map the two-letter prefix of name to its role-type.
        prefix = name[:2]
        if prefix == 'da':
            return 'Dan'
        if prefix == 'ls':
            return 'Laosheng'
        raise ValueError("Not exist a role-type {} for file {}".format(prefix, rec))

    part = rec[0]
    if part == "part1":
        data_path, sub_folder = path_nacta, rec[2]
        textgrid_folder, wav_folder, syllable_tier = "textgrid", "wav_left", "dian"
        roletype = _roletype(rec[3])
    elif part == "part2":
        data_path, sub_folder = path_nacta2017, rec[2]
        textgrid_folder, wav_folder, syllable_tier = "textgridDetails", "wav", "dianSilence"
        roletype = _roletype(rec[3])
    else:
        # here the aria name sits in rec[2] and is nested under rec[1]
        data_path, sub_folder = path_primary, rec[1] + "/" + rec[2]
        textgrid_folder, wav_folder, syllable_tier = "textgrid", "wav_left", "dianSilence"
        roletype = _roletype(rec[2])

    # the file name and the remaining tier names do not depend on the dataset
    return (data_path, sub_folder, textgrid_folder,
            wav_folder, rec[3], "line",
            "longsyllable", syllable_tier, "details",
            "special", "specialClass", roletype)
--------------------------------------------------------------------------------
/neural_net/keras_tcn/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .idea/
6 | .DS_Store
7 |
8 | *.tsv
9 | *.tar.gz
10 | *out*
11 | credentials.json
12 |
13 | *.json
14 |
15 | nohup.out
16 | *.out
17 | *.txt
18 |
19 | # C extensions
20 | *.so
21 |
22 | # Distribution / packaging
23 | .Python
24 | env/
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | wheels/
37 | *.egg-info/
38 | .installed.cfg
39 | *.egg
40 |
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 |
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 |
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .coverage
55 | .coverage.*
56 | .cache
57 | nosetests.xml
58 | coverage.xml
59 | *.cover
60 | .hypothesis/
61 |
62 | # Translations
63 | *.mo
64 | *.pot
65 |
66 | # Django stuff:
67 | *.log
68 | local_settings.py
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # celery beat schedule file
90 | celerybeat-schedule
91 |
92 | # SageMath parsed files
93 | *.sage.py
94 |
95 | # dotenv
96 | .env
97 |
98 | # virtualenv
99 | .venv
100 | venv/
101 | ENV/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Philippe Rémy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/README.md:
--------------------------------------------------------------------------------
1 | # Keras TCN
2 | *Keras Temporal Convolutional Network*
3 |
4 | * [Keras TCN](#keras-tcn)
5 | * [Why Temporal Convolutional Network?](#why-temporal-convolutional-network)
6 | * [API](#api)
7 | * [Regression (Many to one) e.g. adding problem](#--regression-many-to-one-eg-adding-problem)
8 | * [Classification (Many to many) e.g. copy memory task](#--classification-many-to-many-eg-copy-memory-task)
9 | * [Classification (Many to one) e.g. sequential mnist task](#--classification-many-to-one-eg-sequential-mnist-task)
10 | * [Installation](#installation)
11 | * [Run](#run)
12 | * [Tasks](#tasks)
13 | * [References](#references)
14 |
15 | ## Why Temporal Convolutional Network?
16 |
17 | - TCNs exhibit longer memory than recurrent architectures with the same capacity.
18 | - They consistently perform better than LSTM/GRU architectures on a vast range of tasks (Seq. MNIST, Adding Problem, Copy Memory, Word-level PTB...).
19 | - Parallelism, flexible receptive field size, stable gradients, low memory requirements for training, variable length inputs...
20 |
21 |
22 |
23 | Visualization of a stack of dilated causal convolutional layers (Wavenet, 2016)
24 |
25 |
26 | ## API
27 |
28 | After installation, the model can be imported like this:
29 |
30 | ```
31 | from tcn import tcn
32 | ```
33 |
34 | In the following examples, we assume the input to have a shape `(batch_size, timesteps, input_dim)`.
35 |
36 | The model is a Keras model. The model functions (`model.summary`, `model.fit`, `model.predict`...) are all functional.
37 |
38 |
39 |
40 | ### - Regression (Many to one) e.g. adding problem
41 |
42 | ```
43 | model = tcn.dilated_tcn(output_slice_index='last',
44 | num_feat=input_dim,
45 | num_classes=None,
46 | nb_filters=24,
47 | kernel_size=8,
48 | dilatations=[1, 2, 4, 8],
49 | nb_stacks=8,
50 | max_len=timesteps,
51 | activation='norm_relu',
52 | regression=True)
53 | ```
54 |
55 | For a Many to Many regression, a cheap fix for now is to change the [number of units of the final Dense layer](https://github.com/philipperemy/keras-tcn/blob/8151b4a87f906fd856fd1c113c48392d542d0994/tcn/tcn.py#L90).
56 |
57 | ### - Classification (Many to many) e.g. copy memory task
58 |
59 | ```
60 | model = tcn.dilated_tcn(num_feat=input_dim,
61 | num_classes=10,
62 | nb_filters=10,
63 | kernel_size=8,
64 | dilatations=[1, 2, 4, 8],
65 | nb_stacks=8,
66 | max_len=timesteps,
67 | activation='norm_relu')
68 | ```
69 |
70 | ### - Classification (Many to one) e.g. sequential mnist task
71 |
72 | ```
73 | model = tcn.dilated_tcn(output_slice_index='last',
74 | num_feat=input_dim,
75 | num_classes=10,
76 | nb_filters=64,
77 | kernel_size=8,
78 | dilatations=[1, 2, 4, 8],
79 | nb_stacks=8,
80 | max_len=timesteps,
81 | activation='norm_relu')
82 | ```
83 |
84 | ## Installation
85 |
86 | ```
87 | git clone git@github.com:philipperemy/keras-tcn.git
88 | cd keras-tcn
89 | virtualenv -p python3.6 venv
90 | source venv/bin/activate
91 | pip install -r requirements.txt # change to tensorflow if you don't have a GPU.
92 | python setup.py install # install keras-tcn as a package
93 | ```
94 |
95 | ## Run
96 |
97 | Once `keras-tcn` is installed as a package, you can take a glimpse of what's possible to do with TCNs. Some task examples are available in the repository for this purpose:
98 |
99 | ```
100 | cd adding_problem/
101 | python main.py # run adding problem task
102 |
103 | cd copy_memory/
104 | python main.py # run copy memory task
105 |
106 | cd mnist_pixel/
107 | python main.py # run sequential mnist pixel task
108 | ```
109 |
110 | ## Tasks
111 |
112 | ### Adding Task
113 |
114 | The task consists of feeding a large array of decimal numbers to the network, along with a boolean array of the same length. The objective is to sum the two decimals at the positions where the boolean array contains its two 1s.
115 |
116 | #### Explanation
117 |
118 |
119 |
120 | Adding Problem Task
121 |
122 |
123 | #### Implementation results
124 |
125 | The model takes time to learn this task. It's symbolized by a very long plateau (could take ~8 epochs on some runs).
126 |
127 | ```
128 | 200000/200000 [==============================] - 451s 2ms/step - loss: 0.1749 - val_loss: 0.1662
129 | 200000/200000 [==============================] - 449s 2ms/step - loss: 0.1681 - val_loss: 0.1676
130 | 200000/200000 [==============================] - 449s 2ms/step - loss: 0.1677 - val_loss: 0.1663
131 | 200000/200000 [==============================] - 449s 2ms/step - loss: 0.1676 - val_loss: 0.1652
132 | 200000/200000 [==============================] - 449s 2ms/step - loss: 0.1165 - val_loss: 0.0093
133 | 200000/200000 [==============================] - 448s 2ms/step - loss: 0.0083 - val_loss: 0.0033
134 | 200000/200000 [==============================] - 448s 2ms/step - loss: 0.0040 - val_loss: 0.0012
135 | ```
136 |
137 | ### Copy Memory Task
138 |
139 | The copy memory consists of a very large array:
140 | - At the beginning, there's the vector x of length N. This is the vector to copy.
141 | - At the end, N+1 9s are present. The first 9 is seen as a delimiter.
142 | - In the middle, only 0s are there.
143 |
144 | The idea is to copy the content of the vector x to the end of the large array. The task is made sufficiently complex by increasing the number of 0s in the middle.
145 |
146 | #### Explanation
147 |
148 |
149 |
150 | Copy Memory Task
151 |
152 |
153 | #### Implementation results
154 |
155 | ```
156 | 10000/10000 [==============================] - 20s 2ms/step - loss: 0.3474 - acc: 0.8985 - val_loss: 0.0362 - val_acc: 0.9859
157 | 10000/10000 [==============================] - 13s 1ms/step - loss: 0.0360 - acc: 0.9859 - val_loss: 0.0353 - val_acc: 0.9859
158 | 10000/10000 [==============================] - 13s 1ms/step - loss: 0.0351 - acc: 0.9859 - val_loss: 0.0345 - val_acc: 0.9859
159 | 10000/10000 [==============================] - 13s 1ms/step - loss: 0.0342 - acc: 0.9860 - val_loss: 0.0336 - val_acc: 0.9860
160 | 10000/10000 [==============================] - 13s 1ms/step - loss: 0.0332 - acc: 0.9865 - val_loss: 0.0307 - val_acc: 0.9883
161 | 10000/10000 [==============================] - 13s 1ms/step - loss: 0.0240 - acc: 0.9898 - val_loss: 0.0157 - val_acc: 0.9933
162 | 10000/10000 [==============================] - 13s 1ms/step - loss: 0.0136 - acc: 0.9951 - val_loss: 0.0094 - val_acc: 0.9976
163 | 10000/10000 [==============================] - 13s 1ms/step - loss: 0.0087 - acc: 0.9978 - val_loss: 0.0049 - val_acc: 1.0000
164 | 10000/10000 [==============================] - 14s 1ms/step - loss: 0.0050 - acc: 0.9992 - val_loss: 0.0020 - val_acc: 1.0000
165 | ```
166 |
167 | ### Sequential MNIST
168 |
169 | #### Explanation
170 |
171 | The idea here is to consider MNIST images as 1-D sequences and feed them to the network. This task is particularly hard because sequences are 28*28 = 784 elements long. In order to classify correctly, the network has to remember the whole sequence. Standard LSTMs are unable to perform well on this task.
172 |
173 |
174 |
175 | Sequential MNIST
176 |
177 |
178 | #### Implementation results
179 |
180 | ```
181 | 60000/60000 [==============================] - 569s 9ms/step - loss: 0.2209 - acc: 0.9303 - val_loss: 0.0699 - val_acc: 0.9781
182 | 60000/60000 [==============================] - 545s 9ms/step - loss: 0.0784 - acc: 0.9760 - val_loss: 0.0507 - val_acc: 0.9843
183 | 60000/60000 [==============================] - 553s 9ms/step - loss: 0.0599 - acc: 0.9824 - val_loss: 0.0512 - val_acc: 0.9840
184 | 60000/60000 [==============================] - 555s 9ms/step - loss: 0.0493 - acc: 0.9851 - val_loss: 0.0569 - val_acc: 0.9824
185 | 60000/60000 [==============================] - 549s 9ms/step - loss: 0.0421 - acc: 0.9868 - val_loss: 0.0424 - val_acc: 0.9864
186 | 60000/60000 [==============================] - 558s 9ms/step - loss: 0.0358 - acc: 0.9886 - val_loss: 0.0416 - val_acc: 0.9874
187 | 60000/60000 [==============================] - 536s 9ms/step - loss: 0.0317 - acc: 0.9901 - val_loss: 0.0566 - val_acc: 0.9835
188 | 60000/60000 [==============================] - 483s 8ms/step - loss: 0.0272 - acc: 0.9915 - val_loss: 0.0565 - val_acc: 0.9845
189 | 60000/60000 [==============================] - 489s 8ms/step - loss: 0.0278 - acc: 0.9915 - val_loss: 0.0421 - val_acc: 0.9874
190 | 60000/60000 [==============================] - 483s 8ms/step - loss: 0.0227 - acc: 0.9929 - val_loss: 0.0464 - val_acc: 0.9882
191 | 60000/60000 [==============================] - 484s 8ms/step - loss: 0.0203 - acc: 0.9935 - val_loss: 0.0428 - val_acc: 0.9890
192 | 60000/60000 [==============================] - 484s 8ms/step - loss: 0.0212 - acc: 0.9934 - val_loss: 0.0539 - val_acc: 0.9884
193 | 60000/60000 [==============================] - 483s 8ms/step - loss: 0.0167 - acc: 0.9947 - val_loss: 0.0393 - val_acc: 0.9900
194 | ```
195 |
196 |
197 |
198 | ## References
199 | - https://github.com/locuslab/TCN/ (TCN for Pytorch)
200 | - https://arxiv.org/pdf/1803.01271.pdf (An Empirical Evaluation of Generic Convolutional and Recurrent Networks
201 | for Sequence Modeling)
202 | - https://arxiv.org/pdf/1609.03499.pdf (Original Wavenet paper)
203 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/__init__.py
--------------------------------------------------------------------------------
/neural_net/keras_tcn/adding_problem/README.md:
--------------------------------------------------------------------------------
1 | ## The Adding Problem
2 |
3 | ### Overview
4 |
5 | In this task, each input consists of a length-T sequence of depth 2, with all values randomly
6 | chosen in [0, 1] in dimension 1. The second dimension consists of all zeros except for
7 | two elements, which are marked by 1. The objective is to sum the two random values whose second
8 | dimensions are marked by 1. One can think of this as computing the dot product of two dimensions.
9 |
10 | Simply predicting the sum to be 1 should give an MSE of about 0.1767.
11 |
12 | ### Data Generation
13 |
14 | See `data_generator` in `utils.py`.
15 |
16 | ### Note
17 |
18 | Because a TCN's receptive field depends on depth of the network and the filter size, we need
19 | to make sure the model we use can cover the sequence length T.
20 |
21 | From: https://github.com/locuslab/TCN/
--------------------------------------------------------------------------------
/neural_net/keras_tcn/adding_problem/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/adding_problem/__init__.py
--------------------------------------------------------------------------------
/neural_net/keras_tcn/adding_problem/main.py:
--------------------------------------------------------------------------------
1 | import keras
2 |
3 | from tcn import tcn
4 | from utils import data_generator
5 |
6 | x_train, y_train = data_generator(n=200000, seq_length=600)
7 | x_test, y_test = data_generator(n=40000, seq_length=600)
8 |
9 |
class PrintSomeValues(keras.callbacks.Callback):
    """Keras callback that prints one test sample, its target and the model's current prediction."""

    def on_epoch_begin(self, epoch, logs=None):
        """Print the first test sample, its target and the prediction for it.

        :param epoch: index of the epoch that is about to start (not used here)
        :param logs: metrics dict supplied by Keras (not used here); defaults to
                     None rather than a shared mutable `{}`
        """
        # x_test / y_test are the module-level arrays created when this script is loaded.
        print(f'x_test[0:1] = {x_test[0:1]}.')
        print(f'y_test[0:1] = {y_test[0:1]}.')
        print(f'pred = {self.model.predict(x_test[0:1])}.')
16 |
17 |
def run_task():
    """Build a regression TCN for the adding problem and train it.

    Uses the module-level x_train / y_train / x_test / y_test arrays. The model
    reads its feature count and sequence length from x_train's shape and is
    trained for 500 epochs with a batch size of 128, printing a sample
    prediction at the start of every epoch.
    """
    model, param_str = tcn.dilated_tcn(output_slice_index='last',
                                       num_feat=x_train.shape[2],
                                       num_classes=0,
                                       nb_filters=24,
                                       kernel_size=8,
                                       dilatations=[1, 2, 4, 8],
                                       nb_stacks=8,
                                       max_len=x_train.shape[1],
                                       # `dropout` must be passed explicitly: omitting it makes
                                       # the call fail with a TypeError when the builder declares
                                       # it without a default. 0.0 disables dropout.
                                       dropout=0.0,
                                       activation='norm_relu',
                                       use_skip_connections=False,
                                       return_param_str=True,
                                       regression=True)

    print(f'x_train.shape = {x_train.shape}')
    print(f'y_train.shape = {y_train.shape}')

    psv = PrintSomeValues()

    # regression=True: the builder is asked for a regression model, so the
    # targets are the real-valued sums, not class indices.
    model.summary()

    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=500,
              callbacks=[psv], batch_size=128)
43 |
44 |
45 | if __name__ == '__main__':
46 | run_task()
47 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/adding_problem/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def data_generator(n, seq_length):
    """
    Generate a data set for the adding problem.

    Args:
        n: number of samples in the set
        seq_length: length of every sequence (at least 2, since two distinct
            positions are drawn per sample)

    Returns:
        (x, y) where x has shape (n, seq_length, 2): channel 0 holds uniform
        random values in [0, 1), channel 1 is a mask with exactly two ones.
        y has shape (n, 1) and holds the sum of the two masked values.
    """
    values = np.random.uniform(0, 1, (n, 1, seq_length))
    markers = np.zeros([n, 1, seq_length])
    targets = np.zeros([n, 1])
    for row in range(n):
        # two distinct positions per sample
        first, second = np.random.choice(seq_length, size=2, replace=False)
        markers[row, 0, first] = 1
        markers[row, 0, second] = 1
        targets[row, 0] = values[row, 0, first] + values[row, 0, second]
    # stack values and mask as channels, then move channels to the last axis
    stacked = np.concatenate((values, markers), axis=1)
    return np.transpose(stacked, (0, 2, 1)), targets
21 |
22 |
23 | if __name__ == '__main__':
24 | print(data_generator(n=20, seq_length=10))
25 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/copy_memory/README.md:
--------------------------------------------------------------------------------
1 | ## Copying Memory Task
2 |
3 | ### Overview
4 |
5 | In this task, each input sequence has length T+20. The first 10 values are chosen randomly
6 | among the digits 1-8, with the rest being all zeros, except for the last 11 entries that are
7 | filled with the digit ‘9’ (the first ‘9’ is a delimiter). The goal is to generate an output
8 | of same length that is zero everywhere, except the last 10 values after the delimiter, where
9 | the model is expected to repeat the 10 values it encountered at the start of the input.
10 |
11 | ### Data Generation
12 |
13 | See `data_generator` in `utils.py`.
14 |
15 | ### Note
16 |
17 | - Because a TCN's receptive field depends on depth of the network and the filter size, we need
18 | to make sure the model we use can cover the sequence length T+20.
19 |
20 | - Using the `--seq_len` flag, one can change the # of values to recall (the typical setup is 10).
21 |
22 | From: https://github.com/locuslab/TCN/
--------------------------------------------------------------------------------
/neural_net/keras_tcn/copy_memory/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/copy_memory/__init__.py
--------------------------------------------------------------------------------
/neural_net/keras_tcn/copy_memory/main.py:
--------------------------------------------------------------------------------
1 | import keras
2 |
3 | from utils import data_generator
4 | from tcn import tcn
5 |
6 | x_train, y_train = data_generator(601, 10, 10000)
7 | x_test, y_test = data_generator(601, 10, 2000)
8 |
9 |
class PrintSomeValues(keras.callbacks.Callback):
    """Keras callback that prints one test sample, its target and the model's current prediction."""

    def on_epoch_begin(self, epoch, logs=None):
        """Print the first test sequence, its target and the predicted classes.

        :param epoch: index of the epoch that is about to start (not used here)
        :param logs: metrics dict supplied by Keras (not used here); defaults to
                     None rather than a shared mutable `{}`
        """
        # x_test / y_test are the module-level arrays created when this script is loaded.
        print(f'x_test[0:1] = {x_test[0:1].flatten()}.')
        print(f'y_test[0:1] = {y_test[0:1].flatten()}.')
        # Run the forward pass once and reuse it for both printouts.
        prediction = self.model.predict(x_test[0:1])
        print(f'p.shape = {prediction.shape}.')
        print(f'p(x_test[0:1]) = {prediction.argmax(axis=2).flatten()}.')
17 |
18 |
def run_task():
    """Build a many-to-many classification TCN for the copy memory task and train it.

    Uses the module-level x_train / y_train / x_test / y_test arrays. The model
    is trained for 100 epochs with a batch size of 128, printing a sample
    prediction at the start of every epoch.
    """
    # Show the first training sequence and its target as flat lists.
    print(sum(x_train[0].tolist(), []))
    print(sum(y_train[0].tolist(), []))

    model, param_str = tcn.dilated_tcn(num_feat=1,
                                       num_classes=10,
                                       nb_filters=10,
                                       kernel_size=8,
                                       dilatations=[1, 2, 4, 8],
                                       nb_stacks=8,
                                       max_len=x_train[0:1].shape[1],
                                       # `dropout` must be passed explicitly: omitting it makes
                                       # the call fail with a TypeError when the builder declares
                                       # it without a default. 0.0 disables dropout.
                                       dropout=0.0,
                                       activation='norm_relu',
                                       use_skip_connections=False,
                                       return_param_str=True)

    print(f'x_train.shape = {x_train.shape}')
    print(f'y_train.shape = {y_train.shape}')

    psv = PrintSomeValues()

    # Using sparse softmax.
    # http://chappers.github.io/web%20micro%20log/2017/01/26/quick-models-in-keras/
    model.summary()

    model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100,
              callbacks=[psv], batch_size=128)
45 |
46 |
47 | if __name__ == '__main__':
48 | run_task()
49 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/copy_memory/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def data_generator(t, mem_length, b_size):
    """
    Generate data for the copying memory task.

    The input is laid out as: the sequence to memorise (values 1-8), then
    t - 1 zeros, then mem_length + 1 nines. The target is zero everywhere
    except its last mem_length entries, which repeat the memorised sequence.

    :param t: The total blank time length
    :param mem_length: The length of the memory to be recalled
    :param b_size: The batch size
    :return: Input and target integer arrays, each with a trailing axis of size 1
    """
    memory = np.random.randint(1, 9, size=(b_size, mem_length)).astype(float)
    blank = np.zeros((b_size, t))
    delimiter = 9 * np.ones((b_size, mem_length + 1))
    padding = np.zeros((b_size, mem_length))

    # the input drops one blank step so that input and target have equal length
    inputs = np.hstack((memory, blank[:, :-1], delimiter)).astype(int)
    targets = np.hstack((padding, blank, memory)).astype(int)
    return inputs[:, :, np.newaxis], targets[:, :, np.newaxis]
20 |
21 |
22 | if __name__ == '__main__':
23 | print(data_generator(t=601, mem_length=10, b_size=1)[0].flatten())
24 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/misc/Adding_Task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/misc/Adding_Task.png
--------------------------------------------------------------------------------
/neural_net/keras_tcn/misc/Copy_Memory_Task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/misc/Copy_Memory_Task.png
--------------------------------------------------------------------------------
/neural_net/keras_tcn/misc/Dilated_Conv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/misc/Dilated_Conv.png
--------------------------------------------------------------------------------
/neural_net/keras_tcn/misc/Sequential_MNIST_Task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/misc/Sequential_MNIST_Task.png
--------------------------------------------------------------------------------
/neural_net/keras_tcn/mnist_pixel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/keras_tcn/mnist_pixel/__init__.py
--------------------------------------------------------------------------------
/neural_net/keras_tcn/mnist_pixel/main.py:
--------------------------------------------------------------------------------
1 | import keras.backend as K
2 |
3 | from utils import data_generator
4 | from tcn import tcn
5 |
6 |
def get_activations(model, model_inputs, print_shape_only=False, layer_name=None):
    """Evaluate and print the outputs of the model's layers for the given inputs.

    :param model: Keras model to inspect
    :param model_inputs: input data; a sequence with one entry per input when the
                         model has several inputs, otherwise a single array
    :param print_shape_only: print only the shape of each activation instead of its values
    :param layer_name: restrict the evaluation to layers with this name; None selects every layer
    :return: list with one activation array per selected layer
    """
    print('----- activations -----')

    model_input = model.input
    has_multiple_inputs = isinstance(model_input, list)
    # a single input tensor is wrapped so it can be concatenated below
    input_tensors = model_input if has_multiple_inputs else [model_input]

    selected_outputs = [layer.output for layer in model.layers
                        if layer_name is None or layer.name == layer_name]

    # one evaluation function per selected layer output
    evaluators = [K.function(input_tensors + [K.learning_phase()], [output])
                  for output in selected_outputs]

    # The trailing 0. is the learning phase: 0 = test mode.
    if has_multiple_inputs:
        feed = list(model_inputs) + [0.]
    else:
        feed = [model_inputs, 0.]

    layer_outputs = [evaluate(feed)[0] for evaluate in evaluators]

    activations = []
    for layer_output in layer_outputs:
        activations.append(layer_output)
        print(layer_output.shape if print_shape_only else layer_output)
    return activations
40 | # np.sum(activations[15].squeeze(), axis=1)
41 |
42 |
def run_task():
    """Build a many-to-one classification TCN for sequential MNIST and train it.

    Loads the data through `data_generator`, converts the one-hot labels back to
    class indices for training, and trains for 100 epochs.
    """
    (x_train, y_train), (x_test, y_test) = data_generator()

    model, param_str = tcn.dilated_tcn(output_slice_index='last',  # try 'first'.
                                       num_feat=1,
                                       num_classes=10,
                                       nb_filters=64,
                                       kernel_size=8,
                                       dilatations=[1, 2, 4, 8],
                                       nb_stacks=8,
                                       max_len=x_train[0:1].shape[1],
                                       # `dropout` must be passed explicitly: omitting it makes
                                       # the call fail with a TypeError when the builder declares
                                       # it without a default. 0.0 disables dropout.
                                       dropout=0.0,
                                       activation='norm_relu',
                                       use_skip_connections=False,
                                       return_param_str=True)

    print(f'x_train.shape = {x_train.shape}')
    print(f'y_train.shape = {y_train.shape}')
    print(f'x_test.shape = {x_test.shape}')
    print(f'y_test.shape = {y_test.shape}')

    model.summary()

    # a = np.zeros_like(x_train[0:1])
    # a[:, 0, :] = 1.0
    # print(get_activations(model, a))

    # Labels come back one-hot with a trailing axis; reduce them to class indices.
    model.fit(x_train, y_train.squeeze().argmax(axis=1), epochs=100,
              validation_data=(x_test, y_test.squeeze().argmax(axis=1)))
71 |
72 |
73 | if __name__ == '__main__':
74 | run_task()
75 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/mnist_pixel/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from keras.datasets import mnist
3 | from keras.utils import to_categorical
4 |
5 |
def data_generator():
    """Load MNIST as pixel sequences.

    Each 28x28 image is flattened to a sequence of 784 single-feature steps,
    cast to float32 and divided by 255. Labels are one-hot encoded over 10
    classes and given a trailing axis of size 1.

    :return: ((x_train, y_train), (x_test, y_test))
    """
    n_pixels = 28 * 28  # input image dimensions
    n_classes = 10

    (train_images, train_digits), (test_images, test_digits) = mnist.load_data()

    def _prepare(images, digits):
        # flatten to (samples, 784, 1), then scale pixel values
        pixels = images.reshape(-1, n_pixels, 1).astype('float32') / 255
        one_hot = np.expand_dims(to_categorical(digits, n_classes), axis=2)
        return pixels, one_hot

    return _prepare(train_images, train_digits), _prepare(test_images, test_digits)
26 |
27 |
28 | if __name__ == '__main__':
29 | print(data_generator())
30 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
# Package metadata for installing the `tcn` package.
setup(
    name='keras-tcn',
    version='1.4.0',
    description='Keras TCN',
    author='Philippe Remy',
    license='MIT',
    # Only the `tcn` package is installed; the example task folders are not.
    packages=['tcn'],
    # NOTE(review): tcn/tcn.py imports `keras` directly, but keras is not declared
    # here -- confirm it is provided another way (e.g. requirements.txt).
    # 'tensorflow-gpu' is hard-coded; the README suggests switching to
    # 'tensorflow' on machines without a GPU.
    install_requires=['tensorflow-gpu', 'numpy']
)
12 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/tcn/__init__.py:
--------------------------------------------------------------------------------
1 | from neural_net.keras_tcn.tcn import tcn
2 |
--------------------------------------------------------------------------------
/neural_net/keras_tcn/tcn/tcn.py:
--------------------------------------------------------------------------------
1 | import keras.backend as K
2 | from keras import optimizers
3 | from keras.layers import Conv1D, SpatialDropout1D
4 | from keras.layers import Activation, Lambda
5 | from keras.layers import Convolution1D, Dense
6 | from keras.models import Input, Model
7 | import keras.layers
8 |
9 |
def channel_normalization(x):
    """Divide x by its largest absolute value along axis 2.

    A small constant (1e-5) is added to the maximum so the division never
    hits zero.
    """
    return x / (K.max(K.abs(x), 2, keepdims=True) + 1e-5)
15 |
16 |
def wave_net_activation(x):
    """Gated activation: element-wise product of tanh(x) and sigmoid(x)."""
    filter_branch = Activation('tanh')(x)
    gate_branch = Activation('sigmoid')(x)
    gated = keras.layers.multiply([filter_branch, gate_branch])
    return gated
21 |
22 |
def residual_block(x, s, i, activation, nb_filters, kernel_size, dropout):
    """Build one residual block: dilated causal conv, activation, dropout, 1x1 conv.

    :param x: input tensor of the block
    :param s: stack index, only used in the convolution layer's name
    :param i: dilation exponent; the convolution uses a dilation rate of 2 ** i
    :param activation: 'norm_relu', 'wavenet', or any activation accepted by
                       keras `Activation`
    :param nb_filters: number of filters of both convolutions
    :param kernel_size: kernel size of the dilated convolution
    :param dropout: rate passed to SpatialDropout1D
    :return: (residual output, i.e. input + block output; block output alone,
             usable as a skip connection)

    NOTE(review): `dilated_tcn` passes the entries of `dilatations` straight
    through as `i`, so a list such as [1, 2, 4, 8] yields rates 2, 4, 16, 256.
    Confirm this is intended before changing it; the rate is also embedded in
    the layer name.
    """
    dilation = 2 ** i
    conv_out = Conv1D(filters=nb_filters, kernel_size=kernel_size,
                      dilation_rate=dilation, padding='causal',
                      name='dilated_conv_%d_tanh_s%d' % (dilation, s))(x)

    if activation == 'wavenet':
        activated = wave_net_activation(conv_out)
    elif activation == 'norm_relu':
        rectified = Activation('relu')(conv_out)
        activated = Lambda(channel_normalization)(rectified)
    else:
        activated = Activation(activation)(conv_out)

    dropped = SpatialDropout1D(dropout)(activated)

    # 1x1 conv.
    skip_out = Convolution1D(nb_filters, 1, padding='same')(dropped)
    return keras.layers.add([x, skip_out]), skip_out
42 |
43 |
def _compile_tcn_model(input_layer, output_layer, loss, metrics=None):
    """Wrap the graph in a Model and compile it with Adam (lr=0.002, clipnorm=1)."""
    print(f'model.x = {input_layer.shape}')
    print(f'model.y = {output_layer.shape}')
    model = Model(input_layer, output_layer)
    adam = optimizers.Adam(lr=0.002, clipnorm=1.)
    model.compile(adam, loss=loss, metrics=metrics)
    return model


def dilated_tcn(num_feat, num_classes, nb_filters,
                kernel_size, dilatations, nb_stacks, max_len, dropout=0.0,
                activation='wavenet', use_skip_connections=True,
                return_param_str=False, output_slice_index=None,
                regression=False):
    """
    Build and compile a dilated temporal convolutional network.

    :param num_feat: number of features per timestep of the input
    :param num_classes: 2 gives a single sigmoid unit trained with binary
                        cross-entropy; more than 2 gives a softmax trained with
                        sparse categorical cross-entropy; ignored when
                        `regression` is True
    :param nb_filters: number of filters of every convolution
    :param kernel_size: kernel size of the initial and the dilated convolutions
    :param dilatations: iterable of values; one residual block is built per
                        value in every stack, and the value is handed to
                        `residual_block` as its `i` argument
    :param nb_stacks: number of stacks.
    :param max_len: number of timesteps of the input
    :param dropout: rate passed to SpatialDropout1D in every residual block.
                    Defaults to 0.0 (no dropout) so that callers which do not
                    pass it keep working.
    :param activation: 'norm_relu', 'wavenet', or any Keras activation name
    :param use_skip_connections: if True, the sum of all block outputs replaces
                                 the output of the last residual block
    :param return_param_str: if True, also return a string describing the model
    :param output_slice_index: None keeps every timestep; 'last', 'first' or an
                               integer index keeps a single timestep
    :param regression: if True, build a single linear output unit trained with
                       mean squared error
    :return: the compiled model, or (model, param_str) when `return_param_str` is True
    :raises ValueError: if `regression` is False and `num_classes` is not an
                        integer >= 2
    """
    input_layer = Input(name='input_layer', shape=(max_len, num_feat))
    x = input_layer
    x = Convolution1D(nb_filters, kernel_size, padding='causal', name='initial_conv')(x)

    skip_connections = []
    for s in range(nb_stacks):
        for i in dilatations:
            x, skip_out = residual_block(x, s, i, activation, nb_filters, kernel_size, dropout)
            skip_connections.append(skip_out)

    if use_skip_connections:
        x = keras.layers.add(skip_connections)
    x = Activation('relu')(x)

    if output_slice_index is not None:  # can test with 0 or -1.
        if output_slice_index == 'last':
            output_slice_index = -1
        if output_slice_index == 'first':
            output_slice_index = 0
        x = Lambda(lambda tt: tt[:, output_slice_index, :])(x)

    print('x.shape=', x.shape)

    if regression:
        x = Dense(1)(x)
        output_layer = Activation('linear', name='output_dense')(x)
        model = _compile_tcn_model(input_layer, output_layer, loss='mean_squared_error')
    elif num_classes == 2:
        x = Dense(1)(x)
        output_layer = Activation('sigmoid', name='output_sigmoid')(x)
        model = _compile_tcn_model(input_layer, output_layer,
                                   loss='binary_crossentropy', metrics=['accuracy'])
        print('Adam with norm clipping.')
    elif num_classes is not None and num_classes > 2:
        # classification
        x = Dense(num_classes)(x)
        output_layer = Activation('softmax', name='output_softmax')(x)
        model = _compile_tcn_model(input_layer, output_layer,
                                   loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        print('Adam with norm clipping.')
    else:
        # Also covers num_classes=None, which would otherwise fail on `None > 2`.
        raise ValueError('num_classes must be an integer >= 2 for classification, '
                         'got {!r}'.format(num_classes))

    if return_param_str:
        param_str = 'D-TCN_C{}_B{}_L{}'.format(2, nb_stacks, dilatations)
        return model, param_str
    else:
        return model
117 |
--------------------------------------------------------------------------------
/neural_net/model/segmentation/jan_joint0.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/model/segmentation/jan_joint0.h5
--------------------------------------------------------------------------------
/neural_net/normal_pronunciation.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions related to normal pronunciation manipulation
3 | """
4 | import os
5 | import json
6 | from neural_net.file_path import path_root
7 | from neural_net.file_path import recordings_train
8 | from neural_net.file_path import parse_recordings
9 | from neural_net.utils.textgrid_preprocessing import parse_syllable_line_list
10 |
11 |
if __name__ == "__main__":
    # Collect, over all training recordings, the normal-pronunciation syllables
    # whose special class is "1" (shangkou) or "2" (jiantuan), and dump each
    # de-duplicated collection to a JSON file.

    list_normal_special = []  # the normal counterpart of the special pronunciation
    list_normal_jianzi = []  # the normal counterpart of jianzi

    for rec in recordings_train:
        (data_path, sub_folder, textgrid_folder, wav_folder, filename,
         line_tier, longsyllable_tier, syllable_tier, phoneme_tier,
         special_tier, special_class_tier, roletype) = parse_recordings(rec)

        textgrid_filename = os.path.join(path_root, data_path, textgrid_folder,
                                         sub_folder, filename + ".textgrid")

        print("Parse textgrid file {}".format(textgrid_filename))

        # Parse the four child tiers against the long-syllable tier, in the
        # order: syllable, special, special class, phoneme.
        parsed_tiers = []
        for child in (syllable_tier, special_tier, special_class_tier, phoneme_tier):
            nested_list, is_file_exist, is_tier_found = \
                parse_syllable_line_list(ground_truth_text_grid_file=textgrid_filename,
                                         parent_tier=longsyllable_tier,
                                         child_tier=child)
            parsed_tiers.append(nested_list)
        # the phoneme tier is parsed but its content is not used below
        nested_syllables, nested_specials, nested_special_classes, nested_phonemes = parsed_tiers

        for ii_line, line_special in enumerate(nested_specials):
            # lines whose parent entry is labelled "1" are skipped
            if line_special[0][2] == "1":
                continue

            line_syllable = nested_syllables[ii_line]
            line_special_class = nested_special_classes[ii_line]

            for ii_syl, class_entry in enumerate(line_special_class[1]):
                special_class = class_entry[2]
                try:
                    syllable = line_syllable[1][ii_syl][2]
                except IndexError:
                    # report which recording / line has mismatching tiers
                    raise IndexError(rec, ii_line)

                if special_class == "1":  # shangkou
                    shangkou = line_special[1][ii_syl][2]
                    list_normal_special.append(syllable)
                if special_class == "2":  # jiantuan
                    jiantuan = line_special[1][ii_syl][2]
                    list_normal_jianzi.append(syllable)

    # drop duplicates
    list_normal_special = list(set(list_normal_special))
    list_normal_jianzi = list(set(list_normal_jianzi))

    for json_path, syllables in (("./data/normal_special.json", list_normal_special),
                                 ("./data/normal_jianzi.json", list_normal_jianzi)):
        with open(json_path, "w") as f:
            json.dump(syllables, f)
--------------------------------------------------------------------------------
/neural_net/onsetSegmentEval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ronggong/mispronunciation-detection/bed6f39e7e90a76a87332db425e14363b477ccb4/neural_net/onsetSegmentEval/__init__.py
--------------------------------------------------------------------------------
/neural_net/onsetSegmentEval/evaluation.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Syllable segmentation evaluation: landmark and boundary evaluations
5 | Only evaluate boundary onset
6 |
7 | [1] A new hybrid approach for automatic speech signal segmentation
8 | using silence signal detection, energy convex hull, and spectral variation
9 |
10 | [2] Syll-O-Matic: An adaptive time-frequency representation
11 | for the automatic segmentation of speech into syllables
12 |
13 | [3] EVALUATION FRAMEWORK FOR AUTOMATIC SINGING
14 | TRANSCRIPTION
15 | """
16 |
17 | from neural_net.onsetSegmentEval.phonemeMap import misMatchIgnorePhn
18 | from neural_net.onsetSegmentEval.phonemeMap import misMatchIgnoreSyl
19 | from neural_net.parameters import hopsize_t
20 | import numpy as np
21 |
22 |
23 | def onsetEval(groundtruthOnsets, detectedOnsets, tolerance, label):
24 | """
25 | :param groundtruthOnsets: [[onset time, onset label], ...]
26 | :param detectedOnsets: [[onset time, onset label], ...]
27 | :param tolerance: 0.025 or 0.05
28 | :param label: True or False, if we want to evaluate the label
29 | :return:
30 | """
31 |
32 | numDetectedOnsets = len(detectedOnsets)
33 | numGroundtruthOnsets = len(groundtruthOnsets)
34 |
35 | onsetCorrectlist = [0]*numDetectedOnsets
36 |
37 | for gtb in groundtruthOnsets:
38 | for idx, db in enumerate(detectedOnsets):
39 | onsetTh = tolerance # onset threshold
40 |
41 | if abs(db[0]-gtb[0])= len(list_feature):
21 | ii = 0
22 | if shuffle:
23 | p = np.random.permutation(len(list_feature))
24 | list_feature = [list_feature[ii_p] for ii_p in p]
25 | labels = labels[p] # labels is a numpy array
--------------------------------------------------------------------------------
/neural_net/training_scripts/hpc_code/train_run_jianzi.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import pickle
4 | import numpy as np
5 | from sklearn.model_selection import StratifiedKFold
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.preprocessing import StandardScaler
8 |
9 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10 |
11 | from neural_net.training_scripts.models_RNN import train_RNN_batch
12 | from neural_net.training_scripts.models_RNN import eval_RNN_model
13 | from neural_net.combine_feature_label import combine_feature_label
14 | from neural_net.file_path import *
15 |
16 |
if __name__ == '__main__':

    # Run mode: "cv" = 5-fold cross-validation, "prod" = one train/validation split.
    cv_prod = "prod"
    batch_size = 1
    input_shape = (batch_size, None, 80)
    patience = 15
    attention = "feedforward"
    conv = True
    dropout = 0.5
    epoch = 500

    # NOTE(review): absolute, machine-specific output directory.
    path_model = '/Users/ronggong/PycharmProjects/mispronunciation-detection/neural_net/model/'

    # dict_jianzi_positive / dict_jianzi_negative are presumably supplied by the
    # wildcard import from neural_net.file_path -- confirm.
    with open(dict_jianzi_positive, "rb") as f:
        feature_jianzi_pos = pickle.load(f)

    with open(dict_jianzi_negative, "rb") as f:
        feature_jianzi_neg = pickle.load(f)

    X_jianzi, y_jianzi = combine_feature_label(dict_positive=feature_jianzi_pos,
                                               dict_negative=feature_jianzi_neg)

    if cv_prod == "cv":
        list_loss = []
        list_acc = []  # never filled in this script
        skf = StratifiedKFold(n_splits=5)
        for ii, (train_index, val_index) in enumerate(skf.split(X_jianzi, y_jianzi)):

            # One model file and one CSV training log per fold.
            model_name = 'jianzi_model_{}_{}_{}'.format(attention, conv, dropout)
            file_path_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
            file_path_log = os.path.join(path_model, 'log', model_name + '_' + str(ii) + '.csv')

            print("TRAIN:", train_index, "TEST:", val_index)

            X_train, X_test = [X_jianzi[ii] for ii in train_index], [X_jianzi[ii] for ii in val_index]
            y_train, y_test = y_jianzi[train_index], y_jianzi[val_index]

            # Hold out 10% of the fold's training part for early stopping.
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.1)

            # standardization: the scaler is fitted on the concatenated training sequences only
            scaler = StandardScaler()
            X_train_conc = np.concatenate(X_train)
            scaler.fit(X_train_conc)

            model = train_RNN_batch(list_feature_fold_train=X_train,
                                    labels_fold_train=y_train,
                                    list_feature_fold_val=X_val,
                                    labels_fold_val=y_val,
                                    batch_size=batch_size,
                                    input_shape=input_shape,
                                    output_shape=1,
                                    file_path_model=file_path_model,
                                    filename_log=file_path_log,
                                    epoch=epoch,
                                    patience=patience,
                                    scaler=scaler,
                                    attention=attention,
                                    conv=conv,
                                    dropout=dropout,
                                    summ=True,
                                    verbose=2)

            # Evaluation reloads the model from file_path_model rather than using `model`.
            loss_test = eval_RNN_model(list_feature_test=X_test,
                                       labels_test=y_test,
                                       file_path_model=file_path_model,
                                       attention=attention,
                                       scaler=scaler)

            list_loss.append(loss_test)

        # Write the mean test loss over the folds.
        with open(os.path.join(path_model, 'log', 'jianzi_esults_{}_{}_{}.txt'.format(attention, conv, dropout)), 'w') as f:
            f.write("attention {} conv {} dropout {} loss {}".format(attention, conv, dropout, np.mean(list_loss)))

    elif cv_prod == "prod":
        # Single stratified split: 90% training, 10% validation, no test set.
        X_train, X_val, y_train, y_val = train_test_split(X_jianzi, y_jianzi, stratify=y_jianzi, test_size=0.1)

        model_name = 'jianzi_model_prod_{}_{}_{}'.format(attention, conv, dropout)
        file_path_model = os.path.join(path_model, model_name + '.h5')
        file_path_log = os.path.join(path_model, 'log', model_name + '.csv')

        # standardization: the scaler is fitted on the concatenated training sequences only
        scaler = StandardScaler()
        X_train_conc = np.concatenate(X_train)
        scaler.fit(X_train_conc)

        train_RNN_batch(list_feature_fold_train=X_train,
                        labels_fold_train=y_train,
                        list_feature_fold_val=X_val,
                        labels_fold_val=y_val,
                        batch_size=batch_size,
                        input_shape=input_shape,
                        output_shape=1,
                        file_path_model=file_path_model,
                        filename_log=file_path_log,
                        epoch=epoch,
                        patience=patience,
                        scaler=scaler,
                        attention=attention,
                        conv=conv,
                        dropout=dropout,
                        summ=True,
                        verbose=2)
    else:
        raise ValueError("{} is not a valid option.".format(cv_prod))
121 |
--------------------------------------------------------------------------------
/neural_net/training_scripts/hpc_code/train_run_jianzi_tcn.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import pickle
4 | import numpy as np
5 | from sklearn.model_selection import StratifiedKFold
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.preprocessing import StandardScaler
8 |
9 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10 |
11 | from neural_net.training_scripts.models_TCN import train_TCN_batch
12 | from neural_net.training_scripts.models_RNN import eval_RNN_model
13 | from neural_net.combine_feature_label import combine_feature_label
14 | from neural_net.file_path import *
15 |
16 |
if __name__ == '__main__':

    # Run mode: "cv" = 5-fold cross-validation, "prod" = one train/validation split.
    cv_prod = "cv"
    batch_size = 1
    input_shape = (batch_size, None, 80)
    patience = 15
    # Only forwarded to eval_RNN_model; train_TCN_batch takes no attention argument.
    attention = False
    dropout = 0.05
    epoch = 500

    # NOTE(review): absolute, machine-specific output directory.
    path_model = '/Users/ronggong/PycharmProjects/mispronunciation-detection/neural_net/model/'

    # dict_jianzi_positive / dict_jianzi_negative are presumably supplied by the
    # wildcard import from neural_net.file_path -- confirm.
    with open(dict_jianzi_positive, "rb") as f:
        feature_jianzi_pos = pickle.load(f)

    with open(dict_jianzi_negative, "rb") as f:
        feature_jianzi_neg = pickle.load(f)

    X_jianzi, y_jianzi = combine_feature_label(dict_positive=feature_jianzi_pos,
                                               dict_negative=feature_jianzi_neg)

    if cv_prod == "cv":
        list_loss = []
        list_acc = []  # never filled in this script
        skf = StratifiedKFold(n_splits=5)
        for ii, (train_index, val_index) in enumerate(skf.split(X_jianzi, y_jianzi)):

            # One model file and one CSV training log per fold.
            model_name = 'jianzi_model_tcn_1_stack_3_{}'.format(dropout)
            file_path_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
            file_path_log = os.path.join(path_model, 'log', model_name + '_' + str(ii) + '.csv')

            print("TRAIN:", train_index, "TEST:", val_index)

            X_train, X_test = [X_jianzi[ii] for ii in train_index], [X_jianzi[ii] for ii in val_index]
            y_train, y_test = y_jianzi[train_index], y_jianzi[val_index]

            # Hold out 10% of the fold's training part for early stopping.
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.1)

            # standardization: the scaler is fitted on the concatenated training sequences only
            scaler = StandardScaler()
            X_train_conc = np.concatenate(X_train)
            scaler.fit(X_train_conc)

            model = train_TCN_batch(list_feature_fold_train=X_train,
                                    labels_fold_train=y_train,
                                    list_feature_fold_val=X_val,
                                    labels_fold_val=y_val,
                                    batch_size=batch_size,
                                    input_shape=input_shape,
                                    file_path_model=file_path_model,
                                    filename_log=file_path_log,
                                    epoch=epoch,
                                    patience=patience,
                                    scaler=scaler,
                                    dropout=dropout,
                                    summ=True,
                                    verbose=2)

            # Evaluation reloads the model from file_path_model rather than using `model`.
            loss_test = eval_RNN_model(list_feature_test=X_test,
                                       labels_test=y_test,
                                       file_path_model=file_path_model,
                                       attention=attention,
                                       scaler=scaler)

            list_loss.append(loss_test)

        # Write the mean test loss over the folds.
        with open(os.path.join(path_model, 'log', 'jianzi_esults_tcn_1_stack_3_{}.txt'.format(dropout)), 'w') as f:
            f.write("loss {}".format(np.mean(list_loss)))

    elif cv_prod == "prod":
        # Single stratified split: 90% training, 10% validation, no test set.
        X_train, X_val, y_train, y_val = train_test_split(X_jianzi, y_jianzi, stratify=y_jianzi, test_size=0.1)

        model_name = 'jianzi_model_prod_tcn_{}'.format(dropout)
        file_path_model = os.path.join(path_model, model_name + '.h5')
        file_path_log = os.path.join(path_model, 'log', model_name + '.csv')

        # standardization: the scaler is fitted on the concatenated training sequences only
        scaler = StandardScaler()
        X_train_conc = np.concatenate(X_train)
        scaler.fit(X_train_conc)

        train_TCN_batch(list_feature_fold_train=X_train,
                        labels_fold_train=y_train,
                        list_feature_fold_val=X_val,
                        labels_fold_val=y_val,
                        batch_size=batch_size,
                        input_shape=input_shape,
                        file_path_model=file_path_model,
                        filename_log=file_path_log,
                        epoch=epoch,
                        patience=patience,
                        scaler=scaler,
                        dropout=dropout,
                        summ=True,
                        verbose=2)
    else:
        raise ValueError("{} is not a valid option.".format(cv_prod))
114 |
--------------------------------------------------------------------------------
/neural_net/training_scripts/hpc_code/train_run_special.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import pickle
4 | import numpy as np
5 | from sklearn.model_selection import StratifiedKFold
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.preprocessing import StandardScaler
8 |
9 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10 |
11 | from neural_net.training_scripts.models_RNN import train_RNN_batch
12 | from neural_net.training_scripts.models_RNN import eval_RNN_model
13 | from neural_net.combine_feature_label import combine_feature_label
14 | from neural_net.file_path import *
15 |
16 |
if __name__ == '__main__':

    # Run mode: "cv" = 5-fold cross-validation, "prod" = one train/validation split.
    cv_prod = "cv"
    batch_size = 1
    input_shape = (batch_size, None, 80)
    patience = 15
    attention = "selfatt"
    conv = True
    dropout = 0.5
    epoch = 500

    # NOTE(review): absolute, machine-specific output directory.
    path_model = '/Users/ronggong/PycharmProjects/mispronunciation-detection/neural_net/model/'

    # dict_special_positive / dict_special_negative are presumably supplied by the
    # wildcard import from neural_net.file_path -- confirm.
    with open(dict_special_positive, "rb") as f:
        feature_special_pos = pickle.load(f)

    with open(dict_special_negative, "rb") as f:
        feature_special_neg = pickle.load(f)

    X_special, y_special = combine_feature_label(dict_positive=feature_special_pos,
                                                 dict_negative=feature_special_neg)

    if cv_prod == "cv":
        list_loss = []
        list_acc = []  # never filled in this script
        skf = StratifiedKFold(n_splits=5)
        for ii, (train_index, val_index) in enumerate(skf.split(X_special, y_special)):

            # One model file and one CSV training log per fold.
            model_name = 'special_model_{}_{}_{}'.format(attention, conv, dropout)
            file_path_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
            file_path_log = os.path.join(path_model, 'log', model_name + '_' + str(ii) + '.csv')

            print("TRAIN:", train_index, "TEST:", val_index)

            X_train, X_test = [X_special[ii] for ii in train_index], [X_special[ii] for ii in val_index]
            y_train, y_test = y_special[train_index], y_special[val_index]

            # Hold out 10% of the fold's training part for early stopping.
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.1)

            # standardization: the scaler is fitted on the concatenated training sequences only
            scaler = StandardScaler()
            X_train_conc = np.concatenate(X_train)
            scaler.fit(X_train_conc)

            model = train_RNN_batch(list_feature_fold_train=X_train,
                                    labels_fold_train=y_train,
                                    list_feature_fold_val=X_val,
                                    labels_fold_val=y_val,
                                    batch_size=batch_size,
                                    input_shape=input_shape,
                                    output_shape=1,
                                    file_path_model=file_path_model,
                                    filename_log=file_path_log,
                                    epoch=epoch,
                                    patience=patience,
                                    scaler=scaler,
                                    attention=attention,
                                    conv=conv,
                                    dropout=dropout,
                                    summ=True,
                                    verbose=2)

            # Evaluation reloads the model from file_path_model rather than using `model`.
            loss_test = eval_RNN_model(list_feature_test=X_test,
                                       labels_test=y_test,
                                       file_path_model=file_path_model,
                                       attention=attention,
                                       scaler=scaler)

            list_loss.append(loss_test)

        # Write the mean test loss over the folds.
        with open(os.path.join(path_model, 'log', 'special_results_{}_{}_{}.txt'.format(attention, conv, dropout)), 'w') as f:
            f.write("attention {} conv {} dropout {} loss {}".format(attention, conv, dropout, np.mean(list_loss)))

    elif cv_prod == "prod":
        # Single stratified split: 90% training, 10% validation, no test set.
        X_train, X_val, y_train, y_val = train_test_split(X_special, y_special, stratify=y_special, test_size=0.1)

        model_name = 'special_model_prod_{}_{}_{}'.format(attention, conv, dropout)
        file_path_model = os.path.join(path_model, model_name + '.h5')
        file_path_log = os.path.join(path_model, 'log', model_name + '.csv')

        # standardization: the scaler is fitted on the concatenated training sequences only
        scaler = StandardScaler()
        X_train_conc = np.concatenate(X_train)
        scaler.fit(X_train_conc)

        train_RNN_batch(list_feature_fold_train=X_train,
                        labels_fold_train=y_train,
                        list_feature_fold_val=X_val,
                        labels_fold_val=y_val,
                        batch_size=batch_size,
                        input_shape=input_shape,
                        output_shape=1,
                        file_path_model=file_path_model,
                        filename_log=file_path_log,
                        epoch=epoch,
                        patience=patience,
                        scaler=scaler,
                        attention=attention,
                        conv=conv,
                        dropout=dropout,
                        summ=True,
                        verbose=2)
    else:
        raise ValueError("{} is not a valid option.".format(cv_prod))
121 |
--------------------------------------------------------------------------------
/neural_net/training_scripts/hpc_code/train_run_special_tcn.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import pickle
4 | import numpy as np
5 | from sklearn.model_selection import StratifiedKFold
6 | from sklearn.model_selection import train_test_split
7 | from sklearn.preprocessing import StandardScaler
8 |
9 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10 |
11 | from neural_net.training_scripts.models_TCN import train_TCN_batch
12 | from neural_net.training_scripts.models_RNN import eval_RNN_model
13 | from neural_net.combine_feature_label import combine_feature_label
14 | from neural_net.file_path import *
15 |
16 |
if __name__ == '__main__':

    # Run mode: "cv" = 5-fold cross-validation, "prod" = one train/validation split.
    cv_prod = "cv"
    batch_size = 1
    input_shape = (batch_size, None, 80)
    patience = 15
    # Only forwarded to eval_RNN_model; train_TCN_batch takes no attention argument.
    attention = False
    dropout = 0.05
    epoch = 500

    # NOTE(review): absolute, machine-specific output directory.
    path_model = '/Users/ronggong/PycharmProjects/mispronunciation-detection/neural_net/model/'

    # dict_special_positive / dict_special_negative are presumably supplied by the
    # wildcard import from neural_net.file_path -- confirm.
    with open(dict_special_positive, "rb") as f:
        feature_special_pos = pickle.load(f)

    with open(dict_special_negative, "rb") as f:
        feature_special_neg = pickle.load(f)

    X_special, y_special = combine_feature_label(dict_positive=feature_special_pos,
                                                 dict_negative=feature_special_neg)

    if cv_prod == "cv":
        list_loss = []
        list_acc = []  # never filled in this script
        skf = StratifiedKFold(n_splits=5)
        for ii, (train_index, val_index) in enumerate(skf.split(X_special, y_special)):

            # One model file and one CSV training log per fold.
            model_name = 'special_model_tcn_1_stack_3_{}'.format(dropout)
            file_path_model = os.path.join(path_model, model_name + '_' + str(ii) + '.h5')
            file_path_log = os.path.join(path_model, 'log', model_name + '_' + str(ii) + '.csv')

            print("TRAIN:", train_index, "TEST:", val_index)

            X_train, X_test = [X_special[ii] for ii in train_index], [X_special[ii] for ii in val_index]
            y_train, y_test = y_special[train_index], y_special[val_index]

            # Hold out 10% of the fold's training part for early stopping.
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.1)

            # standardization: the scaler is fitted on the concatenated training sequences only
            scaler = StandardScaler()
            X_train_conc = np.concatenate(X_train)
            scaler.fit(X_train_conc)

            model = train_TCN_batch(list_feature_fold_train=X_train,
                                    labels_fold_train=y_train,
                                    list_feature_fold_val=X_val,
                                    labels_fold_val=y_val,
                                    batch_size=batch_size,
                                    input_shape=input_shape,
                                    file_path_model=file_path_model,
                                    filename_log=file_path_log,
                                    epoch=epoch,
                                    patience=patience,
                                    scaler=scaler,
                                    dropout=dropout,
                                    summ=True,
                                    verbose=2)

            # Evaluation reloads the model from file_path_model rather than using `model`.
            loss_test = eval_RNN_model(list_feature_test=X_test,
                                       labels_test=y_test,
                                       file_path_model=file_path_model,
                                       attention=attention,
                                       scaler=scaler)

            list_loss.append(loss_test)

        # Write the mean test loss over the folds.
        with open(os.path.join(path_model, 'log', 'special_results_tcn_1_stack_3_{}.txt'.format(dropout)), 'w') as f:
            f.write("loss {}".format(np.mean(list_loss)))

    elif cv_prod == "prod":
        # Single stratified split: 90% training, 10% validation, no test set.
        X_train, X_val, y_train, y_val = train_test_split(X_special, y_special, stratify=y_special, test_size=0.1)

        model_name = 'special_model_prod_tcn_{}'.format(dropout)
        file_path_model = os.path.join(path_model, model_name + '.h5')
        file_path_log = os.path.join(path_model, 'log', model_name + '.csv')

        # standardization: the scaler is fitted on the concatenated training sequences only
        scaler = StandardScaler()
        X_train_conc = np.concatenate(X_train)
        scaler.fit(X_train_conc)

        train_TCN_batch(list_feature_fold_train=X_train,
                        labels_fold_train=y_train,
                        list_feature_fold_val=X_val,
                        labels_fold_val=y_val,
                        batch_size=batch_size,
                        input_shape=input_shape,
                        file_path_model=file_path_model,
                        filename_log=file_path_log,
                        epoch=epoch,
                        patience=patience,
                        scaler=scaler,
                        dropout=dropout,
                        summ=True,
                        verbose=2)
    else:
        raise ValueError("{} is not a valid option.".format(cv_prod))
114 |
--------------------------------------------------------------------------------
/neural_net/training_scripts/models_RNN.py:
--------------------------------------------------------------------------------
1 | from keras.models import Input
2 | from keras.models import Model
3 | from keras.models import load_model
4 | from keras.layers import Dropout
5 | from keras.layers import LSTM
6 | from keras.layers import CuDNNLSTM
7 | from keras.layers import Bidirectional
8 | from keras.layers import Dense
9 | from keras.layers import Conv1D
10 | from keras.layers import Conv2D
11 | from keras.layers import Dot
12 | from keras.layers import Lambda
13 | from keras.layers import MaxPooling2D
14 | from keras.layers import Reshape
15 | from keras import backend as K
16 | from keras.callbacks import EarlyStopping
17 | from keras.callbacks import CSVLogger
18 | from keras.callbacks import ModelCheckpoint
19 | from keras.activations import softmax
20 | from tensorflow.python.client import device_lib
21 |
22 | import os
23 | import sys
24 | import numpy as np
25 | from sklearn.metrics import log_loss
26 | from sklearn.metrics import accuracy_score
27 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__))))
28 | from neural_net.training_scripts.attention import Attention
29 | from neural_net.training_scripts.generator import generator_batch1
30 |
31 |
def conv_module(conv, input_shape, input):
    """
    Optional convolutional front end placed before the recurrent layer.

    :param conv: when falsy, `input` is returned untouched
    :param input_shape: batch shape; only index 2 (the feature size) is read
    :param input: input tensor of the network
    :return: `input` itself, or the tensor produced by two Conv2D + MaxPooling2D
             stages whose last two axes are merged back into one
    """
    if not conv:
        return input

    # Append a singleton channel axis so that Conv2D can be applied.
    feature_dim = input_shape[2]
    x = Reshape((-1, feature_dim, 1))(input)

    # Two identical stages; kernel and pooling sizes of (1, 3) act on the feature axis only.
    for n_filters in (8, 16):
        x = Conv2D(filters=n_filters, kernel_size=(1, 3), activation="relu")(x)
        x = MaxPooling2D(pool_size=(1, 3))(x)

    # Merge the pooled feature axis and the channel axis into a single axis.
    pooled_shape = K.int_shape(x)
    return Reshape((-1, pooled_shape[2] * pooled_shape[3]))(x)
45 |
46 |
def embedding_RNN_1_lstm(input_shape, conv=False, dropout=False, att=False):
    """
    Build the embedding part of the network: optional conv front end, one
    bidirectional LSTM layer and an optional attention pooling.

    :param input_shape: batch shape handed to Input(batch_shape=...)
    :param conv: forwarded to conv_module to switch the conv front end on/off
    :param dropout: falsy to disable; otherwise used both as the LSTM `dropout`
                    argument and as the rate of a following Dropout layer
                    (CPU branch only)
    :param att: falsy, "feedforward" or "selfatt"
    :return: (x, input, attention): the embedding tensor, the Input tensor and the
             attention tensor (None when neither attention branch is taken)
    """

    # Device type of the first entry reported by TensorFlow decides LSTM vs CuDNNLSTM.
    device = device_lib.list_local_devices()[0].device_type

    input = Input(batch_shape=input_shape)

    x = conv_module(conv, input_shape, input)

    # Any truthy `att` makes the recurrent layer return the whole sequence.
    if att:
        return_sequence = True
    else:
        return_sequence = False

    if device == 'CPU':
        if dropout:
            x = Bidirectional(LSTM(units=8, return_sequences=return_sequence, dropout=dropout))(x)
            x = Dropout(dropout)(x)
        else:
            x = Bidirectional(LSTM(units=8, return_sequences=return_sequence))(x)
    else:
        # NOTE(review): `dropout` is ignored in this branch.
        x = Bidirectional(CuDNNLSTM(units=8, return_sequences=return_sequence))(x)

    if att == "feedforward":
        print(K.shape(x))  # debugging output left in place
        x, attention = Attention(return_attention=True)(x)
    elif att == "selfatt":
        # Two kernel-size-1 convolutions produce the attention scores ...
        attention = Conv1D(filters=16, kernel_size=1, activation='tanh', padding='same', use_bias=True,
                           kernel_initializer='glorot_uniform', bias_initializer='zeros',
                           name="attention_layer1")(x)
        attention = Conv1D(filters=16, kernel_size=1, activation='linear', padding='same',
                           use_bias=True,
                           kernel_initializer='glorot_uniform', bias_initializer='zeros',
                           name="attention_layer2")(attention)
        # ... which are normalised with a softmax over axis 1.
        attention = Lambda(lambda x: softmax(x, axis=1), name="attention_vector")(attention)

        # Apply attention weights
        weighted_sequence_embedding = Dot(axes=[1, 1], normalize=False, name="weighted_sequence_embedding")(
            [attention, x])

        # Add and normalize to obtain final sequence embedding
        x = Lambda(lambda x: K.l2_normalize(K.sum(x, axis=1)))(weighted_sequence_embedding)
        # The returned "attention" is the weighted embedding, not the softmax output.
        attention = weighted_sequence_embedding
    else:
        # Also reached for a truthy `att` that is neither "feedforward" nor "selfatt".
        attention = None

    return x, input, attention
93 |
94 |
def RNN_model_definition(input_shape,
                         conv,
                         dropout,
                         attention,
                         output_shape):
    """
    Assemble the two-output model: sigmoid prediction plus attention tensor.

    :param input_shape: batch shape of the network input
    :param conv: switch for the conv front end
    :param dropout: dropout setting forwarded to embedding_RNN_1_lstm
    :param attention: attention type forwarded as `att`
    :param output_shape: number of units of the sigmoid output layer
    :return: (model, input, att_vector); the model is returned without being compiled
    """
    embedding, net_input, att_vector = embedding_RNN_1_lstm(input_shape=input_shape,
                                                            conv=conv,
                                                            dropout=dropout,
                                                            att=attention)

    prediction = Dense(output_shape, activation='sigmoid')(embedding)

    # The attention tensor is exposed as a second model output.
    model = Model(inputs=net_input, outputs=[prediction, att_vector])

    return model, net_input, att_vector
116 |
117 |
def train_RNN_batch(list_feature_fold_train,
                    labels_fold_train,
                    list_feature_fold_val,
                    labels_fold_val,
                    batch_size,
                    input_shape,
                    output_shape,
                    file_path_model,
                    filename_log,
                    epoch,
                    patience,
                    scaler,
                    attention,
                    conv,
                    dropout,
                    summ=False,
                    verbose=2):
    """
    Build, compile and train the RNN classifier with early stopping.

    :param list_feature_fold_train: list of training feature sequences
    :param labels_fold_train: training labels
    :param list_feature_fold_val: list of validation feature sequences
    :param labels_fold_val: validation labels
    :param batch_size: only used to derive the number of steps per epoch
    :param input_shape: batch shape of the network input
    :param output_shape: number of units of the sigmoid output layer
    :param file_path_model: where ModelCheckpoint stores the best model (lowest val_loss)
    :param filename_log: CSV training log, ';' separated
    :param epoch: maximum number of epochs
    :param patience: early-stopping patience on val_loss
    :param scaler: passed to generator_batch1 for both generators
    :param attention: attention type forwarded to embedding_RNN_1_lstm
    :param conv: switch for the conv front end
    :param dropout: dropout setting
    :param summ: print the model summary when truthy
    :param verbose: Keras verbosity
    :return: the model object after training (the best checkpoint lives on disk)
    """

    # att_vector is returned by the builder but not used here.
    x, input, att_vector = embedding_RNN_1_lstm(input_shape=input_shape,
                                                conv=conv,
                                                dropout=dropout,
                                                att=attention)

    # print("attention shape {}".format(K.shape(att_vector)))

    outputs = Dense(output_shape, activation='sigmoid')(x)

    model = Model(inputs=input, outputs=outputs)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    if summ:
        model.summary()

    callbacks = [ModelCheckpoint(file_path_model, monitor='val_loss', verbose=0, save_best_only=True),
                 EarlyStopping(monitor='val_loss', patience=patience, verbose=0),
                 CSVLogger(filename=filename_log, separator=';')]

    print("start training with validation...")

    generator_train = generator_batch1(list_feature=list_feature_fold_train,
                                       labels=labels_fold_train,
                                       scaler=scaler)

    generator_val = generator_batch1(list_feature=list_feature_fold_val,
                                     labels=labels_fold_val,
                                     scaler=scaler)

    # NOTE(review): under Python 3 the `/` divisions below yield floats.
    model.fit_generator(generator=generator_train,
                        steps_per_epoch=len(list_feature_fold_train)/batch_size,
                        validation_data=generator_val,
                        validation_steps=len(list_feature_fold_val)/batch_size,
                        callbacks=callbacks,
                        epochs=epoch,
                        verbose=verbose)

    return model
177 |
178 |
def eval_RNN_model(list_feature_test,
                   labels_test,
                   file_path_model,
                   attention,
                   scaler):
    """
    Load a saved model and compute its log loss on a test set.

    :param list_feature_test: list of test feature sequences
    :param labels_test: ground-truth labels, one per sequence
    :param file_path_model: path of the saved Keras model
    :param attention: "feedforward", "selfatt" or anything else; selects the
                      custom objects needed to deserialize the model
    :param scaler: fitted scaler applied to each sequence before prediction
    :return: sklearn log_loss between labels_test and the predictions
    """
    # Custom objects required to rebuild the attention variants.
    if attention == "feedforward":
        model = load_model(filepath=file_path_model,
                           custom_objects={'Attention': Attention(return_attention=True)})
    elif attention == "selfatt":
        model = load_model(filepath=file_path_model,
                           custom_objects={'softmax': softmax})
    else:
        model = load_model(file_path_model)

    # Predict one sequence at a time, as a batch of size 1.
    predictions = np.zeros((len(labels_test),))
    for idx, sequence in enumerate(list_feature_test):
        batch = np.expand_dims(scaler.transform(sequence), axis=0)
        predictions[idx] = model.predict_on_batch(batch)[0][0]

    return log_loss(y_true=labels_test, y_pred=predictions)
--------------------------------------------------------------------------------
/neural_net/training_scripts/models_TCN.py:
--------------------------------------------------------------------------------
1 | from neural_net.keras_tcn.tcn import tcn
2 | from neural_net.training_scripts.generator import generator_batch1
3 | from keras.callbacks import EarlyStopping
4 | from keras.callbacks import CSVLogger
5 | from keras.callbacks import ModelCheckpoint
6 |
7 |
def train_TCN_batch(list_feature_fold_train,
                    labels_fold_train,
                    list_feature_fold_val,
                    labels_fold_val,
                    batch_size,
                    input_shape,
                    file_path_model,
                    filename_log,
                    epoch,
                    patience,
                    scaler,
                    dropout,
                    summ=False,
                    verbose=2):
    """
    Build a dilated TCN and train it with early stopping.

    :param list_feature_fold_train: list of training feature sequences
    :param labels_fold_train: training labels
    :param list_feature_fold_val: list of validation feature sequences
    :param labels_fold_val: validation labels
    :param batch_size: only used to derive the number of steps per epoch
    :param input_shape: only its last entry is read, as the number of features
    :param file_path_model: where ModelCheckpoint stores the best model (lowest val_loss)
    :param filename_log: CSV training log, ';' separated
    :param epoch: maximum number of epochs
    :param patience: early-stopping patience on val_loss
    :param scaler: passed to generator_batch1 for both generators
    :param dropout: dropout rate forwarded to the TCN builder
    :param summ: print the model summary when truthy
    :param verbose: Keras verbosity
    :return: the model object after training (the best checkpoint lives on disk)
    """

    # param_str is returned by the builder but not used here.
    # NOTE(review): the model is not compiled in this function; presumably
    # tcn.dilated_tcn returns a compiled model -- confirm.
    model, param_str = tcn.dilated_tcn(output_slice_index='last',  # try 'first'.
                                       num_feat=input_shape[-1],
                                       num_classes=2,
                                       nb_filters=16,
                                       kernel_size=3,
                                       dilatations=[0, 1, 3, 5],
                                       nb_stacks=1,
                                       max_len=None,
                                       dropout=dropout,
                                       activation='norm_relu',
                                       use_skip_connections=False,
                                       return_param_str=True)

    if summ:
        model.summary()

    callbacks = [ModelCheckpoint(file_path_model, monitor='val_loss', verbose=0, save_best_only=True),
                 EarlyStopping(monitor='val_loss', patience=patience, verbose=0),
                 CSVLogger(filename=filename_log, separator=';')]

    print("start training with validation...")

    generator_train = generator_batch1(list_feature=list_feature_fold_train,
                                       labels=labels_fold_train,
                                       scaler=scaler)

    generator_val = generator_batch1(list_feature=list_feature_fold_val,
                                     labels=labels_fold_val,
                                     scaler=scaler)

    # NOTE(review): under Python 3 the `/` divisions below yield floats.
    model.fit_generator(generator=generator_train,
                        steps_per_epoch=len(list_feature_fold_train)/batch_size,
                        validation_data=generator_val,
                        validation_steps=len(list_feature_fold_val)/batch_size,
                        callbacks=callbacks,
                        epochs=epoch,
                        verbose=verbose)

    return model
--------------------------------------------------------------------------------
/neural_net/utils/audio_preprocessing.py:
--------------------------------------------------------------------------------
1 | from madmom.processors import SequentialProcessor
2 | import numpy as np
3 |
4 | EPSILON = np.spacing(1)
5 |
6 |
def Fprev_sub(x, w=2):
    """
    Shift the columns of a 2-D array by `w` positions, padding by repetition.

    :param x: array laid out as feature * frame
    :param w: number of frames to shift; positive shifts to the right (the first
              column is repeated on the left), negative shifts to the left (the
              last column is repeated on the right)
    :return: shifted array with the same shape as `x`
    :raises ValueError: if `w` is 0
    """
    if w == 0:
        raise ValueError("shifting frame coef can't be 0.")

    if w > 0:
        # shift to right: w copies of the first column, then x without its last w columns
        left_pad = np.tile(x[:, 0], (w, 1)).transpose()
        return np.hstack((left_pad, x[:, :-w]))

    # shift to left: x without its first |w| columns, then |w| copies of the last column
    n_shift = -w
    right_pad = np.tile(x[:, -1], (n_shift, 1)).transpose()
    return np.hstack((x[:, n_shift:], right_pad))
27 |
28 |
29 | def _nbf_2D(log_mel, nlen):
30 | """shift the feature and concatenate it in both left and right sides for nlen"""
31 |
32 | log_mel = np.array(log_mel).transpose()
33 | log_mel_out = np.array(log_mel, copy=True)
34 | for ii in range(1, nlen + 1):
35 | log_mel_right_shift = Fprev_sub(log_mel, w=ii)
36 | log_mel_left_shift = Fprev_sub(log_mel, w=-ii)
37 | log_mel_out = np.vstack((log_mel_right_shift, log_mel_out, log_mel_left_shift))
38 | feature = log_mel_out.transpose()
39 | return feature
40 |
41 |
class MadmomMelbankProcessor(SequentialProcessor):
    """madmom processing chain: mono signal -> frames -> STFT -> mel filterbank -> log."""

    def __init__(self, fs, hopsize_t):
        """
        :param fs: sample rate handed to the signal processor
        :param hopsize_t: hop size in seconds; converted to samples with int(fs*hopsize_t)
        """
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)

        hop_samples = int(fs*hopsize_t)

        # individual stages of the pre-processing chain
        signal_stage = SignalProcessor(num_channels=1, sample_rate=fs)
        framing_stage = FramedSignalProcessor(frame_size=2048, hopsize=hop_samples)
        stft_stage = ShortTimeFourierTransformProcessor()  # caching FFT window
        mel_stage = FilteredSpectrogramProcessor(
            filterbank=MelFilterbank, num_bands=80, fmin=27.5, fmax=16000,
            norm_filters=True, unique_filters=False)
        # EPSILON is added before the log is taken.
        log_stage = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

        spectral_chain = SequentialProcessor([framing_stage, stft_stage, mel_stage, log_stage])
        full_chain = SequentialProcessor([signal_stage, spectral_chain])

        super(MadmomMelbankProcessor, self).__init__([full_chain])
65 |
66 |
def get_log_mel_madmom(audio_fn, fs, hopsize_t, channel, context=False):
    """
    Compute the log-mel feature of an audio file with madmom.

    :param audio_fn: audio file handed to MadmomMelbankProcessor
    :param fs: sample rate
    :param hopsize_t: hop size in seconds
    :param channel: 1 for a 2-D feature; any other value takes the 3-slice branch
    :param context: when truthy, add 7 frames of context on each side via _nbf_2D
    :return: the feature, with context frames appended when requested
    """
    processor = MadmomMelbankProcessor(fs, hopsize_t)
    mfcc = processor(audio_fn)

    if not context:
        return mfcc

    if channel == 1:
        return _nbf_2D(mfcc, 7)

    # Process the three slices along the last axis separately and stack them back.
    per_channel = [_nbf_2D(mfcc[:, :, ch], 7) for ch in range(3)]
    return np.stack(per_channel, axis=2)
89 |
90 |
def feature_reshape(feature, nlen=10, n_row=80):
    """
    Reshape a flattened context feature into n_sample * n_row * n_col.

    Each input row holds n_col = 2*nlen+1 consecutive chunks of n_row values;
    chunk jj becomes column jj of that sample's output matrix.

    :param feature: 2-D array with at least n_row*n_col columns; extra columns
                    are ignored
    :param nlen: context length on each side, so n_col = nlen*2+1
    :param n_row: number of values per chunk (defaults to the 80 bands used so far)
    :return: C-contiguous float32 array of shape (n_sample, n_row, n_col)
    :raises ValueError: if a row holds fewer than n_row*n_col values
    """
    feature = np.asarray(feature)
    n_sample = feature.shape[0]
    n_col = nlen*2+1

    # Vectorized form of out[ii, :, jj] = feature[ii][n_row*jj:n_row*(jj+1)]:
    # split each row into (n_col, n_row) chunks, then swap the two axes.
    chunks = feature[:, :n_row*n_col].reshape(n_sample, n_col, n_row)
    return np.ascontiguousarray(chunks.transpose(0, 2, 1), dtype='float32')
112 |
113 |
def segmentMfccLine(line, hopsize_t, mfccs):
    """
    Cut the frames belonging to one line out of a feature matrix.

    :param line: [start_time, end_time, lyrics]
    :param hopsize_t: hop size in seconds used to convert times to frame indices
    :param mfccs: array whose second axis is indexed by frame
    :return: mfccs[:, frame_start:frame_end]
    """
    # Times are converted to the nearest frame index.
    first_frame = int(round(line[0] / hopsize_t))
    last_frame = int(round(line[1] / hopsize_t))

    return mfccs[:, first_frame:last_frame]
--------------------------------------------------------------------------------
/neural_net/utils/csv_preprocessing.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 |
def open_csv_recordings(filename):
    """
    Read a comma-separated file into a list of rows.

    :param filename: path of the CSV file
    :return: list with one list of strings per row
    """
    with open(filename) as csvfile:
        return [row for row in csv.reader(csvfile, delimiter=',')]
11 |
12 |
def write_csv_two_columns_list(two_columns_list, filename):
    """
    Write a list of rows to a comma-separated file.

    :param two_columns_list: iterable of rows, each an iterable of cell values
    :param filename: path of the CSV file to create or overwrite
    """
    # csv.writer emits str on Python 3, so the file must be opened in text mode
    # ('wb' raised TypeError); newline='' lets the csv module control line endings.
    with open(filename, 'w', newline='') as csvfile:
        two_columns_writer = csv.writer(csvfile, delimiter=',')
        two_columns_writer.writerows(two_columns_list)
18 |
--------------------------------------------------------------------------------
/neural_net/utils/textgridParser.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys, os
3 |
4 | currentPath = os.path.dirname(__file__)
5 | utilsPath = os.path.join(currentPath, 'utils')
6 | sys.path.append(utilsPath)
7 |
8 | import neural_net.utils.textgrid as tgp
9 |
10 |
def textGrid2WordList(textgrid_file, whichTier = 'pinyin', utf16 = True):
    '''
    parse one tier of a textGrid file into a python list of intervals
    @param textgrid_file : path of the TextGrid file
    @param whichTier : name of the tier to extract, 'pinyin' by default;
                       dots in the file's tier names are ignored when comparing
    @param utf16 : load with TextGrid.loadUTF16 when True, TextGrid.load otherwise
    @return : (list of [begin, end, label] with float times, isTierFound)
    '''
    if not os.path.isfile(textgrid_file):
        raise Exception("file {} not found".format(textgrid_file))

    # pick the loader matching the file encoding
    loader = tgp.TextGrid.loadUTF16 if utf16 else tgp.TextGrid.load
    par_obj = loader(textgrid_file)

    beginTsAndWordList = []
    isTierFound = False

    for tier in tgp.TextGrid._find_tiers(par_obj):
        if tier.tier_name().replace('.', '') != whichTier:
            continue

        isTierFound = True
        # every matching tier contributes its intervals
        for entry in tier.make_simple_transcript():
            beginTsAndWordList.append([float(entry[0]), float(entry[1]), entry[2]])

    if not isTierFound:
        print ('Missing tier {1} in file {0}' .format(textgrid_file, whichTier))

    return beginTsAndWordList, isTierFound
42 |
43 |
def line2WordList(line, entireWordList):
    '''
    collect the entries of entireWordList that lie inside one line
    :param line: line tuple [startTime, endTime, string]
    :param entireWordList: entire word list, each entry [startTime, endTime, string]
    :return: entries from the one starting at the line start up to and including
             the one ending at the line end
    '''
    line_start, line_end = line[0], line[1]

    collected = []
    inside = False
    for entry in entireWordList:
        # the entry closing the line is always kept and ends the scan
        if entry[1] == line_end:
            collected.append(entry)
            break
        # collection starts at the entry opening the line
        if entry[0] == line_start:
            inside = True
        if inside:
            collected.append(entry)

    return collected
65 |
def wordListsParseByLines(entireLine, entireWordList):
    '''
    Group the words of a tier under the lines that contain them.

    Lines and words whose label is empty once spaces are removed are skipped.

    :param entireLine: entries [startTime, endTime, string] of the line tier
    :param entireWordList: entries [startTime, endTime, string] of the word tier
    :return:
        nestedWordLists: [[line0, wordList0], [line1, wordList1], ...]
        numLines: number of non-empty lines
        numWords: number of non-empty words over all lines
    '''
    grouped = []
    word_total = 0

    for line in entireLine:
        if not line[2].replace(" ", ""):
            # blank line label: nothing to group
            continue
        kept = [
            word
            for word in line2WordList(line, entireWordList)
            if word[2].replace(" ", "")
        ]
        word_total += len(kept)
        grouped.append([line, kept])

    # exactly one group is stored per non-empty line
    return grouped, len(grouped), word_total
94 |
def syllableTextgridExtraction(textgrid_path, recording, tier0, tier1):
    '''
    Extract syllable boundary and phoneme boundary from textgrid

    :param textgrid_path: folder containing the TextGrid file
    :param recording: file name of the recording without the '.TextGrid' extension
    :param tier0: parent tier
    :param tier1: child tier which should be covered by parent tier
    :return:
        nestedPhonemeLists: element[0] - syllable, element[1] - a list containing the phoneme of the syllable
        numSyllables: number of non-empty entries of the parent tier
        numPhonemes: number of non-empty entries of the child tier grouped under them
    '''
    textgrid_file = os.path.join(textgrid_path, recording + '.TextGrid')

    # textGrid2WordList returns (entries, isTierFound); only the entries may be
    # handed to wordListsParseByLines, so the flag is dropped here.
    syllableList, _ = textGrid2WordList(textgrid_file, whichTier=tier0)
    phonemeList, _ = textGrid2WordList(textgrid_file, whichTier=tier1)

    # group the child-tier entries under their parent-tier entry
    nestedPhonemeLists, numSyllables, numPhonemes = wordListsParseByLines(syllableList, phonemeList)

    return nestedPhonemeLists, numSyllables, numPhonemes
116 |
117 |
118 |
--------------------------------------------------------------------------------
/neural_net/utils/textgrid_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | from neural_net.utils.textgridParser import textGrid2WordList
3 | from neural_net.utils.textgridParser import wordListsParseByLines
4 |
5 |
def parse_syllable_line_list(ground_truth_text_grid_file, parent_tier, child_tier):
    """
    Group the entries of a child tier under the entries of a parent tier.

    :param ground_truth_text_grid_file: path of the TextGrid file
    :param parent_tier: name of the tier holding the lines
    :param child_tier: name of the tier holding the syllables
    :return:
        nested_syllable_lists: [[line, syllables], ...], or False when the file is missing
        is_file_exist: whether the TextGrid file exists
        is_syllable_found: whether the child tier was found, False when the file is missing
    """
    if not os.path.isfile(ground_truth_text_grid_file):
        return False, False, False

    # parent tier first, then child tier; only the child tier's "found" flag is reported
    lines, _ = textGrid2WordList(ground_truth_text_grid_file, whichTier=parent_tier)
    syllables, child_tier_found = textGrid2WordList(ground_truth_text_grid_file, whichTier=child_tier)

    grouped, _, _ = wordListsParseByLines(lines, syllables)

    return grouped, True, child_tier_found
24 |
--------------------------------------------------------------------------------
/neural_net/utils/utils_functions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def smooth_obs(obs):
    """
    Smooth an onset observation function with a normalised 5-point Hanning window.

    :param obs: syllable/phoneme onset function (1-D sequence)
    :return: the convolution of the window with obs in 'same' mode
    """
    window = np.hanning(5)
    window = window / window.sum()  # unit gain
    return np.convolve(window, obs, mode='same')
--------------------------------------------------------------------------------
/neural_net/viterbiDecodingPhonemeSeg.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats import norm
3 | from neural_net.parameters import hopsize_t
4 | cimport cython
5 |
6 | value_eps = np.finfo(float).eps
7 |
def FdurationProba2( syllable_duration, param_s ):
    """
    Gaussian duration probability sampled at integer frame lags.

    The mean is syllable_duration / hopsize_t and the standard deviation is
    delta / hopsize_t, where delta comes from param_s.

    :param syllable_duration: expected duration; it is divided by hopsize_t, so it
                              presumably uses the same time unit - TODO confirm
    :param param_s: dict with
        'delta_mode': 'constant' (delta is param_s['delta']) or
                      'proportion' (delta is syllable_duration * param_s['delta'])
        'delta': the deviation value or proportion
    :return:
        Ps: normal pdf evaluated at the lags tmin .. tmax-1
        tmin: always 0
        tmax: int((syllable_duration + 3 * delta) / hopsize_t)
    :raises ValueError: when param_s['delta_mode'] is neither 'constant' nor 'proportion'
    """
    M1 = syllable_duration/hopsize_t

    # delta
    delta_mode = param_s['delta_mode']
    if delta_mode == 'constant':
        delta = param_s['delta']
    elif delta_mode == 'proportion':
        delta = syllable_duration * param_s['delta']
    else:
        # raising a plain string is itself a TypeError; raise a real exception instead
        raise ValueError('Error: delta_mode should be either constant or proportion.')
    S1 = delta/hopsize_t

    # the longest duration considered is the mean plus three standard deviations
    duration_max = syllable_duration + 3.0*delta

    tmin = 0
    tmax = int(duration_max/hopsize_t)

    # Ps = pdf('Normal',(tmin : tmax), M1, S1)
    x = range(tmin, tmax)
    Ps = norm.pdf(x, M1, S1)
    return Ps, tmin, tmax
32 |
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def viterbiSegmental2(P, sd, param_s):
    """
    Segmental Viterbi decoding of segment boundaries.

    Candidate boundaries are the positions of P whose value exceeds value_eps.
    A transition from candidate ii to candidate jj during segment t is scored
    with the log duration probability (FdurationProba2 of sd[t]) at the lag
    i_bound[jj] - i_bound[ii], plus the log of P at the arrival candidate.
    The best chain is recovered by backtracking.

    :param P: observation values; log(P) is bound to a 1-D contiguous double
              memoryview below, so P is presumably a 1-D float64 array with one
              value per frame - TODO confirm against the caller.
              NOTE: zeros in P are overwritten in place with value_eps.
    :param sd: score durations, one per segment (T = len(sd))
    :param param_s: duration-model settings, passed unchanged to FdurationProba2
    :return: list of T+1 positions in P (entries of i_bound), one per decoded boundary.
             The last one is always the last candidate; the first one is always the
             first candidate because psi[:, 0] is 0.
    """

    # prevent singularities in the logarithm (modifies the caller's array)
    P[P == 0] = value_eps

    # positions of the candidate boundaries
    i_bound = np.where(P > value_eps)[0]
    N = len(i_bound)
    T = len(sd)

    # log - likelihood
    # delta[j, t]: best accumulated log score ending segment t at candidate j
    # psi[j, t]: index of the predecessor candidate achieving delta[j, t]
    delta = np.zeros((N, T), dtype=np.double)
    psi = np.zeros((N, T), dtype=np.double)
    logP = np.log(P, dtype=np.double)

    # duration probability of the first segment
    Ps, _, _ = FdurationProba2(sd[0], param_s)
    Ps[Ps == 0] = value_eps
    C = len(Ps)
    logPs = np.log(Ps, dtype=np.double)

    # typed memoryviews over the numpy buffers above
    cdef double [:, ::1] cdelta = delta
    cdef double [:, ::1] cpsi = psi
    cdef double [::1] clogP = logP
    cdef double [::1] clogPs = logPs
    cdef int [::1] ci_bound = np.array(i_bound, dtype=np.intc)
    # % % % % % % % % % % % % % % % % % %
    # % Initialisation %
    # % % % % % % % % % % % % % % % % % %

    # % not a possible transition from > 0 time to 1
    cdelta[0, 0] = -np.inf
    cpsi[:,0] = 0
    # the first segment always starts at the first candidate
    for jj in range(1,N):
        d = ci_bound[jj] - ci_bound[0]
        # print(jj, i_bound[jj], d, C)
        if d >= C:
            # lag beyond the support of the duration distribution
            cdelta[jj, 0] = -np.inf
        else:
            cdelta[jj, 0] = clogPs[d] + clogP[i_bound[jj]]

    clogPs = None

    # % % % % % % % % % % % % % % % % % %
    # % Recursion %
    # % % % % % % % % % % % % % % % % % %
    # scratch buffer: score of reaching the current candidate from each predecessor
    delta_current = np.zeros((N,), dtype=np.double)
    cdef double [::1] cdelta_current = delta_current

    # segments 1 .. T-2; the last segment is handled separately below
    for t in range(1,T - 1):
        # print(t)
        # % duration probability
        Ps, _, _ = FdurationProba2(sd[t], param_s)
        Ps[Ps == 0] = value_eps
        C = len(Ps)
        logPs = np.log(Ps, dtype=np.double)

        for jj in range(N):
            for ii in range(N):
                # print(i_bound, jj, ii)
                d = ci_bound[jj] - ci_bound[ii]
                # print(d, C)
                # only strictly forward lags inside the duration support are allowed
                if d >= C or d <= 0:
                    cdelta_current[ii] = -np.inf
                else:
                    cdelta_current[ii] = cdelta[ii, t - 1] + logPs[d]

            I_delta = np.argmax(cdelta_current)
            M_delta = cdelta_current[I_delta]
            # the emission term does not depend on ii, so it is added after the maximisation
            cdelta[jj, t] = M_delta + clogP[i_bound[jj]]
            cpsi[jj, t] = I_delta

    # % duration probability
    # last segment: it is forced to end at the last candidate (index N-1)
    Ps, tmin, tmax = FdurationProba2(sd[T-1], param_s)
    Ps[Ps == 0] = value_eps
    C = len(Ps)
    logPs = np.log(Ps, dtype=np.double)
    clogPs = logPs
    # delta_current = np.zeros((N,))

    # NOTE(review): with T == 1 the read of cdelta[ii, T-2] below addresses column -1
    # while wraparound is disabled - confirm callers always pass len(sd) >= 2.
    for ii in range(N):
        d = ci_bound[N-1] - ci_bound[ii]
        if d >= C or d <= 0:
            cdelta_current[ii] = -np.inf
        else:
            cdelta_current[ii] = cdelta[ii, T-2] + clogPs[d]

    I_delta = np.argmax(cdelta_current)
    M_delta = cdelta_current[I_delta] # the posterior proba
    cdelta[N-1, T-1] = M_delta + clogP[i_bound[N-1]]
    cpsi[N-1, T-1] = I_delta

    # % % % % % % % % % % % % % % % % % %
    # % Backtrack %
    # % % % % % % % % % % % % % % % % % %
    # T segments give T+1 boundaries; start from the forced final candidate
    i_best_sequence = np.zeros((T+1,),dtype=int)
    # print(i_best_sequence)
    i_best_sequence[T] = N-1
    for t in range(T)[::-1]:
        # print(t+1, i_best_sequence[t+1])
        i_best_sequence[t] = int(cpsi[int(i_best_sequence[t + 1]), t])
    # print(i_best_sequence)
    # map candidate indices back to positions in P
    i_boundary = [i_bound[ii] for ii in i_best_sequence]

    # drop the memoryview references to the numpy buffers
    cdelta = None
    cdelta_current = None
    clogPs = None
    clogP = None
    cpsi = None
    ci_bound = None

    return i_boundary
153 |
--------------------------------------------------------------------------------