├── README.md ├── data.json ├── evaluation.py └── timit_pre_process.py /README.md: -------------------------------------------------------------------------------- 1 | 2 | TIMIT data processing for DNN-based AEC (acoustic echo cancellation) experiments 3 | ============================== 4 | This repo follows the data setup from [Deep Learning for Acoustic Echo Cancellation in Noisy and Double-Talk Scenarios](https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1484.pdf). 5 | 6 | This is a draft script; I will clean it up and move all changeable configurations into a JSON file so that it is easier to use. 7 | 8 | By the way, if you want to do some work on deep-learning AEC, I recommend using the far-end data from the AEC-Challenge and mixing it with other clean open-source datasets. 9 | 10 | Notification 11 | ============ 12 | 13 | References: 14 | 15 | Paper: [Deep Learning for Acoustic Echo Cancellation in Noisy and Double-Talk Scenarios](https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1484.pdf) 16 | 17 | DNS-CHALLENGE: [INTERSPEECH 2021 Deep Noise Suppression Challenge](https://arxiv.org/pdf/2101.01902.pdf) 18 | DNS-CHALLENGE CODE: [INTERSPEECH 2021 Deep Noise Suppression Challenge](https://github.com/microsoft/DNS-Challenge) 19 | 20 | AEC-CHALLENGE: [ICASSP 2021 ACOUSTIC ECHO CANCELLATION CHALLENGE: DATASETS, TESTING FRAMEWORK, AND RESULTS](https://arxiv.org/pdf/2009.04972.pdf) 21 | AEC-CHALLENGE CODE: [ICASSP 2021 ACOUSTIC ECHO CANCELLATION CHALLENGE: DATASETS, TESTING FRAMEWORK, AND RESULTS](https://github.com/microsoft/AEC-Challenge) 22 | 23 | 24 | How to use 25 | ========== 26 | 1. Edit the paths in __data.json__ (__timit_data_path__, __noise_path__, __output_path__ and __rir_table__) according to your setup; note that __rir_table__ (RIR_table_simple.csv) comes from the DNS-CHALLENGE repository linked above 27 | 28 | 2. python timit_pre_process.py -conf data.json 29 | 30 | Last Modification 31 | ============ 32 | 33 | 1. add json 34 | 2. randomly pad signal to certain length 35 | 3. 
add non-linear distortion 36 | -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | { 2 | "datasets": { 3 | "timit_data_path": "/home/yongyug/data/timit/TIMIT", 4 | "noise_path": "/home/yongyug/data/aec_challenge/datasets/noise", 5 | "rir_table": "/home/yongyug/data/aec_challenge/datasets/acoustic_params/RIR_table_simple.csv", 6 | "output_path": "/home/yongyug/data/timit_aec_output" 7 | }, 8 | "configs": { 9 | "audio_length": 8, 10 | "samplerate": 16000, 11 | "use_reverb": true, 12 | "clipping_threshold": 0.99, 13 | "lowerbound_ser": -10, 14 | "upperbound_ser": 13, 15 | "lowerbound_snr": -5, 16 | "upperbound_snr": 20, 17 | "target_level_lower": -35, 18 | "target_level_upper": -15, 19 | "target_level": -25, 20 | "lower_t60": 0.6, 21 | "upper_t60": 1.3, 22 | "predelay": 50, 23 | "silence_length": 0.2, 24 | "add_nonlinear": true, 25 | "train": { 26 | "same_gender_pair": 30, 27 | "diff_gender_pair": 40, 28 | "csv_file_name": "train.csv" 29 | }, 30 | "test": { 31 | "same_gender_pair": 3, 32 | "diff_gender_pair": 4, 33 | "csv_file_name": "test.csv" 34 | } 35 | 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import soundfile as sf 3 | import os 4 | # ERLE (echo return loss enhancement): energy ratio in dB between the near-end microphone signal and the residual/error signal left after echo cancellation 5 | def ERLE(nearend_mic_signal, error_signal): 6 | erle = 10 * np.log10( 7 | np.mean(nearend_mic_signal**2) / np.mean( error_signal **2) 8 | ) 9 | return erle 10 | # SER (signal-to-echo ratio): computed here on RMS values with a 10*log10 scale, matching the SER scaling used in nearend_farend_mixer in timit_pre_process.py 11 | def SER(nearend_speech, far_echo): 12 | return 10 * np.log10(((nearend_speech ** 2 ).mean()**0.5) / (far_echo **2).mean()**0.5) 13 | 14 | if __name__ == "__main__": 15 | 16 | fileid = 9999 17 | nearend_mic_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/nearend_mic_signal/nearend_mic_fileid_{}.wav".format(fileid) 18 | nearend_speech_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/nearend_speech/nearend_speech_fileid_{}.wav".format(fileid) 19 | error_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/filter_out/mixdata_fileid_{}.wav".format(fileid) 20 | echo_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/echo_signal/echo_fileid_{}.wav".format(fileid) 21 | 22 | nlp_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/nearend_mic_mix_farend_speech_signal/mixdata_fileid_{}_aec_native.wav".format(fileid) 23 | 24 | nearend_mic_signal, sr = sf.read(nearend_mic_path) 25 | error_signal, _ = sf.read(error_path) 26 | echo_signal, _ = sf.read(echo_path) 27 | nearend_speech, _ = sf.read(nearend_speech_path) 28 | 29 | 30 | nlp_signal, _ = sf.read(nlp_path) 31 | 32 | erle_nonlp = ERLE(nearend_mic_signal, error_signal) 33 | erle_nlp = ERLE(nearend_mic_signal, nlp_signal) 34 | print(erle_nonlp) 35 | print(erle_nlp) 36 | 37 | ser = SER(nearend_speech, echo_signal) 38 | print(ser) -------------------------------------------------------------------------------- /timit_pre_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This process is strictly following experimental data 
setup from paper: 3 | 4 | <> 5 | 6 | ''' 7 | 8 | import argparse 9 | import glob 10 | import json 11 | import librosa 12 | import os 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | from random import shuffle 17 | import soundfile as sf 18 | from scipy import signal 19 | from evaluation import SER 20 | 21 | MAXTRIES = 50 22 | MAXFILELEN = 50 23 | # np.random.seed(9999) 24 | # random.seed(9999) 25 | EPS = np.finfo(float).eps 26 | 27 | def get_single_gender_index_list(data_list, num_pair=30): #获取不重复的单性别说话人对 28 | index_list = [] 29 | seen_list = [] 30 | i = 0 31 | while i < num_pair: 32 | index_set = list(np.random.randint(0, len(data_list), 2)) 33 | 34 | if index_set[0] not in seen_list and index_set[1] not in seen_list: 35 | index_list.append(index_set) 36 | seen_list.append(index_set[0]) 37 | seen_list.append(index_set[1]) 38 | 39 | i += 1 40 | 41 | return index_list, seen_list 42 | 43 | def get_double_gender_index_list(male_list, female_list, male_seen_list, female_seenlist, num_pair=40): 44 | male_female_index_list = [] 45 | 46 | i = 0 47 | while i < num_pair: 48 | male_index = np.random.randint(0, len(male_list)) 49 | female_index = np.random.randint(0, len(female_list)) 50 | index_set = [male_index, female_index] 51 | 52 | if male_index not in male_seen_list and female_index not in female_seenlist: 53 | male_female_index_list.append(index_set) 54 | male_seen_list.append(male_index) 55 | female_seenlist.append(female_index) 56 | i += 1 57 | return male_female_index_list, male_seen_list, female_seenlist 58 | 59 | def get_gender_index_list(male_list, female_list, conf): 60 | # male_male_index_list = [] 61 | # female_female_index_list = [] 62 | # male_female_index_list = [] 63 | male_seen_list = [] 64 | female_seen_list = [] 65 | 66 | male_male_index_list, temp_male_seenlist = get_single_gender_index_list(male_list, conf['configs']['samle_gender_pair']) 67 | female_female_index_list, temp_female_seenlist = get_single_gender_index_list(male_list, conf['configs']['samle_gender_pair']) 68 | male_female_index_list, male_seen_list, female_seen_list = get_double_gender_index_list(male_list, female_list, temp_male_seenlist, temp_female_seenlist, conf['configs']['diff_gender_pair']) 69 | return male_male_index_list, female_female_index_list, male_female_index_list 70 | 71 | 72 | def random_three_nonrepeat_sample(data_len): 73 | three_sample_list = [] 74 | for i in range(data_len): 75 | for j in range(i + 1, data_len): 76 | for k in range(j + 1, data_len): 77 | three_sample_list.append([i, j, k]) 78 | 79 | return three_sample_list 80 | 81 | def add_pyreverb(clean_speech, rir, predelay=50): 82 | predelay = predelay 83 | early_delay_samples = (predelay * 16000) // 1000 84 | early_rir = rir[:early_delay_samples] 85 | 86 | reverb_speech = signal.fftconvolve(clean_speech, rir, mode="full") 87 | noreverb_speech = signal.fftconvolve(clean_speech, early_rir, mode="full") 88 | 89 | # make reverb_speech same length as clean_speech 90 | reverb_speech = reverb_speech[0 : clean_speech.shape[0]] 91 | noreverb_speech = noreverb_speech[0 : clean_speech.shape[0]] 92 | 93 | return reverb_speech, noreverb_speech 94 | 95 | def signal_pad(signal, audio_sample_length): 96 | 97 | 98 | 99 | # if len(signal) < audio_sample_length: # 设定一个统一的长度,如果长度不够, 则前后补零 100 | # if len(signal) % 2 == 0: 101 | # signal = np.pad(signal, ((audio_sample_length - len(signal)) // 2, 102 | # (audio_sample_length - len(signal)) // 2), 'constant', 103 | # constant_values=(0, 0)) 104 | # elif len(signal) % 2 != 0: 105 | # 
signal = np.pad(signal, ((audio_sample_length - len(signal)) // 2, 106 | # (audio_sample_length - len(signal)) // 2 + len( 107 | # signal) % 2), 'constant', 108 | # constant_values=(0, 0)) # 无法被2整除则把余数补零至最后 109 | if len(signal) < audio_sample_length: 110 | diff_len = audio_sample_length - len(signal) 111 | padfront = np.random.randint(diff_len) 112 | signal = np.pad(signal, (padfront, diff_len - padfront), 'constant', constant_values=(0, 0)) ##randomly pad in front and end 113 | 114 | elif len(signal) >= audio_sample_length: 115 | signal = signal[:audio_sample_length] 116 | return signal 117 | 118 | # def generate_single_gender_wav_pair(nearend_data_list, farend_data_list, data_dict, pairname): 119 | def generate_gender_wav_pair(nearend_data_list, farend_data_list, data_dict1, data_dict2, pairname): 120 | 121 | farend_three_sample_index = random_three_nonrepeat_sample(10) # 将所有farend不重复的组合list列出来 122 | 123 | 124 | train_res_list = [] 125 | validate_res_list = [] 126 | test_res_list = [] 127 | count = 0 128 | for i in range(len(nearend_data_list)): 129 | nearend_spk = nearend_data_list[i] 130 | farend_spk = farend_data_list[i] 131 | if nearend_spk[0] == 'M': 132 | nearend_spk_wav = data_dict1[nearend_spk] 133 | else: 134 | nearend_spk_wav = data_dict2[nearend_spk] 135 | if farend_spk[0] == 'M': 136 | farend_spk_wav = data_dict1[farend_spk] 137 | else: 138 | farend_spk_wav = data_dict2[farend_spk] 139 | 140 | 141 | nearend_select_index = np.arange(10) ## 每个人10条语音, 做一个随机 142 | np.random.shuffle(nearend_select_index) # for nearend_spk in nearend_spk_list: 143 | 144 | nearend_wav_pick = np.array(nearend_spk_wav)[nearend_select_index] 145 | if pairname.upper() == 'TRAIN': 146 | farend_group = [[i for i in range(j * 5, (j + 1) * 5)] for j in range(len(nearend_select_index))] # 这里是对于nearend来说, 渠道每个wav对应farend的index 147 | elif pairname.upper() == 'TEST': 148 | farend_group = [[i for i in range(j * 1, (j + 1) * 1)] for j in range(len(nearend_select_index))] # 这里是对于nearend来说, 渠道每个wav对应farend的index 149 | 150 | random.shuffle(farend_three_sample_index) # 把farend的三元list随机一下 151 | farend_select_index = np.array(farend_three_sample_index)[np.array(farend_group)] # 把每个nearend选择的farend取出来 152 | farend_wav_pick = np.array(farend_spk_wav)[farend_select_index] 153 | if pairname.upper() == 'TRAIN': 154 | for k in range(len(nearend_wav_pick)): 155 | if k < 7: 156 | count += 1 157 | #print(nearend_wav_pick[k], farend_wav_pick[k]) 158 | train_res_list.append((nearend_wav_pick[k], farend_wav_pick[k])) 159 | 160 | else: 161 | validate_res_list.append((nearend_wav_pick[k], farend_wav_pick[k])) 162 | else: 163 | for k in range(len(nearend_wav_pick)): 164 | test_res_list.append((nearend_wav_pick[k], farend_wav_pick[k])) 165 | if pairname.upper() == 'TRAIN': 166 | return train_res_list, validate_res_list 167 | else: 168 | return test_res_list 169 | 170 | def add_nonlinear_distortion(farend_signal, hard=True): 171 | alpha = 0.8 172 | x_max = alpha * np.max(np.abs(farend_signal)) 173 | ### x_max is the maximum value of output signal 174 | ### I set it 0.8 scale to the maximum of the input signal 175 | ### you can set your x_max yourself 176 | 177 | if hard: 178 | farend_signal[farend_signal < -x_max] = -x_max 179 | farend_signal[farend_signal > x_max] = x_max 180 | xn = farend_signal 181 | else: 182 | 183 | x_soft = (x_max * farend_signal) / ((np.abs(x_max) ** 2 + np.abs(farend_signal) ** 2) ** (1 / 2)) 184 | xn = x_soft 185 | 186 | 187 | 188 | sigmoid_gian = 0.4 189 | bn = 1.5 * xn - 0.3 * xn ** 2 190 | alpha = [4 if i > 
0 else 0.5 for i in bn] 191 | x_nl = sigmoid_gian * ((2 / (1 + np.exp(alpha * bn))) - 1) 192 | 193 | return x_nl 194 | 195 | 196 | 197 | def get_data_pair( 198 | dataPath, 199 | conf, 200 | ): 201 | 202 | male_dict = {} 203 | female_dict = {} 204 | count = 0 205 | 206 | # walk through TIMIT and collect each male / female speaker's wav files separately 207 | for root, _, files in os.walk(dataPath): 208 | for file in files: 209 | if file.endswith('WAV'): 210 | count += 1 211 | dataType, spk = root.split(os.path.sep)[-3], root.split(os.path.sep)[-1] 212 | gender = spk[0] 213 | if gender == "M": 214 | if spk not in male_dict.keys(): 215 | male_dict[spk] = [] 216 | male_dict[spk].append(os.path.join(root, file)) 217 | elif gender == "F": 218 | if spk not in female_dict.keys(): 219 | female_dict[spk] = [] 220 | female_dict[spk].append(os.path.join(root, file)) 221 | 222 | 223 | 224 | male_name_list = list(male_dict.keys()) 225 | female_name_list = list(female_dict.keys()) 226 | male_name_index_list = [i for i in range(len(male_name_list))] 227 | female_name_index_list = [i for i in range(len(female_name_list))] 228 | 229 | # Randomly draw the training speaker pairs; the pair counts come from configs.train in data.json 230 | train_male_male_index_list, temp_maleseen = get_single_gender_index_list(male_name_list, num_pair=conf["configs"]['train']['same_gender_pair']) 231 | train_female_female_index_list, temp_female_seen = get_single_gender_index_list(female_name_list, num_pair=conf["configs"]['train']['same_gender_pair']) 232 | train_male_female_index_list, male_seen_list, female_seen_list = get_double_gender_index_list(male_name_list, female_name_list, temp_maleseen, temp_female_seen, num_pair=conf["configs"]['train']['diff_gender_pair']) 233 | # get_gender_index_list could produce these three lists in a single call 234 | 235 | # Get the remaining speaker indices for the test sets 236 | rest_male_list = [i for i in male_name_index_list if i not in male_seen_list] 237 | rest_female_list = [i for i in female_name_index_list if i not in female_seen_list] 238 | test_male_male_index_list, temp_maleseen = get_single_gender_index_list(rest_male_list, num_pair=conf["configs"]['test']['same_gender_pair']) 239 | test_female_female_index_list, temp_female_seen = get_single_gender_index_list(rest_female_list, num_pair=conf["configs"]['test']['same_gender_pair']) 240 | test_male_female_index_list, male_seen_list, female_seen_list = get_double_gender_index_list(rest_male_list, rest_female_list, temp_maleseen, temp_female_seen, num_pair=conf["configs"]['test']['diff_gender_pair']) 241 | 242 | # Randomly decide which speaker of each pair acts as the far-end speaker 243 | train_male_female_farend_choice = np.random.randint(0, 2, len(train_male_female_index_list)) # 0/1 choice of the far-end speaker within the pair 244 | train_male_male_farend_choice = np.random.randint(0, 2, len(train_male_male_index_list)) 245 | train_female_female_farend_choice = np.random.randint(0, 2, len(train_female_female_index_list)) 246 | 247 | test_male_female_farend_choice = np.random.randint(0, 2, len(test_male_female_index_list)) # 0/1 choice of the far-end speaker within the pair 248 | test_male_male_farend_choice = np.random.randint(0, 2, len(test_male_male_index_list)) 249 | test_female_female_farend_choice = np.random.randint(0, 2, len(test_female_female_index_list)) 250 | 251 | 252 | # print(male_male_index_list) 253 | # print(male_female_index_list) 254 | # for i in male_male_index_list: 255 | # for j in male_female_index_list: 256 | # if i[0] == j[0] or i[1] == j[0]: 257 | # print(i,j) 258 | # print(female_female_index_list) 259 | # print(male_female_index_list) 260 | # for i in female_female_index_list: 261 | # 
for j in male_female_index_list: 262 | # if i[0] == j[1] or i[1] == j[1]: 263 | # print(i,j) 264 | 265 | 266 | #Get the speaker name from training and testdata-pair 267 | male_name_arr = np.array(male_name_list) 268 | test_male_name_arr = male_name_arr[np.array(test_male_male_index_list)] 269 | train_male_name_arr = male_name_arr[np.array(train_male_male_index_list)] 270 | 271 | female_name_arr = np.array(female_name_list) 272 | test_female_name_arr = female_name_arr[np.array(test_female_female_index_list)] 273 | train_female_name_arr = female_name_arr[np.array(train_female_female_index_list)] 274 | 275 | train_male_female_name_arr = np.array([np.array(male_name_list)[np.array(train_male_female_index_list).T[0]], 276 | np.array(female_name_list)[np.array(train_male_female_index_list).T[1]]]).T #转置是为了把male和female分开, 因为male_female_index_list是 [male, female顺序排列的] 277 | test_male_female_name_arr = np.array([np.array(male_name_list)[np.array(test_male_female_index_list).T[0]], 278 | np.array(female_name_list)[np.array(test_male_female_index_list).T[1]]]).T #转置是为了把male和female分开, 因为male_female_index_list是 [male, female顺序排列的] 279 | 280 | 281 | #Get specific farend and nearend speaker key for training sets 282 | train_male_male_nearend_spk_list = [train_male_name_arr[i][train_male_male_farend_choice[i] ^ 1] for i in range(len(train_male_name_arr))] 283 | train_male_male_farend_spk_list = [train_male_name_arr[i][train_male_male_farend_choice[i]] for i in range(len(train_male_name_arr))] 284 | train_female_female_nearend_spk_list = [train_female_name_arr[i][train_female_female_farend_choice[i] ^ 1] for i in range(len(train_female_name_arr))] 285 | train_female_female_farend_spk_list = [train_female_name_arr[i][train_female_female_farend_choice[i]] for i in range(len(train_female_name_arr))] 286 | train_male_female_nearend_spk_list = [train_male_female_name_arr[i][train_male_female_farend_choice[i] ^ 1] for i in range(len(train_male_female_name_arr))] 287 | train_male_female_farend_spk_list = [train_male_female_name_arr[i][train_male_female_farend_choice[i]] for i in range(len(train_male_female_name_arr))] 288 | 289 | test_male_male_nearend_spk_list = [test_male_name_arr[i][test_male_male_farend_choice[i] ^ 1] for i in range(len(test_male_name_arr))] 290 | test_male_male_farend_spk_list = [test_male_name_arr[i][test_male_male_farend_choice[i]] for i in range(len(test_male_name_arr))] 291 | test_female_female_nearend_spk_list = [test_female_name_arr[i][test_female_female_farend_choice[i] ^ 1] for i in range(len(test_female_name_arr))] 292 | test_female_female_farend_spk_list = [test_female_name_arr[i][test_female_female_farend_choice[i]] for i in range(len(test_female_name_arr))] 293 | test_male_female_nearend_spk_list = [test_male_female_name_arr[i][test_male_female_farend_choice[i] ^ 1] for i in range(len(test_male_female_name_arr))] 294 | test_male_female_farend_spk_list = [test_male_female_name_arr[i][test_male_female_farend_choice[i]] for i in range(len(test_male_female_name_arr))] 295 | 296 | #Generate specifc wav_pair for each data_pair 297 | male_male_train, male_male_validate = generate_gender_wav_pair(train_male_male_nearend_spk_list, train_male_male_farend_spk_list, male_dict, male_dict, 'train') 298 | female_female_train, female_female_validate = generate_gender_wav_pair(train_female_female_nearend_spk_list, train_female_female_farend_spk_list , female_dict, female_dict,'train') 299 | male_female_train, male_female_validate = generate_gender_wav_pair(train_male_female_nearend_spk_list, 
train_male_female_farend_spk_list, male_dict, female_dict, 'train') 300 | 301 | male_male_test = generate_gender_wav_pair(test_male_male_nearend_spk_list, test_male_male_farend_spk_list, male_dict, male_dict, 'test') 302 | female_female_test = generate_gender_wav_pair(test_female_female_nearend_spk_list, test_female_female_farend_spk_list , female_dict, female_dict,'test') 303 | male_female_test = generate_gender_wav_pair(test_male_female_nearend_spk_list, test_male_female_farend_spk_list, male_dict, female_dict, 'test') 304 | 305 | train_dataset = male_male_train + female_female_train + male_female_train 306 | validate_dataset = male_male_validate + female_female_validate + male_female_validate 307 | test_dataset = male_male_test + female_female_test + male_female_test 308 | 309 | return train_dataset, validate_dataset, test_dataset 310 | 311 | #all 3-type pairs will be merged in male_male_final_data with both train and validate 312 | 313 | # train_dict = {} 314 | # train_dict.update(male_male_final_data['train']) 315 | # train_dict.update(female_female_final_data['train']) 316 | # train_dict.update(male_female_final_data['train']) 317 | # 318 | # validate_dict = {} 319 | # validate_dict.update(male_male_final_data['validate']) 320 | # validate_dict.update(female_female_final_data['validate']) 321 | # validate_dict.update(male_female_final_data['validate']) 322 | 323 | # male_male_final_data['train'].update(female_female_final_data['train']) 324 | # male_male_final_data['train'].update(male_female_final_data['train']) 325 | # male_male_final_data['validate'].update(female_female_final_data['validate']) 326 | # male_male_final_data['validate'].update(male_female_final_data['validate']) 327 | # res_dict = male_male_final_data 328 | # return res_dict 329 | 330 | def get_rir_dict(rir_csv_path, conf): 331 | temp = pd.read_csv(rir_csv_path, skiprows=[1], sep=',', header=None, 332 | names=['wavfile', 'channel', 'T60_WB', 'C50_WB', 'isRealRIR']) 333 | #temp.keys() 334 | 335 | rir_wav = temp['wavfile'][1:] # 115413 336 | rir_channel = temp['channel'][1:] 337 | rir_t60 = temp['T60_WB'][1:] 338 | rir_isreal = temp['isRealRIR'][1:] 339 | 340 | rir_wav2 = [w.replace('\\', '/') for w in rir_wav] 341 | rir_channel2 = [w for w in rir_channel] 342 | rir_t60_2 = [w for w in rir_t60] 343 | rir_isreal2 = [w for w in rir_isreal] 344 | 345 | myrir = [] 346 | mychannel = [] 347 | myt60 = [] 348 | 349 | lower_t60 = conf['configs']['lower_t60'] 350 | upper_t60 = conf['configs']['upper_t60'] 351 | 352 | all_indices = [i for i, x in enumerate(rir_isreal2)] 353 | 354 | chosen_i = [] 355 | for i in all_indices: 356 | if (float(rir_t60_2[i]) >= lower_t60) and (float(rir_t60_2[i]) <= upper_t60): 357 | chosen_i.append(i) 358 | 359 | myrir = [rir_wav2[i] for i in chosen_i] 360 | mychannel = [rir_channel2[i] for i in chosen_i] 361 | myt60 = [rir_t60_2[i] for i in chosen_i] 362 | 363 | rir_dict = {"myrir":myrir, 'mychannel':mychannel, 'myt60':myt60} 364 | return rir_dict 365 | 366 | def get_rir_samples(rir_dict): 367 | myrir = rir_dict['myrir'] 368 | mychannel = rir_dict['mychannel'] 369 | myt60 = rir_dict['myt60'] 370 | # 371 | # 372 | rir_index = random.randint(0, len(myrir) - 1) 373 | my_rir = myrir[rir_index] 374 | 375 | while not os.path.exists(my_rir): 376 | rir_index = random.randint(0, len(myrir) - 1) 377 | my_rir = myrir[rir_index] 378 | 379 | samples_rir, fs_rir = sf.read(my_rir) 380 | 381 | my_channel = int(mychannel[rir_index]) 382 | 383 | if samples_rir.ndim == 1: 384 | samples_rir_ch = np.array(samples_rir) 
385 | elif my_channel > 1: 386 | samples_rir_ch = samples_rir[:, my_channel - 1] 387 | else: 388 | samples_rir_ch = samples_rir[:, my_channel - 1] 389 | 390 | return samples_rir_ch 391 | 392 | def get_noise_files(noise_path): 393 | sources_files_names = glob.glob(os.path.join(noise_path, "*.wav")) 394 | shuffle(sources_files_names) 395 | return sources_files_names 396 | 397 | def is_clipped(audio, clipping_threshold=0.99): 398 | return any(abs(audio) > clipping_threshold) 399 | 400 | def build_noise_audio( 401 | noise_path, 402 | fs=16000, 403 | audio_length=8, 404 | audio_samples_length=-1, 405 | silence_length = 0.2): 406 | '''Construct an audio signal from source files''' 407 | 408 | fs_output = fs 409 | if audio_samples_length == -1: 410 | audio_samples_length = int(audio_length*fs) 411 | 412 | output_audio = np.zeros(0) 413 | remaining_length = audio_samples_length 414 | files_used = [] 415 | clipped_files = [] 416 | 417 | source_files = glob.glob(os.path.join(noise_path, 418 | "*.wav")) 419 | shuffle(source_files) 420 | # pick a noise source file index randomly 421 | idx = np.random.randint(0, np.size(source_files)) 422 | 423 | # initialize silence 424 | silence = np.zeros(int(fs_output*silence_length)) 425 | 426 | # iterate through multiple clips until we have a long enough signal 427 | tries_left = MAXTRIES 428 | while remaining_length > 0 and tries_left > 0: 429 | 430 | # read next audio file and resample if necessary 431 | 432 | idx = (idx + 1) % np.size(source_files) #这里有种shift的感觉, 第0个是最后的时候才process的 433 | input_audio, fs_input = sf.read(source_files[idx]) 434 | if fs_input != fs_output: 435 | input_audio = librosa.resample(input_audio, fs_input, fs_output) 436 | 437 | # if current file is longer than remaining desired length, and this is 438 | # noise generation or this is training set, subsample it randomly 439 | if len(input_audio) > remaining_length: 440 | idx_seg = np.random.randint(0, len(input_audio)-remaining_length) 441 | input_audio = input_audio[idx_seg:idx_seg+remaining_length] 442 | 443 | # check for clipping, and if found move onto next file 444 | if is_clipped(input_audio): 445 | clipped_files.append(source_files[idx]) 446 | tries_left -= 1 447 | continue 448 | 449 | # concatenate current input audio to output audio stream 450 | files_used.append(source_files[idx]) 451 | output_audio = np.append(output_audio, input_audio) 452 | remaining_length -= len(input_audio) 453 | 454 | # add some silence if we have not reached desired audio length 455 | if remaining_length > 0: 456 | silence_len = min(remaining_length, len(silence)) 457 | output_audio = np.append(output_audio, silence[:silence_len]) 458 | remaining_length -= silence_len 459 | 460 | return output_audio, files_used, clipped_files 461 | 462 | def normalize(audio, target_level=-25): 463 | '''Normalize the signal to the target level''' 464 | rms = (audio ** 2).mean() ** 0.5 465 | scalar = 10 ** (target_level / 20) / (rms+EPS) 466 | audio = audio * scalar 467 | return audio 468 | 469 | def normalize_segmental_rms(audio, rms, target_level=-25): 470 | '''Normalize the signal to the target level 471 | based on segmental RMS''' 472 | scalar = 10 ** (target_level / 20) / (rms+EPS) 473 | audio = audio * scalar 474 | return audio 475 | 476 | 477 | def active_rms(clean, noise, fs=16000, energy_thresh=-50): 478 | '''Returns the clean and noise RMS of the noise calculated only in the active portions''' 479 | window_size = 100 # in ms 480 | window_samples = int(fs * window_size / 1000) 481 | sample_start = 0 482 | 
noise_active_segs = [] 483 | clean_active_segs = [] 484 | 485 | while sample_start < len(noise): 486 | sample_end = min(sample_start + window_samples, len(noise)) 487 | noise_win = noise[sample_start:sample_end] 488 | clean_win = clean[sample_start:sample_end] 489 | noise_seg_rms = 20 * np.log10((noise_win ** 2).mean() + EPS) 490 | # Considering frames with energy 491 | if noise_seg_rms > energy_thresh: 492 | noise_active_segs = np.append(noise_active_segs, noise_win) 493 | clean_active_segs = np.append(clean_active_segs, clean_win) 494 | sample_start += window_samples 495 | 496 | if len(noise_active_segs) != 0: 497 | noise_rms = (noise_active_segs ** 2).mean() ** 0.5 498 | else: 499 | noise_rms = EPS 500 | 501 | if len(clean_active_segs) != 0: 502 | clean_rms = (clean_active_segs ** 2).mean() ** 0.5 503 | else: 504 | clean_rms = EPS 505 | 506 | return clean_rms, noise_rms 507 | 508 | 509 | def segmental_snr_mixer(clean, noise, snr, target_level=-25, clipping_threshold=0.99, target_level_lower=-35, target_level_upper=-15): 510 | '''Function to mix clean speech and noise at various segmental SNR levels''' 511 | if len(clean) > len(noise): 512 | noise = np.append(noise, np.zeros(len(clean)-len(noise))) 513 | else: 514 | clean = np.append(clean, np.zeros(len(noise)-len(clean))) 515 | clean = clean/(max(abs(clean))+EPS) 516 | noise = noise/(max(abs(noise))+EPS) 517 | rmsclean, rmsnoise = active_rms(clean=clean, noise=noise) 518 | clean = normalize_segmental_rms(clean, rms=rmsclean, target_level=target_level) 519 | noise = normalize_segmental_rms(noise, rms=rmsnoise, target_level=target_level) 520 | # Set the noise level for a given SNR 521 | noisescalar = rmsclean / (10**(snr/20)) / (rmsnoise+EPS) 522 | noisenewlevel = noise * noisescalar 523 | 524 | # Mix noise and clean speech 525 | noisyspeech = clean + noisenewlevel 526 | # Randomly select RMS value between -15 dBFS and -35 dBFS and normalize noisyspeech with that value 527 | # There is a chance of clipping that might happen with very less probability, which is not a major issue. 528 | noisy_rms_level = np.random.randint(target_level_lower, target_level_upper) 529 | rmsnoisy = (noisyspeech**2).mean()**0.5 530 | scalarnoisy = 10 ** (noisy_rms_level / 20) / (rmsnoisy+EPS) 531 | noisyspeech = noisyspeech * scalarnoisy 532 | clean = clean * scalarnoisy 533 | noisenewlevel = noisenewlevel * scalarnoisy 534 | # Final check to see if there are any amplitudes exceeding +/- 1. 
If so, normalize all the signals accordingly 535 | if is_clipped(noisyspeech): 536 | noisyspeech_maxamplevel = max(abs(noisyspeech))/(clipping_threshold-EPS) 537 | noisyspeech = noisyspeech/noisyspeech_maxamplevel 538 | clean = clean/noisyspeech_maxamplevel 539 | noisenewlevel = noisenewlevel/noisyspeech_maxamplevel 540 | noisy_rms_level = int(20*np.log10(scalarnoisy/noisyspeech_maxamplevel*(rmsnoisy+EPS))) 541 | 542 | return clean, noisenewlevel, noisyspeech, noisy_rms_level 543 | 544 | def nearend_farend_mixer(nearend_data, echo_data, lowerbound_ser=-10, upperbound_ser=13, clipping_threshold=0.99): 545 | ser = np.random.uniform(lowerbound_ser, upperbound_ser) 546 | ser = float(format(ser, '.4f')) 547 | 548 | local_ser = SER(nearend_data, echo_data) 549 | # print(nearend[nearend_index], farend_dict[key], local_ser) 550 | # print(len(echo_data), len(farend_speech), len(nearend_data)) 551 | 552 | nearend_data = nearend_data / max(abs(nearend_data)) 553 | echo_data = echo_data / max(abs(echo_data)) 554 | 555 | nearend_rms = np.mean(nearend_data ** 2) ** 0.5 556 | echo_rms = np.mean(echo_data ** 2) ** 0.5 557 | 558 | echocalar = nearend_rms / (10 ** (ser / 10)) / (echo_rms + EPS) 559 | new_echo = echo_data * echocalar 560 | new_ser = SER(nearend_data, new_echo) 561 | 562 | nearend_mic = nearend_data + new_echo 563 | if is_clipped(nearend_mic): 564 | nearnendmic_maxamplevel = max(abs(nearend_mic)) / (clipping_threshold - EPS) 565 | nearend_mic = nearend_mic / nearnendmic_maxamplevel 566 | nearend_data = nearend_data / nearnendmic_maxamplevel 567 | new_echo = new_echo / nearnendmic_maxamplevel 568 | #echo_rms_level = int(10 * np.log10(echocalar / nearnendmic_maxamplevel * (echo_rms + EPS))) 569 | return nearend_mic, nearend_data, new_echo, ser 570 | 571 | 572 | ''' 573 | We follow the dataset allocation from AEC-CHALLENGE, 574 | The validation data are placed at first 300 fileid 575 | the remaining file are training sets 576 | 我们数据分布方式根据aec-challenge来,即前N个,在汪德凉老师的paper中为300,是验证集, 剩余的是训练集 577 | ''' 578 | # def generate_pair_audio( 579 | # train_dataset, 580 | # validate_dataset, 581 | # rir_path, 582 | # noise_dataset, 583 | # outputPath, 584 | # use_reverb=True, 585 | # sample_rate=16000, 586 | # audio_length=8): 587 | def generate_pair_audio(train_dataset, validate_dataset, conf): 588 | rir_path = conf['datasets']['rir_table'] ##using rir from DNS-challenge datasets 589 | noise_dataset = conf['datasets']['noise_path'] ##noise from DNS-challenge 590 | outputPath = conf['datasets']['output_path'] ##OUTPUT path for saving the generated datasets 591 | use_reverb = conf['configs']['use_reverb'] 592 | sample_rate = conf['configs']['samplerate'] 593 | audio_length = conf['configs']['audio_length'] 594 | outputPath = os.path.join(outputPath, 'train') 595 | if not os.path.exists(outputPath): 596 | os.makedirs(outputPath) 597 | csv_path = os.path.join(outputPath, "csv") 598 | if not os.path.exists(csv_path): 599 | os.makedirs(csv_path) 600 | 601 | nearend_speech_out = os.path.join(outputPath, "nearend_speech") 602 | farend_speech_out = os.path.join(outputPath, "farend_speech") 603 | echo_out = os.path.join(outputPath, "echo_signal") 604 | nearend_mic_out = os.path.join(outputPath, "nearend_mic") 605 | 606 | subdir_list = [nearend_speech_out, echo_out, nearend_mic_out, farend_speech_out] 607 | for i in subdir_list: 608 | if not os.path.exists(i): 609 | os.makedirs(i) 610 | 611 | 612 | count = 0 613 | audio_sample_length = sample_rate * audio_length 614 | 615 | farend_speech_name_1 = [] 616 | 
farend_speech_name_2 = [] 617 | farend_speech_name_3 = [] 618 | nearend_speech_name = [] 619 | filed_id = [] 620 | noise_file_name = [] 621 | 622 | noise_clipped_files = [] 623 | noise_source_files = [] 624 | split = [] 625 | ser_list = [] 626 | snr_list = [] 627 | 628 | 629 | 630 | rir_dict = get_rir_dict(rir_path, conf) 631 | 632 | # TODO: clean it up, make it in a function 633 | for i in range(len(validate_dataset)): 634 | print("validate = ", count) 635 | validate_data_pair = validate_dataset[i] 636 | validate_nearend_path = validate_data_pair[0] 637 | validate_farend_path_sets = validate_data_pair[1][np.random.randint(len(validate_data_pair[1]))] 638 | validate_nearend_speech, validate_sr = sf.read(validate_nearend_path) 639 | assert validate_sr == sample_rate 640 | validate_nearend_speech = signal_pad(validate_nearend_speech, audio_sample_length) 641 | validate_farend_speech = np.concatenate([sf.read(wav)[0] for wav in validate_farend_path_sets]) 642 | validate_farend_speech = signal_pad(validate_farend_speech, audio_sample_length) 643 | samples_rir_ch = get_rir_samples(rir_dict) 644 | validate_reverb_farend, validate_noreverb_farend = add_pyreverb(validate_farend_speech, samples_rir_ch, predelay=conf['configs']['predelay']) 645 | 646 | #TODO: 加噪,但有些部分我有争议, 可能不需要对输入做那么多的归一和scaling这个到时候实验看看 647 | validate_noise_audio, validate_noise_file, validate_noise_cf = build_noise_audio(noise_dataset, 648 | fs=sample_rate, 649 | audio_length=audio_length, 650 | audio_samples_length=-1, 651 | silence_length=conf['configs']['silence_length']) 652 | 653 | 654 | snr = np.random.randint(conf['configs']['lowerbound_snr'], conf['configs']['upperbound_snr']) 655 | snr_list.append(snr) 656 | if use_reverb: 657 | validate_farend_snr, validate_noise_snr, validate_echo_signal, target_level = segmental_snr_mixer(clean=validate_reverb_farend, 658 | noise=validate_noise_audio, 659 | snr=snr, 660 | target_level=conf['configs']['target_level'], 661 | clipping_threshold=conf['configs']['clipping_threshold'], 662 | target_level_lower=conf['configs']['target_level_lower'], 663 | target_level_upper=conf['configs']['target_level_upper']) 664 | else: 665 | validate_farend_snr, validate_noise_snr, validate_echo_signal, target_level = segmental_snr_mixer(clean=validate_noreverb_farend, 666 | noise=validate_noise_audio, 667 | snr=snr, 668 | target_level=conf['configs']['target_level'], 669 | clipping_threshold=conf['configs']['clipping_threshold'], 670 | target_level_lower=conf['configs']['target_level_lower'], 671 | target_level_upper=conf['configs']['target_level_upper']) 672 | 673 | validate_nearend_mic, validate_nearend_speech2, validate_echo_signal, validate_ser= nearend_farend_mixer(validate_nearend_speech, 674 | validate_echo_signal, 675 | lowerbound_ser=conf['configs']['lowerbound_ser'], 676 | upperbound_ser=conf['configs']['upperbound_ser'], 677 | clipping_threshold=conf['configs']['clipping_threshold']) 678 | ser_list.append(validate_ser) 679 | nearend_data_path = os.path.join(nearend_speech_out, "nearend_speech_fileid_{}.wav".format(count)) 680 | farend_data_path = os.path.join(farend_speech_out, "farend_speech_fileid_{}.wav".format(count)) 681 | nearend_mic_data_path = os.path.join(nearend_mic_out, "nearend_mic_fileid_{}.wav".format(count)) 682 | echo_data_path = os.path.join(echo_out, "echo_fileid_{}.wav".format(count)) 683 | if use_reverb: 684 | audio_signals = [validate_nearend_speech2, validate_farend_speech, validate_nearend_mic, validate_echo_signal] 685 | else: 686 | audio_signals = 
[validate_nearend_speech2, validate_farend_speech, validate_nearend_mic, validate_echo_signal] 687 | file_paths = [nearend_data_path, farend_data_path, nearend_mic_data_path, echo_data_path] 688 | for k in range(len(audio_signals)): 689 | try: 690 | pass 691 | sf.write(file_paths[k], audio_signals[k], sample_rate) 692 | except Exception as e: 693 | print(str(e)) 694 | 695 | noise_clipped_files += validate_noise_cf 696 | noise_source_files += validate_noise_file 697 | hyphen = '-' 698 | noise_source_filenamesonly = [i[:-4].split(os.path.sep)[-1] for i in validate_noise_file] 699 | 700 | noise_file_name.append(hyphen.join(noise_source_filenamesonly)[:MAXFILELEN]) 701 | farend_speech_name_1.append(validate_farend_path_sets[0]) 702 | farend_speech_name_2.append(validate_farend_path_sets[1]) 703 | farend_speech_name_3.append(validate_farend_path_sets[2]) 704 | nearend_speech_name.append(validate_nearend_path) 705 | split.append('validate') 706 | filed_id.append(count) 707 | count += 1 708 | 709 | 710 | for i in range(len(train_dataset)): ##TODO 为了方便就直接复制下来了,后期要改成函数 711 | data_pair = train_dataset[i] 712 | 713 | nearend_speech_path = data_pair[0] 714 | farend_speech_list = data_pair[1] 715 | nearend_speech, sr = sf.read(nearend_speech_path) 716 | assert sr == sample_rate 717 | nearend_speech = signal_pad(nearend_speech, audio_sample_length) 718 | for j in range(len(farend_speech_list)): 719 | 720 | three_farend_sets = farend_speech_list[j] 721 | 722 | farend_speech = np.concatenate([sf.read(wav)[0] for wav in three_farend_sets]) 723 | #print(len(farend_speech), [len(sf.read(k)[0]) for k in three_farend_sets], np.array([len(sf.read(k)[0]) for k in three_farend_sets]).sum()) 检查是否被concate到一起了 724 | 725 | farend_speech = signal_pad(farend_speech, audio_sample_length) 726 | samples_rir_ch = get_rir_samples(rir_dict) 727 | reverb_farend, noreverb_farend = add_pyreverb(farend_speech, samples_rir_ch, predelay=conf['configs']['predelay']) 728 | #noise_sample, noise_sr = sf.read(noise_files[np.random.randint(0, np.size(noise_files))]) 729 | noise_audio, noise_file, noise_cf = build_noise_audio(noise_dataset, 730 | fs=sample_rate, 731 | audio_length=audio_length, 732 | audio_samples_length=-1, 733 | silence_length=conf['configs']['silence_length']) 734 | 735 | snr = np.random.randint(conf['configs']['lowerbound_snr'], conf['configs']['upperbound_snr']) 736 | snr_list.append(snr) 737 | if use_reverb: 738 | farend_snr, noise_snr, echo_signal, target_level = segmental_snr_mixer(clean=reverb_farend, 739 | noise=noise_audio, 740 | snr=snr, 741 | target_level=conf['configs']['target_level'], 742 | clipping_threshold=conf['configs']['clipping_threshold'], 743 | target_level_lower=conf['configs']['target_level_lower'], 744 | target_level_upper=conf['configs']['target_level_upper']) 745 | 746 | else: 747 | farend_snr, noise_snr, echo_signal, target_level = segmental_snr_mixer(clean=noreverb_farend, 748 | noise=noise_audio, 749 | snr=snr, 750 | target_level=conf['configs']['target_level'], 751 | clipping_threshold=conf['configs']['clipping_threshold'], 752 | target_level_lower=conf['configs']['target_level_lower'], 753 | target_level_upper=conf['configs']['target_level_upper']) 754 | 755 | nearend_mic, nearend_speech2, echo_signal, ser = nearend_farend_mixer(nearend_speech, 756 | echo_signal, 757 | lowerbound_ser=conf['configs']['lowerbound_ser'], 758 | upperbound_ser=conf['configs']['upperbound_ser'], 759 | clipping_threshold=conf['configs']['clipping_threshold']) 760 | ser_list.append(ser) 761 | 
#print("%%%%%%%%%%%Processing fileid%%%%%%%%%%%: {}".format(count)) 762 | 763 | nearend_data_path = os.path.join(nearend_speech_out, "nearend_speech_fileid_{}.wav".format(count)) 764 | farend_data_path = os.path.join(farend_speech_out, "farend_speech_fileid_{}.wav".format(count)) 765 | nearend_mic_data_path = os.path.join(nearend_mic_out, "nearend_mic_fileid_{}.wav".format(count)) 766 | echo_data_path = os.path.join(echo_out, "echo_fileid_{}.wav".format(count)) 767 | if use_reverb: 768 | audio_signals = [nearend_speech2, farend_speech, nearend_mic, echo_signal] 769 | else: 770 | audio_signals = [nearend_speech2, noreverb_farend, nearend_mic, echo_signal] 771 | file_paths = [nearend_data_path, farend_data_path, nearend_mic_data_path, echo_data_path] 772 | for k in range(len(audio_signals)): 773 | try: 774 | 775 | sf.write(file_paths[k], audio_signals[k], sample_rate) 776 | except Exception as e: 777 | print(str(e)) 778 | print("train = ", count) 779 | noise_clipped_files += noise_cf 780 | noise_source_files += noise_file 781 | hyphen = '-' 782 | noise_source_filenamesonly = [i[:-4].split(os.path.sep)[-1] for i in noise_file] 783 | 784 | noise_file_name.append(hyphen.join(noise_source_filenamesonly)[:MAXFILELEN]) 785 | farend_speech_name_1.append(three_farend_sets[0]) 786 | farend_speech_name_2.append(three_farend_sets[1]) 787 | farend_speech_name_3.append(three_farend_sets[2]) 788 | nearend_speech_name.append(nearend_speech_path) 789 | filed_id.append(count) 790 | split.append('train') 791 | count += 1 792 | print(len(nearend_speech_name), len(farend_speech_name_1), len(farend_speech_name_2), len(farend_speech_name_3), len(noise_file_name), len(filed_id), len(snr_list), len(ser_list)) 793 | dataFrame = pd.DataFrame({'nearend_speech_path': nearend_speech_name, 'farend_speech_path_1':farend_speech_name_1, 'farend_speech_path_2': farend_speech_name_2, 794 | 'farend_speech_path_3': farend_speech_name_3, 'noise_file_path':noise_file_name, 'filed_id':filed_id, 'snr':snr_list, 'ser':ser_list}) 795 | dataFrame.to_csv(os.path.join(csv_path, conf['configs']['csv_file_name']), index=False, sep=',') 796 | 797 | 798 | def main(args): 799 | #TODO: This is a draft procesing script, will update and make it clean after 800 | # 801 | with open(args.conf, "r") as f: 802 | conf = json.load(f) 803 | 804 | 805 | train_dataset, validate_dataset, test_dataset = get_data_pair(conf['datasets']['timit_data_path'], 806 | conf 807 | ) 808 | generate_pair_audio(train_dataset, validate_dataset, conf) 809 | 810 | if __name__ == '__main__': 811 | parser = argparse.ArgumentParser( 812 | description="Configuration for timit data preparation" 813 | ) 814 | parser.add_argument( 815 | "-conf", 816 | type=str, 817 | required=True, 818 | help="configuration for timit data preparation" 819 | ) 820 | args = parser.parse_args() 821 | main(args) 822 | --------------------------------------------------------------------------------