├── README.md ├── data.json ├── evaluation.py └── timit_pre_process.py /README.md: -------------------------------------------------------------------------------- 1 | 2 | TIMIT data processing for DNN-based AEC (acoustic echo cancellation) experiments 3 | ============================== 4 | This repo follows the data setup from [Deep Learning for Acoustic Echo Cancellation in Noisy and Double-Talk Scenarios](https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1484.pdf). 5 | 6 | This is a draft script; I will clean it up and move all changeable configurations into a JSON file so that it is easier to use. 7 | 8 | By the way, if you want to do some work on deep-learning AEC, I recommend using the far-end data from the AEC-Challenge and mixing it with other clean open-source datasets. 9 | 10 | Notification 11 | ============ 12 | 13 | References: 14 | 15 | Paper: [Deep Learning for Acoustic Echo Cancellation in Noisy and Double-Talk Scenarios](https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1484.pdf) 16 | 17 | DNS-CHALLENGE: [INTERSPEECH 2021 Deep Noise Suppression Challenge](https://arxiv.org/pdf/2101.01902.pdf) 18 | DNS-CHALLENGE CODE: [INTERSPEECH 2021 Deep Noise Suppression Challenge](https://github.com/microsoft/DNS-Challenge) 19 | 20 | AEC-CHALLENGE: [ICASSP 2021 ACOUSTIC ECHO CANCELLATION CHALLENGE: DATASETS, TESTING FRAMEWORK, AND RESULTS](https://arxiv.org/pdf/2009.04972.pdf) 21 | AEC-CHALLENGE CODE: [ICASSP 2021 ACOUSTIC ECHO CANCELLATION CHALLENGE: DATASETS, TESTING FRAMEWORK, AND RESULTS](https://github.com/microsoft/AEC-Challenge) 22 | 23 | 24 | How to use 25 | ========== 26 | 1. Edit the paths in __data.json__ (__timit_data_path__, __noise_path__, __output_path__ and __rir_table__) according to your setup; note that __rir_table__ (RIR_table_simple.csv) comes from the DNS-CHALLENGE repository linked above 27 | 28 | 2. python timit_pre_process.py -conf data.json 29 | 30 | Last Modification 31 | ============ 32 | 33 | 1. add json 34 | 2. randomly pad signal to certain length 35 | 3. 
add non-linear distortion 36 | -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | { 2 | "datasets": { 3 | "timit_data_path": "/home/yongyug/data/timit/TIMIT", 4 | "noise_path": "/home/yongyug/data/aec_challenge/datasets/noise", 5 | "rir_table": "/home/yongyug/data/aec_challenge/datasets/acoustic_params/RIR_table_simple.csv", 6 | "output_path": "/home/yongyug/data/timit_aec_output" 7 | }, 8 | "configs": { 9 | "audio_length": 8, 10 | "samplerate": 16000, 11 | "use_reverb": true, 12 | "clipping_threshold": 0.99, 13 | "lowerbound_ser": -10, 14 | "upperbound_ser": 13, 15 | "lowerbound_snr": -5, 16 | "upperbound_snr": 20, 17 | "target_level_lower": -35, 18 | "target_level_upper": -15, 19 | "target_level": -25, 20 | "lower_t60": 0.6, 21 | "upper_t60": 1.3, 22 | "predelay": 50, 23 | "silence_length": 0.2, 24 | "add_nonlinear": true, 25 | "train": { 26 | "same_gender_pair": 30, 27 | "diff_gender_pair": 40, 28 | "csv_file_name": "train.csv" 29 | }, 30 | "test": { 31 | "same_gender_pair": 3, 32 | "diff_gender_pair": 4, 33 | "csv_file_name": "test.csv" 34 | } 35 | 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import soundfile as sf 3 | import os 4 | # ERLE (echo return loss enhancement): energy ratio in dB between the near-end microphone signal and the residual/error signal left after echo cancellation 5 | def ERLE(nearend_mic_signal, error_signal): 6 | erle = 10 * np.log10( 7 | np.mean(nearend_mic_signal**2) / np.mean( error_signal **2) 8 | ) 9 | return erle 10 | # SER (signal-to-echo ratio): computed here on RMS values with a 10*log10 scale, matching the SER scaling used in nearend_farend_mixer in timit_pre_process.py 11 | def SER(nearend_speech, far_echo): 12 | return 10 * np.log10(((nearend_speech ** 2 ).mean()**0.5) / (far_echo **2).mean()**0.5) 13 | 14 | if __name__ == "__main__": 15 | 16 | fileid = 9999 17 | nearend_mic_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/nearend_mic_signal/nearend_mic_fileid_{}.wav".format(fileid) 18 | nearend_speech_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/nearend_speech/nearend_speech_fileid_{}.wav".format(fileid) 19 | error_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/filter_out/mixdata_fileid_{}.wav".format(fileid) 20 | echo_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/echo_signal/echo_fileid_{}.wav".format(fileid) 21 | 22 | nlp_path = "/home/yongyug/data/aec_challenge/datasets/synthetic/nearend_mic_mix_farend_speech_signal/mixdata_fileid_{}_aec_native.wav".format(fileid) 23 | 24 | nearend_mic_signal, sr = sf.read(nearend_mic_path) 25 | error_signal, _ = sf.read(error_path) 26 | echo_signal, _ = sf.read(echo_path) 27 | nearend_speech, _ = sf.read(nearend_speech_path) 28 | 29 | 30 | nlp_signal, _ = sf.read(nlp_path) 31 | 32 | erle_nonlp = ERLE(nearend_mic_signal, error_signal) 33 | erle_nlp = ERLE(nearend_mic_signal, nlp_signal) 34 | print(erle_nonlp) 35 | print(erle_nlp) 36 | 37 | ser = SER(nearend_speech, echo_signal) 38 | print(ser) -------------------------------------------------------------------------------- /timit_pre_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This process is strictly following experimental data 
setup from paper: 3 | 4 | <> 5 | 6 | ''' 7 | 8 | import argparse 9 | import glob 10 | import json 11 | import librosa 12 | import os 13 | import random 14 | import pandas as pd 15 | import numpy as np 16 | from random import shuffle 17 | import soundfile as sf 18 | from scipy import signal 19 | from evaluation import SER 20 | 21 | MAXTRIES = 50 22 | MAXFILELEN = 50 23 | # np.random.seed(9999) 24 | # random.seed(9999) 25 | EPS = np.finfo(float).eps 26 | 27 | def get_single_gender_index_list(data_list, num_pair=30): #获取不重复的单性别说话人对 28 | index_list = [] 29 | seen_list = [] 30 | i = 0 31 | while i < num_pair: 32 | index_set = list(np.random.randint(0, len(data_list), 2)) 33 | 34 | if index_set[0] not in seen_list and index_set[1] not in seen_list: 35 | index_list.append(index_set) 36 | seen_list.append(index_set[0]) 37 | seen_list.append(index_set[1]) 38 | 39 | i += 1 40 | 41 | return index_list, seen_list 42 | 43 | def get_double_gender_index_list(male_list, female_list, male_seen_list, female_seenlist, num_pair=40): 44 | male_female_index_list = [] 45 | 46 | i = 0 47 | while i < num_pair: 48 | male_index = np.random.randint(0, len(male_list)) 49 | female_index = np.random.randint(0, len(female_list)) 50 | index_set = [male_index, female_index] 51 | 52 | if male_index not in male_seen_list and female_index not in female_seenlist: 53 | male_female_index_list.append(index_set) 54 | male_seen_list.append(male_index) 55 | female_seenlist.append(female_index) 56 | i += 1 57 | return male_female_index_list, male_seen_list, female_seenlist 58 | 59 | def get_gender_index_list(male_list, female_list, conf): 60 | # male_male_index_list = [] 61 | # female_female_index_list = [] 62 | # male_female_index_list = [] 63 | male_seen_list = [] 64 | female_seen_list = [] 65 | 66 | male_male_index_list, temp_male_seenlist = get_single_gender_index_list(male_list, conf['configs']['samle_gender_pair']) 67 | female_female_index_list, temp_female_seenlist = get_single_gender_index_list(male_list, conf['configs']['samle_gender_pair']) 68 | male_female_index_list, male_seen_list, female_seen_list = get_double_gender_index_list(male_list, female_list, temp_male_seenlist, temp_female_seenlist, conf['configs']['diff_gender_pair']) 69 | return male_male_index_list, female_female_index_list, male_female_index_list 70 | 71 | 72 | def random_three_nonrepeat_sample(data_len): 73 | three_sample_list = [] 74 | for i in range(data_len): 75 | for j in range(i + 1, data_len): 76 | for k in range(j + 1, data_len): 77 | three_sample_list.append([i, j, k]) 78 | 79 | return three_sample_list 80 | 81 | def add_pyreverb(clean_speech, rir, predelay=50): 82 | predelay = predelay 83 | early_delay_samples = (predelay * 16000) // 1000 84 | early_rir = rir[:early_delay_samples] 85 | 86 | reverb_speech = signal.fftconvolve(clean_speech, rir, mode="full") 87 | noreverb_speech = signal.fftconvolve(clean_speech, early_rir, mode="full") 88 | 89 | # make reverb_speech same length as clean_speech 90 | reverb_speech = reverb_speech[0 : clean_speech.shape[0]] 91 | noreverb_speech = noreverb_speech[0 : clean_speech.shape[0]] 92 | 93 | return reverb_speech, noreverb_speech 94 | 95 | def signal_pad(signal, audio_sample_length): 96 | 97 | 98 | 99 | # if len(signal) < audio_sample_length: # 设定一个统一的长度,如果长度不够, 则前后补零 100 | # if len(signal) % 2 == 0: 101 | # signal = np.pad(signal, ((audio_sample_length - len(signal)) // 2, 102 | # (audio_sample_length - len(signal)) // 2), 'constant', 103 | # constant_values=(0, 0)) 104 | # elif len(signal) % 2 != 0: 105 | # 
signal = np.pad(signal, ((audio_sample_length - len(signal)) // 2, 106 | # (audio_sample_length - len(signal)) // 2 + len( 107 | # signal) % 2), 'constant', 108 | # constant_values=(0, 0)) # 无法被2整除则把余数补零至最后 109 | if len(signal) < audio_sample_length: 110 | diff_len = audio_sample_length - len(signal) 111 | padfront = np.random.randint(diff_len) 112 | signal = np.pad(signal, (padfront, diff_len - padfront), 'constant', constant_values=(0, 0)) ##randomly pad in front and end 113 | 114 | elif len(signal) >= audio_sample_length: 115 | signal = signal[:audio_sample_length] 116 | return signal 117 | 118 | # def generate_single_gender_wav_pair(nearend_data_list, farend_data_list, data_dict, pairname): 119 | def generate_gender_wav_pair(nearend_data_list, farend_data_list, data_dict1, data_dict2, pairname): 120 | 121 | farend_three_sample_index = random_three_nonrepeat_sample(10) # 将所有farend不重复的组合list列出来 122 | 123 | 124 | train_res_list = [] 125 | validate_res_list = [] 126 | test_res_list = [] 127 | count = 0 128 | for i in range(len(nearend_data_list)): 129 | nearend_spk = nearend_data_list[i] 130 | farend_spk = farend_data_list[i] 131 | if nearend_spk[0] == 'M': 132 | nearend_spk_wav = data_dict1[nearend_spk] 133 | else: 134 | nearend_spk_wav = data_dict2[nearend_spk] 135 | if farend_spk[0] == 'M': 136 | farend_spk_wav = data_dict1[farend_spk] 137 | else: 138 | farend_spk_wav = data_dict2[farend_spk] 139 | 140 | 141 | nearend_select_index = np.arange(10) ## 每个人10条语音, 做一个随机 142 | np.random.shuffle(nearend_select_index) # for nearend_spk in nearend_spk_list: 143 | 144 | nearend_wav_pick = np.array(nearend_spk_wav)[nearend_select_index] 145 | if pairname.upper() == 'TRAIN': 146 | farend_group = [[i for i in range(j * 5, (j + 1) * 5)] for j in range(len(nearend_select_index))] # 这里是对于nearend来说, 渠道每个wav对应farend的index 147 | elif pairname.upper() == 'TEST': 148 | farend_group = [[i for i in range(j * 1, (j + 1) * 1)] for j in range(len(nearend_select_index))] # 这里是对于nearend来说, 渠道每个wav对应farend的index 149 | 150 | random.shuffle(farend_three_sample_index) # 把farend的三元list随机一下 151 | farend_select_index = np.array(farend_three_sample_index)[np.array(farend_group)] # 把每个nearend选择的farend取出来 152 | farend_wav_pick = np.array(farend_spk_wav)[farend_select_index] 153 | if pairname.upper() == 'TRAIN': 154 | for k in range(len(nearend_wav_pick)): 155 | if k < 7: 156 | count += 1 157 | #print(nearend_wav_pick[k], farend_wav_pick[k]) 158 | train_res_list.append((nearend_wav_pick[k], farend_wav_pick[k])) 159 | 160 | else: 161 | validate_res_list.append((nearend_wav_pick[k], farend_wav_pick[k])) 162 | else: 163 | for k in range(len(nearend_wav_pick)): 164 | test_res_list.append((nearend_wav_pick[k], farend_wav_pick[k])) 165 | if pairname.upper() == 'TRAIN': 166 | return train_res_list, validate_res_list 167 | else: 168 | return test_res_list 169 | 170 | def add_nonlinear_distortion(farend_signal, hard=True): 171 | alpha = 0.8 172 | x_max = alpha * np.max(np.abs(farend_signal)) 173 | ### x_max is the maximum value of output signal 174 | ### I set it 0.8 scale to the maximum of the input signal 175 | ### you can set your x_max yourself 176 | 177 | if hard: 178 | farend_signal[farend_signal < -x_max] = -x_max 179 | farend_signal[farend_signal > x_max] = x_max 180 | xn = farend_signal 181 | else: 182 | 183 | x_soft = (x_max * farend_signal) / ((np.abs(x_max) ** 2 + np.abs(farend_signal) ** 2) ** (1 / 2)) 184 | xn = x_soft 185 | 186 | 187 | 188 | sigmoid_gian = 0.4 189 | bn = 1.5 * xn - 0.3 * xn ** 2 190 | alpha = [4 if i > 
0 else 0.5 for i in bn] 191 | x_nl = sigmoid_gian * ((2 / (1 + np.exp(alpha * bn))) - 1) 192 | 193 | return x_nl 194 | 195 | 196 | 197 | def get_data_pair( 198 | dataPath, 199 | conf, 200 | ): 201 | 202 | male_dict = {} 203 | female_dict = {} 204 | count = 0 205 | 206 | # walk through TIMIT and collect each male / female speaker's wav files separately 207 | for root, _, files in os.walk(dataPath): 208 | for file in files: 209 | if file.endswith('WAV'): 210 | count += 1 211 | dataType, spk = root.split(os.path.sep)[-3], root.split(os.path.sep)[-1] 212 | gender = spk[0] 213 | if gender == "M": 214 | if spk not in male_dict.keys(): 215 | male_dict[spk] = [] 216 | male_dict[spk].append(os.path.join(root, file)) 217 | elif gender == "F": 218 | if spk not in female_dict.keys(): 219 | female_dict[spk] = [] 220 | female_dict[spk].append(os.path.join(root, file)) 221 | 222 | 223 | 224 | male_name_list = list(male_dict.keys()) 225 | female_name_list = list(female_dict.keys()) 226 | male_name_index_list = [i for i in range(len(male_name_list))] 227 | female_name_index_list = [i for i in range(len(female_name_list))] 228 | 229 | # Randomly draw the training speaker pairs; the pair counts come from configs.train in data.json 230 | train_male_male_index_list, temp_maleseen = get_single_gender_index_list(male_name_list, num_pair=conf["configs"]['train']['same_gender_pair']) 231 | train_female_female_index_list, temp_female_seen = get_single_gender_index_list(female_name_list, num_pair=conf["configs"]['train']['same_gender_pair']) 232 | train_male_female_index_list, male_seen_list, female_seen_list = get_double_gender_index_list(male_name_list, female_name_list, temp_maleseen, temp_female_seen, num_pair=conf["configs"]['train']['diff_gender_pair']) 233 | # get_gender_index_list could produce these three lists in a single call 234 | 235 | # Get the remaining speaker indices for the test sets 236 | rest_male_list = [i for i in male_name_index_list if i not in male_seen_list] 237 | rest_female_list = [i for i in female_name_index_list if i not in female_seen_list] 238 | test_male_male_index_list, temp_maleseen = get_single_gender_index_list(rest_male_list, num_pair=conf["configs"]['test']['same_gender_pair']) 239 | test_female_female_index_list, temp_female_seen = get_single_gender_index_list(rest_female_list, num_pair=conf["configs"]['test']['same_gender_pair']) 240 | test_male_female_index_list, male_seen_list, female_seen_list = get_double_gender_index_list(rest_male_list, rest_female_list, temp_maleseen, temp_female_seen, num_pair=conf["configs"]['test']['diff_gender_pair']) 241 | 242 | # Randomly decide which speaker of each pair acts as the far-end speaker 243 | train_male_female_farend_choice = np.random.randint(0, 2, len(train_male_female_index_list)) # 0/1 choice of the far-end speaker within the pair 244 | train_male_male_farend_choice = np.random.randint(0, 2, len(train_male_male_index_list)) 245 | train_female_female_farend_choice = np.random.randint(0, 2, len(train_female_female_index_list)) 246 | 247 | test_male_female_farend_choice = np.random.randint(0, 2, len(test_male_female_index_list)) # 0/1 choice of the far-end speaker within the pair 248 | test_male_male_farend_choice = np.random.randint(0, 2, len(test_male_male_index_list)) 249 | test_female_female_farend_choice = np.random.randint(0, 2, len(test_female_female_index_list)) 250 | 251 | 252 | # print(male_male_index_list) 253 | # print(male_female_index_list) 254 | # for i in male_male_index_list: 255 | # for j in male_female_index_list: 256 | # if i[0] == j[0] or i[1] == j[0]: 257 | # print(i,j) 258 | # print(female_female_index_list) 259 | # print(male_female_index_list) 260 | # for i in female_female_index_list: 261 | # 
for j in male_female_index_list: 262 | # if i[0] == j[1] or i[1] == j[1]: 263 | # print(i,j) 264 | 265 | 266 | #Get the speaker name from training and testdata-pair 267 | male_name_arr = np.array(male_name_list) 268 | test_male_name_arr = male_name_arr[np.array(test_male_male_index_list)] 269 | train_male_name_arr = male_name_arr[np.array(train_male_male_index_list)] 270 | 271 | female_name_arr = np.array(female_name_list) 272 | test_female_name_arr = female_name_arr[np.array(test_female_female_index_list)] 273 | train_female_name_arr = female_name_arr[np.array(train_female_female_index_list)] 274 | 275 | train_male_female_name_arr = np.array([np.array(male_name_list)[np.array(train_male_female_index_list).T[0]], 276 | np.array(female_name_list)[np.array(train_male_female_index_list).T[1]]]).T #转置是为了把male和female分开, 因为male_female_index_list是 [male, female顺序排列的] 277 | test_male_female_name_arr = np.array([np.array(male_name_list)[np.array(test_male_female_index_list).T[0]], 278 | np.array(female_name_list)[np.array(test_male_female_index_list).T[1]]]).T #转置是为了把male和female分开, 因为male_female_index_list是 [male, female顺序排列的] 279 | 280 | 281 | #Get specific farend and nearend speaker key for training sets 282 | train_male_male_nearend_spk_list = [train_male_name_arr[i][train_male_male_farend_choice[i] ^ 1] for i in range(len(train_male_name_arr))] 283 | train_male_male_farend_spk_list = [train_male_name_arr[i][train_male_male_farend_choice[i]] for i in range(len(train_male_name_arr))] 284 | train_female_female_nearend_spk_list = [train_female_name_arr[i][train_female_female_farend_choice[i] ^ 1] for i in range(len(train_female_name_arr))] 285 | train_female_female_farend_spk_list = [train_female_name_arr[i][train_female_female_farend_choice[i]] for i in range(len(train_female_name_arr))] 286 | train_male_female_nearend_spk_list = [train_male_female_name_arr[i][train_male_female_farend_choice[i] ^ 1] for i in range(len(train_male_female_name_arr))] 287 | train_male_female_farend_spk_list = [train_male_female_name_arr[i][train_male_female_farend_choice[i]] for i in range(len(train_male_female_name_arr))] 288 | 289 | test_male_male_nearend_spk_list = [test_male_name_arr[i][test_male_male_farend_choice[i] ^ 1] for i in range(len(test_male_name_arr))] 290 | test_male_male_farend_spk_list = [test_male_name_arr[i][test_male_male_farend_choice[i]] for i in range(len(test_male_name_arr))] 291 | test_female_female_nearend_spk_list = [test_female_name_arr[i][test_female_female_farend_choice[i] ^ 1] for i in range(len(test_female_name_arr))] 292 | test_female_female_farend_spk_list = [test_female_name_arr[i][test_female_female_farend_choice[i]] for i in range(len(test_female_name_arr))] 293 | test_male_female_nearend_spk_list = [test_male_female_name_arr[i][test_male_female_farend_choice[i] ^ 1] for i in range(len(test_male_female_name_arr))] 294 | test_male_female_farend_spk_list = [test_male_female_name_arr[i][test_male_female_farend_choice[i]] for i in range(len(test_male_female_name_arr))] 295 | 296 | #Generate specifc wav_pair for each data_pair 297 | male_male_train, male_male_validate = generate_gender_wav_pair(train_male_male_nearend_spk_list, train_male_male_farend_spk_list, male_dict, male_dict, 'train') 298 | female_female_train, female_female_validate = generate_gender_wav_pair(train_female_female_nearend_spk_list, train_female_female_farend_spk_list , female_dict, female_dict,'train') 299 | male_female_train, male_female_validate = generate_gender_wav_pair(train_male_female_nearend_spk_list, 
train_male_female_farend_spk_list, male_dict, female_dict, 'train') 300 | 301 | male_male_test = generate_gender_wav_pair(test_male_male_nearend_spk_list, test_male_male_farend_spk_list, male_dict, male_dict, 'test') 302 | female_female_test = generate_gender_wav_pair(test_female_female_nearend_spk_list, test_female_female_farend_spk_list , female_dict, female_dict,'test') 303 | male_female_test = generate_gender_wav_pair(test_male_female_nearend_spk_list, test_male_female_farend_spk_list, male_dict, female_dict, 'test') 304 | 305 | train_dataset = male_male_train + female_female_train + male_female_train 306 | validate_dataset = male_male_validate + female_female_validate + male_female_validate 307 | test_dataset = male_male_test + female_female_test + male_female_test 308 | 309 | return train_dataset, validate_dataset, test_dataset 310 | 311 | #all 3-type pairs will be merged in male_male_final_data with both train and validate 312 | 313 | # train_dict = {} 314 | # train_dict.update(male_male_final_data['train']) 315 | # train_dict.update(female_female_final_data['train']) 316 | # train_dict.update(male_female_final_data['train']) 317 | # 318 | # validate_dict = {} 319 | # validate_dict.update(male_male_final_data['validate']) 320 | # validate_dict.update(female_female_final_data['validate']) 321 | # validate_dict.update(male_female_final_data['validate']) 322 | 323 | # male_male_final_data['train'].update(female_female_final_data['train']) 324 | # male_male_final_data['train'].update(male_female_final_data['train']) 325 | # male_male_final_data['validate'].update(female_female_final_data['validate']) 326 | # male_male_final_data['validate'].update(male_female_final_data['validate']) 327 | # res_dict = male_male_final_data 328 | # return res_dict 329 | 330 | def get_rir_dict(rir_csv_path, conf): 331 | temp = pd.read_csv(rir_csv_path, skiprows=[1], sep=',', header=None, 332 | names=['wavfile', 'channel', 'T60_WB', 'C50_WB', 'isRealRIR']) 333 | #temp.keys() 334 | 335 | rir_wav = temp['wavfile'][1:] # 115413 336 | rir_channel = temp['channel'][1:] 337 | rir_t60 = temp['T60_WB'][1:] 338 | rir_isreal = temp['isRealRIR'][1:] 339 | 340 | rir_wav2 = [w.replace('\\', '/') for w in rir_wav] 341 | rir_channel2 = [w for w in rir_channel] 342 | rir_t60_2 = [w for w in rir_t60] 343 | rir_isreal2 = [w for w in rir_isreal] 344 | 345 | myrir = [] 346 | mychannel = [] 347 | myt60 = [] 348 | 349 | lower_t60 = conf['configs']['lower_t60'] 350 | upper_t60 = conf['configs']['upper_t60'] 351 | 352 | all_indices = [i for i, x in enumerate(rir_isreal2)] 353 | 354 | chosen_i = [] 355 | for i in all_indices: 356 | if (float(rir_t60_2[i]) >= lower_t60) and (float(rir_t60_2[i]) <= upper_t60): 357 | chosen_i.append(i) 358 | 359 | myrir = [rir_wav2[i] for i in chosen_i] 360 | mychannel = [rir_channel2[i] for i in chosen_i] 361 | myt60 = [rir_t60_2[i] for i in chosen_i] 362 | 363 | rir_dict = {"myrir":myrir, 'mychannel':mychannel, 'myt60':myt60} 364 | return rir_dict 365 | 366 | def get_rir_samples(rir_dict): 367 | myrir = rir_dict['myrir'] 368 | mychannel = rir_dict['mychannel'] 369 | myt60 = rir_dict['myt60'] 370 | # 371 | # 372 | rir_index = random.randint(0, len(myrir) - 1) 373 | my_rir = myrir[rir_index] 374 | 375 | while not os.path.exists(my_rir): 376 | rir_index = random.randint(0, len(myrir) - 1) 377 | my_rir = myrir[rir_index] 378 | 379 | samples_rir, fs_rir = sf.read(my_rir) 380 | 381 | my_channel = int(mychannel[rir_index]) 382 | 383 | if samples_rir.ndim == 1: 384 | samples_rir_ch = np.array(samples_rir) 
385 | elif my_channel > 1: 386 | samples_rir_ch = samples_rir[:, my_channel - 1] 387 | else: 388 | samples_rir_ch = samples_rir[:, my_channel - 1] 389 | 390 | return samples_rir_ch 391 | 392 | def get_noise_files(noise_path): 393 | sources_files_names = glob.glob(os.path.join(noise_path, "*.wav")) 394 | shuffle(sources_files_names) 395 | return sources_files_names 396 | 397 | def is_clipped(audio, clipping_threshold=0.99): 398 | return any(abs(audio) > clipping_threshold) 399 | 400 | def build_noise_audio( 401 | noise_path, 402 | fs=16000, 403 | audio_length=8, 404 | audio_samples_length=-1, 405 | silence_length = 0.2): 406 | '''Construct an audio signal from source files''' 407 | 408 | fs_output = fs 409 | if audio_samples_length == -1: 410 | audio_samples_length = int(audio_length*fs) 411 | 412 | output_audio = np.zeros(0) 413 | remaining_length = audio_samples_length 414 | files_used = [] 415 | clipped_files = [] 416 | 417 | source_files = glob.glob(os.path.join(noise_path, 418 | "*.wav")) 419 | shuffle(source_files) 420 | # pick a noise source file index randomly 421 | idx = np.random.randint(0, np.size(source_files)) 422 | 423 | # initialize silence 424 | silence = np.zeros(int(fs_output*silence_length)) 425 | 426 | # iterate through multiple clips until we have a long enough signal 427 | tries_left = MAXTRIES 428 | while remaining_length > 0 and tries_left > 0: 429 | 430 | # read next audio file and resample if necessary 431 | 432 | idx = (idx + 1) % np.size(source_files) #这里有种shift的感觉, 第0个是最后的时候才process的 433 | input_audio, fs_input = sf.read(source_files[idx]) 434 | if fs_input != fs_output: 435 | input_audio = librosa.resample(input_audio, fs_input, fs_output) 436 | 437 | # if current file is longer than remaining desired length, and this is 438 | # noise generation or this is training set, subsample it randomly 439 | if len(input_audio) > remaining_length: 440 | idx_seg = np.random.randint(0, len(input_audio)-remaining_length) 441 | input_audio = input_audio[idx_seg:idx_seg+remaining_length] 442 | 443 | # check for clipping, and if found move onto next file 444 | if is_clipped(input_audio): 445 | clipped_files.append(source_files[idx]) 446 | tries_left -= 1 447 | continue 448 | 449 | # concatenate current input audio to output audio stream 450 | files_used.append(source_files[idx]) 451 | output_audio = np.append(output_audio, input_audio) 452 | remaining_length -= len(input_audio) 453 | 454 | # add some silence if we have not reached desired audio length 455 | if remaining_length > 0: 456 | silence_len = min(remaining_length, len(silence)) 457 | output_audio = np.append(output_audio, silence[:silence_len]) 458 | remaining_length -= silence_len 459 | 460 | return output_audio, files_used, clipped_files 461 | 462 | def normalize(audio, target_level=-25): 463 | '''Normalize the signal to the target level''' 464 | rms = (audio ** 2).mean() ** 0.5 465 | scalar = 10 ** (target_level / 20) / (rms+EPS) 466 | audio = audio * scalar 467 | return audio 468 | 469 | def normalize_segmental_rms(audio, rms, target_level=-25): 470 | '''Normalize the signal to the target level 471 | based on segmental RMS''' 472 | scalar = 10 ** (target_level / 20) / (rms+EPS) 473 | audio = audio * scalar 474 | return audio 475 | 476 | 477 | def active_rms(clean, noise, fs=16000, energy_thresh=-50): 478 | '''Returns the clean and noise RMS of the noise calculated only in the active portions''' 479 | window_size = 100 # in ms 480 | window_samples = int(fs * window_size / 1000) 481 | sample_start = 0 482 | 
noise_active_segs = [] 483 | clean_active_segs = [] 484 | 485 | while sample_start < len(noise): 486 | sample_end = min(sample_start + window_samples, len(noise)) 487 | noise_win = noise[sample_start:sample_end] 488 | clean_win = clean[sample_start:sample_end] 489 | noise_seg_rms = 20 * np.log10((noise_win ** 2).mean() + EPS) 490 | # Considering frames with energy 491 | if noise_seg_rms > energy_thresh: 492 | noise_active_segs = np.append(noise_active_segs, noise_win) 493 | clean_active_segs = np.append(clean_active_segs, clean_win) 494 | sample_start += window_samples 495 | 496 | if len(noise_active_segs) != 0: 497 | noise_rms = (noise_active_segs ** 2).mean() ** 0.5 498 | else: 499 | noise_rms = EPS 500 | 501 | if len(clean_active_segs) != 0: 502 | clean_rms = (clean_active_segs ** 2).mean() ** 0.5 503 | else: 504 | clean_rms = EPS 505 | 506 | return clean_rms, noise_rms 507 | 508 | 509 | def segmental_snr_mixer(clean, noise, snr, target_level=-25, clipping_threshold=0.99, target_level_lower=-35, target_level_upper=-15): 510 | '''Function to mix clean speech and noise at various segmental SNR levels''' 511 | if len(clean) > len(noise): 512 | noise = np.append(noise, np.zeros(len(clean)-len(noise))) 513 | else: 514 | clean = np.append(clean, np.zeros(len(noise)-len(clean))) 515 | clean = clean/(max(abs(clean))+EPS) 516 | noise = noise/(max(abs(noise))+EPS) 517 | rmsclean, rmsnoise = active_rms(clean=clean, noise=noise) 518 | clean = normalize_segmental_rms(clean, rms=rmsclean, target_level=target_level) 519 | noise = normalize_segmental_rms(noise, rms=rmsnoise, target_level=target_level) 520 | # Set the noise level for a given SNR 521 | noisescalar = rmsclean / (10**(snr/20)) / (rmsnoise+EPS) 522 | noisenewlevel = noise * noisescalar 523 | 524 | # Mix noise and clean speech 525 | noisyspeech = clean + noisenewlevel 526 | # Randomly select RMS value between -15 dBFS and -35 dBFS and normalize noisyspeech with that value 527 | # There is a chance of clipping that might happen with very less probability, which is not a major issue. 528 | noisy_rms_level = np.random.randint(target_level_lower, target_level_upper) 529 | rmsnoisy = (noisyspeech**2).mean()**0.5 530 | scalarnoisy = 10 ** (noisy_rms_level / 20) / (rmsnoisy+EPS) 531 | noisyspeech = noisyspeech * scalarnoisy 532 | clean = clean * scalarnoisy 533 | noisenewlevel = noisenewlevel * scalarnoisy 534 | # Final check to see if there are any amplitudes exceeding +/- 1. 
If so, normalize all the signals accordingly 535 | if is_clipped(noisyspeech): 536 | noisyspeech_maxamplevel = max(abs(noisyspeech))/(clipping_threshold-EPS) 537 | noisyspeech = noisyspeech/noisyspeech_maxamplevel 538 | clean = clean/noisyspeech_maxamplevel 539 | noisenewlevel = noisenewlevel/noisyspeech_maxamplevel 540 | noisy_rms_level = int(20*np.log10(scalarnoisy/noisyspeech_maxamplevel*(rmsnoisy+EPS))) 541 | 542 | return clean, noisenewlevel, noisyspeech, noisy_rms_level 543 | 544 | def nearend_farend_mixer(nearend_data, echo_data, lowerbound_ser=-10, upperbound_ser=13, clipping_threshold=0.99): 545 | ser = np.random.uniform(lowerbound_ser, upperbound_ser) 546 | ser = float(format(ser, '.4f')) 547 | 548 | local_ser = SER(nearend_data, echo_data) 549 | # print(nearend[nearend_index], farend_dict[key], local_ser) 550 | # print(len(echo_data), len(farend_speech), len(nearend_data)) 551 | 552 | nearend_data = nearend_data / max(abs(nearend_data)) 553 | echo_data = echo_data / max(abs(echo_data)) 554 | 555 | nearend_rms = np.mean(nearend_data ** 2) ** 0.5 556 | echo_rms = np.mean(echo_data ** 2) ** 0.5 557 | 558 | echocalar = nearend_rms / (10 ** (ser / 10)) / (echo_rms + EPS) 559 | new_echo = echo_data * echocalar 560 | new_ser = SER(nearend_data, new_echo) 561 | 562 | nearend_mic = nearend_data + new_echo 563 | if is_clipped(nearend_mic): 564 | nearnendmic_maxamplevel = max(abs(nearend_mic)) / (clipping_threshold - EPS) 565 | nearend_mic = nearend_mic / nearnendmic_maxamplevel 566 | nearend_data = nearend_data / nearnendmic_maxamplevel 567 | new_echo = new_echo / nearnendmic_maxamplevel 568 | #echo_rms_level = int(10 * np.log10(echocalar / nearnendmic_maxamplevel * (echo_rms + EPS))) 569 | return nearend_mic, nearend_data, new_echo, ser 570 | 571 | 572 | ''' 573 | We follow the dataset allocation from AEC-CHALLENGE, 574 | The validation data are placed at first 300 fileid 575 | the remaining file are training sets 576 | 我们数据分布方式根据aec-challenge来,即前N个,在汪德凉老师的paper中为300,是验证集, 剩余的是训练集 577 | ''' 578 | # def generate_pair_audio( 579 | # train_dataset, 580 | # validate_dataset, 581 | # rir_path, 582 | # noise_dataset, 583 | # outputPath, 584 | # use_reverb=True, 585 | # sample_rate=16000, 586 | # audio_length=8): 587 | def generate_pair_audio(train_dataset, validate_dataset, conf): 588 | rir_path = conf['datasets']['rir_table'] ##using rir from DNS-challenge datasets 589 | noise_dataset = conf['datasets']['noise_path'] ##noise from DNS-challenge 590 | outputPath = conf['datasets']['output_path'] ##OUTPUT path for saving the generated datasets 591 | use_reverb = conf['configs']['use_reverb'] 592 | sample_rate = conf['configs']['samplerate'] 593 | audio_length = conf['configs']['audio_length'] 594 | outputPath = os.path.join(outputPath, 'train') 595 | if not os.path.exists(outputPath): 596 | os.makedirs(outputPath) 597 | csv_path = os.path.join(outputPath, "csv") 598 | if not os.path.exists(csv_path): 599 | os.makedirs(csv_path) 600 | 601 | nearend_speech_out = os.path.join(outputPath, "nearend_speech") 602 | farend_speech_out = os.path.join(outputPath, "farend_speech") 603 | echo_out = os.path.join(outputPath, "echo_signal") 604 | nearend_mic_out = os.path.join(outputPath, "nearend_mic") 605 | 606 | subdir_list = [nearend_speech_out, echo_out, nearend_mic_out, farend_speech_out] 607 | for i in subdir_list: 608 | if not os.path.exists(i): 609 | os.makedirs(i) 610 | 611 | 612 | count = 0 613 | audio_sample_length = sample_rate * audio_length 614 | 615 | farend_speech_name_1 = [] 616 | 
farend_speech_name_2 = [] 617 | farend_speech_name_3 = [] 618 | nearend_speech_name = [] 619 | filed_id = [] 620 | noise_file_name = [] 621 | 622 | noise_clipped_files = [] 623 | noise_source_files = [] 624 | split = [] 625 | ser_list = [] 626 | snr_list = [] 627 | 628 | 629 | 630 | rir_dict = get_rir_dict(rir_path, conf) 631 | 632 | # TODO: clean it up, make it in a function 633 | for i in range(len(validate_dataset)): 634 | print("validate = ", count) 635 | validate_data_pair = validate_dataset[i] 636 | validate_nearend_path = validate_data_pair[0] 637 | validate_farend_path_sets = validate_data_pair[1][np.random.randint(len(validate_data_pair[1]))] 638 | validate_nearend_speech, validate_sr = sf.read(validate_nearend_path) 639 | assert validate_sr == sample_rate 640 | validate_nearend_speech = signal_pad(validate_nearend_speech, audio_sample_length) 641 | validate_farend_speech = np.concatenate([sf.read(wav)[0] for wav in validate_farend_path_sets]) 642 | validate_farend_speech = signal_pad(validate_farend_speech, audio_sample_length) 643 | samples_rir_ch = get_rir_samples(rir_dict) 644 | validate_reverb_farend, validate_noreverb_farend = add_pyreverb(validate_farend_speech, samples_rir_ch, predelay=conf['configs']['predelay']) 645 | 646 | #TODO: 加噪,但有些部分我有争议, 可能不需要对输入做那么多的归一和scaling这个到时候实验看看 647 | validate_noise_audio, validate_noise_file, validate_noise_cf = build_noise_audio(noise_dataset, 648 | fs=sample_rate, 649 | audio_length=audio_length, 650 | audio_samples_length=-1, 651 | silence_length=conf['configs']['silence_length']) 652 | 653 | 654 | snr = np.random.randint(conf['configs']['lowerbound_snr'], conf['configs']['upperbound_snr']) 655 | snr_list.append(snr) 656 | if use_reverb: 657 | validate_farend_snr, validate_noise_snr, validate_echo_signal, target_level = segmental_snr_mixer(clean=validate_reverb_farend, 658 | noise=validate_noise_audio, 659 | snr=snr, 660 | target_level=conf['configs']['target_level'], 661 | clipping_threshold=conf['configs']['clipping_threshold'], 662 | target_level_lower=conf['configs']['target_level_lower'], 663 | target_level_upper=conf['configs']['target_level_upper']) 664 | else: 665 | validate_farend_snr, validate_noise_snr, validate_echo_signal, target_level = segmental_snr_mixer(clean=validate_noreverb_farend, 666 | noise=validate_noise_audio, 667 | snr=snr, 668 | target_level=conf['configs']['target_level'], 669 | clipping_threshold=conf['configs']['clipping_threshold'], 670 | target_level_lower=conf['configs']['target_level_lower'], 671 | target_level_upper=conf['configs']['target_level_upper']) 672 | 673 | validate_nearend_mic, validate_nearend_speech2, validate_echo_signal, validate_ser= nearend_farend_mixer(validate_nearend_speech, 674 | validate_echo_signal, 675 | lowerbound_ser=conf['configs']['lowerbound_ser'], 676 | upperbound_ser=conf['configs']['upperbound_ser'], 677 | clipping_threshold=conf['configs']['clipping_threshold']) 678 | ser_list.append(validate_ser) 679 | nearend_data_path = os.path.join(nearend_speech_out, "nearend_speech_fileid_{}.wav".format(count)) 680 | farend_data_path = os.path.join(farend_speech_out, "farend_speech_fileid_{}.wav".format(count)) 681 | nearend_mic_data_path = os.path.join(nearend_mic_out, "nearend_mic_fileid_{}.wav".format(count)) 682 | echo_data_path = os.path.join(echo_out, "echo_fileid_{}.wav".format(count)) 683 | if use_reverb: 684 | audio_signals = [validate_nearend_speech2, validate_farend_speech, validate_nearend_mic, validate_echo_signal] 685 | else: 686 | audio_signals = 
[validate_nearend_speech2, validate_farend_speech, validate_nearend_mic, validate_echo_signal] 687 | file_paths = [nearend_data_path, farend_data_path, nearend_mic_data_path, echo_data_path] 688 | for k in range(len(audio_signals)): 689 | try: 690 | pass 691 | sf.write(file_paths[k], audio_signals[k], sample_rate) 692 | except Exception as e: 693 | print(str(e)) 694 | 695 | noise_clipped_files += validate_noise_cf 696 | noise_source_files += validate_noise_file 697 | hyphen = '-' 698 | noise_source_filenamesonly = [i[:-4].split(os.path.sep)[-1] for i in validate_noise_file] 699 | 700 | noise_file_name.append(hyphen.join(noise_source_filenamesonly)[:MAXFILELEN]) 701 | farend_speech_name_1.append(validate_farend_path_sets[0]) 702 | farend_speech_name_2.append(validate_farend_path_sets[1]) 703 | farend_speech_name_3.append(validate_farend_path_sets[2]) 704 | nearend_speech_name.append(validate_nearend_path) 705 | split.append('validate') 706 | filed_id.append(count) 707 | count += 1 708 | 709 | 710 | for i in range(len(train_dataset)): ##TODO 为了方便就直接复制下来了,后期要改成函数 711 | data_pair = train_dataset[i] 712 | 713 | nearend_speech_path = data_pair[0] 714 | farend_speech_list = data_pair[1] 715 | nearend_speech, sr = sf.read(nearend_speech_path) 716 | assert sr == sample_rate 717 | nearend_speech = signal_pad(nearend_speech, audio_sample_length) 718 | for j in range(len(farend_speech_list)): 719 | 720 | three_farend_sets = farend_speech_list[j] 721 | 722 | farend_speech = np.concatenate([sf.read(wav)[0] for wav in three_farend_sets]) 723 | #print(len(farend_speech), [len(sf.read(k)[0]) for k in three_farend_sets], np.array([len(sf.read(k)[0]) for k in three_farend_sets]).sum()) 检查是否被concate到一起了 724 | 725 | farend_speech = signal_pad(farend_speech, audio_sample_length) 726 | samples_rir_ch = get_rir_samples(rir_dict) 727 | reverb_farend, noreverb_farend = add_pyreverb(farend_speech, samples_rir_ch, predelay=conf['configs']['predelay']) 728 | #noise_sample, noise_sr = sf.read(noise_files[np.random.randint(0, np.size(noise_files))]) 729 | noise_audio, noise_file, noise_cf = build_noise_audio(noise_dataset, 730 | fs=sample_rate, 731 | audio_length=audio_length, 732 | audio_samples_length=-1, 733 | silence_length=conf['configs']['silence_length']) 734 | 735 | snr = np.random.randint(conf['configs']['lowerbound_snr'], conf['configs']['upperbound_snr']) 736 | snr_list.append(snr) 737 | if use_reverb: 738 | farend_snr, noise_snr, echo_signal, target_level = segmental_snr_mixer(clean=reverb_farend, 739 | noise=noise_audio, 740 | snr=snr, 741 | target_level=conf['configs']['target_level'], 742 | clipping_threshold=conf['configs']['clipping_threshold'], 743 | target_level_lower=conf['configs']['target_level_lower'], 744 | target_level_upper=conf['configs']['target_level_upper']) 745 | 746 | else: 747 | farend_snr, noise_snr, echo_signal, target_level = segmental_snr_mixer(clean=noreverb_farend, 748 | noise=noise_audio, 749 | snr=snr, 750 | target_level=conf['configs']['target_level'], 751 | clipping_threshold=conf['configs']['clipping_threshold'], 752 | target_level_lower=conf['configs']['target_level_lower'], 753 | target_level_upper=conf['configs']['target_level_upper']) 754 | 755 | nearend_mic, nearend_speech2, echo_signal, ser = nearend_farend_mixer(nearend_speech, 756 | echo_signal, 757 | lowerbound_ser=conf['configs']['lowerbound_ser'], 758 | upperbound_ser=conf['configs']['upperbound_ser'], 759 | clipping_threshold=conf['configs']['clipping_threshold']) 760 | ser_list.append(ser) 761 | 
#print("%%%%%%%%%%%Processing fileid%%%%%%%%%%%: {}".format(count)) 762 | 763 | nearend_data_path = os.path.join(nearend_speech_out, "nearend_speech_fileid_{}.wav".format(count)) 764 | farend_data_path = os.path.join(farend_speech_out, "farend_speech_fileid_{}.wav".format(count)) 765 | nearend_mic_data_path = os.path.join(nearend_mic_out, "nearend_mic_fileid_{}.wav".format(count)) 766 | echo_data_path = os.path.join(echo_out, "echo_fileid_{}.wav".format(count)) 767 | if use_reverb: 768 | audio_signals = [nearend_speech2, farend_speech, nearend_mic, echo_signal] 769 | else: 770 | audio_signals = [nearend_speech2, noreverb_farend, nearend_mic, echo_signal] 771 | file_paths = [nearend_data_path, farend_data_path, nearend_mic_data_path, echo_data_path] 772 | for k in range(len(audio_signals)): 773 | try: 774 | 775 | sf.write(file_paths[k], audio_signals[k], sample_rate) 776 | except Exception as e: 777 | print(str(e)) 778 | print("train = ", count) 779 | noise_clipped_files += noise_cf 780 | noise_source_files += noise_file 781 | hyphen = '-' 782 | noise_source_filenamesonly = [i[:-4].split(os.path.sep)[-1] for i in noise_file] 783 | 784 | noise_file_name.append(hyphen.join(noise_source_filenamesonly)[:MAXFILELEN]) 785 | farend_speech_name_1.append(three_farend_sets[0]) 786 | farend_speech_name_2.append(three_farend_sets[1]) 787 | farend_speech_name_3.append(three_farend_sets[2]) 788 | nearend_speech_name.append(nearend_speech_path) 789 | filed_id.append(count) 790 | split.append('train') 791 | count += 1 792 | print(len(nearend_speech_name), len(farend_speech_name_1), len(farend_speech_name_2), len(farend_speech_name_3), len(noise_file_name), len(filed_id), len(snr_list), len(ser_list)) 793 | dataFrame = pd.DataFrame({'nearend_speech_path': nearend_speech_name, 'farend_speech_path_1':farend_speech_name_1, 'farend_speech_path_2': farend_speech_name_2, 794 | 'farend_speech_path_3': farend_speech_name_3, 'noise_file_path':noise_file_name, 'filed_id':filed_id, 'snr':snr_list, 'ser':ser_list}) 795 | dataFrame.to_csv(os.path.join(csv_path, conf['configs']['csv_file_name']), index=False, sep=',') 796 | 797 | 798 | def main(args): 799 | #TODO: This is a draft procesing script, will update and make it clean after 800 | # 801 | with open(args.conf, "r") as f: 802 | conf = json.load(f) 803 | 804 | 805 | train_dataset, validate_dataset, test_dataset = get_data_pair(conf['datasets']['timit_data_path'], 806 | conf 807 | ) 808 | generate_pair_audio(train_dataset, validate_dataset, conf) 809 | 810 | if __name__ == '__main__': 811 | parser = argparse.ArgumentParser( 812 | description="Configuration for timit data preparation" 813 | ) 814 | parser.add_argument( 815 | "-conf", 816 | type=str, 817 | required=True, 818 | help="configuration for timit data preparation" 819 | ) 820 | args = parser.parse_args() 821 | main(args) 822 | --------------------------------------------------------------------------------