├── README.md ├── denoise-autoencoder.py ├── lib ├── AVHandler.py └── AVPreprocess.py └── pic └── networkstructure.png /README.md: -------------------------------------------------------------------------------- 1 | # Speech-denoising-Autoencoder 2 | 3 | Speech denoising systems usually enhance only the magnitude spectrum while leaving the phase spectrum unchanged. This system tries to improve the performance of a denoising system based on a denoising autoencoder neural network. The estimate of the clean audio is computed with a complex ideal ratio mask, which also enhances the phase information. 4 | 5 | ## Structure 6 | 7 | Input : audio data in the mel-frequency domain 8 | 9 | Output: complex ratio mask (cRM)[1] 10 | 11 | This model is built in a linear shape (2049-500-180) without weight lock[2]. 12 | 13 | ## Source 14 | 15 | [youtube-dl](http://rg3.github.io/youtube-dl/) : a command-line program to download videos from YouTube.com and a few more sites 16 | 17 | [SoX](http://sox.sourceforge.net/) : a cross-platform command line utility to convert various formats of audio files into other formats 18 | 19 | [FFmpeg](https://www.ffmpeg.org/) : a complete, cross-platform solution to record, convert and stream audio and video 20 | 21 | [librosa](https://librosa.github.io/librosa/) : python package for music and audio analysis 22 | 23 | ## Reference 24 | 25 | [1] [Complex Ratio Masking for Monaural Speech Separation, D.Williamson, IEEE/ACM TRANSACTIONS ON AUDIO, SPEECH, AND LANGUAGE PROCESSING, VOL. 24, NO.
"""Train or apply a denoising autoencoder that predicts a complex ratio mask.

The script mixes a clean speech recording with a noise recording, converts
both to a (real/imag expanded) mel-frequency representation, and either
trains the autoencoder (``TRAIN_DENOISE``) or loads a pretrained one
(``DENOISE``) and writes the denoised waveform to
``Denoise_reconstruction.wav``.  Behaviour is selected with the option flags
below.
"""
import sys
sys.path.append('lib')
import AVHandler as avh
import AVPreprocess as avp
import numpy as np
import os
import librosa
import matplotlib.pyplot as plt
import scipy.io.wavfile as wavfile
from sklearn.model_selection import train_test_split

# option
DOWNLOAD = 0  # download and preprocess the data with little samples (demo)
WAVE_PLOT = 0  # plot original wave, noise wave, mixed wave
INVERSE_CHECK = 0  # check the inverse function of mel
DUMP = 0  # dump wave data to real wav
TRAIN_DENOISE = 0  # train the denoising model with mel freq input and output
DENOISE = 1  # use the pretrained denoise autoencoder


def to_int16(signal):
    """Scale *signal* to the full int16 range using its absolute peak.

    Dividing by ``np.max(signal)`` (the signed maximum) overflows int16 and
    wraps around whenever the negative peak is larger than the positive one,
    so the absolute peak is used.  An all-zero signal is returned as zeros.
    """
    peak = np.max(np.abs(signal)) if len(signal) else 0.0
    if peak == 0:
        return np.zeros(len(signal), dtype='int16')
    return (signal / peak * 32767).astype('int16')


###################################################################
if DOWNLOAD:
    avh.mkdir('sample')
    clean_video_list = ['https://www.youtube.com/watch?v=DCS6t6NUAGQ&t',
                        'https://www.youtube.com/watch?v=gN9dlisaQVM',
                        'https://www.youtube.com/watch?v=c0KYU2j0TM4',
                        'https://www.youtube.com/watch?v=8S0FDjFBj8o']

    # download every clean clip and keep the 60s-180s segment of each
    for i, link in enumerate(clean_video_list):
        name = 'clean_' + str(i)
        avh.download('sample', name, link)
        start_time = 60
        end_time = 180
        avh.cut('sample', name, start_time, end_time)
    avh.conc('sample', 'clean')

    noise_link = 'https://www.youtube.com/watch?v=BOdLmxy06H0'
    name = 'noise'
    avh.download('sample', name, noise_link)
    noise_s_time = 60
    noise_e_time = 540
    avh.cut('sample', name, noise_s_time, noise_e_time)

    avh.mix('sample', 'mix', 'clean', 'noise', 0, 480)

####################################################################

# mix clean audio and noise audio
# librosa.load opens the path itself, so no surrounding open() is needed
clean_data, clean_sr = librosa.load('sample/clean.wav', sr=None)  # time series data,sample rate
noise_data, noise_sr = librosa.load('sample/noise.wav', sr=None)  # time series data,sample rate

# normalize expand the noise
# NOTE(review): scaling uses the signed maximum, as the data for the
# pretrained model did; np.max(np.abs(...)) would be the usual peak measure.
noise_max = np.max(noise_data)
expand_rate = 1 / noise_max
noise_data = noise_data * expand_rate

if clean_sr != noise_sr:
    raise ValueError('sample rate mismatch: clean=%s noise=%s' % (clean_sr, noise_sr))

# sox trimming can leave the two files a few samples apart; align them so the
# element-wise mix below (and the mix/clean mel frames) always line up
common_len = min(len(clean_data), len(noise_data))
clean_data = clean_data[:common_len]
noise_data = noise_data[:common_len]

mix_data = clean_data * 0.8 + noise_data * 0.2
mix_sr = clean_sr

####################################################################
if WAVE_PLOT:
    # one figure each for the original, noise and mixed waveforms
    wave_plots = [(clean_data, clean_sr, "original wavedata"),
                  (noise_data, noise_sr, "noise wavedata"),
                  (mix_data, mix_sr, "mixed wavedata")]
    for fig_no, (wave, wave_sr, title) in enumerate(wave_plots, start=1):
        size = wave.shape[0]
        time = np.arange(0, size) * (1.0 / wave_sr)
        plt.figure(fig_no)
        plt.plot(time, wave)
        plt.xlabel("Time(s)")
        plt.ylabel("Amplitude")
        plt.title(title)
        plt.grid(True)
    plt.show()
######################################################################
if DUMP:
    # 'mix' is a directory, so isfile() was always False; makedirs is
    # idempotent and does not need a shell
    os.makedirs('mix', exist_ok=True)
    wavfile.write("mix/mix.wav", mix_sr, mix_data)

#####################################################################
# convert time data to frequency data

# fft windowing parameter #
fft_size = 1024
step_size = fft_size // 3  # distance to slide along the window

# fequency to mel parameter #
n_mels = 40  # number of mel frequency
start_freq = 0.0
end_freq = 8000.0

# implement mel to time (just to check inverse function)
if INVERSE_CHECK:
    M = avp.time_to_mel(mix_data, mix_sr, fft_size, n_mels, step_size)
    T = avp.mel_to_time(M, mix_sr, fft_size, n_mels, step_size)
    plt.figure()
    plt.plot(T)
    plt.show()
    os.makedirs('mix', exist_ok=True)  # the target directory may not exist yet
    wavfile.write("mix/test.wav", mix_sr, to_int16(T))

###################################################################
# split data

mel_mix_data = avp.time_to_mel(mix_data, mix_sr, fft_size, n_mels, step_size, fmax=end_freq)
D_X = avp.real_imag_expand(mel_mix_data)

mel_clean_data = avp.time_to_mel(clean_data, clean_sr, fft_size, n_mels, step_size, fmax=end_freq)
D_y = avp.real_imag_expand(mel_clean_data)

# separate data to train test sets (first 90% of the frames for training)
split_idx = int(D_X.shape[0] * 0.9)

# the training target is the element-wise ratio clean/mix, scaled by its maximum
D_X_train = avp.min_max_norm(D_X[:split_idx, :])
D_y_train = D_y[:split_idx, :] / D_X[:split_idx, :]
G_max = np.max(D_y_train)
D_y_train = D_y_train / G_max

X_test = avp.min_max_norm(D_X[split_idx:, :])
y_test = D_y[split_idx:, :] / D_X[split_idx:, :]
y_test = y_test / G_max

X_train, X_val, y_train, y_val = train_test_split(D_X_train, D_y_train, test_size=0.15, random_state=87)

# Denoise autoencoder model #

## import keras modules
from keras.layers import BatchNormalization, Dropout, Dense, Input, LeakyReLU
from keras import backend as K
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import Model
from keras.utils import plot_model
from keras.initializers import he_normal
from keras.models import model_from_json
from keras import optimizers

if TRAIN_DENOISE:
    n_input_dim = X_train.shape[1]
    n_output_dim = y_train.shape[1]

    n_hidden1 = 2049
    n_hidden2 = 500
    n_hidden3 = 180

    InputLayer1 = Input(shape=(n_input_dim,), name="InputLayer")
    InputLayer2 = BatchNormalization(axis=1, momentum=0.6)(InputLayer1)

    # encoder: 2049 -> 500 -> 180
    HiddenLayer1_1 = Dense(n_hidden1, name="H1", activation='relu', kernel_initializer=he_normal(seed=27))(InputLayer2)
    HiddenLayer1_2 = BatchNormalization(axis=1, momentum=0.6)(HiddenLayer1_1)
    HiddenLayer1_3 = Dropout(0.1)(HiddenLayer1_2)

    HiddenLayer2_1 = Dense(n_hidden2, name="H2", activation='relu', kernel_initializer=he_normal(seed=42))(HiddenLayer1_3)
    HiddenLayer2_2 = BatchNormalization(axis=1, momentum=0.6)(HiddenLayer2_1)

    HiddenLayer3_1 = Dense(n_hidden3, name="H3", activation='relu', kernel_initializer=he_normal(seed=65))(HiddenLayer2_2)
    HiddenLayer3_2 = BatchNormalization(axis=1, momentum=0.6)(HiddenLayer3_1)

    # decoder: 500 -> 2049 (separate weights, not tied to the encoder)
    HiddenLayer2__1 = Dense(n_hidden2, name="H2_R", activation='relu', kernel_initializer=he_normal(seed=42))(HiddenLayer3_2)
    HiddenLayer2__2 = BatchNormalization(axis=1, momentum=0.6)(HiddenLayer2__1)

    HiddenLayer1__1 = Dense(n_hidden1, name="H1_R", activation='relu', kernel_initializer=he_normal(seed=27))(HiddenLayer2__2)
    HiddenLayer1__2 = BatchNormalization(axis=1, momentum=0.6)(HiddenLayer1__1)
    HiddenLayer1__3 = Dropout(0.1)(HiddenLayer1__2)

    OutputLayer = Dense(n_output_dim, name="OutputLayer", kernel_initializer=he_normal(seed=62))(HiddenLayer1__3)

    model = Model(inputs=[InputLayer1], outputs=[OutputLayer])
    opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, decay=0.0001, amsgrad=False)
    model.compile(loss='mse', optimizer=opt)

    plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
    model.summary()

    tensorboard = TensorBoard(log_dir="./logs", histogram_freq=0, write_graph=True, write_images=True)
    # fit the model
    hist = model.fit(X_train, y_train, batch_size=512, epochs=100, verbose=1, validation_data=([X_val], [y_val]),
                     callbacks=[tensorboard])

    plt.figure(figsize=(10, 8))
    plt.plot(hist.history['loss'], label='Loss')
    plt.plot(hist.history['val_loss'], label='Val_Loss')
    plt.legend(loc='best')
    plt.title('Training Loss and Validation Loss')
    plt.show()

    results = model.evaluate(X_test, y_test, batch_size=len(y_test))
    print('Test loss:%3f' % results)

    # serialize model to JSON
    model_json = model.to_json()
    avh.mkdir('model')
    with open("model/model.json", 'w') as f:
        f.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model/model.h5")
    print("Saved model to disk")

if DENOISE:
    # load json and create model
    with open('model/model.json', 'r') as f:
        loaded_model_json = f.read()
    denoise_model = model_from_json(loaded_model_json)
    denoise_model.load_weights("model/model.h5")
    print("Loaded model from disk")

    # The network was trained on min-max normalised inputs, so the same
    # normalisation has to be applied at inference time; feeding raw D_X
    # puts the input far outside the distribution seen during training.
    gain = denoise_model.predict(avp.min_max_norm(D_X)) * G_max
    # even columns hold the real part, odd columns the imaginary part
    M_gain = gain[:, ::2] + 1j * gain[:, 1::2]
    F_gain = avp.mel2freq(M_gain, mix_sr, fft_size, n_mels)

    # apply the complex mask to the mixture spectrum and go back to time domain
    F = F_gain * avp.stft(mix_data, fft_size, step_size)
    print("shape of F_out:", F.shape)
    T = avp.istft(F, fft_size, step_size)

    # write the result
    wavfile.write("Denoise_reconstruction.wav", mix_sr, to_int16(T))
"""Helpers for downloading, trimming, concatenating and mixing audio files.

External tools (youtube-dl, ffmpeg, sox) are invoked with argument lists
rather than shell strings, so URLs containing ``&`` and file names with
spaces are passed through unchanged.
"""
import glob
import os
import subprocess

import numpy as np
import scipy.io.wavfile as wavfile

# A file for downloading files and handling audio and video


def _remove_if_exists(path):
    # Delete *path*, ignoring the case where it is already gone.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _peak_normalize(wav):
    # Scale *wav* so its absolute peak is 1; silent/empty input is returned as is.
    peak = np.max(np.abs(wav)) if wav.size else 0.0
    return wav / peak if peak > 0 else wav


def mkdir(dir_name, loc=''):
    """Create directory *dir_name*, optionally inside *loc*.

    dir_name | name of the directory
    loc      | the location for the directory to be created

    The previous shell command concatenated ``cd <loc>`` and ``mkdir`` without
    a separator, so a non-empty *loc* never worked.  Existing directories are
    left untouched and missing parents are created.
    """
    os.makedirs(os.path.join(loc, dir_name), exist_ok=True)


def download(loc, name, link, type='audio'):
    """Download the audio track of *link* as a 48 kHz mono ``<name>.wav``.

    loc  | the location for downloaded file
    name | the name for the file
    link | the link to be downloaded by youtube-dl
    type | the type of downloaded file (only 'audio' is handled)

    Raises FileNotFoundError if youtube-dl or ffmpeg is not installed.
    """
    if type != 'audio':
        return

    raw_name = 'o%s.wav' % name
    workdir = loc or None
    # list arguments: a '&' inside the URL is no longer interpreted by a shell
    subprocess.run(['youtube-dl', '-x', '--audio-format', 'wav', '-o', raw_name, link],
                   cwd=workdir)
    # resample to 48 kHz, single channel
    subprocess.run(['ffmpeg', '-i', raw_name, '-ar', '48000', '-ac', '1', '%s.wav' % name],
                   cwd=workdir)
    _remove_if_exists(os.path.join(loc, raw_name))


def cut(loc, name, start_time, end_time):
    """Trim ``<name>.wav`` in place to the segment [start_time, end_time] via sox.

    loc        | the location of the file
    name       | the name of file to trim
    start_time | the start time of the audio segment
    end_time   | the end time of the audio segment

    The original file is only replaced when sox succeeds; on failure it is
    kept and the partial output is discarded.
    Raises FileNotFoundError if sox is not installed.
    """
    length = end_time - start_time
    source = '%s.wav' % name
    trimmed = 'c_%s.wav' % name
    result = subprocess.run(['sox', source, trimmed, 'trim', str(start_time), str(length)],
                            cwd=loc or None)
    if result.returncode == 0:
        os.replace(os.path.join(loc, trimmed), os.path.join(loc, source))
    else:
        _remove_if_exists(os.path.join(loc, trimmed))


def conc(loc, name, trim_clean=False):
    """Concatenate every ``<name>_*.wav`` in *loc* into ``<name>.wav``.

    loc        | the location of the files
    name       | common prefix of the parts and name of the output
    trim_clean | delete the parts afterwards (the output file is kept)

    Does nothing when no part matches.
    """
    pattern = os.path.join(glob.escape(loc), glob.escape(name) + '_*.wav')
    # sorted to mirror the ordering a shell glob would produce
    parts = sorted(os.path.basename(path) for path in glob.glob(pattern))
    if not parts:
        return

    subprocess.run(['sox', '--combine', 'concatenate'] + parts + ['%s.wav' % name],
                   cwd=loc or None)
    if trim_clean:
        # only the parts: the old 'rm <name>*.wav' also removed the fresh output
        for part in parts:
            _remove_if_exists(os.path.join(loc, part))


def mix(loc, name, file1, file2, start, end, trim_clean=False):
    """Trim two wav files to the same segment and write their 50/50 mix.

    loc        | location of the mix files
    name       | output name of wav
    file1      | first file to mix
    file2      | second file to mix
    start      | mixture starting time
    end        | mixture end time
    trim_clean | delete leftover ``trim_<file>.wav`` files, if any

    Raises ValueError when the two files have different sample rates.
    """
    # imported lazily so the other helpers work without librosa installed
    import librosa

    cut(loc, file1, start, end)
    cut(loc, file2, start, end)
    trim1 = '%s/%s.wav' % (loc, file1)
    trim2 = '%s/%s.wav' % (loc, file2)
    wav1, wav1_sr = librosa.load(trim1, sr=None)  # time series data,sample rate
    wav2, wav2_sr = librosa.load(trim2, sr=None)
    if wav1_sr != wav2_sr:
        raise ValueError('sample rate mismatch: %s vs %s' % (wav1_sr, wav2_sr))

    # trimming can leave the files a few samples apart; mix the common part
    common_len = min(len(wav1), len(wav2))
    # compress the audio to same volume level (absolute peak, not signed max)
    wav1 = _peak_normalize(wav1[:common_len])
    wav2 = _peak_normalize(wav2[:common_len])
    mix_wav = wav1 * 0.5 + wav2 * 0.5

    path = '%s/%s.wav' % (loc, name)
    wavfile.write(path, wav1_sr, mix_wav)
    if trim_clean:
        for stem in (file1, file2):
            _remove_if_exists(os.path.join(loc, 'trim_%s.wav' % stem))
"""Time <-> frequency <-> mel-domain transforms used by the denoiser."""
import numpy as np


# windowing fft/ifft function
def stft(data, fft_size, step_size):
    """Short time Fourier transform with a Hamming window.

    data      | 1-D time series
    fft_size  | window length in samples
    step_size | hop between consecutive windows

    Returns a complex array of shape (win_num, fft_size // 2 + 1).  Input that
    is too short for a single frame gives an empty (0, bins) array instead of
    raising on a negative frame count.
    """
    data = np.asarray(data)
    window = np.hamming(fft_size)
    # NOTE: this frame count leaves out the last windows that would still fit;
    # it is kept as is because callers rely on the resulting shapes.
    win_num = max((len(data) - 2 * fft_size) // step_size, 0)
    if win_num == 0:
        return np.empty((0, fft_size // 2 + 1), dtype=np.complex128)

    # integer input must not be truncated back to integers after windowing
    frame_dtype = data.dtype if np.issubdtype(data.dtype, np.floating) else np.float64
    out = np.empty((win_num, fft_size), dtype=frame_dtype)
    for i in range(win_num):
        left = int(i * step_size)
        right = int(left + fft_size)
        out[i] = data[left: right] * window
    return np.fft.rfft(out, axis=1)


def istft(F, fft_size, step_size):
    """Inverse short time Fourier transform by windowed overlap-add.

    F         | complex spectrum of shape (number_windows, fft_size // 2 + 1)
    fft_size  | window length in samples
    step_size | hop between consecutive windows

    Returns a 1-D array of length number_windows * step_size + fft_size.
    """
    # n must be given explicitly, otherwise an odd fft_size comes back one
    # sample short and no longer matches the window
    data = np.fft.irfft(F, n=fft_size, axis=-1)
    window = np.hamming(fft_size)
    number_windows = F.shape[0]
    T = np.zeros((number_windows * step_size + fft_size))
    for i in range(number_windows):
        head = int(i * step_size)
        tail = int(head + fft_size)
        T[head:tail] = T[head:tail] + data[i, :] * window
    return T


def _mel_filterbank(sr, fft_size, n_mel, fmax):
    # Build the (n_mel, fft_size // 2 + 1) mel filter matrix.
    # librosa is imported lazily so the pure-numpy helpers work without it;
    # keyword arguments are required by librosa >= 0.10 and accepted by older
    # versions too.
    import librosa
    return librosa.filters.mel(sr=sr, n_fft=fft_size, n_mels=n_mel, fmax=fmax)


# combine FFT bins to mel frequency bins
def mel2freq(mel_data, sr, fft_size, n_mel, fmax=8000):
    """Project mel-domain frames back onto FFT bins with the mel filter matrix."""
    matrix = _mel_filterbank(sr, fft_size, n_mel, fmax)
    return np.dot(mel_data, matrix)


def freq2mel(f_data, sr, fft_size, n_mel, fmax=8000):
    """Project FFT-bin frames onto mel bins, each filter normalised to sum 1."""
    pre_matrix = _mel_filterbank(sr, fft_size, n_mel, fmax)
    matrix = pre_matrix.T / np.sum(pre_matrix.T, axis=0)
    return np.dot(f_data, matrix)


# directly time to mel domain transformation
def time_to_mel(data, sr, fft_size, n_mel, step_size, fmax=8000):
    """STFT followed by the mel projection; *fmax* is forwarded to the filter bank."""
    F = stft(data, fft_size, step_size)
    M = freq2mel(F, sr, fft_size, n_mel, fmax=fmax)
    return M


def mel_to_time(M, sr, fft_size, n_mel, step_size, fmax=8000):
    """Mel frames back to a time series; *fmax* is forwarded to the filter bank."""
    F = mel2freq(M, sr, fft_size, n_mel, fmax=fmax)
    T = istft(F, fft_size, step_size)
    return T


def real_imag_expand(mel_data):
    """Interleave real and imaginary parts: (M, N) complex -> (M, 2N) real.

    Even columns hold the real part, odd columns the imaginary part.
    """
    D = np.zeros((mel_data.shape[0], mel_data.shape[1] * 2))
    D[:, ::2] = np.real(mel_data)
    D[:, 1::2] = np.imag(mel_data)
    return D


# normalization function
def min_max_norm(x):
    """Min-max normalise every column of the M*N matrix *x* to [0, 1].

    A constant column has zero range; it maps to 0 instead of NaN (0/0).
    """
    col_min = np.min(x, axis=0)
    span = np.max(x, axis=0) - col_min
    span = np.where(span == 0, 1, span)
    return (x - col_min) / span
--------------------------------------------------------------------------------