├── README.md
├── denoise-autoencoder.py
├── lib
    ├── AVHandler.py
    └── AVPreprocess.py
└── pic
    └── networkstructure.png


/README.md:
--------------------------------------------------------------------------------
 1 | # Speech-denoising-Autoencoder
 2 | 
 3 | Speech denoising systems usually enhance only the magnitude spectrum while leaving the phase spectrum untouched. This system tries to improve denoising performance with a denoising autoencoder neural network. The clean audio is estimated with a complex ideal ratio mask, which also enhances the phase information.
 4 | 
 5 | ## Structure
 6 | 
 7 | Input : audio data on mel-frequency domain
 8 | 
 9 | Output: complex ratio mask (cRM)[1]
10 | 
11 | This model is built in a linear shape (2049-500-180) without weight lock[2].
12 | 
13 | ## Source
14 | 
15 | [youtube-dl](http://rg3.github.io/youtube-dl/) : a command-line program to download videos from YouTube.com and a few more sites
16 | 
17 | [SoX](http://sox.sourceforge.net/) : a cross-platform command-line utility to convert audio files from various formats into other formats
18 | 
19 | [FFmpeg](https://www.ffmpeg.org/) : a complete, cross-platform solution to record, convert and stream audio and video
20 | 
21 | [librosa](https://librosa.github.io/librosa/) : python package for music and audio analysis
22 | 
23 | ## Reference
24 | 
25 | [1] [Complex Ratio Masking for Monaural Speech Separation, D.Williamson, IEEE/ACM TRANSACTIONS ON AUDIO, SPEECH, AND LANGUAGE PROCESSING, VOL. 24, NO. 3, MARCH 2016](https://ieeexplore.ieee.org/document/7364200/)
26 | 
27 | [2] [Speech Synthesis with Deep Denoising Autoencoder, Zhenzhou Wu](http://gram.cs.mcgill.ca/theses/wu-15-speech.pdf)
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/denoise-autoencoder.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | sys.path.append('lib')
  3 | import AVHandler as avh
  4 | import AVPreprocess as avp
  5 | import numpy as np
  6 | import os
  7 | import librosa
  8 | import matplotlib.pyplot as plt
  9 | import scipy.io.wavfile as wavfile
 10 | from sklearn.model_selection import train_test_split
 11 | 
 12 | # option
 13 | DOWNLOAD = 0 #download and preprocess the data with little samples (demo)
 14 | WAVE_PLOT = 0 # plot original wave, noise wave, mixed wave
 15 | INVERSE_CHECK = 0 # check the inverse function of mel
 16 | DUMP = 0 # dump wave data to real wav
 17 | TRAIN_DENOISE = 0 # train the denoising model with mel freq input and output
 18 | DENOISE = 1 # use the pretrained denoise autoencoder
 19 | 
 20 | ###################################################################
 21 | if DOWNLOAD:
 22 |     avh.mkdir('sample')
 23 |     clean_video_list = ['https://www.youtube.com/watch?v=DCS6t6NUAGQ&t',
 24 |                         'https://www.youtube.com/watch?v=gN9dlisaQVM',
 25 |                         'https://www.youtube.com/watch?v=c0KYU2j0TM4',
 26 |                         'https://www.youtube.com/watch?v=8S0FDjFBj8o']
 27 | 
 28 |     for i in range(len(clean_video_list)):
 29 |         name = 'clean_' + str(i)
 30 |         avh.download('sample',name,clean_video_list[i])
 31 |         start_time = 60
 32 |         end_time = 180
 33 |         avh.cut('sample',name,start_time,end_time)
 34 |     avh.conc('sample','clean')
 35 | 
 36 |     noise_link = 'https://www.youtube.com/watch?v=BOdLmxy06H0'
 37 |     name = 'noise'
 38 |     avh.download('sample', name, noise_link)
 39 |     noise_s_time = 60
 40 |     noise_e_time = 540
 41 |     avh.cut('sample',name,noise_s_time,noise_e_time)
 42 | 
 43 |     avh.mix('sample','mix','clean','noise',0,480)
 44 | 
 45 | ####################################################################
 46 | 
 47 | # mix clean audio and noise audio
 48 | with open('sample/clean.wav', 'rb') as f:
 49 |     clean_data, clean_sr = librosa.load('sample/clean.wav', sr=None)  # time series data,sample rate
 50 | with open('sample/noise.wav', 'rb') as f:
 51 |     noise_data, noise_sr = librosa.load('sample/noise.wav', sr=None)  # time series data,sample rate
 52 | 
 53 | 
 54 | # normalize expand the noise
 55 | noise_max = np.max(noise_data)
 56 | expand_rate = 1/noise_max
 57 | noise_data = noise_data*expand_rate
 58 | 
 59 | assert clean_sr == noise_sr
 60 | mix_data = clean_data*0.8 + noise_data*0.2
 61 | mix_sr = clean_sr
 62 | 
 63 | ####################################################################
 64 | if WAVE_PLOT:
 65 |     # plot orignial wave
 66 |     size = clean_data.shape[0]
 67 |     time = np.arange(0,size)*(1.0 / clean_sr)
 68 |     plt.figure(1)
 69 |     plt.plot(time,clean_data)
 70 |     plt.xlabel("Time(s)")
 71 |     plt.ylabel("Amplitude")
 72 |     plt.title("original wavedata")
 73 |     plt.grid('on')
 74 | 
 75 |     # plot noise wave
 76 |     size = noise_data.shape[0]
 77 |     time = np.arange(0,size)*(1.0 / noise_sr)
 78 |     plt.figure(2)
 79 |     plt.plot(time,noise_data)
 80 |     plt.xlabel("Time(s)")
 81 |     plt.ylabel("Amplitude")
 82 |     plt.title("noise wavedata")
 83 |     plt.grid('on')
 84 | 
 85 |     # plot mix wave
 86 |     # plot orignial wave
 87 |     size = mix_data.shape[0]
 88 |     time = np.arange(0,size)*(1.0 / mix_sr)
 89 |     plt.figure(3)
 90 |     plt.plot(time,mix_data)
 91 |     plt.xlabel("Time(s)")
 92 |     plt.ylabel("Amplitude")
 93 |     plt.title("mixed wavedata")
 94 |     plt.grid('on')
 95 |     plt.show()
 96 | ######################################################################
 97 | if DUMP:
 98 |     if(os.path.isfile('mix') == False):
 99 |         os.system('mkdir mix')
100 |     wavfile.write("mix/mix.wav",mix_sr,mix_data)
101 | 
102 | #####################################################################
# convert time data to frequency data

# fft windowing parameter #
fft_size = 1024
step_size = fft_size // 3 # distance to slide along the window

# frequency to mel parameter #
n_mels = 40 # number of mel frequency
start_freq = 0.0
end_freq = 8000.0

# implement mel to time (just to check inverse function)
if INVERSE_CHECK:
    # round trip: time -> mel -> time, then plot and dump the result
    M = avp.time_to_mel(mix_data,mix_sr,fft_size,n_mels,step_size)
    T = avp.mel_to_time(M,mix_sr,fft_size,n_mels,step_size)
    plt.figure()
    plt.plot(T)
    plt.show()
    # scale by the absolute peak so that neither polarity overflows int16;
    # an all-zero signal is written unchanged
    peak = np.max(np.abs(T))
    Tint = T/peak*32767 if peak > 0 else T
    # the output directory may not exist yet when DUMP is disabled
    os.makedirs('mix', exist_ok=True)
    wavfile.write("mix/test.wav",mix_sr,Tint.astype('int16'))
123 | 
124 | ###################################################################
#split data

# network input: mel-domain spectrum of the mixture, real/imag interleaved
mel_mix_data = avp.time_to_mel(mix_data,mix_sr,fft_size,n_mels,step_size)
D_X = avp.real_imag_expand(mel_mix_data)

# reference: mel-domain spectrum of the clean speech, same layout
mel_clean_data = avp.time_to_mel(clean_data,clean_sr,fft_size,n_mels,step_size,fmax=8000)
D_y = avp.real_imag_expand(mel_clean_data)

# first 90% of the frames -> train/validation, last 10% -> test
n_head_X = int(D_X.shape[0]*0.9)
n_head_y = int(D_y.shape[0]*0.9)
mix_head, mix_tail = D_X[:n_head_X,:], D_X[n_head_X:,:]
clean_head, clean_tail = D_y[:n_head_y,:], D_y[n_head_y:,:]

# separate data to train test sets
# target = element-wise ratio clean/mix, scaled by the largest training ratio
# NOTE(review): a zero in the mixture features makes this ratio inf/nan - confirm inputs
D_X_train = avp.min_max_norm(mix_head)
D_y_train = clean_head / mix_head
G_max = np.max(D_y_train)
D_y_train = D_y_train/G_max

X_test = avp.min_max_norm(mix_tail)
y_test = clean_tail / mix_tail
y_test = y_test/G_max

X_train, X_val, y_train, y_val = train_test_split(D_X_train, D_y_train, test_size=0.15, random_state=87)
144 | 
145 | # Denoise autoencoder model #
146 | 
147 | ## import keras modules
148 | from keras.layers import BatchNormalization,Dropout,Dense,Input,LeakyReLU
149 | from keras import backend as K
150 | from keras.callbacks import ModelCheckpoint,TensorBoard
151 | from keras.models import Model
152 | from keras.utils import plot_model
153 | from keras.initializers import he_normal
154 | from keras.models import model_from_json
155 | from keras import optimizers
156 | 
if TRAIN_DENOISE:
    # Build, train, evaluate and save the fully connected denoising network.
    n_input_dim = X_train.shape[1]
    n_output_dim = y_train.shape[1]

    n_hidden1 = 2049
    n_hidden2 = 500
    n_hidden3 = 180

    # hidden stack, in order: (units, layer name, he_normal seed, dropout rate)
    # a dropout rate of None means no Dropout layer after the batch norm
    hidden_specs = [
        (n_hidden1, "H1", 27, 0.1),
        (n_hidden2, "H2", 42, None),
        (n_hidden3, "H3", 65, None),
        (n_hidden2, "H2_R", 42, None),
        (n_hidden1, "H1_R", 27, 0.1),
    ]

    InputLayer1 = Input(shape=(n_input_dim,), name="InputLayer")
    net = BatchNormalization(axis=1, momentum=0.6)(InputLayer1)

    # every hidden block is Dense(relu) -> BatchNormalization [-> Dropout]
    for units, layer_name, seed, drop_rate in hidden_specs:
        net = Dense(units, name=layer_name, activation='relu', kernel_initializer=he_normal(seed=seed))(net)
        net = BatchNormalization(axis=1, momentum=0.6)(net)
        if drop_rate is not None:
            net = Dropout(drop_rate)(net)

    # linear output layer
    OutputLayer = Dense(n_output_dim, name="OutputLayer", kernel_initializer=he_normal(seed=62))(net)

    model = Model(inputs=[InputLayer1], outputs=[OutputLayer])
    opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, decay=0.0001, amsgrad=False)
    model.compile(loss='mse', optimizer=opt)

    plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
    model.summary()

    tensorboard = TensorBoard(log_dir="./logs", histogram_freq=0, write_graph=True, write_images=True)
    # fit the model
    hist = model.fit(X_train, y_train, batch_size=512, epochs=100, verbose=1, validation_data=(X_val, y_val),
                     callbacks=[tensorboard])

    # training / validation loss curves
    plt.figure(figsize=(10, 8))
    plt.plot(hist.history['loss'], label='Loss')
    plt.plot(hist.history['val_loss'], label='Val_Loss')
    plt.legend(loc='best')
    plt.title('Training Loss and Validation Loss')
    plt.show()

    # the whole test set is evaluated as a single batch
    results = model.evaluate(X_test, y_test, batch_size=len(y_test))
    # '%.3f' prints three decimals ('%3f' only set a minimum field width)
    print('Test loss:%.3f' % results)

    # serialize model to JSON
    model_json = model.to_json()
    avh.mkdir('model')
    with open("model/model.json", 'w') as f:
        f.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model/model.h5")
    print("Saved model to disk")
218 | 
if DENOISE:
    # load json and create model
    with open('model/model.json','r') as f:
        loaded_model_json = f.read()
    denoise_model = model_from_json(loaded_model_json)
    denoise_model.load_weights("model/model.h5")
    print("Loaded model from disk")

    # The network is fitted on min-max normalised features (see D_X_train),
    # so the same scaling is applied to the input before prediction.
    # The prediction is un-scaled with G_max to recover the gain.
    gain = denoise_model.predict(avp.min_max_norm(D_X)) * G_max
    # even columns hold the real part, odd columns the imaginary part
    M_gain = gain[:,::2]+1j*gain[:,1::2]
    F_gain = avp.mel2freq(M_gain,mix_sr,fft_size,n_mels)

    # apply the gain to the STFT of the mixture and go back to the time domain
    F = F_gain * avp.stft(mix_data,fft_size,step_size)
    print("shape of F_out:",F.shape)
    T = avp.istft(F,fft_size,step_size)

    # write the result
    # scale by the absolute peak so that a dominant negative peak cannot
    # overflow int16; an all-zero signal is written unchanged
    peak = np.max(np.abs(T))
    Tint = T/peak*32767 if peak > 0 else T
    wavfile.write("Denoise_reconstruction.wav",mix_sr,Tint.astype('int16'))
239 | 
240 | 
241 | 
242 | 
243 | 
244 | 
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 


--------------------------------------------------------------------------------
/lib/AVHandler.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import librosa
 3 | import scipy.io.wavfile as wavfile
 4 | import numpy as np
 5 | # A file for downloading files and handling audio and video 
 6 | 
 7 | # command line functions #
 8 | 
 9 | 
def mkdir(dir_name,loc=''):
    # make a directory (missing parent directories are created too)
    # dir_name  | name of the directory
    # loc       | the location for the directory to be created
    #             ('' means the current working directory)
    #
    # os.makedirs is used instead of a shell "cd ...; mkdir ..." string so
    # that a non-empty loc works, paths with spaces are handled, and an
    # already existing directory is not an error.
    os.makedirs(os.path.join(loc, dir_name), exist_ok=True)
19 | 
20 | 
def download(loc,name,link,type='audio'):
    # download audio/video from the link
    # loc   | the location for downloaded file
    # name  | the name for the file
    # link  | the link to downloaded by youtube-dl
    # type  | the type of downloaded file (only 'audio' is handled;
    #         any other value does nothing)
    from shlex import quote  # local import keeps this function self-contained

    if type == 'audio':
        # Every argument is shell-quoted: URLs routinely contain '&' and '?',
        # which the shell would otherwise treat as control/glob characters.
        raw_file = quote('o%s.wav' % name)
        out_file = quote('%s.wav' % name)
        # download wav file from the youtube link
        command = 'cd %s;' % quote(loc)
        command += 'youtube-dl -x --audio-format wav -o %s %s;' % (raw_file, quote(link))
        # convert with ffmpeg (-ar 48000 -ac 1), then drop the raw download
        command += 'ffmpeg -i %s -ar 48000 -ac 1 %s;' % (raw_file, out_file)
        command += 'rm %s;' % raw_file
        os.system(command)
35 | 
36 | 
def cut(loc,name,start_time,end_time):
    # trim the audio/video by sox, replacing <name>.wav in place
    # loc         | the location of the file
    # name        | the name of file to trim
    # start_time  | the start time of the audio segment
    # end_time    | the end time of the audio segment
    from shlex import quote  # local import keeps this function self-contained

    length = end_time - start_time
    src_file = quote('%s.wav' % name)
    tmp_file = quote('c_%s.wav' % name)
    # The steps are chained with '&&' so the original file is only replaced
    # when sox succeeded; a failed trim no longer deletes the source.
    command = 'cd %s && ' % quote(loc)
    command += 'sox %s %s trim %s %s && ' % (src_file, tmp_file, start_time, length)
    command += 'mv -f %s %s' % (tmp_file, src_file)
    os.system(command)
49 | 
50 | 
def conc(loc,name,trim_clean=False):
    # concatenate the data in the loc (name_*.wav) into name.wav
    # loc         | the location of the files
    # name        | common prefix of the parts and name of the output
    # trim_clean  | delete the name_*.wav parts afterwards or not
    from shlex import quote  # local import keeps this function self-contained

    # only the prefix is quoted; the '_*.wav' tail must stay a shell glob
    parts_glob = quote(name) + '_*.wav'
    command = 'cd %s;' % quote(loc)
    command += 'sox --combine concatenate %s %s;' % (parts_glob, quote('%s.wav' % name))
    if trim_clean:
        # remove only the parts: a 'name*.wav' pattern would also match the
        # freshly written name.wav
        command += 'rm %s;' % parts_glob
    os.system(command)
58 | 
59 | 
def mix(loc,name,file1,file2,start,end,trim_clean=False):
    # mix the audio/video and write <loc>/<name>.wav
    # loc         | location of the mix files
    # name        | output name of wav
    # file1       | first file to mix
    # file2       | second file to mix
    # start       | mixture starting time
    # end         | mixture end time
    # trim_clean  | delete the trim file or not
    # raises ValueError when the two files have different sample rates

    # cut() trims both inputs in place (file1.wav / file2.wav are overwritten)
    cut(loc,file1,start,end)
    cut(loc,file2,start,end)
    trim1 = '%s/%s.wav' % (loc,file1)
    trim2 = '%s/%s.wav' % (loc,file2)
    wav1, wav1_sr = librosa.load(trim1, sr=None)  # time series data,sample rate
    wav2, wav2_sr = librosa.load(trim2, sr=None)

    if wav1_sr != wav2_sr:
        raise ValueError('sample rate mismatch: %s vs %s' % (wav1_sr, wav2_sr))

    # compress the audio to same volume level: scale each signal by its
    # absolute peak (a silent signal is left as is instead of dividing by 0)
    peak1 = np.max(np.abs(wav1))
    peak2 = np.max(np.abs(wav2))
    if peak1 > 0:
        wav1 = wav1 / peak1
    if peak2 > 0:
        wav2 = wav2 / peak2

    # truncate to the common length so the sum is defined even when the
    # trimmed files differ by a few samples
    n_samples = min(len(wav1), len(wav2))
    mix_wav = wav1[:n_samples]*0.5+wav2[:n_samples]*0.5

    path = '%s/%s.wav' % (loc,name)
    wavfile.write(path,wav1_sr,mix_wav)
    if trim_clean:
        # remove trim_<file>.wav leftovers if any exist; cut() trims in place,
        # so normally there is nothing to delete
        for stem in (file1, file2):
            try:
                os.remove(os.path.join(loc, 'trim_%s.wav' % stem))
            except FileNotFoundError:
                pass
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/lib/AVPreprocess.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import librosa
 3 | 
 4 | # windowing fft/ifft function
 5 | def stft(data, fft_size, step_size):
 6 |     # short time fourier transform
 7 |     window = np.hamming(fft_size)
 8 |     win_num = (len(data) - 2 * fft_size) // step_size
 9 |     out = np.ndarray((win_num, fft_size), dtype=data.dtype)
10 |     for i in range(win_num):
11 |         left = int(i * step_size)
12 |         right = int(left + fft_size)
13 |         out[i] = data[left: right] * window
14 |     F = np.fft.rfft(out, axis=1)
15 |     return F
16 | 
def istft(F, fft_size, step_size):
    # inverse short time fourier transform (windowed overlap-add)
    # F          | complex frames, one row per window
    # fft_size   | window length in samples
    # step_size  | hop between consecutive windows in samples
    # returns    | 1-D array of length F.shape[0] * step_size + fft_size
    #
    # n=fft_size is passed explicitly: without it irfft returns
    # 2 * (bins - 1) samples, which is fft_size - 1 for an odd fft_size.
    data = np.fft.irfft(F, n=fft_size, axis=-1)
    window = np.hamming(fft_size)
    number_windows = F.shape[0]
    T = np.zeros(number_windows * step_size + fft_size)
    for i in range(number_windows):
        head = i * step_size
        # accumulate the windowed frame at its hop position
        T[head:head + fft_size] += data[i, :] * window
    return T
28 | 
29 | # combine FFT bins to mel frequency bins
def mel2freq(mel_data,sr,fft_size,n_mel,fmax=8000):
    # expand mel-band data back to FFT bins with the mel filter bank
    # mel_data  | array whose last axis has n_mel entries
    # sr        | sample rate
    # fft_size  | FFT length used for the filter bank
    # n_mel     | number of mel bands
    # fmax      | highest frequency covered by the filter bank
    #
    # keyword arguments are used because recent librosa releases no longer
    # accept these parameters positionally
    matrix = librosa.filters.mel(sr=sr, n_fft=fft_size, n_mels=n_mel, fmax=fmax)
    return np.dot(mel_data,matrix)
33 | 
def freq2mel(f_data,sr,fft_size,n_mel,fmax=8000):
    # combine FFT bins into mel bands with a sum-normalised mel filter bank
    # f_data    | array whose last axis holds the FFT bins
    # sr        | sample rate
    # fft_size  | FFT length used for the filter bank
    # n_mel     | number of mel bands
    # fmax      | highest frequency covered by the filter bank
    #
    # keyword arguments are used because recent librosa releases no longer
    # accept these parameters positionally
    pre_matrix = librosa.filters.mel(sr=sr, n_fft=fft_size, n_mels=n_mel, fmax=fmax)
    weights = pre_matrix.T
    # each filter (column) is divided by its own sum; an all-zero filter
    # keeps its zeros instead of producing 0/0 = nan
    col_sum = np.sum(weights, axis=0)
    col_sum = np.where(col_sum == 0, np.ones_like(col_sum), col_sum)
    matrix = weights / col_sum
    return np.dot(f_data,matrix)
38 | 
39 | # directly time to mel domain transformation
def time_to_mel(data,sr,fft_size,n_mel,step_size,fmax=8000):
    # time series -> STFT -> mel bands
    # data       | 1-D time series
    # sr         | sample rate
    # fft_size   | STFT window length
    # n_mel      | number of mel bands
    # step_size  | STFT hop size
    # fmax       | highest frequency of the mel filter bank
    F = stft(data,fft_size,step_size)
    # forward the caller's fmax (it used to be fixed at 8000 here)
    M = freq2mel(F,sr,fft_size,n_mel,fmax=fmax)
    return M
44 | 
def mel_to_time(M,sr,fft_size,n_mel,step_size,fmax=8000):
    # mel bands -> FFT bins -> time series (inverse path of time_to_mel)
    # M          | mel-domain frames
    # sr         | sample rate
    # fft_size   | STFT window length
    # n_mel      | number of mel bands
    # step_size  | STFT hop size
    # fmax       | highest frequency of the mel filter bank
    # forward the caller's fmax so both directions use the same filter bank
    F = mel2freq(M,sr,fft_size,n_mel,fmax=fmax)
    T = istft(F,fft_size,step_size)
    return T
49 | 
def real_imag_expand(mel_data):
    # Turn an (M, N) complex matrix into an (M, 2N) float64 matrix in which
    # column 2k holds the real part and column 2k+1 the imaginary part of
    # the original column k.
    n_rows, n_cols = mel_data.shape[0], mel_data.shape[1]
    # stacking on a new last axis puts each (real, imag) pair side by side;
    # flattening that axis into the columns yields the interleaved layout
    pairs = np.stack((np.real(mel_data), np.imag(mel_data)), axis=-1)
    return pairs.reshape(n_rows, n_cols * 2).astype(np.float64)
56 | 
57 | # normalization function
def min_max_norm(x):
    # x should be numpy M*N matrix , normalize the N axis
    # each column is mapped to [0, 1] using its own minimum and maximum;
    # a constant column (max == min) is mapped to 0 instead of 0/0 = nan
    col_min = np.min(x,axis=0)
    span = np.max(x,axis=0) - col_min
    safe_span = np.where(span == 0, np.ones_like(span), span)
    return (x - col_min) / safe_span
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/pic/networkstructure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bill9800/Speech-denoise-Autoencoder/f6537c5b32a268a2c91e564fec752d23b87563e1/pic/networkstructure.png


--------------------------------------------------------------------------------