├── .gitattributes
├── CodeTest
│   └── Test.py
├── 02.GRUTraining
│   ├── CreateTestDataset.py
│   ├── ModelTest.py
│   ├── GRUModelTest.py
│   ├── LSTMTestTraining.py
│   └── GRUTraining.py
├── .gitignore
├── README.md
└── 01.CreateNoiseAddedDataset
    └── CreateNoiseAddDataset.py

/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | 
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 | 
--------------------------------------------------------------------------------
/CodeTest/Test.py:
--------------------------------------------------------------------------------
1 | # Scratch test: load one clean voice clip and its three noise-added variants,
2 | # and check that SciPy's STFT runs on each of them.
3 | import os
4 | import scipy.io.wavfile as wav
5 | import scipy.signal as signal
6 | 
7 | humanVoice = os.getcwd() + "/abjones_1_01_voice.wav"
8 | whiteNoise = os.getcwd() + "/abjones_1_01_wnoise.wav"
9 | brownNoise = os.getcwd() + "/abjones_1_01_bnoise.wav"
10 | pinkNoise = os.getcwd() + "/abjones_1_01_pnoise.wav"
11 | 
12 | rate0, data0 = wav.read(humanVoice)
13 | rate1, data1 = wav.read(whiteNoise)
14 | rate2, data2 = wav.read(brownNoise)
15 | rate3, data3 = wav.read(pinkNoise)
16 | 
17 | _, t0, _ = signal.stft(data0, fs = 16000, nperseg = 1024, return_onesided = True)
18 | _, t1, _ = signal.stft(data1, fs = 16000, nperseg = 1024, return_onesided = True)
19 | _, t2, _ = signal.stft(data2, fs = 16000, nperseg = 1024, return_onesided = True)
20 | _, t3, _ = signal.stft(data3, fs = 16000, nperseg = 1024, return_onesided = True)
21 | 
22 | print("END")
--------------------------------------------------------------------------------
/02.GRUTraining/CreateTestDataset.py:
--------------------------------------------------------------------------------
1 | # Code By adityatb at https://github.com/adityatb/noise-reduction-using-rnn
2 | # Create the test dataset by sampling n files and moving them to the test directory.
3 | # Maintained by ShYy, 2018. Both ./Training/NoiseAdded/ and ./Testing/NoiseAdded/ must already exist.
4 | 
5 | import os
6 | import random
7 | import shutil
8 | 
9 | move_no_files = 1000  # number of noise-added files to move into the test set
10 | 
11 | work_path = os.getcwd()+"/Training/NoiseAdded/"
12 | test_files_path = os.getcwd()+"/Testing/NoiseAdded/"
13 | 
14 | src_files = os.listdir(work_path)
15 | 
16 | def valid_path(dir_path, filename):
17 |     full_path = os.path.join(dir_path, filename)
18 |     return os.path.isfile(full_path)
19 | 
20 | files = [os.path.join(work_path, f) for f in src_files if valid_path(work_path, f)]
21 | choices = random.sample(files, move_no_files)
22 | for chosen in choices:
23 |     shutil.move(chosen, test_files_path)
24 |     print("Moved: " + str(chosen))
25 | 
26 | print("\nFinished!")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask instance folder
57 | instance/
58 | 
59 | # Scrapy stuff:
60 | .scrapy
61 | 
62 | # Sphinx documentation
63 | docs/_build/
64 | 
65 | # PyBuilder
66 | target/
67 | 
68 | # IPython Notebook
69 | .ipynb_checkpoints
70 | 
71 | # pyenv
72 | .python-version
73 | 
74 | # celery beat schedule file
75 | celerybeat-schedule
76 | 
77 | # dotenv
78 | .env
79 | 
80 | # virtualenv
81 | venv/
82 | ENV/
83 | 
84 | # Spyder project settings
85 | .spyderproject
86 | 
87 | # Rope project settings
88 | .ropeproject
89 | 
90 | # =========================
91 | # Operating System Files
92 | # =========================
93 | 
94 | # OSX
95 | # =========================
96 | 
97 | .DS_Store
98 | .AppleDouble
99 | .LSOverride
100 | 
101 | # Thumbnails
102 | ._*
103 | 
104 | # Files that might appear in the root of a volume
105 | .DocumentRevisions-V100
106 | .fseventsd
107 | .Spotlight-V100
108 | .TemporaryItems
109 | .Trashes
110 | .VolumeIcon.icns
111 | 
112 | # Directories potentially created on remote AFP share
113 | .AppleDB
114 | .AppleDesktop
115 | Network Trash Folder
116 | Temporary Items
117 | .apdisk
118 | 
119 | # Windows
120 | # =========================
121 | 
122 | # Windows image file caches
123 | Thumbs.db
124 | ehthumbs.db
125 | 
126 | # Folder config file
127 | Desktop.ini
128 | 
129 | # Recycle Bin used on file shares
130 | $RECYCLE.BIN/
131 | 
132 | # Windows Installer files
133 | *.cab
134 | *.msi
135 | *.msm
136 | *.msp
137 | 
138 | # Windows shortcuts
139 | *.lnk
140 | 
141 | # Database files
142 | # =========================
143 | 
144 | # Audio files
145 | *.wav
146 | 
147 | # .idea
148 | # =========================
149 | 
150 | .idea/
151 | 
152 | # Training Checkpoint
153 | # =========================
154 | 
155 | TF_Checkpoints/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NoiseReductionUsingGRU
2 | Graduation Project Title: Noise Reduction Using GRU (RNN).
3 | 
4 | This is my graduation project at BIT, 2018.
5 | 
6 | The project is based on Python 3.5 and TensorFlow 1.8.
7 | It builds on [Noise Reduction using RNNs with Tensorflow](https://github.com/adityatb/noise-reduction-using-rnn) by adityatb.
8 | Thanks for [adityatb's work](https://github.com/adityatb/noise-reduction-using-rnn)! adityatb used LSTM to build a noise reduction model.
9 | 
10 | With the development of RNNs, I think GRU may be a better method.
11 | This graduation project uses the [MIR-1K](https://sites.google.com/site/unvoicedsoundseparation/mir-1k) dataset and tries to use GRU to build a noise reduction model.
12 | 
13 | ## Introduction
14 | This project includes 2 main parts.
15 | 1. CreateNoiseAddedDataset
16 |    This script adds 3 kinds of noise (Brownian, Pink and White) to the clean human voice.
17 |    If you have 1000 human voice files (MIR-1K has 1000 usable files), you will get 3000 noise-added files.
18 | 
19 | 2. GRUTraining
20 |    This is a big part including 5 scripts.
21 |    1. CreateTestDataset.py
22 |       In the last step, we got 3000 files.
23 |       This script randomly separates 1000 of them to be used in the test process.
24 | 
25 |    2. LSTMTestTraining.py
26 |       This script and the next one aim to check the LSTM model's performance,
27 |       so we can see whether the GRU model works better or not.
28 | 
29 |       adityatb used an LSTM model, but the original code had a lot of problems:
30 |       > Python 2.x -> Python 3.x
31 |       > Array overflow problems
32 |       > Wrong usage of the Fourier transform
33 |       > No automatic learning rate adjustment
34 |       > ...
35 |       I solved a lot of them. It is hard to introduce them all here;
36 |       please refer to the git commit history.
37 | 
38 |    3. ModelTest.py
39 |       This script uses the LSTM model trained by the last script and processes the 1000 test sounds.
40 |       Listen to and compare the spectra of the clear voice, the noise-added sound and the output to see the performance.
41 | 
42 |    4. GRUTraining.py
43 |       This script is modified from `LSTMTestTraining.py`.
44 |       Changing LSTM to GRU in the TensorFlow framework is very easy (see the sketch at the end of this README).
45 | 
46 |    5. GRUModelTest.py
47 |       This script is modified from `ModelTest.py` in the same way.
48 | 
49 | ## Execute Steps
50 | **NOTICE: For the details of the implementation, please read the code and refer to the modification history.**
51 | 1. CreateNoiseAddedDataset
52 |    Download the [MIR-1K](https://sites.google.com/site/unvoicedsoundseparation/mir-1k) dataset and decompress the folder `Wavfile`. We will get 1000 wav files.
53 |    Put all the wav files into the `Wavs` folder.
54 |    Run `CreateNoiseAddDataset.py`.
55 |    We will get 1000 human voice files in `./Training/HumanVoices`
56 |    and 3000 noise-added sounds in `./Training/NoiseAdded`.
57 | 
58 | 2. GRUTraining
59 |    1. CreateTestDataset
60 |       Move the `./Training` folder into this part's folder.
61 |       Run `CreateTestDataset.py`.
62 |       We will get 1000 randomly chosen noise-added files in `./Testing/NoiseAdded`,
63 |       and 2000 files will be left in `./Training/NoiseAdded`.
64 | 
65 |    2. GRUTraining
66 |       Run `GRUTraining.py`.
67 |       This script trains a GRU model from the remaining 2000 files and the corresponding pure human voice files.
68 |       Finally we will get TensorFlow checkpoint files in the `./TF_Checkpoints` folder.
69 | 
70 |    3. GRUModelTest
71 |       Run `GRUModelTest.py`.
72 |       We will get less than 1000 files (because some noise-added files correspond to the same pure human voice file) in the `./Testing/ModelOutput` folder.
73 |       You can test the model or do anything else you like.
74 | 
75 | ***FINISH!***
76 | 
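77 | ## Appendix: The LSTM -> GRU Swap
78 | As noted in the introduction, switching the cell type is essentially a one-line change in the
79 | TF 1.x `contrib` API. A minimal sketch (the cell size follows the `stft_size = 1024` used by
80 | the training scripts; everything else, such as `dynamic_rnn`, the MSE loss and the optimizer, stays the same):
81 | ```python
82 | import tensorflow as tf
83 | 
84 | stft_size = 1024
85 | # LSTM cell, as built in LSTMTestTraining.py:
86 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(stft_size, forget_bias=1.0, state_is_tuple=True)
87 | # GRU replacement, as built in GRUTraining.py:
88 | gru_cell = tf.contrib.rnn.GRUCell(stft_size)
89 | ```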
--------------------------------------------------------------------------------
/01.CreateNoiseAddedDataset/CreateNoiseAddDataset.py:
--------------------------------------------------------------------------------
1 | # Code by ShYy
2 | 
3 | # This script uses the MIR-1K dataset to create a noise-added dataset, MIR-1K-NA.
4 | 
5 | import os, random
6 | import numpy as np
7 | import scipy.io.wavfile as wav
8 | 
9 | 
10 | # Mix audio and noise as 75% audio + 25% noise
11 | def mix_audio(data, noise):
12 |     mix = np.add(0.75*data, 0.25*noise)
13 |     out = np.array(mix)
14 | 
15 |     return out
16 | 
17 | 
18 | # Pick a random start point within the first 15s of the noise audio. The human voice waves are all shorter than 15s, so the piece never runs past the end of the 30s noise.
19 | def create_noise_piece(noisedata, data):
20 |     datalength = len(data)
21 |     startpoint = random.randrange(0, 240000)  # 30s of noise has 480000 sampled points at 16kHz; 240000 is the first 15s
22 | 
23 |     outputdata = noisedata[startpoint:startpoint + datalength]
24 | 
25 |     return outputdata
26 | 
27 | 
28 | # ----- Main Function Start -----
29 | 
30 | # Directories
31 | wavsDir = os.getcwd() + "/Wavs/"
32 | noiseDir = os.getcwd() + "/Noises/"
33 | noiseAddedDir = os.getcwd() + "/Training/NoiseAdded/"
34 | humanVoiceDir = os.getcwd() + "/Training/HumanVoices/"
35 | 
36 | # Noise file names
37 | whiteNoise = "WhiteNoise.wav"
38 | brownianNoise = "BrownianNoise.wav"
39 | pinkNoise = "PinkNoise.wav"
40 | 
41 | # The length of the noise files in samples. All three noise files are 30s long.
42 | os.chdir(noiseDir)
43 | _, tempNoiseData = wav.read(whiteNoise)  # wav.read returns (rate, data); keep only the data
44 | noiseLength = len(tempNoiseData)
45 | 
46 | # Get noises
47 | wNoiseRate, wNoise = wav.read(whiteNoise)
48 | bNoiseRate, bNoise = wav.read(brownianNoise)
49 | pNoiseRate, pNoise = wav.read(pinkNoise)
50 | 
51 | # Define mixture arrays
52 | WhiteNoiseMix = []
53 | BrownianNoiseMix = []
54 | PinkNoiseMix = []
55 | 
56 | # File counter for debugging
57 | fileCounter = 0
58 | 
59 | # Enter the /Wavs/ dir to start the mixing process
60 | os.chdir(wavsDir)
61 | 
62 | # Each wave file has two channels: left is music, right is human voice.
63 | # For each wave file, take the right channel and mix it with the three types of noise.
64 | for fileName in os.listdir(wavsDir):
65 |     if fileName.endswith(".wav"):
66 |         # Read the wave file
67 |         wavFileRate, wavFile = wav.read(fileName)
68 | 
69 |         # Use the right channel
70 |         rightChannel = wavFile[:, 1]
71 |         humanVoice = np.array(rightChannel)
72 | 
73 |         # Normalize the human voice channel to the full int16 range.
74 |         humanVoicePeak = max(abs(humanVoice))
75 |         wavFilePeak = np.iinfo(wavFile.dtype).max
76 |         gain = float(wavFilePeak)/humanVoicePeak
77 |         humanVoiceNormalized = np.array(humanVoice * gain)
78 | 
79 |         # Create mixtures
80 |         print("Mixing " + fileName + " with Noises...")
81 |         WhiteNoiseMix = np.array(mix_audio(humanVoiceNormalized, create_noise_piece(wNoise, humanVoiceNormalized)))
82 |         BrownianNoiseMix = np.array(mix_audio(humanVoiceNormalized, create_noise_piece(bNoise, humanVoiceNormalized)))
83 |         PinkNoiseMix = np.array(mix_audio(humanVoiceNormalized, create_noise_piece(pNoise, humanVoiceNormalized)))
84 | 
85 |         # Write the mixture audio files into the training directory.
86 |         os.chdir(noiseAddedDir)
87 |         fName, extendName = os.path.splitext(fileName)
88 |         wav.write(fName + "_wnoise" + extendName, wavFileRate, WhiteNoiseMix.astype(np.int16))
89 |         wav.write(fName + "_bnoise" + extendName, wavFileRate, BrownianNoiseMix.astype(np.int16))
90 |         wav.write(fName + "_pnoise" + extendName, wavFileRate, PinkNoiseMix.astype(np.int16))
91 | 
92 |         # Write the normalized human voice to its own directory, paired with the noise-added audios
93 |         os.chdir(humanVoiceDir)
94 |         wav.write(fName + "_voice" + extendName, wavFileRate, humanVoiceNormalized.astype(np.int16))
95 | 
96 |         # Done; go back to the Wavs directory
97 |         print("Finished Processing: " + fileName)
98 |         os.chdir(wavsDir)
99 | 
100 |         # Count processed files
101 |         fileCounter = fileCounter + 1
102 | 
103 | print("Total Processed: " + str(fileCounter) + " file(s).")
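104 | 
105 | # Optional sanity check (a minimal sketch; "abjones_1_01" is just one known MIR-1K clip,
106 | # so adjust the name to any file you actually generated). It verifies that a mixed file
107 | # and its clean counterpart share the same sample rate and length.
108 | voiceCheck = humanVoiceDir + "abjones_1_01_voice.wav"
109 | mixCheck = noiseAddedDir + "abjones_1_01_wnoise.wav"
110 | if os.path.exists(voiceCheck) and os.path.exists(mixCheck):
111 |     vRate, vData = wav.read(voiceCheck)
112 |     mRate, mData = wav.read(mixCheck)
113 |     assert vRate == mRate and len(vData) == len(mData), "voice/mixture mismatch"
114 |     print("Sanity check passed: sample rates and lengths match.")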
--------------------------------------------------------------------------------
/02.GRUTraining/ModelTest.py:
--------------------------------------------------------------------------------
1 | # This script tests the model using the test dataset separated from the source dataset by CreateTestDataset.py.
2 | # Code by ShYy, 2018.
3 | 
4 | import scipy
5 | import scipy.signal as signal
6 | import numpy as np
7 | import os
8 | import random
9 | import sys
10 | import scipy.io.wavfile as wav
11 | import tensorflow as tf
12 | import math
13 | 
14 | # Get the source human voice file name from a noise-added file name (strip "_xnoise.wav", append "_voice.wav").
15 | def formatSrcFilename(filename):
16 |     return filename[:len(filename) - 11] + "_voice.wav"
17 | 
18 | def formatOutputFilename(filename):
19 |     return filename[:len(filename) - 11] + "_output.wav"
20 | 
21 | def sequentialized_spectrum(batch):
22 |     # Get maximum length of batch
23 |     t = []
24 |     t_vec = []
25 |     Sxx_Vec = []
26 |     for each in batch:
27 |         _, t, Sxx_Vec_Temp = signal.stft(each, fs=testNARateRepository[0], nperseg=stft_size, return_onesided = False)
28 |         t_vec.append(t)
29 |         Sxx_Vec.append(Sxx_Vec_Temp)
30 |     maximum_length = findMaxlen(t_vec)
31 | 
32 |     max_run_total = int(math.ceil(float(maximum_length) / sequence_length))
33 |     final_data = np.zeros([len(batch), max_run_total, stft_size, sequence_length])
34 |     true_time = np.zeros([len(batch), max_run_total])
35 | 
36 |     # Read in a file and compute its spectrum
37 |     # for batch_idx, each_set in enumerate(batch):
38 |     for batch_idx, Sxx in enumerate(Sxx_Vec):
39 |         # f, t, Sxx = signal.stft(each_set, fs=rate_repository[0], nperseg=stft_size, return_onesided = False)
40 | 
41 |         # Magnitude (real part) spectra
42 |         Mag = Sxx.real
43 |         t = t_vec[batch_idx]
44 |         # Phase = Sxx.imag
45 | 
46 |         # Break up the spectrum into sequence_length sized chunks
47 |         run_full_steps = float(len(t)) / sequence_length
48 |         run_total = int(math.ceil(run_full_steps))
49 | 
50 |         # Run a loop long enough to break up all the data in the file into chunks of sequence_length
51 |         for step in range(run_total):
52 | 
53 |             begin_point = step * sequence_length
54 |             end_point = begin_point + sequence_length
55 | 
56 |             m, n = Mag[:, begin_point:end_point].shape
57 | 
58 |             # Store each chunk sequentially in a new array, zero padding the last chunk of the file
59 |             if n == sequence_length:
60 |                 final_data[batch_idx, step, :, :] = np.copy(Mag[:, begin_point:end_point])
61 |                 true_time[batch_idx, step] = n
62 |             else:
63 |                 final_data[batch_idx, step, :, :] = np.copy(create_final_sequence(Mag[:, begin_point:end_point], sequence_length))
64 |                 true_time[batch_idx, step] = n
65 | 
66 |     final_data = np.transpose(final_data, (0, 1, 3, 2))
67 | 
68 |     return final_data, true_time, maximum_length
69 | 
70 | def findMaxlen(data_vec):
71 |     max_ = 0
72 |     for each in data_vec:
73 |         if len(each) > max_:
74 |             max_ = len(each)
75 |     return max_
76 | 
77 | def create_final_sequence(sequence, max_length):
78 |     a, b = sequence.shape
79 |     extra_len = max_length - b
80 |     null_mat = np.zeros((len(sequence), extra_len), dtype=np.float32)
81 |     sequence = np.concatenate((sequence, null_mat), axis=1)
82 |     return sequence
83 | 
84 | # Directories
85 | humanVoice = os.getcwd() + "/Training/HumanVoices/"
86 | testData = os.getcwd() + "/Testing/NoiseAdded/"
87 | modelOutput = os.getcwd() + "/Testing/ModelOutput/"
88 | graphPath = os.getcwd() + "/TF_Checkpoints/FINAL.ckpt"
89 | 
90 | # Number of test files
91 | testFileNum = 0
92 | 
93 | # File Lists
94 | testNAFileList = [] # Test dataset: noise-added file list.
95 | srcHVFileList = [] # Source human voice file list.
96 | outputFileList = [] # Output file list.
97 | 
98 | # File Repositories
99 | testNARateRepository = []
100 | testNADataRepository = []
101 | srcHVRateRepository = []
102 | srcHVDataRepository = []
103 | 
104 | norm_factor = (1.0 / 32768.0) # Map int16 samples into the -1 ~ 1 range for the LSTM
105 | 
106 | # Walk all test NA files into the file list and repository.
107 | for root, _, files in os.walk(testData):
108 |     files = sorted(files)
109 |     testFileNum = len(files)
110 | 
111 |     for f in files:
112 |         if f.endswith(".wav"):
113 |             testNAFileList.append(f)
114 |             rate, data = wav.read(os.path.join(root, f))
115 |             testNARateRepository.append(rate)
116 |             testNADataRepository.append(data * norm_factor)
117 | 
118 | srcHVFileList = list(map(formatSrcFilename, testNAFileList))
119 | outputFileList = list(map(formatOutputFilename, testNAFileList))
120 | 
121 | # Walk all source HV files into the repository.
122 | for root, _, files in os.walk(humanVoice):
123 |     files = sorted(files)
124 | 
125 |     for f in files:
126 |         if(f.endswith(".wav")):
127 |             for name in srcHVFileList:
128 |                 if f == name:
129 |                     rate, data = wav.read(os.path.join(root, f))
130 |                     srcHVRateRepository.append(rate)
131 |                     srcHVDataRepository.append(data * norm_factor)
132 | 
133 | # STFT process variables, also used by the LSTM
134 | sequence_length = 100
135 | stft_size = 1024
136 | batch_size = 1 # Process one wav file at a time.
137 | 
138 | # Tensorflow vars + Graph and LSTM Params
139 | input_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
140 | # clean_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
141 | sequence_length_tensor = tf.placeholder(tf.int32, [None])
142 | 
143 | # TF Graph Definition
144 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(stft_size, forget_bias = 1.0, state_is_tuple = True)
145 | # stacked_lstm = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(stft_size) for i in range(number_of_layers)])
146 | init_state = lstm_cell.zero_state(batch_size, tf.float32)
147 | rnn_outputs, final_state = tf.nn.dynamic_rnn(lstm_cell, input_data, sequence_length=sequence_length_tensor, initial_state=init_state, time_major=False)
148 | # mse_loss = tf.losses.mean_squared_error(rnn_outputs, clean_data)
149 | # train_optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(mse_loss)
150 | # train_optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(mse_loss)
151 | # train_optimizer = tf.train.AdagradDAOptimizer(learning_rate).minimize(mse_loss)
152 | # train_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(mse_loss)
153 | saver = tf.train.Saver()
154 | 
155 | # Initialize the TF graph and restore the trained weights
156 | init_op = tf.global_variables_initializer()
157 | gpu_options = tf.GPUOptions(allow_growth = True) # Let the session grow GPU memory as needed.
158 | sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
159 | sess.run(init_op)
160 | saver.restore(sess, graphPath)
161 | print("\t***** TF GRAPH RESTORED *****")
162 | 
163 | # Start Processing
164 | for idx in range(testFileNum):
165 |     nowNAFile = []
166 |     nowNAFile.append(testNADataRepository[idx])
167 | 
168 |     # Compute the STFT chunks of the noise-added file.
169 |     nowNAData_STFT, sequenceLengthID, maxLength = sequentialized_spectrum(nowNAFile)
170 | 
171 |     # Get the number of time steps.
172 |     maxTimeSteps = len(nowNAData_STFT[0])
173 | 
174 |     # Allocate outputData to collect rnn_outputs_value.
175 |     outputData = np.zeros([1, maxTimeSteps, stft_size, sequence_length]) # Transposed back to [0, 1, 3, 2]
176 | 
177 |     for timeStep in range(maxTimeSteps):
178 |         feed_dict = {
179 |             input_data : nowNAData_STFT[:, timeStep, :],
180 |             sequence_length_tensor : sequenceLengthID[:, timeStep]
181 |         }
182 |         final_state_value, rnn_outputs_value = sess.run([final_state, rnn_outputs], feed_dict=feed_dict)
183 | 
184 |         rnn_outputs_value = np.transpose(rnn_outputs_value, [0, 2, 1])
185 |         outputData[0][timeStep] = rnn_outputs_value
186 | 
187 |     # Assemble outputData_STFT by concatenating the chunks along the time axis.
188 |     outputData_STFT = np.zeros([stft_size, maxLength])
189 |     beginTime = 0
190 |     endTime = 0
191 |     for timeStep in range(maxTimeSteps):
192 |         if(timeStep < maxTimeSteps - 1):
193 |             endTime = beginTime + sequence_length
194 |             outputData_STFT[:, beginTime : endTime] = outputData[0, timeStep, :, :]
195 |         else:
196 |             endTime = beginTime + int(sequenceLengthID[0, timeStep])
197 |             outputData_STFT[:, beginTime : endTime] = outputData[0, timeStep, :, 0 : (endTime - beginTime)]
198 | 
199 |         beginTime = beginTime + sequence_length
200 | 
201 |     # Compute the ISTFT
202 |     _, outputData_ISTFT = signal.istft(outputData_STFT, fs=testNARateRepository[0], nperseg=stft_size, input_onesided = False)
203 | 
204 |     outputData_ISTFT = (outputData_ISTFT / norm_factor).real
205 |     outputData_ISTFT = outputData_ISTFT.astype(np.int16)
206 | 
207 |     wav.write(modelOutput + outputFileList[idx], testNARateRepository[idx], outputData_ISTFT)
208 |     print("Index: " + str(idx))
209 |     print("\tOutput File: " + str(outputFileList[idx]))
--------------------------------------------------------------------------------
/02.GRUTraining/GRUModelTest.py:
--------------------------------------------------------------------------------
1 | # This script tests the model using the test dataset separated from the source dataset by CreateTestDataset.py.
2 | # Code by ShYy, 2018.
3 | 
4 | import scipy
5 | import scipy.signal as signal
6 | import numpy as np
7 | import os
8 | import random
9 | import sys
10 | import scipy.io.wavfile as wav
11 | import tensorflow as tf
12 | import math
13 | 
14 | # Get the source human voice file name from a noise-added file name (strip "_xnoise.wav", append "_voice.wav").
15 | def formatSrcFilename(filename):
16 |     return filename[:len(filename) - 11] + "_voice.wav"
17 | 
18 | def formatOutputFilename(filename):
19 |     return filename[:len(filename) - 11] + "_output.wav"
20 | 
21 | def sequentialized_spectrum(batch):
22 |     # Get maximum length of batch
23 |     t = []
24 |     t_vec = []
25 |     Sxx_Vec = []
26 |     for each in batch:
27 |         _, t, Sxx_Vec_Temp = signal.stft(each, fs=testNARateRepository[0], nperseg=stft_size, return_onesided = False)
28 |         t_vec.append(t)
29 |         Sxx_Vec.append(Sxx_Vec_Temp)
30 |     maximum_length = findMaxlen(t_vec)
31 | 
32 |     max_run_total = int(math.ceil(float(maximum_length) / sequence_length))
33 |     final_data = np.zeros([len(batch), max_run_total, stft_size, sequence_length], dtype=np.float32)
34 |     final_data_imag = np.zeros([len(batch), max_run_total, stft_size, sequence_length], dtype=np.float32)
35 |     true_time = np.zeros([len(batch), max_run_total], dtype=np.int32)
36 | 
37 |     # Read in a file and compute its spectrum
38 |     # for batch_idx, each_set in enumerate(batch):
39 |     for batch_idx, Sxx in enumerate(Sxx_Vec):
40 |         # f, t, Sxx = signal.stft(each_set, fs=rate_repository[0], nperseg=stft_size, return_onesided = False)
41 | 
42 |         # Real and imaginary spectra
43 |         Mag = Sxx.real
44 |         Mag_Imag = Sxx.imag
45 |         t = t_vec[batch_idx]
46 |         # Phase = Sxx.imag
47 | 
48 |         # Break up the spectrum into sequence_length sized chunks
49 |         run_full_steps = float(len(t)) / sequence_length
50 |         run_total = int(math.ceil(run_full_steps))
51 | 
52 |         # Run a loop long enough to break up all the data in the file into chunks of sequence_length
53 |         for step in range(run_total):
54 | 
55 |             begin_point = step * sequence_length
56 |             end_point = begin_point + sequence_length
57 | 
58 |             m, n = Mag[:, begin_point:end_point].shape
59 | 
60 |             # Store each chunk sequentially in a new array, zero padding the last chunk of the file
61 |             if n == sequence_length:
62 |                 final_data[batch_idx, step, :, :] = np.copy(Mag[:, begin_point:end_point])
63 |                 final_data_imag[batch_idx, step, :, :] = np.copy(Mag_Imag[:, begin_point:end_point])
64 |                 true_time[batch_idx, step] = n
65 |             else:
66 |                 final_data[batch_idx, step, :, :] = np.copy(create_final_sequence(Mag[:, begin_point:end_point], sequence_length))
67 |                 final_data_imag[batch_idx, step, :, :] = np.copy(create_final_sequence(Mag_Imag[:, begin_point:end_point], sequence_length))
68 |                 true_time[batch_idx, step] = n
69 | 
70 |     final_data = np.transpose(final_data, (0, 1, 3, 2))
71 |     final_data_imag = np.transpose(final_data_imag, (0, 1, 3, 2))
72 | 
73 |     return final_data, final_data_imag, true_time, maximum_length
74 | 
75 | def findMaxlen(data_vec):
76 |     max_ = 0
77 |     for each in data_vec:
78 |         if len(each) > max_:
79 |             max_ = len(each)
80 |     return max_
81 | 
82 | def create_final_sequence(sequence, max_length):
83 |     a, b = sequence.shape
84 |     extra_len = max_length - b
85 |     null_mat = np.zeros((len(sequence), extra_len), dtype=np.float32)
86 |     sequence = np.concatenate((sequence, null_mat), axis=1)
87 |     return sequence
88 | 
89 | # Directories
90 | humanVoice = os.getcwd() + "/Training/HumanVoices/"
91 | testData = os.getcwd() + "/Testing/NoiseAdded/"
92 | modelOutput = os.getcwd() + "/Testing/ModelOutput/"
93 | graphPath_Real = os.getcwd() + "/TF_Checkpoints/FINAL_Real.ckpt"
94 | graphPath_Imag = os.getcwd() + "/TF_Checkpoints/FINAL_Imag.ckpt"
95 | 
96 | # Number of test files
97 | testFileNum = 0
98 | 
99 | # File Lists
100 | testNAFileList = [] # Test dataset: noise-added file list.
101 | srcHVFileList = [] # Source human voice file list.
102 | outputFileList = [] # Output file list.
103 | 
104 | # File Repositories
105 | testNARateRepository = []
106 | testNADataRepository = []
107 | srcHVRateRepository = []
108 | srcHVDataRepository = []
109 | 
110 | norm_factor = (1.0 / 32768.0) # Map int16 samples into the -1 ~ 1 range for the GRU
111 | 
112 | # Walk all test NA files into the file list and repository.
113 | for root, _, files in os.walk(testData):
114 |     files = sorted(files)
115 |     testFileNum = len(files)
116 | 
117 |     for f in files:
118 |         if f.endswith(".wav"):
119 |             testNAFileList.append(f)
120 |             rate, data = wav.read(os.path.join(root, f))
121 |             testNARateRepository.append(rate)
122 |             testNADataRepository.append(data * norm_factor)
123 | 
124 | srcHVFileList = list(map(formatSrcFilename, testNAFileList))
125 | outputFileList = list(map(formatOutputFilename, testNAFileList))
126 | 
127 | # Walk all source HV files into the repository.
128 | for root, _, files in os.walk(humanVoice):
129 |     files = sorted(files)
130 | 
131 |     for f in files:
132 |         if(f.endswith(".wav")):
133 |             for name in srcHVFileList:
134 |                 if f == name:
135 |                     rate, data = wav.read(os.path.join(root, f))
136 |                     srcHVRateRepository.append(rate)
137 |                     srcHVDataRepository.append(data * norm_factor)
138 | 
139 | # STFT process variables, also used by the GRU
140 | sequence_length = 100
141 | stft_size = 1024
142 | batch_size = 1 # Process one wav file at a time.
143 | number_of_layers = 3
144 | 
145 | # Tensorflow vars + Graph and GRU Params
146 | input_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
147 | # clean_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
148 | sequence_length_tensor = tf.placeholder(tf.int32, [None])
149 | 
150 | # TF Graph Definition
151 | gru_cells = [tf.contrib.rnn.GRUCell(stft_size, kernel_initializer = tf.zeros_initializer(dtype = tf.float32)) for _ in range(number_of_layers)] # One cell object per layer; a single shared instance would tie the layers' weights together.
152 | # gru_cells = [tf.contrib.rnn.DropoutWrapper(cell, dtype = tf.float32, output_keep_prob = 0.5) for cell in gru_cells] # Cancel Dropout at test time
153 | stacked_gru = tf.contrib.rnn.MultiRNNCell(gru_cells, state_is_tuple=True)
154 | init_state = stacked_gru.zero_state(batch_size, tf.float32)
155 | rnn_outputs, final_state = tf.nn.dynamic_rnn(stacked_gru, input_data, sequence_length=sequence_length_tensor, initial_state=init_state, time_major=False)
156 | # mse_loss = tf.losses.mean_squared_error(rnn_outputs, clean_data)
157 | # train_optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(mse_loss)
158 | # train_optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(mse_loss)
159 | # train_optimizer = tf.train.AdagradDAOptimizer(learning_rate).minimize(mse_loss)
160 | # train_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(mse_loss)
161 | saver = tf.train.Saver()
162 | 
163 | # Initialize the TF graph; the trained weights are restored inside the loop below
164 | init_op = tf.global_variables_initializer()
165 | gpu_options = tf.GPUOptions(allow_growth = True) # Let the session grow GPU memory as needed.
166 | sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
167 | sess.run(init_op)
168 | 
169 | # Start Processing
170 | for idx in range(testFileNum):
171 |     print("Index: " + str(idx + 1))
172 | 
173 |     # Restore the real-part graph (the imag-part restore below overwrites it, so this runs every iteration)
174 |     saver.restore(sess, graphPath_Real)
175 |     print("\t***** TF GRAPH REAL RESTORED *****")
176 | 
177 |     nowNAFile = []
178 |     nowNAFile.append(testNADataRepository[idx])
179 | 
180 |     # Compute the STFT chunks of the noise-added file.
181 |     nowNAData_STFT_Real, nowNAData_STFT_Imag, sequenceLengthID, maxLength = sequentialized_spectrum(nowNAFile)
182 | 
183 |     # Get the number of time steps.
184 |     maxTimeSteps = len(nowNAData_STFT_Real[0])
185 | 
186 |     # Allocate outputData arrays to collect rnn_outputs_value.
187 |     outputData_Real = np.zeros([1, maxTimeSteps, stft_size, sequence_length]) # Transposed back to [0, 1, 3, 2]
188 | 
189 |     for timeStep in range(maxTimeSteps):
190 |         feed_dict = {
191 |             input_data : nowNAData_STFT_Real[:, timeStep, :],
192 |             sequence_length_tensor : sequenceLengthID[:, timeStep]
193 |         }
194 |         final_state_value, rnn_outputs_value = sess.run([final_state, rnn_outputs], feed_dict=feed_dict)
195 | 
196 |         rnn_outputs_value = np.transpose(rnn_outputs_value, [0, 2, 1])
197 |         outputData_Real[0][timeStep] = rnn_outputs_value
198 | 
199 | 
200 |     # Restore the imag-part graph
201 |     saver.restore(sess, graphPath_Imag)
202 |     print("\t***** TF GRAPH IMAG RESTORED *****")
203 | 
204 |     outputData_Imag = np.zeros([1, maxTimeSteps, stft_size, sequence_length]) # Transposed back to [0, 1, 3, 2]
205 | 
206 |     for timeStep in range(maxTimeSteps):
207 |         feed_dict = {
208 |             input_data : nowNAData_STFT_Imag[:, timeStep, :],
209 |             sequence_length_tensor : sequenceLengthID[:, timeStep]
210 |         }
211 |         final_state_value, rnn_outputs_value = sess.run([final_state, rnn_outputs], feed_dict=feed_dict)
212 | 
213 |         rnn_outputs_value = np.transpose(rnn_outputs_value, [0, 2, 1])
214 |         outputData_Imag[0][timeStep] = rnn_outputs_value
215 | 
216 | 
217 |     # outputData = np.zeros([1, maxTimeSteps, stft_size, sequence_length], dtype=np.complex128)
218 |     outputData = outputData_Real + 1j * outputData_Imag # Combine the two model outputs into one complex STFT
219 | 
220 |     # Assemble outputData_STFT by concatenating the chunks along the time axis.
221 |     outputData_STFT = np.zeros([stft_size, maxLength], dtype=np.complex128)
222 |     beginTime = 0
223 |     endTime = 0
224 |     for timeStep in range(maxTimeSteps):
225 |         if(timeStep < maxTimeSteps - 1):
226 |             endTime = beginTime + sequence_length
227 |             outputData_STFT[:, beginTime : endTime] = outputData[0, timeStep, :, :]
228 |         else:
229 |             endTime = beginTime + int(sequenceLengthID[0, timeStep])
230 |             outputData_STFT[:, beginTime : endTime] = outputData[0, timeStep, :, 0 : (endTime - beginTime)]
231 | 
232 |         beginTime = beginTime + sequence_length
233 | 
234 |     # Compute the ISTFT
235 |     _, outputData_ISTFT = signal.istft(outputData_STFT, fs=testNARateRepository[0], nperseg=stft_size, input_onesided = False)
236 | 
237 |     outputData_ISTFT = ((outputData_ISTFT / norm_factor).real) / 0.75 # Undo the 0.75 mixing gain
238 |     outputData_ISTFT = outputData_ISTFT.astype(np.int16)
239 | 
240 |     wav.write(modelOutput + outputFileList[idx], testNARateRepository[idx], outputData_ISTFT)
241 |     print("\tOutput File: " + str(outputFileList[idx]) + "\n")
--------------------------------------------------------------------------------
/02.GRUTraining/LSTMTestTraining.py:
--------------------------------------------------------------------------------
1 | # Code By adityatb at https://github.com/adityatb/noise-reduction-using-rnn
2 | # LSTM method test.
3 | # Maintained by ShYy, 2018.
4 | 
5 | import scipy
6 | import scipy.signal as signal
7 | import numpy as np
8 | import os, random, sys
9 | import scipy.io.wavfile as wav
10 | import tensorflow as tf
11 | import math
12 | 
13 | 
14 | os.environ['CUDA_VISIBLE_DEVICES'] = '2'
15 | 
16 | # Strip away the _xnoise.wav part of the filename, and append _voice.wav to obtain the clean voice counterpart
17 | def formatFilename(filename):
18 |     return filename[:len(filename) - 11] + "_voice.wav"
19 | 
20 | 
21 | 
22 | def create_final_sequence(sequence, max_length):
23 |     a, b = sequence.shape
24 |     extra_len = max_length - b
25 |     null_mat = np.zeros((len(sequence), extra_len), dtype=np.float32)
26 |     sequence = np.concatenate((sequence, null_mat), axis=1)
27 |     return sequence
28 | 
29 | 
30 | def sequentialized_spectrum(batch):
31 |     # Get maximum length of batch
32 |     t = []
33 |     t_vec = []
34 |     Sxx_Vec = []
35 |     for each in batch:
36 |         _, t, Sxx_Vec_Temp = signal.stft(each, fs=rate_repository[0], nperseg=stft_size, return_onesided = False)
37 |         t_vec.append(t)
38 |         Sxx_Vec.append(Sxx_Vec_Temp)
39 |     maximum_length = findMaxlen(t_vec)
40 | 
41 |     max_run_total = int(math.ceil(float(maximum_length) / sequence_length))
42 |     final_data = np.zeros([len(batch), max_run_total, stft_size, sequence_length])
43 |     true_time = np.zeros([len(batch), max_run_total])
44 | 
45 |     # Read in a file and compute its spectrum
46 |     # for batch_idx, each_set in enumerate(batch):
47 |     for batch_idx, Sxx in enumerate(Sxx_Vec):
48 |         # f, t, Sxx = signal.stft(each_set, fs=rate_repository[0], nperseg=stft_size, return_onesided = False)
49 | 
50 |         # Magnitude (real part) spectra
51 |         Mag = Sxx.real
52 |         t = t_vec[batch_idx]
53 |         # Phase = Sxx.imag
54 | 
55 |         # Break up the spectrum into sequence_length sized chunks
56 |         run_full_steps = float(len(t)) / sequence_length
57 |         run_total = int(math.ceil(run_full_steps))
58 | 
59 |         # Run a loop long enough to break up all the data in the file into chunks of sequence_length
60 |         for step in range(run_total):
61 | 
62 |             begin_point = step * sequence_length
63 |             end_point = begin_point + sequence_length
64 | 
65 |             m, n = Mag[:, begin_point:end_point].shape
66 | 
67 |             # Store each chunk sequentially in a new array, zero padding the last chunk of the file
68 |             if n == sequence_length:
69 |                 final_data[batch_idx, step, :, :] = np.copy(Mag[:, begin_point:end_point])
70 |                 true_time[batch_idx, step] = n
71 |             else:
72 |                 final_data[batch_idx, step, :, :] = np.copy(create_final_sequence(Mag[:, begin_point:end_point], sequence_length))
73 |                 true_time[batch_idx, step] = n
74 | 
75 |     final_data = np.transpose(final_data, (0, 1, 3, 2))
76 | 
77 |     return final_data, true_time, maximum_length
78 | 
79 | 
80 | def findMaxlen(data_vec):
81 |     max_ = 0
82 |     for each in data_vec:
83 |         if len(each) > max_:
84 |             max_ = len(each)
85 |     return max_
86 | 
87 | 
88 | # ----------------- Begin Vars --------------------- #
89 | 
90 | # Training data directories
91 | traindata = os.getcwd() + "/Training/NoiseAdded/"
92 | voicedata = os.getcwd() + "/Training/HumanVoices/"
93 | checkpoints = os.getcwd() + "/TF_Checkpoints/"
94 | 
95 | # NormConstant
96 | norm_factor = (1 / 32768.0)
97 | 
98 | # Spectrogram Parameters
99 | stft_size = 1024
100 | 
101 | # RNN Specs
102 | sequence_length = 100
103 | batch_size = 10
104 | learning_rate = 0.001
105 | epochs = 250
106 | # number_of_layers = 3
107 | 
108 | # Tensorflow vars + Graph and LSTM Params
109 | input_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
110 | clean_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
111 | sequence_length_tensor = tf.placeholder(tf.int32, [None])
112 | 
113 | # Temp data variables
114 | no_of_files = 0
115 | temp_list = []
116 | final_data = []
117 | sequence_length_id = 0
118 | 
119 | # Repositories
120 | file_repository = []
121 | rate_repository = []
122 | clean_repository = []
123 | 
124 | # Selected vectors
125 | files_vec = []
126 | clean_files_fin_vec = []
127 | clean_files_vec = []
128 | 
129 | # Graph
130 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(stft_size, forget_bias = 1.0, state_is_tuple = True)
131 | # stacked_lstm = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(stft_size) for i in range(number_of_layers)])
132 | init_state = lstm_cell.zero_state(batch_size, tf.float32)
133 | rnn_outputs, final_state = tf.nn.dynamic_rnn(lstm_cell, input_data, sequence_length=sequence_length_tensor, initial_state=init_state, time_major=False)
134 | mse_loss = tf.losses.mean_squared_error(rnn_outputs, clean_data)
135 | learning_rate_tensor = tf.placeholder(tf.float32, shape=[]) # Fed on every step, so the learning rate decay below actually reaches the optimizer.
136 | # train_optimizer = tf.train.AdagradOptimizer(learning_rate_tensor).minimize(mse_loss)
137 | # train_optimizer = tf.train.AdagradDAOptimizer(learning_rate_tensor).minimize(mse_loss)
138 | train_optimizer = tf.train.AdamOptimizer(learning_rate_tensor).minimize(mse_loss)
139 | saver = tf.train.Saver()
140 | 
141 | # ------------------- Read all data to memory creating a repository of mixture and clean files --------------------- #
142 | 
143 | os.chdir(traindata)
144 | # for file_iter in range(traindata):
145 | 
146 | # Buffer training data to memory for faster execution:
147 | for root, _, files in os.walk(traindata):
148 |     files = sorted(files)
149 |     no_of_files = len(files)
150 | 
151 |     if batch_size > no_of_files:
152 |         sys.exit("Error: batch_size cannot be more than number of files in the training directory")
153 | 
154 |     for f in files:
155 |         if f.endswith(".wav"):
156 |             temp_list.append(f)
157 |             srate, data = wav.read(os.path.join(root, f))
158 |             file_repository.append(data)
159 |             rate_repository.append(srate)
160 | 
161 | # Generate a vector of file names that are clean files
162 | clean_files_vec = list(map(formatFilename, temp_list))
163 | # clean_files_vec = list(map(None, *clean_files_vec))
164 | 
165 | # Find clean files that correspond to data in file_repository and buffer clean voice data to memory
166 | for root, _, files in os.walk(voicedata):
167 |     files = sorted(files)
168 |     for each in files:
169 |         if each.endswith(".wav"):
170 |             for name in clean_files_vec:
171 |                 if each == name:
172 |                     srate2, data2 = wav.read(os.path.join(root, name))
173 |                     clean_repository.append(data2)
174 | 
175 | # ------------------- Step 1: Prepare data in batches and perform STFTs --------------------- #
176 | 
177 | 
178 | # files_vec = []
179 | run_epochs = int((no_of_files / batch_size) * epochs)
180 | 
181 | # Initialize TF Graph
182 | init_op = tf.global_variables_initializer()
183 | gpu_options = tf.GPUOptions(allow_growth = True) # Let the session grow GPU memory as needed.
184 | sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
185 | sess.run(init_op)
186 | 
187 | globalBatchLossSum = 0 # Sum of all batch losses
188 | globalStepsSum = 0 # Sum of all steps
189 | lastCumulativeLossAvg = 100 # Last cumulative loss average; start high so the first comparison passes.
190 | 
191 | for idx in range(int(run_epochs)):
192 | 
193 |     files_vec = []
194 |     # clean_files_vec = []
195 |     clean_files_fin_vec = []
196 | 
197 |     # Select batch_size random files from file_repository and the corresponding clean files
198 |     for file_iter in range(batch_size):
199 |         i = random.randint(0, len(file_repository) - 1)
200 |         files_vec.append(file_repository[i] * norm_factor)
201 |         clean_files_fin_vec.append(clean_repository[i] * norm_factor)
202 | 
203 |     stft_batch, sequence_length_id, maximum_length = sequentialized_spectrum(files_vec)
204 |     clean_voice_batch, sequence_length_id_clean, maximum_length_clean = sequentialized_spectrum(clean_files_fin_vec)
205 | 
206 |     # ------------------- Step 2: Feed Data to Placeholders, and then, Initialise, Train and Save the Graph --------------------- #
207 | 
208 |     max_time_steps = stft_batch.shape[1]
209 |     batchLossSum = 0 # Sum of batch losses in one index.
210 | 
211 |     for time_seq in range(max_time_steps):
212 |         feed_dict = {
213 |             input_data: stft_batch[:, time_seq, :, :],
214 |             clean_data: clean_voice_batch[:, time_seq, :, :],
215 |             sequence_length_tensor: sequence_length_id[:, time_seq],
216 |             learning_rate_tensor: learning_rate}
217 |         _, loss_value, final_state_value, rnn_outputs_val = sess.run([train_optimizer, mse_loss, final_state, rnn_outputs], feed_dict=feed_dict)
218 | 
219 |         # print("Index " + str(idx + 1) + " in " + str(run_epochs))
220 |         # print("\tOutput Min:\t" + str(np.min(rnn_outputs_val)))
221 |         # print("\tClean Min:\t" + str(np.min(clean_voice_batch[:, time_seq, :, :])))
222 |         # print("\tOutput Max:\t" + str(np.max(rnn_outputs_val)))
223 |         # print("\tClean Max:\t" + str(np.max(clean_voice_batch[:, time_seq, :, :])))
224 |         # print("\tBatch Loss:\t" + str(loss_value * 32768)) # Multiply by 32768 to put the batch loss back on the int16 scale.
225 |         batchLossSum = batchLossSum + loss_value
226 | 
227 |     print("\t\tIndex " + str(idx + 1) + " Batch Loss Avg:\t" + str(batchLossSum / max_time_steps / norm_factor) + "\n")
228 | 
229 |     globalBatchLossSum = globalBatchLossSum + batchLossSum
230 |     globalStepsSum = globalStepsSum + max_time_steps
231 | 
232 |     if (int((idx + 1) % no_of_files) == 0):
233 |         # Divide the sum of all batch losses by the global step count to get the average
234 |         cumulativLossAvg = globalBatchLossSum / globalStepsSum
235 |         print("\n\t\tCumulative epochs loss Avg in latest " + str(idx + 1) + " indexes:\t" + str(cumulativLossAvg / norm_factor))
236 |         if(cumulativLossAvg <= lastCumulativeLossAvg):
237 |             lastCumulativeLossAvg = cumulativLossAvg # The loss is still decreasing: keep the learning rate.
238 |         else:
239 |             learning_rate = learning_rate / 5 # The loss went up: cut the learning rate to 1/5 (fed to the graph on the next step).
240 |             lastCumulativeLossAvg = cumulativLossAvg
241 |             print("\n\t\tLearning Rate changed to: " + str(learning_rate))
242 |         globalBatchLossSum = 0 # Reset to 0 for the next round of batch loss accumulation
243 |         globalStepsSum = 0
244 | 
245 |         os.chdir(checkpoints)
246 |         saver.save(sess, './ssep_model.ckpt', global_step=idx)
247 |         print("\t\tSaved checkpoint\n")
248 |         os.chdir(traindata)
249 | 
250 | os.chdir(checkpoints)
251 | saver.save(sess, './FINAL.ckpt')
252 | print("Saved FINAL")
253 | sess.close()
--------------------------------------------------------------------------------
/02.GRUTraining/GRUTraining.py:
--------------------------------------------------------------------------------
1 | # Code By adityatb at https://github.com/adityatb/noise-reduction-using-rnn
2 | # GRU method.
3 | # Maintained by ShYy, 2018.
4 | 
5 | import scipy
6 | import scipy.signal as signal
7 | import numpy as np
8 | import os, random, sys
9 | import scipy.io.wavfile as wav
10 | import tensorflow as tf
11 | import math
12 | 
13 | 
14 | os.environ['CUDA_VISIBLE_DEVICES'] = '2'
15 | 
16 | # Strip away the _xnoise.wav part of the filename, and append _voice.wav to obtain the clean voice counterpart
17 | def formatFilename(filename):
18 |     return filename[:len(filename) - 11] + "_voice.wav"
19 | 
20 | 
21 | 
22 | def create_final_sequence(sequence, max_length):
23 |     a, b = sequence.shape
24 |     extra_len = max_length - b
25 |     null_mat = np.zeros((len(sequence), extra_len), dtype=np.float32)
26 |     sequence = np.concatenate((sequence, null_mat), axis=1)
27 |     return sequence
28 | 
29 | 
30 | def sequentialized_spectrum(batch):
31 |     # Get maximum length of batch
32 |     t = []
33 |     t_vec = []
34 |     Sxx_Vec = []
35 |     for each in batch:
36 |         _, t, Sxx_Vec_Temp = signal.stft(each, fs=rate_repository[0], nperseg=stft_size, return_onesided = False)
37 |         t_vec.append(t)
38 |         Sxx_Vec.append(Sxx_Vec_Temp)
39 |     maximum_length = findMaxlen(t_vec)
40 | 
41 |     max_run_total = int(math.ceil(float(maximum_length) / sequence_length))
42 |     final_data = np.zeros([len(batch), max_run_total, stft_size, sequence_length], dtype=np.float32)
43 |     true_time = np.zeros([len(batch), max_run_total], dtype=np.int32)
44 | 
45 |     # Read in a file and compute its spectrum
46 |     # for batch_idx, each_set in enumerate(batch):
47 |     for batch_idx, Sxx in enumerate(Sxx_Vec):
48 |         # f, t, Sxx = signal.stft(each_set, fs=rate_repository[0], nperseg=stft_size, return_onesided = False)
49 | 
50 |         # Magnitude and Phase Spectra
51 |         # Mag = Sxx.real
52 |         Mag = Sxx.imag # Train on the imaginary part of Sxx (run once with .real and once with .imag to get the two models GRUModelTest.py expects).
53 |         t = t_vec[batch_idx]
54 | 
55 |         # # TESTING
56 |         # _, outputData_ISTFT = signal.istft(Mag, fs=rate_repository[0], nperseg=stft_size,
57 |         #                                    input_onesided=False)
58 |         #
59 |         # outputData_ISTFT = ((outputData_ISTFT / norm_factor).real) / 0.75
60 |         # outputData_ISTFT = outputData_ISTFT.astype(np.int16)
61 |         #
62 |         # wav.write("0.TEST_REAL.wav", rate_repository[idx], outputData_ISTFT)
63 |         #
64 |         # _, outputData_ISTFT = signal.istft(Sxx, fs=rate_repository[0], nperseg=stft_size,
65 |         #                                    input_onesided=False)
66 |         #
67 |         # outputData_ISTFT = ((outputData_ISTFT / norm_factor).real) / 0.75
68 |         # outputData_ISTFT = outputData_ISTFT.astype(np.int16)
69 |         #
70 |         # wav.write("0.TEST_ORIG.wav", rate_repository[idx], outputData_ISTFT)
71 |         # # TESTING END
72 | 
73 |         # Break up the spectrum into sequence_length sized chunks
74 |         run_full_steps = float(len(t)) / sequence_length
75 |         run_total = int(math.ceil(run_full_steps))
76 | 
77 |         # Run a loop long enough to break up all the data in the file into chunks of sequence_length
78 |         for step in range(run_total):
79 | 
80 |             begin_point = step * sequence_length
81 |             end_point = begin_point + sequence_length
82 | 
83 |             m, n = Mag[:, begin_point:end_point].shape
84 | 
85 |             # Store each chunk sequentially in a new array, zero padding the last chunk of the file
86 |             if n == sequence_length:
87 |                 final_data[batch_idx, step, :, :] = np.copy(Mag[:, begin_point:end_point])
88 |                 true_time[batch_idx, step] = n
89 |             else:
90 |                 final_data[batch_idx, step, :, :] = np.copy(create_final_sequence(Mag[:, begin_point:end_point], sequence_length))
91 |                 true_time[batch_idx, step] = n
92 | 
93 |     final_data = np.transpose(final_data, (0, 1, 3, 2))
94 | 
95 |     return final_data, true_time, maximum_length
96 | 
97 | 
98 | def findMaxlen(data_vec):
99 |     max_ = 0
100 |     for each in data_vec:
101 |         if len(each) > max_:
102 |             max_ = len(each)
103 |     return max_
104 | 
105 | 
106 | # ----------------- Begin Vars --------------------- #
107 | 
108 | # Training data directories
109 | traindata = os.getcwd() + "/Training/NoiseAdded/"
110 | voicedata = os.getcwd() + "/Training/HumanVoices/"
111 | checkpoints = os.getcwd() + "/TF_Checkpoints/"
112 | 
113 | # NormConstant
114 | norm_factor = (1 / 32768.0)
115 | 
116 | # Spectrogram Parameters
117 | stft_size = 1024
118 | 
119 | # RNN Specs
120 | sequence_length = 100
121 | batch_size = 10
122 | learning_rate = 0.0005
123 | epochs = 250
124 | number_of_layers = 3
125 | 
126 | # Tensorflow vars + Graph and GRU Params
127 | input_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
128 | clean_data = tf.placeholder(tf.float32, [None, sequence_length, stft_size])
129 | sequence_length_tensor = tf.placeholder(tf.int32, [None])
130 | 
131 | # Temp data variables
132 | no_of_files = 0
133 | temp_list = []
134 | final_data = []
135 | sequence_length_id = 0
136 | 
137 | # Repositories
138 | file_repository = []
139 | rate_repository = []
140 | clean_repository = []
141 | 
142 | # Selected vectors
143 | files_vec = []
144 | clean_files_fin_vec = []
145 | clean_files_vec = []
146 | 
147 | # Graph
148 | gru_cells = [tf.contrib.rnn.GRUCell(stft_size, kernel_initializer = tf.zeros_initializer(dtype = tf.float32)) for _ in range(number_of_layers)] # One cell object per layer; a single shared instance would tie the layers' weights together.
149 | gru_cells = [tf.contrib.rnn.DropoutWrapper(cell, dtype = tf.float32, output_keep_prob = 0.5) for cell in gru_cells]
150 | stacked_gru = tf.contrib.rnn.MultiRNNCell(gru_cells, state_is_tuple=True)
151 | init_state = stacked_gru.zero_state(batch_size, dtype=tf.float32)
152 | rnn_outputs, final_state = tf.nn.dynamic_rnn(stacked_gru, input_data, sequence_length=sequence_length_tensor, initial_state=init_state, time_major=False)
153 | mse_loss = tf.losses.mean_squared_error(rnn_outputs, clean_data)
154 | learning_rate_tensor = tf.placeholder(tf.float32, shape=[]) # Fed on every step, so the learning rate decay below actually reaches the optimizer.
155 | # train_optimizer = tf.train.AdagradOptimizer(learning_rate_tensor).minimize(mse_loss)
156 | # train_optimizer = tf.train.AdagradDAOptimizer(learning_rate_tensor).minimize(mse_loss)
157 | train_optimizer = tf.train.AdamOptimizer(learning_rate_tensor).minimize(mse_loss)
158 | saver = tf.train.Saver()
159 | 
160 | # ------------------- Read all data to memory creating a repository of mixture and clean files --------------------- #
161 | 
162 | os.chdir(traindata)
163 | # for file_iter in range(traindata):
164 | 
165 | # Buffer training data to memory for faster execution:
166 | for root, _, files in os.walk(traindata):
167 |     files = sorted(files)
168 |     no_of_files = len(files)
169 | 
170 |     if batch_size > no_of_files:
171 |         sys.exit("Error: batch_size cannot be more than number of files in the training directory")
172 | 
173 |     for f in files:
174 |         if f.endswith(".wav"):
175 |             temp_list.append(f)
176 |             srate, data = wav.read(os.path.join(root, f))
177 |             file_repository.append(data)
178 |             rate_repository.append(srate)
179 | 
180 | # Generate a vector of file names that are clean files
181 | clean_files_vec = list(map(formatFilename, temp_list))
182 | # clean_files_vec = list(map(None, *clean_files_vec))
183 | 
184 | # Find clean files that correspond to data in file_repository and buffer clean voice data to memory
185 | for root, _, files in os.walk(voicedata):
186 |     files = sorted(files)
187 |     for each in files:
188 |         if each.endswith(".wav"):
189 |             for name in clean_files_vec:
190 |                 if each == name:
191 |                     srate2, data2 = wav.read(os.path.join(root, name))
192 |                     # In CreateNoiseAddDataset the noise-added audio is 0.75*Source + 0.25*Noise,
193 |                     # so scale the clean target by 0.75 to match.
194 |                     clean_repository.append(data2 * 0.75)
195 | 
196 | # ------------------- Step 1: Prepare data in batches and perform STFTs --------------------- #
197 | 
198 | 
199 | # files_vec = []
200 | run_epochs = int((no_of_files / batch_size) * epochs)
201 | 
202 | # Initialize TF Graph
203 | init_op = tf.global_variables_initializer()
204 | gpu_options = tf.GPUOptions(allow_growth = True) # Let the session grow GPU memory as needed.
205 | sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
206 | sess.run(init_op)
207 | 
208 | globalBatchLossSum = 0 # Sum of all batch losses
209 | # globalStepsSum = 0 # Sum of all steps
210 | lastCumulativeLossSum = 99999 # Last cumulative loss sum; start high so the first comparison passes.
211 | 
212 | for idx in range(int(run_epochs)):
213 | 
214 |     files_vec = []
215 |     # clean_files_vec = []
216 |     clean_files_fin_vec = []
217 | 
218 |     # Select batch_size random files from file_repository and the corresponding clean files
219 |     for file_iter in range(batch_size):
220 |         i = random.randint(0, len(file_repository) - 1)
221 |         files_vec.append(file_repository[i] * norm_factor)
222 |         clean_files_fin_vec.append(clean_repository[i] * norm_factor)
223 | 
224 |     stft_batch, sequence_length_id, maximum_length = sequentialized_spectrum(files_vec)
225 |     clean_voice_batch, sequence_length_id_clean, maximum_length_clean = sequentialized_spectrum(clean_files_fin_vec)
226 | 
227 |     # ------------------- Step 2: Feed Data to Placeholders, and then, Initialise, Train and Save the Graph --------------------- #
228 | 
229 |     max_time_steps = stft_batch.shape[1]
230 |     batchLossSum = 0 # Sum of batch losses in one index.
231 | 
232 |     for time_seq in range(max_time_steps):
233 |         feed_dict = {
234 |             input_data: stft_batch[:, time_seq, :, :],
235 |             clean_data: clean_voice_batch[:, time_seq, :, :],
236 |             sequence_length_tensor: sequence_length_id[:, time_seq],
237 |             learning_rate_tensor: learning_rate}
238 |         _, loss_value, final_state_value, rnn_outputs_val = sess.run([train_optimizer, mse_loss, final_state, rnn_outputs], feed_dict=feed_dict)
239 | 
240 |         # print("Index " + str(idx + 1) + " in " + str(run_epochs))
241 |         # print("\tOutput Min:\t" + str(np.min(rnn_outputs_val)))
242 |         # print("\tClean Min:\t" + str(np.min(clean_voice_batch[:, time_seq, :, :])))
243 |         # print("\tOutput Max:\t" + str(np.max(rnn_outputs_val)))
244 |         # print("\tClean Max:\t" + str(np.max(clean_voice_batch[:, time_seq, :, :])))
245 |         # print("\tBatch Loss:\t" + str(loss_value * 32768)) # Multiply by 32768 to put the batch loss back on the int16 scale.
246 |         batchLossSum = batchLossSum + loss_value
247 | 
248 |     print("Index " + str(idx + 1) + "/" + str(run_epochs) + " Batch Loss Sum:\t" + str(batchLossSum / norm_factor) + "\n")
249 | 
250 |     globalBatchLossSum = globalBatchLossSum + batchLossSum
251 |     # globalStepsSum = globalStepsSum + max_time_steps
252 | 
253 |     if (int((idx + 1) % no_of_files) == 0):
254 |         # Compare the cumulative loss sum of this round against the previous round
255 |         # cumulativLossAvg = globalBatchLossSum / globalStepsSum
256 |         cumulativeLossSum = globalBatchLossSum
257 |         print("\n\t\tCumulative epochs loss Sum in latest " + str(no_of_files) + " indexes:\t" + str(cumulativeLossSum / norm_factor))
258 |         if(cumulativeLossSum < lastCumulativeLossSum):
259 |             lastCumulativeLossSum = cumulativeLossSum # The loss is still decreasing: keep the learning rate.
260 |         else:
261 |             learning_rate = learning_rate / 5 # The loss went up: cut the learning rate to 1/5 (fed to the graph on the next step).
262 |             lastCumulativeLossSum = cumulativeLossSum
263 |             print("\n\t\tLearning Rate changed to: " + str(learning_rate))
264 |         globalBatchLossSum = 0 # Reset to 0 for the next round of batch loss accumulation
265 |         # globalStepsSum = 0
266 | 
267 |         os.chdir(checkpoints)
268 |         saver.save(sess, './ssep_model.ckpt', global_step=idx)
269 |         print("\t\tSaved checkpoint\n")
270 |         os.chdir(traindata)
271 | 
272 | os.chdir(checkpoints)
273 | saver.save(sess, './FINAL.ckpt') # Rename the result to FINAL_Real.ckpt or FINAL_Imag.ckpt to match the part you trained; GRUModelTest.py expects those two names.
274 | print("Saved FINAL")
275 | sess.close()
--------------------------------------------------------------------------------