├── README.md ├── adapt.py ├── adapt_data └── adaptation_data_loc.txt ├── adapt_speaker_list.txt ├── beamformer ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── cgmm_help.cpython-36.pyc │ ├── complexGMM_mvdr.cpython-36.pyc │ ├── complexGMM_mvdr_iterative.cpython-36.pyc │ ├── complexGMM_mvdr_snr_selective.cpython-36.pyc │ ├── delaysum.cpython-36.pyc │ ├── minimum_variance_distortioless_response.cpython-36.pyc │ ├── speaker_diarization_by_delaysum.cpython-36.pyc │ ├── util.cpython-36.pyc │ ├── vad.cpython-36.pyc │ └── weighted_prediction_error.cpython-36.pyc ├── complexGMM_mvdr.py ├── complexGMM_mvdr_snr_selective.py ├── delaysum.py └── util.py ├── dataset ├── adaptation_data │ ├── speaker1 │ │ ├── 251-136532-0000.flac │ │ ├── 251-136532-0001.flac │ │ ├── 251-136532-0002.flac │ │ ├── 251-136532-0003.flac │ │ └── 251-136532-0004.flac │ ├── speaker1_2 │ │ └── 251-137823-0023.flac │ └── speaker2 │ │ ├── 2412-153954-0002.flac │ │ ├── 2412-153954-0003.flac │ │ ├── 2412-153954-0004.flac │ │ ├── 2412-153954-0005.flac │ │ ├── 2412-153954-0006.flac │ │ └── 2412-153954-0007.flac ├── data_for_beamforming │ ├── F02_011C021A_BUS.CH1.wav │ ├── F02_011C021A_BUS.CH2.wav │ ├── F02_011C021A_BUS.CH3.wav │ ├── F02_011C021A_BUS.CH4.wav │ ├── F02_011C021A_BUS.CH5.wav │ └── F02_011C021A_BUS.CH6.wav ├── train │ ├── noise │ │ ├── 447o0301_0.32331_445c020s_-0.32331_12.244375.wav │ │ ├── 447o0301_0.56098_22ha010i_-0.56098_12.603875.wav │ │ ├── 447o0301_0.69862_050o020g_-0.69862_12.1389375.wav │ │ ├── 447o0301_0.9755_423o0308_-0.9755_13.431875.wav │ │ ├── 447o0302_1.3388_22ho010i_-1.3388_12.69025.wav │ │ ├── 447o0302_2.1067_422o030k_-2.1067_11.834.wav │ │ ├── 447o030x_0.98832_441o0308_-0.98832_16.4556875.wav │ │ ├── 447o030x_1.4783_422o030p_-1.4783_16.124125.wav │ │ └── 447o030x_1.6276_440o0304_-1.6276_14.556125.wav │ └── speech │ │ ├── 652-130737-0001.flac │ │ ├── 652-130737-0002.flac │ │ ├── 652-130737-0003.flac │ │ ├── 652-130737-0004.flac │ │ ├── 652-130737-0005.flac │ │ ├── 652-130737-0006.flac │ │ ├── 777-126732-0002.flac │ │ ├── 777-126732-0003.flac │ │ ├── 777-126732-0004.flac │ │ └── 777-126732-0005.flac └── validate │ ├── noise │ ├── 447o030q_2.4332_440o0309_-2.4332_12.56975.wav │ ├── 447o030r_0.25387_442c020t_-0.25387_14.1650625.wav │ ├── 447o030r_1.6517_422o0312_-1.6517_14.6394375.wav │ ├── 447o030t_1.3876_442o0305_-1.3876_11.87325.wav │ └── 447o030u_1.9508_051c0109_-1.9508_16.297875.wav │ └── speech │ ├── 174-84280-0001.flac │ ├── 174-84280-0002.flac │ ├── 174-84280-0003.flac │ ├── 174-84280-0004.flac │ ├── 174-84280-0005.flac │ ├── 84-121123-0001.flac │ ├── 84-121123-0002.flac │ ├── 84-121123-0003.flac │ ├── 84-121123-0004.flac │ ├── 84-121123-0005.flac │ └── 84-121123-0006.flac ├── generate_validate_data.py ├── image ├── model.png ├── sample_mask.png └── sample_mask_multi.png ├── maskestimator ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── adapt_model.cpython-36.pyc │ ├── augment.cpython-36.pyc │ ├── feature.cpython-36.pyc │ ├── model.cpython-36.pyc │ ├── shaper.cpython-36.pyc │ └── util.cpython-36.pyc ├── adapt_model.py ├── augment.py ├── feature.py ├── model.py ├── shaper.py └── util.py ├── model ├── 194sequence_false_e1.hdf5.data-00000-of-00001 ├── 194sequence_false_e1.hdf5.index └── checkpoint ├── non_adapt_speaker_list.txt ├── predict.py ├── predict_single.py ├── result ├── enhacement.wav ├── enhacement_all_channels.wav ├── enhacement_snr_select.wav ├── speech_clean.wav └── speech_noisy.wav ├── sp1_list.txt ├── sp2_list.txt ├── speaker_aware_mask_predict.py 
├── tflog └── tflog_loc.txt ├── train.py └── validation_features └── val_data.txt /README.md: -------------------------------------------------------------------------------- 1 | # Neural-mask-estimation 2 | 3 | # Key features 4 | 5 | - LSTM-based neural mask estimation for designing an MVDR beamformer [1, 4] 6 | - On-the-fly data augmentation 7 | - Pre-trained model 8 | - Speaker-aware mask training supported [3] 9 | - SNR-based reference mic selection for MVDR [1, 4] 10 | - Small-scale sample training data 11 | - You can run the experiments with any data by replacing the sample data 12 | - WHAM! noise data [2], LibriSpeech and LJ Speech are included as sample noise and clean-speech data. 13 | 14 | 15 | # How to use 16 | 17 | 1. Please run generate_validate_data.py 18 | - Please put the data (noise and clean speech) in ./dataset/validate/* 19 | - You will get validation_features/speech_mask.npy, validation_features/noise_mask.npy and validation_features/val_spec.npy 20 | 21 | 2. Please run train.py 22 | - Please put the data (noise and clean speech) in ./dataset/train/* 23 | - You will get model/neaural_mask_estimator{}.hdf5 24 | - {} indicates the epoch number 25 | 26 | 3. Please run predict.py 27 | - Performs mask estimation, designs the MVDR beamformer, and outputs enhanced speech 28 | - Please put the multi-channel data in ./dataset/data_for_beamforming/* for beamforming 29 | - You will get the results in ./result/* 30 | - enhacement_all_channels.wav is the result without channel selection 31 | - enhacement_snr_select.wav is the result with channel selection 32 | 33 | 34 | # Speaker-aware mask estimation 35 | 36 | 1. Please run adapt.py 37 | - Please prepare a target-speaker list and a non-target-speaker list (e.g., sp1_list.txt, sp2_list.txt) 38 | - You will get the speaker-aware model ./model/speaker_2.hdf5 39 | 40 | 2.
Please run speaker_aware_mask_predict.py 41 | - You can compare the estimated masks before/after adaptation 42 | 43 | # References: 44 | 45 | [1] Exploring Practical Aspects of Neural Mask-Based Beamforming for Far-Field Speech Recognition 46 | - https://www.microsoft.com/en-us/research/uploads/prod/2018/04/ICASSP2018-Christoph.pdf 47 | 48 | 49 | [2] WHAM!: Extending Speech Separation to Noisy Environments 50 | - https://arxiv.org/abs/1907.01160 51 | 52 | [3] The Hitachi/JHU CHiME-5 system: Advances in speech recognition for everyday home environments using multiple microphone arrays 53 | - http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_kanda.pdf 54 | 55 | 56 | [4] Improved MVDR beamforming using single-channel mask prediction networks 57 | - https://www.merl.com/publications/docs/TR2016-072.pdf 58 | 59 | ![sample_mask](https://user-images.githubusercontent.com/41845296/62979654-5b090880-be5f-11e9-8eb3-08afc616e279.png) 60 | ![sample_mask_multi](https://user-images.githubusercontent.com/41845296/62979655-5b090880-be5f-11e9-9fde-028cc82d4f33.png) 61 | ![model](https://user-images.githubusercontent.com/41845296/62979656-5b090880-be5f-11e9-9e4f-fa4e4be17560.png) 62 | 63 | # Requirements: 64 | python 3.6.7+ 65 | 66 | numpy 1.14.3 67 | soundfile 0.9.0 68 | pyroomacoustics 0.1.21 69 | librosa 0.6.2 70 | tensorflow 1.9.0 71 | scipy 1.2.0 72 | cython 0.25.2 73 | matplotlib 3.6.7 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /adapt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 26 19:29:57 2019 4 | 5 | @author: a-kojima 6 | """ 7 | 8 | import os 9 | import shutil 10 | 11 | from maskestimator import model, shaper, adapt_model 12 | 13 | #========================================== 14 | # ANALYSIS PARAMETERS 15 | #========================================== 16 | SAMPLING_FREQUENCY = 16000 17 | FFTL = 1024 18 | SHIFT = 256 19 | 20 | #========================================== 21 | # NEURAL MASK ESTIMATOR PARAMETERS 22 | #========================================== 23 | LEFT_CONTEXT = 0 24 | RIGHT_CONTEXT = 0 25 | NUMBER_OF_SKIP_FRAME = 0 26 | TRUNCATE_GRAD = 7 27 | 28 | #========================================== 29 | # NEURAL MASK ESTIMATOR TRAINING PARAMETERS 30 | #========================================== 31 | WEIGHT_PATH = r'./model/194sequence_false_e1.hdf5' 32 | ADAPT_LR = 0.001 33 | 34 | TARGET_SPEAKER_LIST = './sp2_list.txt' 35 | NON_TARGET_SPEAKER_LIST = './sp1_list.txt' 36 | SAVE_MODEL_NAME = r'./model/speaker_2.hdf5' 37 | ADAPT_LOC = r'./adapt_data' # place to output numpy features for adaptation 38 | RECURRENT_INIT_CELL = 0.00001 39 | 40 | 41 | 42 | NUMBER_OF_STACK = LEFT_CONTEXT + RIGHT_CONTEXT + 1 43 | 44 | #========================================== 45 | # get model 46 | #========================================== 47 | mask_estimator_generator = model.NeuralMaskEstimation(TRUNCATE_GRAD, 48 | NUMBER_OF_STACK, 49 | ADAPT_LR, 50 | FFTL // 2 + 1, 51 | recurrent_init=RECURRENT_INIT_CELL) 52 | 53 | mask_estimator = mask_estimator_generator.get_model(is_stateful=True, 54 | is_show_detail=True, 55 | is_adapt=False) 56 | 57 | mask_estimator = mask_estimator_generator.load_weight_param(mask_estimator, WEIGHT_PATH) 58 | 59 | #========================================== 60 | # predicting data shaper 61 | #========================================== 62 | data_shaper = shaper.Shape_data(LEFT_CONTEXT, 63 | RIGHT_CONTEXT, 64 |
TRUNCATE_GRAD, 65 | NUMBER_OF_SKIP_FRAME ) 66 | 67 | #========================================== 68 | # adaptation 69 | #========================================== 70 | model_adapter = adapt_model.adapt_model(WEIGHT_PATH, 71 | TRUNCATE_GRAD, 72 | NUMBER_OF_STACK, 73 | ADAPT_LR, 74 | spec_dim=FFTL // 2 + 1, 75 | sampling_frequency=SAMPLING_FREQUENCY, 76 | fftl=FFTL, 77 | shift=SHIFT, 78 | left_context=LEFT_CONTEXT, 79 | right_contect=RIGHT_CONTEXT, 80 | number_of_skip_frame=NUMBER_OF_SKIP_FRAME, 81 | adapt_data_location=ADAPT_LOC) 82 | 83 | #========================================== 84 | # create data for adaptation 85 | #========================================== 86 | if os.path.exists(ADAPT_LOC): 87 | shutil.rmtree(ADAPT_LOC) 88 | os.makedirs(ADAPT_LOC) 89 | 90 | # target speaker 91 | model_adapter.create_data_for_adaptation(True, TARGET_SPEAKER_LIST) 92 | # non target speaker 93 | model_adapter.create_data_for_adaptation(False, NON_TARGET_SPEAKER_LIST) 94 | 95 | #========================================== 96 | # adaptation 97 | #========================================== 98 | model_adapter.save_adapt_model(SAVE_MODEL_NAME) 99 | -------------------------------------------------------------------------------- /adapt_data/adaptation_data_loc.txt: -------------------------------------------------------------------------------- 1 | adaptation -------------------------------------------------------------------------------- /adapt_speaker_list.txt: -------------------------------------------------------------------------------- 1 | ./dataset/speech_speaker1/JA009_1.wav 2 | ./dataset/speech_speaker1/JA009_2.wav 3 | ./dataset/speech_speaker1/JA009_3.wav 4 | ./dataset/speech_speaker1/JA009_4.wav 5 | ./dataset/speech_speaker1/JA009_5.wav 6 | ./dataset/speech_speaker1/JA009_6.wav 7 | ./dataset/speech_speaker1/JA009_7.wav 8 | ./dataset/speech_speaker1/JA009_8.wav 9 | ./dataset/speech_speaker1/JA009_9.wav -------------------------------------------------------------------------------- /beamformer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 16 15:12:35 2019 4 | 5 | @author: a-kojima 6 | """ 7 | from . import util 8 | from . import delaysum 9 | from . 
import complexGMM_mvdr 10 | 11 | -------------------------------------------------------------------------------- /beamformer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/cgmm_help.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/cgmm_help.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/complexGMM_mvdr.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/complexGMM_mvdr.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/complexGMM_mvdr_iterative.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/complexGMM_mvdr_iterative.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/complexGMM_mvdr_snr_selective.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/complexGMM_mvdr_snr_selective.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/delaysum.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/delaysum.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/minimum_variance_distortioless_response.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/minimum_variance_distortioless_response.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/speaker_diarization_by_delaysum.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/speaker_diarization_by_delaysum.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/util.cpython-36.pyc 
-------------------------------------------------------------------------------- /beamformer/__pycache__/vad.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/vad.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/__pycache__/weighted_prediction_error.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/beamformer/__pycache__/weighted_prediction_error.cpython-36.pyc -------------------------------------------------------------------------------- /beamformer/complexGMM_mvdr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 11 11:40:29 2019 4 | 5 | @author: a-kojima 6 | """ 7 | import numpy as np 8 | import copy 9 | import matplotlib.pyplot as pl 10 | from . import util 11 | 12 | class complexGMM_mvdr: 13 | 14 | def __init__(self, 15 | sampling_frequency, 16 | fft_length, 17 | fft_shift, 18 | number_of_EM_iterate, 19 | min_segment_dur, 20 | condition_number_inv_threshold=10**(-6), 21 | scm_inv_threshold=10**(-10), 22 | beamformer_inv_threshold=10**(-6)): 23 | self.sampling_frequency=sampling_frequency 24 | self.fft_length=fft_length 25 | self.fft_shift=fft_shift 26 | self.number_of_EM_iterate=number_of_EM_iterate 27 | self.min_segment_dur=min_segment_dur 28 | self.condition_number_inv_threshold=condition_number_inv_threshold 29 | self.scm_inv_threshold=scm_inv_threshold 30 | self.beamformer_inv_threshold=beamformer_inv_threshold 31 | 32 | def get_spatial_correlation_matrix(self, speech_data): 33 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 34 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 35 | 36 | # CGMM parameters 37 | lambda_noise = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 38 | lambda_noisy = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 39 | phi_noise = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 40 | phi_noisy = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 41 | R_noise = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 42 | R_noisy = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 43 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 44 | 45 | # init R_noisy and R_noise 46 | for f in range(0, number_of_bins): 47 | for t in range(0, number_of_frames): 48 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 49 | yyh[:, :, t, f] = h 50 | R_noisy[:, :, f] = R_noisy[:, :, f] + h 51 | R_noisy[:, :, f] = R_noisy[:, :, f] / number_of_frames 52 | R_noise[:, :, f] = np.eye(number_of_channels, number_of_channels, dtype=np.complex64) 53 | R_xn = copy.deepcopy(R_noisy) 54 | p_noise = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 55 | p_noisy = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 56 | 57 | # go EMiteration 58 | for ite in range(0, self.number_of_EM_iterate): 59 | print('iter', str(ite + 1) + '/' + 
str(self.number_of_EM_iterate)) 60 | for f in range(0, number_of_bins): 61 | R_noisy_onbin = copy.deepcopy(R_noisy[:, :, f]) 62 | R_noise_onbin = copy.deepcopy(R_noise[:, :, f]) 63 | if np.linalg.cond(R_noisy_onbin) < self.condition_number_inv_threshold: 64 | R_noisy_onbin = R_noisy_onbin + self.condition_number_inv_threshold * np.eye(number_of_channels) * np.max(np.diag(R_noisy_onbin)) 65 | if np.linalg.cond(R_noise_onbin) < self.condition_number_inv_threshold: 66 | R_noise_onbin = R_noise_onbin + self.condition_number_inv_threshold * np.eye(number_of_channels) * np.max(np.diag(R_noise_onbin)) 67 | 68 | R_noisy_inv = np.linalg.pinv(R_noisy_onbin, rcond=self.scm_inv_threshold) 69 | R_noise_inv = np.linalg.pinv(R_noise_onbin, rcond=self.scm_inv_threshold) 70 | R_noisy_accu = np.zeros((number_of_channels, number_of_channels), dtype=np.complex64) 71 | R_noise_accu = np.zeros((number_of_channels, number_of_channels), dtype=np.complex64) 72 | 73 | for t in range(0, number_of_frames): 74 | corre = yyh[:, :, t, f] 75 | obs = complex_spectrum[:, t, f] 76 | 77 | # update phi (real) 78 | phi_noise[t, f] = np.real(np.trace(np.matmul(corre, R_noise_inv), dtype=np.float64) / number_of_channels) 79 | phi_noisy[t, f] = np.real(np.trace(np.matmul(corre, R_noisy_inv), dtype=np.float64) / number_of_channels) 80 | if phi_noise[t, f] == 0: 81 | phi_noise[t, f] = self.condition_number_inv_threshold 82 | if phi_noisy[t, f] == 0: 83 | phi_noisy[t, f] = self.condition_number_inv_threshold 84 | 85 | # update p (real) 86 | k_noise_1 = np.matmul(np.conj(obs).T , R_noise_inv / phi_noise[t, f]) 87 | k_noise = np.matmul(k_noise_1, obs) 88 | tmp_p_noise = np.linalg.det((phi_noise[t, f] * R_noise_onbin).astype(np.float64)) 89 | p_noise[t, f] = np.real(np.exp( - np.real(k_noise).astype(np.float64)) / (np.pi * tmp_p_noise)) 90 | # avoid nan or inf 91 | if np.isnan(p_noise[t, f]) == True or np.isinf(p_noise[t, f]) == True: 92 | p_noise[t, f] = np.nan_to_num(p_noise[t, f]) 93 | k_noisy_1 = np.matmul(np.conj(obs).T, R_noisy_inv / phi_noisy[t, f]) 94 | k_noisy = np.real(np.matmul(k_noisy_1, obs)) 95 | tmp_p_noisy = np.linalg.det((phi_noisy[t, f] * R_noisy_onbin).astype(np.float64)) 96 | p_noisy[t, f] = np.real(np.exp( - np.real(k_noisy).astype(np.float64)) / (np.pi * tmp_p_noisy)) 97 | # avoid nan or inf 98 | if np.isnan(p_noisy[t, f]) == True or np.isinf(p_noisy[t, f]) == True: 99 | p_noisy[t, f] = np.nan_to_num(p_noisy[t, f]) 100 | 101 | # update lambda 102 | lambda_noise[t, f] = p_noise[t, f] / (p_noise[t, f] + p_noisy[t, f]) 103 | lambda_noisy[t, f] = p_noisy[t, f] / (p_noise[t, f] + p_noisy[t, f]) 104 | 105 | # update R 106 | R_noise_accu = R_noise_accu + lambda_noise[t, f] / phi_noise[t, f] * corre 107 | R_noisy_accu = R_noisy_accu + lambda_noisy[t, f] / phi_noisy[t, f] * corre 108 | 109 | # update R 110 | R_noise[:, :, f] = R_noise_accu / np.sum(lambda_noise[:, f], dtype=np.complex64) 111 | R_noisy[:, :, f] = R_noisy_accu / np.sum(lambda_noisy[:, f], dtype=np.complex64) 112 | 113 | # detect noise cluster by entropy 114 | for f in range(0, number_of_bins): 115 | eig_value1 = np.linalg.eigvals(R_noise[:, :, f]) 116 | eig_value2 = np.linalg.eigvals(R_noisy[:, :, f]) 117 | en_noise = np.matmul( - eig_value1.T / np.sum(eig_value1), np.log(eig_value1 / np.sum(eig_value1))) 118 | en_noisy = np.matmul( - eig_value2.T / np.sum(eig_value2), np.log(eig_value2 / np.sum(eig_value2))) 119 | if en_noise < en_noisy: 120 | Rn = copy.deepcopy(R_noise[:, :, f]) 121 | R_noise[:, :, f] = R_noisy[:, :, f] 122 | R_noisy[:, :, f] = Rn 123 | 
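        # (descriptive comment added for clarity)
        # The block below turns the CGMM posteriors into spatial correlation matrices: per
        # frequency bin f,
        #     R_n(f) = sum_t lambda_noise(t, f) * y(t, f) y(t, f)^H / sum_t lambda_noise(t, f)
        # and the speech SCM is then obtained by subtraction, R_x = R_xn - R_n, where R_xn is
        # the SCM of the unprocessed observation accumulated during initialisation above.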
124 | R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 125 | for f in range(0, number_of_bins): 126 | for t in range(0, number_of_frames): 127 | R_n[:, :, f] = R_n[:, :, f] + lambda_noise[t, f] * yyh[:, :, t, f] 128 | R_n[:, :, f] = R_n[:, :, f] / np.sum(lambda_noise[:, f], dtype=np.complex64) 129 | R_x = R_xn - R_n 130 | return (complex_spectrum, R_x, R_n, lambda_noise, lambda_noisy) 131 | 132 | def get_spatial_correlation_matrix_from_mask(self, speech_data, speech_mask, noise_mask=np.array([None])): 133 | if noise_mask.any() == None: 134 | print('make_noise_mask') 135 | noise_mask = (1 - speech_mask)+0.01 136 | else: 137 | noise_mask = noise_mask.T 138 | print(np.shape(speech_mask), np.shape(noise_mask)) 139 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 140 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 141 | # safe guard for difference size between speakerbeam's mask and complex spectrum 142 | _, number_of_frames_on_speakerbeam_mask = np.shape(noise_mask) 143 | if number_of_frames != number_of_frames_on_speakerbeam_mask: 144 | maximum_number_of_frames = np.min([number_of_frames, number_of_frames_on_speakerbeam_mask]) 145 | complex_spectrum = complex_spectrum[:, 0:maximum_number_of_frames, :] 146 | speech_mask = speech_mask[:, 0:maximum_number_of_frames] 147 | noise_mask = noise_mask[:, 0:maximum_number_of_frames] 148 | number_of_frames = maximum_number_of_frames 149 | print(maximum_number_of_frames) 150 | noise_mask = noise_mask.T 151 | speech_mask = speech_mask.T 152 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 153 | R_xn = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 154 | # init R_noisy and R_noise 155 | for f in range(0, number_of_bins): 156 | for t in range(0, number_of_frames): 157 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 158 | yyh[:, :, t, f] = h 159 | R_xn[:, :, f] = R_xn[:, :, f] + h 160 | R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 161 | for f in range(0, number_of_bins): 162 | for t in range(0, number_of_frames): 163 | R_n[:, :, f] = R_n[:, :, f] + noise_mask[t, f] * yyh[:, :, t, f] 164 | R_n[:, :, f] = R_n[:, :, f] / np.sum(noise_mask[:, f], dtype=np.complex64) 165 | R_x = R_xn - R_n 166 | return (complex_spectrum, R_x, R_n, noise_mask, speech_mask) 167 | 168 | def get_spatial_correlation_matrix_from_mask_for_LSTM(self, speech_data, speech_mask, noise_mask=np.array([None]), less_frame=10): 169 | """ 170 | if noise_mask.any() == None: 171 | print('make_noise_mask') 172 | noise_mask = (1 - speech_mask)+0.01 173 | else: 174 | noise_mask = noise_mask.T 175 | """ 176 | #print(np.shape(speech_mask), np.shape(noise_mask)) 177 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 178 | tmp_complex_spectrum = copy.deepcopy(complex_spectrum) 179 | # safe guard for difference size between speakerbeam's mask and complex spectrum 180 | 181 | # ad-hock selection 5/14 182 | complex_spectrum = complex_spectrum[:, less_frame:-(less_frame + 1), :] 183 | #speech_mask = speech_mask[:, less_frame:-(less_frame + 1)] 184 | #noise_mask = noise_mask[:, less_frame:-(less_frame + 1)] 185 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 186 | _, 
number_of_frames_on_speakerbeam_mask = np.shape(noise_mask) 187 | 188 | if number_of_frames != number_of_frames_on_speakerbeam_mask: 189 | maximum_number_of_frames = np.min([number_of_frames, number_of_frames_on_speakerbeam_mask]) 190 | complex_spectrum = complex_spectrum[:, 0:maximum_number_of_frames, :] 191 | speech_mask = speech_mask[:, 0:maximum_number_of_frames] 192 | noise_mask = noise_mask[:, 0:maximum_number_of_frames] 193 | number_of_frames = maximum_number_of_frames 194 | noise_mask = np.fliplr(noise_mask.T) 195 | speech_mask = np.fliplr(speech_mask.T) 196 | """ 197 | pl.figure() 198 | 199 | pl.imshow(noise_mask, aspect='auto') 200 | pl.title('n_mask_median') 201 | pl.figure() 202 | pl.imshow(speech_mask, aspect='auto') 203 | pl.title('s_mask_median') 204 | pl.show() 205 | """ 206 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 207 | R_xn = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 208 | # init R_noisy and R_noise 209 | for f in range(0, number_of_bins): 210 | for t in range(0, number_of_frames): 211 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 212 | yyh[:, :, t, f] = h 213 | R_xn[:, :, f] = R_xn[:, :, f] + h 214 | R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 215 | for f in range(0, number_of_bins): 216 | for t in range(0, number_of_frames): 217 | R_n[:, :, f] = R_n[:, :, f] + noise_mask[t, f] * yyh[:, :, t, f] 218 | R_n[:, :, f] = R_n[:, :, f] / np.sum(noise_mask[:, f], dtype=np.complex64) 219 | R_x = R_xn - R_n 220 | return (tmp_complex_spectrum, R_x, R_n, noise_mask, speech_mask) 221 | 222 | 223 | def get_mvdr_beamformer(self, R_x, R_n): 224 | number_of_channels, _, number_of_bins = np.shape(R_x) 225 | beamformer = np.ones((number_of_channels, number_of_bins), dtype=np.complex64) 226 | steering_vector_save = np.ones((number_of_channels, number_of_bins), dtype=np.complex64) 227 | for f in range(0, number_of_bins): 228 | _, eigen_vector = np.linalg.eig(R_x[:, :, f]) 229 | steering_vector = eigen_vector[:, 0] 230 | steering_vector_save[:, f] = eigen_vector[:, 0] 231 | Rn_inv = np.linalg.pinv(R_n[:, :, f], rcond=self.beamformer_inv_threshold) 232 | w1 = np.matmul(Rn_inv, steering_vector) 233 | w2 = np.matmul(np.conjugate(steering_vector).T, Rn_inv) 234 | w2 = np.matmul(w2, steering_vector) 235 | w2 = np.reshape(w2, [1, 1]) 236 | w = w1 / w2 237 | w = np.reshape(w, number_of_channels) 238 | beamformer[:, f] = w 239 | return (beamformer, steering_vector_save) 240 | 241 | def get_mvdr_beamformer_onehot(self, R_x, R_n, onehot): 242 | number_of_channels, _, number_of_bins = np.shape(R_x) 243 | beamformer = np.ones((number_of_channels, number_of_bins), dtype=np.complex64) 244 | for f in range(0, number_of_bins): 245 | #_, eigen_vector = np.linalg.eig(R_x[:, :, f]) 246 | #steering_vector = eigen_vector[:, 0] 247 | Rn_inv = np.linalg.pinv(R_n[:, :, f], rcond=self.beamformer_inv_threshold) 248 | w1 = np.matmul(Rn_inv, R_x[:, :, f]) 249 | w1 = np.matmul(w1, onehot) 250 | w2 = np.trace(np.matmul(Rn_inv, R_x[:, :, f])) 251 | w2 = np.reshape(w2, [1, 1]) 252 | w = w1 / w2 253 | w = np.reshape(w, number_of_channels) 254 | beamformer[:, f] = w 255 | return (beamformer, np.ones(10)) 256 | 257 | def get_mvdr_beamformer_onehot2(self, R_x, R_n,onehot): 258 | number_of_channels, _, number_of_bins = np.shape(R_x) 259 | beamformer = np.ones((number_of_channels, number_of_bins), dtype=np.complex64) 260 | 
for f in range(0, number_of_bins): 261 | #_, eigen_vector = np.linalg.eig(R_x[:, :, f]) 262 | #steering_vector = eigen_vector[:, 0] 263 | steering_vector = onehot 264 | Rn_inv = np.linalg.pinv(R_n[:, :, f], rcond=self.beamformer_inv_threshold) 265 | w1 = np.matmul(Rn_inv, steering_vector) 266 | w2 = np.matmul(np.conjugate(steering_vector).T, Rn_inv) 267 | w2 = np.matmul(w2, steering_vector) 268 | w2 = np.reshape(w2, [1, 1]) 269 | w = w1 / w2 270 | w = np.reshape(w, number_of_channels) 271 | beamformer[:, f] = w 272 | return (beamformer, steering_vector) 273 | 274 | def get_mvdr_beamformer_without_onehot(self, R_x, R_n): 275 | number_of_channels, _, number_of_bins = np.shape(R_x) 276 | beamformer = np.ones((number_of_channels, number_of_bins), dtype=np.complex64) 277 | for f in range(0, number_of_bins): 278 | #_, eigen_vector = np.linalg.eig(R_x[:, :, f]) 279 | #steering_vector = eigen_vector[:, 0] 280 | #steering_vector = onehot 281 | Rn_inv = np.linalg.pinv(R_n[:, :, f], rcond=self.beamformer_inv_threshold) 282 | w1 = np.matmul(Rn_inv, R_x[:, :, f]) 283 | w1 = np.matmul(w1, np.ones(number_of_channels) / number_of_channels) 284 | #w2 = np.matmul(np.conjugate(steering_vector).T, Rn_inv) 285 | w2 = np.trace(np.matmul(Rn_inv, R_x[:, :, f])) 286 | w2 = np.reshape(w2, [1, 1]) 287 | w = w1 / w2 288 | w = np.reshape(w, number_of_channels) 289 | beamformer[:, f] = w 290 | return (beamformer, np.ones(10)) 291 | 292 | 293 | def apply_beamformer(self, beamformer, complex_spectrum): 294 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 295 | enhanced_spectrum = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 296 | for f in range(0, number_of_bins): 297 | enhanced_spectrum[:, f] = np.matmul(np.conjugate(beamformer[:, f]).T, complex_spectrum[:, :, f]) 298 | return util.spec2wav(enhanced_spectrum, self.sampling_frequency, self.fft_length, self.fft_length, self.fft_shift) 299 | -------------------------------------------------------------------------------- /beamformer/complexGMM_mvdr_snr_selective.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 11 11:40:29 2019 4 | 5 | @author: a-kojima 6 | """ 7 | import numpy as np 8 | import copy 9 | import matplotlib.pyplot as pl 10 | from . 
import util 11 | 12 | class complexGMM_mvdr: 13 | 14 | def __init__(self, 15 | sampling_frequency, 16 | fft_length, 17 | fft_shift, 18 | number_of_EM_iterate, 19 | min_segment_dur, 20 | condition_number_inv_threshold=10**(-6), 21 | scm_inv_threshold=10**(-10), 22 | beamformer_inv_threshold=10**(-6)): 23 | self.sampling_frequency=sampling_frequency 24 | self.fft_length=fft_length 25 | self.fft_shift=fft_shift 26 | self.number_of_EM_iterate=number_of_EM_iterate 27 | self.min_segment_dur=min_segment_dur 28 | self.condition_number_inv_threshold=condition_number_inv_threshold 29 | self.scm_inv_threshold=scm_inv_threshold 30 | self.beamformer_inv_threshold=beamformer_inv_threshold 31 | 32 | def get_spatial_correlation_matrix(self, speech_data): 33 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 34 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 35 | 36 | # CGMM parameters 37 | lambda_noise = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 38 | lambda_noisy = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 39 | phi_noise = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 40 | phi_noisy = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 41 | R_noise = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 42 | R_noisy = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 43 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 44 | 45 | # init R_noisy and R_noise 46 | for f in range(0, number_of_bins): 47 | for t in range(0, number_of_frames): 48 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 49 | yyh[:, :, t, f] = h 50 | R_noisy[:, :, f] = R_noisy[:, :, f] + h 51 | R_noisy[:, :, f] = R_noisy[:, :, f] / number_of_frames 52 | R_noise[:, :, f] = np.eye(number_of_channels, number_of_channels, dtype=np.complex64) 53 | R_xn = copy.deepcopy(R_noisy) 54 | p_noise = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 55 | p_noisy = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 56 | 57 | # go EMiteration 58 | for ite in range(0, self.number_of_EM_iterate): 59 | print('iter', str(ite + 1) + '/' + str(self.number_of_EM_iterate)) 60 | for f in range(0, number_of_bins): 61 | R_noisy_onbin = copy.deepcopy(R_noisy[:, :, f]) 62 | R_noise_onbin = copy.deepcopy(R_noise[:, :, f]) 63 | if np.linalg.cond(R_noisy_onbin) < self.condition_number_inv_threshold: 64 | R_noisy_onbin = R_noisy_onbin + self.condition_number_inv_threshold * np.eye(number_of_channels) * np.max(np.diag(R_noisy_onbin)) 65 | if np.linalg.cond(R_noise_onbin) < self.condition_number_inv_threshold: 66 | R_noise_onbin = R_noise_onbin + self.condition_number_inv_threshold * np.eye(number_of_channels) * np.max(np.diag(R_noise_onbin)) 67 | 68 | R_noisy_inv = np.linalg.pinv(R_noisy_onbin, rcond=self.scm_inv_threshold) 69 | R_noise_inv = np.linalg.pinv(R_noise_onbin, rcond=self.scm_inv_threshold) 70 | R_noisy_accu = np.zeros((number_of_channels, number_of_channels), dtype=np.complex64) 71 | R_noise_accu = np.zeros((number_of_channels, number_of_channels), dtype=np.complex64) 72 | 73 | for t in range(0, number_of_frames): 74 | corre = yyh[:, :, t, f] 75 | obs = complex_spectrum[:, t, f] 76 | 77 | # update phi (real) 78 | phi_noise[t, f] = np.real(np.trace(np.matmul(corre, R_noise_inv), dtype=np.float64) 
/ number_of_channels) 79 | phi_noisy[t, f] = np.real(np.trace(np.matmul(corre, R_noisy_inv), dtype=np.float64) / number_of_channels) 80 | if phi_noise[t, f] == 0: 81 | phi_noise[t, f] = self.condition_number_inv_threshold 82 | if phi_noisy[t, f] == 0: 83 | phi_noisy[t, f] = self.condition_number_inv_threshold 84 | 85 | # update p (real) 86 | k_noise_1 = np.matmul(np.conj(obs).T , R_noise_inv / phi_noise[t, f]) 87 | k_noise = np.matmul(k_noise_1, obs) 88 | tmp_p_noise = np.linalg.det((phi_noise[t, f] * R_noise_onbin).astype(np.float64)) 89 | p_noise[t, f] = np.real(np.exp( - np.real(k_noise).astype(np.float64)) / (np.pi * tmp_p_noise)) 90 | # avoid nan or inf 91 | if np.isnan(p_noise[t, f]) == True or np.isinf(p_noise[t, f]) == True: 92 | p_noise[t, f] = np.nan_to_num(p_noise[t, f]) 93 | k_noisy_1 = np.matmul(np.conj(obs).T, R_noisy_inv / phi_noisy[t, f]) 94 | k_noisy = np.real(np.matmul(k_noisy_1, obs)) 95 | tmp_p_noisy = np.linalg.det((phi_noisy[t, f] * R_noisy_onbin).astype(np.float64)) 96 | p_noisy[t, f] = np.real(np.exp( - np.real(k_noisy).astype(np.float64)) / (np.pi * tmp_p_noisy)) 97 | # avoid nan or inf 98 | if np.isnan(p_noisy[t, f]) == True or np.isinf(p_noisy[t, f]) == True: 99 | p_noisy[t, f] = np.nan_to_num(p_noisy[t, f]) 100 | 101 | # update lambda 102 | lambda_noise[t, f] = p_noise[t, f] / (p_noise[t, f] + p_noisy[t, f]) 103 | lambda_noisy[t, f] = p_noisy[t, f] / (p_noise[t, f] + p_noisy[t, f]) 104 | 105 | # update R 106 | R_noise_accu = R_noise_accu + lambda_noise[t, f] / phi_noise[t, f] * corre 107 | R_noisy_accu = R_noisy_accu + lambda_noisy[t, f] / phi_noisy[t, f] * corre 108 | 109 | # update R 110 | R_noise[:, :, f] = R_noise_accu / np.sum(lambda_noise[:, f], dtype=np.complex64) 111 | R_noisy[:, :, f] = R_noisy_accu / np.sum(lambda_noisy[:, f], dtype=np.complex64) 112 | 113 | # detect noise cluster by entropy 114 | for f in range(0, number_of_bins): 115 | eig_value1 = np.linalg.eigvals(R_noise[:, :, f]) 116 | eig_value2 = np.linalg.eigvals(R_noisy[:, :, f]) 117 | en_noise = np.matmul( - eig_value1.T / np.sum(eig_value1), np.log(eig_value1 / np.sum(eig_value1))) 118 | en_noisy = np.matmul( - eig_value2.T / np.sum(eig_value2), np.log(eig_value2 / np.sum(eig_value2))) 119 | if en_noise < en_noisy: 120 | Rn = copy.deepcopy(R_noise[:, :, f]) 121 | R_noise[:, :, f] = R_noisy[:, :, f] 122 | R_noisy[:, :, f] = Rn 123 | 124 | R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 125 | for f in range(0, number_of_bins): 126 | for t in range(0, number_of_frames): 127 | R_n[:, :, f] = R_n[:, :, f] + lambda_noise[t, f] * yyh[:, :, t, f] 128 | R_n[:, :, f] = R_n[:, :, f] / np.sum(lambda_noise[:, f], dtype=np.complex64) 129 | R_x = R_xn - R_n 130 | return (complex_spectrum, R_x, R_n, lambda_noise, lambda_noisy) 131 | 132 | 133 | def get_spatial_correlation_matrix_ver2(self, speech_data): 134 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 135 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 136 | 137 | # CGMM parameters 138 | lambda_noise = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 139 | lambda_noisy = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 140 | phi_noise = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 141 | phi_noisy = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 142 | R_noise = np.zeros((number_of_channels, number_of_channels, number_of_bins), 
dtype=np.complex64) 143 | R_noisy = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 144 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 145 | 146 | # init R_noisy and R_noise 147 | for f in range(0, number_of_bins): 148 | for t in range(0, number_of_frames): 149 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 150 | yyh[:, :, t, f] = h 151 | R_noisy[:, :, f] = R_noisy[:, :, f] + h 152 | R_noisy[:, :, f] = R_noisy[:, :, f] / number_of_frames 153 | R_noise[:, :, f] = np.eye(number_of_channels, number_of_channels, dtype=np.complex64) 154 | R_xn = copy.deepcopy(R_noisy) 155 | p_noise = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 156 | p_noisy = np.ones((number_of_frames, number_of_bins), dtype=np.float64) 157 | 158 | # go EMiteration 159 | for ite in range(0, self.number_of_EM_iterate): 160 | print('iter', str(ite + 1) + '/' + str(self.number_of_EM_iterate)) 161 | for f in range(0, number_of_bins): 162 | R_noisy_onbin = copy.deepcopy(R_noisy[:, :, f]) 163 | R_noise_onbin = copy.deepcopy(R_noise[:, :, f]) 164 | if np.linalg.cond(R_noisy_onbin) < self.condition_number_inv_threshold: 165 | R_noisy_onbin = R_noisy_onbin + self.condition_number_inv_threshold * np.eye(number_of_channels) * np.max(np.diag(R_noisy_onbin)) 166 | if np.linalg.cond(R_noise_onbin) < self.condition_number_inv_threshold: 167 | R_noise_onbin = R_noise_onbin + self.condition_number_inv_threshold * np.eye(number_of_channels) * np.max(np.diag(R_noise_onbin)) 168 | 169 | R_noisy_inv = np.linalg.pinv(R_noisy_onbin, rcond=self.scm_inv_threshold) 170 | R_noise_inv = np.linalg.pinv(R_noise_onbin, rcond=self.scm_inv_threshold) 171 | R_noisy_accu = np.zeros((number_of_channels, number_of_channels), dtype=np.complex64) 172 | R_noise_accu = np.zeros((number_of_channels, number_of_channels), dtype=np.complex64) 173 | 174 | for t in range(0, number_of_frames): 175 | corre = yyh[:, :, t, f] 176 | obs = complex_spectrum[:, t, f] 177 | 178 | # update phi (real) 179 | phi_noise[t, f] = np.real(np.trace(np.matmul(corre, R_noise_inv), dtype=np.float64) / number_of_channels) 180 | phi_noisy[t, f] = np.real(np.trace(np.matmul(corre, R_noisy_inv), dtype=np.float64) / number_of_channels) 181 | if phi_noise[t, f] == 0: 182 | phi_noise[t, f] = self.condition_number_inv_threshold 183 | if phi_noisy[t, f] == 0: 184 | phi_noisy[t, f] = self.condition_number_inv_threshold 185 | 186 | # update p (real) 187 | k_noise_1 = np.matmul(np.conj(obs).T , R_noise_inv / phi_noise[t, f]) 188 | k_noise = np.matmul(k_noise_1, obs) 189 | tmp_p_noise = np.linalg.det((phi_noise[t, f] * R_noise_onbin).astype(np.float64)) 190 | p_noise[t, f] = np.real(np.exp( - np.real(k_noise).astype(np.float64)) / (np.pi * tmp_p_noise)) 191 | # avoid nan or inf 192 | if np.isnan(p_noise[t, f]) == True or np.isinf(p_noise[t, f]) == True: 193 | p_noise[t, f] = np.nan_to_num(p_noise[t, f]) 194 | k_noisy_1 = np.matmul(np.conj(obs).T, R_noisy_inv / phi_noisy[t, f]) 195 | k_noisy = np.real(np.matmul(k_noisy_1, obs)) 196 | tmp_p_noisy = np.linalg.det((phi_noisy[t, f] * R_noisy_onbin).astype(np.float64)) 197 | p_noisy[t, f] = np.real(np.exp( - np.real(k_noisy).astype(np.float64)) / (np.pi * tmp_p_noisy)) 198 | # avoid nan or inf 199 | if np.isnan(p_noisy[t, f]) == True or np.isinf(p_noisy[t, f]) == True: 200 | p_noisy[t, f] = np.nan_to_num(p_noisy[t, f]) 201 | 202 | # update lambda 203 | lambda_noise[t, f] = p_noise[t, f] / (p_noise[t, 
f] + p_noisy[t, f]) 204 | lambda_noisy[t, f] = p_noisy[t, f] / (p_noise[t, f] + p_noisy[t, f]) 205 | 206 | # update R 207 | R_noise_accu = R_noise_accu + lambda_noise[t, f] / phi_noise[t, f] * corre 208 | R_noisy_accu = R_noisy_accu + lambda_noisy[t, f] / phi_noisy[t, f] * corre 209 | 210 | # update R 211 | R_noise[:, :, f] = R_noise_accu / np.sum(lambda_noise[:, f], dtype=np.complex64) 212 | R_noisy[:, :, f] = R_noisy_accu / np.sum(lambda_noisy[:, f], dtype=np.complex64) 213 | 214 | # detect noise cluster by entropy 215 | for f in range(0, number_of_bins): 216 | eig_value1 = np.linalg.eigvals(R_noise[:, :, f]) 217 | eig_value2 = np.linalg.eigvals(R_noisy[:, :, f]) 218 | en_noise = np.matmul( - eig_value1.T / np.sum(eig_value1), np.log(eig_value1 / np.sum(eig_value1))) 219 | en_noisy = np.matmul( - eig_value2.T / np.sum(eig_value2), np.log(eig_value2 / np.sum(eig_value2))) 220 | if en_noise < en_noisy: 221 | Rn = copy.deepcopy(R_noise[:, :, f]) 222 | R_noise[:, :, f] = R_noisy[:, :, f] 223 | R_noisy[:, :, f] = Rn 224 | 225 | R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 226 | R_x = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 227 | for f in range(0, number_of_bins): 228 | for t in range(0, number_of_frames): 229 | R_n[:, :, f] = R_n[:, :, f] + lambda_noise[t, f] * yyh[:, :, t, f] 230 | R_n[:, :, f] = R_n[:, :, f] / np.sum(lambda_noise[:, f], dtype=np.complex64) 231 | 232 | for f in range(0, number_of_bins): 233 | for t in range(0, number_of_frames): 234 | R_x[:, :, f] = R_x[:, :, f] + lambda_noisy[t, f] * yyh[:, :, t, f] 235 | R_x[:, :, f] = R_x[:, :, f] / np.sum(lambda_noisy[:, f], dtype=np.complex64) 236 | 237 | #R_x = R_xn - R_n 238 | return (complex_spectrum, R_x, R_n, lambda_noise, lambda_noisy) 239 | 240 | 241 | def get_spatial_correlation_matrix_from_mask(self, speech_data, speech_mask, noise_mask=np.array([None])): 242 | if noise_mask.any() == None: 243 | print('make_noise_mask') 244 | noise_mask = (1 - speech_mask)+0.01 245 | else: 246 | noise_mask = noise_mask.T 247 | print(np.shape(speech_mask), np.shape(noise_mask)) 248 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 249 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 250 | # safe guard for difference size between speakerbeam's mask and complex spectrum 251 | _, number_of_frames_on_speakerbeam_mask = np.shape(noise_mask) 252 | if number_of_frames != number_of_frames_on_speakerbeam_mask: 253 | maximum_number_of_frames = np.min([number_of_frames, number_of_frames_on_speakerbeam_mask]) 254 | complex_spectrum = complex_spectrum[:, 0:maximum_number_of_frames, :] 255 | speech_mask = speech_mask[:, 0:maximum_number_of_frames] 256 | noise_mask = noise_mask[:, 0:maximum_number_of_frames] 257 | number_of_frames = maximum_number_of_frames 258 | print(maximum_number_of_frames) 259 | noise_mask = noise_mask.T 260 | speech_mask = speech_mask.T 261 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 262 | R_xn = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 263 | # init R_noisy and R_noise 264 | for f in range(0, number_of_bins): 265 | for t in range(0, number_of_frames): 266 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 267 | yyh[:, :, t, f] = h 268 | R_xn[:, :, f] = R_xn[:, :, f] + h 269 | R_n = 
np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 270 | for f in range(0, number_of_bins): 271 | for t in range(0, number_of_frames): 272 | R_n[:, :, f] = R_n[:, :, f] + noise_mask[t, f] * yyh[:, :, t, f] 273 | R_n[:, :, f] = R_n[:, :, f] / np.sum(noise_mask[:, f], dtype=np.complex64) 274 | R_x = R_xn - R_n 275 | return (complex_spectrum, R_x, R_n, noise_mask, speech_mask) 276 | 277 | def get_spatial_correlation_matrix_from_mask_for_LSTM(self, speech_data, speech_mask, noise_mask=np.array([None]), less_frame=3): 278 | """ 279 | if noise_mask.any() == None: 280 | print('make_noise_mask') 281 | noise_mask = (1 - speech_mask)+0.01 282 | else: 283 | noise_mask = noise_mask.T 284 | """ 285 | #print(np.shape(speech_mask), np.shape(noise_mask)) 286 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 287 | tmp_complex_spectrum = copy.deepcopy(complex_spectrum) 288 | # safe guard for difference size between speakerbeam's mask and complex spectrum 289 | 290 | # ad-hock selection 5/14 291 | complex_spectrum = complex_spectrum[:, less_frame:-(less_frame + 1), :] 292 | #speech_mask = speech_mask[:, less_frame:-(less_frame + 1)] 293 | #noise_mask = noise_mask[:, less_frame:-(less_frame + 1)] 294 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 295 | _, number_of_frames_on_speakerbeam_mask = np.shape(noise_mask) 296 | 297 | if number_of_frames != number_of_frames_on_speakerbeam_mask: 298 | maximum_number_of_frames = np.min([number_of_frames, number_of_frames_on_speakerbeam_mask]) 299 | complex_spectrum = complex_spectrum[:, 0:maximum_number_of_frames, :] 300 | speech_mask = speech_mask[:, 0:maximum_number_of_frames] 301 | noise_mask = noise_mask[:, 0:maximum_number_of_frames] 302 | number_of_frames = maximum_number_of_frames 303 | noise_mask = np.fliplr(noise_mask.T) 304 | speech_mask = np.fliplr(speech_mask.T) 305 | """ 306 | pl.figure() 307 | 308 | pl.imshow(noise_mask, aspect='auto') 309 | pl.title('n_mask_median') 310 | pl.figure() 311 | pl.imshow(speech_mask, aspect='auto') 312 | pl.title('s_mask_median') 313 | pl.show() 314 | """ 315 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 316 | R_xn = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 317 | # init R_noisy and R_noise 318 | for f in range(0, number_of_bins): 319 | for t in range(0, number_of_frames): 320 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 321 | yyh[:, :, t, f] = h 322 | R_xn[:, :, f] = R_xn[:, :, f] + h 323 | R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 324 | for f in range(0, number_of_bins): 325 | for t in range(0, number_of_frames): 326 | R_n[:, :, f] = R_n[:, :, f] + noise_mask[t, f] * yyh[:, :, t, f] 327 | R_n[:, :, f] = R_n[:, :, f] / np.sum(noise_mask[:, f], dtype=np.complex64) 328 | R_x = R_xn - R_n 329 | return (tmp_complex_spectrum, R_x, R_n, noise_mask, speech_mask) 330 | 331 | 332 | def get_spatial_correlation_matrix_from_mask_for_LSTM_ver2(self, speech_data, speech_mask, noise_mask=np.array([None]), less_frame=10): 333 | """ 334 | if noise_mask.any() == None: 335 | print('make_noise_mask') 336 | noise_mask = (1 - speech_mask)+0.01 337 | else: 338 | noise_mask = noise_mask.T 339 | 340 | return noise/speech SCM respectivily 341 | """ 342 | #print(np.shape(speech_mask), np.shape(noise_mask)) 
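        # (descriptive comment added for clarity)
        # Unlike get_spatial_correlation_matrix_from_mask_for_LSTM above, this "ver2" method
        # builds both SCMs directly from the masks instead of subtracting R_n from R_xn:
        #     R_n(f) = sum_t m_noise(t, f)  * y(t, f) y(t, f)^H / sum_t m_noise(t, f)
        #     R_x(f) = sum_t m_speech(t, f) * y(t, f) y(t, f)^H / sum_t m_speech(t, f)
        # `less_frame` frames are trimmed from the start (and less_frame + 1 from the end) of
        # the spectrogram before accumulation, while the untrimmed spectrum is what is returned.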
343 | complex_spectrum, _ = util.get_3dim_spectrum_from_data(speech_data, self.fft_length, self.fft_shift, self.fft_length) 344 | tmp_complex_spectrum = copy.deepcopy(complex_spectrum) 345 | # safe guard for difference size between speakerbeam's mask and complex spectrum 346 | 347 | # ad-hock selection 5/14 348 | complex_spectrum = complex_spectrum[:, less_frame:-(less_frame + 1), :] 349 | #speech_mask = speech_mask[:, less_frame:-(less_frame + 1)] 350 | #noise_mask = noise_mask[:, less_frame:-(less_frame + 1)] 351 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 352 | _, number_of_frames_on_speakerbeam_mask = np.shape(noise_mask) 353 | 354 | if number_of_frames != number_of_frames_on_speakerbeam_mask: 355 | maximum_number_of_frames = np.min([number_of_frames, number_of_frames_on_speakerbeam_mask]) 356 | complex_spectrum = complex_spectrum[:, 0:maximum_number_of_frames, :] 357 | speech_mask = speech_mask[:, 0:maximum_number_of_frames] 358 | noise_mask = noise_mask[:, 0:maximum_number_of_frames] 359 | number_of_frames = maximum_number_of_frames 360 | noise_mask = np.fliplr(noise_mask.T) 361 | speech_mask = np.fliplr(speech_mask.T) 362 | """ 363 | pl.figure() 364 | 365 | pl.imshow(noise_mask, aspect='auto') 366 | pl.title('n_mask_median') 367 | pl.figure() 368 | pl.imshow(speech_mask, aspect='auto') 369 | pl.title('s_mask_median') 370 | pl.show() 371 | """ 372 | yyh = np.zeros((number_of_channels, number_of_channels, number_of_frames, number_of_bins), dtype=np.complex64) 373 | R_xn = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 374 | # init R_noisy and R_noise 375 | for f in range(0, number_of_bins): 376 | for t in range(0, number_of_frames): 377 | h = np.multiply.outer(complex_spectrum[:, t, f], np.conj(complex_spectrum[:, t, f]).T) 378 | yyh[:, :, t, f] = h 379 | R_xn[:, :, f] = R_xn[:, :, f] + h 380 | R_n = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 381 | R_x = np.zeros((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 382 | 383 | for f in range(0, number_of_bins): 384 | for t in range(0, number_of_frames): 385 | R_n[:, :, f] = R_n[:, :, f] + noise_mask[t, f] * yyh[:, :, t, f] 386 | R_n[:, :, f] = R_n[:, :, f] / np.sum(noise_mask[:, f], dtype=np.complex64) 387 | 388 | for f in range(0, number_of_bins): 389 | for t in range(0, number_of_frames): 390 | R_x[:, :, f] = R_x[:, :, f] + speech_mask[t, f] * yyh[:, :, t, f] 391 | R_x[:, :, f] = R_x[:, :, f] / np.sum(speech_mask[:, f], dtype=np.complex64) 392 | 393 | #R_x = R_x - R_n 394 | return (tmp_complex_spectrum, R_x, R_n, noise_mask, speech_mask) 395 | 396 | 397 | def get_mvdr_beamformer(self, R_x, R_n): 398 | number_of_channels, _, number_of_bins = np.shape(R_x) 399 | beamformer = np.ones((number_of_channels, number_of_bins), dtype=np.complex64) 400 | for f in range(0, number_of_bins): 401 | _, eigen_vector = np.linalg.eig(R_x[:, :, f]) 402 | steering_vector = eigen_vector[:, 0] 403 | Rn_inv = np.linalg.pinv(R_n[:, :, f], rcond=self.beamformer_inv_threshold) 404 | w1 = np.matmul(Rn_inv, steering_vector) 405 | w2 = np.matmul(np.conjugate(steering_vector).T, Rn_inv) 406 | w2 = np.matmul(w2, steering_vector) 407 | w2 = np.reshape(w2, [1, 1]) 408 | w = w1 / w2 409 | w = np.reshape(w, number_of_channels) 410 | beamformer[:, f] = w 411 | return (beamformer, steering_vector) 412 | 413 | def get_mvdr_beamformer_by_maxsnr(self, R_x, R_n): 414 | ''' 415 | Improved MVDR beamforming using single-channel 
mask 416 | prediction networks [Erdogan, 2016] 417 | ''' 418 | 419 | number_of_channels, _, number_of_bins = np.shape(R_x) 420 | # beamformer >> (selectablebeam, number_of_channels, number_of_bins) 421 | beamformer = np.ones((number_of_channels, number_of_channels, number_of_bins), dtype=np.complex64) 422 | # all channles beamformer 423 | selected_SNR = np.zeros(number_of_channels, dtype=np.float32) 424 | for c in range(0, number_of_channels): 425 | r = np.zeros(number_of_channels, dtype=np.complex64) 426 | r[c] = 1 427 | for f in range(0, number_of_bins): 428 | Rn_inv = np.linalg.pinv(R_n[:, :, f], rcond=self.beamformer_inv_threshold) 429 | w1_1 = np.matmul(Rn_inv, R_x[:, :, f]) 430 | w1 = np.matmul(w1_1, r) 431 | # normalize factor 432 | w2 = np.trace(w1_1) 433 | w2 = np.reshape(w2, [1, 1]) 434 | w = w1 / w2 435 | w = np.reshape(w, number_of_channels) 436 | beamformer[c, :, f] = w 437 | w1_sum = 0 438 | w2_sum = 0 439 | for f2 in range(0, number_of_bins): 440 | snr_post_w1 = np.matmul(np.conjugate(beamformer[c, :, f2]).T, R_x[:, :, f2]) 441 | snr_post_w1 = np.matmul(snr_post_w1, beamformer[c, :, f2]) 442 | snr_post_w2 = np.matmul(np.conjugate(beamformer[c, :, f2]).T, R_n[:, :, f2]) 443 | snr_post_w2 = np.matmul(snr_post_w2, beamformer[c, :, f2]) 444 | w1_sum = w1_sum + snr_post_w1 445 | w2_sum = w2_sum + snr_post_w2 446 | selected_SNR[c] = np.float32(w1_sum) / np.float32(w2_sum) 447 | print('snr', selected_SNR) 448 | max_index = np.argmax(selected_SNR) 449 | return beamformer[max_index, :, :] 450 | 451 | def apply_beamformer(self, beamformer, complex_spectrum): 452 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 453 | enhanced_spectrum = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 454 | for f in range(0, number_of_bins): 455 | enhanced_spectrum[:, f] = np.matmul(np.conjugate(beamformer[:, f]).T, complex_spectrum[:, :, f]) 456 | return util.spec2wav(enhanced_spectrum, self.sampling_frequency, self.fft_length, self.fft_length, self.fft_shift) 457 | -------------------------------------------------------------------------------- /beamformer/delaysum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | from . 
import util 4 | 5 | class delaysum: 6 | 7 | def __init__(self, 8 | mic_angle_vector, 9 | mic_diameter, 10 | sound_speed=343, 11 | sampling_frequency=16000, 12 | fft_length=1024, 13 | fft_shift=512): 14 | self.mic_angle_vector=mic_angle_vector 15 | self.mic_diameter=mic_diameter 16 | self.sound_speed=sound_speed 17 | self.sampling_frequency=sampling_frequency 18 | self.fft_length=fft_length 19 | self.fft_shift=fft_shift 20 | 21 | def get_sterring_vector(self, look_direction): 22 | number_of_mic = len(self.mic_angle_vector) 23 | frequency_vector = np.linspace(0, self.sampling_frequency, self.fft_length) 24 | steering_vector = np.ones((len(frequency_vector), number_of_mic), dtype=np.complex64) 25 | for f, frequency in enumerate(frequency_vector): 26 | for m, mic_angle in enumerate(self.mic_angle_vector): 27 | steering_vector[f, m] = np.complex(np.exp(( - 1j) * ((2 * np.pi * frequency) / self.sound_speed) \ 28 | * (self.mic_diameter / 2) \ 29 | * np.cos(np.deg2rad(look_direction) - np.deg2rad(mic_angle)))) 30 | steering_vector = np.conjugate(steering_vector).T 31 | normalize_steering_vector = self.normalize(steering_vector) 32 | return normalize_steering_vector[:, 0:np.int(self.fft_length / 2) + 1] 33 | 34 | def normalize(self, steering_vector): 35 | for ii in range(0, self.fft_length): 36 | weight = np.matmul(np.conjugate(steering_vector[:, ii]).T, steering_vector[:, ii]) 37 | steering_vector[:, ii] = (steering_vector[:, ii] / weight) 38 | return steering_vector 39 | 40 | def apply_beamformer(self, beamformer, complex_spectrum): 41 | number_of_channels, number_of_frames, number_of_bins = np.shape(complex_spectrum) 42 | enhanced_spectrum = np.zeros((number_of_frames, number_of_bins), dtype=np.complex64) 43 | for f in range(0, number_of_bins): 44 | enhanced_spectrum[:, f] = np.matmul(np.conjugate(beamformer[:, f]).T, complex_spectrum[:, :, f]) 45 | return util.spec2wav(enhanced_spectrum, self.sampling_frequency, self.fft_length, self.fft_length, self.fft_shift) -------------------------------------------------------------------------------- /beamformer/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Nov 26 10:09:47 2018 4 | 5 | @author: a-kojima 6 | 7 | """ 8 | import numpy as np 9 | import soundfile as sf 10 | from scipy.fftpack import fft, ifft 11 | import numpy.matlib as npm 12 | from scipy import signal as sg 13 | 14 | 15 | def stab(mat, theta, num_channels): 16 | d = np.power(np.array(10, dtype=np.complex64) , np.arange( - num_channels, 0, dtype=np.float)) 17 | result_mat = mat 18 | for i in range(1, num_channels + 1): 19 | if np.linalg.cond(mat) > theta: 20 | return result_mat 21 | result_mat = result_mat + d[i - 1] * np.eye(num_channels, dtype=np.complex64) 22 | return result_mat 23 | 24 | def get_3dim_spectrum(wav_name, channel_vec, start_point, stop_point, frame, shift, fftl): 25 | """ 26 | dump_wav : channel_size * speech_size (2dim) 27 | """ 28 | samples, _ = sf.read(wav_name.replace('{}', str(channel_vec[0])), start=start_point, stop=stop_point, dtype='float32') 29 | if len(samples) == 0: 30 | return None,None 31 | dump_wav = np.zeros((len(channel_vec), len(samples)), dtype=np.float16) 32 | dump_wav[0, :] = samples.T 33 | for ii in range(0,len(channel_vec) - 1): 34 | samples,_ = sf.read(wav_name.replace('{}', str(channel_vec[ii +1 ])), start=start_point, stop=stop_point, dtype='float32') 35 | dump_wav[ii + 1, :] = samples.T 36 | 37 | dump_wav = dump_wav / np.max(np.abs(dump_wav)) * 0.7 
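    # (descriptive comment added for clarity)
    # At this point the multichannel signal has been peak-normalised to 0.7. The loop below
    # frames it with hop `shift`, takes an FFT of length `fftl` per frame and per channel, and
    # keeps only the first fftl/2 + 1 bins, yielding a (channels, frames, fftl/2 + 1) spectrogram.
    # Note that the Hann window prepared next is not applied to the analysis frames here;
    # windowing happens at synthesis time in spec2wav().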
38 | window = sg.hanning(fftl + 1, 'periodic')[: - 1] 39 | multi_window = npm.repmat(window, len(channel_vec), 1) 40 | st = 0 41 | ed = frame 42 | number_of_frame = np.int((len(samples) - frame) / shift) 43 | spectrums = np.zeros((len(channel_vec), number_of_frame, np.int(fftl / 2) + 1), dtype=np.complex64) 44 | for ii in range(0, number_of_frame): 45 | multi_signal_spectrum = fft(dump_wav[:, st:ed], n=fftl, axis=1)[:, 0:np.int(fftl / 2) + 1] # channel * number_of_bin 46 | spectrums[:, ii, :] = multi_signal_spectrum 47 | st = st + shift 48 | ed = ed + shift 49 | return spectrums, len(samples) 50 | 51 | def get_3dim_spectrum_from_data(wav_data, frame, shift, fftl): 52 | """ 53 | dump_wav : channel_size * speech_size (2dim) 54 | """ 55 | len_sample, len_channel_vec = np.shape(wav_data) 56 | dump_wav = wav_data.T 57 | dump_wav = dump_wav / np.max(np.abs(dump_wav)) * 0.7 58 | window = sg.hanning(fftl + 1, 'periodic')[: - 1] 59 | multi_window = npm.repmat(window, len_channel_vec, 1) 60 | st = 0 61 | ed = frame 62 | number_of_frame = np.int((len_sample - frame) / shift) 63 | spectrums = np.zeros((len_channel_vec, number_of_frame, np.int(fftl / 2) + 1), dtype=np.complex64) 64 | for ii in range(0, number_of_frame): 65 | multi_signal_spectrum = fft(dump_wav[:, st:ed], n=fftl, axis=1)[:, 0:np.int(fftl / 2) + 1] # channel * number_of_bin 66 | spectrums[:, ii, :] = multi_signal_spectrum 67 | st = st + shift 68 | ed = ed + shift 69 | return spectrums, len_sample 70 | 71 | def my_det(matrix_): 72 | sign, lodget = np.linalg.slogdet(matrix_) 73 | return np.exp(lodget) 74 | 75 | def spec2wav(spectrogram, sampling_frequency, fftl, frame_len, shift_len): 76 | n_of_frame, fft_half = np.shape(spectrogram) 77 | hanning = sg.hanning(fftl + 1, 'periodic')[: - 1] 78 | cut_data = np.zeros(fftl, dtype=np.complex64) 79 | result = np.zeros(sampling_frequency * 60 * 5, dtype=np.float32) 80 | start_point = 0 81 | end_point = start_point + frame_len 82 | for ii in range(0, n_of_frame): 83 | half_spec = spectrogram[ii, :] 84 | cut_data[0:np.int(fftl / 2) + 1] = half_spec.T 85 | cut_data[np.int(fftl / 2) + 1:] = np.flip(np.conjugate(half_spec[1:np.int(fftl / 2)]), axis=0) 86 | cut_data2 = np.real(ifft(cut_data, n=fftl)) 87 | result[start_point:end_point] = result[start_point:end_point] + np.real(cut_data2 * hanning.T) 88 | start_point = start_point + shift_len 89 | end_point = end_point + shift_len 90 | return result[0:end_point - shift_len] 91 | 92 | def multispec2wav(multi_spectrogram, beamformer, fftl, shift, multi_window, true_dur): 93 | channel, number_of_frame, fft_size = np.shape(multi_spectrogram) 94 | cut_data = np.zeros((channel, fftl), dtype=np.complex64) 95 | result = np.zeros((channel, true_dur), dtype=np.float32) 96 | start_p = 0 97 | end_p = start_p + fftl 98 | for ii in range(0, number_of_frame): 99 | cut_spec = multi_spectrogram[:, ii, :] * beamformer 100 | cut_data[:, 0:fft_size] = cut_spec 101 | cut_data[:, fft_size:] = np.transpose(np.flip(cut_spec[:, 1:fft_size - 1], axis=1).T) 102 | cut_data2 = np.real(ifft(cut_data, n=fftl, axis=1)) 103 | result[:, start_p:end_p] = result[:, start_p:end_p] + (cut_data2 * multi_window) 104 | start_p = start_p + shift 105 | end_p = end_p + shift 106 | return np.sum(result[:,0:end_p - shift], axis=0) 107 | 108 | 109 | def check_beamformer(freq_beamformer,theta_cov): 110 | freq_beamformer = np.real(freq_beamformer) 111 | if len(freq_beamformer[freq_beamformer>=theta_cov])!=0: 112 | return np.ones(np.shape(freq_beamformer),dtype=np.complex64) * (1+1j) 113 | return 
freq_beamformer 114 | 115 | 116 | -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker1/251-136532-0000.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker1/251-136532-0000.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker1/251-136532-0001.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker1/251-136532-0001.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker1/251-136532-0002.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker1/251-136532-0002.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker1/251-136532-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker1/251-136532-0003.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker1/251-136532-0004.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker1/251-136532-0004.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker1_2/251-137823-0023.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker1_2/251-137823-0023.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker2/2412-153954-0002.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker2/2412-153954-0002.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker2/2412-153954-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker2/2412-153954-0003.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker2/2412-153954-0004.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker2/2412-153954-0004.flac -------------------------------------------------------------------------------- 
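The STFT helpers defined above in beamformer/util.py (get_3dim_spectrum and spec2wav) can be sanity-checked on the six-channel sample shipped in dataset/data_for_beamforming. The following is a minimal sketch, not part of the repository; the output file name resynthesis_ch0.wav is hypothetical.

import soundfile as sf
from beamformer import util

FFTL = 1024
SHIFT = 256
FS = 16000
wav_template = './dataset/data_for_beamforming/F02_011C021A_BUS.CH{}.wav'

# complex spectrograms, shape (channels, frames, fftl/2 + 1), for all six channels
spectrums, n_samples = util.get_3dim_spectrum(wav_template, [1, 2, 3, 4, 5, 6],
                                              0, None, FFTL, SHIFT, FFTL)

# overlap-add resynthesis of the first channel only, to check the analysis/synthesis pair
resynthesis = util.spec2wav(spectrums[0, :, :], FS, FFTL, FFTL, SHIFT)
sf.write('./result/resynthesis_ch0.wav', resynthesis, FS)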
/dataset/adaptation_data/speaker2/2412-153954-0005.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker2/2412-153954-0005.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker2/2412-153954-0006.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker2/2412-153954-0006.flac -------------------------------------------------------------------------------- /dataset/adaptation_data/speaker2/2412-153954-0007.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/adaptation_data/speaker2/2412-153954-0007.flac -------------------------------------------------------------------------------- /dataset/data_for_beamforming/F02_011C021A_BUS.CH1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/data_for_beamforming/F02_011C021A_BUS.CH1.wav -------------------------------------------------------------------------------- /dataset/data_for_beamforming/F02_011C021A_BUS.CH2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/data_for_beamforming/F02_011C021A_BUS.CH2.wav -------------------------------------------------------------------------------- /dataset/data_for_beamforming/F02_011C021A_BUS.CH3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/data_for_beamforming/F02_011C021A_BUS.CH3.wav -------------------------------------------------------------------------------- /dataset/data_for_beamforming/F02_011C021A_BUS.CH4.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/data_for_beamforming/F02_011C021A_BUS.CH4.wav -------------------------------------------------------------------------------- /dataset/data_for_beamforming/F02_011C021A_BUS.CH5.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/data_for_beamforming/F02_011C021A_BUS.CH5.wav -------------------------------------------------------------------------------- /dataset/data_for_beamforming/F02_011C021A_BUS.CH6.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/data_for_beamforming/F02_011C021A_BUS.CH6.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o0301_0.32331_445c020s_-0.32331_12.244375.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o0301_0.32331_445c020s_-0.32331_12.244375.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o0301_0.56098_22ha010i_-0.56098_12.603875.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o0301_0.56098_22ha010i_-0.56098_12.603875.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o0301_0.69862_050o020g_-0.69862_12.1389375.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o0301_0.69862_050o020g_-0.69862_12.1389375.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o0301_0.9755_423o0308_-0.9755_13.431875.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o0301_0.9755_423o0308_-0.9755_13.431875.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o0302_1.3388_22ho010i_-1.3388_12.69025.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o0302_1.3388_22ho010i_-1.3388_12.69025.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o0302_2.1067_422o030k_-2.1067_11.834.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o0302_2.1067_422o030k_-2.1067_11.834.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o030x_0.98832_441o0308_-0.98832_16.4556875.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o030x_0.98832_441o0308_-0.98832_16.4556875.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o030x_1.4783_422o030p_-1.4783_16.124125.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o030x_1.4783_422o030p_-1.4783_16.124125.wav -------------------------------------------------------------------------------- /dataset/train/noise/447o030x_1.6276_440o0304_-1.6276_14.556125.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/noise/447o030x_1.6276_440o0304_-1.6276_14.556125.wav 
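A note on the noise file names listed above: the last underscore-separated field encodes the clip duration in seconds, and Generate_random_noise.get_noise() in maskestimator/augment.py (further below) parses exactly this field to decide whether and where a noise segment can be cut. A minimal sketch of that parse, assuming the WHAM!-style naming shown here:

import os

name = '447o030x_1.6276_440o0304_-1.6276_14.556125.wav'
duration_sec = float(os.path.basename(name).split('_')[-1].split('.wav')[0])  # 14.556125
# augment.py additionally truncates this value to whole seconds and multiplies by the sampling rate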
-------------------------------------------------------------------------------- /dataset/train/speech/652-130737-0001.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/652-130737-0001.flac -------------------------------------------------------------------------------- /dataset/train/speech/652-130737-0002.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/652-130737-0002.flac -------------------------------------------------------------------------------- /dataset/train/speech/652-130737-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/652-130737-0003.flac -------------------------------------------------------------------------------- /dataset/train/speech/652-130737-0004.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/652-130737-0004.flac -------------------------------------------------------------------------------- /dataset/train/speech/652-130737-0005.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/652-130737-0005.flac -------------------------------------------------------------------------------- /dataset/train/speech/652-130737-0006.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/652-130737-0006.flac -------------------------------------------------------------------------------- /dataset/train/speech/777-126732-0002.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/777-126732-0002.flac -------------------------------------------------------------------------------- /dataset/train/speech/777-126732-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/777-126732-0003.flac -------------------------------------------------------------------------------- /dataset/train/speech/777-126732-0004.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/777-126732-0004.flac -------------------------------------------------------------------------------- /dataset/train/speech/777-126732-0005.flac: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/train/speech/777-126732-0005.flac -------------------------------------------------------------------------------- /dataset/validate/noise/447o030q_2.4332_440o0309_-2.4332_12.56975.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/noise/447o030q_2.4332_440o0309_-2.4332_12.56975.wav -------------------------------------------------------------------------------- /dataset/validate/noise/447o030r_0.25387_442c020t_-0.25387_14.1650625.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/noise/447o030r_0.25387_442c020t_-0.25387_14.1650625.wav -------------------------------------------------------------------------------- /dataset/validate/noise/447o030r_1.6517_422o0312_-1.6517_14.6394375.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/noise/447o030r_1.6517_422o0312_-1.6517_14.6394375.wav -------------------------------------------------------------------------------- /dataset/validate/noise/447o030t_1.3876_442o0305_-1.3876_11.87325.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/noise/447o030t_1.3876_442o0305_-1.3876_11.87325.wav -------------------------------------------------------------------------------- /dataset/validate/noise/447o030u_1.9508_051c0109_-1.9508_16.297875.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/noise/447o030u_1.9508_051c0109_-1.9508_16.297875.wav -------------------------------------------------------------------------------- /dataset/validate/speech/174-84280-0001.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/174-84280-0001.flac -------------------------------------------------------------------------------- /dataset/validate/speech/174-84280-0002.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/174-84280-0002.flac -------------------------------------------------------------------------------- /dataset/validate/speech/174-84280-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/174-84280-0003.flac -------------------------------------------------------------------------------- /dataset/validate/speech/174-84280-0004.flac: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/174-84280-0004.flac -------------------------------------------------------------------------------- /dataset/validate/speech/174-84280-0005.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/174-84280-0005.flac -------------------------------------------------------------------------------- /dataset/validate/speech/84-121123-0001.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/84-121123-0001.flac -------------------------------------------------------------------------------- /dataset/validate/speech/84-121123-0002.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/84-121123-0002.flac -------------------------------------------------------------------------------- /dataset/validate/speech/84-121123-0003.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/84-121123-0003.flac -------------------------------------------------------------------------------- /dataset/validate/speech/84-121123-0004.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/84-121123-0004.flac -------------------------------------------------------------------------------- /dataset/validate/speech/84-121123-0005.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/84-121123-0005.flac -------------------------------------------------------------------------------- /dataset/validate/speech/84-121123-0006.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/dataset/validate/speech/84-121123-0006.flac -------------------------------------------------------------------------------- /generate_validate_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 24 19:29:31 2019 4 | 5 | @author: a-kojima 6 | 7 | Neural mask estimation for MVDR 8 | 9 | this script suppots on-the-fly training for data-augmentation efficiently 10 | 11 | """ 12 | import numpy as np 13 | import glob 14 | from scipy import stats 15 | import random 16 | import soundfile as sf 17 | import matplotlib.pyplot as pl 18 | import sys 19 | 20 | from maskestimator import shaper, feature, augment 21 | 22 | #========================================== 23 | # ANALYSIS PARAMETERS 24 | #========================================== 25 | SAMPLING_FREQUENCY = 16000 26 | FFTL = 1024 27 | SHIFT = 256 28 | 29 | 
#========================================== 30 | # ESURAL MASL ESTIMATOR PARAMETERS 31 | #========================================== 32 | LEFT_CONTEXT = 0 33 | RIGHT_CONTEXT = 0 34 | NUMBER_OF_SKIP_FRAME = 0 35 | 36 | #========================================== 37 | # NEURAL MASL ESTIMATOR TRAINNING PARAMERTERS 38 | #========================================== 39 | TRUNCATE_GRAD = 7 40 | SPEECH_DIRECTORY = r'./dataset/validate/speech/*' 41 | NOISE_DIRECTORY = r'./dataset/validate/noise/*' 42 | IS_DEBUG_SHOW_MASK_AND_SYNTHESIS = False 43 | 44 | #========================================== 45 | # NAME for making validation data 46 | #========================================== 47 | VALIDATION_SPEC = r'./validation_features/val_spec.npy' 48 | VALIDATION_SPEECH_MASK = r'./validation_features/speech_mask.npy' 49 | VALIDATION_NOISE_MASK = r'./validation_features/noise_mask.npy' 50 | 51 | 52 | NUMBER_OF_STACK = LEFT_CONTEXT + RIGHT_CONTEXT + 1 53 | 54 | #========================================== 55 | # augmentation parameters 56 | #========================================== 57 | ''' 58 | snr: [SNR20, SNR15, SNR10, SNR5, SNR0] 59 | prob.: [0.2,... 0.2] 60 | ''' 61 | SNR_generator = stats.rv_discrete(values=(np.array([0, 1, 2, 3, 4]), 62 | (0.2, 0.2, 0.2, 0.2, 0.2))) 63 | noise_list = glob.glob(NOISE_DIRECTORY) 64 | RIR_CONVOLVE_CHANCE_RATE = 0.25 # 0.5 means 50 % chanve rate 65 | 66 | #========================================== 67 | # prepare speech and noise file list 68 | #========================================== 69 | speech_list = glob.glob(SPEECH_DIRECTORY) 70 | noise_list = glob.glob(NOISE_DIRECTORY) 71 | 72 | 73 | #========================================== 74 | # training data shaper 75 | #========================================== 76 | data_shaper = shaper.Shape_data(LEFT_CONTEXT, 77 | RIGHT_CONTEXT, 78 | TRUNCATE_GRAD, 79 | NUMBER_OF_SKIP_FRAME ) 80 | 81 | 82 | #========================================== 83 | # get features 84 | #========================================== 85 | feature_extractor = feature.Feature(SAMPLING_FREQUENCY, FFTL, SHIFT) 86 | 87 | noise_generator = augment.Generate_random_noise(noise_list, SAMPLING_FREQUENCY) 88 | 89 | reverbarent_generator = augment.RIR_convolve(SAMPLING_FREQUENCY) 90 | 91 | #========================================== 92 | # go training 93 | #========================================== 94 | TRIM = np.int(0.05 * SAMPLING_FREQUENCY) # beginning and ending of uttearnce is not used for training 95 | freq_grid = np.linspace(0, SAMPLING_FREQUENCY, FFTL)[0:FFTL // 2 + 1] 96 | bin_index = np.argmin(np.abs(freq_grid - 2000)) 97 | 98 | 99 | speech_list_shuffle = random.sample(speech_list, len(speech_list)) 100 | 101 | # go NN parameters optimizer 102 | 103 | feature_stack = [] 104 | label_stack_sp = [] 105 | label_stack_n = [] 106 | 107 | # dumping frame until searching # of utterances 108 | while True: 109 | 110 | if len(speech_list_shuffle) <= 0: 111 | break 112 | 113 | index = np.random.randint(0, len(speech_list_shuffle), 1)[0] 114 | audio_path = speech_list_shuffle[index] 115 | speech_list_shuffle.pop(index) # remove uterance chosen yet 116 | 117 | 118 | speech = sf.read(audio_path, dtype='float32')[0] 119 | 120 | if len(speech) != 0: 121 | speech = feature_extractor.add_white_noise(speech) 122 | if IS_DEBUG_SHOW_MASK_AND_SYNTHESIS == True: 123 | sf.write('./result/speech_clean.wav', speech , 16000) 124 | SNR_index = SNR_generator.rvs(size=1)[0] 125 | noise = noise_generator.get_noise(len(speech)) 126 | noise = feature_extractor.add_white_noise(noise) 
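        # Note added for readability (not in the original script): the block below completes the
        # on-the-fly augmentation chain for this utterance:
        #   1. with probability RIR_CONVOLVE_CHANCE_RATE, convolve speech and noise with a simulated
        #      room impulse response (image method, see augment.RIR_convolve)
        #   2. rescale the speech/noise pair so the mixture reaches the SNR drawn from SNR_generator
        #      (20 / 15 / 10 / 5 / 0 dB, each with probability 0.2)
        #   3. rescale both signals once more to avoid clipping before the masks are computed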
127 | 128 | if RIR_CONVOLVE_CHANCE_RATE != 0: 129 | # convolve RIR 130 | if np.random.randint(0, 1 // RIR_CONVOLVE_CHANCE_RATE, 1)[0] == 1: 131 | speech, noise = reverbarent_generator.get_reverbant_speech(speech, noise) 132 | 133 | snr_adjuster = augment.SNR_adjusting(speech, noise) 134 | if SNR_index == 0: 135 | SNR = 20 136 | elif SNR_index == 1: 137 | SNR = 15 138 | elif SNR_index == 2: 139 | SNR = 10 140 | elif SNR_index == 3: 141 | SNR = 5 142 | elif SNR_index == 4: 143 | SNR = 0 144 | speech, noise = snr_adjuster.add_speech_to_noise(SNR) 145 | speech, noise = snr_adjuster.avoid_clipping(speech, noise) 146 | 147 | # if get mask after SNR adjusting 148 | speech_spectrogram = feature_extractor.get_feature(speech) 149 | noise_spectrogram = feature_extractor.get_feature(noise) 150 | freq_grid = np.linspace(0, SAMPLING_FREQUENCY, FFTL)[0:FFTL // 2 + 1] 151 | bin_index = np.argmin(np.abs(freq_grid - 2000)) 152 | speech_mask, noise_mask = feature_extractor.get_ideal_binary_mask_herman(speech_spectrogram, 153 | noise_spectrogram, 154 | threshold_bin=bin_index, 155 | theta_sp_low=10**(-4), 156 | theta_sp_high=10**(-5), 157 | theta_n_low=10**(-5),#-0.01 158 | theta_n_high=10**(-5)) #-0.02 159 | 160 | noisy_spectrogram = (speech_spectrogram + noise_spectrogram) 161 | noisy_spectrogram = (np.flipud(noisy_spectrogram)) 162 | speech_mask = np.flipud(speech_mask) 163 | noise_mask = np.flipud(noise_mask) 164 | 165 | noisy_spectrogram = feature_extractor.apply_cmvn(noisy_spectrogram) 166 | noisy_spectrogram = noisy_spectrogram + np.random.normal(loc=0, scale=0.0001, size=np.shape(noisy_spectrogram)) 167 | features, label_sp, label_n = data_shaper.convert_for_train(noisy_spectrogram, speech_mask, noise_mask) 168 | 169 | if len(features) != 0: 170 | features = np.array(features) 171 | label_sp = np.array(label_sp) 172 | label_n = np.array(label_n) 173 | if IS_DEBUG_SHOW_MASK_AND_SYNTHESIS == True: 174 | sf.write('./result/speech_noisy.wav', speech + noise, 16000) 175 | pl.figure(), 176 | pl.imshow(noise_mask, aspect='auto', extent=[0, np.shape(noise_mask)[1], 0, 8000]) 177 | pl.title('noise mask') 178 | pl.figure(), 179 | pl.imshow(speech_mask, aspect='auto',extent=[0, np.shape(noise_mask)[1], 0, 8000]) 180 | pl.title('sp mask') 181 | pl.figure() 182 | pl.imshow(noisy_spectrogram, aspect='auto') 183 | pl.show() 184 | sys.exit() 185 | feature_stack.extend(features) 186 | label_stack_sp.extend(label_sp) 187 | label_stack_n.extend(label_n) 188 | 189 | train_features = np.array(feature_stack) 190 | train_label_sp = np.array(label_stack_sp) 191 | train_label_n = np.array(label_stack_n) 192 | 193 | np.save(VALIDATION_SPEC, train_features) 194 | np.save(VALIDATION_NOISE_MASK, train_label_n) 195 | np.save(VALIDATION_SPEECH_MASK, train_label_sp) 196 | -------------------------------------------------------------------------------- /image/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/image/model.png -------------------------------------------------------------------------------- /image/sample_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/image/sample_mask.png -------------------------------------------------------------------------------- /image/sample_mask_multi.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/image/sample_mask_multi.png -------------------------------------------------------------------------------- /maskestimator/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from . import augment 3 | from . import feature 4 | from . import model 5 | from . import shaper 6 | from . import util 7 | from . import adapt_model 8 | 9 | -------------------------------------------------------------------------------- /maskestimator/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/maskestimator/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /maskestimator/__pycache__/adapt_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/maskestimator/__pycache__/adapt_model.cpython-36.pyc -------------------------------------------------------------------------------- /maskestimator/__pycache__/augment.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/maskestimator/__pycache__/augment.cpython-36.pyc -------------------------------------------------------------------------------- /maskestimator/__pycache__/feature.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/maskestimator/__pycache__/feature.cpython-36.pyc -------------------------------------------------------------------------------- /maskestimator/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/maskestimator/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /maskestimator/__pycache__/shaper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/maskestimator/__pycache__/shaper.cpython-36.pyc -------------------------------------------------------------------------------- /maskestimator/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/maskestimator/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /maskestimator/adapt_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | NMBF adaptation 4 | reference: 5 | The Hitachi/JHU CHiME-5 system: 6 | Advances in speech recognition for everyday home 7 | 
environments using multiple microphone arrays [Kanda, 2018] 8 | ''' 9 | 10 | import soundfile as sf 11 | import numpy as np 12 | import glob 13 | import random 14 | import os 15 | 16 | from . import model, feature, shaper 17 | 18 | MAX_SEQUENCE = 5000 19 | 20 | class adapt_model: 21 | def __init__(self, 22 | model_path, 23 | truncate_grad, 24 | number_of_stack, 25 | lr, 26 | spec_dim, 27 | sampling_frequency, 28 | fftl, 29 | shift, 30 | left_context, 31 | right_contect, 32 | number_of_skip_frame, 33 | adapt_data_location): 34 | self.sampling_frequency = sampling_frequency 35 | self.fftl = fftl 36 | self.shift = shift 37 | self.left_context = left_context 38 | self.right_context = right_contect 39 | self.number_of_skip_frame = number_of_skip_frame 40 | self.adapt_data_location = adapt_data_location 41 | self.lr = lr 42 | self.model_path = model_path 43 | self.truncate_grad = truncate_grad 44 | self.number_of_stack = number_of_stack 45 | self.spec_dim = spec_dim 46 | 47 | def get_data_list(self): 48 | data_list = [] 49 | file_list = glob.glob(self.adapt_data_location + '/**') 50 | for ii in range(0, len(file_list)): 51 | if 'sp_mask_' in file_list[ii]: 52 | data_list.append(file_list[ii]) 53 | return data_list 54 | 55 | 56 | def create_data_for_adaptation(self, 57 | is_target, 58 | speaker_uttearnce_list) : 59 | ''' data shape is 60 | training data for adaptation : (B, F) 61 | input data for adaptation: (B, Truncate, T)''' 62 | 63 | mask_estimator_generator = model.NeuralMaskEstimation(self.truncate_grad, self.number_of_stack, self.lr, self.spec_dim) 64 | mask_estimator = mask_estimator_generator.get_model(is_stateful=True, is_show_detail=False, is_adapt=False) 65 | mask_estimator = mask_estimator_generator.load_weight_param(mask_estimator, self.model_path) 66 | 67 | 68 | f = open(speaker_uttearnce_list, 'r', encoding='utf-8') 69 | wav_path = f.readlines() 70 | f.close() 71 | 72 | feature_extractor = feature.Feature(self.sampling_frequency, self.fftl, self.shift) 73 | data_shaper = shaper.Shape_data(self.left_context, 74 | self.right_context, 75 | self.truncate_grad, 76 | self.number_of_skip_frame) 77 | print('creating data for adaptation') 78 | for wav in wav_path: 79 | data = sf.read(wav.replace('\n', ''), dtype='float32')[0] 80 | if len(np.shape(data)) >= 2: 81 | data = data[:, 0] 82 | noisy_spectrogram = feature_extractor.get_feature(data) 83 | noisy_spectrogram = (np.flipud(noisy_spectrogram)) 84 | noisy_spectrogram = feature_extractor.apply_cmvn(noisy_spectrogram) 85 | features = data_shaper.convert_for_predict(noisy_spectrogram) 86 | print(np.shape(features)) 87 | features_padding, original_batch_size = data_shaper.get_padding_features(features) 88 | mask_estimator.reset_states() 89 | prefix = os.path.splitext(wav)[1] 90 | print(np.shape(features_padding)) 91 | sp_mask, n_mask = mask_estimator.predict(features_padding, batch_size=MAX_SEQUENCE) 92 | sp_mask = sp_mask[:original_batch_size, :] 93 | n_mask = n_mask[:original_batch_size, :] 94 | save_path_target = self.adapt_data_location + '/' + os.path.basename(wav).replace(prefix, 'sp_mask_' + str(np.int(is_target)) ) 95 | save_path_input = self.adapt_data_location + '/' + os.path.basename(wav).replace(prefix, 'amp_spec_' + str(np.int(is_target)) ) 96 | save_path_target = save_path_target.replace('\n', '') 97 | save_path_input = save_path_input.replace('\n', '') 98 | np.save(save_path_input, np.array(features)) 99 | np.save(save_path_target, np.array(sp_mask) * np.int(is_target)) 100 | print('done.') 101 | 102 | def 
save_adapt_model(self, save_name): 103 | ''' data shape is 104 | training data for adaptation : (B, F) 105 | input data for adaptation: (B, Truncate, T)''' 106 | mask_estimator_generator = model.NeuralMaskEstimation(self.truncate_grad, self.number_of_stack, self.lr, self.spec_dim) 107 | mask_estimator = mask_estimator_generator.get_model(is_stateful=False, is_show_detail=False, is_adapt=True) 108 | mask_estimator = mask_estimator_generator.load_weight_param(mask_estimator, self.model_path) 109 | 110 | # =========================== 111 | # get wav list for adaptation 112 | # =========================== 113 | training_list = self.get_data_list() 114 | 115 | # =========================== 116 | # fature dump 117 | # =========================== 118 | target_mask = np.zeros((1, self.spec_dim)) 119 | input_amp = np.zeros((1, self.truncate_grad, self.spec_dim)) 120 | for ii in range(0, len(training_list)): 121 | target_mask = np.concatenate((target_mask, np.load(training_list[ii])), axis=0) 122 | input_amp = np.concatenate((input_amp, np.load(training_list[ii].replace('sp_mask_', 'amp_spec_'))), axis=0) 123 | target_mask = target_mask[1:, :] 124 | input_amp = input_amp[1:, :, :] 125 | 126 | # =========================== 127 | # fit 128 | # =========================== 129 | shuffle_index = random.sample(range(0, np.shape(target_mask)[0]), np.shape(target_mask)[0]) 130 | print('adaptation...') 131 | history = mask_estimator.train_on_batch(x=input_amp[shuffle_index, :, :], 132 | y=[target_mask[shuffle_index, :], 133 | target_mask[shuffle_index, :]]) 134 | print('Done.', history) 135 | mask_estimator.save_weights(save_name) 136 | print('save done.' + str(save_name)) 137 | -------------------------------------------------------------------------------- /maskestimator/augment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 24 19:32:49 2019 4 | 5 | @author: a-kojima 6 | """ 7 | import numpy as np 8 | import os 9 | import soundfile as sf 10 | import pyroomacoustics as pra 11 | 12 | class Generate_random_noise: 13 | def __init__(self, 14 | noise_file_list, 15 | sampling_frequency): 16 | self.noise_file_list = noise_file_list 17 | self.sampling_frequency = sampling_frequency 18 | 19 | def get_noise(self, speech_length): 20 | selected_index = np.random.randint(0, len(self.noise_file_list), size=1)[0] 21 | selected_noise_path = self.noise_file_list[selected_index] 22 | noise_length = np.int(np.float(os.path.basename(selected_noise_path).split('_')[-1].split('.wav')[0])) * self.sampling_frequency 23 | if noise_length > speech_length: 24 | noise_cut_start = np.random.randint(0, noise_length - speech_length, size=1)[0] 25 | noise_data = sf.read(selected_noise_path, dtype='float32', start=noise_cut_start, stop=noise_cut_start + speech_length)[0] 26 | if len(np.shape(noise_data)) >= 2: 27 | noise_data = noise_data[:, 0] 28 | else: 29 | noise_data = sf.read(selected_noise_path)[0] 30 | if len(np.shape(noise_data)) >= 2: 31 | noise_data = noise_data[:, 0] 32 | noise_data = np.tile(noise_data, 30) # adhock-number 33 | noise_data = noise_data[0:speech_length] 34 | return noise_data 35 | 36 | class SNR_adjusting: 37 | def __init__(self, speech_data, noise_data): 38 | self.speech_data = speech_data 39 | self.noise_data = noise_data 40 | 41 | def adjust_SNR(self, speech_data, speech_rate): 42 | return speech_data * speech_rate 43 | 44 | def add_speech_to_noise(self, target_SNR): 45 | speech_data = 
self.normalize_amplitude(self.speech_data, 0.9) 46 | noise_data = self.normalize_amplitude(self.noise_data, 0.1) 47 | speech_power_coeficient = self.get_speech_rate(speech_data, noise_data, target_SNR) 48 | return (self.adjust_SNR(speech_power_coeficient, speech_data), noise_data) 49 | 50 | def normalize_amplitude(self, speech_data, max_amplitude): 51 | return speech_data/np.max(np.abs(speech_data)) * max_amplitude 52 | 53 | def get_speech_rate(self, speech, noise, target_SNR): 54 | return 10 ** (target_SNR / np.float(20)) * (np.sum(noise ** 2) / np.float(np.sum(speech ** 2))) 55 | 56 | def avoid_clipping(self, speech, noise): 57 | max_amp = (0.9 - 0.01) * np.random.rand() + 0.01 58 | if (np.max(np.abs(speech))) >= (np.max(np.abs(noise))): 59 | rate = max_amp / (np.max(np.abs(speech))) 60 | speech = speech * rate 61 | noise = noise * rate 62 | else: 63 | rate = (max_amp / (np.max(np.abs(noise)))) 64 | noise = noise * rate 65 | speech = speech * rate 66 | return speech, noise 67 | 68 | class RIR_convolve: 69 | ''' generate speech using image-method based room simulator 70 | ''' 71 | def __init__(self, sampling_frequency) : 72 | self.sampling_frequency = sampling_frequency 73 | 74 | def get_reverbant_speech(self, speech, noise): 75 | meters = np.random.randint(6, 10, 1)[0] 76 | distance = np.random.randint(2, 5, 1)[0] 77 | rt = (0.5 - 0.01) * np.random.rand() + 0.01 78 | # speech 79 | room = pra.ShoeBox([meters, meters], fs=self.sampling_frequency, t0=0., absorption=rt, max_order=12) 80 | R = pra.circular_2D_array(center=[distance, distance], M=1, phi0=0, radius=0.07) 81 | room.add_microphone_array(pra.MicrophoneArray(R, room.fs)) 82 | room.add_source([1, 1], signal=speech) 83 | room.simulate() 84 | ori_length = len(speech) 85 | speech = room.mic_array.signals.T 86 | speech = speech[0:ori_length, 0] 87 | # noise 88 | distance2 = np.random.randint(2, 5, 1)[0] 89 | room = pra.ShoeBox([meters, meters], fs=self.sampling_frequency, t0=0., absorption=rt, max_order=12) 90 | R = pra.circular_2D_array(center=[distance2, distance2], M=1, phi0=0, radius=0.07) 91 | room.add_microphone_array(pra.MicrophoneArray(R, room.fs)) 92 | room.add_source([distance2-0.5, distance2-0.5], signal=noise) 93 | room.simulate() 94 | ori_length = len(noise) 95 | noise = room.mic_array.signals.T 96 | noise = noise[0:ori_length, 0] 97 | return speech, noise -------------------------------------------------------------------------------- /maskestimator/feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 24 19:25:12 2019 4 | 5 | @author: a-kojima 6 | """ 7 | 8 | import numpy as np 9 | import librosa 10 | 11 | class Feature: 12 | 13 | def __init__(self, 14 | sampling_frequency, 15 | fftl, 16 | shift): 17 | self.sampling_frequency = sampling_frequency 18 | self.fftl = fftl 19 | self.shift = shift 20 | 21 | def add_white_noise(self, data, min_amp=0.00001): 22 | return data + np.random.normal(loc=0, scale=min_amp, size=len(data)) 23 | 24 | def get_feature(self, speech): 25 | spectrogram = librosa.core.stft(speech, 26 | n_fft=self.fftl, 27 | hop_length=self.shift, 28 | win_length=self.fftl) 29 | return np.abs(spectrogram) 30 | 31 | def get_ideal_binary_mask_herman(self, 32 | speech_spectrogram, 33 | noise_spectrogram, 34 | threshold_bin=100, 35 | theta_sp_low=0, 36 | theta_sp_high=0, 37 | theta_n_low=0, 38 | theta_n_high=0): 39 | speech_mask = np.sqrt(np.abs(speech_spectrogram) ** 2) / np.sqrt(np.abs(noise_spectrogram) ** 2) 40 | 
noise_mask = np.sqrt(np.abs(speech_spectrogram) ** 2) / np.sqrt(np.abs(noise_spectrogram) ** 2) 41 | 42 | speech_mask_low = speech_mask[0:threshold_bin, :] 43 | speech_mask_high = speech_mask[threshold_bin:, :] 44 | noise_mask_low = noise_mask[0:threshold_bin, :] 45 | noise_mask_high = noise_mask[threshold_bin:, :] 46 | speech_mask_low[speech_mask_low > theta_sp_low] = 1 47 | speech_mask_low[speech_mask_low <= theta_sp_low] = 0 48 | 49 | speech_mask_high[speech_mask_high > theta_sp_high] = 1 50 | speech_mask_high[speech_mask_high <= theta_sp_high] = 0 51 | 52 | noise_mask_low[noise_mask_low > theta_n_low] = 1 53 | noise_mask_low[noise_mask_low <= theta_n_low] = 0 54 | 55 | noise_mask_high[noise_mask_high > theta_n_high] = 1 56 | noise_mask_high[noise_mask_high <= theta_n_high] = 0 57 | 58 | speech_mask[0:threshold_bin, :] = speech_mask_low 59 | speech_mask[threshold_bin:, :] = speech_mask_high 60 | 61 | noise_mask[0:threshold_bin, :] = noise_mask_low 62 | noise_mask[threshold_bin:, :] = noise_mask_high 63 | 64 | noise_mask_tmp = self.apply_cmvn(speech_spectrogram ** 2) 65 | speech_mask_tmp = noise_mask_tmp 66 | noise_mask_tmp[noise_mask_tmp <= 0.0001] = 0 67 | noise_mask_tmp[noise_mask_tmp > 0.0001] = 1 68 | 69 | speech_mask_tmp[speech_mask_tmp <= 0.01] = 0 70 | speech_mask_tmp[speech_mask_tmp > 0.01] = 1 71 | 72 | noise_mask_tmp = 1 - noise_mask_tmp 73 | noise_mask = np.logical_and(noise_mask_tmp, noise_mask) 74 | speech_mask = np.logical_and(speech_mask_tmp, speech_mask) 75 | 76 | speech_mask, noise_mask = self.apply_filter_spech_component(speech_mask, noise_mask) 77 | 78 | return (speech_mask.astype(np.int), noise_mask.astype(np.int)) 79 | 80 | def apply_cmvn(self, specs): 81 | mean = np.mean(specs, axis=1) 82 | std_var = np.std(specs, axis=1) 83 | return ((specs.T - mean) / std_var).T 84 | 85 | def apply_range_norm(self, specs): 86 | specs = ((specs - np.min(specs)) / (np.max(specs) - np.min(specs))) * (1 - 0) + 0 87 | return specs 88 | 89 | def apply_filter_spech_component(self, speech_mask, noise_mask): 90 | freq_grid = np.linspace(0, self.sampling_frequency, self.fftl)[0:np.int(self.fftl / 2) + 1] 91 | hz_90_index = np.argmin(np.abs(freq_grid - 50)) 92 | speech_mask[ 0: hz_90_index, :] = 0.0 93 | noise_mask[ 0: hz_90_index, :] = 1.0 94 | hz_7800_index = np.argmin(np.abs(freq_grid - 7900)) 95 | speech_mask[ hz_7800_index:, :] = 0.0 96 | noise_mask[ hz_7800_index:, :] = 1.0 97 | return speech_mask, noise_mask 98 | -------------------------------------------------------------------------------- /maskestimator/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 24 19:25:12 2019 4 | 5 | @author: a-kojima 6 | """ 7 | from tensorflow.keras import Model 8 | from tensorflow.keras.layers import Dense, LSTM, Dropout 9 | from tensorflow.keras import layers 10 | from tensorflow.keras import optimizers 11 | from tensorflow.keras import initializers 12 | 13 | DROPOUT = 0.5 14 | MAX_SEQUENCE = 5000 15 | 16 | class NeuralMaskEstimation: 17 | 18 | def __init__(self, 19 | truncate_grad, 20 | number_of_stack, 21 | lr, 22 | spec_dim, 23 | ff_dropout=0, 24 | recurrent_dropout=0, 25 | recurrent_init=0.04): 26 | self.truncate_grad = truncate_grad 27 | self.number_of_stack = number_of_stack 28 | self.lr = lr 29 | self.spec_dim = spec_dim # 513 30 | self.ff_dropout = ff_dropout 31 | self.recurrent_dropout = recurrent_dropout 32 | self.recurrent_init=recurrent_init 33 | 34 | def get_model(self, 35 | 
is_stateful=True, 36 | is_show_detail=True, 37 | is_adapt=False): 38 | if is_stateful == True: 39 | input_sequence = layers.Input(shape=(self.truncate_grad, self.spec_dim * self.number_of_stack), batch_size=MAX_SEQUENCE) # time step * feature_size 40 | else: 41 | input_sequence = layers.Input(shape=(self.truncate_grad, self.spec_dim * self.number_of_stack)) # time step * feature_size 42 | LSTM_layer = (LSTM(self.spec_dim, 43 | activation='tanh', 44 | recurrent_activation='sigmoid', 45 | return_sequences=False, 46 | stateful=is_stateful, 47 | dropout=self.ff_dropout, 48 | recurrent_dropout=self.recurrent_dropout, 49 | go_backwards=False, 50 | unroll=True, 51 | recurrent_initializer=initializers.RandomUniform(minval=-self.recurrent_init, maxval=self.recurrent_init), #0.04 52 | name='lstm' 53 | ))(input_sequence) 54 | DROPOUT1 = Dropout(DROPOUT, name='dropout1')(LSTM_layer) 55 | FC1 = Dense(self.spec_dim,activation='relu', name='fc1')(DROPOUT1) 56 | FC2 = Dropout(DROPOUT, name='dropout2')(FC1) 57 | FC3 = Dense(self.spec_dim,activation='relu', name='fc2')(FC2) 58 | DROPOUT2 = Dropout(DROPOUT, name='dropout3')(FC3) 59 | OUTPUT1 = Dense(self.spec_dim, 60 | activation='sigmoid', 61 | name='speech_mask')(DROPOUT2) 62 | 63 | OUTPUT2 = Dense(513, 64 | activation='sigmoid', 65 | name='noise_mask')(DROPOUT2) 66 | model = Model(inputs=[input_sequence], outputs=[OUTPUT1, OUTPUT2]) 67 | 68 | if is_adapt == False: 69 | model.compile( 70 | loss={'speech_mask':'binary_crossentropy', 'noise_mask':'binary_crossentropy'}, 71 | metrics=['acc'], 72 | sample_weight_mode="None", 73 | loss_weights={'speech_mask':1, 'noise_mask':1}, 74 | optimizer=optimizers.RMSprop(lr=self.lr, decay=1e-6, epsilon=1e-06, clipnorm=1.0)) 75 | else: 76 | model.compile( 77 | loss={'speech_mask':'binary_crossentropy', 'noise_mask':'binary_crossentropy'}, 78 | metrics=['acc'], 79 | sample_weight_mode="None", 80 | loss_weights={'speech_mask':1, 'noise_mask':0}, 81 | #optimizer=optimizers.RMSprop(lr=self.lr, decay=1e-6, epsilon=1e-06)) 82 | optimizer=optimizers.RMSprop(lr=self.lr, clipnorm=1.0)) 83 | 84 | if is_show_detail == True: 85 | model.summary() 86 | 87 | #utils.plot_model(model, to_file='model.png') 88 | 89 | return model 90 | 91 | def load_weight_param(self, model, weight_path): 92 | model.load_weights(weight_path) 93 | model._make_predict_function() 94 | return model 95 | -------------------------------------------------------------------------------- /maskestimator/shaper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on Wed Aug 7 10:25:44 2019 4 | 5 | @author: a-kojima 6 | 7 | ''' 8 | import numpy as np 9 | 10 | MAX_SEQUENCE = 5000 11 | 12 | class Shape_data: 13 | def __init__(self, left_stack, right_stack, max_sequence, number_of_skip_frames): 14 | self.left_stack = left_stack 15 | self.right_stack = right_stack 16 | self.max_sequence = max_sequence 17 | self.number_of_skip_frames = number_of_skip_frames 18 | self.number_of_stack = self.left_stack + self.right_stack + 1 19 | 20 | def convert_for_train(self, data, label1, label2): 21 | ''' 22 | feature: (T, F) -> (B, TRUNCATE, F) 23 | label: (T, F) -> (B, F) 24 | ''' 25 | 26 | stack_data = [] 27 | stack_label_sp = [] 28 | stack_label_n = [] 29 | fftl, number_of_frames = np.shape(data) 30 | number_of_sample = np.int((number_of_frames - (self.left_stack + self.right_stack + 1)) / (self.number_of_skip_frames + 1)) # # number of sample 31 | number_of_mini_batch = np.int(number_of_sample - 
self.max_sequence) 32 | 33 | if number_of_mini_batch == 0: # less than 1 block 34 | return (np.array([]), np.array([]), np.array([])) 35 | utterance_pointer = self.left_stack 36 | 37 | for j in range(0, number_of_mini_batch): 38 | tmp_stack_data = [] 39 | center_position = utterance_pointer 40 | for i in range(0, self.max_sequence): 41 | cut_data = data[:, center_position - self.left_stack:center_position + self.right_stack + 1 ] 42 | if i == np.int(self.max_sequence / 2): 43 | cut_label1 = (label1[:, center_position]) 44 | cut_label2 = (label2[:, center_position]) 45 | vec_data = np.reshape(cut_data, fftl * self.number_of_stack) 46 | tmp_stack_data.append(vec_data) 47 | center_position = center_position + 1 + self.number_of_skip_frames 48 | stack_data.append(tmp_stack_data) 49 | stack_label_sp.append(cut_label1) 50 | stack_label_n.append(cut_label2) 51 | utterance_pointer = utterance_pointer + 1 52 | return (stack_data, stack_label_sp, stack_label_n) 53 | 54 | def convert_for_predict(self, data): 55 | ''' 56 | feature: (T, F) -> (B, TRUNCATE, F) 57 | ''' 58 | stack_data = [] 59 | fftl, number_of_frames = np.shape(data) 60 | number_of_sample = np.int((number_of_frames - (self.left_stack + self.right_stack + 1)) / (self.number_of_skip_frames + 1)) # # number of sample 61 | number_of_mini_batch = np.int(number_of_sample - self.max_sequence) 62 | 63 | if number_of_mini_batch == 0: # less than 1 block 64 | return (np.array([]), np.array([]), np.array([])) 65 | 66 | utterance_pointer = self.left_stack 67 | 68 | for j in range(0, number_of_mini_batch): 69 | tmp_stack_data = [] 70 | center_position = utterance_pointer 71 | for i in range(0, self.max_sequence): 72 | cut_data = data[:, center_position - self.left_stack:center_position + self.right_stack + 1 ] 73 | vec_data = np.reshape(cut_data, fftl * self.number_of_stack) 74 | tmp_stack_data.append(vec_data) 75 | center_position = center_position + 1 + self.number_of_skip_frames 76 | stack_data.append(tmp_stack_data) 77 | utterance_pointer = utterance_pointer + 1 78 | return stack_data 79 | 80 | def get_padding_features(self, predict_features): 81 | batch, sequence, feature_order = np.shape(predict_features) 82 | padding_feature = np.zeros((MAX_SEQUENCE, sequence, feature_order), dtype=np.float32) 83 | padding_feature[: batch, :, :] = predict_features 84 | return padding_feature, batch 85 | -------------------------------------------------------------------------------- /maskestimator/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | 4 | ''' 5 | import tensorflow as tf 6 | 7 | def write_log(callback, names, logs, batch_no): 8 | for name, value in zip(names, logs): 9 | summary = tf.Summary() 10 | summary_value = summary.value.add() 11 | summary_value.simple_value = value 12 | summary_value.tag = name 13 | callback.writer.add_summary(summary, batch_no) 14 | callback.writer.flush() 15 | 16 | def create_validation_data(validation_directory): 17 | '''create validation data and save them as numpy array 18 | 19 | ''' -------------------------------------------------------------------------------- /model/194sequence_false_e1.hdf5.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/model/194sequence_false_e1.hdf5.data-00000-of-00001 -------------------------------------------------------------------------------- 
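The TensorBoard helper in maskestimator/util.py above relies on the TensorFlow 1.x tf.Summary protocol buffer. If the project is run under TensorFlow 2.x, a hedged equivalent sketch (not part of the repository; write_log_v2 is a hypothetical name) could use the tf.summary file-writer API instead:

import tensorflow as tf

def write_log_v2(writer, names, logs, batch_no):
    # writer: a summary file writer such as tf.summary.create_file_writer('./tflog')
    with writer.as_default():
        for name, value in zip(names, logs):
            tf.summary.scalar(name, value, step=batch_no)
        writer.flush()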
/model/194sequence_false_e1.hdf5.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/model/194sequence_false_e1.hdf5.index -------------------------------------------------------------------------------- /model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "speaker_2.hdf5" 2 | all_model_checkpoint_paths: "speaker_2.hdf5" 3 | -------------------------------------------------------------------------------- /non_adapt_speaker_list.txt: -------------------------------------------------------------------------------- 1 | ./dataset/speech_speaker2/JA018_1.wav 2 | ./dataset/speech_speaker2/JA018_2.wav 3 | ./dataset/speech_speaker2/JA018_3.wav 4 | ./dataset/speech_speaker2/JA018_4.wav 5 | ./dataset/speech_speaker2/JA018_5.wav 6 | ./dataset/speech_speaker2/JA018_9.wav 7 | ./dataset/speech_speaker2/JA018_10.wav -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 24 19:29:57 2019 4 | 5 | @author: a-kojima 6 | """ 7 | 8 | import numpy as np 9 | import soundfile as sf 10 | import matplotlib.pyplot as pl 11 | 12 | from beamformer import complexGMM_mvdr as cgmm 13 | from beamformer import complexGMM_mvdr_snr_selective as cgmm_snr 14 | from maskestimator import model, shaper, feature 15 | 16 | #========================================== 17 | # ANALYSIS PARAMETERS 18 | #========================================== 19 | SAMPLING_FREQUENCY = 16000 20 | FFTL = 1024 21 | SHIFT = 256 22 | 23 | #========================================== 24 | # ESURAL MASL ESTIMATOR PARAMETERS 25 | #========================================== 26 | LEFT_CONTEXT = 0 27 | RIGHT_CONTEXT = 0 28 | NUMBER_OF_SKIP_FRAME = 0 29 | 30 | #========================================== 31 | # ESURAL MASL ESTIMATOR TRAINNING PARAMERTERS 32 | #========================================== 33 | TRUNCATE_GRAD = 7 34 | IS_DEBUG_SHOW_PREDICT_MASK = True 35 | 36 | NOISY_SPEECH_PATH = r'./dataset/data_for_beamforming/F02_011C021A_BUS.CH{}.wav' 37 | CHANNEL_INDEX = [1, 2, 3, 4, 5, 6] 38 | WEIGHT_PATH = r'./model/194sequence_false_e1.hdf5' 39 | 40 | NUMBER_OF_STACK = LEFT_CONTEXT + RIGHT_CONTEXT + 1 41 | 42 | OPERATION = 'median' 43 | RECURRENT_CELL_INIT = 0.00001 #0.04 44 | 45 | MAX_SEQUENCE = 5000 46 | 47 | #========================================== 48 | # get model 49 | #========================================== 50 | mask_estimator_generator = model.NeuralMaskEstimation(TRUNCATE_GRAD, 51 | NUMBER_OF_STACK, 52 | 0.1, 53 | FFTL // 2 + 1, 54 | recurrent_init=RECURRENT_CELL_INIT) 55 | 56 | mask_estimator = mask_estimator_generator.get_model(is_stateful=True, is_show_detail=True, is_adapt=False) 57 | 58 | mask_estimator = mask_estimator_generator.load_weight_param(mask_estimator, WEIGHT_PATH) 59 | #========================================== 60 | # predicting data shaper 61 | #========================================== 62 | data_shaper = shaper.Shape_data(LEFT_CONTEXT, 63 | RIGHT_CONTEXT, 64 | TRUNCATE_GRAD, 65 | NUMBER_OF_SKIP_FRAME ) 66 | 67 | #========================================== 68 | # get features 69 | #========================================== 70 | feature_extractor = feature.Feature(SAMPLING_FREQUENCY, FFTL, SHIFT) 71 | 72 | for ii in range(0, 
len(CHANNEL_INDEX)): 73 | speech = sf.read(NOISY_SPEECH_PATH.replace('{}', str(CHANNEL_INDEX[ii])))[0] 74 | 75 | noisy_spectrogram = feature_extractor.get_feature(speech) 76 | noisy_spectrogram = (np.flipud(noisy_spectrogram)) 77 | noisy_spectrogram = feature_extractor.apply_cmvn(noisy_spectrogram) 78 | 79 | features = data_shaper.convert_for_predict(noisy_spectrogram) 80 | features = np.array(features) 81 | 82 | mask_estimator.reset_states() 83 | 84 | padding_feature, original_batch_size = data_shaper.get_padding_features(features) 85 | sp_mask, n_mask = mask_estimator.predict(padding_feature, batch_size=MAX_SEQUENCE) 86 | sp_mask = sp_mask[:original_batch_size, :] 87 | n_mask = n_mask[:original_batch_size, :] 88 | 89 | if IS_DEBUG_SHOW_PREDICT_MASK == True: 90 | pl.subplot(len(CHANNEL_INDEX), 2, ((ii + 1) * 2) - 1) 91 | pl.imshow(((n_mask).T), aspect='auto') 92 | pl.subplot(len(CHANNEL_INDEX), 2, ((ii + 1) * 2)) 93 | pl.imshow(((sp_mask).T), aspect='auto') 94 | 95 | 96 | if ii == 0: 97 | aa,bb = np.shape(n_mask) 98 | n_median = np.zeros((aa,bb,len(CHANNEL_INDEX))) 99 | sp_median = np.zeros((aa,bb,len(CHANNEL_INDEX))) 100 | 101 | n_median[:,:,ii] = n_mask 102 | sp_median[:,:,ii] = sp_mask 103 | dump_speech = np.zeros((len(speech), len(CHANNEL_INDEX))) 104 | dump_speech[:, ii] = speech 105 | else: 106 | n_median[:,:,ii] = n_mask 107 | sp_median[:,:,ii] = sp_mask 108 | dump_speech[:, ii] = speech 109 | 110 | if OPERATION == 'median': 111 | n_median_s = np.median(n_median, axis=2) 112 | sp_median_s = np.median(sp_median, axis=2) 113 | else: 114 | n_median_s = np.mean(n_median, axis=2) 115 | sp_median_s = np.mean(sp_median, axis=2) 116 | 117 | 118 | if IS_DEBUG_SHOW_PREDICT_MASK == True: 119 | pl.figure() 120 | pl.subplot(3,1,1) 121 | pl.imshow((np.log10(noisy_spectrogram[:, TRUNCATE_GRAD // 2:- TRUNCATE_GRAD // 2] ** 2) * 10), aspect='auto') 122 | pl.subplot(3,1,2) 123 | pl.imshow(((n_median_s.T)), aspect = "auto") 124 | pl.title('noise mask') 125 | pl.subplot(3,1,3) 126 | pl.imshow(((sp_median_s.T)), aspect = "auto") 127 | pl.title('speech mask') 128 | pl.show() 129 | 130 | #========================================== 131 | # beamforming 132 | #========================================== 133 | 134 | # sinple MVDR 135 | cgmm_bf = cgmm.complexGMM_mvdr(SAMPLING_FREQUENCY, FFTL, SHIFT, 10, 10) 136 | tmp_complex_spectrum, R_x, R_n, tt, nn = cgmm_bf.get_spatial_correlation_matrix_from_mask_for_LSTM(dump_speech, 137 | speech_mask=sp_median_s.T, 138 | noise_mask=n_median_s.T, 139 | less_frame=3) 140 | beamformer, steering_vector = cgmm_bf.get_mvdr_beamformer(R_x, R_n) 141 | enhan_speech = cgmm_bf.apply_beamformer(beamformer, tmp_complex_spectrum) 142 | 143 | # reference mic selection MVDR 144 | cgmm_bf_snr = cgmm_snr.complexGMM_mvdr(SAMPLING_FREQUENCY, FFTL, SHIFT, 10, 10) 145 | 146 | tmp_complex_spectrum, R_x, R_n, tt, nn = cgmm_bf_snr.get_spatial_correlation_matrix_from_mask_for_LSTM(dump_speech, 147 | speech_mask=sp_median_s.T, 148 | noise_mask=n_median_s.T, 149 | less_frame=3) 150 | 151 | selected_beamformer = cgmm_bf_snr.get_mvdr_beamformer_by_maxsnr(R_x, R_n) 152 | enhan_speech2 = cgmm_bf_snr.apply_beamformer(selected_beamformer, tmp_complex_spectrum) 153 | 154 | enhan_speech = enhan_speech / np.max(np.abs(enhan_speech)) * 0.75 155 | enhan_speech2 = enhan_speech2 / np.max(np.abs(enhan_speech2)) * 0.75 156 | sf.write('./result/enhacement_all_channels.wav', enhan_speech, 16000) 157 | sf.write('./result/enhacement_snr_select.wav', enhan_speech2, 16000) 158 | 
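# ----------------------------------------------------------------------------
# For reference: get_mvdr_beamformer() above returns per-frequency MVDR weights
# built from the mask-weighted spatial covariances R_x (speech) and R_n (noise).
# The sketch below is only a minimal NumPy illustration of that computation
# (steering vector = principal eigenvector of R_x, w = R_n^{-1} v / v^H R_n^{-1} v);
# the helper name and the (F, M, M) covariance layout are assumptions and do not
# mirror the internal API of beamformer/complexGMM_mvdr.py.
def _mvdr_weights_sketch(R_x, R_n, diagonal_loading=1e-6):
    number_of_bins, number_of_mics, _ = np.shape(R_x)
    weights = np.zeros((number_of_bins, number_of_mics), dtype=np.complex128)
    for f in range(number_of_bins):
        # regularize the noise covariance so it can be inverted safely
        Rn = R_n[f] + diagonal_loading * np.eye(number_of_mics)
        # steering vector: principal eigenvector of the speech covariance
        _, eigen_vectors = np.linalg.eigh(R_x[f])
        steering = eigen_vectors[:, -1]
        # MVDR solution with the distortionless constraint w^H v = 1
        numerator = np.linalg.solve(Rn, steering)
        weights[f, :] = numerator / (np.conj(steering) @ numerator)
    return weights
# ----------------------------------------------------------------------------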
-------------------------------------------------------------------------------- /predict_single.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 24 19:29:57 2019 4 | 5 | @author: a-kojima 6 | """ 7 | 8 | import numpy as np 9 | import soundfile as sf 10 | import matplotlib.pyplot as pl 11 | import sys 12 | 13 | from beamformer import complexGMM_mvdr as cgmm 14 | from beamformer import util 15 | from beamformer import complexGMM_mvdr_snr_selective as cgmm_snr 16 | from maskestimator import model, shaper, feature 17 | 18 | 19 | def apply_range_norm(specs, min_val=0): 20 | specs = ((specs - np.min(specs)) / (np.max(specs) - np.min(specs))) * (1 - min_val) + min_val 21 | return specs 22 | 23 | #========================================== 24 | # ANALYSIS PARAMETERS 25 | #========================================== 26 | SAMPLING_FREQUENCY = 16000 27 | FFTL = 1024 28 | SHIFT = 256 29 | 30 | #========================================== 31 | # ESURAL MASL ESTIMATOR PARAMETERS 32 | #========================================== 33 | LEFT_CONTEXT = 0 34 | RIGHT_CONTEXT = 0 35 | NUMBER_OF_SKIP_FRAME = 0 36 | 37 | #========================================== 38 | # ESURAL MASL ESTIMATOR TRAINNING PARAMERTERS 39 | #========================================== 40 | TRUNCATE_GRAD = 7 41 | IS_DEBUG_SHOW_PREDICT_MASK = True 42 | 43 | NOISY_SPEECH_PATH = r'./dataset/data_for_beamforming/F02_011C021A_BUS.CH{}.wav' 44 | CHANNEL_INDEX = [1] 45 | WEIGHT_PATH = r'./model/194sequence_false_e1.hdf5' 46 | 47 | NUMBER_OF_STACK = LEFT_CONTEXT + RIGHT_CONTEXT + 1 48 | 49 | OPERATION = 'median' 50 | RECURRENT_CELL_INIT = 0.00001 #0.04 51 | 52 | MAX_SEQUENCE = 5000 53 | 54 | #========================================== 55 | # get model 56 | #========================================== 57 | mask_estimator_generator = model.NeuralMaskEstimation(TRUNCATE_GRAD, 58 | NUMBER_OF_STACK, 59 | 0.1, 60 | FFTL // 2 + 1, 61 | recurrent_init=RECURRENT_CELL_INIT) 62 | 63 | mask_estimator = mask_estimator_generator.get_model(is_stateful=True, is_show_detail=True, is_adapt=False) 64 | 65 | mask_estimator = mask_estimator_generator.load_weight_param(mask_estimator, WEIGHT_PATH) 66 | #========================================== 67 | # predicting data shaper 68 | #========================================== 69 | data_shaper = shaper.Shape_data(LEFT_CONTEXT, 70 | RIGHT_CONTEXT, 71 | TRUNCATE_GRAD, 72 | NUMBER_OF_SKIP_FRAME ) 73 | 74 | #========================================== 75 | # get features 76 | #========================================== 77 | feature_extractor = feature.Feature(SAMPLING_FREQUENCY, FFTL, SHIFT) 78 | 79 | for ii in range(0, len(CHANNEL_INDEX)): 80 | speech = sf.read(NOISY_SPEECH_PATH.replace('{}', str(CHANNEL_INDEX[ii])))[0] 81 | 82 | noisy_spectrogram = feature_extractor.get_feature(speech) 83 | noisy_spectrogram = (np.flipud(noisy_spectrogram)) 84 | noisy_spectrogram = feature_extractor.apply_cmvn(noisy_spectrogram) 85 | 86 | features = data_shaper.convert_for_predict(noisy_spectrogram) 87 | features = np.array(features) 88 | 89 | mask_estimator.reset_states() 90 | 91 | padding_feature, original_batch_size = data_shaper.get_padding_features(features) 92 | sp_mask, n_mask = mask_estimator.predict(padding_feature, batch_size=MAX_SEQUENCE) 93 | sp_mask = sp_mask[:original_batch_size, :] 94 | n_mask = n_mask[:original_batch_size, :] 95 | 96 | if IS_DEBUG_SHOW_PREDICT_MASK == True: 97 | pl.subplot(len(CHANNEL_INDEX), 2, ((ii + 1) * 2) - 1) 
98 | pl.imshow(((n_mask).T), aspect='auto') 99 | pl.subplot(len(CHANNEL_INDEX), 2, ((ii + 1) * 2)) 100 | pl.imshow(((sp_mask).T), aspect='auto') 101 | 102 | 103 | if ii == 0: 104 | aa,bb = np.shape(n_mask) 105 | n_median = np.zeros((aa,bb,len(CHANNEL_INDEX))) 106 | sp_median = np.zeros((aa,bb,len(CHANNEL_INDEX))) 107 | 108 | n_median[:,:,ii] = n_mask 109 | sp_median[:,:,ii] = sp_mask 110 | dump_speech = np.zeros((len(speech), len(CHANNEL_INDEX))) 111 | dump_speech[:, ii] = speech 112 | else: 113 | n_median[:,:,ii] = n_mask 114 | sp_median[:,:,ii] = sp_mask 115 | dump_speech[:, ii] = speech 116 | 117 | if OPERATION == 'median': 118 | n_median_s = np.median(n_median, axis=2) 119 | sp_median_s = np.median(sp_median, axis=2) 120 | else: 121 | n_median_s = np.mean(n_median, axis=2) 122 | sp_median_s = np.mean(sp_median, axis=2) 123 | 124 | 125 | 126 | 127 | #========================================== 128 | # beamforming 129 | #========================================== 130 | 131 | # sinple MVDR 132 | cgmm_bf = cgmm.complexGMM_mvdr(SAMPLING_FREQUENCY, FFTL, SHIFT, 10, 10) 133 | tmp_complex_spectrum, R_x, R_n, tt, nn = cgmm_bf.get_spatial_correlation_matrix_from_mask_for_LSTM(dump_speech, 134 | speech_mask=sp_median_s.T, 135 | noise_mask=n_median_s.T, 136 | less_frame=3) 137 | # extract 138 | tmp_complex_spectrum = tmp_complex_spectrum[0,:,:] 139 | print(np.shape(sp_median_s)) 140 | print(np.shape(tmp_complex_spectrum)) 141 | # min frame size 142 | frame1 = np.shape(sp_median_s)[0] 143 | frame2 = np.shape(tmp_complex_spectrum)[1] 144 | min_f = np.min((frame1, frame2)) 145 | sp_median_s = sp_median_s[0:min_f, :] 146 | sp_median_s = apply_range_norm(sp_median_s) 147 | tmp_complex_spectrum = np.fliplr(tmp_complex_spectrum[0:min_f, :]) 148 | #enhanced_spectrum = sp_median_s * np.flipud(tmp_complex_spectrum) 149 | enhanced_spectrum = sp_median_s * (tmp_complex_spectrum) 150 | 151 | #enhanced_spectrum = np.flipud(np.fliplr(enhanced_spectrum)) 152 | enhanced_spectrum = (np.fliplr(enhanced_spectrum)) 153 | 154 | pl.figure() 155 | pl.imshow((sp_median_s), aspect='auto') 156 | 157 | pl.figure() 158 | pl.imshow(np.abs(tmp_complex_spectrum), aspect='auto') 159 | pl.show() 160 | 161 | test_wavform = util.spec2wav(enhanced_spectrum, SAMPLING_FREQUENCY, FFTL, FFTL, SHIFT) 162 | sf.write('./result/single_channel_enhancement.wav', test_wavform / np.max(np.abs(test_wavform)) * 0.8, 16000) 163 | -------------------------------------------------------------------------------- /result/enhacement.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/result/enhacement.wav -------------------------------------------------------------------------------- /result/enhacement_all_channels.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/result/enhacement_all_channels.wav -------------------------------------------------------------------------------- /result/enhacement_snr_select.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/result/enhacement_snr_select.wav -------------------------------------------------------------------------------- /result/speech_clean.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/result/speech_clean.wav -------------------------------------------------------------------------------- /result/speech_noisy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkojimaSLP/Neural-mask-estimation/52512f19d95e2e11fb542415a50219bcb43e4e2e/result/speech_noisy.wav -------------------------------------------------------------------------------- /sp1_list.txt: -------------------------------------------------------------------------------- 1 | ./dataset/adaptation_data/speaker1/251-136532-0000.flac 2 | ./dataset/adaptation_data/speaker1/251-136532-0001.flac 3 | ./dataset/adaptation_data/speaker1/251-136532-0002.flac 4 | ./dataset/adaptation_data/speaker1/251-136532-0003.flac 5 | ./dataset/adaptation_data/speaker1/251-136532-0004.flac -------------------------------------------------------------------------------- /sp2_list.txt: -------------------------------------------------------------------------------- 1 | ./dataset/adaptation_data/speaker2/2412-153954-0002.flac 2 | ./dataset/adaptation_data/speaker2/2412-153954-0003.flac 3 | ./dataset/adaptation_data/speaker2/2412-153954-0004.flac 4 | ./dataset/adaptation_data/speaker2/2412-153954-0005.flac 5 | ./dataset/adaptation_data/speaker2/2412-153954-0006.flac 6 | ./dataset/adaptation_data/speaker2/2412-153954-0007.flac -------------------------------------------------------------------------------- /speaker_aware_mask_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jul 30 11:28:39 2019 4 | 5 | @author: a-kojima 6 | """ 7 | 8 | # -*- coding: utf-8 -*- 9 | """ 10 | Created on Wed Jul 24 19:29:57 2019 11 | 12 | @author: a-kojima 13 | """ 14 | 15 | import numpy as np 16 | import soundfile as sf 17 | import matplotlib.pyplot as pl 18 | from scipy import signal as sg 19 | from numpy.linalg import solve 20 | from scipy.linalg import eig 21 | from scipy.linalg import eigh 22 | 23 | from beamformer import complexGMM_mvdr as cgmm 24 | from beamformer import util 25 | from maskestimator import model, shaper, feature 26 | 27 | 28 | 29 | 30 | def get_stack_speech(speech1, speech2): 31 | return np.concatenate((speech1, speech2)) 32 | 33 | #========================================== 34 | # ANALYSIS PARAMETERS 35 | #========================================== 36 | SAMPLING_FREQUENCY = 16000 37 | FFTL = 1024 38 | SHIFT = 256 39 | 40 | #========================================== 41 | # ESURAL MASL ESTIMATOR PARAMETERS 42 | #========================================== 43 | LEFT_CONTEXT = 0 44 | RIGHT_CONTEXT = 0 45 | NUMBER_OF_SKIP_FRAME = 0 46 | 47 | #========================================== 48 | # ESURAL MASL ESTIMATOR TRAINNING PARAMERTERS 49 | #========================================== 50 | TRUNCATE_GRAD = 7 51 | IS_DEBUG_SHOW_PREDICT_MASK = True 52 | RECURRENT_INIT = 0.00001 53 | SPEECH_PATH = r'./dataset/adaptation_data/speaker1_2/251-137823-0023.flac' 54 | WEIGHT_PATH_ORI = r'./model/194sequence_false_e1.hdf5' #194 55 | WEIGHT_PATH_SP1 = r'./model/speaker_1.hdf5' 56 | WEIGHT_PATH_SP2 = r'./model/speaker_2.hdf5' 57 | NUMBER_OF_STACK = LEFT_CONTEXT + RIGHT_CONTEXT + 1 58 | 59 | #========================================== 60 | # get model 61 | #========================================== 62 | 
mask_estimator_generator1 = model.NeuralMaskEstimation(TRUNCATE_GRAD, 63 | NUMBER_OF_STACK, 64 | 0.1, 65 | FFTL // 2 + 1, 66 | recurrent_init=RECURRENT_INIT) 67 | mask_estimator1 = mask_estimator_generator1.get_model(is_stateful=True, is_show_detail=False, is_adapt=False,) 68 | mask_estimator1 = mask_estimator_generator1.load_weight_param(mask_estimator1, WEIGHT_PATH_SP1) 69 | 70 | mask_estimator_generator2 = model.NeuralMaskEstimation(TRUNCATE_GRAD, 71 | NUMBER_OF_STACK, 72 | 0.1, 73 | FFTL // 2 + 1, 74 | recurrent_init=RECURRENT_INIT) 75 | mask_estimator2 = mask_estimator_generator2.get_model(is_stateful=True, is_show_detail=False, is_adapt=False) 76 | mask_estimator2 = mask_estimator_generator2.load_weight_param(mask_estimator2, WEIGHT_PATH_SP2) 77 | 78 | mask_estimator_generator_ori = model.NeuralMaskEstimation(TRUNCATE_GRAD, 79 | NUMBER_OF_STACK, 80 | 0.1, 81 | FFTL // 2 + 1, 82 | recurrent_init=RECURRENT_INIT) 83 | mask_estimator_ori = mask_estimator_generator_ori.get_model(is_stateful=True, is_show_detail=False, is_adapt=False) 84 | 85 | mask_estimator_ori = mask_estimator_generator_ori.load_weight_param(mask_estimator_ori, WEIGHT_PATH_ORI) 86 | 87 | 88 | #========================================== 89 | # predicting data shaper 90 | #========================================== 91 | data_shaper = shaper.Shape_data(LEFT_CONTEXT, 92 | RIGHT_CONTEXT, 93 | TRUNCATE_GRAD, 94 | NUMBER_OF_SKIP_FRAME ) 95 | 96 | #========================================== 97 | # get features 98 | #========================================== 99 | feature_extractor = feature.Feature(SAMPLING_FREQUENCY, FFTL, SHIFT) 100 | speech = sf.read(SPEECH_PATH)[0] 101 | 102 | 103 | noisy_spectrogram = feature_extractor.get_feature(speech) 104 | noisy_spectrogram = np.flipud(noisy_spectrogram) 105 | noisy_spectrogram = feature_extractor.apply_cmvn(noisy_spectrogram) 106 | 107 | features = data_shaper.convert_for_predict(noisy_spectrogram) 108 | features = np.array(features) 109 | padding_feature, original_batch_size = data_shaper.get_padding_features(features) 110 | 111 | mask_estimator1.reset_states() 112 | sp_mask1, n_mask1 = mask_estimator1.predict_on_batch(padding_feature) 113 | sp_mask1 = sp_mask1[:original_batch_size, :] 114 | n_mask1 = n_mask1[:original_batch_size, :] 115 | 116 | mask_estimator2.reset_states() 117 | sp_mask2, n_mask2 = mask_estimator2.predict_on_batch(padding_feature) 118 | sp_mask2 = sp_mask2[:original_batch_size, :] 119 | n_mask2 = n_mask2[:original_batch_size, :] 120 | 121 | 122 | mask_estimator_ori.reset_states() 123 | sp_mask_ori, n_mask_ori = mask_estimator_ori.predict_on_batch(padding_feature) 124 | sp_mask_ori = sp_mask_ori[:original_batch_size, :] 125 | n_mask_ori = n_mask_ori[:original_batch_size, :] 126 | 127 | 128 | pl.figure(), 129 | pl.subplot(2, 1, 1) 130 | pl.imshow(n_mask1.T, aspect='auto') 131 | pl.title('sp1_original') 132 | pl.subplot(2, 1, 2) 133 | pl.imshow(sp_mask1.T, aspect='auto') 134 | 135 | pl.figure(), 136 | pl.subplot(2, 1, 1) 137 | pl.imshow(n_mask2.T, aspect='auto') 138 | pl.title('sp2_original') 139 | pl.subplot(2, 1, 2) 140 | pl.imshow(sp_mask2.T, aspect='auto') 141 | 142 | 143 | pl.figure(), 144 | pl.subplot(2, 1, 1) 145 | pl.imshow(n_mask_ori.T, aspect='auto') 146 | pl.title('original_model_predict') 147 | pl.subplot(2, 1, 2) 148 | pl.imshow(sp_mask_ori.T, aspect='auto') 149 | 150 | # ==================================== 151 | # subract and calculate final mask 152 | # ==================================== 153 | sub_mask_sp_mask = sp_mask1 - sp_mask2 154 | 
sub_mask_sp_mask[sub_mask_sp_mask <= 0] = 0 155 | 156 | pl.figure(), 157 | pl.subplot(2, 1, 1) 158 | pl.imshow(1 - sub_mask_sp_mask.T, aspect='auto') 159 | pl.title('speaker1-aware mask') 160 | pl.subplot(2, 1, 2) 161 | pl.imshow(sub_mask_sp_mask.T, aspect='auto') 162 | 163 | pl.show() 164 | 165 | -------------------------------------------------------------------------------- /tflog/tflog_loc.txt: -------------------------------------------------------------------------------- 1 | log -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jul 24 19:29:31 2019 4 | 5 | @author: a-kojima 6 | 7 | Neural mask estimation for MVDR 8 | 9 | This script supports on-the-fly data augmentation for efficient training. 10 | 11 | """ 12 | import numpy as np 13 | import glob 14 | from scipy import stats 15 | import random 16 | import soundfile as sf 17 | import matplotlib.pyplot as pl 18 | import sys 19 | 20 | from maskestimator import model, shaper, feature, augment, util 21 | from tensorflow.keras.callbacks import TensorBoard 22 | 23 | 24 | #========================================== 25 | # ANALYSIS PARAMETERS 26 | #========================================== 27 | SAMPLING_FREQUENCY = 16000 28 | FFTL = 1024 29 | SHIFT = 256 30 | 31 | #========================================== 32 | # NEURAL MASK ESTIMATOR PARAMETERS 33 | #========================================== 34 | LEFT_CONTEXT = 0 35 | RIGHT_CONTEXT = 0 36 | NUMBER_OF_SKIP_FRAME = 0 37 | 38 | #========================================== 39 | # NEURAL MASK ESTIMATOR TRAINING PARAMETERS 40 | #========================================== 41 | EPOCHS = 1 42 | NUMBER_OF_UTTERANCE = 1 #15 43 | TRUNCATE_GRAD = 7 44 | LR = 0.001 45 | SPEECH_DIRECTORY = r'./dataset/train/speech/*' 46 | NOISE_DIRECTORY = r'./dataset/train/noise/*' 47 | IS_DEBUG_SHOW_MASK_AND_SYNTHESIS = False 48 | MODEL_NAME_PREFIX = r'./model/neaural_mask_estimator{}.hdf5' 49 | LOSS_NAME_PREFIX = r'./model/neaural_mask_estimator.npy' 50 | LOG_PATH = r'./tflog/' 51 | RECURRENT_INIT = 0.04 52 | VALIDATION_SPEC = r'./validation_features/val_spec.npy' 53 | VALIDATION_SPEECH_MASK = r'./validation_features/noise_mask.npy' 54 | VALIDATION_NOISE_MASK = r'./validation_features/speech_mask.npy' 55 | 56 | 57 | NUMBER_OF_STACK = LEFT_CONTEXT + RIGHT_CONTEXT + 1 58 | 59 | #========================================== 60 | # augmentation parameters 61 | #========================================== 62 | ''' 63 | snr: [SNR20, SNR15, SNR10, SNR5, SNR0] 64 | prob.: [0.2,...
0.2] 65 | ''' 66 | SNR_generator = stats.rv_discrete(values=(np.array([0, 1, 2, 3, 4]), 67 | (0.2, 0.2, 0.2, 0.2, 0.2))) 68 | noise_list = glob.glob(NOISE_DIRECTORY) 69 | RIR_CONVOLVE_CHANCE_RATE = 0.25 # ex., 0.5 means convolution speech with RIR with 50% chanve rate 70 | 71 | #========================================== 72 | # prepare speech and noise file list 73 | #========================================== 74 | speech_list = glob.glob(SPEECH_DIRECTORY) 75 | noise_list = glob.glob(NOISE_DIRECTORY) 76 | 77 | #========================================== 78 | # get model 79 | #========================================== 80 | mask_estimator_generator = model.NeuralMaskEstimation(TRUNCATE_GRAD, 81 | NUMBER_OF_STACK, 82 | LR, 83 | FFTL // 2 + 1, 84 | recurrent_init=RECURRENT_INIT) 85 | mask_estimator = mask_estimator_generator.get_model(is_stateful=False, 86 | is_show_detail=True, 87 | is_adapt=False) 88 | 89 | #========================================== 90 | # training data shaper 91 | #========================================== 92 | data_shaper = shaper.Shape_data(LEFT_CONTEXT, 93 | RIGHT_CONTEXT, 94 | TRUNCATE_GRAD, 95 | NUMBER_OF_SKIP_FRAME ) 96 | 97 | #========================================== 98 | # set tensorboard 99 | #========================================== 100 | callback = TensorBoard(LOG_PATH) 101 | callback.set_model(mask_estimator) 102 | 103 | #========================================== 104 | # get features 105 | #========================================== 106 | feature_extractor = feature.Feature(SAMPLING_FREQUENCY, FFTL, SHIFT) 107 | noise_generator = augment.Generate_random_noise(noise_list, SAMPLING_FREQUENCY) 108 | reverbarent_generator = augment.RIR_convolve(SAMPLING_FREQUENCY) 109 | 110 | #========================================== 111 | # go training 112 | #========================================== 113 | utterance_count = 0 114 | TRIM = np.int(0.05 * SAMPLING_FREQUENCY) # beginning and ending of uttearnce is not used for training 115 | val_loss = np.array([]) 116 | test_loss = np.array([]) 117 | validate_features = np.load(VALIDATION_SPEC) 118 | validate_label_sp = np.load(VALIDATION_SPEECH_MASK) 119 | validate_label_n = np.load(VALIDATION_NOISE_MASK) 120 | freq_grid = np.linspace(0, SAMPLING_FREQUENCY, FFTL)[0:FFTL // 2 + 1] 121 | bin_index = np.argmin(np.abs(freq_grid - 2000)) 122 | 123 | 124 | for i in range(0, EPOCHS): 125 | speech_list_shuffle = random.sample(speech_list, len(speech_list)) 126 | 127 | # go NN parameters optimizer 128 | while True: 129 | feature_stack = [] 130 | label_stack_sp = [] 131 | label_stack_n = [] 132 | if len(speech_list_shuffle) < NUMBER_OF_UTTERANCE: 133 | break 134 | 135 | # dumping frame until searching # of utterances 136 | while True: 137 | # all utterance is used for training 138 | if len(speech_list_shuffle) <= NUMBER_OF_UTTERANCE: 139 | break 140 | 141 | index = np.random.randint(0, len(speech_list_shuffle) - 1, 1)[0] 142 | audio_path = speech_list_shuffle[index] 143 | speech_list_shuffle.pop(index) # remove uterance chosen yet 144 | 145 | 146 | speech = sf.read(audio_path, dtype='float32')[0] 147 | speech = speech[TRIM:-TRIM] 148 | 149 | if len(speech) != 0: 150 | speech = feature_extractor.add_white_noise(speech) 151 | if IS_DEBUG_SHOW_MASK_AND_SYNTHESIS == True: 152 | sf.write('./result/speech_clean.wav', speech , 16000) 153 | SNR_index = SNR_generator.rvs(size=1)[0] 154 | noise = noise_generator.get_noise(len(speech)) 155 | noise = feature_extractor.add_white_noise(noise) 156 | 157 | if RIR_CONVOLVE_CHANCE_RATE != 0: 158 | # 
convolve RIR 159 | if np.random.randint(0, 1 // RIR_CONVOLVE_CHANCE_RATE, 1)[0] == 1: 160 | speech, noise = reverbarent_generator.get_reverbant_speech(speech, noise) 161 | 162 | snr_adjuster = augment.SNR_adjusting(speech, noise) 163 | if SNR_index == 0: 164 | SNR = 20 165 | elif SNR_index == 1: 166 | SNR = 15 167 | elif SNR_index == 2: 168 | SNR = 10 169 | elif SNR_index == 3: 170 | SNR = 5 171 | elif SNR_index == 4: 172 | SNR = 0 173 | speech, noise = snr_adjuster.add_speech_to_noise(SNR) 174 | speech, noise = snr_adjuster.avoid_clipping(speech, noise) 175 | 176 | # if get mask after SNR adjusting 177 | speech_spectrogram = feature_extractor.get_feature(speech) 178 | noise_spectrogram = feature_extractor.get_feature(noise) 179 | freq_grid = np.linspace(0, SAMPLING_FREQUENCY, FFTL)[0:FFTL // 2 + 1] 180 | bin_index = np.argmin(np.abs(freq_grid - 2000)) 181 | speech_mask, noise_mask = feature_extractor.get_ideal_binary_mask_herman(speech_spectrogram, 182 | noise_spectrogram, 183 | threshold_bin=bin_index, 184 | theta_sp_low=10**(-4), 185 | theta_sp_high=10**(-5), 186 | theta_n_low=10**(-5),#-0.01 187 | theta_n_high=10**(-5)) #-0.02 188 | 189 | noisy_spectrogram = (speech_spectrogram + noise_spectrogram) 190 | noisy_spectrogram = (np.flipud(noisy_spectrogram)) 191 | speech_mask = np.flipud(speech_mask) 192 | noise_mask = np.flipud(noise_mask) 193 | 194 | noisy_spectrogram = feature_extractor.apply_cmvn(noisy_spectrogram) 195 | noisy_spectrogram = noisy_spectrogram + np.random.normal(loc=0, scale=0.0001, size=np.shape(noisy_spectrogram)) 196 | features, label_sp, label_n = data_shaper.convert_for_train(noisy_spectrogram, speech_mask, noise_mask) 197 | 198 | if len(features) != 0: 199 | features = np.array(features) 200 | label_sp = np.array(label_sp) 201 | label_n = np.array(label_n) 202 | if IS_DEBUG_SHOW_MASK_AND_SYNTHESIS == True: 203 | sf.write('./result/speech_noisy.wav', speech + noise, 16000) 204 | pl.figure(), 205 | pl.imshow(noise_mask, aspect='auto', extent=[0, np.shape(noise_mask)[1], 0, 8000,]) 206 | pl.title('noise mask') 207 | pl.figure(), 208 | pl.imshow(speech_mask, aspect='auto',extent=[0, np.shape(noise_mask)[1], 0, 8000]) 209 | pl.title('sp mask') 210 | pl.figure() 211 | pl.imshow(noisy_spectrogram, aspect='auto') 212 | pl.show() 213 | sys.exit() 214 | feature_stack.extend(features) 215 | label_stack_sp.extend(label_sp) 216 | label_stack_n.extend(label_n) 217 | utterance_count = utterance_count + 1 218 | if utterance_count == NUMBER_OF_UTTERANCE: 219 | break 220 | 221 | train_features = np.array(feature_stack) 222 | train_label_sp = np.array(label_stack_sp) 223 | train_label_n = np.array(label_stack_n) 224 | 225 | 226 | if np.shape(train_features)[0] != 0: 227 | shuffle_index = random.sample(range(0, np.shape(train_features)[0]), np.shape(train_features)[0]) 228 | history = mask_estimator.train_on_batch(x=train_features[shuffle_index, :, :], 229 | y=[train_label_sp[shuffle_index, :], train_label_n[shuffle_index, :]]) 230 | print('train_loss:', history) 231 | print('epoch:', i) 232 | else: 233 | break 234 | 235 | # reset 236 | feature_stack = [] 237 | label_stack = [] 238 | utterance_count = 0 239 | 240 | # evaluate validation data 241 | val_loss_r = mask_estimator.evaluate(x=validate_features, 242 | y=[validate_label_sp, validate_label_n], 243 | verbose=0) 244 | print('val_loss:', val_loss_r) 245 | util.write_log(callback, ['val_loss', 'speech_loss', 'noise_loss', 'accu_speech', 'accu_noise'], val_loss_r, i) 246 | 247 | val_loss = np.append(val_loss, val_loss_r) 248 | 
np.save(LOSS_NAME_PREFIX, val_loss) 249 | mask_estimator.save_weights(MODEL_NAME_PREFIX.replace('{}', str(i))) 250 | 251 | -------------------------------------------------------------------------------- /validation_features/val_data.txt: -------------------------------------------------------------------------------- 1 | val_data --------------------------------------------------------------------------------
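Note on the SNR augmentation used in train.py: add_speech_to_noise() adjusts each speech/noise pair so that their ratio matches the target SNR drawn from SNR_generator (20/15/10/5/0 dB with equal probability). One common way to do this is to rescale the noise; the snippet below is only a minimal sketch under that assumption, with an illustrative helper name rather than the actual augment.SNR_adjusting API.

import numpy as np

def mix_at_snr(speech, noise, target_snr_db, eps=1e-12):
    # scale the noise so that 10 * log10(P_speech / P_noise) == target_snr_db
    speech_power = np.mean(speech ** 2)
    noise_power = np.mean(noise ** 2) + eps
    gain = np.sqrt(speech_power / (noise_power * 10.0 ** (target_snr_db / 10.0)))
    return speech, noise * gain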