├── spatial_two_mics ├── __init__.py ├── dnn │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── data_conversions.py │ │ ├── update_history.py │ │ ├── experiment_command_line_parser.py │ │ ├── model_logger.py │ │ ├── experiment_command_line_parser_v2.py │ │ ├── fast_dataset_v3.py │ │ ├── dataset.py │ │ └── fast_dataset_v2.py │ ├── evaluation │ │ ├── __init__.py │ │ └── naive_evaluation_numpy.py │ ├── losses │ │ ├── __init__.py │ │ ├── test │ │ │ ├── __init__.py │ │ │ └── test_sanity_of_losses.py │ │ └── affinity_approximation.py │ ├── models │ │ ├── __init__.py │ │ └── simple_LSTM_encoder.py │ ├── modules │ │ ├── __init__.py │ │ ├── prob_estimation_initial_SDR.py │ │ ├── prob_estimation_ground_truth_masks.py │ │ ├── measure_initial_SDR.py │ │ ├── ground_truth_evaluation.py │ │ ├── find_best_model_and_estimate_prob.py │ │ └── model_evaluation.py │ └── experiments │ │ ├── __init__.py │ │ ├── sample_convergence_LSTM.py │ │ ├── simple_LSTM_encoder.py │ │ ├── check_overfitting.py │ │ ├── convergence_check_v2.py │ │ └── run_experiment_v1.py ├── examples │ ├── __init__.py │ └── mixture_example.py ├── utils │ ├── __init__.py │ ├── progress_display.py │ ├── robust_means_clustering.py │ └── audio_mixture_constructor.py ├── data_generator │ ├── __init__.py │ ├── parallel_dataset_creation.py │ ├── dataset_storage.py │ └── source_position_generator.py ├── data_loaders │ ├── __init__.py │ ├── wham_speaker_info.txt │ ├── timit.py │ └── wham.py ├── visualization │ └── __init__.py ├── labels_inference │ ├── __init__.py │ ├── ground_truth.py │ ├── duet_mask_estimation.py │ └── tf_label_estimator.py └── config.py ├── LICENSE └── README.md /spatial_two_mics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/data_generator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/data_loaders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/losses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/losses/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/labels_inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spatial_two_mics/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3" 4 | 5 | BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 6 | TIMIT_PATH = "/mnt/data/Speech/timit-wav" 7 | DATASETS_DIR = "/mnt/nvme/spatial_two_mics_data/" 8 | MODELS_DIR = "/mnt/nvme/spatial_two_mics_models/" 9 | RESULTS_DIR = "/mnt/nvme/spatial_two_mics_results/" 10 | MODELS_RAW_PHASE_DIR = "/mnt/nvme/spatial_two_mics_models_raw_phase/" 11 | MODELS_GROUND_TRUTH = "/mnt/nvme/spatial_two_mics_models_ground_truth/" 12 | FINAL_RESULTS_DIR = "/mnt/nvme/spatial_two_mics_final_eval_results/" 13 | 14 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/data_conversions.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief PyTorch data tensor manipulation functions 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of Illinois at Urbana Champaign 6 | """ 7 | 8 | import torch 9 | 10 | 11 | def one_hot_3Dmasks(index_ys, n_classes): 12 | """!
Convert a 3D tensor of integer class labels to a one-hot 13 | representation with an extra trailing dimension for the 14 | one-hot correspondence 15 | 16 | :param index_ys: mask 3d tensor with integer labels 17 | :param n_classes: integer 18 | :return: input dimensions x n_classes => one-hot correspondence 19 | """ 20 | clustered_ys = index_ys.unsqueeze(-1).long() 21 | 22 | one_hot = torch.cuda.FloatTensor(clustered_ys.size(0), 23 | clustered_ys.size(1), 24 | clustered_ys.size(2), 25 | n_classes).zero_() 26 | 27 | return one_hot.scatter_(3, clustered_ys, 1).cuda() 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Efthymios Tzinis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /spatial_two_mics/data_loaders/wham_speaker_info.txt: -------------------------------------------------------------------------------- 1 | 001 M 2 | 002 F 3 | 00a F 4 | 00b M 5 | 00c M 6 | 00d M 7 | 00f F 8 | 010 M 9 | 011 F 10 | 012 M 11 | 013 M 12 | 014 F 13 | 015 M 14 | 016 F 15 | 017 F 16 | 018 F 17 | 019 F 18 | 01l M 19 | 01a F 20 | 01b F 21 | 01c F 22 | 01d F 23 | 01e M 24 | 01f F 25 | 01g M 26 | 01h F 27 | 01i M 28 | 01j F 29 | 01k F 30 | 01m F 31 | 01n F 32 | 01o F 33 | 01p F 34 | 01q F 35 | 01r M 36 | 01s M 37 | 01t M 38 | 01u F 39 | 01v F 40 | 01w M 41 | 01x F 42 | 01y M 43 | 01z M 44 | 020 M 45 | 021 M 46 | 022 F 47 | 023 F 48 | 024 M 49 | 025 M 50 | 026 M 51 | 027 F 52 | 028 F 53 | 029 M 54 | 02a F 55 | 02b M 56 | 02c F 57 | 02d F 58 | 02e F 59 | 02f F 60 | 050 F 61 | 051 M 62 | 052 M 63 | 053 F 64 | 200 M 65 | 201 M 66 | 202 F 67 | 203 F 68 | 204 F 69 | 205 F 70 | 206 F 71 | 207 M 72 | 208 M 73 | 209 F 74 | 20a F 75 | 20b F 76 | 20c M 77 | 20d F 78 | 20e F 79 | 20f M 80 | 20g M 81 | 20h F 82 | 20i M 83 | 20j M 84 | 20k M 85 | 20l M 86 | 20m M 87 | 20n M 88 | 20o M 89 | 20p F 90 | 20q M 91 | 20r M 92 | 20s M 93 | 20t F 94 | 20u M 95 | 20v M 96 | 22g M 97 | 22h M 98 | 400 M 99 | 401 F 100 | 403 M 101 | 404 F 102 | 405 M 103 | 406 M 104 | 407 F 105 | 408 M 106 | 409 F 107 | 40a M 108 | 40b M 109 | 40c M 110 | 40d F 111 | 40e F 112 | 40f M 113 | 40g F 114 | 40h F 115 | 40i M 116 | 40j M 117 | 40k M 118 | 40l F 119 | 40m F 120 | 40n M 121 | 40o F 122 | 40p F 123 | 420 F 124 | 421 F 125 | 422 M 126 | 423 M 127 | 430 F 128 | 431 M 129 | 432 F 130 | 050 F 131 | 051 M 132 | 052 M 133 | 053 F 134 | 22g M 135 | 22h M 136 | 423 M 137 | 440 M 138 | 441 F 139 | 442 F 140 | 443 M 141 | 444 F 142 | 445 F 143 | 446 M 144 | 447 M -------------------------------------------------------------------------------- /spatial_two_mics/utils/progress_display.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief A general progress bar display wrapper for all functions 3 | applied on a list or an enumerable structure of elements 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | 9 | from progress.bar import ChargingBar 10 | import numpy as np 11 | 12 | 13 | def progress_bar_wrapper(func, 14 | l, 15 | message='Processing...'): 16 | """ 17 | ! 18 | :param l: List of elements 19 | :param func: This function should be applicable to elements of 20 | the list l. E.g. a lambda func is also sufficient. 21 | :param message: A string that you want to be displayed 22 | :return: The result of map(func, l) 23 | """ 24 | 25 | l_copy = l.copy() 26 | n_elements = len(l) 27 | bar = ChargingBar(message, max=n_elements) 28 | 29 | for idx in np.arange(n_elements): 30 | l_copy[idx] = func(l[idx]) 31 | bar.next() 32 | 33 | bar.finish() 34 | return l_copy 35 | 36 | 37 | def test(): 38 | import pytest 39 | 40 | M = int(10e7) 41 | size = int(10e4) 42 | l = np.random.uniform(low=-M, high=M, size=size) 43 | funcs = { 44 | 'const_mul': lambda x: x*2, 45 | 'power_2': lambda x: x**2, 46 | 'subtraction': lambda x: x-x/2.
47 | } 48 | 49 | for name, func in funcs.items(): 50 | map_result = list(map(func, l)) 51 | wrapper_result = progress_bar_wrapper(func, l, message=name) 52 | assert all(map_result == wrapper_result), 'Progress wrapper ' \ 53 | 'should provide the ' \ 54 | 'same result as map' 55 | 56 | if __name__ == "__main__": 57 | test() -------------------------------------------------------------------------------- /spatial_two_mics/dnn/losses/test/test_sanity_of_losses.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Testing the sanity of the losses compared to naive 3 | implementations 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | import sys 9 | import numpy as np 10 | import torch 11 | from pprint import pprint 12 | sys.path.append('../') 13 | import affinity_approximation as losses 14 | 15 | 16 | def numpy_naive(vs, ys): 17 | frobenius_np = np.mean( 18 | np.array([np.linalg.norm(vs[b].dot(vs[b].T) - 19 | ys[b].dot(ys[b].T))**2 20 | for b in np.arange(vs.shape[0])])) 21 | return frobenius_np 22 | 23 | 24 | if __name__ == "__main__": 25 | batch_size = 1 26 | num_tfs = 100 27 | embedding_depth = 10 28 | n_sources = 2 29 | vs_np = np.random.rand(batch_size, num_tfs, embedding_depth) 30 | ys_np = np.abs(np.random.rand(batch_size, num_tfs, n_sources)) 31 | vs = torch.from_numpy(vs_np) 32 | ys = torch.from_numpy(ys_np) 33 | 34 | np_frobenius = numpy_naive(vs_np, ys_np) 35 | naive_torch_frobenius = losses.frobenius_naive(vs, ys).data.numpy() 36 | # 37 | print("Numpy Frobenius: {}".format(np_frobenius)) 38 | print("Naive Torch Frobenius: {}".format(naive_torch_frobenius)) 39 | 40 | assert np.abs(np_frobenius - 41 | naive_torch_frobenius) < 10e-5, 'Naive ' \ 42 | 'implementations of Frobenius norm should be equal' 43 | 44 | 45 | 46 | efficient_frobenius = losses.efficient_frobenius(vs, ys) 47 | print("Efficient Frobenius: {}".format(efficient_frobenius)) 48 | 49 | # assert np.abs(np_frobenius - 50 | # efficient_frobenius) < 10e-5, 'Efficient == Naive ' 51 | 52 | paris_wtf = losses.naive(vs, ys) 53 | print("Paris wtf: {}".format(paris_wtf)) 54 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/models/simple_LSTM_encoder.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Simple LSTM encoder for embedding the input using a simple 3 | LSTM architecture 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | class BLSTMEncoder(nn.Module): 13 | def __init__(self, 14 | n_timesteps=250, 15 | n_features=257, 16 | num_layers=1, 17 | hidden_size=None, 18 | dropout=0.0, 19 | embedding_depth=None, 20 | bidirectional=True): 21 | super(BLSTMEncoder, self).__init__() 22 | 23 | if n_timesteps is None or n_features is None: 24 | raise ValueError("You have to define both the number of " 25 | "timesteps in each sequence and the " 26 | "number of features for each timestep.") 27 | else: 28 | self.emb_dim = n_features * embedding_depth 29 | 30 | self.embedding_depth = embedding_depth 31 | self.hidden_size = hidden_size 32 | self.n_timesteps = n_timesteps 33 | if bidirectional: 34 | self.n_directions = 2 35 | else: 36 | self.n_directions = 1 37 | # assert len(self.hidden_sizes) == num_layers, 'Each layer ' \ 38 | # 'should be defined by a corresponding hidden size.'
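# Shape walkthrough (a sketch of the tensor flow below): x: (batch, n_timesteps, n_features) -> rnn_out: (batch, n_timesteps, n_directions * hidden_size) -> affine + sigmoid: (batch, n_timesteps, n_features * embedding_depth) -> final view: (batch, n_timesteps * n_features, embedding_depth), i.e. one embedding vector per TF bin.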
39 | self.rnn = nn.LSTM(input_size=n_features, 40 | num_layers=num_layers, 41 | hidden_size=self.hidden_size, 42 | bidirectional=bidirectional, 43 | dropout=dropout, 44 | batch_first=True) 45 | self.affine = nn.Linear(self.n_directions*self.hidden_size, 46 | self.emb_dim) 47 | 48 | def forward(self, x): 49 | rnn_out, (hidden, states) = self.rnn(x) 50 | nonl_embedding = torch.sigmoid(self.affine(rnn_out)) 51 | v = nonl_embedding.contiguous().view(x.size(0), 52 | -1, 53 | self.embedding_depth) 54 | # return nn.functional.normalize(v, dim=-1, p=2) 55 | return v 56 | 57 | if __name__ == "__main__": 58 | 59 | model = BLSTMEncoder(hidden_size=10, embedding_depth=10)  # the None defaults must be overridden; illustrative smoke-test values 60 | -------------------------------------------------------------------------------- /spatial_two_mics/examples/mixture_example.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief A simple example of how a compact mixture should look 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of Illinois at Urbana Champaign 6 | """ 7 | 8 | import os 9 | import sys 10 | root_dir = os.path.join( 11 | os.path.dirname(os.path.realpath(__file__)), 12 | '../../') 13 | sys.path.insert(0, root_dir) 14 | 15 | from spatial_two_mics.config import TIMIT_PATH 16 | import numpy as np 17 | 18 | 19 | def mixture_info_example(): 20 | ex = {'positions': 21 | {'amplitudes': np.array([0.73382382, 22 | 0.26617618]), 23 | 'd_thetas': np.array([1.06829948]), 24 | 'distances': {'m1m1': 0.0, 25 | 'm1m2': 0.03, 26 | 'm1s1': 3.015, 27 | 'm1s2': 3.0072529608785676, 28 | 'm2m1': 0.03, 29 | 'm2m2': 0.0, 30 | 'm2s1': 2.985, 31 | 'm2s2': 2.9928046426867034, 32 | 's1m1': 3.015, 33 | 's1m2': 2.985, 34 | 's1s1': 0.0, 35 | 's1s2': 3.054656422155759, 36 | 's2m1': 3.0072529608785676, 37 | 's2m2': 2.9928046426867034, 38 | 's2s1': 3.054656422155759, 39 | 's2s2': 0.0}, 40 | 'taus': np.array([1.39941691, 0.67397403]), 41 | 'thetas': np.array([0., 1.06829948]), 42 | 'xy_positons': np.array([[3., 0.], 43 | [1.44484569, 2.62914833]])}, 44 | 'sources_ids': [{'gender': 'f', 45 | 'sentence_id': 'sa1', 46 | 'speaker_id': 'flbw0', 47 | 'wav_path': os.path.join(TIMIT_PATH, 48 | 'test/dr4/flbw0/sa1.wav')}, 49 | {'gender': 'm', 50 | 'sentence_id': 'sa2', 51 | 'speaker_id': 'mbns0', 52 | 'wav_path': os.path.join(TIMIT_PATH, 53 | 'test/dr4/mbns0/sa2.wav')} 54 | ]} 55 | 56 | return ex 57 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/update_history.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief History and callback update functions 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of Illinois at Urbana Champaign 6 | """ 7 | 8 | def values_update(list_of_pairs, 9 | history_dic, 10 | update_mode='batch'): 11 | """! Update the history dictionary for each key, value pair 12 | INPLACE and stores values for batch and epoch 13 | :param update_mode: In batch mode the values of the specific key 14 | would be summed and in epoch mode would be averaged throughout 15 | the batches. 16 | :param list_of_pairs: list of tuples e.g.
[('loss', 0.9987), ...,] 17 | :param history_dic: a dictionary in which we keep track of 18 | a metric across all epochs 19 | :return: history_dic updated with all the appropriate values for 20 | batch and epoch 21 | """ 22 | if update_mode == 'batch': 23 | for k, v in list_of_pairs: 24 | if not k+"_batch_total" in history_dic: 25 | history_dic[k] = [] 26 | history_dic[k+"_batch_total"] = v 27 | history_dic[k + '_batch_counter'] = 1 28 | else: 29 | history_dic[k + "_batch_total"] += v 30 | history_dic[k+'_batch_counter'] += 1 31 | elif update_mode == 'epoch': 32 | for k, v in list_of_pairs: 33 | history_dic[k].append(history_dic[k + "_batch_total"] / 34 | history_dic[k + '_batch_counter']) 35 | history_dic[k + "_batch_total"] = 0. 36 | history_dic[k + '_batch_counter'] = 0 37 | else: 38 | raise NotImplementedError('Please use an update mode of epoch ' 39 | 'or batch') 40 | 41 | return history_dic 42 | 43 | 44 | def update_best_performance(performance_dic, 45 | epoch, 46 | history_dic, 47 | buffer_size=0): 48 | """! Update the history dictionary for the best performance so far 49 | INPLACE and stores them in a list which has length equal to the 50 | predefined buffer size 51 | :return: history_dic updated with all the appropriate values for 52 | the best performance so far 53 | """ 54 | if 'best_performances' not in history_dic: 55 | history_dic['best_performances'] = [(performance_dic, epoch)] 56 | else: 57 | history_dic['best_performances'].append((performance_dic, 58 | epoch)) 59 | history_dic['best_performances'] = \ 60 | sorted(history_dic['best_performances'], 61 | key=lambda x: x[0]['sdr'])[::-1][:buffer_size] 62 | 63 | return history_dic 64 | -------------------------------------------------------------------------------- /spatial_two_mics/labels_inference/ground_truth.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Inferring the masking for each tf bin independently based on the 3 | maximum energy of the sources in each bin 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | 9 | import numpy as np 10 | from pprint import pprint 11 | 12 | 13 | def infer_mask(mixture_info): 14 | """ 15 | :param mixture_info: 16 | mixture_info = { 17 | 'm1_raw': numpy array containing the raw m1 signal, 18 | 'm2_raw': numpy array containing the raw m2 signal, 19 | 'm1_tf': numpy array containing the m1 TF representation, 20 | 'm2_tf': numpy array containing the m2 TF representation, 21 | 'sources_raw': a list of numpy 1d vectors containing the 22 | sources, 23 | 'sources_tf': a list of numpy 2d vectors containing the 24 | TF representations of the sources 25 | 'amplitudes': the weights that each source contributes to 26 | the mixture of the second microphone 27 | } 28 | 29 | :return: A tf 2d matrix corresponding to the dominating source 30 | for each TF bin [0,1,...,n_sources] 31 | """ 32 | sources_complex_spectra = mixture_info['sources_tf'] 33 | amplitudes = mixture_info['amplitudes'] 34 | n_sources = len(sources_complex_spectra) 35 | 36 | assert len(amplitudes) == n_sources, "Length of weights: {} " \ 37 | "should be equal to the " \ 38 | "number of sources: {}" \ 39 | "".format(len(amplitudes), 40 | n_sources) 41 | 42 | same_dimensions = [(sources_complex_spectra[i].shape == 43 | sources_complex_spectra[0].shape) 44 | for i in np.arange(len(sources_complex_spectra))] 45 | 46 | assert all(same_dimensions), "All arrays should have the same " \ 47 | "dimensions.
However, got sizes of {}"\ 48 | "".format([x.shape for x in 49 | sources_complex_spectra]) 50 | 51 | sources_complex_spectra = [amplitudes[i] * sources_complex_spectra[i] 52 | for i in np.arange(n_sources)] 53 | 54 | tf_real_sources = [np.abs(tf_complex) 55 | for tf_complex in sources_complex_spectra] 56 | 57 | mixture_tensor = np.dstack(tf_real_sources) 58 | dominating_source = np.argmax(mixture_tensor, axis=2) 59 | 60 | zipped_tf_labels = dominating_source.astype(np.uint8) 61 | 62 | assert np.array_equal(dominating_source, zipped_tf_labels), \ 63 | "Zipping the numpy matrix should not yield different labels" 64 | 65 | return zipped_tf_labels 66 | 67 | 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # unsupervised_spatial_dc 2 | Code for the paper: "Unsupervised Deep Clustering for Source Separation: Direct Learning from Mixtures using Spatial Information" 3 | 4 | > Please cite as: 5 | ``` 6 | @INPROCEEDINGS{8683201, 7 | author={E. {Tzinis} and S. {Venkataramani} and P. {Smaragdis}}, 8 | booktitle={ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 9 | title={Unsupervised Deep Clustering for Source Separation: Direct Learning from Mixtures Using Spatial Information}, 10 | year={2019}, 11 | volume={}, 12 | number={}, 13 | pages={81-85}, 14 | keywords={pattern clustering;source separation;unsupervised learning;training process;ground truth separation information;direct learning;spatial information;monophonic source separation system;multichannel mixtures;unsupervised deep clustering approach;sound separation performance;multichannel recordings;Deep clustering;source separation;unsupervised learning}, 15 | doi={10.1109/ICASSP.2019.8683201}, 16 | ISSN={}, 17 | month={May},} 18 | ``` 19 | 20 | ## Disclaimer 21 | University of Illinois Open Source License 22 | 23 | Copyright © 2018, University of Illinois at Urbana Champaign. All rights reserved. 24 | 25 | Developed by: Efthymios Tzinis 1, Shrikant Venkataramani 1, Paris Smaragdis 1,2 26 | 27 | 1: University of Illinois at Urbana-Champaign, 28 | 2: Adobe Research 29 | 30 | This work was supported by NSF grant 1453104. 31 | Paper link: https://doi.org/10.1109/ICASSP.2019.8683201 32 | 33 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. Neither the names of Computational Audio Group, University of Illinois at Urbana-Champaign, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. 
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 34 | -------------------------------------------------------------------------------- /spatial_two_mics/labels_inference/duet_mask_estimation.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Inferring the masking for each tf bin based on DUET features, 3 | mainly phase difference, followed by a robust K-means estimation 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | 9 | import numpy as np 10 | import sys 11 | root_dir = '../../' 12 | sys.path.insert(0, root_dir) 13 | from spatial_two_mics.utils import robust_means_clustering as \ 14 | robust_kmeans 15 | 16 | 17 | def infer_mask(mixture_info, 18 | return_phase_features=False): 19 | """ 20 | :param mixture_info: 21 | mixture_info = { 22 | 'm1_raw': numpy array containing the raw m1 signal, 23 | 'm2_raw': numpy array containing the raw m2 signal, 24 | 'm1_tf': numpy array containing the m1 TF representation, 25 | 'm2_tf': numpy array containing the m2 TF representation, 26 | 'sources_raw': a list of numpy 1d vectors containing the 27 | sources, 28 | 'sources_tf': a list of numpy 2d vectors containing the 29 | TF representations of the sources, 30 | 'amplitudes': the weights that each source contributes to 31 | the mixture of the second microphone 32 | } 33 | 34 | :return: A tf 2d matrix corresponding to the dominating source 35 | for each TF bin [0,1,...,n_sources] 36 | """ 37 | sources_complex_spectra = mixture_info['sources_tf'] 38 | amplitudes = mixture_info['amplitudes'] 39 | n_sources = len(sources_complex_spectra) 40 | 41 | assert len(amplitudes) == n_sources, "Length of weights: {} " \ 42 | "should be equal to the " \ 43 | "number of sources: {}" \ 44 | "".format(len(amplitudes), 45 | n_sources) 46 | 47 | same_dimensions = [(sources_complex_spectra[i].shape == 48 | sources_complex_spectra[0].shape) 49 | for i in np.arange(len(sources_complex_spectra))] 50 | 51 | assert all(same_dimensions), "All arrays should have the same " \ 52 | "dimensions. However, got sizes of {}"\ 53 | "".format([x.shape for x in 54 | sources_complex_spectra]) 55 | 56 | r = mixture_info['m1_tf'] / (mixture_info['m2_tf'] + 1e-7) 57 | phase_dif = np.angle(r) / np.linspace(1e-5, np.pi, 58 | mixture_info['m1_tf'].shape[0])[:, None] 59 | 60 | d_feature = np.reshape(phase_dif, (np.product(phase_dif.shape), 1)) 61 | r_kmeans = robust_kmeans.RobustKmeans(n_true_clusters=n_sources, 62 | n_used_clusters=n_sources+3) 63 | d_labels = r_kmeans.fit(d_feature, cut_outlier_in_norm=2.)
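# d_labels holds one integer cluster label per flattened TF bin; below it is reshaped back onto the spectrogram grid and cast to uint8.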
64 | d_feature_mask = np.reshape(d_labels, phase_dif.shape) 65 | 66 | zipped_tf_labels = d_feature_mask.astype(np.uint8) 67 | 68 | assert np.array_equal(d_feature_mask, zipped_tf_labels), \ 69 | "Zipping the numpy matrix should not yield different labels" 70 | 71 | if return_phase_features: 72 | return zipped_tf_labels, phase_dif 73 | 74 | return zipped_tf_labels 75 | 76 | 77 | -------------------------------------------------------------------------------- /spatial_two_mics/data_loaders/timit.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Dataloader for the TIMIT dataset that stores the whole 3 | dataset in an internal python dictionary structure. 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | 9 | import os 10 | import sys 11 | import scipy.io.wavfile as wavfile 12 | import glob2 13 | import numpy as np 14 | 15 | root_dir = os.path.join( 16 | os.path.dirname(os.path.realpath(__file__)), 17 | '../../') 18 | sys.path.insert(0, root_dir) 19 | 20 | from spatial_two_mics.config import TIMIT_PATH 21 | 22 | 23 | class TimitLoader(object): 24 | def __init__(self, 25 | normalize_audio_by_std=True): 26 | self.dataset_path = TIMIT_PATH 27 | self.normalize_audio_by_std = normalize_audio_by_std 28 | 29 | def get_all_wavs(self, path): 30 | data_dic = {} 31 | print("Searching inside: {}...".format(path)) 32 | dialects = os.listdir(path) 33 | for dial in dialects: 34 | if dial.startswith('.'): 35 | continue 36 | d_path = os.path.join(path, dial) 37 | speakers = os.listdir(os.path.join(d_path)) 38 | for speaker in speakers: 39 | if speaker.startswith('.'): 40 | continue 41 | speaker_path = os.path.join(d_path, speaker) 42 | wavs_paths = glob2.glob(os.path.join(speaker_path, 43 | '*.wav')) 44 | 45 | speaker_wavs = [list(wavfile.read(wav_p)) + [wav_p] 46 | for wav_p in wavs_paths] 47 | 48 | if self.normalize_audio_by_std: 49 | speaker_wavs = [(sr, wav / np.std(wav), wav_p) 50 | for (sr, wav, wav_p) in speaker_wavs] 51 | 52 | speaker_wavs = [(wav_p.split('/')[-1].split('.wav')[0], 53 | {'wav': wav, 'sr': sr, 'path': wav_p}) 54 | for (sr, wav, wav_p) in speaker_wavs] 55 | 56 | speaker_gender = speaker[0] 57 | data_dic[speaker] = { 58 | 'dialect': dial, 59 | 'gender': speaker_gender, 60 | 'sentences': dict(speaker_wavs) 61 | } 62 | 63 | return data_dic 64 | 65 | def load(self): 66 | """ 67 | Loading all the data inside a dictionary like the one below: 68 | 69 | { 70 | 'train': 71 | 'speaker_id_i': { 72 | 'dialect': which dialect the speaker belongs to, 73 | 'gender': f or m, 74 | 'sentences': { 75 | 'sentence_id_j': { 76 | 'wav': wav_on_a_numpy_matrix, 77 | 'sr': Fs in Hz integer, 78 | 'path': Path of the located wav 79 | } 80 | } 81 | } 82 | 83 | * the same applies for test speakers 84 | } 85 | 86 | :return: Dictionary 87 | """ 88 | data_dic = {'train': {}, 89 | 'test': {} 90 | } 91 | 92 | for chunk in data_dic: 93 | wavs_path = os.path.join(self.dataset_path, chunk) 94 | all_wavs_dic = self.get_all_wavs(wavs_path) 95 | data_dic[chunk] = all_wavs_dic 96 | 97 | return data_dic 98 | 99 | 100 | if __name__ == "__main__": 101 | print("Loading TIMIT Dataset from {}...".format(TIMIT_PATH)) 102 | timit_loader = TimitLoader() 103 | timit_data = timit_loader.load() -------------------------------------------------------------------------------- /spatial_two_mics/dnn/evaluation/naive_evaluation_numpy.py:
-------------------------------------------------------------------------------- 1 | """! 2 | @brief A naive implementation of how we evaluate the masks that are 3 | derived --> reconstruct the source signals and also extract SDR, 4 | SIR and SAR for the reconstructed signals and the true signals 5 | 6 | @author Efthymios Tzinis {etzinis2@illinois.edu} 7 | @copyright University of Illinois at Urbana Champaign 8 | """ 9 | 10 | import numpy as np 11 | import librosa 12 | 13 | 14 | def bss_eval(sep, i, sources): 15 | # Current target 16 | min_len = min([len(sep), len(sources[i])]) 17 | sources = sources[:, :min_len] 18 | sep = sep[:min_len] 19 | target = sources[i] 20 | 21 | # Target contribution 22 | s_target = target * np.dot(target, sep.T) / np.dot(target, target.T) 23 | 24 | # Interference contribution 25 | pse = np.dot(np.dot(sources, sep.T), 26 | np.linalg.inv(np.dot(sources, sources.T))).T.dot(sources) 27 | e_interf = pse - s_target 28 | 29 | # Artifact contribution 30 | e_artif = sep - pse 31 | 32 | # Interference + artifacts contribution 33 | e_total = e_interf + e_artif 34 | 35 | # Computation of the log energy ratios 36 | sdr = 10*np.log10(sum(s_target**2) / sum(e_total**2)) 37 | sir = 10*np.log10(sum(s_target**2) / sum(e_interf**2)) 38 | sar = 10*np.log10(sum((s_target + e_interf)**2) / sum(e_artif**2)) 39 | 40 | # Done! 41 | return sdr, sir, sar 42 | 43 | 44 | def naive_cpu_bss_eval(embedding_labels, 45 | mix_real_tf, 46 | mix_imag_tf, 47 | sources_raw, 48 | n_sources, 49 | batch_index=0): 50 | 51 | mix_stft = mix_real_tf + 1j*mix_imag_tf 52 | 53 | if mix_stft.shape == embedding_labels.shape: 54 | embedding_clustered = embedding_labels 55 | else: 56 | embedding_clustered = embedding_labels.reshape( 57 | mix_stft.shape[::-1]).T 58 | 59 | sdr_t, sir_t, sar_t = 0., 0., 0. 60 | for i in np.arange(n_sources): 61 | embed_mask = mix_stft*(embedding_clustered == i) 62 | reconstructed = librosa.core.istft(embed_mask, 63 | hop_length=128, 64 | win_length=512) 65 | bss_results = [bss_eval(reconstructed, j, sources_raw) 66 | for j in np.arange(n_sources)] 67 | 68 | sdr, sir, sar = sorted(bss_results, key=lambda x: x[0])[-1] 69 | sdr_t += sdr 70 | sir_t += sir 71 | sar_t += sar 72 | 73 | # save_p = '/home/thymios/wavs/' 74 | # wav_p = os.path.join(save_p, 75 | # 'batch_{}_source_{}'.format( 76 | # batch_index + 1, i + 1)) 77 | # librosa.output.write_wav(wav_p, reconstructed, 16000) 78 | 79 | return sdr_t/n_sources, sir_t/n_sources, sar_t/n_sources 80 | 81 | 82 | def mixture_bss_eval(mix_real_tf, 83 | mix_imag_tf, 84 | sources_raw, 85 | n_sources): 86 | 87 | mix_stft = mix_real_tf + 1j*mix_imag_tf 88 | 89 | reconstructed = librosa.core.istft(mix_stft, 90 | hop_length=128, 91 | win_length=512) 92 | bss_results = [bss_eval(reconstructed, j, sources_raw) 93 | for j in np.arange(n_sources)] 94 | 95 | (sdrs, sirs, sars) = (np.array([x[0] for x in bss_results]), 96 | np.array([x[1] for x in bss_results]), 97 | np.array([x[2] for x in bss_results])) 98 | 99 | return np.mean(sdrs), np.mean(sirs), np.mean(sars) 100 | -------------------------------------------------------------------------------- /spatial_two_mics/utils/robust_means_clustering.py: -------------------------------------------------------------------------------- 1 | """!
2 | @brief Robust K-means clustering that absorbs outlier clusters 3 | into the most probable true clusters 4 | 5 | 6 | @author Efthymios Tzinis {etzinis2@illinois.edu} 7 | @copyright University of Illinois at Urbana Champaign 8 | """ 9 | 10 | from pprint import pprint 11 | from sklearn.cluster import KMeans 12 | import numpy as np 13 | 14 | 15 | class RobustKmeans(object): 16 | def __init__(self, 17 | n_true_clusters=2, 18 | n_used_clusters=4): 19 | """! 20 | Sometimes K-means creates clusters around outlier groups which 21 | should not be the case. For this reason we run K-means with 22 | n_used_clusters > n_true_clusters and then we assign the 23 | residual clusters to the most probable n_true_clusters 24 | 25 | :param n_true_clusters: the true number of clusters we want 26 | to cluster the data into at the end 27 | :param n_used_clusters: The amount of clusters that will be used 28 | in total for running kmeans and after that the residual would be 29 | assigned to the n_true_clusters with the highest priors 30 | """ 31 | 32 | self.N_true = n_true_clusters 33 | self.N_used = n_used_clusters 34 | self.kmeans_obj = KMeans(n_clusters=self.N_used, 35 | random_state=7) 36 | 37 | def fit(self, x, cut_outlier_in_norm=2.): 38 | """! 39 | robust clustering for the input x 40 | 41 | :param x: nd array with shape: (n_samples, n_features) 42 | 43 | :return cluster_labels: 1d array with the corresponding 44 | labels from 0 to self.N_true - 1 45 | """ 46 | 47 | if cut_outlier_in_norm is not None: 48 | robust_points = x[np.where(np.linalg.norm(x, axis=1) <= 49 | cut_outlier_in_norm), :][0] 50 | 51 | fitted_centers = self.kmeans_obj.fit(robust_points) 52 | clustered = self.kmeans_obj.predict(x) 53 | else: 54 | fitted_centers = self.kmeans_obj.fit(x) 55 | clustered = fitted_centers.labels_ 56 | 57 | cluster_coordinates = fitted_centers.cluster_centers_ 58 | 59 | priors = np.bincount(clustered) 60 | cl_indexes = np.argsort(priors) 61 | true_clusters = cl_indexes[self.N_used - self.N_true:] 62 | 63 | fitted_centers.cluster_centers_ = cluster_coordinates[ 64 | true_clusters] 65 | 66 | # make the new prediction with the new clusters 67 | robust_estimation = fitted_centers.predict(x) 68 | 69 | return robust_estimation 70 | 71 | def fit_predict(self, x, cut_outlier_in_norm=2.): 72 | """! 73 | robust clustering for the input x 74 | 75 | :param x: nd array with shape: (n_samples, n_features) 76 | 77 | :return cluster_labels: 1d array with the corresponding 78 | labels from 0 to self.N_true - 1 79 | """ 80 | return self.fit(x, cut_outlier_in_norm=cut_outlier_in_norm) 81 | 82 | 83 | def example_of_usage(): 84 | """! 85 | How the robust clusterer should be called""" 86 | 87 | from sklearn.datasets import load_iris 88 | data = load_iris() 89 | x = data.data 90 | y = data.target 91 | x /= np.linalg.norm(x) 92 | 93 | robust_clusterer = RobustKmeans(n_true_clusters=3, 94 | n_used_clusters=3) 95 | pred = robust_clusterer.fit(x) 96 | print("Using 3 True Clusters and 3 for Prediction: {}".format(pred)) 97 | 98 | robust_clusterer = RobustKmeans(n_true_clusters=3, 99 | n_used_clusters=5) 100 | pred = robust_clusterer.fit(x) 101 | print("Using 3 True Clusters and 5 for Prediction: {}".format(pred)) 102 | 103 | if __name__ == "__main__": 104 | example_of_usage() -------------------------------------------------------------------------------- /spatial_two_mics/data_generator/parallel_dataset_creation.py: -------------------------------------------------------------------------------- 1 | """!
2 | @brief Create datasets for the experiments by individually assigning 3 | them as jobs to different processors 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | 9 | import argparse 10 | import os 11 | import sys 12 | import itertools 13 | import copy 14 | from pprint import pprint 15 | root_dir = os.path.join( 16 | os.path.dirname(os.path.realpath(__file__)), 17 | '../../') 18 | sys.path.insert(0, root_dir) 19 | from joblib import Parallel, delayed 20 | import spatial_two_mics.data_generator.data_creator_and_storage_v2 as\ 21 | dataset_generator 22 | 23 | 24 | def generate_one_dataset_wrapper(this_dataset_args): 25 | dataset_generator.generate_dataset(this_dataset_args) 26 | return 1 27 | 28 | 29 | def generate_datasets(args): 30 | genders = list(map(list, args.genders)) 31 | n_sources = args.n_sources 32 | 33 | dataset_combinations = list(itertools.product(*[genders, 34 | n_sources])) 35 | 36 | specific_args = [] 37 | for (gndrs, sources) in dataset_combinations: 38 | this_args = copy.deepcopy(args) 39 | this_args.n_sources = sources 40 | this_args.genders = gndrs 41 | del this_args.n_jobs 42 | specific_args.append(this_args) 43 | 44 | pprint(specific_args) 45 | 46 | created_datasets = Parallel(n_jobs=args.n_jobs)( 47 | [delayed(generate_one_dataset_wrapper)(this_args) 48 | for this_args in specific_args]) 49 | 50 | print("Successfully created: {} datasets".format( 51 | sum(created_datasets))) 52 | 53 | return True 54 | 55 | 56 | def get_args(): 57 | """! Command line parser """ 58 | parser = argparse.ArgumentParser(description='Parallel Mixture ' 59 | 'datasets creator') 60 | parser.add_argument("--dataset", type=str, 61 | help="Dataset name", default="timit") 62 | parser.add_argument("--n_sources", type=int, nargs='+', 63 | help="How many sources in each mix", default=2) 64 | parser.add_argument("--n_samples", type=int, nargs='+', 65 | help="How many samples do you want to be " 66 | "created", 67 | default=[1, 1, 1]) 68 | parser.add_argument("--genders", type=str, nargs='+', 69 | help="Genders that will correspond to the " 70 | "genders in the mixtures", 71 | default=['m'], choices=['m', 'f', 'fm', 'mf']) 72 | parser.add_argument("-o", "--output_path", type=str, 73 | help="""The path that the resulting dataset 74 | would be stored. If the folder does not 75 | exist it will be created as well as its 76 | child folders train or test and val if it is 77 | selected""", 78 | required=True) 79 | parser.add_argument("-f", "--force_delays", nargs='+', type=int, 80 | help="""Whether you want to force integer 81 | delays of +- 1 in the sources e.g.""", 82 | default=None) 83 | parser.add_argument('--val_set', action="store_true", 84 | help='Force to create a separate val folder ' 85 | 'with the same amount of the mixtures as ' 86 | 'the initial test/train folder but using ' 87 | 'half of the available speakers') 88 | parser.add_argument("--n_jobs", type=int, 89 | help="Number of parallel spawning jobs", 90 | default=1) 91 | return parser.parse_args() 92 | 93 | 94 | if __name__ == "__main__": 95 | args = get_args() 96 | generate_datasets(args) -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/experiment_command_line_parser.py: -------------------------------------------------------------------------------- 1 | """!
2 | @brief Command line parser for experiments 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of Illinois at Urbana Champaign 6 | """ 7 | 8 | import argparse 9 | 10 | def get_args(): 11 | """! Command line parser for experiments""" 12 | parser = argparse.ArgumentParser(description='Deep Clustering for ' 13 | 'Audio Source ' 14 | 'Separation ' 15 | 'Experiment') 16 | parser.add_argument("--dataset", type=str, 17 | help="Dataset name", 18 | default="timit") 19 | parser.add_argument("--n_sources", type=int, 20 | help="How many sources in each mix", 21 | default=2) 22 | parser.add_argument("--n_samples", type=int, nargs='+', 23 | help="How many samples do you want to be " 24 | "created for train test val", 25 | default=[256, 64, 128]) 26 | parser.add_argument("--genders", type=str, nargs='+', 27 | help="Genders that will correspond to the " 28 | "genders in the mixtures", 29 | default=['m']) 30 | parser.add_argument("-f", "--force_delays", nargs='+', type=int, 31 | help="""Whether you want to force integer 32 | delays of +- 1 in the sources e.g.""", 33 | default=[-1, 1]) 34 | parser.add_argument("-nl", "--n_layers", type=int, 35 | help="""The number of layers of the LSTM 36 | encoder""", default=2) 37 | parser.add_argument("-ed", "--embedding_depth", type=int, 38 | help="""The depth of the embedding""", 39 | default=10) 40 | parser.add_argument("-hs", "--hidden_size", type=int, 41 | help="""The size of the LSTM cells """, 42 | default=10) 43 | parser.add_argument("-bs", "--batch_size", type=int, 44 | help="""The number of samples in each batch""", 45 | default=64) 46 | parser.add_argument("-name", "--experiment_name", type=str, 47 | help="""The name or identifier of this 48 | experiment""", 49 | default='A sample experiment') 50 | parser.add_argument("-mt", "--labels_mask", type=str, 51 | help="""The type of masks that you want to 52 | use -- 'ground_truth' or 'duet'""", 53 | default='duet') 54 | parser.add_argument("-cad", "--cuda_available_devices", type=int, 55 | nargs="+", 56 | help="""A list of Cuda IDs that would be 57 | available for running this experiment""", 58 | default=[0]) 59 | parser.add_argument("--num_workers", type=int, 60 | help="""The number of cpu workers for 61 | loading the data, etc.""", default=3) 62 | parser.add_argument("--epochs", type=int, 63 | help="""The number of epochs that the 64 | experiment should run""", default=50) 65 | parser.add_argument("--evaluate_per", type=int, 66 | help="""The number of training epochs in 67 | order to run an evaluation""", default=5) 68 | parser.add_argument("--n_eval", type=int, 69 | help="""Reduce the number of evaluation 70 | samples to this number.""", default=256) 71 | parser.add_argument("-lr", "--learning_rate", type=float, 72 | help="""Initial Learning rate""", default=1e-3) 73 | parser.add_argument("--bidirectional", action='store_true', 74 | help="""Bidirectional or not""") 75 | 76 | return parser.parse_args() -------------------------------------------------------------------------------- /spatial_two_mics/dnn/modules/prob_estimation_initial_SDR.py: -------------------------------------------------------------------------------- 1 | """!
2 | @brief Initial SDR evaluation keeping all measurements and not only stat values 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of Illinois at Urbana Champaign 6 | """ 7 | 8 | import argparse 9 | import os 10 | import sys 11 | import numpy as np 12 | from pprint import pprint 13 | import joblib 14 | 15 | root_dir = os.path.join( 16 | os.path.dirname(os.path.realpath(__file__)), 17 | '../../../') 18 | sys.path.insert(0, root_dir) 19 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader 20 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval 21 | from spatial_two_mics.config import FINAL_RESULTS_DIR 22 | 23 | 24 | def eval(data_generator, 25 | dataset_path): 26 | 27 | data_dir = os.path.dirname(dataset_path) 28 | info = os.path.basename(data_dir) 29 | n_sources = int(info.split('_')[4]) 30 | 31 | eval_dic = {'sdr': [], 'sir': [], 'sar': []} 32 | 33 | for batch_data in data_generator: 34 | abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data 35 | 36 | for b in np.arange(abs_tfs.size(0)): 37 | 38 | sdr, sir, sar = np_eval.mixture_bss_eval( 39 | real_tfs[b].data.numpy(), 40 | imag_tfs[b].data.numpy(), 41 | wavs_lists[b].data.numpy(), 42 | n_sources) 43 | 44 | eval_dic['sdr'].append(sdr) 45 | eval_dic['sir'].append(sir) 46 | eval_dic['sar'].append(sar) 47 | 48 | # return all values 49 | result_dic = {} 50 | for k, v in eval_dic.items(): 51 | result_dic[k] = np.array(v) 52 | 53 | return result_dic 54 | 55 | 56 | def evaluate_bss_metrics(dataset_folder, 57 | n_jobs=1, 58 | get_top=None): 59 | 60 | (dataset_dir, partition) = (os.path.dirname(dataset_folder), 61 | os.path.basename(dataset_folder)) 62 | 63 | assert partition == 'test' or partition == 'val', '' \ 64 | 'All selected dataset folders to be evaluated have to be ' \ 65 | 'either test or val folders from a certain dataset!' 66 | 67 | print("Initializing the data loaders for all the datasets...") 68 | val_generator, n_val_batches = \ 69 | data_loader.get_data_generator( 70 | dataset_dir, partition=partition, 71 | get_top=get_top, num_workers=1, 72 | return_stats=False, 73 | return_n_batches=True, 74 | only_mask_evaluation=True) 75 | 76 | result_dic = eval(val_generator, 77 | os.path.join(dataset_dir, partition)) 78 | 79 | return result_dic 80 | 81 | 82 | def get_args(): 83 | """!
Command line parser for computing the evaluation for 84 | specific datasets""" 85 | parser = argparse.ArgumentParser(description='Evaluating' 86 | ' initial SDR SAR and SIR for datasets') 87 | parser.add_argument("-i", "--dataset_folders", type=str, nargs='+', 88 | help="Dataset paths you want to evaluate", 89 | default=None) 90 | parser.add_argument("--n_jobs", type=int, 91 | help="Number of parallel spawning jobs", 92 | default=1) 93 | parser.add_argument("--n_eval", type=int, 94 | help="""Reduce the number of evaluation 95 | samples to this number.""", default=None) 96 | return parser.parse_args() 97 | 98 | 99 | if __name__ == "__main__": 100 | args = get_args() 101 | 102 | for dataset_folder in args.dataset_folders: 103 | (dataset_dir, partition) = (os.path.dirname(dataset_folder), 104 | os.path.basename(dataset_folder)) 105 | 106 | eval_results = evaluate_bss_metrics(dataset_folder, 107 | n_jobs=args.n_jobs, 108 | get_top=args.n_eval) 109 | 110 | pprint(eval_results) 111 | 112 | test_on = os.path.basename(dataset_dir) + '_' + partition 113 | save_folder_name = os.path.join(FINAL_RESULTS_DIR, 114 | 'test_on_' + test_on) 115 | if not os.path.exists(save_folder_name): 116 | os.makedirs(save_folder_name) 117 | 118 | file_path = os.path.join(save_folder_name, 119 | 'initial_mixture_metrics.gz') 120 | 121 | joblib.dump(eval_results, file_path) 122 | -------------------------------------------------------------------------------- /spatial_two_mics/labels_inference/tf_label_estimator.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief An estimator of TF masks depending on Blind Source Separation 3 | Algorithms or even the energy in each bin (Ground Truth). 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of Illinois at Urbana Champaign 7 | """ 8 | 9 | import numpy as np 10 | import os 11 | import sys 12 | from pprint import pprint 13 | 14 | root_dir = os.path.join( 15 | os.path.dirname(os.path.realpath(__file__)), 16 | '../../') 17 | sys.path.insert(0, root_dir) 18 | import spatial_two_mics.labels_inference.ground_truth as gt_inference 19 | import spatial_two_mics.labels_inference.duet_mask_estimation as \ 20 | duet_kmeans_inference 21 | 22 | 23 | class TFMaskEstimator(object): 24 | """ 25 | This is a general compatible class for encapsulating the label 26 | inference / a TF mask for mixtures of signals coming from 2 27 | microphones.
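Typical usage (see example_of_usage below): construct the estimator with inference_method='ground_truth' or 'duet_kmeans' and call infer_mixture_labels on a mixture_info dictionary.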
28 | """ 29 | def __init__(self, 30 | inference_method=None, 31 | return_duet_raw_features=False): 32 | if inference_method.lower() == "ground_truth": 33 | self.label_inference = gt_inference 34 | elif inference_method.lower() == "duet_kmeans": 35 | self.label_inference = duet_kmeans_inference 36 | else: 37 | raise NotImplementedError("Inference Method: {} is not yet " 38 | "implemented.".format(inference_method)) 39 | 40 | self.return_duet_raw_features = return_duet_raw_features 41 | 42 | def infer_mixture_labels(self, 43 | mixture_info): 44 | """ 45 | :param mixture_info: 46 | mixture_info = { 47 | 'm1_raw': numpy array containing the raw m1 signal, 48 | 'm2_raw': numpy array containing the raw m2 signal, 49 | 'm1_tf': numpy array containing the m1 TF representation, 50 | 'm2_tf': numpy array containing the m2 TF representation, 51 | 'sources_raw': a list of numpy 1d vectors containing the 52 | sources , 53 | 'sources_tf': a list of numpy 2d vectors containing the 54 | TF represeantations of the sources , 55 | 'delayed_sources_raw': a list of numpy 1d vectors containing 56 | the sources delayed with some tau, 57 | 'delayed_sources_tf': a list of numpy 2d vectors 58 | containing the TF representations of the delayed signals, 59 | 'amplitudes': the weights that each source contributes to 60 | the mixture of the second microphone 61 | } 62 | 63 | :return: A TF representation with each TF bin to correspond 64 | to the source which the algorithm predicts that is dominating 65 | """ 66 | 67 | if self.return_duet_raw_features: 68 | return self.label_inference.infer_mask(mixture_info, 69 | return_phase_features=True) 70 | else: 71 | return self.label_inference.infer_mask(mixture_info) 72 | 73 | 74 | def example_of_usage(): 75 | """! 76 | How the class of Audio mixtures should be called""" 77 | 78 | import os 79 | import sys 80 | root_dir = os.path.join( 81 | os.path.dirname(os.path.realpath(__file__)), 82 | '../../') 83 | sys.path.insert(0, root_dir) 84 | import spatial_two_mics.examples.mixture_example as me 85 | import spatial_two_mics.utils.audio_mixture_constructor as \ 86 | mix_constructor 87 | 88 | mixture_info = me.mixture_info_example() 89 | mixture_creator = mix_constructor.AudioMixtureConstructor( 90 | n_fft=1024, win_len=400, hop_len=200, mixture_duration=2.0, 91 | force_delays=[-1, 1]) 92 | 93 | tf_mixtures = mixture_creator.construct_mixture(mixture_info) 94 | 95 | duet_estimator = TFMaskEstimator(inference_method='duet_Kmeans') 96 | 97 | tf_labels = duet_estimator.infer_mixture_labels(tf_mixtures) 98 | print("DUET Kmeans") 99 | pprint(tf_labels.shape) 100 | 101 | ground_truth_estimator = TFMaskEstimator( 102 | inference_method='ground_truth') 103 | 104 | gt_labels = ground_truth_estimator.infer_mixture_labels(tf_mixtures) 105 | print("Ground Truth") 106 | pprint(gt_labels.shape) 107 | 108 | n_bins = np.product(gt_labels.shape) 109 | print("Estimation differs at {} out of {} points".format( 110 | min(np.count_nonzero(abs(gt_labels-tf_labels)), 111 | n_bins - np.count_nonzero(abs(gt_labels - tf_labels))), 112 | n_bins)) 113 | 114 | 115 | if __name__ == "__main__": 116 | example_of_usage() 117 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/modules/prob_estimation_ground_truth_masks.py: -------------------------------------------------------------------------------- 1 | """! 
2 | @brief Source separation performance keeping all eval values 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of Illinois at Urbana Champaign 6 | """ 7 | 8 | import argparse 9 | import os 10 | import sys 11 | import numpy as np 12 | from pprint import pprint 13 | from joblib import Parallel, delayed 14 | from tqdm import tqdm 15 | import itertools 16 | import joblib 17 | 18 | root_dir = os.path.join( 19 | os.path.dirname(os.path.realpath(__file__)), 20 | '../../../') 21 | sys.path.insert(0, root_dir) 22 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader 23 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval 24 | from spatial_two_mics.config import FINAL_RESULTS_DIR 25 | 26 | 27 | def eval(data_generator, 28 | dataset_path): 29 | 30 | data_dir = os.path.dirname(dataset_path) 31 | info = os.path.basename(data_dir) 32 | n_sources = int(info.split('_')[4]) 33 | 34 | eval_dic = {'sdr': [], 'sir': [], 'sar': []} 35 | 36 | for batch_data in data_generator: 37 | abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data 38 | 39 | for b in np.arange(abs_tfs.size(0)): 40 | embedding_labels = masks[b].data.numpy() 41 | 42 | sdr, sir, sar = np_eval.naive_cpu_bss_eval( 43 | embedding_labels, 44 | real_tfs[b].data.numpy(), 45 | imag_tfs[b].data.numpy(), 46 | wavs_lists[b].data.numpy(), 47 | n_sources, 48 | batch_index=b) 49 | 50 | eval_dic['sdr'].append(sdr) 51 | eval_dic['sir'].append(sir) 52 | eval_dic['sar'].append(sar) 53 | 54 | # return all values 55 | result_dic = {} 56 | for k, v in eval_dic.items(): 57 | result_dic[k] = np.array(v) 58 | 59 | return result_dic 60 | 61 | 62 | def evaluate_labels(dataset_folder, 63 | n_jobs=1, 64 | get_top=None): 65 | (dataset_dir, partition) = (os.path.dirname(dataset_folder), 66 | os.path.basename(dataset_folder)) 67 | 68 | assert partition == 'test' or partition == 'val', '' \ 69 | 'All selected dataset folders to be evaluated have to be ' \ 70 | 'either test or val folders from a certain dataset!' 71 | 72 | eval_results = {} 73 | for eval_labels in ['duet', 'ground_truth']: 74 | val_generator, n_val_batches = \ 75 | data_loader.get_data_generator( 76 | dataset_dir, partition=partition, 77 | get_top=get_top, num_workers=1, 78 | return_stats=False, labels_mask=eval_labels, 79 | return_n_batches=True, 80 | only_mask_evaluation=True) 81 | 82 | eval_results[eval_labels] = eval(val_generator, 83 | os.path.join(dataset_dir, 84 | partition)) 85 | 86 | return eval_results 87 | 88 | 89 | def get_args(): 90 | """!
Command line parser for computing the evaluation for 91 | specific datasets""" 92 | parser = argparse.ArgumentParser(description='Evaluating' 93 | ' ground truth or duet labels for datasets folders') 94 | parser.add_argument("-i", "--dataset_folders", type=str, nargs='+', 95 | help="Dataset paths you want to evaluate", 96 | default=[]) 97 | parser.add_argument("--n_jobs", type=int, 98 | help="Number of parallel spawning jobs", 99 | default=1) 100 | parser.add_argument("--n_eval", type=int, 101 | help="""Reduce the number of evaluation 102 | samples to this number.""", default=None) 103 | return parser.parse_args() 104 | 105 | 106 | if __name__ == "__main__": 107 | args = get_args() 108 | 109 | 110 | for dataset_folder in args.dataset_folders: 111 | (dataset_dir, partition) = (os.path.dirname(dataset_folder), 112 | os.path.basename(dataset_folder)) 113 | 114 | eval_results = evaluate_labels(dataset_folder, 115 | n_jobs=args.n_jobs, 116 | get_top=args.n_eval) 117 | 118 | pprint(eval_results) 119 | 120 | test_on = os.path.basename(dataset_dir) + '_' + partition 121 | save_folder_name = os.path.join(FINAL_RESULTS_DIR, 122 | 'test_on_' + test_on) 123 | if not os.path.exists(save_folder_name): 124 | os.makedirs(save_folder_name) 125 | 126 | for labels, metrics in eval_results.items(): 127 | file_path = os.path.join(save_folder_name, 128 | labels + '_mask_metrics.gz') 129 | 130 | joblib.dump(metrics, file_path) 131 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/modules/measure_initial_SDR.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief For a specific dataset just find all the ground truth 3 | evaluation when applying either a duet or a ground truth labeled mask 4 | for source separation 5 | 6 | @author Efthymios Tzinis {etzinis2@illinois.edu} 7 | @copyright University of Illinois at Urbana Champaign 8 | """ 9 | 10 | import argparse 11 | import os 12 | import sys 13 | import numpy as np 14 | from pprint import pprint 15 | from joblib import Parallel, delayed 16 | from tqdm import tqdm 17 | import itertools 18 | import pandas as pd 19 | 20 | root_dir = os.path.join( 21 | os.path.dirname(os.path.realpath(__file__)), 22 | '../../../') 23 | sys.path.insert(0, root_dir) 24 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader 25 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval 26 | 27 | 28 | def eval(data_generator, 29 | dataset_path): 30 | 31 | data_dir = os.path.dirname(dataset_path) 32 | info = os.path.basename(data_dir) 33 | n_sources = int(info.split('_')[4]) 34 | 35 | eval_dic = {'sdr': [], 'sir': [], 'sar': []} 36 | 37 | for batch_data in data_generator: 38 | abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data 39 | 40 | for b in np.arange(abs_tfs.size(0)): 41 | 42 | sdr, sir, sar = np_eval.mixture_bss_eval( 43 | real_tfs[b].data.numpy(), 44 | imag_tfs[b].data.numpy(), 45 | wavs_lists[b].data.numpy(), 46 | n_sources) 47 | 48 | eval_dic['sdr'].append(sdr) 49 | eval_dic['sir'].append(sir) 50 | eval_dic['sar'].append(sar) 51 | 52 | # return both mean and std values 53 | mean_std_dic = {} 54 | for k, v in eval_dic.items(): 55 | # mean_std_dic[k + "_max"] = np.max(np.array(v)) 56 | # mean_std_dic[k + "_min"] = np.min(np.array(v)) 57 | mean_std_dic[k+"_mean"] = np.mean(np.array(v)) 58 | mean_std_dic[k+"_std"] = np.std(np.array(v)) 59 | # mean_std_dic[k + "_50"] = np.quantile(np.array(v), 0.50) 60 | # mean_std_dic[k + "_25"] =
np.quantile(np.array(v), 0.25) 61 | # mean_std_dic[k + "_75"] = np.quantile(np.array(v), 0.75) 62 | 63 | return dataset_path, mean_std_dic 64 | 65 | def evaluate_bss_metrics(dataset_folders, 66 | n_jobs=1, 67 | get_top=None): 68 | 69 | dirs_and_parts = [(os.path.dirname(f), os.path.basename(f)) 70 | for f in dataset_folders] 71 | 72 | assert all([partition == 'test' or partition == 'val' 73 | for (_, partition) in dirs_and_parts]), '' \ 74 | 'All selected dataset folder to be evaluated have either ' \ 75 | 'to be test or val folder from a certain dataset!' 76 | 77 | print("Initializing the data loaders for all the datasets...") 78 | datasets_loaders = [data_loader.get_data_generator( 79 | dataset_dir, partition=partition, 80 | get_top=get_top, num_workers=1, 81 | return_stats=False, 82 | return_n_batches=True, 83 | only_mask_evaluation=True) 84 | for (dataset_dir, partition) in dirs_and_parts] 85 | 86 | data_info = [list(itertools.chain.from_iterable(info_lists)) 87 | for info_lists in zip(datasets_loaders, dirs_and_parts)] 88 | 89 | eval_results = Parallel(n_jobs=n_jobs)( 90 | [delayed(eval)(data_loader, 91 | os.path.join(data_dir, partition)) 92 | for (data_loader, n_batches, data_dir, partition) 93 | in tqdm(data_info)]) 94 | 95 | return eval_results 96 | 97 | 98 | def get_args(): 99 | """! Command line parser for computing the evaluation for 100 | specific datasets""" 101 | parser = argparse.ArgumentParser(description='Evaluating' 102 | ' initial SDR SAR and SIR for datasets') 103 | parser.add_argument("-i", "--dataset_folders", type=str, nargs='+', 104 | help="Dataset paths you want to evaluate", 105 | default=[]) 106 | parser.add_argument("--n_jobs", type=int, 107 | help="Number of parallel spawinign jobs", 108 | default=1) 109 | parser.add_argument("--n_eval", type=int, 110 | help="""Reduce the number of evaluation 111 | samples to this number.""", default=None) 112 | return parser.parse_args() 113 | 114 | 115 | if __name__ == "__main__": 116 | args = get_args() 117 | eval_results = evaluate_bss_metrics(args.dataset_folders, 118 | n_jobs=args.n_jobs, 119 | get_top=args.n_eval) 120 | 121 | df = pd.DataFrame(dict([(os.path.basename(os.path.dirname(p)) + 122 | '/' + os.path.basename(p), res) 123 | for (p, res) in eval_results])).T 124 | pd.set_option('display.expand_frame_repr', False) 125 | print(df) 126 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/model_logger.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Model logger in order to be able to load the model and test it 3 | on different data. 
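Checkpoints are stored as SDR_<sdr>_SIR_<sir>_SAR_<sar>_<timestamp>.pt
inside a per-dataset folder that is chosen by the training labels
(see save() below).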
4 | 
5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
6 | @copyright University of Illinois at Urbana-Champaign
7 | """
8 | import os
9 | import sys
10 | import torch
11 | import datetime
12 | import glob2
13 | import torch.nn as nn
14 | 
15 | root_dir = os.path.join(
16 |     os.path.dirname(os.path.realpath(__file__)),
17 |     '../../../')
18 | sys.path.insert(0, root_dir)
19 | from spatial_two_mics.config import MODELS_DIR
20 | from spatial_two_mics.config import MODELS_RAW_PHASE_DIR
21 | from spatial_two_mics.config import MODELS_GROUND_TRUTH
22 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_builder
23 | 
24 | 
25 | def save(model,
26 |          optimizer,
27 |          args,
28 |          epoch,
29 |          performance_dic,
30 |          dataset_id,
31 |          mean_tr,
32 |          std_tr,
33 |          max_models_per_dataset=30,
34 |          training_labels=''):
35 |     state = {
36 |         'epoch': epoch,
37 |         'val_performance': performance_dic,
38 |         'model_state': model.state_dict(),
39 |         'optimizer_state': optimizer.state_dict(),
40 |         'args': args,
41 |         'mean_tr': mean_tr,
42 |         'std_tr': std_tr,
43 |         'training_labels': training_labels
44 |     }
45 |     sdr_str = str(round(performance_dic['sdr'], 3))
46 |     sar_str = str(round(performance_dic['sar'], 3))
47 |     sir_str = str(round(performance_dic['sir'], 3))
48 | 
49 |     if training_labels == 'raw_phase_diff':
50 |         folder_name = os.path.join(MODELS_RAW_PHASE_DIR, dataset_id)
51 |     elif training_labels == 'ground_truth':
52 |         folder_name = os.path.join(MODELS_GROUND_TRUTH, dataset_id)
53 |     else:
54 |         folder_name = os.path.join(MODELS_DIR, dataset_id)
55 | 
56 | 
57 |     if not os.path.exists(folder_name):
58 |         os.makedirs(folder_name)
59 | 
60 |     available_models = glob2.glob(folder_name + '/*.pt')
61 | 
62 |     if len(available_models) > max_models_per_dataset:
63 |         sdr_and_model_path = [os.path.basename(path)
64 |                               for path in available_models]
65 |         sdr_and_model_path = [float(path.split("_")[1])  # SDR value from "SDR_<sdr>_..."
66 |                               for path in sdr_and_model_path]
67 |         sdr_and_model_path = zip(sdr_and_model_path, available_models)
68 |         sdr_sorted_models = sorted(sdr_and_model_path,
69 |                                    key=lambda x: x[0])[::-1]
70 |         for sdr, path in sdr_sorted_models[max_models_per_dataset:]:
71 |             try:
72 |                 os.remove(path)
73 |             except OSError:
74 |                 print("Error in removing {} ...".format(path))
75 | 
76 |     ts = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
77 |     filename = "SDR_{}_SIR_{}_SAR_{}_{}.pt".format(sdr_str,
78 |                                                    sir_str,
79 |                                                    sar_str,
80 |                                                    ts)
81 |     file_path = os.path.join(folder_name, filename)
82 |     torch.save(state, file_path)
83 | 
84 | 
85 | # def load(model,
86 | #          optimizer,
87 | #          dataset_id,
88 | #          filename=None):
89 | #
90 | #     folder_name = os.path.join(MODELS_DIR, dataset_id)
91 | #     if filename is None:
92 | #         available_models = glob2.glob(folder_name + '/*.pt')
93 | #         file_path = os.path.join(folder_name, available_models[0])
94 | #     else:
95 | #         file_path = os.path.join(folder_name, filename)
96 | #
97 | #     loaded_state = torch.load(file_path)
98 | #     model.load_state_dict(loaded_state['model_state'])
99 | #     optimizer.load_state_dict(loaded_state['optimizer_state'])
100 | #     epoch = loaded_state['epoch']
101 | #     val_performance = loaded_state['val_performance']
102 | #     args = loaded_state['args']
103 | #     mean_tr = loaded_state['mean_tr']
104 | #     std_tr = loaded_state['std_tr']
105 | #
106 | #     return (model, optimizer, epoch, val_performance,
107 | #             args, mean_tr, std_tr)
108 | 
109 | 
110 | def load_and_create_the_model(model_path):
111 | 
112 |     loaded_state = torch.load(model_path)
113 |     epoch = loaded_state['epoch']
114 |     val_performance =
loaded_state['val_performance'] 115 | args = loaded_state['args'] 116 | mean_tr = loaded_state['mean_tr'] 117 | std_tr = loaded_state['std_tr'] 118 | training_labels = loaded_state['training_labels'] 119 | 120 | model = LSTM_builder.BLSTMEncoder(num_layers=args.n_layers, 121 | hidden_size=args.hidden_size, 122 | embedding_depth=args.embedding_depth, 123 | bidirectional=args.bidirectional, 124 | dropout=args.dropout) 125 | model = nn.DataParallel(model).cuda() 126 | 127 | optimizer = torch.optim.Adam(model.parameters(), 128 | lr=args.learning_rate, 129 | betas=(0.9, 0.999)) 130 | 131 | model.load_state_dict(loaded_state['model_state']) 132 | optimizer.load_state_dict(loaded_state['optimizer_state']) 133 | 134 | return (model, optimizer, epoch, val_performance, 135 | args, mean_tr, std_tr, training_labels) 136 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/experiment_command_line_parser_v2.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Command line parser for experiments 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of illinois at Urbana Champaign 6 | """ 7 | 8 | import argparse 9 | 10 | def get_args(): 11 | """! Command line parser for experiments""" 12 | parser = argparse.ArgumentParser(description='Deep Clustering for ' 13 | 'Audio Source ' 14 | 'Separation ' 15 | 'Experiment') 16 | parser.add_argument("--train", type=str, 17 | help="Path for the training dataset", 18 | default=None) 19 | parser.add_argument("--test", type=str, 20 | help="Path for the testing dataset", 21 | default=None) 22 | parser.add_argument("--val", type=str, 23 | help="Path for the validation dataset", 24 | default=None) 25 | parser.add_argument("--n_train", type=int, 26 | help="""Reduce the number of training 27 | samples to this number.""", default=None) 28 | parser.add_argument("--n_test", type=int, 29 | help="""Reduce the number of testing 30 | samples to this number.""", default=None) 31 | parser.add_argument("--n_val", type=int, 32 | help="""Reduce the number of evaluation 33 | samples to this number.""", default=None) 34 | parser.add_argument("-nl", "--n_layers", type=int, 35 | help="""The number of layers of the BLSTM 36 | encoder""", default=2) 37 | parser.add_argument("-ed", "--embedding_depth", type=int, 38 | help="""The depth of the embedding""", 39 | default=16) 40 | parser.add_argument("-hs", "--hidden_size", type=int, 41 | help="""The size of the LSTM cells """, 42 | default=1024) 43 | parser.add_argument("-bs", "--batch_size", type=int, 44 | help="""The number of samples in each batch. 
45 | Warning: Cannot be less than the number of 46 | the validation samples""", default=32) 47 | parser.add_argument("-name", "--experiment_name", type=str, 48 | help="""The name or identifier of this 49 | experiment""", 50 | default='A sample experiment'), 51 | parser.add_argument("-train_l", "--training_labels", type=str, 52 | help="""The type of masks that you want to 53 | use for training as the ideal affinities""", 54 | default='duet', choices=['duet', 55 | 'raw_phase_diff', 56 | 'ground_truth']) 57 | parser.add_argument("-cad", "--cuda_available_devices", type=int, 58 | nargs="+", 59 | help="""A list of Cuda IDs that would be 60 | available for running this experiment""", 61 | default=[0]) 62 | parser.add_argument("--num_workers", type=int, 63 | help="""The number of cpu workers for 64 | loading the data, etc.""", default=3) 65 | parser.add_argument("--epochs", type=int, 66 | help="""The number of epochs that the 67 | experiment should run""", default=50) 68 | parser.add_argument("--eval_per", type=int, 69 | help="""The number of training epochs in 70 | order to run an evaluation""", default=5) 71 | parser.add_argument("-lr", "--learning_rate", type=float, 72 | help="""Initial Learning rate""", default=1e-4) 73 | parser.add_argument("-dr", "--dropout", type=float, 74 | help="""Dropout Ratio""", default=0.) 75 | parser.add_argument("--bidirectional", action='store_true', 76 | help="""Bidirectional or not""") 77 | parser.add_argument("--early_stop_patience", type=int, 78 | help="""The number of training epochs that 79 | the model will endure until the eval metric ( 80 | e.g SDR) will not become better""", 81 | default=15) 82 | parser.add_argument("--lr_patience", type=int, 83 | help="""The number of training epochs that 84 | the model will endure until the learning 85 | rate would be reduced""", default=7) 86 | parser.add_argument("--lr_gamma_decay", type=float, 87 | help="""Multiplicative value of decay that 88 | would be enforced in the value of the learning 89 | rate""", default=0.2) 90 | parser.add_argument("--save_best", type=int, 91 | help="""The number of best models dependent 92 | on the metric you want to use that are going 93 | to be saved under the preferred logging model 94 | directory.""", 95 | default=10) 96 | 97 | return parser.parse_args() -------------------------------------------------------------------------------- /spatial_two_mics/dnn/losses/affinity_approximation.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Loss functions for low rank approximations of an ideal 3 | affinity mask 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of illinois at Urbana Champaign 7 | """ 8 | import torch 9 | import torch.nn as nn 10 | 11 | 12 | def frobenius_naive(vs, ys): 13 | """! 
Computing naively the loss function between embedding
14 |     vectors vs and ideal affinity matrices ys
15 | 
16 |     :param vs: size: batch_size x n_elements x embedded_features
17 |     :param ys: One-hot tensor equal to 1 where a specific class is
18 |     the label for one element and 0 otherwise, with size:
19 |     batch_size x n_elements x n_classes
20 |     :return: The computed loss of these two tensors
21 |     """
22 |     loss = torch.mean(torch.norm(torch.norm(
23 |         torch.matmul(vs, vs.permute(0, 2, 1)) -
24 |         torch.matmul(ys, ys.permute(0, 2, 1)), 2, 1), 2, 1)**2)
25 | 
26 |     return loss
27 | 
28 | 
29 | def efficient_frobenius(vs, ys, eps=1e-11):
30 |     ys_T = ys.permute(0, 2, 1)
31 |     vs_T = vs.permute(0, 2, 1)
32 |     summed_y_T = ys_T.sum(dim=2).unsqueeze(-1)
33 |     d = torch.bmm(ys, summed_y_T)  # per element: size of its cluster
34 |     d_m1_2 = torch.reciprocal(torch.sqrt(d) + eps)
35 | 
36 |     # expanding ||V V^T - Y Y^T||_F^2 into its three bilinear terms
37 |     # avoids materializing any n_elements x n_elements affinity matrix
38 | 
39 |     est_loss = (torch.bmm(vs_T, vs * d_m1_2) ** 2).sum()
40 |     union_loss = (torch.bmm(vs_T, ys * d_m1_2) ** 2).sum()
41 |     true_loss = (torch.bmm(ys_T, ys * d_m1_2) ** 2).sum()
42 |     total_loss = est_loss - 2. * union_loss + true_loss
43 |     # print(total_loss.shape)
44 |     # print(est_loss.shape)
45 |     # print(est_loss)
46 | 
47 |     # print(union_loss)
48 |     # uni_loss = (torch.bmm(ys_T, vs * d_m1_2) ** 2).sum()
49 |     # print(uni_loss)
50 | 
51 |     return total_loss / vs.size(0)
52 | 
53 | 
54 | def paris_naive(vs, ys):
55 |     """! Computing naively the loss function between embedding
56 |     vectors vs and ideal affinity matrices ys
57 | 
58 |     :param vs: size: batch_size x n_elements x embedded_features
59 |     :param ys: One-hot tensor equal to 1 where a specific class is
60 |     the label for one element and 0 otherwise, with size:
61 |     batch_size x n_elements x n_classes
62 |     :return: The computed loss of these two tensors
63 |     """
64 |     loss = torch.sqrt(torch.mean(torch.bmm(vs.transpose(1, 2), vs) ** 2)) \
65 |            - 2. * torch.sqrt(torch.mean(torch.bmm(vs.transpose(1, 2), ys) ** 2)) \
66 |            + torch.sqrt(torch.mean(torch.bmm(ys.transpose(1, 2), ys) ** 2))
67 |     return loss
68 | 
69 | 
70 | 
71 | def thymios_naive(vs, ys):
72 |     """! Computing naively the loss function between embedding
73 |     vectors vs and ideal affinity matrices ys
74 | 
75 |     :param vs: size: batch_size x n_elements x embedded_features
76 |     :param ys: One-hot tensor equal to 1 where a specific class is
77 |     the label for one element and 0 otherwise, with size:
78 |     batch_size x n_elements x n_classes
79 |     :return: The computed loss of these two tensors
80 |     """
81 |     l = torch.sqrt((torch.bmm(vs.transpose(1, 2), vs) ** 2).sum()) \
82 |         - 2. * torch.sqrt((torch.bmm(vs.transpose(1, 2), ys) ** 2).sum()) \
83 |         + torch.sqrt((torch.bmm(ys.transpose(1, 2), ys) ** 2).sum())
84 |     return l / vs.size(0)
85 | 
86 | 
87 | def naive(vs, ys):
88 |     """! Computing naively the loss function between embedding
89 |     vectors vs and ideal affinity matrices ys
90 | 
91 |     :param vs: size: batch_size x n_elements x embedded_features
92 |     :param ys: One-hot tensor equal to 1 where a specific class is
93 |     the label for one element and 0 otherwise, with size:
94 |     batch_size x n_elements x n_classes
95 |     :return: The computed loss of these two tensors
96 |     """
97 |     loss = (torch.matmul(vs.transpose(1, 2), vs) ** 2).sum() \
98 |            - 2. * (torch.matmul(vs.transpose(1, 2), ys) ** 2).sum() \
99 |            + (torch.matmul(ys.transpose(1, 2), ys) ** 2).sum()
100 |     return loss / vs.size(0)
101 |     # return loss
102 | 
103 | 
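# A minimal usage sketch for the losses above (kept as a comment so that
# importing this module has no side effects; the shapes below are
# illustrative and not tied to the real STFT dimensions of the pipeline):
#
#     vs = torch.randn(4, 200, 16)   # batch x TF elements x embedding
#     ys = torch.eye(2)[torch.randint(0, 2, (4, 200))]  # one-hot, 2 classes
#     print(naive(vs, ys), thymios_naive(vs, ys), paris_naive(vs, ys))
#
# All three return a scalar tensor; they differ only in whether square
# roots are applied to each of the three bilinear terms and in how the
# result is normalized (sums vs. means, division by the batch size).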
104 | def diagonal(embedding, assignments):
105 |     batch_size, sequence_length, num_frequencies, embedding_size = embedding.size()
106 |     _, _, _, num_sources = assignments.size()
107 |     embedding = embedding.view(-1, embedding.size()[-1])
108 |     assignments = assignments.view(-1, assignments.size()[-1])
109 | 
110 |     class_weights = nn.functional.normalize(torch.sum(assignments.detach(), dim=-2), p=1, dim=-1).unsqueeze(0)
111 |     class_weights = 1.0 / (torch.sqrt(class_weights) + 1e-7)
112 |     weights = torch.matmul(assignments.detach(), class_weights.transpose(1, 0))
113 |     # norm = torch.sum(weights**2)**2
114 |     assignments = assignments * weights.repeat(1, assignments.size()[-1])
115 |     embedding = embedding * weights.repeat(1, embedding.size()[-1])
116 | 
117 |     embedding = embedding.view(batch_size, sequence_length*num_frequencies, embedding_size)
118 |     assignments = assignments.view(batch_size, sequence_length*num_frequencies, num_sources)
119 | 
120 |     embedding_transpose = embedding.permute(0, 2, 1)
121 |     assignments_transpose = assignments.permute(0, 2, 1)
122 | 
123 |     loss_est = torch.sum(torch.matmul(embedding_transpose, embedding)**2)
124 |     loss_est_true = torch.sum(torch.matmul(embedding_transpose, assignments)**2)
125 |     loss_true = torch.sum(torch.matmul(assignments_transpose, assignments)**2)
126 |     loss = loss_est - 2*loss_est_true + loss_true
127 |     # loss = loss / norm
128 |     return loss
129 | 
130 | 
--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/ground_truth_evaluation.py:
--------------------------------------------------------------------------------
1 | """!
2 | @brief For a specific dataset, find all the ground truth
3 | evaluation values obtained when applying either a duet or a ground
4 | truth labeled mask for source separation
5 | 
6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
7 | @copyright University of Illinois at Urbana-Champaign
8 | """
9 | 
10 | import argparse
11 | import os
12 | import sys
13 | import numpy as np
14 | from pprint import pprint
15 | from joblib import Parallel, delayed
16 | from tqdm import tqdm
17 | import itertools
18 | import pandas as pd
19 | 
20 | root_dir = os.path.join(
21 |     os.path.dirname(os.path.realpath(__file__)),
22 |     '../../../')
23 | sys.path.insert(0, root_dir)
24 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader
25 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval
26 | 
27 | 
28 | def eval(data_generator,
29 |          dataset_path):
30 | 
31 |     data_dir = os.path.dirname(dataset_path)
32 |     info = os.path.basename(data_dir)
33 |     n_sources = int(info.split('_')[4])  # encoded in the folder name
34 | 
35 |     eval_dic = {'sdr': [], 'sir': [], 'sar': []}
36 | 
37 |     for batch_data in data_generator:
38 |         abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
39 | 
40 |         for b in np.arange(abs_tfs.size(0)):
41 |             embedding_labels = masks[b].data.numpy()
42 | 
43 |             sdr, sir, sar = np_eval.naive_cpu_bss_eval(
44 |                 embedding_labels,
45 |                 real_tfs[b].data.numpy(),
46 |                 imag_tfs[b].data.numpy(),
47 |                 wavs_lists[b].data.numpy(),
48 |                 n_sources,
49 |                 batch_index=b)
50 | 
51 |             eval_dic['sdr'].append(sdr)
52 |             eval_dic['sir'].append(sir)
53 |             eval_dic['sar'].append(sar)
54 | 
55 |     # return both mean and std values
56 |     mean_std_dic = {}
57 |     for k, v in eval_dic.items():
58 |         mean_std_dic[k+"_mean"] = np.mean(np.array(v))
59 |         mean_std_dic[k+"_std"] = np.std(np.array(v))
60 | 
61 |     return dataset_path, mean_std_dic
62 | 
63 | 
64 | def eval_wrapper():
65 |     return lambda data_generator, n_batches, dataset_path: eval(
66 |         data_generator, dataset_path)  # eval() does not take n_batches
67 | 
68 | 
69 | def evaluate_labels(dataset_folders,
70 |                     eval_labels='duet',
71 |                     n_jobs=1,
72 |                     get_top=None):
73 | 
74 | 
75 |     dirs_and_parts = [(os.path.dirname(f), os.path.basename(f))
76 |                       for f in dataset_folders]
77 | 
78 |     assert all([partition == 'test' or partition == 'val'
79 |                 for (_, partition) in dirs_and_parts]), \
80 |         'All selected dataset folders to be evaluated must be ' \
81 |         'test or val folders of a certain dataset!'
82 | 
83 |     print("Initializing the data loaders for all the datasets...")
84 |     datasets_loaders = [data_loader.get_data_generator(
85 |                             dataset_dir, partition=partition,
86 |                             get_top=get_top, num_workers=1,
87 |                             return_stats=False, labels_mask=eval_labels,
88 |                             return_n_batches=True,
89 |                             only_mask_evaluation=True)
90 |                         for (dataset_dir, partition) in dirs_and_parts]
91 | 
92 |     data_info = [list(itertools.chain.from_iterable(info_lists))
93 |                  for info_lists in zip(datasets_loaders, dirs_and_parts)]
94 | 
95 |     eval_results = Parallel(n_jobs=n_jobs)(
96 |         [delayed(eval)(data_loader,
97 |                        os.path.join(data_dir, partition))
98 |          for (data_loader, n_batches, data_dir, partition)
99 |          in tqdm(data_info)])
100 | 
101 |     return eval_results
102 | 
103 | 
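# Usage sketch (the dataset path below is hypothetical; every folder that
# is passed in has to end in /test or /val, as the assertion above
# enforces):
#
#     results = evaluate_labels(
#         ['/mnt/data/timit_256_64_128_2_fm_taus1/test'],
#         eval_labels='ground_truth', n_jobs=2, get_top=100)
#
# Each element of `results` is a (dataset_path, mean_std_dic) tuple as
# returned by eval() above.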
104 | def get_args():
105 |     """! Command line parser for computing the evaluation for
106 |     specific datasets"""
107 |     parser = argparse.ArgumentParser(description='Evaluating'
108 |         ' ground truth or duet labels for dataset folders')
109 |     parser.add_argument("-i", "--dataset_folders", type=str, nargs='+',
110 |                         help="Dataset paths you want to evaluate",
111 |                         default=[])
112 |     parser.add_argument("-l", "--eval_labels", type=str,
113 |                         help="Choose which labels you want to use "
114 |                              "for the evaluation",
115 |                         default='duet', choices=['duet',
116 |                                                  'ground_truth'])
117 |     parser.add_argument("--n_jobs", type=int,
118 |                         help="Number of parallel spawning jobs",
119 |                         default=1)
120 |     parser.add_argument("--n_eval", type=int,
121 |                         help="""Reduce the number of evaluation
122 |                         samples to this number.""", default=None)
123 |     return parser.parse_args()
124 | 
125 | 
126 | if __name__ == "__main__":
127 |     args = get_args()
128 |     eval_results = evaluate_labels(args.dataset_folders,
129 |                                    eval_labels=args.eval_labels,
130 |                                    n_jobs=args.n_jobs,
131 |                                    get_top=args.n_eval)
132 | 
133 |     df = pd.DataFrame(dict([(os.path.basename(os.path.dirname(p)) +
134 |                              '/' + os.path.basename(p), res)
135 |                             for (p, res) in eval_results])).T
136 |     pd.set_option('display.expand_frame_repr', False)
137 |     print(df)
138 | 
--------------------------------------------------------------------------------
/spatial_two_mics/data_generator/dataset_storage.py:
--------------------------------------------------------------------------------
1 | """!
2 | @brief A dataset creation utility which combines the mixtures from
3 | the dataset and also stores them inside a specified folder
4 | 
5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
6 | @copyright University of Illinois at Urbana-Champaign
7 | """
8 | 
9 | import argparse
10 | import os
11 | import sys
12 | import numpy as np
13 | from pprint import pprint
14 | import joblib  # sklearn.externals.joblib has been deprecated upstream
15 | 
16 | root_dir = os.path.join(
17 |     os.path.dirname(os.path.realpath(__file__)),
18 |     '../../')
19 | sys.path.insert(0, root_dir)
20 | 
21 | import spatial_two_mics.data_generator.dataset_generator as generator
22 | 
23 | 
24 | def create_dataset_name(args):
25 |     dataset_name = '{}_{}_{}_{}_{}'.format(
26 |         args['dataset'],
27 |         '_'.join(map(str, args['n_samples'])),
28 |         args['n_sources'],
29 |         ''.join(sorted(args['genders'])),
30 |         'taus'.join(map(str, args['force_delays'])))
31 |     return dataset_name
32 | 
33 | 
34 | def get_mixture_name_and_data_to_save(mix_info):
35 |     name = [s_id['speaker_id']+'-'+s_id['sentence_id']
36 |             for s_id in mix_info['sources_ids']]
37 |     name = '_'.join(name)
38 | 
39 |     # don't save also the wavs, read them in real time instead
40 |     for i, source_info in enumerate(mix_info['sources_ids']):
41 |         try:
42 |             del mix_info['sources_ids'][i]['wav']
43 |         except KeyError:
44 |             pass
45 | 
46 |     data = {
47 |         'positions': mix_info['positions'],
48 |         'sources_ids': mix_info['sources_ids'],
49 |         'ground_truth_mask': mix_info['ground_truth_mask']
50 |     }
51 | 
52 |     if 'soft_labeled_mask' in mix_info:
53 |         data['soft_labeled_mask'] = mix_info['soft_labeled_mask']
54 | 
55 |     return name, data
56 | 
57 | def time_loading_comparison(data, f_path):
58 |     import _pickle as cPickle
59 |     import joblib
60 |     import time
61 | 
62 |     joblib.dump(data, f_path)
63 |     before = time.time()
64 |     tempos = joblib.load(f_path)
65 |     now = time.time()
66 |     jlib_time = now - before
67 | 
68 |     cPickle.dump(data, open(f_path, 'wb'))
69 |     before = time.time()
70 |     tempos = cPickle.load(open(f_path, 'rb'))
71 |     now =
time.time() 72 | pickle_time = now - before 73 | 74 | return jlib_time, pickle_time 75 | 76 | 77 | def store_dataset(dataset_dic, args): 78 | 79 | dataset_params = args.__dict__ 80 | dataset_name = create_dataset_name(dataset_params) 81 | 82 | dataset_path = os.path.join(args.output_path, dataset_name) 83 | if not os.path.exists(dataset_path): 84 | os.makedirs(dataset_path) 85 | 86 | for subf, mixtures_info in dataset_dic.items(): 87 | subf_path = os.path.join(dataset_path, subf) 88 | if not os.path.exists(subf_path): 89 | os.makedirs(subf_path) 90 | 91 | for mix_info in mixtures_info: 92 | name, data = get_mixture_name_and_data_to_save(mix_info) 93 | f_path = os.path.join(subf_path, name) 94 | joblib.dump(data, f_path, compress=3) 95 | 96 | 97 | def generate_dataset(args): 98 | n_train, n_test, n_val = args.n_samples 99 | timit_mixture_creator = generator.RandomCombinations( 100 | audio_dataset_name=args.dataset, 101 | genders_mixtures=args.genders, 102 | subset_of_speakers='train', 103 | create_val_set=False) 104 | 105 | dataset_dic = timit_mixture_creator.get_all_mixture_sets( 106 | n_sources_in_mix=args.n_sources, 107 | n_mixtures=n_train, 108 | force_delays=args.force_delays) 109 | 110 | timit_mixture_creator = generator.RandomCombinations( 111 | audio_dataset_name=args.dataset, 112 | genders_mixtures=args.genders, 113 | subset_of_speakers='test', 114 | create_val_set=True) 115 | 116 | test_val_dic = timit_mixture_creator.get_all_mixture_sets( 117 | n_sources_in_mix=args.n_sources, 118 | n_mixtures=max(n_test, n_val), 119 | force_delays=args.force_delays) 120 | 121 | if n_val > n_test: 122 | test_val_dic['test'] = np.random.choice(test_val_dic['test'], 123 | size=n_test, 124 | replace=False) 125 | elif n_val < n_test: 126 | test_val_dic['val'] = np.random.choice(test_val_dic['val'], 127 | size=n_val, 128 | replace=False) 129 | 130 | dataset_dic.update(test_val_dic) 131 | return dataset_dic 132 | 133 | 134 | def create_and_store_dataset(args): 135 | dataset_dic = generate_dataset(args) 136 | store_dataset(dataset_dic, args) 137 | 138 | 139 | def get_args(): 140 | """! Command line parser """ 141 | parser = argparse.ArgumentParser(description='Mixture dataset ' 142 | 'creator') 143 | parser.add_argument("--dataset", type=str, 144 | help="Dataset name", default="timit") 145 | parser.add_argument("--n_sources", type=int, 146 | help="How many sources in each mix", default=2) 147 | parser.add_argument("--n_samples", type=int, nargs='+', 148 | help="How many samples do u want to be " 149 | "created for train test val", 150 | default=10) 151 | parser.add_argument("--genders", type=str, nargs='+', 152 | help="Genders that will correspond to the " 153 | "genders in the mixtures", 154 | default=['m', 'f']) 155 | parser.add_argument("-o", "--output_path", type=str, 156 | help="""The path that the resulting dataset 157 | would be stored. 
If the folder does not 158 | exist it will be created as well as its 159 | child folders train or test and val if it is 160 | selected""", 161 | required=True) 162 | parser.add_argument("-f", "--force_delays", nargs='+', type=int, 163 | help="""Whether you want to force integer 164 | delays of +- 1 in the sources e.g.""", 165 | default=None) 166 | parser.add_argument('--val_set', action="store_true", 167 | help='Force to create a separate val folder ' 168 | 'with the same amount of the mixtures as ' 169 | 'the initial test/train folder but using ' 170 | 'half of the available speakers') 171 | return parser.parse_args() 172 | 173 | 174 | if __name__ == "__main__": 175 | args = get_args() 176 | create_and_store_dataset(args) 177 | 178 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/modules/find_best_model_and_estimate_prob.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Initial SDR all measurements and not only stat values 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of illinois at Urbana Champaign 6 | """ 7 | 8 | import argparse 9 | import os 10 | import sys 11 | import numpy as np 12 | from pprint import pprint 13 | import joblib 14 | from sklearn.cluster import KMeans 15 | from progress.bar import ChargingBar 16 | import torch 17 | import pandas as pd 18 | 19 | 20 | root_dir = os.path.join( 21 | os.path.dirname(os.path.realpath(__file__)), 22 | '../../../') 23 | sys.path.insert(0, root_dir) 24 | import spatial_two_mics.dnn.utils.model_logger as model_logger 25 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader 26 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval 27 | from spatial_two_mics.config import * 28 | 29 | 30 | def eval(dataset_gen, 31 | model_path, 32 | n_sources, 33 | n_batches, 34 | n_jobs): 35 | 36 | model_name = os.path.basename(model_path) 37 | 38 | eval_dic = {'sdr': [], 'sir': [], 'sar': []} 39 | 40 | model, optimizer, _, _, args, mean_tr, std_tr, training_labels = \ 41 | model_logger.load_and_create_the_model(model_path) 42 | 43 | k_means_obj = KMeans(n_clusters=n_sources, n_jobs=n_jobs) 44 | 45 | model.eval() 46 | with torch.no_grad(): 47 | bar = ChargingBar("Evaluating model {} ...".format(model_name), 48 | max=n_batches) 49 | for batch_data in dataset_gen: 50 | abs_tfs, wavs_lists, real_tfs, imag_tfs = batch_data 51 | input_tfs = abs_tfs.cuda() 52 | # the input sequence is determined by time and not freqs 53 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 54 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 55 | 56 | # normalize with mean and variance from the training dataset 57 | input_tfs -= mean_tr 58 | input_tfs /= std_tr 59 | 60 | vs = model(input_tfs) 61 | for b in np.arange(vs.size(0)): 62 | embedding_features = vs[b, :, :].data.cpu().numpy() 63 | 64 | z_embds = (embedding_features - 65 | np.mean(embedding_features, axis=0)) / ( 66 | np.std(embedding_features, axis=0) + 10e-8) 67 | 68 | embedding_labels = np.array(k_means_obj.fit_predict( 69 | z_embds)) 70 | 71 | sdr, sir, sar = np_eval.naive_cpu_bss_eval( 72 | embedding_labels, 73 | real_tfs[b].data.numpy(), 74 | imag_tfs[b].data.numpy(), 75 | wavs_lists[b].data.numpy(), 76 | n_sources, 77 | batch_index=b) 78 | 79 | eval_dic['sdr'].append(sdr) 80 | eval_dic['sir'].append(sir) 81 | eval_dic['sar'].append(sar) 82 | 83 | bar.next() 84 | bar.finish() 85 | 86 | # return both mean and std values 87 | 
result_dic = {} 88 | for k, v in eval_dic.items(): 89 | result_dic[k] = np.array(v) 90 | 91 | return result_dic 92 | 93 | 94 | def find_best_model_and_evaluate(args): 95 | 96 | visible_cuda_ids = ','.join(map(str, args.cuda_available_devices)) 97 | os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids 98 | 99 | for result_path in args.results_paths: 100 | (dataset_name, 101 | model_dataset) = (os.path.basename(os.path.dirname( 102 | result_path)).split( 103 | "test_on_")[1], 104 | os.path.basename( 105 | result_path).split( 106 | "train_on_")[1].split(".csv")[0]) 107 | 108 | partition = dataset_name.split('_')[-1] 109 | dataset_dirname = dataset_name.split('_' + partition)[0] 110 | 111 | print(dataset_dirname) 112 | 113 | df = pd.read_csv(result_path) 114 | 115 | mask_types2model_dir = { 116 | 'duet': os.path.join(MODELS_DIR, model_dataset), 117 | 'ground_truth': os.path.join(MODELS_GROUND_TRUTH, 118 | model_dataset), 119 | 'raw_phase_diff': os.path.join(MODELS_RAW_PHASE_DIR, 120 | model_dataset)} 121 | 122 | for mask_type, saved_models_dir in mask_types2model_dir.items(): 123 | mask_df = df.loc[df['training_labels'] == mask_type] 124 | mask_df = mask_df.sort_values(['sdr_mean'], ascending=False) 125 | mask_df.reset_index(drop=True, inplace=True) 126 | 127 | best_model_name = mask_df['Unnamed: 0'].loc[0] 128 | 129 | # construct model path 130 | best_model_p = os.path.join(saved_models_dir, 131 | best_model_name) 132 | 133 | if not os.path.exists(best_model_p): 134 | print(best_model_p) 135 | raise IOError("Model path not found!") 136 | 137 | test_dataset_dir = os.path.join(DATASETS_DIR, 138 | dataset_dirname) 139 | 140 | if not os.path.exists(test_dataset_dir): 141 | print(test_dataset_dir) 142 | raise IOError("Dataset path not found!") 143 | 144 | val_generator, n_val_batches, n_val_sources = \ 145 | data_loader.get_data_generator(test_dataset_dir, 146 | partition=partition, 147 | get_top=args.n_eval, 148 | num_workers=args.n_jobs, 149 | return_stats=False, 150 | return_n_batches=True, 151 | return_n_sources=True, 152 | batch_size=32) 153 | 154 | res = eval(val_generator, 155 | best_model_p, 156 | n_val_sources, 157 | n_val_batches, 158 | args.n_jobs) 159 | 160 | test_on = os.path.basename(dataset_dirname) + '_' + partition 161 | save_folder_name = os.path.join(FINAL_RESULTS_DIR, 162 | 'test_on_' + test_on) 163 | if not os.path.exists(save_folder_name): 164 | os.makedirs(save_folder_name) 165 | 166 | file_path = os.path.join(save_folder_name, 167 | mask_type+'_deep_clustering_metrics.gz') 168 | 169 | pprint(res) 170 | 171 | joblib.dump(res, file_path) 172 | 173 | 174 | def get_args(): 175 | """! 
Command line parser for computing the evaluation for 176 | specific datasets""" 177 | parser = argparse.ArgumentParser(description='Evaluating' 178 | ' SDR SAR and SIR for datasets for the best models') 179 | parser.add_argument("-i", "--results_paths", type=str, nargs='+', 180 | help="Results for datasets", 181 | default=None) 182 | parser.add_argument("--n_jobs", type=int, 183 | help="Number of parallel spawinign jobs", 184 | default=1) 185 | parser.add_argument("-cad", "--cuda_available_devices", type=int, 186 | nargs="+", 187 | help="""A list of Cuda IDs that would be 188 | available for running this experiment""", 189 | default=[2]) 190 | parser.add_argument("--n_eval", type=int, 191 | help="""Reduce the number of evaluation 192 | samples to this number.""", default=None) 193 | return parser.parse_args() 194 | 195 | 196 | if __name__ == "__main__": 197 | args = get_args() 198 | find_best_model_and_evaluate(args) 199 | -------------------------------------------------------------------------------- /spatial_two_mics/data_generator/source_position_generator.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Get some random sampling for the position of two sources 3 | 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of illinois at Urbana Champaign 7 | """ 8 | 9 | import numpy as np 10 | from scipy.spatial import distance as dst 11 | from pprint import pprint 12 | 13 | 14 | class RandomCirclePositioner(object): 15 | """ 16 | ! Returns n_source_pairs positions based on a circle with 17 | specified radius Cartessian and Polar coordinates like follows: 18 | 19 | For each pair of the list we get a dictionary of: 20 | { 21 | 'thetas': angles in rads [<(+x, s1), <(+x, s2)] in list, 22 | 'd_theta': < (+x, s2) - < (+x, s1), 23 | 'xy_positons': [(x_1, y_1), (x_2, y_2)], Cartessian 24 | 'distances': [[||si-mj||]] all precomputed distances 25 | 'taus': time delays in sample format 26 | 'amplitudes': a1 and a2 for: m2(t) = a1*s1(t+d1) + a2*s2(t+d2) 27 | } 28 | (theta_of_source_1, theta_of_source_1) 29 | 30 | 31 | 32 | s2 OOO ooo 33 | OOo (x1, y1) 34 | oOO 35 | oOO s1 36 | oOO 37 | oOO OOo 38 | oOO OOo 39 | oOO OOo 40 | oOO OOo 41 | oOO OOo 42 | oOO m1 <-- mic_distance --> m2 ================>>+x 43 | oOO OOo 44 | oOO OOo 45 | oOO OOo 46 | oOO OOo 47 | oOO OOo 48 | oOO OOo 49 | oO OOo 50 | oOO OOo 51 | oOO OOo 52 | ooo OOO OOO ooo 53 | """ 54 | 55 | def __init__(self, 56 | min_angle=np.pi/18., 57 | angle_sup=np.pi - np.pi/18., 58 | angle_inf=np.pi/18., 59 | radius=10.71, 60 | mic_distance_percentage=0.002, 61 | sound_speed=343, 62 | fs=16000): 63 | """ 64 | :param min_angle: minimum angle in rads for the 2 sources 65 | :param angle_sup or inf: the maximum and minimum values 66 | available for an angle that a source would lie on 67 | :param radius: Radius of the circle in **meters** 68 | :param mic_distance_percentage: Percentage of the radius 69 | corresponding to the distance between the two microphones 70 | :param sound_speed: Default 343 m/s in 20oC room temperature 71 | :param fs: sampling ratio in Hz 72 | """ 73 | 74 | self.min_angle = min_angle 75 | self.angle_sup = angle_sup 76 | self.angle_inf = angle_inf 77 | self.radius = radius 78 | self.mic_distance = self.radius * mic_distance_percentage 79 | # for 16000 hz in order to get maximum +- 1 sample delays we 80 | # have to sustain a distance of maximum: 2.142 cm 81 | # between the mics 82 | self.m1 = (-self.mic_distance / 2, 0.) 
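        # (worked out: a source on the +x axis gives the largest path
        # difference, equal to the mic distance d; in samples that is
        # fs * d / c = 16000 * 0.02142 / 343 ~= 0.999, i.e. about +-1)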
83 | self.m2 = (self.mic_distance / 2, 0.) 84 | self.sound_speed = sound_speed 85 | self.fs = fs 86 | 87 | @staticmethod 88 | def get_cartessian_position(radius, 89 | angle): 90 | return radius * np.cos(angle), radius * np.sin(angle) 91 | 92 | def get_amplifier_values_for_sources(self, 93 | n_sources): 94 | """ 95 | :return: A dictionary of all the amplitudes in order to infer 96 | the final mixture depending on the weighted summation of the 97 | source-signals 98 | """ 99 | alphas = np.random.uniform(low=0.2, 100 | high=1.0, 101 | size=n_sources) 102 | total_amplitude = sum(alphas) 103 | 104 | return dict([("a"+str(i+1), a/total_amplitude) 105 | for (i, a) in enumerate(alphas)]) 106 | 107 | def get_time_delays_for_sources(self, 108 | distances, 109 | n_sources): 110 | # delays are always computed using the m1 microphone as 111 | # reference and comparing to the time delay from m2 112 | 113 | taus_list = [] 114 | for i in np.arange(n_sources): 115 | source = "s"+str(i+1) 116 | taus_list.append(distances[source+"m1"] 117 | - distances[source+"m2"]) 118 | 119 | return [(1. * self.fs * tau) / self.sound_speed 120 | for tau in taus_list] 121 | 122 | def compute_distances_for_sources_and_mics(self, 123 | source_points): 124 | """! si \in source_points must be in format (xi, yi) 125 | \:return a dictionary of all given points""" 126 | points = {"m1": self.m1, "m2": self.m2} 127 | points.update(dict([("s"+str(i+1), xy) 128 | for (i, xy) in enumerate(source_points)])) 129 | distances = {} 130 | 131 | for point_1, xy1 in points.items(): 132 | for point_2, xy2 in points.items(): 133 | distances[point_1+point_2] = dst.euclidean(xy1, xy2) 134 | 135 | return distances 136 | 137 | def get_angles(self, n_source_pairs): 138 | while True: 139 | thetas = np.random.uniform(low=self.angle_inf, 140 | high=self.angle_sup, 141 | size=n_source_pairs) 142 | thetas = sorted(thetas) 143 | d_thetas = [th2 - th1 for (th1, th2) in 144 | zip(thetas[:-1], thetas[1:])] 145 | 146 | min_angle_enforced = np.where(np.abs(d_thetas) < 147 | self.min_angle)[0].shape[0] == 0 148 | 149 | if min_angle_enforced: 150 | break 151 | 152 | return thetas, d_thetas 153 | 154 | def get_sources_locations(self, 155 | n_source_pairs): 156 | """! 157 | Generate the positions, angles and distances for 158 | n_source_pairs of the same mixture corersponding to 2 mics""" 159 | thetas, d_thetas = self.get_angles(n_source_pairs) 160 | xys = [] 161 | for angle in thetas: 162 | xys.append(self.get_cartessian_position(self.radius, angle)) 163 | 164 | distances = self.compute_distances_for_sources_and_mics(xys) 165 | 166 | taus = self.get_time_delays_for_sources(distances, 167 | n_source_pairs) 168 | 169 | mix_amplitudes = self.get_amplifier_values_for_sources( 170 | n_source_pairs) 171 | 172 | sources_locations = {'thetas': np.asarray(thetas), 173 | 'd_thetas': np.asarray(d_thetas), 174 | 'xy_positons': np.asarray(xys), 175 | 'distances': distances, 176 | 'taus': np.asarray(taus), 177 | 'amplitudes': np.asarray(list( 178 | mix_amplitudes.values()))} 179 | 180 | return sources_locations 181 | 182 | 183 | def example_of_usage(): 184 | """ 185 | :return: 186 | {'amplitudes': array([0.28292362, 0.08583346, 0.63124292]), 187 | 'd_thetas': array([1.37373734, 1.76785531]), 188 | 'distances': {'m1m1': 0.0, 189 | 'm1m2': 0.03, 190 | 'm1s1': 3.015, ... 191 | 's3s3': 0.0}, 192 | 'taus': array([ 1, -1, 0]), 193 | 'thetas': array([0. 
, 1.37373734, 3.14159265]), 194 | 'xy_positons': array([[ 3.00000000e+00, 0.00000000e+00], 195 | [ 5.87358252e-01, 2.94193988e+00], 196 | [-3.00000000e+00, 3.67394040e-16]])} 197 | """ 198 | random_positioner = RandomCirclePositioner() 199 | positions_info = random_positioner.get_sources_locations(5) 200 | pprint(positions_info) 201 | 202 | 203 | if __name__ == "__main__": 204 | example_of_usage() 205 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/experiments/sample_convergence_LSTM.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief A simple experiment on how LSTM converge 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of illinois at Urbana Champaign 6 | """ 7 | 8 | import os 9 | import sys 10 | import torch 11 | import time 12 | import numpy as np 13 | import copy 14 | from pprint import pprint 15 | import torch.nn as nn 16 | 17 | root_dir = os.path.join( 18 | os.path.dirname(os.path.realpath(__file__)), 19 | '../../../') 20 | sys.path.insert(0, root_dir) 21 | 22 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc 23 | import spatial_two_mics.dnn.losses.affinity_approximation as \ 24 | affinity_losses 25 | import spatial_two_mics.dnn.utils.dataset as data_generator 26 | import spatial_two_mics.dnn.utils.data_conversions as converters 27 | import spatial_two_mics.dnn.utils.experiment_command_line_parser as \ 28 | parser 29 | import spatial_two_mics.dnn.utils.update_history as update_history 30 | from progress.bar import ChargingBar 31 | import spatial_two_mics.utils.robust_means_clustering as robust_kmeans 32 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as \ 33 | numpy_eval 34 | from sklearn.preprocessing import StandardScaler 35 | 36 | 37 | def train(args, 38 | model, 39 | training_generator, 40 | optimizer, 41 | mean_tr, 42 | std_tr, 43 | epoch, 44 | history, 45 | n_batches): 46 | model.train() 47 | timing_dic = {'Loading batch': 0., 48 | 'Transformations and Forward': 0., 49 | 'Loss Computation and Backprop': 0.} 50 | before = time.time() 51 | bar = ChargingBar("Training for epoch: {}...".format(epoch), 52 | max=n_batches) 53 | for batch_data in training_generator: 54 | (abs_tfs, real_tfs, imag_tfs, 55 | duet_masks, ground_truth_masks, 56 | sources_raw, amplitudes, n_sources) = batch_data 57 | timing_dic['Loading batch'] += time.time() - before 58 | before = time.time() 59 | input_tfs, index_ys = abs_tfs.cuda(), duet_masks.cuda() 60 | # the input sequence is determined by time and not freqs 61 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 62 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 63 | 64 | # normalize with mean and variance from the training dataset 65 | input_tfs -= mean_tr 66 | input_tfs /= std_tr 67 | 68 | index_ys = index_ys.permute(0, 2, 1).contiguous() 69 | 70 | one_hot_ys = converters.one_hot_3Dmasks(index_ys, 71 | n_sources[0]) 72 | 73 | optimizer.zero_grad() 74 | vs = model(input_tfs) 75 | 76 | flatened_ys = one_hot_ys.view(one_hot_ys.size(0), 77 | -1, 78 | one_hot_ys.size(-1)).cuda() 79 | timing_dic['Transformations and Forward'] += time.time() - \ 80 | before 81 | before = time.time() 82 | naive_loss = affinity_losses.naive(vs, flatened_ys) 83 | naive_loss.backward() 84 | optimizer.step() 85 | timing_dic['Loss Computation and Backprop'] += time.time() - \ 86 | before 87 | 88 | update_history.values_update([('loss', naive_loss)], 89 | history, update_mode='batch') 
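        # resetting the timer here means the next 'Loading batch'
        # increment measures (almost) only the generator wait time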
90 | before = time.time() 91 | bar.next() 92 | bar.finish() 93 | 94 | pprint(timing_dic) 95 | 96 | 97 | def eval(args, 98 | model, 99 | val_generator, 100 | mean_tr, 101 | std_tr, 102 | epoch, 103 | history, 104 | n_batches): 105 | timing_dic = {'Loading batch': 0., 106 | 'Transformations and Forward': 0., 107 | 'BSS CPU evaluation': 0., 108 | 'Kmeans evaluation': 0.} 109 | r_kmeans = robust_kmeans.RobustKmeans( 110 | n_true_clusters=args.n_sources, 111 | n_used_clusters=args.n_sources) 112 | z_scaler = StandardScaler() 113 | 114 | # make some evaluation 115 | model.eval() 116 | before = time.time() 117 | with torch.no_grad(): 118 | bar = ChargingBar("Evaluating for epoch: {}...".format(epoch), 119 | max=n_batches) 120 | before = time.time() 121 | for batch_data in val_generator: 122 | (abs_tfs, real_tfs, imag_tfs, 123 | duet_masks, ground_truth_masks, 124 | sources_raw, amplitudes, n_sources) = batch_data 125 | timing_dic['Loading batch'] += time.time() - before 126 | before = time.time() 127 | input_tfs, index_ys = abs_tfs.cuda(), duet_masks.cuda() 128 | # the input sequence is determined by time and not freqs 129 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 130 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 131 | 132 | # normalize with mean and variance from the training dataset 133 | input_tfs -= mean_tr 134 | input_tfs /= std_tr 135 | 136 | vs = model(input_tfs) 137 | for b in np.arange(vs.size(0)): 138 | embedding_features = z_scaler.fit_transform( 139 | vs[b, :, :].data.cpu().numpy()) 140 | 141 | embedding_labels = r_kmeans.fit(embedding_features) 142 | 143 | sdr, sir, sar = numpy_eval.naive_cpu_bss_eval( 144 | embedding_labels, 145 | real_tfs[b].data.numpy(), 146 | imag_tfs[b].data.numpy(), 147 | sources_raw[b].data.numpy(), 148 | n_sources[0].data.numpy()) 149 | 150 | update_history.values_update([('sdr', sdr), 151 | ('sir', sir), 152 | ('sar', sar)], 153 | history, 154 | update_mode='batch') 155 | 156 | before = time.time() 157 | bar.next() 158 | bar.finish() 159 | 160 | 161 | def convergence_of_LSTM(args): 162 | visible_cuda_ids = ','.join(map(str, args.cuda_available_devices)) 163 | os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids 164 | 165 | (training_generator, mean_tr, std_tr, n_tr_batches) = \ 166 | data_generator.get_data_generator(args, 167 | return_stats=True) 168 | 169 | val_args = copy.copy(args) 170 | val_args.partition = 'val' 171 | val_generator, n_val_batches = \ 172 | data_generator.get_data_generator(val_args, 173 | get_top=args.n_eval) 174 | 175 | model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers, 176 | hidden_size=args.hidden_size, 177 | embedding_depth=args.embedding_depth, 178 | bidirectional=args.bidirectional) 179 | model = nn.DataParallel(model).cuda() 180 | 181 | optimizer = torch.optim.Adam(model.parameters(), 182 | lr=args.learning_rate, 183 | betas=(0.9, 0.999)) 184 | 185 | # just iterate over the data 186 | history = {} 187 | for epoch in np.arange(args.epochs): 188 | 189 | train(args, model, training_generator, optimizer, mean_tr, 190 | std_tr, epoch, history, n_tr_batches) 191 | 192 | update_history.values_update([('loss', None)], 193 | history, 194 | update_mode='epoch') 195 | 196 | if epoch % args.evaluate_per == 0: 197 | eval(args, model, val_generator, mean_tr, 198 | std_tr, epoch, history, n_val_batches) 199 | 200 | update_history.values_update([('sdr', None), 201 | ('sir', None), 202 | ('sar', None)], 203 | history, 204 | update_mode='epoch') 205 | 206 | pprint(history['loss'][-1]) 207 | 
pprint(history['sdr'][-1]) 208 | pprint(history['sir'][-1]) 209 | pprint(history['sar'][-1]) 210 | print("BEST SDR: {}, SIR: {}, SAR {}".format(max(history['sdr']), 211 | max(history['sir']), max(history['sar']))) 212 | 213 | 214 | if __name__ == "__main__": 215 | args = parser.get_args() 216 | convergence_of_LSTM(args) 217 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/modules/model_evaluation.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief For a specific dataset just apply the saved models on a 3 | specific dataset and save the results 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of illinois at Urbana Champaign 7 | """ 8 | 9 | import argparse 10 | import os 11 | import sys 12 | import numpy as np 13 | from pprint import pprint 14 | from joblib import Parallel, delayed 15 | from tqdm import tqdm 16 | import torch 17 | import itertools 18 | import pandas as pd 19 | from progress.bar import ChargingBar 20 | 21 | root_dir = os.path.join( 22 | os.path.dirname(os.path.realpath(__file__)), 23 | '../../../') 24 | sys.path.insert(0, root_dir) 25 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader 26 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval 27 | import spatial_two_mics.dnn.utils.model_logger as model_logger 28 | from spatial_two_mics.config import RESULTS_DIR 29 | from sklearn.cluster import KMeans 30 | from spatial_two_mics.utils import robust_means_clustering as \ 31 | robust_kmeans 32 | 33 | 34 | def eval(dataset_gen, 35 | model_path, 36 | n_sources, 37 | n_batches, 38 | n_jobs): 39 | 40 | model_name = os.path.basename(model_path) 41 | 42 | eval_dic = {'sdr': [], 'sir': [], 'sar': []} 43 | 44 | model, optimizer, _, _, args, mean_tr, std_tr, training_labels = \ 45 | model_logger.load_and_create_the_model(model_path) 46 | 47 | k_means_obj = KMeans(n_clusters=n_sources, n_jobs=n_jobs) 48 | 49 | model.eval() 50 | with torch.no_grad(): 51 | bar = ChargingBar("Evaluating model {} ...".format(model_name), 52 | max=n_batches) 53 | for batch_data in dataset_gen: 54 | abs_tfs, wavs_lists, real_tfs, imag_tfs = batch_data 55 | input_tfs = abs_tfs.cuda() 56 | # the input sequence is determined by time and not freqs 57 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 58 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 59 | 60 | # normalize with mean and variance from the training dataset 61 | input_tfs -= mean_tr 62 | input_tfs /= std_tr 63 | 64 | vs = model(input_tfs) 65 | for b in np.arange(vs.size(0)): 66 | embedding_features = vs[b, :, :].data.cpu().numpy() 67 | 68 | z_embds = (embedding_features - 69 | np.mean(embedding_features, axis=0)) / ( 70 | np.std(embedding_features, axis=0) + 10e-8) 71 | 72 | embedding_labels = np.array(k_means_obj.fit_predict( 73 | z_embds)) 74 | 75 | sdr, sir, sar = np_eval.naive_cpu_bss_eval( 76 | embedding_labels, 77 | real_tfs[b].data.numpy(), 78 | imag_tfs[b].data.numpy(), 79 | wavs_lists[b].data.numpy(), 80 | n_sources, 81 | batch_index=b) 82 | 83 | eval_dic['sdr'].append(sdr) 84 | eval_dic['sir'].append(sir) 85 | eval_dic['sar'].append(sar) 86 | 87 | bar.next() 88 | bar.finish() 89 | 90 | # return both mean and std values 91 | mean_std_dic = {} 92 | for k, v in eval_dic.items(): 93 | mean_std_dic[k+"_mean"] = np.mean(np.array(v)) 94 | mean_std_dic[k+"_std"] = np.std(np.array(v)) 95 | mean_std_dic['hidden_size'] = args.hidden_size 96 | 
mean_std_dic['num_layers'] = args.n_layers 97 | mean_std_dic['embedding_depth'] = args.embedding_depth 98 | mean_std_dic['dropout'] = str(args.dropout) 99 | mean_std_dic['lr'] = args.learning_rate 100 | mean_std_dic['training_labels'] = training_labels 101 | 102 | return model_name, mean_std_dic 103 | 104 | 105 | def evaluate_models(pretrained_models, 106 | dataset_folder, 107 | n_jobs=1, 108 | get_top=None, 109 | batch_size=32): 110 | 111 | visible_cuda_ids = ','.join(map(str, args.cuda_available_devices)) 112 | os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids 113 | 114 | (dataset_dir, partition) = (os.path.dirname(dataset_folder), 115 | os.path.basename(dataset_folder)) 116 | 117 | default_bs = batch_size 118 | if get_top is None: 119 | loading_bs = default_bs 120 | else: 121 | loading_bs = min(default_bs, get_top) 122 | 123 | print("Initializing the data loader for the dataset...") 124 | val_generator, n_val_batches, n_val_sources = \ 125 | data_loader.get_data_generator(dataset_dir, 126 | partition=partition, 127 | get_top=get_top, 128 | num_workers=n_jobs, 129 | return_stats=False, 130 | return_n_batches=True, 131 | return_n_sources=True, 132 | batch_size=loading_bs) 133 | 134 | eval_results = {} 135 | for model_path in sorted(pretrained_models): 136 | 137 | try: 138 | test_on = os.path.basename(dataset_dir) + '_' + partition 139 | train_on = os.path.basename(os.path.dirname(model_path)) 140 | folder_name = os.path.join(RESULTS_DIR, 141 | 'test_on_' + test_on) 142 | if not os.path.exists(folder_name): 143 | os.makedirs(folder_name) 144 | 145 | # if the df already exists then do not make the evaluation 146 | df_path = os.path.join(folder_name, 147 | 'train_on_' + train_on + '.csv') 148 | if os.path.exists(df_path): 149 | df = pd.read_csv(df_path) 150 | df.set_index("Unnamed: 0", drop=True, inplace=True) 151 | eval_results = df.to_dict(orient='index') 152 | 153 | if os.path.basename(model_path) in eval_results.keys(): 154 | continue 155 | 156 | model_name, res = eval(val_generator, 157 | model_path, 158 | n_val_sources, 159 | n_val_batches, 160 | n_jobs) 161 | 162 | print(model_name) 163 | print(res) 164 | 165 | eval_results[model_name] = res 166 | 167 | df = pd.DataFrame(eval_results).T 168 | df = df.sort_values(['sdr_mean'], ascending=False) 169 | df.to_csv(df_path) 170 | except Exception as e: 171 | print(e) 172 | 173 | return df 174 | 175 | 176 | def get_args(): 177 | """! 
Command line parser for computing the evaluation for 178 | specific datasets""" 179 | parser = argparse.ArgumentParser(description='Evaluating' 180 | ' stored models for a specific dataset') 181 | parser.add_argument("-d", "--dataset_to_test", type=str, 182 | help="Dataset path you want to evaluate", 183 | default=None) 184 | parser.add_argument("-m", "--pretrained_models", type=str, 185 | nargs='+', 186 | help="Paths of pretrained models that you " 187 | "need to test on this dataset", 188 | default=[]) 189 | parser.add_argument("--n_jobs", type=int, 190 | help="Number of parallel spawning jobs", 191 | default=1) 192 | parser.add_argument("-bs", "--batch_size", type=int, 193 | help="Batch size to be evaluated", 194 | default=32) 195 | parser.add_argument("--n_eval", type=int, 196 | help="""Reduce the number of evaluation 197 | samples to this number.""", default=None) 198 | parser.add_argument("-cad", "--cuda_available_devices", type=int, 199 | nargs="+", 200 | help="""A list of Cuda IDs that would be 201 | available for running this experiment""", 202 | default=[0]) 203 | return parser.parse_args() 204 | 205 | 206 | if __name__ == "__main__": 207 | args = get_args() 208 | df_results = evaluate_models(args.pretrained_models, 209 | args.dataset_to_test, 210 | n_jobs=args.n_jobs, 211 | get_top=args.n_eval, 212 | batch_size=args.batch_size) 213 | 214 | pd.set_option('display.expand_frame_repr', False) 215 | print(df_results.sort_values(['sdr_mean'], ascending=False)) 216 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/experiments/simple_LSTM_encoder.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief A simple experiment on how models, losses, etc should be used 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of illinois at Urbana Champaign 6 | """ 7 | 8 | import argparse 9 | import os 10 | import sys 11 | import torch 12 | import time 13 | import numpy as np 14 | import copy 15 | from pprint import pprint 16 | from torch.utils.data import DataLoader 17 | 18 | root_dir = os.path.join( 19 | os.path.dirname(os.path.realpath(__file__)), 20 | '../../../') 21 | sys.path.insert(0, root_dir) 22 | 23 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc 24 | import spatial_two_mics.dnn.losses.affinity_approximation as \ 25 | affinity_losses 26 | import spatial_two_mics.dnn.utils.dataset as data_generator 27 | import spatial_two_mics.dnn.utils.data_conversions as converters 28 | 29 | 30 | def check_device_model_loading(model): 31 | device = 0 32 | print(torch.cuda.get_device_capability(device=device)) 33 | print(torch.cuda.memory_allocated(device=device)) 34 | print(torch.cuda.memory_cached(device=device)) 35 | 36 | model = model.cuda() 37 | print(torch.cuda.get_device_properties(device=device).total_memory) 38 | print(torch.cuda.memory_allocated(device)) 39 | print(torch.cuda.memory_cached(device)) 40 | 41 | temp_model = copy.deepcopy(model) 42 | temp_model = temp_model.cuda() 43 | print(torch.cuda.max_memory_cached(device=device)) 44 | print(torch.cuda.memory_allocated(device)) 45 | print(torch.cuda.memory_cached(device)) 46 | 47 | 48 | def compare_losses(vs, one_hot_ys): 49 | timing_dic = {} 50 | 51 | before = time.time() 52 | flatened_ys = one_hot_ys.view(one_hot_ys.size(0), 53 | -1, 54 | one_hot_ys.size(-1)).cuda() 55 | naive_loss = affinity_losses.naive(vs, flatened_ys) 56 | now = time.time() 57 | timing_dic['Naive Loss Implementation'] = 
now - before
58 | 
59 |     before = time.time()
60 |     expanded_vs = vs.view(vs.size(0), one_hot_ys.size(1),
61 |                           one_hot_ys.size(2), vs.size(-1)).cuda()
62 |     diagonal_loss = affinity_losses.diagonal(expanded_vs,
63 |                                              one_hot_ys)
64 |     now = time.time()
65 |     timing_dic['Diagonal Loss Implementation'] = now - before
66 | 
67 |     pprint(timing_dic)
68 | 
69 |     return diagonal_loss
70 | 
71 | 
72 | def example_of_usage(args):
73 | 
74 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
75 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
76 |     print(visible_cuda_ids)
77 |     print(torch.cuda.current_device())
78 | 
79 |     training_generator, n_batches = data_generator.get_data_generator(
80 |         args)
81 |     timing_dic = {}
82 | 
83 |     before = time.time()
84 |     model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers,
85 |                                   hidden_size=args.hidden_size,
86 |                                   embedding_depth=args.embedding_depth,
87 |                                   bidirectional=args.bidirectional)
88 |     timing_dic['Initializing model'] = time.time() - before
89 |     model = model.cuda()
90 |     timing_dic['Transferring model to device'] = time.time() - before  # cumulative: includes initialization
91 | 
92 |     optimizer = torch.optim.Adam(model.parameters(),
93 |                                  lr=args.learning_rate,
94 |                                  betas=(0.9, 0.999))
95 | 
96 |     # just iterate over the data
97 |     epochs = 10
98 |     for epoch in np.arange(epochs):
99 |         print("Training for epoch: {}...".format(epoch))
100 |         for batch_data in training_generator:
101 | 
102 |             (abs_tfs, real_tfs, imag_tfs,
103 |              duet_masks, ground_truth_masks,
104 |              sources_raw, amplitudes, n_sources) = batch_data
105 | 
106 |             input_tfs, index_ys = abs_tfs.cuda(), duet_masks.cuda()
107 |             # the input sequence is determined by time and not freqs
108 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
109 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
110 |             index_ys = index_ys.permute(0, 2, 1).contiguous()
111 | 
112 |             one_hot_ys = converters.one_hot_3Dmasks(index_ys, n_sources[0])
113 | 
114 |             timing_dic = {}
115 | 
116 |             optimizer.zero_grad()
117 |             vs = model(input_tfs)
118 | 
119 |             before = time.time()
120 |             flatened_ys = one_hot_ys.view(one_hot_ys.size(0),
121 |                                           -1,
122 |                                           one_hot_ys.size(-1)).cuda()
123 |             naive_loss = affinity_losses.naive(vs, flatened_ys)
124 |             naive_loss.backward()
125 |             optimizer.step()
126 |             now = time.time()
127 |             print("Naive Loss: {}".format(naive_loss))
128 |             timing_dic['Naive Loss Implementation Time'] = now - before
129 | 
130 |             optimizer.zero_grad()
131 |             vs = model(input_tfs)
132 | 
133 |             before = time.time()
134 |             expanded_vs = vs.view(vs.size(0), one_hot_ys.size(1),
135 |                                   one_hot_ys.size(2), vs.size(-1)).cuda()
136 |             diagonal_loss = affinity_losses.diagonal(expanded_vs,
137 |                                                      one_hot_ys)
138 |             diagonal_loss.backward()
139 |             optimizer.step()
140 |             now = time.time()
141 |             print("Diagonal Loss: {}".format(diagonal_loss))
142 |             timing_dic['Diagonal Loss Implementation Time'] = now - before
143 | 
144 |             pprint(timing_dic)
145 | 
146 | 
147 | 
148 | def get_args():
149 |     """!
Command line parser """ 150 | parser = argparse.ArgumentParser(description='Deep Clustering for ' 151 | 'Audio Source ' 152 | 'Separation ' 153 | 'Experiment') 154 | parser.add_argument("--dataset", type=str, 155 | help="Dataset name", 156 | default="timit") 157 | parser.add_argument("--n_sources", type=int, 158 | help="How many sources in each mix", 159 | default=2) 160 | parser.add_argument("--n_samples", type=int, nargs='+', 161 | help="How many samples do you want to be " 162 | "created for train test val", 163 | default=[256, 64, 128]) 164 | parser.add_argument("--genders", type=str, nargs='+', 165 | help="Genders that will correspond to the " 166 | "genders in the mixtures", 167 | default=['m']) 168 | parser.add_argument("-f", "--force_delays", nargs='+', type=int, 169 | help="""Whether you want to force integer 170 | delays of +- 1 in the sources""", 171 | default=[-1, 1]) 172 | parser.add_argument("-nl", "--n_layers", type=int, 173 | help="""The number of layers of the LSTM 174 | encoder""", default=2) 175 | parser.add_argument("-ed", "--embedding_depth", type=int, 176 | help="""The depth of the embedding""", 177 | default=10) 178 | parser.add_argument("-hs", "--hidden_size", type=int, 179 | help="""The size of the LSTM cells """, 180 | default=10) 181 | parser.add_argument("-bs", "--batch_size", type=int, 182 | help="""The number of samples in each batch""", 183 | default=64) 184 | parser.add_argument("-name", "--experiment_name", type=str, 185 | help="""The name or identifier of this 186 | experiment""", 187 | default='A sample experiment') 188 | parser.add_argument("-cad", "--cuda_available_devices", type=int, 189 | nargs="+", 190 | help="""A list of Cuda IDs that would be 191 | available for running this experiment""", 192 | default=[0]) 193 | parser.add_argument("--num_workers", type=int, 194 | help="""The number of cpu workers for 195 | loading the data, etc.""", default=3) 196 | parser.add_argument("-lr", "--learning_rate", type=float, 197 | help="""Initial Learning rate""", default=1e-1) 198 | parser.add_argument("--bidirectional", action='store_true', 199 | help="""Bidirectional or not""") 200 | 201 | return parser.parse_args() 202 | 203 | 204 | if __name__ == "__main__": 205 | args = get_args() 206 | example_of_usage(args) -------------------------------------------------------------------------------- /spatial_two_mics/dnn/experiments/check_overfitting.py: -------------------------------------------------------------------------------- 1 | """!
2 | @brief Using the fast version of the dataset generator, provide a 3 | naive experimental setup for checking the capability of the model to 4 | overfit on a set of data 5 | 6 | @author Efthymios Tzinis {etzinis2@illinois.edu} 7 | @copyright University of illinois at Urbana Champaign 8 | """ 9 | 10 | import os 11 | import sys 12 | import torch 13 | import time 14 | import numpy as np 15 | import copy 16 | from pprint import pprint 17 | import torch.nn as nn 18 | 19 | root_dir = os.path.join( 20 | os.path.dirname(os.path.realpath(__file__)), 21 | '../../../') 22 | sys.path.insert(0, root_dir) 23 | 24 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc 25 | import spatial_two_mics.dnn.losses.affinity_approximation as \ 26 | affinity_losses 27 | import spatial_two_mics.dnn.utils.fast_dataset_v2 as fast_data_gen 28 | import spatial_two_mics.dnn.utils.data_conversions as converters 29 | import spatial_two_mics.dnn.utils.experiment_command_line_parser as \ 30 | parser 31 | import spatial_two_mics.dnn.utils.update_history as update_history 32 | from progress.bar import ChargingBar 33 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as \ 34 | numpy_eval 35 | from sklearn.cluster import KMeans 36 | 37 | 38 | def train(args, 39 | model, 40 | training_generator, 41 | optimizer, 42 | mean_tr, 43 | std_tr, 44 | epoch, 45 | history, 46 | n_batches): 47 | model.train() 48 | timing_dic = {'Loading batch': 0., 49 | 'Transformations and Forward': 0., 50 | 'Loss Computation and Backprop': 0.} 51 | before = time.time() 52 | bar = ChargingBar("Training for epoch: {}...".format(epoch), 53 | max=n_batches) 54 | for batch_data in training_generator: 55 | abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data 56 | timing_dic['Loading batch'] += time.time() - before 57 | before = time.time() 58 | input_tfs, index_ys = abs_tfs.cuda(), masks.cuda() 59 | # the input sequence is determined by time and not freqs 60 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 61 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 62 | index_ys = index_ys.permute(0, 2, 1).contiguous() 63 | 64 | # normalize with mean and variance from the training dataset 65 | input_tfs -= mean_tr 66 | input_tfs /= std_tr 67 | 68 | # index_ys = index_ys.permute(0, 2, 1).contiguous() 69 | one_hot_ys = converters.one_hot_3Dmasks(index_ys, 70 | args.n_sources) 71 | 72 | optimizer.zero_grad() 73 | vs = model(input_tfs) 74 | 75 | flatened_ys = one_hot_ys.view(one_hot_ys.size(0), 76 | -1, 77 | one_hot_ys.size(-1)).cuda() 78 | 79 | timing_dic['Transformations and Forward'] += time.time() - \ 80 | before 81 | before = time.time() 82 | loss = affinity_losses.paris_naive(vs, flatened_ys) 83 | # loss = affinity_losses.diagonal(vs.view(vs.size(0), 84 | # one_hot_ys.size(1), 85 | # one_hot_ys.size(2), 86 | # vs.size(-1)), 87 | # one_hot_ys.cuda()) 88 | 89 | loss.backward() 90 | nn.utils.clip_grad_norm_(model.parameters(), 100.)
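# A minimal illustrative sketch (hypothetical helper, not part of this
# repo): `clip_grad_norm_` -- the in-place successor of the deprecated
# `clip_grad_norm` -- rescales all gradients so that their joint L2 norm is
# at most max_norm (100. here). Assuming `params` is an iterable of tensors
# whose .grad fields were already populated by backward():

import torch

def clip_by_total_norm(params, max_norm, eps=1e-6):
    grads = [p.grad for p in params if p.grad is not None]
    # joint L2 norm over every gradient element of every parameter
    total_norm = torch.norm(torch.stack([g.norm(2) for g in grads]), 2)
    scale = max_norm / (total_norm + eps)
    if scale < 1:  # only shrink large gradients, never amplify small ones
        for g in grads:
            g.mul_(scale)
    return total_norm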
91 | optimizer.step() 92 | timing_dic['Loss Computation and Backprop'] += time.time() - \ 93 | before 94 | 95 | update_history.values_update([('loss', loss)], 96 | history, update_mode='batch') 97 | before = time.time() 98 | bar.next() 99 | bar.finish() 100 | 101 | pprint(timing_dic) 102 | 103 | 104 | def eval(args, 105 | model, 106 | val_generator, 107 | mean_tr, 108 | std_tr, 109 | epoch, 110 | history, 111 | n_batches, 112 | k_means_obj): 113 | timing_dic = {'Standard Scaler': 0., 114 | 'Kmeans': 0., 115 | 'Dummy BSS evaluation': 0.} 116 | 117 | # make some evaluation 118 | model.eval() 119 | before = time.time() 120 | with torch.no_grad(): 121 | bar = ChargingBar("Evaluating for epoch: {}...".format(epoch), 122 | max=n_batches) 123 | before = time.time() 124 | for batch_data in val_generator: 125 | abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data 126 | input_tfs = abs_tfs.cuda() 127 | # the input sequence is determined by time and not freqs 128 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 129 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 130 | 131 | # normalize with mean and variance from the training dataset 132 | input_tfs -= mean_tr 133 | input_tfs /= std_tr 134 | 135 | vs = model(input_tfs) 136 | for b in np.arange(vs.size(0)): 137 | 138 | # possibly go into GPU ? 139 | # before = time.time() 140 | # embedding_features = z_scaler.fit_transform( 141 | # vs[b, :, :].data.cpu().numpy()) 142 | # timing_dic['Standard Scaler'] += time.time() - before 143 | 144 | embedding_features = vs[b, :, :].data.cpu().numpy() 145 | # embedding_features = masks[b, :, :].view(-1, 1).data.numpy() 146 | # embedding_labels = masks[b].data.numpy() 147 | # embedding_features = flatened_ys[b, :, :].data.cpu().numpy() 148 | 149 | 150 | 151 | # possibly perform kmeans on GPU? 152 | before = time.time() 153 | embedding_labels = np.array(k_means_obj.fit_predict( 154 | embedding_features)) 155 | timing_dic['Kmeans'] += time.time() - before 156 | 157 | # possibly do it on GPU? 
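# `naive_cpu_bss_eval` is implemented in
# dnn/evaluation/naive_evaluation_numpy.py. As a rough mental model only, an
# SDR-style metric scores each reference source against its estimate and
# keeps the best speaker permutation, since K-means assigns cluster ids in
# arbitrary order. `permutation_sdr` below is a hypothetical sketch, not the
# repo's implementation:

import numpy as np
from itertools import permutations

def permutation_sdr(est, ref, eps=1e-12):
    # est, ref: arrays of shape (n_sources, n_samples)
    best = -np.inf
    for perm in permutations(range(ref.shape[0])):
        sdrs = [10.0 * np.log10(np.sum(r ** 2)
                                / (np.sum((r - e) ** 2) + eps) + eps)
                for r, e in zip(ref, est[list(perm)])]
        best = max(best, float(np.mean(sdrs)))
    return best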
158 | before = time.time() 159 | sdr, sir, sar = numpy_eval.naive_cpu_bss_eval( 160 | embedding_labels, 161 | real_tfs[b].data.numpy(), 162 | imag_tfs[b].data.numpy(), 163 | wavs_lists[b].data.numpy(), 164 | args.n_sources, 165 | batch_index=b) 166 | timing_dic['Dummy BSS evaluation'] += time.time() - before 167 | 168 | update_history.values_update([('sdr', sdr), 169 | ('sir', sir), 170 | ('sar', sar)], 171 | history, 172 | update_mode='batch') 173 | 174 | bar.next() 175 | pprint(timing_dic) 176 | bar.finish() 177 | 178 | 179 | def convergence_of_LSTM(args): 180 | visible_cuda_ids = ','.join(map(str, args.cuda_available_devices)) 181 | os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids 182 | 183 | val_args = copy.copy(args) 184 | val_args.partition = 'val' 185 | val_generator, mean_val, std_val, n_val_batches = \ 186 | fast_data_gen.get_data_generator(val_args, 187 | return_stats=True, 188 | get_top=args.n_eval) 189 | 190 | model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers, 191 | hidden_size=args.hidden_size, 192 | embedding_depth=args.embedding_depth, 193 | bidirectional=args.bidirectional) 194 | model = nn.DataParallel(model).cuda() 195 | 196 | optimizer = torch.optim.Adam(model.parameters(), 197 | lr=args.learning_rate, 198 | betas=(0.9, 0.999)) 199 | 200 | k_means_obj = KMeans(n_clusters=2) 201 | # just iterate over the data 202 | history = {} 203 | for epoch in np.arange(args.epochs): 204 | 205 | train(args, model, val_generator, optimizer, mean_val, 206 | std_val, epoch, history, n_val_batches) 207 | 208 | update_history.values_update([('loss', None)], 209 | history, 210 | update_mode='epoch') 211 | 212 | 213 | if epoch % args.evaluate_per == 0: 214 | eval(args, model, val_generator, mean_val, 215 | std_val, epoch, history, n_val_batches, k_means_obj) 216 | 217 | update_history.values_update([('sdr', None), 218 | ('sir', None), 219 | ('sar', None)], 220 | history, 221 | update_mode='epoch') 222 | 223 | pprint(history['loss'][-1]) 224 | pprint(history['sdr'][-1]) 225 | pprint(history['sir'][-1]) 226 | pprint(history['sar'][-1]) 227 | print( 228 | "BEST SDR: {}, SIR: {}, SAR {}".format(max(history['sdr']), 229 | max(history['sir']), 230 | max(history['sar']))) 231 | 232 | 233 | if __name__ == "__main__": 234 | args = parser.get_args() 235 | convergence_of_LSTM(args) 236 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/experiments/convergence_check_v2.py: -------------------------------------------------------------------------------- 1 | """! 
2 | @brief Using the fast version of the dataset generator, provide a 3 | naive experimental setup for checking the convergence of the model 4 | 5 | @author Efthymios Tzinis {etzinis2@illinois.edu} 6 | @copyright University of illinois at Urbana Champaign 7 | """ 8 | 9 | import os 10 | import sys 11 | import torch 12 | import time 13 | import numpy as np 14 | import copy 15 | from pprint import pprint 16 | import torch.nn as nn 17 | 18 | root_dir = os.path.join( 19 | os.path.dirname(os.path.realpath(__file__)), 20 | '../../../') 21 | sys.path.insert(0, root_dir) 22 | 23 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc 24 | import spatial_two_mics.dnn.losses.affinity_approximation as \ 25 | affinity_losses 26 | import spatial_two_mics.dnn.utils.fast_dataset_v2 as fast_data_gen 27 | import spatial_two_mics.dnn.utils.data_conversions as converters 28 | import spatial_two_mics.dnn.utils.experiment_command_line_parser as \ 29 | parser 30 | import spatial_two_mics.dnn.utils.update_history as update_history 31 | from progress.bar import ChargingBar 32 | import spatial_two_mics.utils.robust_means_clustering as robust_kmeans 33 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as \ 34 | numpy_eval 35 | from sklearn.preprocessing import StandardScaler 36 | from sklearn.cluster import KMeans 37 | import librosa 38 | 39 | 40 | def train(args, 41 | model, 42 | training_generator, 43 | optimizer, 44 | mean_tr, 45 | std_tr, 46 | epoch, 47 | history, 48 | n_batches): 49 | model.train() 50 | timing_dic = {'Loading batch': 0., 51 | 'Transformations and Forward': 0., 52 | 'Loss Computation and Backprop': 0.} 53 | before = time.time() 54 | bar = ChargingBar("Training for epoch: {}...".format(epoch), 55 | max=n_batches) 56 | for batch_data in training_generator: 57 | (abs_tfs, masks) = batch_data 58 | timing_dic['Loading batch'] += time.time() - before 59 | before = time.time() 60 | input_tfs, index_ys = abs_tfs.cuda(), masks.cuda() 61 | # the input sequence is determined by time and not freqs 62 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 63 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 64 | index_ys = index_ys.permute(0, 2, 1).contiguous() 65 | 66 | # normalize with mean and variance from the training dataset 67 | input_tfs -= mean_tr 68 | input_tfs /= std_tr 69 | 70 | # index_ys = index_ys.permute(0, 2, 1).contiguous() 71 | one_hot_ys = converters.one_hot_3Dmasks(index_ys, 72 | args.n_sources) 73 | 74 | optimizer.zero_grad() 75 | vs = model(input_tfs) 76 | 77 | flatened_ys = one_hot_ys.view(one_hot_ys.size(0), 78 | -1, 79 | one_hot_ys.size(-1)).cuda() 80 | 81 | timing_dic['Transformations and Forward'] += time.time() - \ 82 | before 83 | before = time.time() 84 | loss = affinity_losses.paris_naive(vs, flatened_ys) 85 | # loss = affinity_losses.diagonal(vs.view(vs.size(0), 86 | # one_hot_ys.size(1), 87 | # one_hot_ys.size(2), 88 | # vs.size(-1)), 89 | # one_hot_ys.cuda()) 90 | 91 | loss.backward() 92 | nn.utils.clip_grad_norm_(model.parameters(), 100.)
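# `paris_naive` lives in dnn/losses/affinity_approximation.py. Generically,
# a deep-clustering affinity loss compares the embedding self-affinity V V^T
# against the label self-affinity Y Y^T without materializing the huge
# (TF x TF) matrices, using the identity
#   ||V V^T - Y Y^T||_F^2 = ||V^T V||_F^2 - 2 ||V^T Y||_F^2 + ||Y^T Y||_F^2.
# A sketch of that standard objective (not necessarily the repo's exact code):

import torch

def deep_clustering_loss(v, y):
    # v: (batch, n_tf_bins, embedding_depth) embeddings
    # y: (batch, n_tf_bins, n_sources) one-hot labels as floats
    def sq_frob(a):
        return (a * a).sum(dim=(1, 2))
    vtv = torch.bmm(v.transpose(1, 2), v)  # (batch, emb, emb)
    vty = torch.bmm(v.transpose(1, 2), y)  # (batch, emb, n_sources)
    yty = torch.bmm(y.transpose(1, 2), y)  # (batch, n_sources, n_sources)
    return (sq_frob(vtv) - 2. * sq_frob(vty) + sq_frob(yty)).mean()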
93 | optimizer.step() 94 | timing_dic['Loss Computation and Backprop'] += time.time() - \ 95 | before 96 | 97 | update_history.values_update([('loss', loss)], 98 | history, update_mode='batch') 99 | before = time.time() 100 | bar.next() 101 | bar.finish() 102 | 103 | pprint(timing_dic) 104 | 105 | 106 | def eval(args, 107 | model, 108 | val_generator, 109 | mean_tr, 110 | std_tr, 111 | epoch, 112 | history, 113 | n_batches, 114 | k_means_obj): 115 | timing_dic = {'Standard Scaler': 0., 116 | 'Kmeans': 0., 117 | 'Dummy BSS evaluation': 0.} 118 | 119 | # make some evaluation 120 | model.eval() 121 | before = time.time() 122 | with torch.no_grad(): 123 | bar = ChargingBar("Evaluating for epoch: {}...".format(epoch), 124 | max=n_batches) 125 | before = time.time() 126 | for batch_data in val_generator: 127 | abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data 128 | input_tfs = abs_tfs.cuda() 129 | # the input sequence is determined by time and not freqs 130 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 131 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 132 | 133 | # normalize with mean and variance from the training dataset 134 | input_tfs -= mean_tr 135 | input_tfs /= std_tr 136 | 137 | vs = model(input_tfs) 138 | for b in np.arange(vs.size(0)): 139 | 140 | # possibly go into GPU ? 141 | # before = time.time() 142 | # embedding_features = z_scaler.fit_transform( 143 | # vs[b, :, :].data.cpu().numpy()) 144 | # timing_dic['Standard Scaler'] += time.time() - before 145 | 146 | embedding_features = vs[b, :, :].data.cpu().numpy() 147 | # embedding_features = masks[b, :, :].view(-1, 1).data.numpy() 148 | # embedding_labels = masks[b].data.numpy() 149 | # embedding_features = flatened_ys[b, :, :].data.cpu().numpy() 150 | 151 | 152 | 153 | # possibly perform kmeans on GPU? 154 | before = time.time() 155 | embedding_labels = np.array(k_means_obj.fit_predict( 156 | embedding_features)) 157 | timing_dic['Kmeans'] += time.time() - before 158 | 159 | # possibly do it on GPU? 
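# Conceptually, the K-means labels computed above become binary TF masks
# that are applied to the complex mixture spectrogram before any
# waveform-domain BSS scoring. `labels_to_masked_spectra` is a hypothetical
# helper; the reshape is assumed to invert the flattening of the embeddings,
# which is not checked here:

import numpy as np

def labels_to_masked_spectra(labels, real_tf, imag_tf, n_sources):
    mix_tf = real_tf + 1j * imag_tf            # complex mixture spectrogram
    label_grid = labels.reshape(mix_tf.shape)  # one cluster id per TF bin
    return [(label_grid == k) * mix_tf for k in range(n_sources)]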
160 | before = time.time() 161 | sdr, sir, sar = numpy_eval.naive_cpu_bss_eval( 162 | embedding_labels, 163 | real_tfs[b].data.numpy(), 164 | imag_tfs[b].data.numpy(), 165 | wavs_lists[b].data.numpy(), 166 | args.n_sources, 167 | batch_index=b) 168 | timing_dic['Dummy BSS evaluation'] += time.time() - before 169 | 170 | update_history.values_update([('sdr', sdr), 171 | ('sir', sir), 172 | ('sar', sar)], 173 | history, 174 | update_mode='batch') 175 | 176 | bar.next() 177 | pprint(timing_dic) 178 | bar.finish() 179 | 180 | 181 | def convergence_of_LSTM(args): 182 | visible_cuda_ids = ','.join(map(str, args.cuda_available_devices)) 183 | os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids 184 | 185 | (training_generator, mean_tr, std_tr, n_tr_batches) = \ 186 | fast_data_gen.get_data_generator(args, 187 | return_stats=True) 188 | 189 | val_args = copy.copy(args) 190 | val_args.partition = 'val' 191 | val_generator, n_val_batches = \ 192 | fast_data_gen.get_data_generator(val_args, 193 | get_top=args.n_eval) 194 | 195 | model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers, 196 | hidden_size=args.hidden_size, 197 | embedding_depth=args.embedding_depth, 198 | bidirectional=args.bidirectional) 199 | model = nn.DataParallel(model).cuda() 200 | 201 | optimizer = torch.optim.Adam(model.parameters(), 202 | lr=args.learning_rate, 203 | betas=(0.9, 0.999)) 204 | 205 | k_means_obj = KMeans(n_clusters=2) 206 | # just iterate over the data 207 | history = {} 208 | for epoch in np.arange(args.epochs): 209 | 210 | train(args, model, training_generator, optimizer, mean_tr, 211 | std_tr, epoch, history, n_tr_batches) 212 | 213 | update_history.values_update([('loss', None)], 214 | history, 215 | update_mode='epoch') 216 | 217 | 218 | if epoch % args.evaluate_per == 0: 219 | eval(args, model, val_generator, mean_tr, 220 | std_tr, epoch, history, n_val_batches, k_means_obj) 221 | 222 | update_history.values_update([('sdr', None), 223 | ('sir', None), 224 | ('sar', None)], 225 | history, 226 | update_mode='epoch') 227 | 228 | pprint(history['loss'][-1]) 229 | pprint(history['sdr'][-1]) 230 | pprint(history['sir'][-1]) 231 | pprint(history['sar'][-1]) 232 | print( 233 | "BEST SDR: {}, SIR: {}, SAR {}".format(max(history['sdr']), 234 | max(history['sir']), 235 | max(history['sar']))) 236 | 237 | 238 | if __name__ == "__main__": 239 | args = parser.get_args() 240 | convergence_of_LSTM(args) 241 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/fast_dataset_v3.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief A dataset creation which is compatible with pytorch framework 3 | and much faster in loading time, since this new version loads 4 | only the appropriate files that might be needed. Moreover 5 | this dataset has minimal input argument requirements in order to be 6 | more user friendly. 7 | 8 | @author Efthymios Tzinis {etzinis2@illinois.edu} 9 | @copyright University of illinois at Urbana Champaign 10 | """ 11 | 12 | import os 13 | import glob2 14 | import numpy as np 15 | from sklearn.externals import joblib 16 | from torch.utils.data import Dataset, DataLoader 17 | 18 | 19 | class PytorchMixtureDataset(Dataset): 20 | """ 21 | This is a general compatible class for pytorch datasets. 22 | 23 | @note Each instance of the dataset should be stored using 24 | joblib.dump() and this is the way that it would be returned. 25 | After some transformations.
26 | 27 | The path of all datasets should be defined inside config. 28 | All datasets should be formatted with appropriate subfolders of 29 | train / test and val and under them there should be all the 30 | available files. 31 | """ 32 | def __init__(self, 33 | dataset_dir, 34 | partition='train', 35 | get_top=None, 36 | labels_mask='duet', 37 | only_mask_evaluation=False, 38 | **kwargs): 39 | """! 40 | Input dataset dir should have the following structure: 41 | ./dataset_dir 42 | ./train 43 | ./test 44 | ./val 45 | """ 46 | 47 | self.dataset_dirpath = os.path.join(dataset_dir, 48 | partition) 49 | self.dataset_stats_path = self.dataset_dirpath + '_stats' 50 | self.partition = partition 51 | 52 | if (labels_mask == 'duet' 53 | or labels_mask == 'ground_truth' 54 | or labels_mask == 'raw_phase_diff'): 55 | self.selected_mask = labels_mask 56 | elif labels_mask is None: 57 | pass 58 | else: 59 | raise NotImplementedError("There is no available mask " 60 | "called: {}".format(labels_mask)) 61 | 62 | if not os.path.isdir(self.dataset_dirpath): 63 | raise IOError("Dataset folder {} not found!".format( 64 | self.dataset_dirpath)) 65 | else: 66 | print("Loading files from {} ...".format( 67 | self.dataset_dirpath)) 68 | 69 | self.mixture_folders = glob2.glob(os.path.join( 70 | self.dataset_dirpath, '*')) 71 | if get_top is not None: 72 | self.mixture_folders = self.mixture_folders[:get_top] 73 | 74 | self.n_samples = len(self.mixture_folders) 75 | self.only_mask_evaluation = only_mask_evaluation 76 | 77 | self.n_sources = int(os.path.basename( 78 | dataset_dir).split("_")[4]) 79 | 80 | # preprocess -- store all absolute spectra values for faster 81 | # loading during run time 82 | self.store_directly_abs_spectra() 83 | 84 | def __len__(self): 85 | return self.n_samples 86 | 87 | def __getitem__(self, idx): 88 | """! 
89 | Depending on the selected partition it returns accordingly 90 | the following objects: 91 | 92 | if self.partition == 'train': 93 | (abs_tfs, selected_mask) 94 | else if partition == 'test' or 'val' 95 | (abs_tfs, selected_mask, wavs_list, real_tfs, imag_tfs)""" 96 | mix_folder = self.mixture_folders[idx] 97 | try: 98 | abs_tfs = joblib.load(os.path.join(mix_folder, 'abs_tfs')) 99 | except: 100 | raise IOError("Failed to load data from path: {} " 101 | "for absolute spectra.".format(mix_folder)) 102 | 103 | if self.partition == 'val' or self.partition == 'test': 104 | try: 105 | real_p = os.path.join(mix_folder, 'real_tfs') 106 | imag_p = os.path.join(mix_folder, 'imag_tfs') 107 | wavs_p = os.path.join(mix_folder, 'wavs') 108 | real_tfs = joblib.load(real_p) 109 | imag_tfs = joblib.load(imag_p) 110 | wavs_list = joblib.load(wavs_p) 111 | wavs_list = np.array(wavs_list) 112 | except: 113 | raise IOError("Failed to load data from path: {} " 114 | "for real, imag tf of the mixture and " 115 | "wavs".format(mix_folder)) 116 | 117 | if not self.only_mask_evaluation: 118 | return abs_tfs, wavs_list, real_tfs, imag_tfs 119 | 120 | try: 121 | if self.selected_mask == 'duet': 122 | mask = joblib.load(os.path.join(mix_folder, 123 | 'soft_labeled_mask')) 124 | elif self.selected_mask == 'ground_truth': 125 | mask = joblib.load(os.path.join(mix_folder, 126 | 'ground_truth_mask')) 127 | except Exception as e: 128 | print(e) 129 | raise IOError("Failed to load data from path: {} " 130 | "for tf label masks".format(mix_folder)) 131 | 132 | return abs_tfs, mask, wavs_list, real_tfs, imag_tfs 133 | 134 | if self.partition == 'train': 135 | try: 136 | if self.selected_mask == 'duet': 137 | mask = joblib.load(os.path.join(mix_folder, 138 | 'soft_labeled_mask')) 139 | elif self.selected_mask == 'ground_truth': 140 | mask = joblib.load(os.path.join(mix_folder, 141 | 'ground_truth_mask')) 142 | else: 143 | mask = joblib.load(os.path.join(mix_folder, 144 | 'raw_phase_diff')) 145 | except Exception as e: 146 | print(e) 147 | raise IOError("Failed to load data from path: {} " 148 | "for tf label masks".format(mix_folder)) 149 | return abs_tfs, mask 150 | 151 | return None 152 | 153 | def store_directly_abs_spectra(self): 154 | for mix_folder in self.mixture_folders: 155 | abs_p = os.path.join(mix_folder, 'abs_tfs') 156 | if os.path.lexists(abs_p): 157 | continue 158 | 159 | try: 160 | real_p = os.path.join(mix_folder, 'real_tfs') 161 | imag_p = os.path.join(mix_folder, 'imag_tfs') 162 | real_tfs = joblib.load(real_p) 163 | imag_tfs = joblib.load(imag_p) 164 | except: 165 | raise IOError("Failed to load data from path: {} " 166 | "using joblib.".format(mix_folder)) 167 | abs_tfs = np.abs(real_tfs + 1j * imag_tfs) 168 | try: 169 | joblib.dump(abs_tfs, abs_p, compress=0) 170 | except: 171 | raise IOError("Failed to save absolute value of " 172 | "spectra in path: {}".format(abs_p)) 173 | 174 | def extract_stats(self): 175 | if not os.path.lexists(self.dataset_stats_path): 176 | mean = 0. 177 | std = 0. 
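# The loop below averages per-mixture means and per-mixture stds, which is a
# cheap approximation of, but not identical to, the pooled mean/std over all
# TF bins of the partition. If exact pooled statistics were wanted and all
# spectra fit in memory, one option would be the following sketch (os, np
# and joblib are already imported by this module; not part of the repo):
#
#     all_bins = np.concatenate(
#         [joblib.load(os.path.join(f, 'abs_tfs')).ravel()
#          for f in self.mixture_folders])
#     mean, std = all_bins.mean(), all_bins.std()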
178 | for mix_folder in self.mixture_folders: 179 | try: 180 | abs_p = os.path.join(mix_folder, 'abs_tfs') 181 | abs_tfs = joblib.load(abs_p) 182 | except: 183 | raise IOError("Failed to load absolute tf " 184 | "representation from path: {} " 185 | "using joblib.".format(abs_p)) 186 | 187 | mean += np.mean(np.mean(abs_tfs)) 188 | std += np.std(abs_tfs) 189 | mean /= self.__len__() 190 | std /= self.__len__() 191 | 192 | # store them for later usage 193 | joblib.dump((mean, std), self.dataset_stats_path) 194 | print("Saving dataset mean and variance in: {}".format( 195 | self.dataset_stats_path)) 196 | else: 197 | mean, std = joblib.load(self.dataset_stats_path) 198 | 199 | return mean, std 200 | 201 | 202 | def get_data_generator(dataset_dir, 203 | partition='train', 204 | num_workers=1, 205 | return_stats=False, 206 | get_top=None, 207 | batch_size=1, 208 | return_n_batches=True, 209 | labels_mask='duet', 210 | return_n_sources=False, 211 | only_mask_evaluation=False): 212 | data = PytorchMixtureDataset(dataset_dir, 213 | partition=partition, 214 | get_top=get_top, 215 | labels_mask=labels_mask, 216 | only_mask_evaluation=only_mask_evaluation) 217 | generator_params = {'batch_size': batch_size, 218 | 'shuffle': True, 219 | 'num_workers': num_workers, 220 | 'drop_last': True} 221 | data_generator = DataLoader(data, 222 | **generator_params, 223 | pin_memory=False) 224 | 225 | results = [data_generator] 226 | 227 | if return_stats: 228 | mean, std = data.extract_stats() 229 | results += [mean, std] 230 | 231 | if return_n_batches: 232 | n_batches = int(len(data) / batch_size) 233 | results.append(n_batches) 234 | 235 | if return_n_sources: 236 | results.append(data.n_sources) 237 | 238 | return results 239 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/experiments/run_experiment_v1.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief Using the fast version of the dataset generator provide a 3 | naive experimental setup for performing the experiment using also the 4 | new command line argument parser. 
5 | 6 | @author Efthymios Tzinis {etzinis2@illinois.edu} 7 | @copyright University of illinois at Urbana Champaign 8 | """ 9 | 10 | 11 | 12 | import os 13 | import sys 14 | import torch 15 | import time 16 | import numpy as np 17 | import copy 18 | from pprint import pprint 19 | import torch.nn as nn 20 | 21 | root_dir = os.path.join( 22 | os.path.dirname(os.path.realpath(__file__)), 23 | '../../../') 24 | sys.path.insert(0, root_dir) 25 | 26 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc 27 | import spatial_two_mics.dnn.losses.affinity_approximation as \ 28 | affinity_losses 29 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as fast_data_gen 30 | import spatial_two_mics.dnn.utils.data_conversions as converters 31 | import spatial_two_mics.dnn.utils.experiment_command_line_parser_v2 as \ 32 | parser 33 | import spatial_two_mics.dnn.utils.update_history as update_history 34 | import spatial_two_mics.dnn.utils.model_logger as model_logger 35 | from progress.bar import ChargingBar 36 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as \ 37 | numpy_eval 38 | from sklearn.cluster import KMeans 39 | 40 | 41 | def train(model, 42 | training_generator, 43 | optimizer, 44 | mean_tr, 45 | std_tr, 46 | epoch, 47 | history, 48 | n_batches, 49 | n_sources, 50 | training_labels=''): 51 | model.train() 52 | bar = ChargingBar("Training for epoch: {}...".format(epoch), 53 | max=n_batches) 54 | for batch_data in training_generator: 55 | (abs_tfs, masks) = batch_data 56 | input_tfs, index_ys = abs_tfs.cuda(), masks.cuda() 57 | # the input sequence is determined by time and not freqs 58 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 59 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 60 | index_ys = index_ys.permute(0, 2, 1).contiguous() 61 | 62 | # normalize with mean and variance from the training dataset 63 | input_tfs -= mean_tr 64 | input_tfs /= std_tr 65 | 66 | if training_labels == 'raw_phase_diff': 67 | flatened_ys = index_ys.view(index_ys.size(0), -1, 1) 68 | else: 69 | # index_ys = index_ys.permute(0, 2, 1).contiguous() 70 | one_hot_ys = converters.one_hot_3Dmasks(index_ys, 71 | n_sources) 72 | flatened_ys = one_hot_ys.view(one_hot_ys.size(0), 73 | -1, 74 | one_hot_ys.size(-1)).cuda() 75 | 76 | optimizer.zero_grad() 77 | vs = model(input_tfs) 78 | 79 | 80 | loss = affinity_losses.paris_naive(vs, flatened_ys) 81 | 82 | loss.backward() 83 | nn.utils.clip_grad_norm_(model.parameters(), 100.) 
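# `converters.one_hot_3Dmasks` (used above) is defined in
# dnn/utils/data_conversions.py. A minimal standalone equivalent of turning
# integer TF masks into one-hot label tensors via embedding-style indexing
# (a sketch, not the repo's exact implementation):

import torch

def one_hot_masks(index_masks, n_classes):
    # index_masks: (batch, d1, d2) integer source index per TF bin
    eye = torch.eye(n_classes)
    return eye[index_masks.long()]  # -> (batch, d1, d2, n_classes) floats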
84 | optimizer.step() 85 | 86 | update_history.values_update([('loss', loss)], 87 | history, update_mode='batch') 88 | bar.next() 89 | bar.finish() 90 | 91 | 92 | def eval(model, 93 | val_generator, 94 | mean_tr, 95 | std_tr, 96 | epoch, 97 | history, 98 | n_batches, 99 | k_means_obj, 100 | n_sources, 101 | batch_size): 102 | 103 | model.eval() 104 | with torch.no_grad(): 105 | bar = ChargingBar("Evaluating for epoch: {}...".format(epoch), 106 | max=n_batches*batch_size) 107 | for batch_data in val_generator: 108 | abs_tfs, wavs_lists, real_tfs, imag_tfs = batch_data 109 | input_tfs = abs_tfs.cuda() 110 | # the input sequence is determined by time and not freqs 111 | # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps 112 | input_tfs = input_tfs.permute(0, 2, 1).contiguous() 113 | 114 | # normalize with mean and variance from the training dataset 115 | input_tfs -= mean_tr 116 | input_tfs /= std_tr 117 | 118 | vs = model(input_tfs) 119 | for b in np.arange(vs.size(0)): 120 | 121 | embedding_features = vs[b, :, :].data.cpu().numpy() 122 | 123 | embedding_labels = np.array(k_means_obj.fit_predict( 124 | embedding_features)) 125 | 126 | sdr, sir, sar = numpy_eval.naive_cpu_bss_eval( 127 | embedding_labels, 128 | real_tfs[b].data.numpy(), 129 | imag_tfs[b].data.numpy(), 130 | wavs_lists[b].data.numpy(), 131 | n_sources, 132 | batch_index=b) 133 | 134 | update_history.values_update([('sdr', sdr), 135 | ('sir', sir), 136 | ('sar', sar)], 137 | history, 138 | update_mode='batch') 139 | 140 | bar.next() 141 | bar.finish() 142 | 143 | 144 | def run_LSTM_experiment(args): 145 | visible_cuda_ids = ','.join(map(str, args.cuda_available_devices)) 146 | os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids 147 | 148 | (training_generator, mean_tr, std_tr, n_tr_batches, n_tr_sources) =\ 149 | fast_data_gen.get_data_generator(args.train, 150 | partition='train', 151 | num_workers=args.num_workers, 152 | return_stats=True, 153 | get_top=args.n_train, 154 | batch_size=args.batch_size, 155 | return_n_batches=True, 156 | labels_mask=args.training_labels, 157 | return_n_sources=True) 158 | 159 | val_generator, n_val_batches, n_val_sources = \ 160 | fast_data_gen.get_data_generator(args.val, 161 | partition='val', 162 | num_workers=args.num_workers, 163 | return_stats=False, 164 | get_top=args.n_val, 165 | batch_size=args.batch_size, 166 | return_n_batches=True, 167 | labels_mask=None, 168 | return_n_sources=True) 169 | 170 | model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers, 171 | hidden_size=args.hidden_size, 172 | embedding_depth=args.embedding_depth, 173 | bidirectional=args.bidirectional, 174 | dropout=args.dropout) 175 | model = nn.DataParallel(model).cuda() 176 | 177 | optimizer = torch.optim.Adam(model.parameters(), 178 | lr=args.learning_rate, 179 | betas=(0.9, 0.999)) 180 | 181 | assert n_val_sources == n_tr_sources, "Number of sources in both " \ 182 | "training and evaluation " \ 183 | "should be equal while " \ 184 | "training" 185 | k_means_obj = KMeans(n_clusters=n_tr_sources) 186 | # just iterate over the data 187 | history = {} 188 | for epoch in np.arange(args.epochs): 189 | 190 | train(model, training_generator, optimizer, mean_tr, 191 | std_tr, epoch, history, n_tr_batches, n_tr_sources, 192 | training_labels=args.training_labels) 193 | 194 | update_history.values_update([('loss', None)], 195 | history, 196 | update_mode='epoch') 197 | 198 | 199 | if epoch % args.eval_per == 0: 200 | eval(model, val_generator, mean_tr, std_tr, epoch, 201 | history, n_val_batches, k_means_obj, 
n_val_sources, 202 | args.batch_size) 203 | 204 | update_history.values_update([('sdr', None), 205 | ('sir', None), 206 | ('sar', None)], 207 | history, 208 | update_mode='epoch') 209 | 210 | # keep track of best performances so far 211 | epoch_performance_dic = { 212 | 'sdr': history['sdr'][-1], 213 | 'sir': history['sir'][-1], 214 | 'sar': history['sar'][-1] 215 | } 216 | update_history.update_best_performance( 217 | epoch_performance_dic, epoch, history, 218 | buffer_size=args.save_best) 219 | 220 | 221 | # save the model if it is one of the best according to SDR 222 | if (history['sdr'][-1] >= 223 | history['best_performances'][-1][0]['sdr']): 224 | dataset_id = os.path.basename(args.train) 225 | 226 | model_logger.save(model, 227 | optimizer, 228 | args, 229 | epoch, 230 | epoch_performance_dic, 231 | dataset_id, 232 | mean_tr, 233 | std_tr, 234 | training_labels=args.training_labels) 235 | 236 | 237 | pprint(history['loss'][-1]) 238 | pprint(history['best_performances']) 239 | 240 | 241 | if __name__ == "__main__": 242 | args = parser.get_args() 243 | run_LSTM_experiment(args) -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/dataset.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief A dataset creation which is compatible with pytorch framework 3 | 4 | @author Efthymios Tzinis {etzinis2@illinois.edu} 5 | @copyright University of illinois at Urbana Champaign 6 | """ 7 | 8 | import torch 9 | import argparse 10 | import os 11 | import sys 12 | import glob2 13 | import numpy as np 14 | from sklearn.externals import joblib 15 | import scipy.io.wavfile as wavfile 16 | from torch.utils.data import Dataset, DataLoader 17 | from pprint import pprint 18 | 19 | root_dir = os.path.join( 20 | os.path.dirname(os.path.realpath(__file__)), 21 | '../../../') 22 | sys.path.insert(0, root_dir) 23 | import spatial_two_mics.utils.audio_mixture_constructor as \ 24 | mixture_creator 25 | import spatial_two_mics.config as config 26 | import spatial_two_mics.data_generator.dataset_storage as \ 27 | dataset_storage 28 | 29 | 30 | class PytorchMixtureDataset(Dataset): 31 | """ 32 | This is a general compatible class for pytorch datasets. 33 | 34 | @note Each instance of the dataset should be stored using 35 | joblib.dump() and this is the way that it would be returned. 36 | After some transformations. 37 | 38 | The path of all datasets should be defined inside config. 39 | All datasets should be formatted with appropriate subfolders of 40 | train / test and val and under them there should be all the 41 | available files. 
42 | """ 43 | def __init__(self, 44 | dataset='timit', 45 | partition='train', 46 | n_samples=[512, 128, 256], 47 | n_sources=2, 48 | genders=['f', 'm'], 49 | n_fft=512, 50 | win_len=512, 51 | hop_length=128, 52 | mixture_duration=2.0, 53 | force_delays=[-1, 1], 54 | get_top=None, 55 | **kwargs): 56 | 57 | self.dataset_params = { 58 | 'dataset': dataset, 59 | 'n_samples': n_samples, 60 | 'n_sources': n_sources, 61 | 'genders': genders, 62 | 'force_delays': force_delays 63 | } 64 | dataset_name = dataset_storage.create_dataset_name( 65 | self.dataset_params) 66 | 67 | self.dataset_stats_path = os.path.join(config.DATASETS_DIR, 68 | dataset_name, 69 | partition+'_stats') 70 | 71 | self.dataset_dirpath = os.path.join( 72 | config.DATASETS_DIR, 73 | dataset_name, 74 | partition) 75 | 76 | if not os.path.isdir(self.dataset_dirpath): 77 | raise IOError("Dataset folder {} not found!".format( 78 | self.dataset_dirpath)) 79 | else: 80 | print("Loading files from {} ...".format( 81 | self.dataset_dirpath)) 82 | 83 | self.data_paths = glob2.glob(os.path.join(self.dataset_dirpath, 84 | '*')) 85 | if get_top is not None: 86 | self.data_paths = self.data_paths[:get_top] 87 | 88 | self.n_samples = len(self.data_paths) 89 | 90 | self.mix_creator = mixture_creator.AudioMixtureConstructor( 91 | n_fft=n_fft, 92 | win_len=win_len, 93 | hop_len=hop_length, 94 | mixture_duration=mixture_duration, 95 | force_delays=force_delays) 96 | 97 | def __len__(self): 98 | return self.n_samples 99 | 100 | def __getitem__(self, idx): 101 | file_path = self.data_paths[idx] 102 | try: 103 | mixture_info = joblib.load(file_path) 104 | except: 105 | raise IOError("Failed to load data from path: {} " 106 | "using joblib.".format(file_path)) 107 | 108 | tf_info = self.mix_creator.construct_mixture(mixture_info) 109 | mixture_tf = tf_info['m1_tf'] 110 | abs_tf = abs(mixture_tf) 111 | real_tf = np.real(mixture_tf) 112 | imag_tf = np.imag(mixture_tf) 113 | 114 | # assert (real_tf + 1j * imag_tf == mixture_tf).all() 115 | 116 | duet_mask = None 117 | ground_truth_mask = None 118 | try: 119 | duet_mask = mixture_info['soft_labeled_mask'] 120 | except: 121 | raise KeyError("Mixture info does not have a soft label " 122 | "attribute inferred by duet algorithm") 123 | 124 | try: 125 | ground_truth_mask = mixture_info['ground_truth_mask'] 126 | except: 127 | raise KeyError("Mixture info does not have a ground truth " 128 | "mask inferred by the most dominant source " 129 | "in each TF bin.") 130 | 131 | sources_raw = np.array(tf_info['sources_raw']) 132 | amplitudes = np.array(mixture_info['positions']['amplitudes']) 133 | n_sources = len(sources_raw) 134 | 135 | return (abs_tf, real_tf, imag_tf, 136 | duet_mask, ground_truth_mask, 137 | sources_raw, amplitudes, n_sources) 138 | 139 | def extract_stats(self): 140 | if not os.path.lexists(self.dataset_stats_path): 141 | mean = 0. 142 | std = 0. 
143 | for file_path in self.data_paths: 144 | try: 145 | mix_info = joblib.load(file_path) 146 | except: 147 | raise IOError("Failed to load data from path: {} " 148 | "using joblib.".format(file_path)) 149 | 150 | tf_info = self.mix_creator.construct_mixture(mix_info) 151 | mixture_tf = tf_info['m1_tf'] 152 | abs_tf = abs(mixture_tf) 153 | mean += np.mean(np.mean(abs_tf)) 154 | std += np.std(abs_tf) 155 | mean /= self.__len__() 156 | std /= self.__len__() 157 | 158 | # store them for later usage 159 | joblib.dump((mean, std), self.dataset_stats_path) 160 | print("Saving dataset mean and variance in: {}".format( 161 | self.dataset_stats_path)) 162 | return mean, std 163 | 164 | else: 165 | mean, std = joblib.load(self.dataset_stats_path) 166 | 167 | return mean, std 168 | 169 | 170 | def get_data_generator(args, 171 | return_stats=False, 172 | get_top=None): 173 | data = PytorchMixtureDataset(**args.__dict__, 174 | get_top=get_top) 175 | generator_params = {'batch_size': args.batch_size, 176 | 'shuffle': True, 177 | 'num_workers': args.num_workers, 178 | 'drop_last': True} 179 | data_generator = DataLoader(data, 180 | **generator_params, 181 | pin_memory=False) 182 | n_batches = int(len(data) / args.batch_size) 183 | if return_stats: 184 | mean, std = data.extract_stats() 185 | return data_generator, mean, std, n_batches 186 | else: 187 | return data_generator, n_batches 188 | 189 | 190 | def concatenate_for_masks(masks, n_sources, batch_size): 191 | # create 3d masks for each source 192 | batch_list = [] 193 | for b in torch.arange(batch_size): 194 | sources_list = [] 195 | for i in torch.arange(n_sources): 196 | source_mask = masks[b, :, :] == int(i) 197 | sources_list.append(source_mask) 198 | 199 | sources_tensor = torch.stack(sources_list, 200 | dim=n_sources) 201 | batch_list.append(sources_tensor) 202 | return torch.stack(batch_list, dim=0) 203 | 204 | 205 | def initialize_and_copy_masks(masks, n_sources, batch_size, device): 206 | new_masks = torch.empty((batch_size, 207 | masks.shape[1], 208 | masks.shape[2], 209 | n_sources), 210 | dtype=torch.uint8) 211 | new_masks.to(device) 212 | for i in torch.arange(n_sources): 213 | new_masks[:, :, :, i] = masks[:, :, :] == int(i) 214 | 215 | return new_masks 216 | 217 | 218 | def example_of_usage(args): 219 | import time 220 | 221 | training_data = PytorchMixtureDataset(**args.__dict__) 222 | mean, std = training_data.extract_stats() 223 | 224 | generator_params = {'batch_size': 128, 225 | 'shuffle': True, 226 | 'num_workers': 1, 227 | 'drop_last': True} 228 | training_generator = DataLoader(training_data, **generator_params) 229 | device = torch.device("cuda") 230 | 231 | timing_dic = {} 232 | 233 | batch_now = time.time() 234 | # just iterate over the data 235 | for batch_data in training_generator: 236 | timing_dic['Loading batch'] = time.time() - batch_now 237 | batch_now = time.time() 238 | 239 | before = time.time() 240 | (abs_tfs, real_tfs, imag_tfs, 241 | duet_masks, ground_truth_masks, 242 | sources_raw, amplitudes, n_sources) = batch_data 243 | now = time.time() 244 | timing_dic['Loading from disk'] = now-before 245 | 246 | before = time.time() 247 | input_tf, masks_tf = abs_tfs.to(device), duet_masks.to(device) 248 | now = time.time() 249 | timing_dic['Loading to GPU'] = now - before 250 | 251 | 252 | before = time.time() 253 | duet_stack = concatenate_for_masks(duet_masks, 254 | args.n_sources, 255 | generator_params['batch_size']) 256 | gt_stack = concatenate_for_masks(ground_truth_masks, 257 | args.n_sources, 258 | 
generator_params['batch_size']) 259 | now = time.time() 260 | timing_dic['Stacking in appropriate dimensions the masks'] = \ 261 | now - before 262 | 263 | before = time.time() 264 | duet_copy = initialize_and_copy_masks(duet_masks, 265 | args.n_sources, 266 | generator_params[ 267 | 'batch_size'], 268 | device) 269 | 270 | gt_copy = initialize_and_copy_masks(ground_truth_masks, 271 | args.n_sources, 272 | generator_params[ 273 | 'batch_size'], 274 | device) 275 | now = time.time() 276 | timing_dic['Initializing and copying for masks'] = now - before 277 | 278 | assert torch.equal(duet_copy, duet_stack) 279 | assert torch.equal(gt_copy, gt_stack) 280 | 281 | 282 | # torch.cuda.empty_cache() 283 | pprint(timing_dic) 284 | 285 | 286 | def get_args(): 287 | """! Command line parser """ 288 | parser = argparse.ArgumentParser(description='Pytorch Dataset ' 289 | 'Loader') 290 | parser.add_argument("--dataset", type=str, 291 | help="Dataset name", default="timit") 292 | parser.add_argument("--n_sources", type=int, 293 | help="How many sources in each mix", default=2) 294 | parser.add_argument("--n_samples", type=int, nargs='+', 295 | help="How many samples do you want to be " 296 | "created for train test val", 297 | required=True) 298 | parser.add_argument("--genders", type=str, nargs='+', 299 | help="Genders that will correspond to the " 300 | "genders in the mixtures", 301 | default=['m', 'f']) 302 | parser.add_argument("-f", "--force_delays", nargs='+', type=int, 303 | help="""Whether you want to force integer 304 | delays of +- 1 in the sources""", 305 | default=[-1,1]) 306 | return parser.parse_args() 307 | 308 | 309 | if __name__ == "__main__": 310 | args = get_args() 311 | example_of_usage(args) 312 | 313 | 314 | -------------------------------------------------------------------------------- /spatial_two_mics/dnn/utils/fast_dataset_v2.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief A dataset creation which is compatible with pytorch framework 3 | and much faster in loading time, since this new version loads 4 | only the appropriate files that might be needed 5 | 6 | @author Efthymios Tzinis {etzinis2@illinois.edu} 7 | @copyright University of illinois at Urbana Champaign 8 | """ 9 | 10 | import torch 11 | import argparse 12 | import os 13 | import sys 14 | import glob2 15 | import numpy as np 16 | from sklearn.externals import joblib 17 | import scipy.io.wavfile as wavfile 18 | from torch.utils.data import Dataset, DataLoader 19 | from pprint import pprint 20 | 21 | root_dir = os.path.join( 22 | os.path.dirname(os.path.realpath(__file__)), 23 | '../../../') 24 | sys.path.insert(0, root_dir) 25 | import spatial_two_mics.utils.audio_mixture_constructor as \ 26 | mixture_creator 27 | import spatial_two_mics.config as config 28 | import spatial_two_mics.data_generator.dataset_storage as \ 29 | dataset_storage 30 | 31 | 32 | class PytorchMixtureDataset(Dataset): 33 | """ 34 | This is a general compatible class for pytorch datasets. 35 | 36 | @note Each instance of the dataset should be stored using 37 | joblib.dump() and this is the way that it would be returned. 38 | After some transformations. 39 | 40 | The path of all datasets should be defined inside config. 41 | All datasets should be formatted with appropriate subfolders of 42 | train / test and val and under them there should be all the 43 | available files.
44 | """ 45 | def __init__(self, 46 | dataset='timit', 47 | partition='train', 48 | n_samples=[512, 128, 256], 49 | n_sources=2, 50 | genders=['f', 'm'], 51 | n_fft=512, 52 | win_len=512, 53 | hop_length=128, 54 | mixture_duration=2.0, 55 | force_delays=[-1, 1], 56 | get_top=None, 57 | labels_mask='duet', 58 | **kwargs): 59 | 60 | self.dataset_params = { 61 | 'dataset': dataset, 62 | 'n_samples': n_samples, 63 | 'n_sources': n_sources, 64 | 'genders': genders, 65 | 'force_delays': force_delays 66 | } 67 | 68 | if labels_mask == 'duet' or labels_mask == 'ground_truth': 69 | self.selected_mask = labels_mask 70 | else: 71 | raise NotImplementedError("There is no available mask " 72 | "called: {}".format(labels_mask)) 73 | self.partition = partition 74 | 75 | dataset_name = dataset_storage.create_dataset_name( 76 | self.dataset_params) 77 | 78 | self.dataset_dirpath = os.path.join( 79 | config.DATASETS_DIR, 80 | dataset_name, 81 | partition) 82 | 83 | self.dataset_stats_path = self.dataset_dirpath + '_stats' 84 | 85 | if not os.path.isdir(self.dataset_dirpath): 86 | raise IOError("Dataset folder {} not found!".format( 87 | self.dataset_dirpath)) 88 | else: 89 | print("Loading files from {} ...".format( 90 | self.dataset_dirpath)) 91 | 92 | self.mixture_folders = glob2.glob(os.path.join( 93 | self.dataset_dirpath, '*')) 94 | if get_top is not None: 95 | self.mixture_folders = self.mixture_folders[:get_top] 96 | 97 | self.n_samples = len(self.mixture_folders) 98 | 99 | # preprocess -- store all absolute spectra values for faster 100 | # loading during run time 101 | self.store_directly_abs_spectra() 102 | 103 | def __len__(self): 104 | return self.n_samples 105 | 106 | def __getitem__(self, idx): 107 | """! 108 | Depending on the selected partition it returns accordingly 109 | the following objects: 110 | 111 | if self.partition == 'train': 112 | (abs_tfs, selected_mask) 113 | else if partition == 'test' or 'val' 114 | (abs_tfs, selected_mask, wavs_list, real_tfs, imag_tfs)""" 115 | mix_folder = self.mixture_folders[idx] 116 | try: 117 | abs_tfs = joblib.load(os.path.join(mix_folder, 'abs_tfs')) 118 | except: 119 | raise IOError("Failed to load data from path: {} " 120 | "for absolute spectra.".format(mix_folder)) 121 | 122 | 123 | try: 124 | if self.selected_mask == 'duet': 125 | mask = joblib.load(os.path.join(mix_folder, 126 | 'soft_labeled_mask')) 127 | else: 128 | mask = joblib.load(os.path.join(mix_folder, 129 | 'ground_truth_mask')) 130 | except: 131 | raise IOError("Failed to load data from path: {} " 132 | "for tf label masks".format(mix_folder)) 133 | 134 | if self.partition == 'train': 135 | return abs_tfs, mask 136 | 137 | try: 138 | real_p = os.path.join(mix_folder, 'real_tfs') 139 | imag_p = os.path.join(mix_folder, 'imag_tfs') 140 | wavs_p= os.path.join(mix_folder, 'wavs') 141 | real_tfs = joblib.load(real_p) 142 | imag_tfs = joblib.load(imag_p) 143 | wavs_list = joblib.load(wavs_p) 144 | wavs_list = np.array(wavs_list) 145 | except: 146 | raise IOError("Failed to load data from path: {} " 147 | "for real, imag tf of the mixture and " 148 | "wavs".format(mix_folder)) 149 | 150 | return abs_tfs, mask, wavs_list, real_tfs, imag_tfs 151 | 152 | def store_directly_abs_spectra(self): 153 | for mix_folder in self.mixture_folders: 154 | abs_p = os.path.join(mix_folder, 'abs_tfs') 155 | if os.path.lexists(abs_p): 156 | continue 157 | 158 | try: 159 | real_p = os.path.join(mix_folder, 'real_tfs') 160 | imag_p = os.path.join(mix_folder, 'imag_tfs') 161 | real_tfs = joblib.load(real_p) 162 | 
imag_tfs = joblib.load(imag_p) 163 | except: 164 | raise IOError("Failed to load data from path: {} " 165 | "using joblib.".format(mix_folder)) 166 | abs_tfs = np.abs(real_tfs + 1j * imag_tfs) 167 | try: 168 | joblib.dump(abs_tfs, abs_p, compress=0) 169 | except: 170 | raise IOError("Failed to save absolute value of " 171 | "spectra in path: {}".format(abs_p)) 172 | 173 | def extract_stats(self): 174 | if not os.path.lexists(self.dataset_stats_path): 175 | mean = 0. 176 | std = 0. 177 | for mix_folder in self.mixture_folders: 178 | try: 179 | abs_p = os.path.join(mix_folder, 'abs_tfs') 180 | abs_tfs = joblib.load(abs_p) 181 | except: 182 | raise IOError("Failed to load absolute tf " 183 | "representation from path: {} " 184 | "using joblib.".format(abs_p)) 185 | 186 | mean += np.mean(np.mean(abs_tfs)) 187 | std += np.std(abs_tfs) 188 | mean /= self.__len__() 189 | std /= self.__len__() 190 | 191 | # store them for later usage 192 | joblib.dump((mean, std), self.dataset_stats_path) 193 | print("Saving dataset mean and variance in: {}".format( 194 | self.dataset_stats_path)) 195 | else: 196 | mean, std = joblib.load(self.dataset_stats_path) 197 | 198 | return mean, std 199 | 200 | 201 | def get_data_generator(args, 202 | return_stats=False, 203 | get_top=None): 204 | data = PytorchMixtureDataset(**args.__dict__, 205 | get_top=get_top) 206 | generator_params = {'batch_size': args.batch_size, 207 | 'shuffle': True, 208 | 'num_workers': args.num_workers, 209 | 'drop_last': True} 210 | data_generator = DataLoader(data, 211 | **generator_params, 212 | pin_memory=False) 213 | n_batches = int(len(data) / args.batch_size) 214 | if return_stats: 215 | mean, std = data.extract_stats() 216 | return data_generator, mean, std, n_batches 217 | else: 218 | return data_generator, n_batches 219 | 220 | 221 | def concatenate_for_masks(masks, n_sources, batch_size): 222 | # create 3d masks for each source 223 | batch_list = [] 224 | for b in torch.arange(batch_size): 225 | sources_list = [] 226 | for i in torch.arange(n_sources): 227 | source_mask = masks[b, :, :] == int(i) 228 | sources_list.append(source_mask) 229 | 230 | sources_tensor = torch.stack(sources_list, 231 | dim=n_sources) 232 | batch_list.append(sources_tensor) 233 | return torch.stack(batch_list, dim=0) 234 | 235 | 236 | def initialize_and_copy_masks(masks, n_sources, batch_size, device): 237 | new_masks = torch.empty((batch_size, 238 | masks.shape[1], 239 | masks.shape[2], 240 | n_sources), 241 | dtype=torch.uint8) 242 | new_masks.to(device) 243 | for i in torch.arange(n_sources): 244 | new_masks[:, :, :, i] = masks[:, :, :] == int(i) 245 | 246 | return new_masks 247 | 248 | 249 | def example_of_usage(args): 250 | import time 251 | 252 | training_data = PytorchMixtureDataset(**args.__dict__) 253 | mean, std = training_data.extract_stats() 254 | generator_params = {'batch_size': 128, 255 | 'shuffle': True, 256 | 'num_workers': 1, 257 | 'drop_last': True} 258 | training_generator = DataLoader(training_data, **generator_params) 259 | device = torch.device("cuda") 260 | 261 | timing_dic = {} 262 | n_sources = 2 263 | 264 | batch_now = time.time() 265 | # just iterate over the data 266 | for batch_data in training_generator: 267 | timing_dic['Loading batch'] = time.time() - batch_now 268 | batch_now = time.time() 269 | 270 | before = time.time() 271 | (abs_tfs, masks) = batch_data 272 | now = time.time() 273 | timing_dic['Loading from disk'] = now-before 274 | 275 | before = time.time() 276 | input_tf, masks_tf = abs_tfs.to(device), 
masks.to(device) 277 | now = time.time() 278 | timing_dic['Loading to GPU'] = now - before 279 | 280 | 281 | before = time.time() 282 | duet_stack = concatenate_for_masks(masks, 283 | n_sources, 284 | generator_params['batch_size']) 285 | now = time.time() 286 | timing_dic['Stacking in appropriate dimensions the masks'] = \ 287 | now - before 288 | 289 | before = time.time() 290 | duet_copy = initialize_and_copy_masks(masks, 291 | n_sources, 292 | generator_params[ 293 | 'batch_size'], 294 | device) 295 | now = time.time() 296 | timing_dic['Initializing and copying for masks'] = now - before 297 | 298 | pprint(timing_dic) 299 | batch_now = time.time() 300 | 301 | 302 | def get_args(): 303 | """! Command line parser """ 304 | parser = argparse.ArgumentParser(description='Pytorch Fast Dataset ' 305 | 'Loader') 306 | parser.add_argument("--dataset", type=str, 307 | help="Dataset name", default="timit") 308 | parser.add_argument("--n_sources", type=int, 309 | help="How many sources in each mix", default=2) 310 | parser.add_argument("--n_samples", type=int, nargs='+', 311 | help="How many samples do you want to be " 312 | "created for train test val", 313 | required=True) 314 | parser.add_argument("--genders", type=str, nargs='+', 315 | help="Genders that will correspond to the " 316 | "genders in the mixtures", 317 | default=['m', 'f']) 318 | parser.add_argument("-f", "--force_delays", nargs='+', type=int, 319 | help="""Whether you want to force integer 320 | delays of +- 1 in the sources""", 321 | default=[-1,1]) 322 | return parser.parse_args() 323 | 324 | 325 | if __name__ == "__main__": 326 | args = get_args() 327 | example_of_usage(args) 328 | 329 | 330 | -------------------------------------------------------------------------------- /spatial_two_mics/utils/audio_mixture_constructor.py: -------------------------------------------------------------------------------- 1 | """! 2 | @brief This utility serves as a level of abstraction in order to 3 | construct audio mixtures 4 | 5 | 6 | @author Efthymios Tzinis {etzinis2@illinois.edu} 7 | @copyright University of Illinois at Urbana Champaign 8 | """ 9 | 10 | from librosa.core import stft 11 | from pprint import pprint 12 | import numpy as np 13 | import scipy.io.wavfile as wavfile 14 | 15 | 16 | class AudioMixtureConstructor(object): 17 | def __init__(self, 18 | n_fft=1024, 19 | win_len=None, 20 | hop_len=None, 21 | force_delays=None, 22 | normalize_audio_by_std=True, 23 | mixture_duration=2.0, 24 | precision=0.01, 25 | freqs_included=5): 26 | """ 27 | :param fs: sampling rate 28 | :param n_fft: FFT window size 29 | :param win_len: The window will be of length win_length and 30 | then padded with zeros to match n_fft. 31 | If unspecified, defaults to win_length = n_fft. 32 | :param hop_len: number of audio frames between STFT columns. 33 | If unspecified, defaults to win_length / 4. 34 | :param force_delays: list of delays to be forced in the 35 | source signals -1 or 1 integer delay for the microphones 36 | mixtures, if it is [0, 0] then no delay would be forced 37 | :param normalize_audio_by_std: whether the loaded wavs should be 38 | normalized by their std values 39 | :param mixture_duration: the duration for which the mixture 40 | would be created (in seconds) 41 | :param precision: The precision as a floating number e.g.
0.01 42 | if you are using floating point delays between your source 43 | signals for each mixture 44 | :param freqs_included: How many frequencies should be 45 | included in the sinc function before convolving it with the 46 | true signal in order to upsample it (1/precision) times more 47 | and shift it in order to get the truly delayed signal. 48 | """ 49 | self.mixture_duration = mixture_duration 50 | self.n_fft = n_fft 51 | self.win_len = win_len 52 | self.hop_len = hop_len 53 | self.normalize_audio_by_std = normalize_audio_by_std 54 | self.force_delays = force_delays 55 | self.precision = precision 56 | self.freqs_included = freqs_included 57 | 58 | xs = np.linspace(-self.freqs_included, 59 | self.freqs_included, 60 | int(2. * self.freqs_included / self.precision)) 61 | self.windowed_sinc = np.sinc(xs) 62 | 63 | @staticmethod 64 | def load_wav(source_info): 65 | return wavfile.read(source_info['wav_path']) 66 | 67 | def get_stft(self, 68 | signal): 69 | 70 | return stft(signal, 71 | n_fft=self.n_fft, 72 | win_length=self.win_len, 73 | hop_length=self.hop_len) 74 | 75 | def force_delay_on_signal(self, 76 | signal, 77 | delay): 78 | if delay >= 0: 79 | return signal[delay:] 80 | else: 81 | return signal[:delay] 82 | 83 | def enforce_float_delays(self, 84 | source_signals, 85 | delays_for_sources, 86 | fs): 87 | """! 88 | For 2 microphones enforce a floating point number delay with some 89 | selected precision and apply that for all sources that would 90 | be given. Also make sure that the returned wavs have a 91 | length equal to the required duration""" 92 | upsampling_rate = int(1. / self.precision) 93 | duration_in_samples = int(self.mixture_duration * fs) - 1 94 | decimals = int(np.log10(upsampling_rate)) 95 | n_augmentation_zeros = upsampling_rate - 1 96 | 97 | rounded_taus = np.around(delays_for_sources, decimals=decimals) 98 | taus_samples = upsampling_rate * rounded_taus 99 | taus_samples = taus_samples.astype(int) 100 | 101 | mic_signals = {'m1': [], 'm2': []} 102 | for src_id, source_sig in enumerate(source_signals): 103 | sig_len = source_sig.shape[0] 104 | augmented_signal = np.zeros( 105 | sig_len + (sig_len - 1) * n_augmentation_zeros) 106 | augmented_signal[::upsampling_rate] = source_sig 107 | est_augmented_sig = np.convolve(augmented_signal, 108 | self.windowed_sinc, 109 | mode='valid') 110 | 111 | tau_in_samples = taus_samples[src_id] 112 | if tau_in_samples > 0: 113 | source_in_mic1 = est_augmented_sig[ 114 | tau_in_samples:][::upsampling_rate] 115 | source_in_mic2 = est_augmented_sig[ 116 | :-tau_in_samples][::upsampling_rate] 117 | elif tau_in_samples < 0: 118 | source_in_mic1 = est_augmented_sig[ 119 | :tau_in_samples][::upsampling_rate] 120 | source_in_mic2 = est_augmented_sig[ 121 | -tau_in_samples:][::upsampling_rate] 122 | else: 123 | source_in_mic1 = est_augmented_sig[::upsampling_rate] 124 | source_in_mic2 = est_augmented_sig[::upsampling_rate] 125 | 126 | # check the duration which is very important 127 | if (len(source_in_mic1) < duration_in_samples or 128 | len(source_in_mic2) < duration_in_samples): 129 | raise ValueError("Duration given: {} could " 130 | "not be satisfied because the given source" 131 | " signal has a lesser duration of {} " 132 | "after the float delay.".format( 133 | duration_in_samples, len(source_in_mic1))) 134 | 135 | mic_signals['m1'].append( 136 | source_in_mic1[:duration_in_samples]) 137 | mic_signals['m2'].append( 138 | source_in_mic2[:duration_in_samples]) 139 | 140 | return mic_signals 141 | 142 | def

    def construct_mic_signals(self,
                              source_signals,
                              delays_for_sources):
        """!
        Either enforces floating point delays by interpolating the
        source signals or just forces fixed integer delays over the
        sources. Returns a dictionary containing a list of signals for
        each microphone, cropped to the specified duration.

        :return mic_signals = { 'm1': [s1, s2, ..., sn], 'm2': same }
        """

        fs = source_signals[0][1]
        assert all([sr == fs for (s, sr) in source_signals]), \
            'When trying to enforce the delays over the source ' \
            'signals, the sampling rate should be the same for all ' \
            'sources!'

        if self.force_delays is None:
            mic_signals = self.enforce_float_delays(
                [s for (s, sr) in source_signals],
                delays_for_sources,
                fs)

        else:
            # naive way of forcing a delay for the DUET algorithm
            m1_delays = self.force_delays
            m2_delays = self.force_delays[::-1]

            cropped_signals = [s[:int(self.mixture_duration * fs)]
                               for (s, sr) in source_signals]

            mic_signals = {
                'm1': [self.force_delay_on_signal(s, m1_delays[i])
                       for (i, s) in enumerate(cropped_signals)],
                'm2': [self.force_delay_on_signal(s, m2_delays[i])
                       for (i, s) in enumerate(cropped_signals)]
            }

        return mic_signals

    def get_tf_representations(self,
                               mixture_info):
        """!
        This function constructs the mixture for each mic (m1, m2) as:
            m1(t) = a1*s1(t) + ... + an*sn(t)
            m2(t) = a1*s1(t+d1) + ... + an*sn(t+dn)

        after cropping the signals to the requested mixture duration.

        :return
        mixture_info = {
            'm1_raw': numpy array containing the raw m1 signal,
            'm2_raw': numpy array containing the raw m2 signal,
            'm1_tf': numpy array containing the m1 TF representation,
            'm2_tf': numpy array containing the m2 TF representation,
            'sources_raw': a list of numpy 1d vectors containing the
                sources,
            'sources_tf': a list of numpy 2d vectors containing the
                TF representations of the sources,
            'amplitudes': the weights with which each source
                contributes to the microphone mixtures
        }
        """
        positions = mixture_info['positions']
        source_signals = [(s['wav'], s['fs'])
                          for s in mixture_info['sources_ids']]
        n_sources = len(source_signals)

        mic_signals = self.construct_mic_signals(source_signals,
                                                 positions['taus'])

        m1 = sum([positions['amplitudes'][i] * mic_signals['m1'][i]
                  for i in np.arange(n_sources)])

        m2 = sum([positions['amplitudes'][i] * mic_signals['m2'][i]
                  for i in np.arange(n_sources)])

        sources_spectra = [self.get_stft(s) for s in mic_signals['m1']]

        m1_tf = self.get_stft(m1)
        m2_tf = self.get_stft(m2)

        mixture_info = {
            'm1_raw': m1,
            'm2_raw': m2,
            'm1_tf': m1_tf,
            'm2_tf': m2_tf,
            'sources_raw': mic_signals['m1'],
            'sources_tf': sources_spectra,
            'amplitudes': positions['amplitudes']
        }

        return mixture_info
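
    # Shape sketch (illustrative; assumes the default n_fft=1024 and a
    # 16 kHz wav with mixture_duration=2.0): m1_raw and m2_raw are 1-D
    # arrays of roughly 2 * fs samples, while m1_tf, m2_tf and every
    # entry of sources_tf are complex arrays of shape
    # (n_fft // 2 + 1, n_frames), as returned by librosa's stft.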

    def construct_mixture(self,
                          mixture_info):
        """! All the processing needed for getting the mixture signals
        for the two mics from the given sources and positions is done
        here.

        :param mixture_info
        {
            'positions': dict with at least the 'taus' and 'amplitudes'
                of the sources (e.g. as returned by a source position
                generator),
            'sources_ids':
            [ {
                'gender': combination_info.gender
                'sentence_id': combination_info.sentence_id
                'speaker_id': combination_info.speaker_id
                'wav_path': the wav_path for the file
            } ... ]
        }

        :return tf_representations = {
            'm1_raw': numpy array containing the raw m1 signal,
            'm2_raw': numpy array containing the raw m2 signal,
            'm1_tf': numpy array containing the m1 TF representation,
            'm2_tf': numpy array containing the m2 TF representation,
            'sources_raw': a list of numpy 1d vectors containing the
                sources,
            'sources_tf': a list of numpy 2d vectors containing the
                TF representations of the sources,
            'amplitudes': the weights with which each source
                contributes to the microphone mixtures
        }
        """

        for i, source_info in enumerate(mixture_info['sources_ids']):
            fs, wav = self.load_wav(source_info)
            if self.normalize_audio_by_std:
                wav = wav / np.std(wav)
            mixture_info['sources_ids'][i]['fs'] = int(fs)
            mixture_info['sources_ids'][i]['wav'] = wav

        tf_representations = self.get_tf_representations(mixture_info)

        return tf_representations


def example_of_usage():
    """!
    How the AudioMixtureConstructor class should be used"""

    import os
    import sys
    root_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        '../../')
    sys.path.insert(0, root_dir)
    import spatial_two_mics.examples.mixture_example as me

    mixture_creator = AudioMixtureConstructor(n_fft=1024,
                                              win_len=1024,
                                              hop_len=512,
                                              mixture_duration=2.0,
                                              force_delays=[-1, 1])

    mixture_info = me.mixture_info_example()

    import spatial_two_mics.data_generator.source_position_generator \
        as position_generator

    # add some randomness in the generation of the positions
    random_positioner = position_generator.RandomCirclePositioner()
    positions_info = random_positioner.get_sources_locations(2)
    mixture_info['positions'] = positions_info

    tf_mixtures = mixture_creator.construct_mixture(mixture_info)

    pprint(tf_mixtures)


if __name__ == "__main__":
    example_of_usage()
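
# A minimal sketch of the floating point delay path (force_delays=None);
# mixture_info is assumed to follow the same structure as in
# example_of_usage above:
#   mixture_creator = AudioMixtureConstructor(n_fft=1024,
#                                             force_delays=None,
#                                             precision=0.01,
#                                             mixture_duration=2.0)
#   tf_mixtures = mixture_creator.construct_mixture(mixture_info)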
--------------------------------------------------------------------------------
/spatial_two_mics/data_loaders/wham.py:
--------------------------------------------------------------------------------
"""!
@brief Pytorch dataloader for the WHAM dataset with configurable gender
combination priors.

@author Efthymios Tzinis {etzinis2@illinois.edu}
@copyright University of Illinois at Urbana-Champaign
"""

import torch
import os
import numpy as np
import pickle
import glob2
import sys

# Use __file__ (not the string '__file__') so paths resolve relative to
# this module rather than the current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
root_dir = os.path.abspath(os.path.join(current_dir, '../../'))
sys.path.append(root_dir)
import approx_ensembles.separation.dataset_loader.abstract_dataset as \
    abstract_dataset
from scipy.io import wavfile
import warnings
from tqdm import tqdm
from time import time

EPS = 1e-8
enh_single = {'mixture': 'mix_single',
              'sources': ['s1', 'noise'],
              'n_sources': 1}
enh_single_white_noise = {
    'mixture': 'source_with_white_noise',
    'sources': ['s1', 'white_noise'],
    'n_sources': 1}
enh_both = {'mixture': 'mix_both',
            'sources': ['mix_clean', 'noise'],
            'n_sources': 1}
sep_clean = {'mixture': 'mix_clean',
             'sources': ['s1', 's2'],
             'n_sources': 2}
sep_noisy = {'mixture': 'mix_both',
             'sources': ['s1', 's2', 'noise'],
             'n_sources': 2}

VALID_GENDER_COMBS = {'ff', 'mm', 'fm', 'mf'}

WHAM_TASKS = {'enhance_single_white_noise': enh_single_white_noise,
              'enhance_single': enh_single,
              'enhance_both': enh_both,
              'sep_clean': sep_clean,
              'sep_noisy': sep_noisy}
WHAM_TASKS['enh_single'] = WHAM_TASKS['enhance_single']
WHAM_TASKS['enh_both'] = WHAM_TASKS['enhance_both']


def normalize_tensor_wav(wav_tensor, eps=1e-8, std=None):
    mean = wav_tensor.mean(-1, keepdim=True)
    if std is None:
        std = wav_tensor.std(-1, keepdim=True)
    return (wav_tensor - mean) / (std + eps)
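
# Quick sanity sketch for normalize_tensor_wav (illustrative only):
#   x = torch.randn(4, 8000) * 3.0 + 0.5
#   y = normalize_tensor_wav(x)
#   # y has approximately zero mean and unit std along the last axis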


class Dataset(torch.utils.data.Dataset, abstract_dataset.Dataset):
    """Dataset class for WHAM source separation and speech enhancement tasks.

    Example of kwargs:
        root_dirpath='/mnt/data/wham', task='enh_single',
        split='tr', sample_rate=8000, timelength=4.0,
        normalize_audio=False, n_samples=0, zero_pad=False,
        min_or_max='min', augment=False,
        gender_combination_priors={'ff': 1., 'mm': 1., 'fm': 1., 'mf': 1.}
    """
    def __init__(self, **kwargs):
        super(Dataset, self).__init__()
        warnings.filterwarnings("ignore")
        self.kwargs = kwargs

        self.task = self.get_arg_and_check_validness(
            'task', known_type=str, choices=WHAM_TASKS.keys())

        self.zero_pad = self.get_arg_and_check_validness(
            'zero_pad', known_type=bool)

        self.augment = self.get_arg_and_check_validness(
            'augment', known_type=bool)

        # Keep-fractions in [0, 1] for the gender combinations
        # ff, mm, fm and mf; every prior must lie in [0, 1].
        self.gender_combination_priors = self.get_arg_and_check_validness(
            'gender_combination_priors', known_type=float,
            dict_check={'ff': float, 'mm': float, 'fm': float, 'mf': float},
            extra_lambda_checks=[
                lambda x: all(0 <= y <= 1 for y in x.values())])

        self.normalize_audio = self.get_arg_and_check_validness(
            'normalize_audio', known_type=bool)

        self.min_or_max = self.get_arg_and_check_validness(
            'min_or_max', known_type=str, choices=['min', 'max'])

        self.split = self.get_arg_and_check_validness(
            'split', known_type=str, choices=['cv', 'tr', 'tt'])

        self.n_samples = self.get_arg_and_check_validness(
            'n_samples', known_type=int,
            extra_lambda_checks=[lambda x: x >= 0])

        self.sample_rate = self.get_arg_and_check_validness(
            'sample_rate', known_type=int)
        self.root_path = self.get_arg_and_check_validness(
            'root_dirpath', known_type=str,
            extra_lambda_checks=[lambda y: os.path.lexists(y)])
        self.dataset_dirpath = self.get_path()

        self.mixtures_info_metadata_path = os.path.join(
            self.dataset_dirpath, 'metadata_v2')

        self.timelength = self.get_arg_and_check_validness(
            'timelength', known_type=float)

        self.time_samples = int(self.sample_rate * self.timelength)

        # Create the indexing for the dataset
        mix_folder_path = os.path.join(self.dataset_dirpath,
                                       WHAM_TASKS[self.task]['mixture'])
        self.file_names = []
        self.available_mixtures = glob2.glob(mix_folder_path + '/*.wav')

        self.mixtures_info = []
        print('Parsing dataset found at: {}...'.format(
            self.dataset_dirpath))
        if not os.path.lexists(self.mixtures_info_metadata_path):
            # Parse gender information.
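            # WHAM mixture filenames encode both utterance ids, e.g.
            # '011a010b_0.9313_012c020p_-0.9313.wav' (the ids here are
            # illustrative): every second '_'-separated token is an
            # utterance id whose first three characters give the
            # speaker id, which is looked up in the id -> gender table
            # read from wham_speaker_info.txt below.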
            gender_info_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'wham_speaker_info.txt')
            gender_dic = {}
            if os.path.lexists(gender_info_path):
                with open(gender_info_path, 'rb') as filehandle:
                    gender_dic = dict(
                        [tuple([x.decode() for x in l.split()])
                         for l in filehandle.readlines()])

            for file_path in tqdm(self.available_mixtures):
                sample_rate, waveform = wavfile.read(file_path)
                assert sample_rate == self.sample_rate
                numpy_wav = np.array(waveform)

                speaker_info = os.path.basename(file_path).split('.wav')[0]
                speaker_info = [x[:3] for x in speaker_info.split('_')[::2]]

                this_gender_comb = ''
                for speaker in speaker_info:
                    if speaker not in gender_dic:
                        raise ValueError('Speaker with id: {} not '
                                         'found!'.format(speaker))
                    else:
                        this_gender_comb += gender_dic[speaker].lower()

                self.mixtures_info.append([os.path.basename(file_path),
                                           numpy_wav.shape[0],
                                           this_gender_comb])

            print('Dumping metadata in: {}'.format(
                self.mixtures_info_metadata_path))
            with open(self.mixtures_info_metadata_path, 'wb') as filehandle:
                pickle.dump(self.mixtures_info, filehandle)

        if os.path.lexists(self.mixtures_info_metadata_path):
            with open(self.mixtures_info_metadata_path, 'rb') as filehandle:
                self.mixtures_info = pickle.load(filehandle)
                print('Loaded metadata from: {}'.format(
                    self.mixtures_info_metadata_path))

        self.file_names_g_comb = dict([(g, []) for g in VALID_GENDER_COMBS])
        for path, n_samples, gender_comb in self.mixtures_info:
            if n_samples >= self.time_samples or self.zero_pad:
                self.file_names_g_comb[gender_comb].append((path, n_samples))

        self.file_names = []
        # Apply the gender-combination priors (keep-fractions).
        for gender_comb in self.file_names_g_comb:
            percentage = self.gender_combination_priors[gender_comb]
            length = len(self.file_names_g_comb[gender_comb])
            n_requested = int(length * percentage)
            self.file_names += \
                self.file_names_g_comb[gender_comb][:n_requested]
        if self.n_samples > 0:
            self.file_names = self.file_names[:self.n_samples]

        max_time_samples = max([n_s for (_, n_s) in self.file_names])
        self.file_names = [x for (x, _) in self.file_names]

        # For the case where we need the whole audio input.
        if self.time_samples <= 0:
            self.time_samples = max_time_samples

    def get_path(self):
        path = os.path.join(self.root_path,
                            'wav{}k'.format(int(self.sample_rate / 1000)),
                            self.min_or_max, self.split)
        if os.path.lexists(path):
            return path
        else:
            raise IOError('Dataset path: {} not found!'.format(path))

    def safe_pad(self, tensor_wav):
        if self.zero_pad and tensor_wav.shape[0] < self.time_samples:
            appropriate_shape = tensor_wav.shape
            padded_wav = torch.zeros(
                list(appropriate_shape[:-1]) + [self.time_samples],
                dtype=torch.float32)
            padded_wav[:tensor_wav.shape[0]] = tensor_wav
            return padded_wav[:self.time_samples]
        else:
            return tensor_wav[:self.time_samples]

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        if self.augment:
            # Re-seed from the wall clock so that random crops differ
            # across DataLoader workers and epochs.
            the_time = int(np.modf(time())[0] * 100000000)
            np.random.seed(the_time)

        filename = self.file_names[idx]
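
        # With augment=True, a random crop of time_samples is drawn
        # from the mixture below and the same offset (rand_start) is
        # reused for every source, so the mixture and its targets stay
        # time-aligned.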

        mixture_path = os.path.join(self.dataset_dirpath,
                                    WHAM_TASKS[self.task]['mixture'],
                                    filename)
        _, waveform = wavfile.read(mixture_path)
        max_len = len(waveform)
        rand_start = 0
        if self.augment and max_len > self.time_samples:
            rand_start = np.random.randint(0, max_len - self.time_samples)
        waveform = waveform[rand_start:rand_start + self.time_samples]
        mixture_wav = np.array(waveform)
        mixture_wav = torch.tensor(mixture_wav, dtype=torch.float32)
        # First normalize the mixture and then pad.
        if self.normalize_audio:
            mixture_wav = normalize_tensor_wav(mixture_wav)
        mixture_wav = self.safe_pad(mixture_wav)

        sources_list = []
        for source_name in WHAM_TASKS[self.task]['sources']:
            source_path = os.path.join(self.dataset_dirpath,
                                       source_name, filename)
            try:
                _, waveform = wavfile.read(source_path)
            except Exception as e:
                print(e)
                raise IOError('Could not load file from: '
                              '{}'.format(source_path))
            waveform = waveform[rand_start:rand_start + self.time_samples]
            numpy_wav = np.array(waveform)
            source_wav = torch.tensor(numpy_wav, dtype=torch.float32)
            # First normalize the source and then pad.
            if self.normalize_audio:
                source_wav = normalize_tensor_wav(source_wav)
            source_wav = self.safe_pad(source_wav)
            sources_list.append(source_wav)

        if self.normalize_audio:
            mix_std = mixture_wav.detach().cpu().numpy().std()
            mixture_wav = normalize_tensor_wav(mixture_wav, std=mix_std)
            sources_list = [normalize_tensor_wav(s, std=mix_std)
                            for s in sources_list]
        sources_wavs = torch.stack(sources_list, dim=0)

        return mixture_wav, sources_wavs

    def get_generator(self, batch_size=4, shuffle=True, num_workers=4):
        generator_params = {'batch_size': batch_size,
                            'shuffle': shuffle,
                            'num_workers': num_workers,
                            'drop_last': True}
        return torch.utils.data.DataLoader(self, **generator_params,
                                           pin_memory=True)


def test_generator():
    wham_root_p = '/mnt/data/wham'
    batch_size = 1
    sample_rate = 8000
    timelength = 4.0
    gender_combination_priors = {
        'ff': 0., 'mm': 0.05, 'fm': 0., 'mf': 0.02
    }
    time_samples = int(sample_rate * timelength)
    data_loader = Dataset(
        root_dirpath=wham_root_p, task='sep_clean',
        gender_combination_priors=gender_combination_priors,
        split='tt', sample_rate=sample_rate, timelength=timelength,
        zero_pad=True, min_or_max='min', augment=True,
        normalize_audio=False, n_samples=10)
    generator = data_loader.get_generator(batch_size=batch_size,
                                          num_workers=1)

    for mixture, sources in generator:
        assert mixture.shape == (batch_size, time_samples)
        assert sources.shape == (batch_size, 2, time_samples)

    # Test the testing set with batch size 1 only.
    data_loader = Dataset(
        root_dirpath=wham_root_p, task='sep_clean',
        gender_combination_priors=gender_combination_priors,
        split='tt', sample_rate=sample_rate, timelength=-1.,
        zero_pad=False, min_or_max='min', augment=False,
        normalize_audio=False, n_samples=10)
    generator = data_loader.get_generator(batch_size=1, num_workers=1)

    for mixture, sources in generator:
        assert mixture.shape[-1] == sources.shape[-1]


if __name__ == "__main__":
    test_generator()
--------------------------------------------------------------------------------