├── spatial_two_mics
    ├── __init__.py
    ├── dnn
    │   ├── __init__.py
    │   ├── utils
    │   │   ├── __init__.py
    │   │   ├── data_conversions.py
    │   │   ├── update_history.py
    │   │   ├── experiment_command_line_parser.py
    │   │   ├── model_logger.py
    │   │   ├── experiment_command_line_parser_v2.py
    │   │   ├── fast_dataset_v3.py
    │   │   ├── dataset.py
    │   │   └── fast_dataset_v2.py
    │   ├── evaluation
    │   │   ├── __init__.py
    │   │   └── naive_evaluation_numpy.py
    │   ├── losses
    │   │   ├── __init__.py
    │   │   ├── test
    │   │   │   ├── __init__.py
    │   │   │   └── test_sanity_of_losses.py
    │   │   └── affinity_approximation.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   └── simple_LSTM_encoder.py
    │   ├── modules
    │   │   ├── __init__.py
    │   │   ├── prob_estimation_initial_SDR.py
    │   │   ├── prob_estimation_ground_truth_masks.py
    │   │   ├── measure_initial_SDR.py
    │   │   ├── ground_truth_evaluation.py
    │   │   ├── find_best_model_and_estimate_prob.py
    │   │   └── model_evaluation.py
    │   └── experiments
    │   │   ├── __init__.py
    │   │   ├── sample_convergence_LSTM.py
    │   │   ├── simple_LSTM_encoder.py
    │   │   ├── check_overfitting.py
    │   │   ├── convergence_check_v2.py
    │   │   └── run_experiment_v1.py
    ├── examples
    │   ├── __init__.py
    │   └── mixture_example.py
    ├── utils
    │   ├── __init__.py
    │   ├── progress_display.py
    │   ├── robust_means_clustering.py
    │   └── audio_mixture_constructor.py
    ├── data_generator
    │   ├── __init__.py
    │   ├── parallel_dataset_creation.py
    │   ├── dataset_storage.py
    │   └── source_position_generator.py
    ├── data_loaders
    │   ├── __init__.py
    │   ├── wham_speaker_info.txt
    │   ├── timit.py
    │   └── wham.py
    ├── visualization
    │   └── __init__.py
    ├── labels_inference
    │   ├── __init__.py
    │   ├── ground_truth.py
    │   ├── duet_mask_estimation.py
    │   └── tf_label_estimator.py
    └── config.py
├── LICENSE
└── README.md


/spatial_two_mics/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/data_generator/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/data_loaders/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/losses/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/experiments/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/losses/test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/labels_inference/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/spatial_two_mics/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"
 4 | 
 5 | BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 6 | TIMIT_PATH = "/mnt/data/Speech/timit-wav"
 7 | DATASETS_DIR = "/mnt/nvme/spatial_two_mics_data/"
 8 | MODELS_DIR = "/mnt/nvme/spatial_two_mics_models/"
 9 | RESULTS_DIR = "/mnt/nvme/spatial_two_mics_results/"
10 | MODELS_RAW_PHASE_DIR = "/mnt/nvme/spatial_two_mics_models_raw_phase/"
11 | MODELS_GROUND_TRUTH = "/mnt/nvme/spatial_two_mics_models_ground_truth/"
12 | FINAL_RESULTS_DIR = "/mnt/nvme/spatial_two_mics_final_eval_results/"
13 | 
14 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/data_conversions.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Pytorch data tensors manipulations functions
 3 | 
 4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 5 | @copyright University of illinois at Urbana Champaign
 6 | """
 7 | 
 8 | import torch
 9 | 
10 | 
11 | def one_hot_3Dmasks(index_ys, n_classes):
12 |     """! Converting a matrix of float labels for each class to a one
13 |     hot vector of the same dimension plus the extra of one-hot
14 |     correspondence
15 | 
16 |     :param index_ys: mask 3d tensor with integer labels
17 |     :param n_classes: integer
18 |     :return: whatever diomensions x n_classes => 1 hot correspondence
19 |     """
20 |     clustered_ys = index_ys.unsqueeze(-1).long()
21 | 
22 |     one_hot = torch.cuda.FloatTensor(clustered_ys.size(0),
23 |                                      clustered_ys.size(1),
24 |                                      clustered_ys.size(2),
25 |                                      n_classes).zero_()
26 | 
27 |     return one_hot.scatter_(3, clustered_ys, 1).cuda()
28 | 
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Efthymios Tzinis
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/spatial_two_mics/data_loaders/wham_speaker_info.txt:
--------------------------------------------------------------------------------
  1 | 001 M
  2 | 002 F
  3 | 00a F
  4 | 00b M
  5 | 00c M
  6 | 00d M
  7 | 00f F
  8 | 010 M
  9 | 011 F
 10 | 012 M
 11 | 013 M
 12 | 014 F
 13 | 015 M
 14 | 016 F
 15 | 017 F
 16 | 018 F
 17 | 019 F
 18 | 01l M
 19 | 01a F
 20 | 01b F
 21 | 01c F
 22 | 01d F
 23 | 01e M
 24 | 01f F
 25 | 01g M
 26 | 01h F
 27 | 01i M
 28 | 01j F
 29 | 01k F
 30 | 01m F
 31 | 01n F
 32 | 01o F
 33 | 01p F
 34 | 01q F
 35 | 01r M
 36 | 01s M
 37 | 01t M
 38 | 01u F
 39 | 01v F
 40 | 01w M
 41 | 01x F
 42 | 01y M
 43 | 01z M
 44 | 020 M
 45 | 021 M
 46 | 022 F
 47 | 023 F
 48 | 024 M
 49 | 025 M
 50 | 026 M
 51 | 027 F
 52 | 028 F
 53 | 029 M
 54 | 02a F
 55 | 02b M
 56 | 02c F
 57 | 02d F
 58 | 02e F
 59 | 02f F
 60 | 050 F
 61 | 051 M
 62 | 052 M
 63 | 053 F
 64 | 200 M
 65 | 201 M
 66 | 202 F
 67 | 203 F
 68 | 204 F
 69 | 205 F
 70 | 206 F
 71 | 207 M
 72 | 208 M
 73 | 209 F
 74 | 20a F
 75 | 20b F
 76 | 20c M
 77 | 20d F
 78 | 20e F
 79 | 20f M
 80 | 20g M
 81 | 20h F
 82 | 20i M
 83 | 20j M
 84 | 20k M
 85 | 20l M
 86 | 20m M
 87 | 20n M
 88 | 20o M
 89 | 20p F
 90 | 20q M
 91 | 20r M
 92 | 20s M
 93 | 20t F
 94 | 20u M
 95 | 20v M
 96 | 22g M
 97 | 22h M
 98 | 400 M
 99 | 401 F
100 | 403 M
101 | 404 F
102 | 405 M
103 | 406 M
104 | 407 F
105 | 408 M
106 | 409 F
107 | 40a M
108 | 40b M
109 | 40c M
110 | 40d F
111 | 40e F
112 | 40f M
113 | 40g F
114 | 40h F
115 | 40i M
116 | 40j M
117 | 40k M
118 | 40l F
119 | 40m F
120 | 40n M
121 | 40o F
122 | 40p F
123 | 420 F
124 | 421 F
125 | 422 M
126 | 423 M
127 | 430 F
128 | 431 M
129 | 432 F
130 | 050 F
131 | 051 M
132 | 052 M
133 | 053 F
134 | 22g M
135 | 22h M
136 | 423 M
137 | 440 M
138 | 441 F
139 | 442 F
140 | 443 M
141 | 444 F
142 | 445 F
143 | 446 M
144 | 447 M


--------------------------------------------------------------------------------
/spatial_two_mics/utils/progress_display.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief A general bar progress bar display container for all functions
 3 | applied on a list or or an enumerable structure of elements
 4 | 
 5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 6 | @copyright University of Illinois at Urbana Champaign
 7 | """
 8 | 
 9 | from progress.bar import ChargingBar
10 | import numpy as np
11 | 
12 | 
13 | def progress_bar_wrapper(func,
14 |                          l,
15 |                          message='Processing...'):
16 |     """
17 |     !
18 |     :param l: List of elements
19 |     :param func: This function should be applicable to elements of
20 |     the list l. E.g. a lamda func is also sufficient.
21 |     :param message: A string that you want to be displayed
22 |     :return: The result of map(func, l)
23 |     """
24 | 
25 |     l_copy = l.copy()
26 |     n_elements = len(l)
27 |     bar = ChargingBar(message, max=n_elements)
28 | 
29 |     for idx in np.arange(n_elements):
30 |         l_copy[idx] = func(l[idx])
31 |         bar.next()
32 | 
33 |     bar.finish()
34 |     return l_copy
35 | 
36 | 
37 | def test():
38 |     import pytest
39 | 
40 |     M = int(10e7)
41 |     size = int(10e4)
42 |     l = np.random.uniform(low=-M, high=M, size=size)
43 |     funcs = {
44 |         'const_mul': lambda x: x*2,
45 |         'power_2': lambda x: x**2,
46 |         'subtraction': lambda x: x-x/2.
47 |     }
48 | 
49 |     for name, func in funcs.items():
50 |         map_result = list(map(func, l))
51 |         wrapper_result = progress_bar_wrapper(func, l, message=name)
52 |         assert any(map_result ==wrapper_result), 'Progress wrapper ' \
53 |                                                  'should provide the ' \
54 |                                                  'same result as map'
55 | 
56 | if __name__ == "__main__":
57 |     test()


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/losses/test/test_sanity_of_losses.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Testing the sanity of the losses comparing to naive
 3 | implementations
 4 | 
 5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 6 | @copyright University of illinois at Urbana Champaign
 7 | """
 8 | import sys
 9 | import numpy as np
10 | import torch
11 | from pprint import pprint
12 | sys.path.append('../')
13 | import affinity_approximation as losses
14 | 
15 | 
16 | def numpy_naive(vs, ys):
17 |     frobenius_np = np.mean(
18 |         np.array([np.linalg.norm(vs[b].dot(vs[b].T) -
19 |                                  ys[b].dot(ys[b].T))**2
20 |                   for b in np.arange(vs.shape[0])]))
21 |     return frobenius_np
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     batch_size = 1
26 |     num_tfs = 100
27 |     embedding_depth = 10
28 |     n_sources = 2
29 |     vs_np = np.random.rand(batch_size, num_tfs, embedding_depth)
30 |     ys_np = np.abs(np.random.rand(batch_size, num_tfs, n_sources))
31 |     vs = torch.from_numpy(vs_np)
32 |     ys = torch.from_numpy(ys_np)
33 | 
34 |     np_frobenius = numpy_naive(vs_np, ys_np)
35 |     naive_torch_frobenius = losses.frobenius_naive(vs, ys).data.numpy()
36 |     #
37 |     print("Numpy Frobenius: {}".format(np_frobenius))
38 |     print("Naive Torch Frobenius: {}".format(naive_torch_frobenius))
39 | 
40 |     assert np.abs(np_frobenius -
41 |                   naive_torch_frobenius) < 10e-5, 'Naive ' \
42 |                     'implementations of Frobenius norm should be equal'
43 | 
44 | 
45 | 
46 |     efficient_frobenius = losses.efficient_frobenius(vs, ys)
47 |     print("Efficient Frobenius: {}".format(efficient_frobenius))
48 | 
49 |     # assert np.abs(np_frobenius -
50 |     #               efficient_frobenius) < 10e-5, 'Efficient == Naive '
51 | 
52 |     paris_wtf = losses.naive(vs, ys)
53 |     print("Paris wtf: {}".format(paris_wtf))
54 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/models/simple_LSTM_encoder.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Simple LSTM encoder for embedding the input using a simple
 3 | LSTM architecture
 4 | 
 5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 6 | @copyright University of Illinois at Urbana Champaign
 7 | """
 8 | 
 9 | import torch
10 | import torch.nn as nn
11 | 
12 | class BLSTMEncoder(nn.Module):
13 |     def __init__(self,
14 |                  n_timesteps=250,
15 |                  n_features=257,
16 |                  num_layers=1,
17 |                  hidden_size=None,
18 |                  dropout=0.0,
19 |                  embedding_depth=None,
20 |                  bidirectional=True):
21 |         super(BLSTMEncoder, self).__init__()
22 | 
23 |         if n_timesteps is None or n_features is None:
24 |             raise ValueError("You have to define both the number of "
25 |                              "timesteps in each sequence and the "
26 |                              "number of features for each timestep.")
27 |         else:
28 |             self.emb_dim = n_features * embedding_depth
29 | 
30 |         self.embedding_depth = embedding_depth
31 |         self.hidden_size = hidden_size
32 |         self.n_timesteps = n_timesteps
33 |         if bidirectional:
34 |             self.n_directions = 2
35 |         else:
36 |             self.n_directions = 1
37 |         # assert len(self.hidden_sizes) == num_layers, 'Each layer ' \
38 |         #        'should be defined by a corresponding hidden size.'
39 |         self.rnn = nn.LSTM(input_size=n_features,
40 |                            num_layers=num_layers,
41 |                            hidden_size=self.hidden_size,
42 |                            bidirectional=bidirectional,
43 |                            dropout=dropout,
44 |                            batch_first=True)
45 |         self.affine = nn.Linear(self.n_directions*self.hidden_size,
46 |                                 self.emb_dim)
47 | 
48 |     def forward(self, x):
49 |         rnn_out, (hidden, states) = self.rnn(x)
50 |         nonl_embedding = torch.sigmoid(self.affine(rnn_out))
51 |         v = nonl_embedding.contiguous().view(x.size(0),
52 |                                              -1,
53 |                                              self.embedding_depth)
54 |         # return nn.functional.normalize(v, dim=-1, p=2)
55 |         return v
56 | 
57 | if __name__ == "__main__":
58 | 
59 |     model = BLSTMEncoder()
60 | 


--------------------------------------------------------------------------------
/spatial_two_mics/examples/mixture_example.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief A simple example of how a compact mixture should look like
 3 | 
 4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 5 | @copyright University of Illinois at Urbana Champaign
 6 | """
 7 | 
 8 | import os
 9 | import sys
10 | root_dir = os.path.join(
11 |     os.path.dirname(os.path.realpath(__file__)),
12 |     '../../')
13 | sys.path.insert(0, root_dir)
14 | 
15 | from spatial_two_mics.config import TIMIT_PATH
16 | import numpy as np
17 | 
18 | 
19 | def mixture_info_example():
20 |     ex = {'positions':
21 |                   {'amplitudes': np.array([0.73382382,
22 |                                                 0.26617618]),
23 |                    'd_thetas': np.array([1.06829948]),
24 |                    'distances': {'m1m1': 0.0,
25 |                                  'm1m2': 0.03,
26 |                                  'm1s1': 3.015,
27 |                                  'm1s2': 3.0072529608785676,
28 |                                  'm2m1': 0.03,
29 |                                  'm2m2': 0.0,
30 |                                  'm2s1': 2.985,
31 |                                  'm2s2': 2.9928046426867034,
32 |                                  's1m1': 3.015,
33 |                                  's1m2': 2.985,
34 |                                  's1s1': 0.0,
35 |                                  's1s2': 3.054656422155759,
36 |                                  's2m1': 3.0072529608785676,
37 |                                  's2m2': 2.9928046426867034,
38 |                                  's2s1': 3.054656422155759,
39 |                                  's2s2': 0.0},
40 |                    'taus': np.array([1.39941691, 0.67397403]),
41 |                    'thetas': np.array([0., 1.06829948]),
42 |                    'xy_positons': np.array([[3., 0.],
43 |                                          [1.44484569, 2.62914833]])},
44 |      'sources_ids': [{'gender': 'f',
45 |                      'sentence_id': 'sa1',
46 |                      'speaker_id': 'flbw0',
47 |                      'wav_path': os.path.join(TIMIT_PATH,
48 |                                  'test/dr4/flbw0/sa1.wav')},
49 |                     {'gender': 'm',
50 |                      'sentence_id': 'sa2',
51 |                      'speaker_id': 'mbns0',
52 |                      'wav_path': os.path.join(TIMIT_PATH,
53 |                                  'test/dr4/mbns0/sa2.wav')}
54 |                      ]}
55 | 
56 |     return ex
57 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/update_history.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief History and callback update functions
 3 | 
 4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 5 | @copyright University of illinois at Urbana Champaign
 6 | """
 7 | 
 8 | def values_update(list_of_pairs,
 9 |                   history_dic,
10 |                   update_mode='batch'):
11 |     """! Update the history dictionary for each key, value pair
12 |     INPLACE and stores values for batch and epoch
13 |     :param update_mode: In batch mode the values of the specific key
14 |     would be summed and in epoch mode would be averaged throughout
15 |     the batches.
16 |     :param list_of_pairs: list of tuples e.g. [('loss', 0.9987), ...,]
17 |     :param history_dic: a dictionary that we want to keep track for
18 |     a metric under all epochs
19 |     :return: history_dic updated with all the appropriate values for
20 |     batch and epoch
21 |     """
22 |     if update_mode == 'batch':
23 |         for k, v in list_of_pairs:
24 |             if not k+"_batch_total" in history_dic:
25 |                 history_dic[k] = []
26 |                 history_dic[k+"_batch_total"] = v
27 |                 history_dic[k + '_batch_counter'] = 1
28 |             else:
29 |                 history_dic[k + "_batch_total"] += v
30 |             history_dic[k+'_batch_counter'] += 1
31 |     elif update_mode == 'epoch':
32 |         for k, v in list_of_pairs:
33 |             history_dic[k].append(history_dic[k + "_batch_total"] /
34 |                                   history_dic[k + '_batch_counter'])
35 |             history_dic[k + "_batch_total"] = 0.
36 |             history_dic[k + '_batch_counter'] = 0
37 |     else:
38 |         raise NotImplementedError('Please use an update mode of epoch '
39 |                                   'or batch')
40 | 
41 |     return history_dic
42 | 
43 | 
44 | def update_best_performance(performance_dic,
45 |                             epoch,
46 |                             history_dic,
47 |                             buffer_size=0):
48 |     """! Update the history dictionary for the best performance so far
49 |     INPLACE and stores them in a list which has length equal to the
50 |     predefined buffer size
51 |     :return: history_dic updated with all the appropriate values for
52 |     the best performance so far
53 |     """
54 |     if 'best_performances' not in history_dic:
55 |         history_dic['best_performances'] = [(performance_dic, epoch)]
56 |     else:
57 |         history_dic['best_performances'].append((performance_dic,
58 |                                                  epoch))
59 |         history_dic['best_performances'] = \
60 |             sorted(history_dic['best_performances'],
61 |                    key=lambda x: x[0]['sdr'])[::-1][:buffer_size]
62 | 
63 |     return history_dic
64 | 


--------------------------------------------------------------------------------
/spatial_two_mics/labels_inference/ground_truth.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Infering the masking for eah tf bin independently based on the
 3 | maximum energy of the sources in each bin
 4 | 
 5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 6 | @copyright University of Illinois at Urbana Champaign
 7 | """
 8 | 
 9 | import numpy as np
10 | from pprint import pprint
11 | 
12 | 
13 | def infer_mask(mixture_info):
14 |     """
15 |     :param mixture_info:
16 |     mixture_info = {
17 |         'm1_raw': numpy array containing the raw m1 signal,
18 |         'm2_raw': numpy array containing the raw m2 signal,
19 |         'm1_tf': numpy array containing the m1 TF representation,
20 |         'm2_tf': numpy array containing the m2 TF representation,
21 |         'sources_raw': a list of numpy 1d vectors containing the
22 |         sources ,
23 |         'sources_tf': a list of numpy 2d vectors containing the
24 |          TF represeantations of the sources
25 |         'amplitudes': the weights that each source contributes to
26 |         the mixture of the second microphone
27 |     }
28 | 
29 |     :return: A tf 2d matrix corresponding to the dominating source
30 |     for each TF bin [0,1,...,n_sources]
31 |     """
32 |     sources_complex_spectra = mixture_info['sources_tf']
33 |     amplitudes = mixture_info['amplitudes']
34 |     n_sources = len(sources_complex_spectra)
35 | 
36 |     assert len(amplitudes) == n_sources, "Length of weights: {} " \
37 |                                          "should be equal to the " \
38 |                                          "number of sources: {}" \
39 |                                          "".format(len(amplitudes),
40 |                                                    n_sources)
41 | 
42 |     same_dimensions = [(sources_complex_spectra[i].shape ==
43 |                         sources_complex_spectra[0].shape)
44 |                        for i in np.arange(len(sources_complex_spectra))]
45 | 
46 |     assert all(same_dimensions), "All arrays should have the same " \
47 |                                  "dimensions. However, got sizes of {}"\
48 |                                  "".format([x.shape for x in
49 |                                             sources_complex_spectra])
50 | 
51 |     sources_complex_spectra = [amplitudes[i] * sources_complex_spectra[i]
52 |                                for i in np.arange(n_sources)]
53 | 
54 |     tf_real_sources = [np.abs(tf_complex)
55 |                        for tf_complex in sources_complex_spectra]
56 | 
57 |     mixture_tensor = np.dstack(tf_real_sources)
58 |     dominating_source = np.argmax(mixture_tensor, axis=2)
59 | 
60 |     zipped_tf_labels = dominating_source.astype(np.uint8)
61 | 
62 |     assert np.array_equal(dominating_source, zipped_tf_labels), \
63 |         "Zipping the numpy matrix should not yield different labels"
64 | 
65 |     return zipped_tf_labels
66 | 
67 | 
68 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # unsupervised_spatial_dc
 2 | Code for the paper: "Unsupervised Deep Clustering for Source Separation: Direct Learning from Mixtures using Spatial Information"
 3 | 
 4 | > Please cite as:
 5 | ```
 6 | @INPROCEEDINGS{8683201,
 7 |     author={E. {Tzinis} and S. {Venkataramani} and P. {Smaragdis}},
 8 |     booktitle={ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
 9 |     title={Unsupervised Deep Clustering for Source Separation: Direct Learning from Mixtures Using Spatial Information},
10 |     year={2019},
11 |     volume={},
12 |     number={},
13 |     pages={81-85},
14 |     keywords={pattern clustering;source separation;unsupervised learning;training process;ground truth separation information;direct learning;spatial information;monophonic source separation system;multichannel mixtures;unsupervised deep clustering approach;sound separation performance;multichannel recordings;Deep clustering;source separation;unsupervised learning},
15 |     doi={10.1109/ICASSP.2019.8683201},
16 |     ISSN={},
17 |     month={May},}
18 | ```
19 | 
20 | ## Disclaimer
21 | University of Illinois Open Source License
22 | 
23 | Copyright © 2018, University of Illinois at Urbana Champaign. All rights reserved.
24 | 
25 | Developed by: Efthymios Tzinis 1, Shrikant Venkataramani 1, Paris Smaragdis 1,2
26 | 
27 | 1: University of Illinois at Urbana-Champaign, 
28 | 2: Adobe Research 
29 | 
30 | This work was supported by NSF grant 1453104. 
31 | Paper link: https://doi.org/10.1109/ICASSP.2019.8683201
32 | 
33 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. Neither the names of Computational Audio Group, University of Illinois at Urbana-Champaign, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
34 | 


--------------------------------------------------------------------------------
/spatial_two_mics/labels_inference/duet_mask_estimation.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Infering the masking for eah tf bin based on DUET features,
 3 | mainly phase difference and after that a robust K-means estimation
 4 | 
 5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 6 | @copyright University of Illinois at Urbana Champaign
 7 | """
 8 | 
 9 | import numpy as np
10 | import sys
11 | root_dir = '../../'
12 | sys.path.insert(0, root_dir)
13 | from spatial_two_mics.utils import robust_means_clustering as  \
14 |      robust_kmeans
15 | 
16 | 
17 | def infer_mask(mixture_info,
18 |                return_phase_features=False):
19 |     """
20 |     :param mixture_info:
21 |     mixture_info = {
22 |         'm1_raw': numpy array containing the raw m1 signal,
23 |         'm2_raw': numpy array containing the raw m2 signal,
24 |         'm1_tf': numpy array containing the m1 TF representation,
25 |         'm2_tf': numpy array containing the m2 TF representation,
26 |         'sources_raw': a list of numpy 1d vectors containing the
27 |         sources ,
28 |         'sources_tf': a list of numpy 2d vectors containing the
29 |          TF represeantations of the sources ,
30 |         'amplitudes': the weights that each source contributes to
31 |         the mixture of the second microphone
32 |     }
33 | 
34 |     :return: A tf 2d matrix corresponding to the dominating source
35 |     for each TF bin [0,1,...,n_sources]
36 |     """
37 |     sources_complex_spectra = mixture_info['sources_tf']
38 |     amplitudes = mixture_info['amplitudes']
39 |     n_sources = len(sources_complex_spectra)
40 | 
41 |     assert len(amplitudes) == n_sources, "Length of weights: {} " \
42 |                                          "should be equal to the " \
43 |                                          "number of sources: {}" \
44 |                                          "".format(len(amplitudes),
45 |                                                    n_sources)
46 | 
47 |     same_dimensions = [(sources_complex_spectra[i].shape ==
48 |                         sources_complex_spectra[0].shape)
49 |                        for i in np.arange(len(sources_complex_spectra))]
50 | 
51 |     assert all(same_dimensions), "All arrays should have the same " \
52 |                                  "dimensions. However, got sizes of {}"\
53 |                                  "".format([x.shape for x in
54 |                                             sources_complex_spectra])
55 | 
56 |     r = mixture_info['m1_tf'] / (mixture_info['m2_tf'] + 1e-7)
57 |     phase_dif = np.angle(r) / np.linspace(1e-5, np.pi,
58 |                               mixture_info['m1_tf'].shape[0])[:, None]
59 | 
60 |     d_feature = np.reshape(phase_dif, (np.product(phase_dif.shape), 1))
61 |     r_kmeans = robust_kmeans.RobustKmeans(n_true_clusters=n_sources,
62 |                                           n_used_clusters=n_sources+3)
63 |     d_labels = r_kmeans.fit(d_feature, cut_outlier_in_norm=2.)
64 |     d_feature_mask = np.reshape(d_labels, phase_dif.shape)
65 | 
66 |     zipped_tf_labels = d_feature_mask.astype(np.uint8)
67 | 
68 |     assert np.array_equal(d_feature_mask, zipped_tf_labels), \
69 |         "Zipping the numpy matrix should not yield different labels"
70 | 
71 |     if return_phase_features:
72 |         return zipped_tf_labels, phase_dif
73 | 
74 |     return zipped_tf_labels
75 | 
76 | 
77 | 


--------------------------------------------------------------------------------
/spatial_two_mics/data_loaders/timit.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Dataloader for timit dataset in order to store in an internal
  3 | python dictionary structure the whole timit dataset.
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of illinois at Urbana Champaign
  7 | """
  8 | 
  9 | import os
 10 | import sys
 11 | import scipy.io.wavfile as wavfile
 12 | import glob2
 13 | import numpy as np
 14 | 
 15 | root_dir = os.path.join(
 16 |            os.path.dirname(os.path.realpath(__file__)),
 17 |            '../../')
 18 | sys.path.insert(0, root_dir)
 19 | 
 20 | from spatial_two_mics.config import TIMIT_PATH
 21 | 
 22 | 
 23 | class TimitLoader(object):
 24 |     def __init__(self,
 25 |                  normalize_audio_by_std=True):
 26 |         self.dataset_path = TIMIT_PATH
 27 |         self.normalize_audio_by_std = normalize_audio_by_std
 28 | 
 29 |     def get_all_wavs(self, path):
 30 |         data_dic = {}
 31 |         print("Searching inside: {}...".format(path))
 32 |         dialects = os.listdir(path)
 33 |         for dial in dialects:
 34 |             if dial.startswith('.'):
 35 |                 continue
 36 |             d_path = os.path.join(path, dial)
 37 |             speakers = os.listdir(os.path.join(d_path))
 38 |             for speaker in speakers:
 39 |                 if speaker.startswith('.'):
 40 |                     continue
 41 |                 speaker_path = os.path.join(d_path, speaker)
 42 |                 wavs_paths = glob2.glob(os.path.join(speaker_path,
 43 |                                                      '*.wav'))
 44 | 
 45 |                 speaker_wavs = [list(wavfile.read(wav_p)) + [wav_p]
 46 |                                 for wav_p in wavs_paths]
 47 | 
 48 |                 if self.normalize_audio_by_std:
 49 |                     speaker_wavs = [(sr, wav / np.std(wav), wav_p)
 50 |                                     for (sr, wav, wav_p) in speaker_wavs]
 51 | 
 52 |                 speaker_wavs = [(wav_p.split('/')[-1].split('.wav')[0],
 53 |                                 {'wav': wav, 'sr': sr, 'path': wav_p})
 54 |                                 for (sr, wav, wav_p) in speaker_wavs]
 55 | 
 56 |                 speaker_gender = speaker[0]
 57 |                 data_dic[speaker] = {
 58 |                     'dialect': dial,
 59 |                     'gender': speaker_gender,
 60 |                     'sentences': dict(speaker_wavs)
 61 |                 }
 62 | 
 63 |         return data_dic
 64 | 
 65 |     def load(self):
 66 |         """
 67 |         Loading all the data inside a dictionary like the one below:
 68 | 
 69 |         {
 70 |         'train':
 71 |             'speaker_id_i': {
 72 |                 'dialect': which dialect the speaker belongs to,
 73 |                 'gender': f or m,
 74 |                 'sentences': {
 75 |                     'sentence_id_j': {
 76 |                         'wav': wav_on_a_numpy_matrix,
 77 |                         'sr': Fs in Hz integer,
 78 |                         'path': PAth of the located wav
 79 |                     }
 80 |                 }
 81 |             }
 82 | 
 83 |         * the same applies for test speakers
 84 |         }
 85 | 
 86 |         :return: Dictionary
 87 |         """
 88 |         data_dic = {'train': {},
 89 |                     'test': {}
 90 |                     }
 91 | 
 92 |         for chunk in data_dic:
 93 |             wavs_path = os.path.join(self.dataset_path, chunk)
 94 |             all_wavs_dic = self.get_all_wavs(wavs_path)
 95 |             data_dic[chunk] = all_wavs_dic
 96 | 
 97 |         return data_dic
 98 | 
 99 | 
100 | if __name__ == "__main__":
101 |     print("Loading TIMIT Dataset from {}...".format(TIMIT_PATH))
102 |     timit_loader = TimitLoader()
103 |     timit_data = timit_loader.load()


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/evaluation/naive_evaluation_numpy.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief A naive implementation of how we evaluate the masks that are
  3 | derived --> reconstruct the source signals and also extract SDR,
  4 | SIR and SBR for the reconstructed signals and the true signals
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of illinois at Urbana Champaign
  8 | """
  9 | 
 10 | import numpy as np
 11 | import librosa
 12 | 
 13 | 
 14 | def bss_eval(sep, i, sources):
 15 |     # Current target
 16 |     min_len = min([len(sep), len(sources[i])])
 17 |     sources = sources[:, :min_len]
 18 |     sep = sep[:min_len]
 19 |     target = sources[i]
 20 | 
 21 |     # Target contribution
 22 |     s_target = target * np.dot(target, sep.T) / np.dot(target, target.T)
 23 | 
 24 |     # Interference contribution
 25 |     pse = np.dot(np.dot( sources, sep.T),
 26 |     np.linalg.inv(np.dot( sources, sources.T))).T.dot( sources)
 27 |     e_interf = pse - s_target
 28 | 
 29 |     # Artifact contribution
 30 |     e_artif = sep - pse
 31 | 
 32 |     # Interference + artifacts contribution
 33 |     e_total = e_interf + e_artif
 34 | 
 35 |     # Computation of the log energy ratios
 36 |     sdr = 10*np.log10(sum(s_target**2) / sum(e_total**2));
 37 |     sir = 10*np.log10(sum(s_target**2) / sum(e_interf**2));
 38 |     sar = 10*np.log10(sum((s_target + e_interf)**2) / sum(e_artif**2));
 39 | 
 40 |     # Done!
 41 |     return sdr, sir, sar
 42 | 
 43 | 
 44 | def naive_cpu_bss_eval(embedding_labels,
 45 |                        mix_real_tf,
 46 |                        mix_imag_tf,
 47 |                        sources_raw,
 48 |                        n_sources,
 49 |                        batch_index=0):
 50 | 
 51 |     mix_stft = mix_real_tf + 1j*mix_imag_tf
 52 | 
 53 |     if mix_stft.shape == embedding_labels.shape:
 54 |         embedding_clustered = embedding_labels
 55 |     else:
 56 |         embedding_clustered = embedding_labels.reshape(
 57 |                               mix_stft.shape[::-1]).T
 58 | 
 59 |     sdr_t, sir_t, sar_t = 0., 0., 0.
 60 |     for i in np.arange(n_sources):
 61 |         embed_mask = mix_stft*(embedding_clustered == i)
 62 |         reconstructed = librosa.core.istft(embed_mask,
 63 |                                            hop_length=128,
 64 |                                            win_length=512)
 65 |         bss_results = [bss_eval(reconstructed, j, sources_raw)
 66 |                        for j in np.arange(n_sources)]
 67 | 
 68 |         sdr, sir, sar = sorted(bss_results, key=lambda x: x[0])[-1]
 69 |         sdr_t += sdr
 70 |         sir_t += sir
 71 |         sar_t += sar
 72 | 
 73 |         # save_p = '/home/thymios/wavs/'
 74 |         # wav_p = os.path.join(save_p,
 75 |         #                      'batch_{}_source_{}'.format(
 76 |         #                          batch_index + 1, i + 1))
 77 |         # librosa.output.write_wav(wav_p, reconstructed, 16000)
 78 | 
 79 |     return sdr_t/n_sources, sir_t/n_sources, sar_t/n_sources
 80 | 
 81 | 
 82 | def mixture_bss_eval(mix_real_tf,
 83 |                      mix_imag_tf,
 84 |                      sources_raw,
 85 |                      n_sources):
 86 | 
 87 |     mix_stft = mix_real_tf + 1j*mix_imag_tf
 88 | 
 89 |     reconstructed = librosa.core.istft(mix_stft,
 90 |                                        hop_length=128,
 91 |                                        win_length=512)
 92 |     bss_results = [bss_eval(reconstructed, j, sources_raw)
 93 |                    for j in np.arange(n_sources)]
 94 | 
 95 |     (sdrs, sirs, sars) = (np.array([x[0] for x in bss_results]),
 96 |                           np.array([x[1] for x in bss_results]),
 97 |                           np.array([x[2] for x in bss_results]))
 98 | 
 99 |     return np.mean(sdrs), np.mean(sirs), np.mean(sars)
100 | 


--------------------------------------------------------------------------------
/spatial_two_mics/utils/robust_means_clustering.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief This utility serves as a level of abstraction in order to
  3 | construct audio mixtures
  4 | 
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of Illinois at Urbana Champaign
  8 | """
  9 | 
 10 | from pprint import pprint
 11 | from sklearn.cluster import KMeans
 12 | import numpy as np
 13 | 
 14 | 
 15 | class RobustKmeans(object):
 16 |     def __init__(self,
 17 |                  n_true_clusters=2,
 18 |                  n_used_clusters=4):
 19 |         """!
 20 |         Sometimes K-means creates clusters around outlier groups which
 21 |         should not be the case. For this reason we run K-means with
 22 |         n_used_clusters > n_true_clusters and then we assign at the most
 23 |         probable n_true_clusters the residual clusters
 24 | 
 25 |         :param n_true_clusters: the true number of clusters we wanna
 26 |         cluster the data at the end
 27 |         :param n_used_clusters: The amount of clusters that will be used
 28 |         in total for running kmeans and after that the residual would be
 29 |         assigned in the top most prior n_true_clusters
 30 |         """
 31 | 
 32 |         self.N_true = n_true_clusters
 33 |         self.N_used = n_used_clusters
 34 |         self.kmeans_obj = KMeans(n_clusters=self.N_used,
 35 |                                  random_state=7)
 36 | 
 37 |     def fit(self, x, cut_outlier_in_norm=2.):
 38 |         """!
 39 |         robust clustering for the input x
 40 | 
 41 |         :param x: nd array with shape: (n_samples, n_features)
 42 | 
 43 |         :return cluster_labels: 1d array with the corresponding
 44 |         labels from 0 to self.N_true - 1
 45 |         """
 46 | 
 47 |         if cut_outlier_in_norm is not None:
 48 |             robust_points = x[np.where(np.linalg.norm(x, axis=1) <=
 49 |                               cut_outlier_in_norm), :][0]
 50 | 
 51 |             fitted_centers = self.kmeans_obj.fit(robust_points)
 52 |             clustered = self.kmeans_obj.predict(x)
 53 |         else:
 54 |             fitted_centers = self.kmeans_obj.fit(x)
 55 |             clustered = fitted_centers.labels_
 56 | 
 57 |         cluster_coordinates = fitted_centers.cluster_centers_
 58 | 
 59 |         priors = np.bincount(clustered)
 60 |         cl_indexes = np.argsort(priors)
 61 |         true_clusters = cl_indexes[self.N_used - self.N_true:]
 62 | 
 63 |         fitted_centers.cluster_centers_ = cluster_coordinates[
 64 |                                           true_clusters]
 65 | 
 66 |         # make the new prediction with the new clusters
 67 |         robust_estimation = fitted_centers.predict(x)
 68 | 
 69 |         return robust_estimation
 70 | 
 71 |     def fit_predict(self, x, cut_outlier_in_norm=2.):
 72 |         """!
 73 |         robust clustering for the input x
 74 | 
 75 |         :param x: nd array with shape: (n_samples, n_features)
 76 | 
 77 |         :return cluster_labels: 1d array with the corresponding
 78 |         labels from 0 to self.N_true - 1
 79 |         """
 80 |         return self.fit(x, cut_outlier_in_norm=cut_outlier_in_norm)
 81 | 
 82 | 
 83 | def example_of_usage():
 84 |     """!
 85 |     How the class of Audio mixtures should be called"""
 86 | 
 87 |     from sklearn.datasets import load_iris
 88 |     data = load_iris()
 89 |     x = data.data
 90 |     y = data.target
 91 |     x /= np.linalg.norm(x)
 92 | 
 93 |     robust_clusterer = RobustKmeans(n_true_clusters=3,
 94 |                                     n_used_clusters=3)
 95 |     pred = robust_clusterer.fit(x)
 96 |     print("Using 3 True Clusters and 3 for Prediction: {}".format(pred))
 97 | 
 98 |     robust_clusterer = RobustKmeans(n_true_clusters=3,
 99 |                                     n_used_clusters=5)
100 |     pred = robust_clusterer.fit(x)
101 |     print("Using 3 True Clusters and 5 for Prediction: {}".format(pred))
102 | 
103 | if __name__ == "__main__":
104 |     example_of_usage()


--------------------------------------------------------------------------------
/spatial_two_mics/data_generator/parallel_dataset_creation.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Create datasets for the experiments by individually assign
 3 | them as jobs in different processors
 4 | 
 5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 6 | @copyright University of illinois at Urbana Champaign
 7 | """
 8 | 
 9 | import argparse
10 | import os
11 | import sys
12 | import itertools
13 | import copy
14 | from pprint import pprint
15 | root_dir = os.path.join(
16 |     os.path.dirname(os.path.realpath(__file__)),
17 |     '../../')
18 | sys.path.insert(0, root_dir)
19 | from joblib import Parallel, delayed
20 | import spatial_two_mics.data_generator.data_creator_and_storage_v2 as\
21 |     dataset_generator
22 | 
23 | 
24 | def generate_one_dataset_wrapper(this_dataset_args):
25 |     dataset_generator.generate_dataset(this_dataset_args)
26 |     return 1
27 | 
28 | 
29 | def generate_datasets(args):
30 |     genders = list(map(list, args.genders))
31 |     n_sources = args.n_sources
32 | 
33 |     dataset_combinations = list(itertools.product(*[genders,
34 |                                                     n_sources]))
35 | 
36 |     specific_args = []
37 |     for (gndrs, sources) in dataset_combinations:
38 |         this_args = copy.deepcopy(args)
39 |         this_args.n_sources = sources
40 |         this_args.genders = gndrs
41 |         del this_args.n_jobs
42 |         specific_args.append(this_args)
43 | 
44 |     pprint(specific_args)
45 | 
46 |     created_datasets = Parallel(n_jobs=args.n_jobs)(
47 |                        [delayed(generate_one_dataset_wrapper)(this_args)
48 |                         for this_args in specific_args])
49 | 
50 |     print("Successfully created: {} datasets".format(
51 |           sum(created_datasets)))
52 | 
53 |     return True
54 | 
55 | 
56 | def get_args():
57 |     """! Command line parser """
58 |     parser = argparse.ArgumentParser(description='Parallel Mixture '
59 |                                                  'datasets creator')
60 |     parser.add_argument("--dataset", type=str,
61 |                         help="Dataset name", default="timit")
62 |     parser.add_argument("--n_sources", type=int, nargs='+',
63 |                         help="How many sources in each mix", default=2)
64 |     parser.add_argument("--n_samples", type=int, nargs='+',
65 |                         help="How many samples do u want to be "
66 |                              "created",
67 |                         default=[1, 1, 1])
68 |     parser.add_argument("--genders", type=str, nargs='+',
69 |                         help="Genders that will correspond to the "
70 |                              "genders in the mixtures",
71 |                         default=['m'], choices=['m', 'f', 'fm', 'mf'])
72 |     parser.add_argument("-o", "--output_path", type=str,
73 |                         help="""The path that the resulting dataset 
74 |                         would be stored. If the folder does not 
75 |                         exist it will be created as well as its 
76 |                         child folders train or test and val if it is 
77 |                         selected""",
78 |                         required=True)
79 |     parser.add_argument("-f", "--force_delays", nargs='+', type=int,
80 |                         help="""Whether you want to force integer 
81 |                         delays of +- 1 in the sources e.g.""",
82 |                         default=None)
83 |     parser.add_argument('--val_set', action="store_true",
84 |                         help='Force to create a separate val folder '
85 |                              'with the same amount of the mixtures as '
86 |                              'the initial test/train folder but using '
87 |                              'half of the available speakers')
88 |     parser.add_argument("--n_jobs", type=int,
89 |                         help="Number of parallel spawning jobs",
90 |                         default=1)
91 |     return parser.parse_args()
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     args = get_args()
96 |     generate_datasets(args)


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/experiment_command_line_parser.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Command line parser for experiments
 3 | 
 4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 5 | @copyright University of illinois at Urbana Champaign
 6 | """
 7 | 
 8 | import argparse
 9 | 
10 | def get_args():
11 |     """! Command line parser for experiments"""
12 |     parser = argparse.ArgumentParser(description='Deep Clustering for '
13 |                                                  'Audio Source '
14 |                                                  'Separation '
15 |                                                  'Experiment')
16 |     parser.add_argument("--dataset", type=str,
17 |                         help="Dataset name",
18 |                         default="timit")
19 |     parser.add_argument("--n_sources", type=int,
20 |                         help="How many sources in each mix",
21 |                         default=2)
22 |     parser.add_argument("--n_samples", type=int, nargs='+',
23 |                         help="How many samples do u want to be "
24 |                              "created for train test val",
25 |                         default=[256, 64, 128])
26 |     parser.add_argument("--genders", type=str, nargs='+',
27 |                         help="Genders that will correspond to the "
28 |                              "genders in the mixtures",
29 |                         default=['m'])
30 |     parser.add_argument("-f", "--force_delays", nargs='+', type=int,
31 |                         help="""Whether you want to force integer 
32 |                         delays of +- 1 in the sources e.g.""",
33 |                         default=[-1, 1])
34 |     parser.add_argument("-nl", "--n_layers", type=int,
35 |                         help="""The number of layers of the LSTM 
36 |                         encoder""", default=2)
37 |     parser.add_argument("-ed", "--embedding_depth", type=int,
38 |                         help="""The depth of the embedding""",
39 |                         default=10)
40 |     parser.add_argument("-hs", "--hidden_size", type=int,
41 |                         help="""The size of the LSTM cells """,
42 |                         default=10)
43 |     parser.add_argument("-bs", "--batch_size", type=int,
44 |                         help="""The number of samples in each batch""",
45 |                         default=64)
46 |     parser.add_argument("-name", "--experiment_name", type=str,
47 |                         help="""The name or identifier of this 
48 |                         experiment""",
49 |                         default='A sample experiment'),
50 |     parser.add_argument("-mt", "--labels_mask", type=str,
51 |                         help="""The type of masks that you want to 
52 |                         use -- 'ground_truth' or 'duet'""",
53 |                         default='duet')
54 |     parser.add_argument("-cad", "--cuda_available_devices", type=int,
55 |                         nargs="+",
56 |                         help="""A list of Cuda IDs that would be 
57 |                         available for runnign this experiment""",
58 |                         default=[0])
59 |     parser.add_argument("--num_workers", type=int,
60 |                         help="""The number of cpu workers for 
61 |                         loading the data, etc.""", default=3)
62 |     parser.add_argument("--epochs", type=int,
63 |                         help="""The number of epochs that the 
64 |                         experiment should run""", default=50)
65 |     parser.add_argument("--evaluate_per", type=int,
66 |                         help="""The number of trianing epochs in 
67 |                         order to run an evaluation""", default=5)
68 |     parser.add_argument("--n_eval", type=int,
69 |                         help="""Reduce the number of eavluation 
70 |                         samples to this number.""", default=256)
71 |     parser.add_argument("-lr", "--learning_rate", type=float,
72 |                         help="""Initial Learning rate""", default=1e-3)
73 |     parser.add_argument("--bidirectional", action='store_true',
74 |                         help="""Bidirectional or not""")
75 | 
76 |     return parser.parse_args()


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/prob_estimation_initial_SDR.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Initial SDR all measurements and not only stat values
  3 | 
  4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  5 | @copyright University of illinois at Urbana Champaign
  6 | """
  7 | 
  8 | import argparse
  9 | import os
 10 | import sys
 11 | import numpy as np
 12 | from pprint import pprint
 13 | import joblib
 14 | 
 15 | root_dir = os.path.join(
 16 |     os.path.dirname(os.path.realpath(__file__)),
 17 |     '../../../')
 18 | sys.path.insert(0, root_dir)
 19 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader
 20 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval
 21 | from spatial_two_mics.config import FINAL_RESULTS_DIR
 22 | 
 23 | 
 24 | def eval(data_generator,
 25 |          dataset_path):
 26 | 
 27 |     data_dir = os.path.dirname(dataset_path)
 28 |     info = os.path.basename(data_dir)
 29 |     n_sources = int(info.split('_')[4])
 30 | 
 31 |     eval_dic = {'sdr': [], 'sir': [], 'sar': []}
 32 | 
 33 |     for batch_data in data_generator:
 34 |         abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
 35 | 
 36 |         for b in np.arange(abs_tfs.size(0)):
 37 | 
 38 |             sdr, sir, sar = np_eval.mixture_bss_eval(
 39 |                 real_tfs[b].data.numpy(),
 40 |                 imag_tfs[b].data.numpy(),
 41 |                 wavs_lists[b].data.numpy(),
 42 |                 n_sources)
 43 | 
 44 |             eval_dic['sdr'].append(sdr)
 45 |             eval_dic['sir'].append(sir)
 46 |             eval_dic['sar'].append(sar)
 47 | 
 48 |     # return all values
 49 |     result_dic = {}
 50 |     for k, v in eval_dic.items():
 51 |         result_dic[k] = np.array(v)
 52 | 
 53 |     return result_dic
 54 | 
 55 | 
 56 | def evaluate_bss_metrics(dataset_folder,
 57 |                          n_jobs=1,
 58 |                          get_top=None):
 59 | 
 60 |     (dataset_dir, partition) = (os.path.dirname(dataset_folder),
 61 |                                 os.path.basename(dataset_folder))
 62 | 
 63 |     assert partition == 'test' or partition == 'val', '' \
 64 |            'All selected dataset folder to be evaluated have either ' \
 65 |            'to be test or val folder from a certain dataset!'
 66 | 
 67 |     print("Initializing the data loaders for all the datasets...")
 68 |     val_generator, n_val_batches = \
 69 |         data_loader.get_data_generator(
 70 |                         dataset_dir, partition=partition,
 71 |                         get_top=get_top, num_workers=1,
 72 |                         return_stats=False,
 73 |                         return_n_batches=True,
 74 |                         only_mask_evaluation=True)
 75 | 
 76 |     result_dic = eval(val_generator,
 77 |                       os.path.join(dataset_dir, partition))
 78 | 
 79 |     return result_dic
 80 | 
 81 | 
 82 | def get_args():
 83 |     """! Command line parser for computing the evaluation for
 84 |     specific datasets"""
 85 |     parser = argparse.ArgumentParser(description='Evaluating'
 86 |              ' initial SDR SAR and SIR for datasets')
 87 |     parser.add_argument("-i", "--dataset_folders", type=str, nargs='+',
 88 |                         help="Dataset paths you want to evaluate",
 89 |                         default=None)
 90 |     parser.add_argument("--n_jobs", type=int,
 91 |                         help="Number of parallel spawinign jobs",
 92 |                         default=1)
 93 |     parser.add_argument("--n_eval", type=int,
 94 |                         help="""Reduce the number of evaluation 
 95 |                             samples to this number.""", default=None)
 96 |     return parser.parse_args()
 97 | 
 98 | 
 99 | if __name__ == "__main__":
100 |     args = get_args()
101 | 
102 |     for dataset_folder in args.dataset_folders:
103 |         (dataset_dir, partition) = (os.path.dirname(dataset_folder),
104 |                                     os.path.basename(dataset_folder))
105 | 
106 |         eval_results = evaluate_bss_metrics(dataset_folder,
107 |                                             n_jobs=args.n_jobs,
108 |                                             get_top=args.n_eval)
109 | 
110 |         pprint(eval_results)
111 | 
112 |         test_on = os.path.basename(dataset_dir) + '_' + partition
113 |         save_folder_name = os.path.join(FINAL_RESULTS_DIR,
114 |                                         'test_on_' + test_on)
115 |         if not os.path.exists(save_folder_name):
116 |             os.makedirs(save_folder_name)
117 | 
118 |         file_path = os.path.join(save_folder_name,
119 |                                  'initial_mixture_metrics.gz')
120 | 
121 |         joblib.dump(eval_results, file_path)
122 | 


--------------------------------------------------------------------------------
/spatial_two_mics/labels_inference/tf_label_estimator.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief An estimator of TF masks depending on Blind Source Separation
  3 | Algorithms or even the energy in each bin (Ground Truth).
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of Illinois at Urbana Champaign
  7 | """
  8 | 
  9 | import numpy as np
 10 | import os
 11 | import sys
 12 | from pprint import pprint
 13 | 
 14 | root_dir = os.path.join(
 15 |     os.path.dirname(os.path.realpath(__file__)),
 16 |     '../../')
 17 | sys.path.insert(0, root_dir)
 18 | import spatial_two_mics.labels_inference.ground_truth as gt_inference
 19 | import spatial_two_mics.labels_inference.duet_mask_estimation as \
 20 |     duet_kmeans_inference
 21 | 
 22 | 
 23 | class TFMaskEstimator(object):
 24 |     """
 25 |     This is a general compatible class for encapsulating the label
 26 |     inference / a TF max for mixtures of signals coming from 2
 27 |     microphones.
 28 |     """
 29 |     def __init__(self,
 30 |                  inference_method=None,
 31 |                  return_duet_raw_features=False):
 32 |         if inference_method.lower() == "ground_truth":
 33 |             self.label_inference = gt_inference
 34 |         elif inference_method.lower() == "duet_kmeans":
 35 |             self.label_inference = duet_kmeans_inference
 36 |         else:
 37 |             raise NotImplementedError("Inference Method: {} is not yet "
 38 |                   "implemented.".format(inference_method))
 39 | 
 40 |         self.return_duet_raw_features = return_duet_raw_features
 41 | 
 42 |     def infer_mixture_labels(self,
 43 |                              mixture_info):
 44 |         """
 45 |         :param mixture_info:
 46 |         mixture_info = {
 47 |             'm1_raw': numpy array containing the raw m1 signal,
 48 |             'm2_raw': numpy array containing the raw m2 signal,
 49 |             'm1_tf': numpy array containing the m1 TF representation,
 50 |             'm2_tf': numpy array containing the m2 TF representation,
 51 |             'sources_raw': a list of numpy 1d vectors containing the
 52 |             sources ,
 53 |             'sources_tf': a list of numpy 2d vectors containing the
 54 |              TF represeantations of the sources ,
 55 |             'delayed_sources_raw': a list of numpy 1d vectors containing
 56 |             the sources delayed with some tau,
 57 |             'delayed_sources_tf': a list of numpy 2d vectors
 58 |             containing the TF representations of the delayed signals,
 59 |             'amplitudes': the weights that each source contributes to
 60 |             the mixture of the second microphone
 61 |         }
 62 | 
 63 |         :return: A TF representation with each TF bin to correspond
 64 |         to the source which the algorithm predicts that is dominating
 65 |         """
 66 | 
 67 |         if self.return_duet_raw_features:
 68 |             return self.label_inference.infer_mask(mixture_info,
 69 |                         return_phase_features=True)
 70 |         else:
 71 |             return self.label_inference.infer_mask(mixture_info)
 72 | 
 73 | 
 74 | def example_of_usage():
 75 |     """!
 76 |     How the class of Audio mixtures should be called"""
 77 | 
 78 |     import os
 79 |     import sys
 80 |     root_dir = os.path.join(
 81 |         os.path.dirname(os.path.realpath(__file__)),
 82 |         '../../')
 83 |     sys.path.insert(0, root_dir)
 84 |     import spatial_two_mics.examples.mixture_example as me
 85 |     import spatial_two_mics.utils.audio_mixture_constructor as \
 86 |         mix_constructor
 87 | 
 88 |     mixture_info = me.mixture_info_example()
 89 |     mixture_creator = mix_constructor.AudioMixtureConstructor(
 90 |         n_fft=1024, win_len=400, hop_len=200, mixture_duration=2.0,
 91 |         force_delays=[-1, 1])
 92 | 
 93 |     tf_mixtures = mixture_creator.construct_mixture(mixture_info)
 94 | 
 95 |     duet_estimator = TFMaskEstimator(inference_method='duet_Kmeans')
 96 | 
 97 |     tf_labels = duet_estimator.infer_mixture_labels(tf_mixtures)
 98 |     print("DUET Kmeans")
 99 |     pprint(tf_labels.shape)
100 | 
101 |     ground_truth_estimator = TFMaskEstimator(
102 |         inference_method='ground_truth')
103 | 
104 |     gt_labels = ground_truth_estimator.infer_mixture_labels(tf_mixtures)
105 |     print("Ground Truth")
106 |     pprint(gt_labels.shape)
107 | 
108 |     n_bins = np.product(gt_labels.shape)
109 |     print("Estimation differs at {} out of {} points".format(
110 |           min(np.count_nonzero(abs(gt_labels-tf_labels)),
111 |               n_bins - np.count_nonzero(abs(gt_labels - tf_labels))),
112 |           n_bins))
113 | 
114 | 
115 | if __name__ == "__main__":
116 |     example_of_usage()
117 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/prob_estimation_ground_truth_masks.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief sourse separation performance all eval values
  3 | 
  4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  5 | @copyright University of illinois at Urbana Champaign
  6 | """
  7 | 
  8 | import argparse
  9 | import os
 10 | import sys
 11 | import numpy as np
 12 | from pprint import pprint
 13 | from joblib import Parallel, delayed
 14 | from tqdm import tqdm
 15 | import itertools
 16 | import joblib
 17 | 
 18 | root_dir = os.path.join(
 19 |     os.path.dirname(os.path.realpath(__file__)),
 20 |     '../../../')
 21 | sys.path.insert(0, root_dir)
 22 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader
 23 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval
 24 | from spatial_two_mics.config import FINAL_RESULTS_DIR
 25 | 
 26 | 
 27 | def eval(data_generator,
 28 |          dataset_path):
 29 | 
 30 |     data_dir = os.path.dirname(dataset_path)
 31 |     info = os.path.basename(data_dir)
 32 |     n_sources = int(info.split('_')[4])
 33 | 
 34 |     eval_dic = {'sdr': [], 'sir': [], 'sar': []}
 35 | 
 36 |     for batch_data in data_generator:
 37 |         abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
 38 | 
 39 |         for b in np.arange(abs_tfs.size(0)):
 40 |             embedding_labels = masks[b].data.numpy()
 41 | 
 42 |             sdr, sir, sar = np_eval.naive_cpu_bss_eval(
 43 |                 embedding_labels,
 44 |                 real_tfs[b].data.numpy(),
 45 |                 imag_tfs[b].data.numpy(),
 46 |                 wavs_lists[b].data.numpy(),
 47 |                 n_sources,
 48 |                 batch_index=b)
 49 | 
 50 |             eval_dic['sdr'].append(sdr)
 51 |             eval_dic['sir'].append(sir)
 52 |             eval_dic['sar'].append(sar)
 53 | 
 54 |     # return all values
 55 |     result_dic = {}
 56 |     for k, v in eval_dic.items():
 57 |         result_dic[k] = np.array(v)
 58 | 
 59 |     return result_dic
 60 | 
 61 | 
 62 | def evaluate_labels(dataset_folder,
 63 |                     n_jobs=1,
 64 |                     get_top=None):
 65 |     (dataset_dir, partition) = (os.path.dirname(dataset_folder),
 66 |                                 os.path.basename(dataset_folder))
 67 | 
 68 |     assert partition == 'test' or partition == 'val', '' \
 69 |            'All selected dataset folder to be evaluated have either ' \
 70 |            'to be test or val folder from a certain dataset!'
 71 | 
 72 |     eval_results={}
 73 |     for eval_labels in ['duet', 'ground_truth']:
 74 |         val_generator, n_val_batches = \
 75 |             data_loader.get_data_generator(
 76 |                             dataset_dir, partition=partition,
 77 |                             get_top=get_top, num_workers=1,
 78 |                             return_stats=False, labels_mask=eval_labels,
 79 |                             return_n_batches=True,
 80 |                             only_mask_evaluation=True)
 81 | 
 82 |         eval_results[eval_labels] = eval(val_generator,
 83 |                                          os.path.join(dataset_dir,
 84 |                                                       partition))
 85 | 
 86 |     return eval_results
 87 | 
 88 | 
 89 | def get_args():
 90 |     """! Command line parser for computing the evaluation for
 91 |     specific datasets"""
 92 |     parser = argparse.ArgumentParser(description='Evaluating'
 93 |              ' groundtruth or duet labels for datasets folders')
 94 |     parser.add_argument("-i", "--dataset_folders", type=str, nargs='+',
 95 |                         help="Dataset paths you want to evaluate",
 96 |                         default=[])
 97 |     parser.add_argument("--n_jobs", type=int,
 98 |                         help="Number of parallel spawinign jobs",
 99 |                         default=1)
100 |     parser.add_argument("--n_eval", type=int,
101 |                         help="""Reduce the number of evaluation 
102 |                             samples to this number.""", default=None)
103 |     return parser.parse_args()
104 | 
105 | 
106 | if __name__ == "__main__":
107 |     args = get_args()
108 | 
109 | 
110 |     for dataset_folder in args.dataset_folders:
111 |         (dataset_dir, partition) = (os.path.dirname(dataset_folder),
112 |                                     os.path.basename(dataset_folder))
113 | 
114 |         eval_results = evaluate_labels(dataset_folder,
115 |                                        n_jobs=args.n_jobs,
116 |                                        get_top=args.n_eval)
117 | 
118 |         pprint(eval_results)
119 | 
120 |         test_on = os.path.basename(dataset_dir) + '_' + partition
121 |         save_folder_name = os.path.join(FINAL_RESULTS_DIR,
122 |                                         'test_on_' + test_on)
123 |         if not os.path.exists(save_folder_name):
124 |             os.makedirs(save_folder_name)
125 | 
126 |         for labels, metrics in eval_results.items():
127 |             file_path = os.path.join(save_folder_name,
128 |                                      labels + '_mask_metrics.gz')
129 | 
130 |             joblib.dump(metrics, file_path)
131 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/measure_initial_SDR.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief For a specific dataset just find all the groundtruth
  3 | evaluation when applying either a duet or a ground truth labeled mask
  4 | for source separation
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of illinois at Urbana Champaign
  8 | """
  9 | 
 10 | import argparse
 11 | import os
 12 | import sys
 13 | import numpy as np
 14 | from pprint import pprint
 15 | from joblib import Parallel, delayed
 16 | from tqdm import tqdm
 17 | import itertools
 18 | import pandas as pd
 19 | 
 20 | root_dir = os.path.join(
 21 |     os.path.dirname(os.path.realpath(__file__)),
 22 |     '../../../')
 23 | sys.path.insert(0, root_dir)
 24 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader
 25 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval
 26 | 
 27 | 
 28 | def eval(data_generator,
 29 |          dataset_path):
 30 | 
 31 |     data_dir = os.path.dirname(dataset_path)
 32 |     info = os.path.basename(data_dir)
 33 |     n_sources = int(info.split('_')[4])
 34 | 
 35 |     eval_dic = {'sdr': [], 'sir': [], 'sar': []}
 36 | 
 37 |     for batch_data in data_generator:
 38 |         abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
 39 | 
 40 |         for b in np.arange(abs_tfs.size(0)):
 41 | 
 42 |             sdr, sir, sar = np_eval.mixture_bss_eval(
 43 |                 real_tfs[b].data.numpy(),
 44 |                 imag_tfs[b].data.numpy(),
 45 |                 wavs_lists[b].data.numpy(),
 46 |                 n_sources)
 47 | 
 48 |             eval_dic['sdr'].append(sdr)
 49 |             eval_dic['sir'].append(sir)
 50 |             eval_dic['sar'].append(sar)
 51 | 
 52 |     # return both mean and std values
 53 |     mean_std_dic = {}
 54 |     for k, v in eval_dic.items():
 55 |         # mean_std_dic[k + "_max"] = np.max(np.array(v))
 56 |         # mean_std_dic[k + "_min"] = np.min(np.array(v))
 57 |         mean_std_dic[k+"_mean"] = np.mean(np.array(v))
 58 |         mean_std_dic[k+"_std"] = np.std(np.array(v))
 59 |         # mean_std_dic[k + "_50"] = np.quantile(np.array(v), 0.50)
 60 |         # mean_std_dic[k + "_25"] = np.quantile(np.array(v), 0.25)
 61 |         # mean_std_dic[k + "_75"] = np.quantile(np.array(v), 0.75)
 62 | 
 63 |     return dataset_path, mean_std_dic
 64 | 
 65 | def evaluate_bss_metrics(dataset_folders,
 66 |                          n_jobs=1,
 67 |                          get_top=None):
 68 | 
 69 |     dirs_and_parts = [(os.path.dirname(f), os.path.basename(f))
 70 |                       for f in dataset_folders]
 71 | 
 72 |     assert all([partition == 'test' or partition == 'val'
 73 |                 for (_, partition) in dirs_and_parts]), '' \
 74 |            'All selected dataset folder to be evaluated have either ' \
 75 |            'to be test or val folder from a certain dataset!'
 76 | 
 77 |     print("Initializing the data loaders for all the datasets...")
 78 |     datasets_loaders = [data_loader.get_data_generator(
 79 |                         dataset_dir, partition=partition,
 80 |                         get_top=get_top, num_workers=1,
 81 |                         return_stats=False,
 82 |                         return_n_batches=True,
 83 |                         only_mask_evaluation=True)
 84 |                         for (dataset_dir, partition) in dirs_and_parts]
 85 | 
 86 |     data_info = [list(itertools.chain.from_iterable(info_lists))
 87 |                  for info_lists in zip(datasets_loaders, dirs_and_parts)]
 88 | 
 89 |     eval_results = Parallel(n_jobs=n_jobs)(
 90 |                    [delayed(eval)(data_loader,
 91 |                                   os.path.join(data_dir, partition))
 92 |                    for (data_loader, n_batches, data_dir, partition)
 93 |                    in tqdm(data_info)])
 94 | 
 95 |     return eval_results
 96 | 
 97 | 
 98 | def get_args():
 99 |     """! Command line parser for computing the evaluation for
100 |     specific datasets"""
101 |     parser = argparse.ArgumentParser(description='Evaluating'
102 |              ' initial SDR SAR and SIR for datasets')
103 |     parser.add_argument("-i", "--dataset_folders", type=str, nargs='+',
104 |                         help="Dataset paths you want to evaluate",
105 |                         default=[])
106 |     parser.add_argument("--n_jobs", type=int,
107 |                         help="Number of parallel spawinign jobs",
108 |                         default=1)
109 |     parser.add_argument("--n_eval", type=int,
110 |                         help="""Reduce the number of evaluation 
111 |                             samples to this number.""", default=None)
112 |     return parser.parse_args()
113 | 
114 | 
115 | if __name__ == "__main__":
116 |     args = get_args()
117 |     eval_results = evaluate_bss_metrics(args.dataset_folders,
118 |                                         n_jobs=args.n_jobs,
119 |                                         get_top=args.n_eval)
120 | 
121 |     df = pd.DataFrame(dict([(os.path.basename(os.path.dirname(p)) +
122 |                              '/' + os.path.basename(p), res)
123 |                             for (p, res) in eval_results])).T
124 |     pd.set_option('display.expand_frame_repr', False)
125 |     print(df)
126 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/model_logger.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Model logger in order to be able to load the model and test it
  3 | on different data.
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of illinois at Urbana Champaign
  7 | """
  8 | import os
  9 | import sys
 10 | import torch
 11 | import datetime
 12 | import glob2
 13 | import torch.nn as nn
 14 | 
 15 | root_dir = os.path.join(
 16 |     os.path.dirname(os.path.realpath(__file__)),
 17 |     '../../../')
 18 | sys.path.insert(0, root_dir)
 19 | from spatial_two_mics.config import MODELS_DIR
 20 | from spatial_two_mics.config import MODELS_RAW_PHASE_DIR
 21 | from spatial_two_mics.config import MODELS_GROUND_TRUTH
 22 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_builder
 23 | 
 24 | 
 25 | def save(model,
 26 |          optimizer,
 27 |          args,
 28 |          epoch,
 29 |          performance_dic,
 30 |          dataset_id,
 31 |          mean_tr,
 32 |          std_tr,
 33 |          max_models_per_dataset=30,
 34 |          training_labels=''):
 35 |     state = {
 36 |         'epoch': epoch,
 37 |         'val_performance': performance_dic,
 38 |         'model_state': model.state_dict(),
 39 |         'optimizer_state': optimizer.state_dict(),
 40 |         'args': args,
 41 |         'mean_tr': mean_tr,
 42 |         'std_tr': std_tr,
 43 |         'training_labels': training_labels
 44 |     }
 45 |     sdr_str = str(round(performance_dic['sdr'], 3))
 46 |     sar_str = str(round(performance_dic['sar'], 3))
 47 |     sir_str = str(round(performance_dic['sir'], 3))
 48 | 
 49 |     if training_labels == 'raw_phase_diff':
 50 |         folder_name = os.path.join(MODELS_RAW_PHASE_DIR, dataset_id)
 51 |     elif training_labels == 'ground_truth':
 52 |         folder_name = os.path.join(MODELS_GROUND_TRUTH, dataset_id)
 53 |     else:
 54 |         folder_name = os.path.join(MODELS_DIR, dataset_id)
 55 | 
 56 | 
 57 |     if not os.path.exists(folder_name):
 58 |         os.makedirs(folder_name)
 59 | 
 60 |     available_models = glob2.glob(folder_name + '/*.pt')
 61 | 
 62 |     if len(available_models) > max_models_per_dataset:
 63 |         sdr_and_model_path = [os.path.basename(path)
 64 |                               for path in available_models]
 65 |         sdr_and_model_path = [float(path.split("_")[1])
 66 |                               for path in sdr_and_model_path]
 67 |         sdr_and_model_path = zip(sdr_and_model_path, available_models)
 68 |         sdr_sorted_models = sorted(sdr_and_model_path,
 69 |                                    key=lambda x: x[0])[::-1]
 70 |         for sdr, path in sdr_sorted_models[max_models_per_dataset:]:
 71 |             try:
 72 |                 os.remove(path)
 73 |             except:
 74 |                 print("Error in removing {} ...".format(path))
 75 | 
 76 |     ts = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%s")
 77 |     filename = "SDR_{}_SIR_{}_SAR_{}_{}.pt".format(sdr_str,
 78 |                                                    sir_str,
 79 |                                                    sar_str,
 80 |                                                    ts)
 81 |     file_path = os.path.join(folder_name, filename)
 82 |     torch.save(state, file_path)
 83 | 
 84 | 
 85 | # def load(model,
 86 | #          optimizer,
 87 | #          dataset_id,
 88 | #          filename=None):
 89 | #
 90 | #     folder_name = os.path.join(MODELS_DIR, dataset_id)
 91 | #     if filename is None:
 92 | #         available_models = glob2.glob(folder_name + '/*.pt')
 93 | #         file_path = os.path.join(folder_name, available_models[0])
 94 | #     else:
 95 | #         file_path = os.path.join(folder_name, filename)
 96 | #
 97 | #     loaded_state = torch.load(file_path)
 98 | #     model.load_state_dict(loaded_state['model_state'])
 99 | #     optimizer.load_state_dict(loaded_state['optimizer_state'])
100 | #     epoch = loaded_state['epoch']
101 | #     val_performance = loaded_state['val_performance']
102 | #     args = loaded_state['args']
103 | #     mean_tr = loaded_state['mean_tr']
104 | #     std_tr = loaded_state['std_tr']
105 | #
106 | #     return (model, optimizer, epoch, val_performance,
107 | #             args, mean_tr, std_tr)
108 | 
109 | 
110 | def load_and_create_the_model(model_path):
111 | 
112 |     loaded_state = torch.load(model_path)
113 |     epoch = loaded_state['epoch']
114 |     val_performance = loaded_state['val_performance']
115 |     args = loaded_state['args']
116 |     mean_tr = loaded_state['mean_tr']
117 |     std_tr = loaded_state['std_tr']
118 |     training_labels = loaded_state['training_labels']
119 | 
120 |     model = LSTM_builder.BLSTMEncoder(num_layers=args.n_layers,
121 |                                       hidden_size=args.hidden_size,
122 |                                       embedding_depth=args.embedding_depth,
123 |                                       bidirectional=args.bidirectional,
124 |                                       dropout=args.dropout)
125 |     model = nn.DataParallel(model).cuda()
126 | 
127 |     optimizer = torch.optim.Adam(model.parameters(),
128 |                                  lr=args.learning_rate,
129 |                                  betas=(0.9, 0.999))
130 | 
131 |     model.load_state_dict(loaded_state['model_state'])
132 |     optimizer.load_state_dict(loaded_state['optimizer_state'])
133 | 
134 |     return (model, optimizer, epoch, val_performance,
135 |             args, mean_tr, std_tr, training_labels)
136 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/experiment_command_line_parser_v2.py:
--------------------------------------------------------------------------------
 1 | """!
 2 | @brief Command line parser for experiments
 3 | 
 4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
 5 | @copyright University of illinois at Urbana Champaign
 6 | """
 7 | 
 8 | import argparse
 9 | 
10 | def get_args():
11 |     """! Command line parser for experiments"""
12 |     parser = argparse.ArgumentParser(description='Deep Clustering for '
13 |                                                  'Audio Source '
14 |                                                  'Separation '
15 |                                                  'Experiment')
16 |     parser.add_argument("--train", type=str,
17 |                         help="Path for the training dataset",
18 |                         default=None)
19 |     parser.add_argument("--test", type=str,
20 |                         help="Path for the testing dataset",
21 |                         default=None)
22 |     parser.add_argument("--val", type=str,
23 |                         help="Path for the validation dataset",
24 |                         default=None)
25 |     parser.add_argument("--n_train", type=int,
26 |                         help="""Reduce the number of training 
27 |                             samples to this number.""", default=None)
28 |     parser.add_argument("--n_test", type=int,
29 |                         help="""Reduce the number of testing 
30 |                             samples to this number.""", default=None)
31 |     parser.add_argument("--n_val", type=int,
32 |                         help="""Reduce the number of evaluation 
33 |                             samples to this number.""", default=None)
34 |     parser.add_argument("-nl", "--n_layers", type=int,
35 |                         help="""The number of layers of the BLSTM 
36 |                         encoder""", default=2)
37 |     parser.add_argument("-ed", "--embedding_depth", type=int,
38 |                         help="""The depth of the embedding""",
39 |                         default=16)
40 |     parser.add_argument("-hs", "--hidden_size", type=int,
41 |                         help="""The size of the LSTM cells """,
42 |                         default=1024)
43 |     parser.add_argument("-bs", "--batch_size", type=int,
44 |                         help="""The number of samples in each batch. 
45 |                         Warning: Cannot be less than the number of 
46 |                         the validation samples""", default=32)
47 |     parser.add_argument("-name", "--experiment_name", type=str,
48 |                         help="""The name or identifier of this 
49 |                         experiment""",
50 |                         default='A sample experiment'),
51 |     parser.add_argument("-train_l", "--training_labels", type=str,
52 |                         help="""The type of masks that you want to 
53 |                         use for training as the ideal affinities""",
54 |                         default='duet', choices=['duet',
55 |                                                  'raw_phase_diff',
56 |                                                  'ground_truth'])
57 |     parser.add_argument("-cad", "--cuda_available_devices", type=int,
58 |                         nargs="+",
59 |                         help="""A list of Cuda IDs that would be 
60 |                         available for running this experiment""",
61 |                         default=[0])
62 |     parser.add_argument("--num_workers", type=int,
63 |                         help="""The number of cpu workers for 
64 |                         loading the data, etc.""", default=3)
65 |     parser.add_argument("--epochs", type=int,
66 |                         help="""The number of epochs that the 
67 |                         experiment should run""", default=50)
68 |     parser.add_argument("--eval_per", type=int,
69 |                         help="""The number of training epochs in 
70 |                         order to run an evaluation""", default=5)
71 |     parser.add_argument("-lr", "--learning_rate", type=float,
72 |                         help="""Initial Learning rate""", default=1e-4)
73 |     parser.add_argument("-dr", "--dropout", type=float,
74 |                         help="""Dropout Ratio""", default=0.)
75 |     parser.add_argument("--bidirectional", action='store_true',
76 |                         help="""Bidirectional or not""")
77 |     parser.add_argument("--early_stop_patience", type=int,
78 |                         help="""The number of training epochs that 
79 |                         the model will endure until the eval metric (
80 |                         e.g SDR) will not become better""",
81 |                         default=15)
82 |     parser.add_argument("--lr_patience", type=int,
83 |                         help="""The number of training epochs that 
84 |                         the model will endure until the learning 
85 |                         rate would be reduced""", default=7)
86 |     parser.add_argument("--lr_gamma_decay", type=float,
87 |                         help="""Multiplicative value of decay that 
88 |                         would be enforced in the value of the learning 
89 |                         rate""", default=0.2)
90 |     parser.add_argument("--save_best", type=int,
91 |                         help="""The number of best models dependent 
92 |                         on the metric you want to use that are going 
93 |                         to be saved under the preferred logging model 
94 |                         directory.""",
95 |                         default=10)
96 | 
97 |     return parser.parse_args()


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/losses/affinity_approximation.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Loss functions for low rank approximations of an ideal
  3 | affinity mask
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of illinois at Urbana Champaign
  7 | """
  8 | import torch
  9 | import torch.nn as nn
 10 | 
 11 | 
 12 | def frobenius_naive(vs, ys):
 13 |     """! Computing naively the loss function between embedding
 14 |     vectors vs and ideal affinity matrices ys
 15 | 
 16 |     :param vs: size: batch_size x n_elements x embedded_features
 17 |     :param ys: One hot tensor corresponding to 1 where a specific
 18 |     class is the label for one element or 0 otherwise and its size:
 19 |     batch_size x n_elements x n_classes
 20 |     :return: The computed loss of these two tensors
 21 |     """
 22 |     loss = torch.mean(torch.norm(torch.norm(
 23 |            torch.matmul(vs, vs.permute(0, 2, 1)) -
 24 |            torch.matmul(ys, ys.permute(0, 2, 1)), 2, 1), 2, 1)**2)
 25 | 
 26 |     return loss
 27 | 
 28 | 
 29 | def efficient_frobenius(vs, ys, eps=10e-12):
 30 |     ys_T = ys.permute(0, 2, 1)
 31 |     vs_T = vs.permute(0, 2, 1)
 32 |     summed_y_T = ys_T.sum(dim=2).unsqueeze(-1)
 33 |     d = torch.bmm(ys, summed_y_T)
 34 |     d_m1_2 = torch.reciprocal(torch.sqrt(d) + eps)
 35 | 
 36 |     # print("Psola")
 37 |     # print((torch.bmm(vs_T, vs * d_m1_2)**2).shape)
 38 | 
 39 |     est_loss = (torch.bmm(vs_T, vs * d_m1_2) ** 2).sum()
 40 |     union_loss = (torch.bmm(vs_T, ys * d_m1_2) ** 2).sum()
 41 |     true_loss = (torch.bmm(ys_T, ys * d_m1_2) ** 2).sum()
 42 |     total_loss = est_loss - 2. * union_loss + true_loss
 43 |     # print(total_loss.shape)
 44 |     # print(est_loss.shape)
 45 |     # print(est_loss)
 46 | 
 47 |     print(union_loss)
 48 |     uni_loss = (torch.bmm(ys_T, vs * d_m1_2) ** 2).sum()
 49 |     print(uni_loss)
 50 | 
 51 |     return total_loss / vs.size(0)
 52 | 
 53 | 
 54 | def paris_naive(vs, ys):
 55 |     """! Computing naively the loss function between embedding
 56 |     vectors vs and ideal affinity matrices ys
 57 | 
 58 |     :param vs: size: batch_size x n_elements x embedded_features
 59 |     :param ys: One hot tensor corresponding to 1 where a specific
 60 |     class is the label for one element or 0 otherwise and its size:
 61 |     batch_size x n_elements x n_classes
 62 |     :return: The computed loss of these two tensors
 63 |     """
 64 |     loss = torch.sqrt(torch.mean(torch.bmm(vs.transpose(1, 2), vs) ** 2)) \
 65 |          - 2. * torch.sqrt(torch.mean(torch.bmm(vs.transpose(1, 2), ys) ** 2)) \
 66 |          + torch.sqrt(torch.mean(torch.bmm(ys.transpose(1, 2),ys) ** 2))
 67 |     return loss
 68 | 
 69 | 
 70 | 
 71 | def thymios_naive(vs, ys):
 72 |     """! Computing naively the loss function between embedding
 73 |     vectors vs and ideal affinity matrices ys
 74 | 
 75 |     :param vs: size: batch_size x n_elements x embedded_features
 76 |     :param ys: One hot tensor corresponding to 1 where a specific
 77 |     class is the label for one element or 0 otherwise and its size:
 78 |     batch_size x n_elements x n_classes
 79 |     :return: The computed loss of these two tensors
 80 |     """
 81 |     l = torch.sqrt((torch.bmm(vs.transpose(1, 2), vs) ** 2).sum()) \
 82 |         - 2.*torch.sqrt((torch.bmm(vs.transpose(1, 2), ys) **2).sum()) \
 83 |         + torch.sqrt((torch.bmm(ys.transpose(1, 2), ys) ** 2).sum())
 84 |     return l / vs.size(0)
 85 | 
 86 | 
 87 | def naive(vs, ys):
 88 |     """! Computing naively the loss function between embedding
 89 |     vectors vs and ideal affinity matrices ys
 90 | 
 91 |     :param vs: size: batch_size x n_elements x embedded_features
 92 |     :param ys: One hot tensor corresponding to 1 where a specific
 93 |     class is the label for one element or 0 otherwise and its size:
 94 |     batch_size x n_elements x n_classes
 95 |     :return: The computed loss of these two tensors
 96 |     """
 97 |     loss = (torch.matmul(vs.transpose(1, 2), vs) ** 2).sum() \
 98 |          - 2. * (torch.matmul(vs.transpose(1, 2), ys) ** 2).sum() \
 99 |          + (torch.matmul(ys.transpose(1, 2), ys) ** 2).sum()
100 |     return loss / vs.size(0)
101 |     # return loss
102 | 
103 | 
104 | def diagonal(embedding, assignments):
105 |     batch_size, sequence_length, num_frequencies, embedding_size = embedding.size()
106 |     _, _, _, num_sources = assignments.size()
107 |     embedding = embedding.view(-1, embedding.size()[-1])
108 |     assignments = assignments.view(-1, assignments.size()[-1])
109 | 
110 |     class_weights = nn.functional.normalize(torch.sum(assignments.detach(), dim=-2), p=1, dim=-1).unsqueeze(0)
111 |     class_weights = 1.0 / (torch.sqrt(class_weights) + 1e-7)
112 |     weights = torch.matmul(assignments.detach(), class_weights.transpose(1, 0))
113 |     # norm = torch.sum(weights**2)**2
114 |     assignments = assignments * weights.repeat(1, assignments.size()[-1])
115 |     embedding = embedding * weights.repeat(1, embedding.size()[-1])
116 | 
117 |     embedding = embedding.view(batch_size, sequence_length*num_frequencies, embedding_size)
118 |     assignments = assignments.view(batch_size, sequence_length*num_frequencies, num_sources)
119 | 
120 |     embedding_transpose = embedding.permute(0, 2, 1)
121 |     assignments_transpose = assignments.permute(0, 2, 1)
122 | 
123 |     loss_est = torch.sum(torch.matmul(embedding_transpose, embedding)**2)
124 |     loss_est_true = torch.sum(torch.matmul(embedding_transpose, assignments)**2)
125 |     loss_true = torch.sum(torch.matmul(assignments_transpose, assignments)**2)
126 |     loss = loss_est - 2*loss_est_true + loss_true
127 |     # loss = loss / norm
128 |     return loss
129 | 
130 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/ground_truth_evaluation.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief For a specific dataset just find all the groundtruth
  3 | evaluation when applying either a duet or a ground truth labeled mask
  4 | for source separation
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of illinois at Urbana Champaign
  8 | """
  9 | 
 10 | import argparse
 11 | import os
 12 | import sys
 13 | import numpy as np
 14 | from pprint import pprint
 15 | from joblib import Parallel, delayed
 16 | from tqdm import tqdm
 17 | import itertools
 18 | import pandas as pd
 19 | 
 20 | root_dir = os.path.join(
 21 |     os.path.dirname(os.path.realpath(__file__)),
 22 |     '../../../')
 23 | sys.path.insert(0, root_dir)
 24 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader
 25 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval
 26 | 
 27 | 
 28 | def eval(data_generator,
 29 |          dataset_path):
 30 | 
 31 |     data_dir = os.path.dirname(dataset_path)
 32 |     info = os.path.basename(data_dir)
 33 |     n_sources = int(info.split('_')[4])
 34 | 
 35 |     eval_dic = {'sdr': [], 'sir': [], 'sar': []}
 36 | 
 37 |     for batch_data in data_generator:
 38 |         abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
 39 | 
 40 |         for b in np.arange(abs_tfs.size(0)):
 41 |             embedding_labels = masks[b].data.numpy()
 42 | 
 43 |             sdr, sir, sar = np_eval.naive_cpu_bss_eval(
 44 |                 embedding_labels,
 45 |                 real_tfs[b].data.numpy(),
 46 |                 imag_tfs[b].data.numpy(),
 47 |                 wavs_lists[b].data.numpy(),
 48 |                 n_sources,
 49 |                 batch_index=b)
 50 | 
 51 |             eval_dic['sdr'].append(sdr)
 52 |             eval_dic['sir'].append(sir)
 53 |             eval_dic['sar'].append(sar)
 54 | 
 55 |     # return both mean and std values
 56 |     mean_std_dic = {}
 57 |     for k, v in eval_dic.items():
 58 |         mean_std_dic[k+"_mean"] = np.mean(np.array(v))
 59 |         mean_std_dic[k+"_std"] = np.std(np.array(v))
 60 | 
 61 |     return dataset_path, mean_std_dic
 62 | 
 63 | 
 64 | def eval_wrapper():
 65 |     return lambda data_generator, n_batches, dataset_path: eval(
 66 |                   data_generator, n_batches, dataset_path)
 67 | 
 68 | 
 69 | def evaluate_labels(dataset_folders,
 70 |                     eval_labels='duet',
 71 |                     n_jobs=1,
 72 |                     get_top=None):
 73 | 
 74 |     n_workers = n_jobs
 75 |     dirs_and_parts = [(os.path.dirname(f), os.path.basename(f))
 76 |                       for f in dataset_folders]
 77 | 
 78 |     assert all([partition == 'test' or partition == 'val'
 79 |                 for (_, partition) in dirs_and_parts]), '' \
 80 |            'All selected dataset folder to be evaluated have either ' \
 81 |            'to be test or val folder from a certain dataset!'
 82 | 
 83 |     print("Initializing the data loaders for all the datasets...")
 84 |     datasets_loaders = [data_loader.get_data_generator(
 85 |                         dataset_dir, partition=partition,
 86 |                         get_top=get_top, num_workers=1,
 87 |                         return_stats=False, labels_mask=eval_labels,
 88 |                         return_n_batches=True,
 89 |                         only_mask_evaluation=True)
 90 |                         for (dataset_dir, partition) in dirs_and_parts]
 91 | 
 92 |     data_info = [list(itertools.chain.from_iterable(info_lists))
 93 |                  for info_lists in zip(datasets_loaders, dirs_and_parts)]
 94 | 
 95 |     eval_results = Parallel(n_jobs=n_jobs)(
 96 |                    [delayed(eval)(data_loader,
 97 |                                   os.path.join(data_dir, partition))
 98 |                    for (data_loader, n_batches, data_dir, partition)
 99 |                    in tqdm(data_info)])
100 | 
101 |     return eval_results
102 | 
103 | 
104 | def get_args():
105 |     """! Command line parser for computing the evaluation for
106 |     specific datasets"""
107 |     parser = argparse.ArgumentParser(description='Evaluating'
108 |              ' groundtruth or duet labels for datasets folders')
109 |     parser.add_argument("-i", "--dataset_folders", type=str, nargs='+',
110 |                         help="Dataset paths you want to evaluate",
111 |                         default=[])
112 |     parser.add_argument("-l", "--eval_labels", type=str,
113 |                         help="Choose what labels do you want to use "
114 |                              "for the evaluation",
115 |                         default='duet', choices=['duet',
116 |                                                  'ground_truth'])
117 |     parser.add_argument("--n_jobs", type=int,
118 |                         help="Number of parallel spawinign jobs",
119 |                         default=1)
120 |     parser.add_argument("--n_eval", type=int,
121 |                         help="""Reduce the number of evaluation 
122 |                             samples to this number.""", default=None)
123 |     return parser.parse_args()
124 | 
125 | 
126 | if __name__ == "__main__":
127 |     args = get_args()
128 |     eval_results = evaluate_labels(args.dataset_folders,
129 |                                    eval_labels=args.eval_labels,
130 |                                    n_jobs=args.n_jobs,
131 |                                    get_top=args.n_eval)
132 | 
133 |     df = pd.DataFrame(dict([(os.path.basename(os.path.dirname(p)) +
134 |                              '/' + os.path.basename(p), res)
135 |                             for (p, res) in eval_results])).T
136 |     pd.set_option('display.expand_frame_repr', False)
137 |     print(df)
138 | 


--------------------------------------------------------------------------------
/spatial_two_mics/data_generator/dataset_storage.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief A dataset creation which is used in order to combine the
  3 | mixtures form the dataset and also store them inside a specified folder
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of Illinois at Urbana Champaign
  7 | """
  8 | 
  9 | import argparse
 10 | import os
 11 | import sys
 12 | import numpy as np
 13 | from pprint import pprint
 14 | from sklearn.externals import joblib
 15 | 
 16 | root_dir = os.path.join(
 17 |     os.path.dirname(os.path.realpath(__file__)),
 18 |     '../../')
 19 | sys.path.insert(0, root_dir)
 20 | 
 21 | import spatial_two_mics.data_generator.dataset_generator as generator
 22 | 
 23 | 
 24 | def create_dataset_name(args):
 25 |     dataset_name = '{}_{}_{}_{}_{}'.format(
 26 |                     args['dataset'],
 27 |                     '_'.join(map(str, args['n_samples'])),
 28 |                     args['n_sources'],
 29 |                     ''.join(sorted(args['genders'])),
 30 |                     'taus'.join(map(str,  args['force_delays'])))
 31 |     return dataset_name
 32 | 
 33 | 
 34 | def get_mixture_name_and_data_to_save(mix_info):
 35 |     name = [s_id['speaker_id']+'-'+s_id['sentence_id']
 36 |             for s_id in mix_info['sources_ids']]
 37 |     name = '_'.join(name)
 38 | 
 39 |     # don't save also the wavs, read them real time 
 40 |     for i, source_info in enumerate(mix_info['sources_ids']):
 41 |         try:
 42 |             del mix_info['sources_ids'][i]['wav']
 43 |         except:
 44 |             pass
 45 | 
 46 |     data = {
 47 |         'positions': mix_info['positions'],
 48 |         'sources_ids': mix_info['sources_ids'],
 49 |         'ground_truth_mask': mix_info['ground_truth_mask']
 50 |     }
 51 | 
 52 |     if 'soft_labeled_mask' in mix_info:
 53 |         data['soft_labeled_mask'] = mix_info['soft_labeled_mask']
 54 | 
 55 |     return name, data
 56 | 
 57 | def time_loading_comparison(data, f_path):
 58 |     import _pickle as cPickle
 59 |     from sklearn.externals import joblib
 60 |     import time
 61 | 
 62 |     joblib.dump(data, f_path)
 63 |     before = time.time()
 64 |     tempos = joblib.load(f_path)
 65 |     now = time.time()
 66 |     jlib_time = now - before
 67 | 
 68 |     cPickle.dump(data, open(f_path, 'wb'))
 69 |     before = time.time()
 70 |     tempos = cPickle.load(open(f_path, 'rb'))
 71 |     now = time.time()
 72 |     pickle_time = now - before
 73 | 
 74 |     return jlib_time, pickle_time
 75 | 
 76 | 
 77 | def store_dataset(dataset_dic, args):
 78 | 
 79 |     dataset_params = args.__dict__
 80 |     dataset_name = create_dataset_name(dataset_params)
 81 | 
 82 |     dataset_path = os.path.join(args.output_path, dataset_name)
 83 |     if not os.path.exists(dataset_path):
 84 |         os.makedirs(dataset_path)
 85 | 
 86 |     for subf, mixtures_info in dataset_dic.items():
 87 |         subf_path = os.path.join(dataset_path, subf)
 88 |         if not os.path.exists(subf_path):
 89 |             os.makedirs(subf_path)
 90 | 
 91 |         for mix_info in mixtures_info:
 92 |             name, data = get_mixture_name_and_data_to_save(mix_info)
 93 |             f_path = os.path.join(subf_path, name)
 94 |             joblib.dump(data, f_path, compress=3)
 95 | 
 96 | 
 97 | def generate_dataset(args):
 98 |     n_train, n_test, n_val = args.n_samples
 99 |     timit_mixture_creator = generator.RandomCombinations(
100 |         audio_dataset_name=args.dataset,
101 |         genders_mixtures=args.genders,
102 |         subset_of_speakers='train',
103 |         create_val_set=False)
104 | 
105 |     dataset_dic = timit_mixture_creator.get_all_mixture_sets(
106 |         n_sources_in_mix=args.n_sources,
107 |         n_mixtures=n_train,
108 |         force_delays=args.force_delays)
109 | 
110 |     timit_mixture_creator = generator.RandomCombinations(
111 |         audio_dataset_name=args.dataset,
112 |         genders_mixtures=args.genders,
113 |         subset_of_speakers='test',
114 |         create_val_set=True)
115 | 
116 |     test_val_dic = timit_mixture_creator.get_all_mixture_sets(
117 |         n_sources_in_mix=args.n_sources,
118 |         n_mixtures=max(n_test, n_val),
119 |         force_delays=args.force_delays)
120 | 
121 |     if n_val > n_test:
122 |         test_val_dic['test'] = np.random.choice(test_val_dic['test'],
123 |                                                 size=n_test,
124 |                                                 replace=False)
125 |     elif n_val < n_test:
126 |         test_val_dic['val'] = np.random.choice(test_val_dic['val'],
127 |                                                size=n_val,
128 |                                                replace=False)
129 | 
130 |     dataset_dic.update(test_val_dic)
131 |     return dataset_dic
132 | 
133 | 
134 | def create_and_store_dataset(args):
135 |     dataset_dic = generate_dataset(args)
136 |     store_dataset(dataset_dic, args)
137 | 
138 | 
139 | def get_args():
140 |     """! Command line parser """
141 |     parser = argparse.ArgumentParser(description='Mixture dataset '
142 |                                                  'creator')
143 |     parser.add_argument("--dataset", type=str,
144 |                         help="Dataset name", default="timit")
145 |     parser.add_argument("--n_sources", type=int,
146 |                         help="How many sources in each mix", default=2)
147 |     parser.add_argument("--n_samples", type=int, nargs='+',
148 |                         help="How many samples do u want to be "
149 |                              "created for train test val",
150 |                         default=10)
151 |     parser.add_argument("--genders", type=str, nargs='+',
152 |                         help="Genders that will correspond to the "
153 |                              "genders in the mixtures",
154 |                         default=['m', 'f'])
155 |     parser.add_argument("-o", "--output_path", type=str,
156 |                         help="""The path that the resulting dataset 
157 |                         would be stored. If the folder does not 
158 |                         exist it will be created as well as its 
159 |                         child folders train or test and val if it is 
160 |                         selected""",
161 |                         required=True)
162 |     parser.add_argument("-f", "--force_delays", nargs='+', type=int,
163 |                         help="""Whether you want to force integer 
164 |                         delays of +- 1 in the sources e.g.""",
165 |                         default=None)
166 |     parser.add_argument('--val_set', action="store_true",
167 |                         help='Force to create a separate val folder '
168 |                              'with the same amount of the mixtures as '
169 |                              'the initial test/train folder but using '
170 |                              'half of the available speakers')
171 |     return parser.parse_args()
172 | 
173 | 
174 | if __name__ == "__main__":
175 |     args = get_args()
176 |     create_and_store_dataset(args)
177 | 
178 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/find_best_model_and_estimate_prob.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Initial SDR all measurements and not only stat values
  3 | 
  4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  5 | @copyright University of illinois at Urbana Champaign
  6 | """
  7 | 
  8 | import argparse
  9 | import os
 10 | import sys
 11 | import numpy as np
 12 | from pprint import pprint
 13 | import joblib
 14 | from sklearn.cluster import KMeans
 15 | from progress.bar import ChargingBar
 16 | import torch
 17 | import pandas as pd
 18 | 
 19 | 
 20 | root_dir = os.path.join(
 21 |     os.path.dirname(os.path.realpath(__file__)),
 22 |     '../../../')
 23 | sys.path.insert(0, root_dir)
 24 | import spatial_two_mics.dnn.utils.model_logger as model_logger
 25 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader
 26 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval
 27 | from spatial_two_mics.config import *
 28 | 
 29 | 
 30 | def eval(dataset_gen,
 31 |          model_path,
 32 |          n_sources,
 33 |          n_batches,
 34 |          n_jobs):
 35 | 
 36 |     model_name = os.path.basename(model_path)
 37 | 
 38 |     eval_dic = {'sdr': [], 'sir': [], 'sar': []}
 39 | 
 40 |     model, optimizer, _, _, args, mean_tr, std_tr, training_labels = \
 41 |         model_logger.load_and_create_the_model(model_path)
 42 | 
 43 |     k_means_obj = KMeans(n_clusters=n_sources, n_jobs=n_jobs)
 44 | 
 45 |     model.eval()
 46 |     with torch.no_grad():
 47 |         bar = ChargingBar("Evaluating model {} ...".format(model_name),
 48 |                           max=n_batches)
 49 |         for batch_data in dataset_gen:
 50 |             abs_tfs, wavs_lists, real_tfs, imag_tfs = batch_data
 51 |             input_tfs = abs_tfs.cuda()
 52 |             # the input sequence is determined by time and not freqs
 53 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
 54 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
 55 | 
 56 |             # normalize with mean and variance from the training dataset
 57 |             input_tfs -= mean_tr
 58 |             input_tfs /= std_tr
 59 | 
 60 |             vs = model(input_tfs)
 61 |             for b in np.arange(vs.size(0)):
 62 |                 embedding_features = vs[b, :, :].data.cpu().numpy()
 63 | 
 64 |                 z_embds = (embedding_features -
 65 |                            np.mean(embedding_features, axis=0)) / (
 66 |                            np.std(embedding_features, axis=0) + 10e-8)
 67 | 
 68 |                 embedding_labels = np.array(k_means_obj.fit_predict(
 69 |                     z_embds))
 70 | 
 71 |                 sdr, sir, sar = np_eval.naive_cpu_bss_eval(
 72 |                     embedding_labels,
 73 |                     real_tfs[b].data.numpy(),
 74 |                     imag_tfs[b].data.numpy(),
 75 |                     wavs_lists[b].data.numpy(),
 76 |                     n_sources,
 77 |                     batch_index=b)
 78 | 
 79 |                 eval_dic['sdr'].append(sdr)
 80 |                 eval_dic['sir'].append(sir)
 81 |                 eval_dic['sar'].append(sar)
 82 | 
 83 |             bar.next()
 84 |         bar.finish()
 85 | 
 86 |     # return both mean and std values
 87 |     result_dic = {}
 88 |     for k, v in eval_dic.items():
 89 |         result_dic[k] = np.array(v)
 90 | 
 91 |     return result_dic
 92 | 
 93 | 
 94 | def find_best_model_and_evaluate(args):
 95 | 
 96 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
 97 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
 98 | 
 99 |     for result_path in args.results_paths:
100 |         (dataset_name,
101 |          model_dataset) = (os.path.basename(os.path.dirname(
102 |             result_path)).split(
103 |             "test_on_")[1],
104 |                            os.path.basename(
105 |                                result_path).split(
106 |                                "train_on_")[1].split(".csv")[0])
107 | 
108 |         partition = dataset_name.split('_')[-1]
109 |         dataset_dirname = dataset_name.split('_' + partition)[0]
110 | 
111 |         print(dataset_dirname)
112 | 
113 |         df = pd.read_csv(result_path)
114 | 
115 |         mask_types2model_dir = {
116 |             'duet': os.path.join(MODELS_DIR, model_dataset),
117 |             'ground_truth': os.path.join(MODELS_GROUND_TRUTH,
118 |                                          model_dataset),
119 |             'raw_phase_diff': os.path.join(MODELS_RAW_PHASE_DIR,
120 |                                            model_dataset)}
121 | 
122 |         for mask_type, saved_models_dir in mask_types2model_dir.items():
123 |             mask_df = df.loc[df['training_labels'] == mask_type]
124 |             mask_df = mask_df.sort_values(['sdr_mean'], ascending=False)
125 |             mask_df.reset_index(drop=True, inplace=True)
126 | 
127 |             best_model_name = mask_df['Unnamed: 0'].loc[0]
128 | 
129 |             # construct model path
130 |             best_model_p = os.path.join(saved_models_dir,
131 |                                         best_model_name)
132 | 
133 |             if not os.path.exists(best_model_p):
134 |                 print(best_model_p)
135 |                 raise IOError("Model path not found!")
136 | 
137 |             test_dataset_dir = os.path.join(DATASETS_DIR,
138 |                                             dataset_dirname)
139 | 
140 |             if not os.path.exists(test_dataset_dir):
141 |                 print(test_dataset_dir)
142 |                 raise IOError("Dataset path not found!")
143 | 
144 |             val_generator, n_val_batches, n_val_sources = \
145 |                 data_loader.get_data_generator(test_dataset_dir,
146 |                                                partition=partition,
147 |                                                get_top=args.n_eval,
148 |                                                num_workers=args.n_jobs,
149 |                                                return_stats=False,
150 |                                                return_n_batches=True,
151 |                                                return_n_sources=True,
152 |                                                batch_size=32)
153 | 
154 |             res = eval(val_generator,
155 |                        best_model_p,
156 |                        n_val_sources,
157 |                        n_val_batches,
158 |                        args.n_jobs)
159 | 
160 |             test_on = os.path.basename(dataset_dirname) + '_' + partition
161 |             save_folder_name = os.path.join(FINAL_RESULTS_DIR,
162 |                                             'test_on_' + test_on)
163 |             if not os.path.exists(save_folder_name):
164 |                 os.makedirs(save_folder_name)
165 | 
166 |             file_path = os.path.join(save_folder_name,
167 |                         mask_type+'_deep_clustering_metrics.gz')
168 | 
169 |             pprint(res)
170 | 
171 |             joblib.dump(res, file_path)
172 | 
173 | 
174 | def get_args():
175 |     """! Command line parser for computing the evaluation for
176 |     specific datasets"""
177 |     parser = argparse.ArgumentParser(description='Evaluating'
178 |              ' SDR SAR and SIR for datasets for the best models')
179 |     parser.add_argument("-i", "--results_paths", type=str, nargs='+',
180 |                         help="Results for datasets",
181 |                         default=None)
182 |     parser.add_argument("--n_jobs", type=int,
183 |                         help="Number of parallel spawinign jobs",
184 |                         default=1)
185 |     parser.add_argument("-cad", "--cuda_available_devices", type=int,
186 |                         nargs="+",
187 |                         help="""A list of Cuda IDs that would be 
188 |                                 available for running this experiment""",
189 |                         default=[2])
190 |     parser.add_argument("--n_eval", type=int,
191 |                         help="""Reduce the number of evaluation 
192 |                             samples to this number.""", default=None)
193 |     return parser.parse_args()
194 | 
195 | 
196 | if __name__ == "__main__":
197 |     args = get_args()
198 |     find_best_model_and_evaluate(args)
199 | 


--------------------------------------------------------------------------------
/spatial_two_mics/data_generator/source_position_generator.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Get some random sampling for the position of two sources
  3 | 
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of illinois at Urbana Champaign
  7 | """
  8 | 
  9 | import numpy as np
 10 | from scipy.spatial import distance as dst
 11 | from pprint import pprint
 12 | 
 13 | 
 14 | class RandomCirclePositioner(object):
 15 |     """
 16 |     ! Returns n_source_pairs positions based on a circle with
 17 |     specified radius Cartessian and Polar coordinates like follows:
 18 | 
 19 |     For each pair of the list we get a dictionary of:
 20 |     {
 21 |         'thetas': angles in rads [<(+x, s1), <(+x, s2)] in list,
 22 |         'd_theta': < (+x, s2) - < (+x, s1),
 23 |         'xy_positons': [(x_1, y_1), (x_2, y_2)], Cartessian
 24 |         'distances': [[||si-mj||]] all precomputed distances
 25 |         'taus': time delays in sample format
 26 |         'amplitudes': a1 and a2 for: m2(t) = a1*s1(t+d1) + a2*s2(t+d2)
 27 |     }
 28 |     (theta_of_source_1, theta_of_source_1)
 29 | 
 30 | 
 31 | 
 32 |                         s2     OOO ooo
 33 |                                        OOo    (x1, y1)
 34 |                oOO
 35 |             oOO                               s1
 36 |           oOO
 37 |         oOO                                       OOo
 38 |        oOO                                         OOo
 39 |       oOO                                           OOo
 40 |      oOO                                             OOo
 41 |      oOO                                             OOo
 42 |      oOO          m1 <-- mic_distance --> m2     ================>>+x
 43 |      oOO                                             OOo
 44 |      oOO                                             OOo
 45 |       oOO                                           OOo
 46 |        oOO                                         OOo
 47 |         oOO                                       OOo
 48 |           oOO                                   OOo
 49 |             oO                                OOo
 50 |                oOO                         OOo
 51 |                    oOO                 OOo
 52 |                        ooo OOO OOO ooo
 53 |     """
 54 | 
 55 |     def __init__(self,
 56 |                  min_angle=np.pi/18.,
 57 |                  angle_sup=np.pi - np.pi/18.,
 58 |                  angle_inf=np.pi/18.,
 59 |                  radius=10.71,
 60 |                  mic_distance_percentage=0.002,
 61 |                  sound_speed=343,
 62 |                  fs=16000):
 63 |         """
 64 |         :param min_angle: minimum angle in rads for the 2 sources
 65 |         :param angle_sup or inf: the maximum and minimum values
 66 |         available for an angle that a source would lie on
 67 |         :param radius: Radius of the circle in **meters**
 68 |         :param mic_distance_percentage: Percentage of the radius
 69 |         corresponding to the distance between the two microphones
 70 |         :param sound_speed: Default 343 m/s in 20oC room temperature
 71 |         :param fs: sampling ratio in Hz
 72 |         """
 73 | 
 74 |         self.min_angle = min_angle
 75 |         self.angle_sup = angle_sup
 76 |         self.angle_inf = angle_inf
 77 |         self.radius = radius
 78 |         self.mic_distance = self.radius * mic_distance_percentage
 79 |         # for 16000 hz in order to get maximum +- 1 sample delays we
 80 |         # have to sustain a distance of maximum: 2.142 cm
 81 |         # between the mics
 82 |         self.m1 = (-self.mic_distance / 2, 0.)
 83 |         self.m2 = (self.mic_distance / 2, 0.)
 84 |         self.sound_speed = sound_speed
 85 |         self.fs = fs
 86 | 
 87 |     @staticmethod
 88 |     def get_cartessian_position(radius,
 89 |                                 angle):
 90 |         return radius * np.cos(angle), radius * np.sin(angle)
 91 | 
 92 |     def get_amplifier_values_for_sources(self,
 93 |                                          n_sources):
 94 |         """
 95 |         :return: A dictionary of all the amplitudes in order to infer
 96 |         the final mixture depending on the weighted summation of the
 97 |         source-signals
 98 |         """
 99 |         alphas = np.random.uniform(low=0.2,
100 |                                    high=1.0,
101 |                                    size=n_sources)
102 |         total_amplitude = sum(alphas)
103 | 
104 |         return dict([("a"+str(i+1), a/total_amplitude)
105 |                      for (i, a) in enumerate(alphas)])
106 | 
107 |     def get_time_delays_for_sources(self,
108 |                                     distances,
109 |                                     n_sources):
110 |         # delays are always computed using the m1 microphone as
111 |         # reference and comparing to the time delay from m2
112 | 
113 |         taus_list = []
114 |         for i in np.arange(n_sources):
115 |             source = "s"+str(i+1)
116 |             taus_list.append(distances[source+"m1"]
117 |                              - distances[source+"m2"])
118 | 
119 |         return [(1. * self.fs * tau) / self.sound_speed
120 |                 for tau in taus_list]
121 | 
122 |     def compute_distances_for_sources_and_mics(self,
123 |                                                source_points):
124 |         """! si \in source_points must be in format (xi, yi)
125 |         \:return a dictionary of all given points"""
126 |         points = {"m1": self.m1, "m2": self.m2}
127 |         points.update(dict([("s"+str(i+1), xy)
128 |                             for (i, xy) in enumerate(source_points)]))
129 |         distances = {}
130 | 
131 |         for point_1, xy1 in points.items():
132 |             for point_2, xy2 in points.items():
133 |                 distances[point_1+point_2] = dst.euclidean(xy1, xy2)
134 | 
135 |         return distances
136 | 
137 |     def get_angles(self, n_source_pairs):
138 |         while True:
139 |             thetas = np.random.uniform(low=self.angle_inf,
140 |                                        high=self.angle_sup,
141 |                                        size=n_source_pairs)
142 |             thetas = sorted(thetas)
143 |             d_thetas = [th2 - th1 for (th1, th2) in
144 |                         zip(thetas[:-1], thetas[1:])]
145 | 
146 |             min_angle_enforced = np.where(np.abs(d_thetas) <
147 |                                     self.min_angle)[0].shape[0] == 0
148 | 
149 |             if min_angle_enforced:
150 |                 break
151 | 
152 |         return thetas, d_thetas
153 | 
154 |     def get_sources_locations(self,
155 |                               n_source_pairs):
156 |         """!
157 |         Generate the positions, angles and distances for
158 |         n_source_pairs of the same mixture corersponding to 2 mics"""
159 |         thetas, d_thetas = self.get_angles(n_source_pairs)
160 |         xys = []
161 |         for angle in thetas:
162 |             xys.append(self.get_cartessian_position(self.radius, angle))
163 | 
164 |         distances = self.compute_distances_for_sources_and_mics(xys)
165 | 
166 |         taus = self.get_time_delays_for_sources(distances,
167 |                                                 n_source_pairs)
168 | 
169 |         mix_amplitudes = self.get_amplifier_values_for_sources(
170 |                               n_source_pairs)
171 | 
172 |         sources_locations = {'thetas': np.asarray(thetas),
173 |                              'd_thetas': np.asarray(d_thetas),
174 |                              'xy_positons': np.asarray(xys),
175 |                              'distances': distances,
176 |                              'taus': np.asarray(taus),
177 |                              'amplitudes': np.asarray(list(
178 |                                            mix_amplitudes.values()))}
179 | 
180 |         return sources_locations
181 | 
182 | 
183 | def example_of_usage():
184 |     """
185 |     :return:
186 |     {'amplitudes': array([0.28292362, 0.08583346, 0.63124292]),
187 |      'd_thetas': array([1.37373734, 1.76785531]),
188 |      'distances': {'m1m1': 0.0,
189 |                    'm1m2': 0.03,
190 |                    'm1s1': 3.015, ...
191 |                    's3s3': 0.0},
192 |      'taus': array([ 1, -1,  0]),
193 |      'thetas': array([0.        , 1.37373734, 3.14159265]),
194 |      'xy_positons': array([[ 3.00000000e+00,  0.00000000e+00],
195 |            [ 5.87358252e-01,  2.94193988e+00],
196 |            [-3.00000000e+00,  3.67394040e-16]])}
197 |     """
198 |     random_positioner = RandomCirclePositioner()
199 |     positions_info = random_positioner.get_sources_locations(5)
200 |     pprint(positions_info)
201 | 
202 | 
203 | if __name__ == "__main__":
204 |     example_of_usage()
205 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/experiments/sample_convergence_LSTM.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief A simple experiment on how LSTM converge
  3 | 
  4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  5 | @copyright University of illinois at Urbana Champaign
  6 | """
  7 | 
  8 | import os
  9 | import sys
 10 | import torch
 11 | import time
 12 | import numpy as np
 13 | import copy
 14 | from pprint import pprint
 15 | import torch.nn as nn
 16 | 
 17 | root_dir = os.path.join(
 18 |     os.path.dirname(os.path.realpath(__file__)),
 19 |     '../../../')
 20 | sys.path.insert(0, root_dir)
 21 | 
 22 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc
 23 | import spatial_two_mics.dnn.losses.affinity_approximation as \
 24 |     affinity_losses
 25 | import spatial_two_mics.dnn.utils.dataset as data_generator
 26 | import spatial_two_mics.dnn.utils.data_conversions as converters
 27 | import spatial_two_mics.dnn.utils.experiment_command_line_parser as \
 28 |     parser
 29 | import spatial_two_mics.dnn.utils.update_history as update_history
 30 | from progress.bar import ChargingBar
 31 | import spatial_two_mics.utils.robust_means_clustering as robust_kmeans
 32 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as  \
 33 |     numpy_eval
 34 | from sklearn.preprocessing import StandardScaler
 35 | 
 36 | 
 37 | def train(args,
 38 |           model,
 39 |           training_generator,
 40 |           optimizer,
 41 |           mean_tr,
 42 |           std_tr,
 43 |           epoch,
 44 |           history,
 45 |           n_batches):
 46 |     model.train()
 47 |     timing_dic = {'Loading batch': 0.,
 48 |                   'Transformations and Forward': 0.,
 49 |                   'Loss Computation and Backprop': 0.}
 50 |     before = time.time()
 51 |     bar = ChargingBar("Training for epoch: {}...".format(epoch),
 52 |                       max=n_batches)
 53 |     for batch_data in training_generator:
 54 |         (abs_tfs, real_tfs, imag_tfs,
 55 |          duet_masks, ground_truth_masks,
 56 |          sources_raw, amplitudes, n_sources) = batch_data
 57 |         timing_dic['Loading batch'] += time.time() - before
 58 |         before = time.time()
 59 |         input_tfs, index_ys = abs_tfs.cuda(), duet_masks.cuda()
 60 |         # the input sequence is determined by time and not freqs
 61 |         # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
 62 |         input_tfs = input_tfs.permute(0, 2, 1).contiguous()
 63 | 
 64 |         # normalize with mean and variance from the training dataset
 65 |         input_tfs -= mean_tr
 66 |         input_tfs /= std_tr
 67 | 
 68 |         index_ys = index_ys.permute(0, 2, 1).contiguous()
 69 | 
 70 |         one_hot_ys = converters.one_hot_3Dmasks(index_ys,
 71 |                                                 n_sources[0])
 72 | 
 73 |         optimizer.zero_grad()
 74 |         vs = model(input_tfs)
 75 | 
 76 |         flatened_ys = one_hot_ys.view(one_hot_ys.size(0),
 77 |                                       -1,
 78 |                                       one_hot_ys.size(-1)).cuda()
 79 |         timing_dic['Transformations and Forward'] += time.time() - \
 80 |                                                      before
 81 |         before = time.time()
 82 |         naive_loss = affinity_losses.naive(vs, flatened_ys)
 83 |         naive_loss.backward()
 84 |         optimizer.step()
 85 |         timing_dic['Loss Computation and Backprop'] += time.time() - \
 86 |                                                       before
 87 | 
 88 |         update_history.values_update([('loss', naive_loss)],
 89 |                                      history, update_mode='batch')
 90 |         before = time.time()
 91 |         bar.next()
 92 |     bar.finish()
 93 | 
 94 |     pprint(timing_dic)
 95 | 
 96 | 
 97 | def eval(args,
 98 |          model,
 99 |          val_generator,
100 |          mean_tr,
101 |          std_tr,
102 |          epoch,
103 |          history,
104 |          n_batches):
105 |     timing_dic = {'Loading batch': 0.,
106 |                   'Transformations and Forward': 0.,
107 |                   'BSS CPU evaluation': 0.,
108 |                   'Kmeans evaluation': 0.}
109 |     r_kmeans = robust_kmeans.RobustKmeans(
110 |         n_true_clusters=args.n_sources,
111 |         n_used_clusters=args.n_sources)
112 |     z_scaler = StandardScaler()
113 | 
114 |     # make some evaluation
115 |     model.eval()
116 |     before = time.time()
117 |     with torch.no_grad():
118 |         bar = ChargingBar("Evaluating for epoch: {}...".format(epoch),
119 |                           max=n_batches)
120 |         before = time.time()
121 |         for batch_data in val_generator:
122 |             (abs_tfs, real_tfs, imag_tfs,
123 |              duet_masks, ground_truth_masks,
124 |              sources_raw, amplitudes, n_sources) = batch_data
125 |             timing_dic['Loading batch'] += time.time() - before
126 |             before = time.time()
127 |             input_tfs, index_ys = abs_tfs.cuda(), duet_masks.cuda()
128 |             # the input sequence is determined by time and not freqs
129 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
130 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
131 | 
132 |             # normalize with mean and variance from the training dataset
133 |             input_tfs -= mean_tr
134 |             input_tfs /= std_tr
135 | 
136 |             vs = model(input_tfs)
137 |             for b in np.arange(vs.size(0)):
138 |                 embedding_features = z_scaler.fit_transform(
139 |                                      vs[b, :, :].data.cpu().numpy())
140 | 
141 |                 embedding_labels = r_kmeans.fit(embedding_features)
142 | 
143 |                 sdr, sir, sar = numpy_eval.naive_cpu_bss_eval(
144 |                                 embedding_labels,
145 |                                 real_tfs[b].data.numpy(),
146 |                                 imag_tfs[b].data.numpy(),
147 |                                 sources_raw[b].data.numpy(),
148 |                                 n_sources[0].data.numpy())
149 | 
150 |                 update_history.values_update([('sdr', sdr),
151 |                                               ('sir', sir),
152 |                                               ('sar', sar)],
153 |                                              history,
154 |                                              update_mode='batch')
155 | 
156 |             before = time.time()
157 |             bar.next()
158 |         bar.finish()
159 | 
160 | 
161 | def convergence_of_LSTM(args):
162 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
163 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
164 | 
165 |     (training_generator, mean_tr, std_tr, n_tr_batches) = \
166 |         data_generator.get_data_generator(args,
167 |                                           return_stats=True)
168 |     
169 |     val_args = copy.copy(args)
170 |     val_args.partition = 'val'
171 |     val_generator, n_val_batches = \
172 |         data_generator.get_data_generator(val_args,
173 |                                           get_top=args.n_eval)
174 | 
175 |     model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers,
176 |                                   hidden_size=args.hidden_size,
177 |                                   embedding_depth=args.embedding_depth,
178 |                                   bidirectional=args.bidirectional)
179 |     model = nn.DataParallel(model).cuda()
180 | 
181 |     optimizer = torch.optim.Adam(model.parameters(),
182 |                                  lr=args.learning_rate,
183 |                                  betas=(0.9, 0.999))
184 | 
185 |     # just iterate over the data
186 |     history = {}
187 |     for epoch in np.arange(args.epochs):
188 | 
189 |         train(args, model, training_generator, optimizer, mean_tr,
190 |               std_tr, epoch, history, n_tr_batches)
191 | 
192 |         update_history.values_update([('loss', None)],
193 |                                      history,
194 |                                      update_mode='epoch')
195 | 
196 |         if epoch % args.evaluate_per == 0:
197 |             eval(args, model, val_generator, mean_tr,
198 |                  std_tr, epoch, history, n_val_batches)
199 | 
200 |             update_history.values_update([('sdr', None),
201 |                                           ('sir', None),
202 |                                           ('sar', None)],
203 |                                          history,
204 |                                          update_mode='epoch')
205 | 
206 |         pprint(history['loss'][-1])
207 |         pprint(history['sdr'][-1])
208 |         pprint(history['sir'][-1])
209 |         pprint(history['sar'][-1])
210 |         print("BEST SDR: {}, SIR: {}, SAR {}".format(max(history['sdr']),
211 |               max(history['sir']), max(history['sar'])))
212 | 
213 | 
214 | if __name__ == "__main__":
215 |     args = parser.get_args()
216 |     convergence_of_LSTM(args)
217 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/modules/model_evaluation.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief For a specific dataset just apply the saved models on a
  3 | specific dataset and save the results
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of illinois at Urbana Champaign
  7 | """
  8 | 
  9 | import argparse
 10 | import os
 11 | import sys
 12 | import numpy as np
 13 | from pprint import pprint
 14 | from joblib import Parallel, delayed
 15 | from tqdm import tqdm
 16 | import torch
 17 | import itertools
 18 | import pandas as pd
 19 | from progress.bar import ChargingBar
 20 | 
 21 | root_dir = os.path.join(
 22 |     os.path.dirname(os.path.realpath(__file__)),
 23 |     '../../../')
 24 | sys.path.insert(0, root_dir)
 25 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as data_loader
 26 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as np_eval
 27 | import spatial_two_mics.dnn.utils.model_logger as model_logger
 28 | from spatial_two_mics.config import RESULTS_DIR
 29 | from sklearn.cluster import KMeans
 30 | from spatial_two_mics.utils import robust_means_clustering as  \
 31 |      robust_kmeans
 32 | 
 33 | 
 34 | def eval(dataset_gen,
 35 |          model_path,
 36 |          n_sources,
 37 |          n_batches,
 38 |          n_jobs):
 39 | 
 40 |     model_name = os.path.basename(model_path)
 41 | 
 42 |     eval_dic = {'sdr': [], 'sir': [], 'sar': []}
 43 | 
 44 |     model, optimizer, _, _, args, mean_tr, std_tr, training_labels = \
 45 |         model_logger.load_and_create_the_model(model_path)
 46 | 
 47 |     k_means_obj = KMeans(n_clusters=n_sources, n_jobs=n_jobs)
 48 | 
 49 |     model.eval()
 50 |     with torch.no_grad():
 51 |         bar = ChargingBar("Evaluating model {} ...".format(model_name),
 52 |                           max=n_batches)
 53 |         for batch_data in dataset_gen:
 54 |             abs_tfs, wavs_lists, real_tfs, imag_tfs = batch_data
 55 |             input_tfs = abs_tfs.cuda()
 56 |             # the input sequence is determined by time and not freqs
 57 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
 58 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
 59 | 
 60 |             # normalize with mean and variance from the training dataset
 61 |             input_tfs -= mean_tr
 62 |             input_tfs /= std_tr
 63 | 
 64 |             vs = model(input_tfs)
 65 |             for b in np.arange(vs.size(0)):
 66 |                 embedding_features = vs[b, :, :].data.cpu().numpy()
 67 | 
 68 |                 z_embds = (embedding_features -
 69 |                            np.mean(embedding_features, axis=0)) / (
 70 |                            np.std(embedding_features, axis=0) + 10e-8)
 71 | 
 72 |                 embedding_labels = np.array(k_means_obj.fit_predict(
 73 |                     z_embds))
 74 | 
 75 |                 sdr, sir, sar = np_eval.naive_cpu_bss_eval(
 76 |                     embedding_labels,
 77 |                     real_tfs[b].data.numpy(),
 78 |                     imag_tfs[b].data.numpy(),
 79 |                     wavs_lists[b].data.numpy(),
 80 |                     n_sources,
 81 |                     batch_index=b)
 82 | 
 83 |                 eval_dic['sdr'].append(sdr)
 84 |                 eval_dic['sir'].append(sir)
 85 |                 eval_dic['sar'].append(sar)
 86 | 
 87 |             bar.next()
 88 |         bar.finish()
 89 | 
 90 |     # return both mean and std values
 91 |     mean_std_dic = {}
 92 |     for k, v in eval_dic.items():
 93 |         mean_std_dic[k+"_mean"] = np.mean(np.array(v))
 94 |         mean_std_dic[k+"_std"] = np.std(np.array(v))
 95 |     mean_std_dic['hidden_size'] = args.hidden_size
 96 |     mean_std_dic['num_layers'] = args.n_layers
 97 |     mean_std_dic['embedding_depth'] = args.embedding_depth
 98 |     mean_std_dic['dropout'] = str(args.dropout)
 99 |     mean_std_dic['lr'] = args.learning_rate
100 |     mean_std_dic['training_labels'] = training_labels
101 | 
102 |     return model_name, mean_std_dic
103 | 
104 | 
105 | def evaluate_models(pretrained_models,
106 |                     dataset_folder,
107 |                     n_jobs=1,
108 |                     get_top=None,
109 |                     batch_size=32):
110 | 
111 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
112 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
113 | 
114 |     (dataset_dir, partition) = (os.path.dirname(dataset_folder),
115 |                                 os.path.basename(dataset_folder))
116 | 
117 |     default_bs = batch_size
118 |     if get_top is None:
119 |         loading_bs = default_bs
120 |     else:
121 |         loading_bs = min(default_bs, get_top)
122 | 
123 |     print("Initializing the data loader for the dataset...")
124 |     val_generator, n_val_batches, n_val_sources = \
125 |         data_loader.get_data_generator(dataset_dir,
126 |                                        partition=partition,
127 |                                        get_top=get_top,
128 |                                        num_workers=n_jobs,
129 |                                        return_stats=False,
130 |                                        return_n_batches=True,
131 |                                        return_n_sources=True,
132 |                                        batch_size=loading_bs)
133 | 
134 |     eval_results = {}
135 |     for model_path in sorted(pretrained_models):
136 | 
137 |         try:
138 |             test_on = os.path.basename(dataset_dir) + '_' + partition
139 |             train_on = os.path.basename(os.path.dirname(model_path))
140 |             folder_name = os.path.join(RESULTS_DIR,
141 |                                        'test_on_' + test_on)
142 |             if not os.path.exists(folder_name):
143 |                 os.makedirs(folder_name)
144 | 
145 |             # if the df already exists then do not make the evaluation
146 |             df_path = os.path.join(folder_name,
147 |                                    'train_on_' + train_on + '.csv')
148 |             if os.path.exists(df_path):
149 |                 df = pd.read_csv(df_path)
150 |                 df.set_index("Unnamed: 0", drop=True, inplace=True)
151 |                 eval_results = df.to_dict(orient='index')
152 | 
153 |                 if os.path.basename(model_path) in eval_results.keys():
154 |                     continue
155 | 
156 |             model_name, res = eval(val_generator,
157 |                                    model_path,
158 |                                    n_val_sources,
159 |                                    n_val_batches,
160 |                                    n_jobs)
161 | 
162 |             print(model_name)
163 |             print(res)
164 | 
165 |             eval_results[model_name] = res
166 | 
167 |             df = pd.DataFrame(eval_results).T
168 |             df = df.sort_values(['sdr_mean'], ascending=False)
169 |             df.to_csv(df_path)
170 |         except Exception as e:
171 |             print(e)
172 | 
173 |     return df
174 | 
175 | 
176 | def get_args():
177 |     """! Command line parser for computing the evaluation for
178 |     specific datasets"""
179 |     parser = argparse.ArgumentParser(description='Evaluating'
180 |              ' stored models for a specific dataset')
181 |     parser.add_argument("-d", "--dataset_to_test", type=str,
182 |                         help="Dataset path you want to evaluate",
183 |                         default=None)
184 |     parser.add_argument("-m", "--pretrained_models", type=str,
185 |                         nargs='+',
186 |                         help="Paths of pretrained models that you "
187 |                              "need to test on this dataset",
188 |                         default=[])
189 |     parser.add_argument("--n_jobs", type=int,
190 |                         help="Number of parallel spawning jobs",
191 |                         default=1)
192 |     parser.add_argument("-bs", "--batch_size", type=int,
193 |                         help="Batch size to be evaluated",
194 |                         default=32)
195 |     parser.add_argument("--n_eval", type=int,
196 |                         help="""Reduce the number of evaluation 
197 |                             samples to this number.""", default=None)
198 |     parser.add_argument("-cad", "--cuda_available_devices", type=int,
199 |                         nargs="+",
200 |                         help="""A list of Cuda IDs that would be 
201 |                             available for running this experiment""",
202 |                         default=[0])
203 |     return parser.parse_args()
204 | 
205 | 
206 | if __name__ == "__main__":
207 |     args = get_args()
208 |     df_results = evaluate_models(args.pretrained_models,
209 |                                  args.dataset_to_test,
210 |                                  n_jobs=args.n_jobs,
211 |                                  get_top=args.n_eval,
212 |                                  batch_size=args.batch_size)
213 | 
214 |     pd.set_option('display.expand_frame_repr', False)
215 |     print(df_results.sort_values(['sdr_mean'], ascending=False))
216 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/experiments/simple_LSTM_encoder.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief A simple experiment on how models, losses, etc should be used
  3 | 
  4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  5 | @copyright University of illinois at Urbana Champaign
  6 | """
  7 | 
  8 | import argparse
  9 | import os
 10 | import sys
 11 | import torch
 12 | import time
 13 | import numpy as np
 14 | import copy
 15 | from pprint import pprint
 16 | from torch.utils.data import DataLoader
 17 | 
 18 | root_dir = os.path.join(
 19 |     os.path.dirname(os.path.realpath(__file__)),
 20 |     '../../../')
 21 | sys.path.insert(0, root_dir)
 22 | 
 23 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc
 24 | import spatial_two_mics.dnn.losses.affinity_approximation as \
 25 |     affinity_losses
 26 | import spatial_two_mics.dnn.utils.dataset as data_generator
 27 | import spatial_two_mics.dnn.utils.data_conversions as converters
 28 | 
 29 | 
 30 | def check_device_model_loading(model):
 31 |     device = 0
 32 |     print(torch.cuda.get_device_capability(device=device))
 33 |     print(torch.cuda.memory_allocated(device=device))
 34 |     print(torch.cuda.memory_cached(device=device))
 35 | 
 36 |     model = model.cuda()
 37 |     print(torch.cuda.get_device_properties(device=device).total_memory)
 38 |     print(torch.cuda.memory_allocated(device))
 39 |     print(torch.cuda.memory_cached(device))
 40 | 
 41 |     temp_model = copy.deepcopy(model)
 42 |     temp_model = temp_model.cuda()
 43 |     print(torch.cuda.max_memory_cached(device=device))
 44 |     print(torch.cuda.memory_allocated(device))
 45 |     print(torch.cuda.memory_cached(device))
 46 | 
 47 | 
 48 | def compare_losses(vs, one_hot_ys):
 49 |     timing_dic = {}
 50 | 
 51 |     before = time.time()
 52 |     flatened_ys = one_hot_ys.view(one_hot_ys.size(0),
 53 |                                   -1,
 54 |                                   one_hot_ys.size(-1)).cuda()
 55 |     naive_loss = affinity_losses.naive(vs, flatened_ys)
 56 |     now = time.time()
 57 |     timing_dic['Naive Loss Implementation'] = now - before
 58 | 
 59 |     before = time.time()
 60 |     expanded_vs = vs.view(vs.size(0), one_hot_ys.size(1),
 61 |                           one_hot_ys.size(2), vs.size(-1)).cuda()
 62 |     diagonal_loss = affinity_losses.diagonal(expanded_vs,
 63 |                                              one_hot_ys)
 64 |     now = time.time()
 65 |     timing_dic['Diagonal Loss Implementation'] = now - before
 66 | 
 67 |     pprint(timing_dic)
 68 | 
 69 |     return diagonal_loss
 70 | 
 71 | 
 72 | def example_of_usage(args):
 73 | 
 74 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
 75 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
 76 |     print(visible_cuda_ids)
 77 |     print(torch.cuda.current_device())
 78 | 
 79 |     training_generator, n_batches = data_generator.get_data_generator(
 80 |         args)
 81 |     timing_dic = {}
 82 | 
 83 |     before = time.time()
 84 |     model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers,
 85 |                                   hidden_size=args.hidden_size,
 86 |                                   embedding_depth=args.embedding_depth,
 87 |                                   bidirectional=args.bidirectional)
 88 |     timing_dic['Iitializing model'] = time.time() - before
 89 |     model = model.cuda()
 90 |     timing_dic['Transfering model to device'] = time.time() - before
 91 | 
 92 |     optimizer = torch.optim.Adam(model.parameters(),
 93 |                                  lr=args.learning_rate,
 94 |                                  betas=(0.9, 0.999))
 95 | 
 96 |     # just iterate over the data
 97 |     epochs = 10
 98 |     for epoch in np.arange(epochs):
 99 |         print("Training for epoch: {}...".format(epoch))
100 |         for batch_data in training_generator:
101 | 
102 |             (abs_tfs, real_tfs, imag_tfs,
103 |              duet_masks, ground_truth_masks,
104 |              sources_raw, amplitudes, n_sources) = batch_data
105 | 
106 |             input_tfs, index_ys = abs_tfs.cuda(), duet_masks.cuda()
107 |             # the input sequence is determined by time and not freqs
108 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
109 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
110 |             index_ys = index_ys.permute(0, 2, 1).contiguous()
111 | 
112 |             one_hot_ys = converters.one_hot_3Dmasks(index_ys, n_sources[0])
113 | 
114 |             timing_dic = {}
115 | 
116 |             optimizer.zero_grad()
117 |             vs = model(input_tfs)
118 | 
119 |             before = time.time()
120 |             flatened_ys = one_hot_ys.view(one_hot_ys.size(0),
121 |                                           -1,
122 |                                           one_hot_ys.size(-1)).cuda()
123 |             naive_loss = affinity_losses.naive(vs, flatened_ys)
124 |             naive_loss.backward()
125 |             optimizer.step()
126 |             now = time.time()
127 |             print("Naive Loss: {}".format(naive_loss))
128 |             timing_dic['Naive Loss Implementation Time'] = now - before
129 | 
130 |             optimizer.zero_grad()
131 |             vs = model(input_tfs)
132 | 
133 |             before = time.time()
134 |             expanded_vs = vs.view(vs.size(0), one_hot_ys.size(1),
135 |                                   one_hot_ys.size(2), vs.size(-1)).cuda()
136 |             diagonal_loss = affinity_losses.diagonal(expanded_vs,
137 |                                                      one_hot_ys)
138 |             diagonal_loss.backward()
139 |             optimizer.step()
140 |             now = time.time()
141 |             print("Diagonal Loss: {}".format(diagonal_loss))
142 |             timing_dic['Diagonal Loss Implementation Time'] = now - before
143 | 
144 |             pprint(timing_dic)
145 | 
146 | 
147 | 
148 | def get_args():
149 |     """! Command line parser """
150 |     parser = argparse.ArgumentParser(description='Deep Clustering for '
151 |                                                  'Audio Source '
152 |                                                  'Separation '
153 |                                                  'Experiment')
154 |     parser.add_argument("--dataset", type=str,
155 |                         help="Dataset name",
156 |                         default="timit")
157 |     parser.add_argument("--n_sources", type=int,
158 |                         help="How many sources in each mix",
159 |                         default=2)
160 |     parser.add_argument("--n_samples", type=int, nargs='+',
161 |                         help="How many samples do u want to be "
162 |                              "created for train test val",
163 |                         default=[256, 64, 128])
164 |     parser.add_argument("--genders", type=str, nargs='+',
165 |                         help="Genders that will correspond to the "
166 |                              "genders in the mixtures",
167 |                         default=['m'])
168 |     parser.add_argument("-f", "--force_delays", nargs='+', type=int,
169 |                         help="""Whether you want to force integer 
170 |                         delays of +- 1 in the sources e.g.""",
171 |                         default=[-1, 1])
172 |     parser.add_argument("-nl", "--n_layers", type=int,
173 |                         help="""The number of layers of the LSTM 
174 |                         encoder""", default=2)
175 |     parser.add_argument("-ed", "--embedding_depth", type=int,
176 |                         help="""The depth of the embedding""",
177 |                         default=10)
178 |     parser.add_argument("-hs", "--hidden_size", type=int,
179 |                         help="""The size of the LSTM cells """,
180 |                         default=10)
181 |     parser.add_argument("-bs", "--batch_size", type=int,
182 |                         help="""The number of samples in each batch""",
183 |                         default=64)
184 |     parser.add_argument("-name", "--experiment_name", type=str,
185 |                         help="""The name or identifier of this 
186 |                         experiment""",
187 |                         default='A sample experiment')
188 |     parser.add_argument("-cad", "--cuda_available_devices", type=int,
189 |                         nargs="+",
190 |                         help="""A list of Cuda IDs that would be 
191 |                         available for runnign this experiment""",
192 |                         default=[0])
193 |     parser.add_argument("--num_workers", type=int,
194 |                         help="""The number of cpu workers for 
195 |                         loading the data, etc.""", default=3)
196 |     parser.add_argument("-lr", "--learning_rate", type=float,
197 |                         help="""Initial Learning rate""", default=1e-1)
198 |     parser.add_argument("--bidirectional", action='store_true',
199 |                         help="""Bidirectional or not""")
200 | 
201 |     return parser.parse_args()
202 | 
203 | 
204 | if __name__ == "__main__":
205 |     args = get_args()
206 |     example_of_usage(args)


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/experiments/check_overfitting.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Using the fast version of the dataset generator provide a
  3 | naive experimental setup for checking the capability of the model to
  4 | verfit on a set of data
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of illinois at Urbana Champaign
  8 | """
  9 | 
 10 | import os
 11 | import sys
 12 | import torch
 13 | import time
 14 | import numpy as np
 15 | import copy
 16 | from pprint import pprint
 17 | import torch.nn as nn
 18 | 
 19 | root_dir = os.path.join(
 20 |     os.path.dirname(os.path.realpath(__file__)),
 21 |     '../../../')
 22 | sys.path.insert(0, root_dir)
 23 | 
 24 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc
 25 | import spatial_two_mics.dnn.losses.affinity_approximation as \
 26 |     affinity_losses
 27 | import spatial_two_mics.dnn.utils.fast_dataset_v2 as fast_data_gen
 28 | import spatial_two_mics.dnn.utils.data_conversions as converters
 29 | import spatial_two_mics.dnn.utils.experiment_command_line_parser as \
 30 |     parser
 31 | import spatial_two_mics.dnn.utils.update_history as update_history
 32 | from progress.bar import ChargingBar
 33 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as \
 34 |     numpy_eval
 35 | from sklearn.cluster import KMeans
 36 | 
 37 | 
 38 | def train(args,
 39 |           model,
 40 |           training_generator,
 41 |           optimizer,
 42 |           mean_tr,
 43 |           std_tr,
 44 |           epoch,
 45 |           history,
 46 |           n_batches):
 47 |     model.train()
 48 |     timing_dic = {'Loading batch': 0.,
 49 |                   'Transformations and Forward': 0.,
 50 |                   'Loss Computation and Backprop': 0.}
 51 |     before = time.time()
 52 |     bar = ChargingBar("Training for epoch: {}...".format(epoch),
 53 |                       max=n_batches)
 54 |     for batch_data in training_generator:
 55 |         abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
 56 |         timing_dic['Loading batch'] += time.time() - before
 57 |         before = time.time()
 58 |         input_tfs, index_ys = abs_tfs.cuda(), masks.cuda()
 59 |         # the input sequence is determined by time and not freqs
 60 |         # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
 61 |         input_tfs = input_tfs.permute(0, 2, 1).contiguous()
 62 |         index_ys = index_ys.permute(0, 2, 1).contiguous()
 63 | 
 64 |         # normalize with mean and variance from the training dataset
 65 |         input_tfs -= mean_tr
 66 |         input_tfs /= std_tr
 67 | 
 68 |         # index_ys = index_ys.permute(0, 2, 1).contiguous()
 69 |         one_hot_ys = converters.one_hot_3Dmasks(index_ys,
 70 |                                                 args.n_sources)
 71 | 
 72 |         optimizer.zero_grad()
 73 |         vs = model(input_tfs)
 74 | 
 75 |         flatened_ys = one_hot_ys.view(one_hot_ys.size(0),
 76 |                                       -1,
 77 |                                       one_hot_ys.size(-1)).cuda()
 78 | 
 79 |         timing_dic['Transformations and Forward'] += time.time() - \
 80 |                                                      before
 81 |         before = time.time()
 82 |         loss = affinity_losses.paris_naive(vs, flatened_ys)
 83 |         # loss = affinity_losses.diagonal(vs.view(vs.size(0),
 84 |         #                                         one_hot_ys.size(1),
 85 |         #                                         one_hot_ys.size(2),
 86 |         #                                         vs.size(-1)),
 87 |         #                                 one_hot_ys.cuda())
 88 | 
 89 |         loss.backward()
 90 |         nn.utils.clip_grad_norm(model.parameters(), 100.)
 91 |         optimizer.step()
 92 |         timing_dic['Loss Computation and Backprop'] += time.time() - \
 93 |                                                        before
 94 | 
 95 |         update_history.values_update([('loss', loss)],
 96 |                                      history, update_mode='batch')
 97 |         before = time.time()
 98 |         bar.next()
 99 |     bar.finish()
100 | 
101 |     pprint(timing_dic)
102 | 
103 | 
104 | def eval(args,
105 |          model,
106 |          val_generator,
107 |          mean_tr,
108 |          std_tr,
109 |          epoch,
110 |          history,
111 |          n_batches,
112 |          k_means_obj):
113 |     timing_dic = {'Standard Scaler': 0.,
114 |                   'Kmeans': 0.,
115 |                   'Dummy BSS evaluation': 0.}
116 | 
117 |     # make some evaluation
118 |     model.eval()
119 |     before = time.time()
120 |     with torch.no_grad():
121 |         bar = ChargingBar("Evaluating for epoch: {}...".format(epoch),
122 |                           max=n_batches)
123 |         before = time.time()
124 |         for batch_data in val_generator:
125 |             abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
126 |             input_tfs = abs_tfs.cuda()
127 |             # the input sequence is determined by time and not freqs
128 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
129 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
130 | 
131 |             # normalize with mean and variance from the training dataset
132 |             input_tfs -= mean_tr
133 |             input_tfs /= std_tr
134 | 
135 |             vs = model(input_tfs)
136 |             for b in np.arange(vs.size(0)):
137 | 
138 |                 # possibly go into GPU ?
139 |                 # before = time.time()
140 |                 # embedding_features = z_scaler.fit_transform(
141 |                 #     vs[b, :, :].data.cpu().numpy())
142 |                 # timing_dic['Standard Scaler'] += time.time() - before
143 | 
144 |                 embedding_features = vs[b, :, :].data.cpu().numpy()
145 |                 # embedding_features = masks[b, :, :].view(-1, 1).data.numpy()
146 |                 # embedding_labels = masks[b].data.numpy()
147 |                 # embedding_features = flatened_ys[b, :, :].data.cpu().numpy()
148 | 
149 | 
150 | 
151 |                 # possibly perform kmeans on GPU?
152 |                 before = time.time()
153 |                 embedding_labels = np.array(k_means_obj.fit_predict(
154 |                                             embedding_features))
155 |                 timing_dic['Kmeans'] += time.time() - before
156 | 
157 |                 # possibly do it on GPU?
158 |                 before = time.time()
159 |                 sdr, sir, sar = numpy_eval.naive_cpu_bss_eval(
160 |                     embedding_labels,
161 |                     real_tfs[b].data.numpy(),
162 |                     imag_tfs[b].data.numpy(),
163 |                     wavs_lists[b].data.numpy(),
164 |                     args.n_sources,
165 |                     batch_index=b)
166 |                 timing_dic['Dummy BSS evaluation'] += time.time() - before
167 | 
168 |                 update_history.values_update([('sdr', sdr),
169 |                                               ('sir', sir),
170 |                                               ('sar', sar)],
171 |                                              history,
172 |                                              update_mode='batch')
173 | 
174 |             bar.next()
175 |         pprint(timing_dic)
176 |         bar.finish()
177 | 
178 | 
179 | def convergence_of_LSTM(args):
180 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
181 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
182 | 
183 |     val_args = copy.copy(args)
184 |     val_args.partition = 'val'
185 |     val_generator, mean_val, std_val, n_val_batches = \
186 |         fast_data_gen.get_data_generator(val_args,
187 |                                          return_stats=True,
188 |                                          get_top=args.n_eval)
189 | 
190 |     model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers,
191 |                                   hidden_size=args.hidden_size,
192 |                                   embedding_depth=args.embedding_depth,
193 |                                   bidirectional=args.bidirectional)
194 |     model = nn.DataParallel(model).cuda()
195 | 
196 |     optimizer = torch.optim.Adam(model.parameters(),
197 |                                  lr=args.learning_rate,
198 |                                  betas=(0.9, 0.999))
199 | 
200 |     k_means_obj = KMeans(n_clusters=2)
201 |     # just iterate over the data
202 |     history = {}
203 |     for epoch in np.arange(args.epochs):
204 | 
205 |         train(args, model, val_generator, optimizer, mean_val,
206 |               std_val, epoch, history, n_val_batches)
207 | 
208 |         update_history.values_update([('loss', None)],
209 |                                      history,
210 |                                      update_mode='epoch')
211 | 
212 | 
213 |         if epoch % args.evaluate_per == 0:
214 |             eval(args, model, val_generator, mean_val,
215 |                  std_val, epoch, history, n_val_batches, k_means_obj)
216 | 
217 |             update_history.values_update([('sdr', None),
218 |                                           ('sir', None),
219 |                                           ('sar', None)],
220 |                                          history,
221 |                                          update_mode='epoch')
222 | 
223 |         pprint(history['loss'][-1])
224 |         pprint(history['sdr'][-1])
225 |         pprint(history['sir'][-1])
226 |         pprint(history['sar'][-1])
227 |         print(
228 |             "BEST SDR: {}, SIR: {}, SAR {}".format(max(history['sdr']),
229 |                                                    max(history['sir']),
230 |                                                    max(history['sar'])))
231 | 
232 | 
233 | if __name__ == "__main__":
234 |     args = parser.get_args()
235 |     convergence_of_LSTM(args)
236 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/experiments/convergence_check_v2.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Using the fast version of the dataset generator provide a
  3 | naive experimental setup for checking the convergence of the model
  4 | 
  5 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  6 | @copyright University of illinois at Urbana Champaign
  7 | """
  8 | 
  9 | import os
 10 | import sys
 11 | import torch
 12 | import time
 13 | import numpy as np
 14 | import copy
 15 | from pprint import pprint
 16 | import torch.nn as nn
 17 | 
 18 | root_dir = os.path.join(
 19 |     os.path.dirname(os.path.realpath(__file__)),
 20 |     '../../../')
 21 | sys.path.insert(0, root_dir)
 22 | 
 23 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc
 24 | import spatial_two_mics.dnn.losses.affinity_approximation as \
 25 |     affinity_losses
 26 | import spatial_two_mics.dnn.utils.fast_dataset_v2 as fast_data_gen
 27 | import spatial_two_mics.dnn.utils.data_conversions as converters
 28 | import spatial_two_mics.dnn.utils.experiment_command_line_parser as \
 29 |     parser
 30 | import spatial_two_mics.dnn.utils.update_history as update_history
 31 | from progress.bar import ChargingBar
 32 | import spatial_two_mics.utils.robust_means_clustering as robust_kmeans
 33 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as \
 34 |     numpy_eval
 35 | from sklearn.preprocessing import StandardScaler
 36 | from sklearn.cluster import KMeans
 37 | import librosa
 38 | 
 39 | 
 40 | def train(args,
 41 |           model,
 42 |           training_generator,
 43 |           optimizer,
 44 |           mean_tr,
 45 |           std_tr,
 46 |           epoch,
 47 |           history,
 48 |           n_batches):
 49 |     model.train()
 50 |     timing_dic = {'Loading batch': 0.,
 51 |                   'Transformations and Forward': 0.,
 52 |                   'Loss Computation and Backprop': 0.}
 53 |     before = time.time()
 54 |     bar = ChargingBar("Training for epoch: {}...".format(epoch),
 55 |                       max=n_batches)
 56 |     for batch_data in training_generator:
 57 |         (abs_tfs, masks) = batch_data
 58 |         timing_dic['Loading batch'] += time.time() - before
 59 |         before = time.time()
 60 |         input_tfs, index_ys = abs_tfs.cuda(), masks.cuda()
 61 |         # the input sequence is determined by time and not freqs
 62 |         # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
 63 |         input_tfs = input_tfs.permute(0, 2, 1).contiguous()
 64 |         index_ys = index_ys.permute(0, 2, 1).contiguous()
 65 | 
 66 |         # normalize with mean and variance from the training dataset
 67 |         input_tfs -= mean_tr
 68 |         input_tfs /= std_tr
 69 | 
 70 |         # index_ys = index_ys.permute(0, 2, 1).contiguous()
 71 |         one_hot_ys = converters.one_hot_3Dmasks(index_ys,
 72 |                                                 args.n_sources)
 73 | 
 74 |         optimizer.zero_grad()
 75 |         vs = model(input_tfs)
 76 | 
 77 |         flatened_ys = one_hot_ys.view(one_hot_ys.size(0),
 78 |                                       -1,
 79 |                                       one_hot_ys.size(-1)).cuda()
 80 | 
 81 |         timing_dic['Transformations and Forward'] += time.time() - \
 82 |                                                      before
 83 |         before = time.time()
 84 |         loss = affinity_losses.paris_naive(vs, flatened_ys)
 85 |         # loss = affinity_losses.diagonal(vs.view(vs.size(0),
 86 |         #                                         one_hot_ys.size(1),
 87 |         #                                         one_hot_ys.size(2),
 88 |         #                                         vs.size(-1)),
 89 |         #                                 one_hot_ys.cuda())
 90 | 
 91 |         loss.backward()
 92 |         nn.utils.clip_grad_norm(model.parameters(), 100.)
 93 |         optimizer.step()
 94 |         timing_dic['Loss Computation and Backprop'] += time.time() - \
 95 |                                                        before
 96 | 
 97 |         update_history.values_update([('loss', loss)],
 98 |                                      history, update_mode='batch')
 99 |         before = time.time()
100 |         bar.next()
101 |     bar.finish()
102 | 
103 |     pprint(timing_dic)
104 | 
105 | 
106 | def eval(args,
107 |          model,
108 |          val_generator,
109 |          mean_tr,
110 |          std_tr,
111 |          epoch,
112 |          history,
113 |          n_batches,
114 |          k_means_obj):
115 |     timing_dic = {'Standard Scaler': 0.,
116 |                   'Kmeans': 0.,
117 |                   'Dummy BSS evaluation': 0.}
118 | 
119 |     # make some evaluation
120 |     model.eval()
121 |     before = time.time()
122 |     with torch.no_grad():
123 |         bar = ChargingBar("Evaluating for epoch: {}...".format(epoch),
124 |                           max=n_batches)
125 |         before = time.time()
126 |         for batch_data in val_generator:
127 |             abs_tfs, masks, wavs_lists, real_tfs, imag_tfs = batch_data
128 |             input_tfs = abs_tfs.cuda()
129 |             # the input sequence is determined by time and not freqs
130 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
131 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
132 | 
133 |             # normalize with mean and variance from the training dataset
134 |             input_tfs -= mean_tr
135 |             input_tfs /= std_tr
136 | 
137 |             vs = model(input_tfs)
138 |             for b in np.arange(vs.size(0)):
139 | 
140 |                 # possibly go into GPU ?
141 |                 # before = time.time()
142 |                 # embedding_features = z_scaler.fit_transform(
143 |                 #     vs[b, :, :].data.cpu().numpy())
144 |                 # timing_dic['Standard Scaler'] += time.time() - before
145 | 
146 |                 embedding_features = vs[b, :, :].data.cpu().numpy()
147 |                 # embedding_features = masks[b, :, :].view(-1, 1).data.numpy()
148 |                 # embedding_labels = masks[b].data.numpy()
149 |                 # embedding_features = flatened_ys[b, :, :].data.cpu().numpy()
150 | 
151 | 
152 | 
153 |                 # possibly perform kmeans on GPU?
154 |                 before = time.time()
155 |                 embedding_labels = np.array(k_means_obj.fit_predict(
156 |                                             embedding_features))
157 |                 timing_dic['Kmeans'] += time.time() - before
158 | 
159 |                 # possibly do it on GPU?
160 |                 before = time.time()
161 |                 sdr, sir, sar = numpy_eval.naive_cpu_bss_eval(
162 |                     embedding_labels,
163 |                     real_tfs[b].data.numpy(),
164 |                     imag_tfs[b].data.numpy(),
165 |                     wavs_lists[b].data.numpy(),
166 |                     args.n_sources,
167 |                     batch_index=b)
168 |                 timing_dic['Dummy BSS evaluation'] += time.time() - before
169 | 
170 |                 update_history.values_update([('sdr', sdr),
171 |                                               ('sir', sir),
172 |                                               ('sar', sar)],
173 |                                              history,
174 |                                              update_mode='batch')
175 | 
176 |             bar.next()
177 |         pprint(timing_dic)
178 |         bar.finish()
179 | 
180 | 
181 | def convergence_of_LSTM(args):
182 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
183 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
184 | 
185 |     (training_generator, mean_tr, std_tr, n_tr_batches) = \
186 |         fast_data_gen.get_data_generator(args,
187 |                                          return_stats=True)
188 | 
189 |     val_args = copy.copy(args)
190 |     val_args.partition = 'val'
191 |     val_generator, n_val_batches = \
192 |         fast_data_gen.get_data_generator(val_args,
193 |                                          get_top=args.n_eval)
194 | 
195 |     model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers,
196 |                                   hidden_size=args.hidden_size,
197 |                                   embedding_depth=args.embedding_depth,
198 |                                   bidirectional=args.bidirectional)
199 |     model = nn.DataParallel(model).cuda()
200 | 
201 |     optimizer = torch.optim.Adam(model.parameters(),
202 |                                  lr=args.learning_rate,
203 |                                  betas=(0.9, 0.999))
204 | 
205 |     k_means_obj = KMeans(n_clusters=2)
206 |     # just iterate over the data
207 |     history = {}
208 |     for epoch in np.arange(args.epochs):
209 | 
210 |         train(args, model, training_generator, optimizer, mean_tr,
211 |               std_tr, epoch, history, n_tr_batches)
212 | 
213 |         update_history.values_update([('loss', None)],
214 |                                      history,
215 |                                      update_mode='epoch')
216 | 
217 | 
218 |         if epoch % args.evaluate_per == 0:
219 |             eval(args, model, val_generator, mean_tr,
220 |                  std_tr, epoch, history, n_val_batches, k_means_obj)
221 | 
222 |             update_history.values_update([('sdr', None),
223 |                                           ('sir', None),
224 |                                           ('sar', None)],
225 |                                          history,
226 |                                          update_mode='epoch')
227 | 
228 |         pprint(history['loss'][-1])
229 |         pprint(history['sdr'][-1])
230 |         pprint(history['sir'][-1])
231 |         pprint(history['sar'][-1])
232 |         print(
233 |             "BEST SDR: {}, SIR: {}, SAR {}".format(max(history['sdr']),
234 |                                                    max(history['sir']),
235 |                                                    max(history['sar'])))
236 | 
237 | 
238 | if __name__ == "__main__":
239 |     args = parser.get_args()
240 |     convergence_of_LSTM(args)
241 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/fast_dataset_v3.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief A dataset creation which is compatible with pytorch framework
  3 | and much faster in loading time depending on the new version of
  4 | loading only the appropriate files that might be needed. Moreover
  5 | this dataset has minimal input argument requirements in order to be
  6 | more user friendly.
  7 | 
  8 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  9 | @copyright University of illinois at Urbana Champaign
 10 | """
 11 | 
 12 | import os
 13 | import glob2
 14 | import numpy as np
 15 | from sklearn.externals import joblib
 16 | from torch.utils.data import Dataset, DataLoader
 17 | 
 18 | 
 19 | class PytorchMixtureDataset(Dataset):
 20 |     """
 21 |     This is a general compatible class for pytorch datasets.
 22 | 
 23 |     @note Each instance of the dataset should be stored using
 24 |     joblib.dump() and this is the way that it would be returned.
 25 |     After some transformations.
 26 | 
 27 |     The path of all datasets should be defined inside config.
 28 |     All datasets should be formatted with appropriate subfolders of
 29 |     train / test and val and under them there should be all the
 30 |     available files.
 31 |     """
 32 |     def __init__(self,
 33 |                  dataset_dir,
 34 |                  partition='train',
 35 |                  get_top=None,
 36 |                  labels_mask='duet',
 37 |                  only_mask_evaluation=False,
 38 |                  **kwargs):
 39 |         """!
 40 |         Input dataset dir should have the following structure:
 41 |         ./dataset_dir
 42 |             ./train
 43 |             ./test
 44 |             ./val
 45 |         """
 46 | 
 47 |         self.dataset_dirpath = os.path.join(dataset_dir,
 48 |                                             partition)
 49 |         self.dataset_stats_path = self.dataset_dirpath + '_stats'
 50 |         self.partition = partition
 51 | 
 52 |         if (labels_mask == 'duet'
 53 |             or labels_mask == 'ground_truth'
 54 |             or labels_mask == 'raw_phase_diff'):
 55 |             self.selected_mask = labels_mask
 56 |         elif labels_mask is None:
 57 |             pass
 58 |         else:
 59 |             raise NotImplementedError("There is no available mask "
 60 |                   "called: {}".format(labels_mask))
 61 | 
 62 |         if not os.path.isdir(self.dataset_dirpath):
 63 |             raise IOError("Dataset folder {} not found!".format(
 64 |                 self.dataset_dirpath))
 65 |         else:
 66 |             print("Loading files from {} ...".format(
 67 |                 self.dataset_dirpath))
 68 | 
 69 |         self.mixture_folders = glob2.glob(os.path.join(
 70 |                                self.dataset_dirpath, '*'))
 71 |         if get_top is not None:
 72 |             self.mixture_folders = self.mixture_folders[:get_top]
 73 | 
 74 |         self.n_samples = len(self.mixture_folders)
 75 |         self.only_mask_evaluation = only_mask_evaluation
 76 | 
 77 |         self.n_sources = int(os.path.basename(
 78 |                              dataset_dir).split("_")[4])
 79 | 
 80 |         # preprocess -- store all absolute spectra values for faster
 81 |         # loading during run time
 82 |         self.store_directly_abs_spectra()
 83 | 
 84 |     def __len__(self):
 85 |         return self.n_samples
 86 | 
 87 |     def __getitem__(self, idx):
 88 |         """!
 89 |         Depending on the selected partition it returns accordingly
 90 |         the following objects:
 91 | 
 92 |         if self.partition == 'train':
 93 |             (abs_tfs, selected_mask)
 94 |         else if partition == 'test' or 'val'
 95 |             (abs_tfs, selected_mask, wavs_list, real_tfs, imag_tfs)"""
 96 |         mix_folder = self.mixture_folders[idx]
 97 |         try:
 98 |             abs_tfs = joblib.load(os.path.join(mix_folder, 'abs_tfs'))
 99 |         except:
100 |             raise IOError("Failed to load data from path: {} "
101 |                           "for absolute spectra.".format(mix_folder))
102 | 
103 |         if self.partition == 'val' or self.partition == 'test':
104 |             try:
105 |                 real_p = os.path.join(mix_folder, 'real_tfs')
106 |                 imag_p = os.path.join(mix_folder, 'imag_tfs')
107 |                 wavs_p = os.path.join(mix_folder, 'wavs')
108 |                 real_tfs = joblib.load(real_p)
109 |                 imag_tfs = joblib.load(imag_p)
110 |                 wavs_list = joblib.load(wavs_p)
111 |                 wavs_list = np.array(wavs_list)
112 |             except:
113 |                 raise IOError("Failed to load data from path: {} "
114 |                               "for real, imag tf of the mixture and "
115 |                               "wavs".format(mix_folder))
116 | 
117 |             if not self.only_mask_evaluation:
118 |                 return abs_tfs, wavs_list, real_tfs, imag_tfs
119 | 
120 |             try:
121 |                 if self.selected_mask == 'duet':
122 |                     mask = joblib.load(os.path.join(mix_folder,
123 |                                                     'soft_labeled_mask'))
124 |                 elif self.selected_mask == 'ground_truth':
125 |                     mask = joblib.load(os.path.join(mix_folder,
126 |                                                     'ground_truth_mask'))
127 |             except Exception as e:
128 |                 print(e)
129 |                 raise IOError("Failed to load data from path: {} "
130 |                               "for tf label masks".format(mix_folder))
131 | 
132 |             return abs_tfs, mask, wavs_list, real_tfs, imag_tfs
133 | 
134 |         if self.partition == 'train':
135 |             try:
136 |                 if self.selected_mask == 'duet':
137 |                     mask = joblib.load(os.path.join(mix_folder,
138 |                                        'soft_labeled_mask'))
139 |                 elif self.selected_mask == 'ground_truth':
140 |                     mask = joblib.load(os.path.join(mix_folder,
141 |                                        'ground_truth_mask'))
142 |                 else:
143 |                     mask = joblib.load(os.path.join(mix_folder,
144 |                                        'raw_phase_diff'))
145 |             except Exception as e:
146 |                 print(e)
147 |                 raise IOError("Failed to load data from path: {} "
148 |                               "for tf label masks".format(mix_folder))
149 |             return abs_tfs, mask
150 | 
151 |         return None
152 | 
153 |     def store_directly_abs_spectra(self):
154 |         for mix_folder in self.mixture_folders:
155 |             abs_p = os.path.join(mix_folder, 'abs_tfs')
156 |             if os.path.lexists(abs_p):
157 |                 continue
158 | 
159 |             try:
160 |                 real_p = os.path.join(mix_folder, 'real_tfs')
161 |                 imag_p = os.path.join(mix_folder, 'imag_tfs')
162 |                 real_tfs = joblib.load(real_p)
163 |                 imag_tfs = joblib.load(imag_p)
164 |             except:
165 |                 raise IOError("Failed to load data from path: {} "
166 |                               "using joblib.".format(mix_folder))
167 |             abs_tfs = np.abs(real_tfs + 1j * imag_tfs)
168 |             try:
169 |                 joblib.dump(abs_tfs, abs_p, compress=0)
170 |             except:
171 |                 raise IOError("Failed to save absolute value of "
172 |                               "spectra in path: {}".format(abs_p))
173 | 
174 |     def extract_stats(self):
175 |         if not os.path.lexists(self.dataset_stats_path):
176 |             mean = 0.
177 |             std = 0.
178 |             for mix_folder in self.mixture_folders:
179 |                 try:
180 |                     abs_p = os.path.join(mix_folder, 'abs_tfs')
181 |                     abs_tfs = joblib.load(abs_p)
182 |                 except:
183 |                     raise IOError("Failed to load absolute tf "
184 |                                   "representation from path: {} "
185 |                                   "using joblib.".format(abs_p))
186 | 
187 |                 mean += np.mean(np.mean(abs_tfs))
188 |                 std += np.std(abs_tfs)
189 |             mean /= self.__len__()
190 |             std /= self.__len__()
191 | 
192 |             #     store them for later usage
193 |             joblib.dump((mean, std), self.dataset_stats_path)
194 |             print("Saving dataset mean and variance in: {}".format(
195 |                 self.dataset_stats_path))
196 |         else:
197 |             mean, std = joblib.load(self.dataset_stats_path)
198 | 
199 |         return mean, std
200 | 
201 | 
202 | def get_data_generator(dataset_dir,
203 |                        partition='train',
204 |                        num_workers=1,
205 |                        return_stats=False,
206 |                        get_top=None,
207 |                        batch_size=1,
208 |                        return_n_batches=True,
209 |                        labels_mask='duet',
210 |                        return_n_sources=False,
211 |                        only_mask_evaluation=False):
212 |     data = PytorchMixtureDataset(dataset_dir,
213 |                                  partition=partition,
214 |                                  get_top=get_top,
215 |                                  labels_mask=labels_mask,
216 |                                  only_mask_evaluation=only_mask_evaluation)
217 |     generator_params = {'batch_size': batch_size,
218 |                         'shuffle': True,
219 |                         'num_workers': num_workers,
220 |                         'drop_last': True}
221 |     data_generator = DataLoader(data,
222 |                                 **generator_params,
223 |                                 pin_memory=False)
224 | 
225 |     results = [data_generator]
226 | 
227 |     if return_stats:
228 |         mean, std = data.extract_stats()
229 |         results += [mean, std]
230 | 
231 |     if return_n_batches:
232 |         n_batches = int(len(data) / batch_size)
233 |         results.append(n_batches)
234 | 
235 |     if return_n_sources:
236 |         results.append(data.n_sources)
237 | 
238 |     return results
239 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/experiments/run_experiment_v1.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Using the fast version of the dataset generator provide a
  3 | naive experimental setup for performing the experiment using also the
  4 | new command line argument parser.
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of illinois at Urbana Champaign
  8 | """
  9 | 
 10 | 
 11 | 
 12 | import os
 13 | import sys
 14 | import torch
 15 | import time
 16 | import numpy as np
 17 | import copy
 18 | from pprint import pprint
 19 | import torch.nn as nn
 20 | 
 21 | root_dir = os.path.join(
 22 |     os.path.dirname(os.path.realpath(__file__)),
 23 |     '../../../')
 24 | sys.path.insert(0, root_dir)
 25 | 
 26 | import spatial_two_mics.dnn.models.simple_LSTM_encoder as LSTM_enc
 27 | import spatial_two_mics.dnn.losses.affinity_approximation as \
 28 |     affinity_losses
 29 | import spatial_two_mics.dnn.utils.fast_dataset_v3 as fast_data_gen
 30 | import spatial_two_mics.dnn.utils.data_conversions as converters
 31 | import spatial_two_mics.dnn.utils.experiment_command_line_parser_v2 as \
 32 |     parser
 33 | import spatial_two_mics.dnn.utils.update_history as update_history
 34 | import spatial_two_mics.dnn.utils.model_logger as model_logger
 35 | from progress.bar import ChargingBar
 36 | import spatial_two_mics.dnn.evaluation.naive_evaluation_numpy as \
 37 |     numpy_eval
 38 | from sklearn.cluster import KMeans
 39 | 
 40 | 
 41 | def train(model,
 42 |           training_generator,
 43 |           optimizer,
 44 |           mean_tr,
 45 |           std_tr,
 46 |           epoch,
 47 |           history,
 48 |           n_batches,
 49 |           n_sources,
 50 |           training_labels=''):
 51 |     model.train()
 52 |     bar = ChargingBar("Training for epoch: {}...".format(epoch),
 53 |                       max=n_batches)
 54 |     for batch_data in training_generator:
 55 |         (abs_tfs, masks) = batch_data
 56 |         input_tfs, index_ys = abs_tfs.cuda(), masks.cuda()
 57 |         # the input sequence is determined by time and not freqs
 58 |         # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
 59 |         input_tfs = input_tfs.permute(0, 2, 1).contiguous()
 60 |         index_ys = index_ys.permute(0, 2, 1).contiguous()
 61 | 
 62 |         # normalize with mean and variance from the training dataset
 63 |         input_tfs -= mean_tr
 64 |         input_tfs /= std_tr
 65 | 
 66 |         if training_labels == 'raw_phase_diff':
 67 |             flatened_ys = index_ys.view(index_ys.size(0), -1, 1)
 68 |         else:
 69 |             # index_ys = index_ys.permute(0, 2, 1).contiguous()
 70 |             one_hot_ys = converters.one_hot_3Dmasks(index_ys,
 71 |                                                     n_sources)
 72 |             flatened_ys = one_hot_ys.view(one_hot_ys.size(0),
 73 |                                           -1,
 74 |                                           one_hot_ys.size(-1)).cuda()
 75 | 
 76 |         optimizer.zero_grad()
 77 |         vs = model(input_tfs)
 78 | 
 79 | 
 80 |         loss = affinity_losses.paris_naive(vs, flatened_ys)
 81 | 
 82 |         loss.backward()
 83 |         nn.utils.clip_grad_norm_(model.parameters(), 100.)
 84 |         optimizer.step()
 85 | 
 86 |         update_history.values_update([('loss', loss)],
 87 |                                      history, update_mode='batch')
 88 |         bar.next()
 89 |     bar.finish()
 90 | 
 91 | 
 92 | def eval(model,
 93 |          val_generator,
 94 |          mean_tr,
 95 |          std_tr,
 96 |          epoch,
 97 |          history,
 98 |          n_batches,
 99 |          k_means_obj,
100 |          n_sources,
101 |          batch_size):
102 | 
103 |     model.eval()
104 |     with torch.no_grad():
105 |         bar = ChargingBar("Evaluating for epoch: {}...".format(epoch),
106 |                           max=n_batches*batch_size)
107 |         for batch_data in val_generator:
108 |             abs_tfs, wavs_lists, real_tfs, imag_tfs = batch_data
109 |             input_tfs = abs_tfs.cuda()
110 |             # the input sequence is determined by time and not freqs
111 |             # before: input_tfs = batch_size x (n_fft/2+1) x n_timesteps
112 |             input_tfs = input_tfs.permute(0, 2, 1).contiguous()
113 | 
114 |             # normalize with mean and variance from the training dataset
115 |             input_tfs -= mean_tr
116 |             input_tfs /= std_tr
117 | 
118 |             vs = model(input_tfs)
119 |             for b in np.arange(vs.size(0)):
120 | 
121 |                 embedding_features = vs[b, :, :].data.cpu().numpy()
122 | 
123 |                 embedding_labels = np.array(k_means_obj.fit_predict(
124 |                                             embedding_features))
125 | 
126 |                 sdr, sir, sar = numpy_eval.naive_cpu_bss_eval(
127 |                     embedding_labels,
128 |                     real_tfs[b].data.numpy(),
129 |                     imag_tfs[b].data.numpy(),
130 |                     wavs_lists[b].data.numpy(),
131 |                     n_sources,
132 |                     batch_index=b)
133 | 
134 |                 update_history.values_update([('sdr', sdr),
135 |                                               ('sir', sir),
136 |                                               ('sar', sar)],
137 |                                              history,
138 |                                              update_mode='batch')
139 | 
140 |                 bar.next()
141 |         bar.finish()
142 | 
143 | 
144 | def run_LSTM_experiment(args):
145 |     visible_cuda_ids = ','.join(map(str, args.cuda_available_devices))
146 |     os.environ["CUDA_VISIBLE_DEVICES"] = visible_cuda_ids
147 | 
148 |     (training_generator, mean_tr, std_tr, n_tr_batches, n_tr_sources) =\
149 |     fast_data_gen.get_data_generator(args.train,
150 |                                      partition='train',
151 |                                      num_workers=args.num_workers,
152 |                                      return_stats=True,
153 |                                      get_top=args.n_train,
154 |                                      batch_size=args.batch_size,
155 |                                      return_n_batches=True,
156 |                                      labels_mask=args.training_labels,
157 |                                      return_n_sources=True)
158 | 
159 |     val_generator, n_val_batches, n_val_sources = \
160 |     fast_data_gen.get_data_generator(args.val,
161 |                                      partition='val',
162 |                                      num_workers=args.num_workers,
163 |                                      return_stats=False,
164 |                                      get_top=args.n_val,
165 |                                      batch_size=args.batch_size,
166 |                                      return_n_batches=True,
167 |                                      labels_mask=None,
168 |                                      return_n_sources=True)
169 | 
170 |     model = LSTM_enc.BLSTMEncoder(num_layers=args.n_layers,
171 |                                   hidden_size=args.hidden_size,
172 |                                   embedding_depth=args.embedding_depth,
173 |                                   bidirectional=args.bidirectional,
174 |                                   dropout=args.dropout)
175 |     model = nn.DataParallel(model).cuda()
176 | 
177 |     optimizer = torch.optim.Adam(model.parameters(),
178 |                                  lr=args.learning_rate,
179 |                                  betas=(0.9, 0.999))
180 | 
181 |     assert n_val_sources == n_tr_sources, "Number of sources in both " \
182 |                                           "training and evaluation " \
183 |                                           "should be equal while " \
184 |                                           "training"
185 |     k_means_obj = KMeans(n_clusters=n_tr_sources)
186 |     # just iterate over the data
187 |     history = {}
188 |     for epoch in np.arange(args.epochs):
189 | 
190 |         train(model, training_generator, optimizer, mean_tr,
191 |               std_tr, epoch, history, n_tr_batches, n_tr_sources,
192 |               training_labels=args.training_labels)
193 | 
194 |         update_history.values_update([('loss', None)],
195 |                                      history,
196 |                                      update_mode='epoch')
197 | 
198 | 
199 |         if epoch % args.eval_per == 0:
200 |             eval(model, val_generator, mean_tr, std_tr, epoch,
201 |                  history, n_val_batches, k_means_obj, n_val_sources,
202 |                  args.batch_size)
203 | 
204 |             update_history.values_update([('sdr', None),
205 |                                           ('sir', None),
206 |                                           ('sar', None)],
207 |                                          history,
208 |                                          update_mode='epoch')
209 | 
210 |             # keep track of best performances so far
211 |             epoch_performance_dic = {
212 |                 'sdr': history['sdr'][-1],
213 |                 'sir': history['sir'][-1],
214 |                 'sar': history['sar'][-1]
215 |             }
216 |             update_history.update_best_performance(
217 |                            epoch_performance_dic, epoch, history,
218 |                            buffer_size=args.save_best)
219 | 
220 | 
221 |             # save the model if it is one of the best according to SDR
222 |             if (history['sdr'][-1] >=
223 |                 history['best_performances'][-1][0]['sdr']):
224 |                 dataset_id = os.path.basename(args.train)
225 | 
226 |                 model_logger.save(model,
227 |                                   optimizer,
228 |                                   args,
229 |                                   epoch,
230 |                                   epoch_performance_dic,
231 |                                   dataset_id,
232 |                                   mean_tr,
233 |                                   std_tr,
234 |                                   training_labels=args.training_labels)
235 | 
236 | 
237 |         pprint(history['loss'][-1])
238 |         pprint(history['best_performances'])
239 | 
240 | 
241 | if __name__ == "__main__":
242 |     args = parser.get_args()
243 |     run_LSTM_experiment(args)


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/dataset.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief A dataset creation which is compatible with pytorch framework
  3 | 
  4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  5 | @copyright University of illinois at Urbana Champaign
  6 | """
  7 | 
  8 | import torch
  9 | import argparse
 10 | import os
 11 | import sys
 12 | import glob2
 13 | import numpy as np
 14 | from sklearn.externals import joblib
 15 | import scipy.io.wavfile as wavfile
 16 | from torch.utils.data import Dataset, DataLoader
 17 | from pprint import pprint
 18 | 
 19 | root_dir = os.path.join(
 20 |            os.path.dirname(os.path.realpath(__file__)),
 21 |            '../../../')
 22 | sys.path.insert(0, root_dir)
 23 | import spatial_two_mics.utils.audio_mixture_constructor as \
 24 |     mixture_creator
 25 | import spatial_two_mics.config as config
 26 | import spatial_two_mics.data_generator.dataset_storage as \
 27 |     dataset_storage
 28 | 
 29 | 
 30 | class PytorchMixtureDataset(Dataset):
 31 |     """
 32 |     This is a general compatible class for pytorch datasets.
 33 | 
 34 |     @note Each instance of the dataset should be stored using
 35 |     joblib.dump() and this is the way that it would be returned.
 36 |     After some transformations.
 37 | 
 38 |     The path of all datasets should be defined inside config.
 39 |     All datasets should be formatted with appropriate subfolders of
 40 |     train / test and val and under them there should be all the
 41 |     available files.
 42 |     """
 43 |     def __init__(self,
 44 |                  dataset='timit',
 45 |                  partition='train',
 46 |                  n_samples=[512, 128, 256],
 47 |                  n_sources=2,
 48 |                  genders=['f', 'm'],
 49 |                  n_fft=512,
 50 |                  win_len=512,
 51 |                  hop_length=128,
 52 |                  mixture_duration=2.0,
 53 |                  force_delays=[-1, 1],
 54 |                  get_top=None,
 55 |                  **kwargs):
 56 | 
 57 |         self.dataset_params = {
 58 |             'dataset': dataset,
 59 |             'n_samples': n_samples,
 60 |             'n_sources': n_sources,
 61 |             'genders': genders,
 62 |             'force_delays': force_delays
 63 |         }
 64 |         dataset_name = dataset_storage.create_dataset_name(
 65 |                                        self.dataset_params)
 66 | 
 67 |         self.dataset_stats_path = os.path.join(config.DATASETS_DIR,
 68 |                                                dataset_name,
 69 |                                                partition+'_stats')
 70 | 
 71 |         self.dataset_dirpath = os.path.join(
 72 |                                config.DATASETS_DIR,
 73 |                                dataset_name,
 74 |                                partition)
 75 | 
 76 |         if not os.path.isdir(self.dataset_dirpath):
 77 |             raise IOError("Dataset folder {} not found!".format(
 78 |                           self.dataset_dirpath))
 79 |         else:
 80 |             print("Loading files from {} ...".format(
 81 |                   self.dataset_dirpath))
 82 | 
 83 |         self.data_paths = glob2.glob(os.path.join(self.dataset_dirpath,
 84 |                                                   '*'))
 85 |         if get_top is not None:
 86 |             self.data_paths = self.data_paths[:get_top]
 87 | 
 88 |         self.n_samples = len(self.data_paths)
 89 | 
 90 |         self.mix_creator = mixture_creator.AudioMixtureConstructor(
 91 |                            n_fft=n_fft,
 92 |                            win_len=win_len,
 93 |                            hop_len=hop_length,
 94 |                            mixture_duration=mixture_duration,
 95 |                            force_delays=force_delays)
 96 | 
 97 |     def __len__(self):
 98 |         return self.n_samples
 99 | 
100 |     def __getitem__(self, idx):
101 |         file_path = self.data_paths[idx]
102 |         try:
103 |             mixture_info = joblib.load(file_path)
104 |         except:
105 |             raise IOError("Failed to load data from path: {} "
106 |                           "using joblib.".format(file_path))
107 | 
108 |         tf_info = self.mix_creator.construct_mixture(mixture_info)
109 |         mixture_tf = tf_info['m1_tf']
110 |         abs_tf = abs(mixture_tf)
111 |         real_tf = np.real(mixture_tf)
112 |         imag_tf = np.imag(mixture_tf)
113 | 
114 |         # assert (real_tf + 1j * imag_tf == mixture_tf).all()
115 | 
116 |         duet_mask = None
117 |         ground_truth_mask = None
118 |         try:
119 |             duet_mask = mixture_info['soft_labeled_mask']
120 |         except:
121 |             raise KeyError("Mixture info does not have a soft label "
122 |                            "attribute inferred by duet algorithm")
123 | 
124 |         try:
125 |             ground_truth_mask = mixture_info['ground_truth_mask']
126 |         except:
127 |             raise KeyError("Mixture info does not have a ground truth "
128 |                            "mask inferred by the most dominant source "
129 |                            "in each TF bin.")
130 | 
131 |         sources_raw = np.array(tf_info['sources_raw'])
132 |         amplitudes = np.array(mixture_info['positions']['amplitudes'])
133 |         n_sources = len(sources_raw)
134 | 
135 |         return (abs_tf, real_tf, imag_tf,
136 |                 duet_mask, ground_truth_mask,
137 |                 sources_raw, amplitudes, n_sources)
138 | 
139 |     def extract_stats(self):
140 |         if not os.path.lexists(self.dataset_stats_path):
141 |             mean = 0.
142 |             std = 0.
143 |             for file_path in self.data_paths:
144 |                 try:
145 |                     mix_info = joblib.load(file_path)
146 |                 except:
147 |                     raise IOError("Failed to load data from path: {} "
148 |                                   "using joblib.".format(file_path))
149 | 
150 |                 tf_info = self.mix_creator.construct_mixture(mix_info)
151 |                 mixture_tf = tf_info['m1_tf']
152 |                 abs_tf = abs(mixture_tf)
153 |                 mean += np.mean(np.mean(abs_tf))
154 |                 std += np.std(abs_tf)
155 |             mean /= self.__len__()
156 |             std /= self.__len__()
157 | 
158 |         #     store them for later usage
159 |             joblib.dump((mean, std), self.dataset_stats_path)
160 |             print("Saving dataset mean and variance in: {}".format(
161 |                   self.dataset_stats_path))
162 |             return mean, std
163 | 
164 |         else:
165 |             mean, std = joblib.load(self.dataset_stats_path)
166 | 
167 |         return mean, std
168 | 
169 | 
170 | def get_data_generator(args,
171 |                        return_stats=False,
172 |                        get_top=None):
173 |     data = PytorchMixtureDataset(**args.__dict__,
174 |                                  get_top=get_top)
175 |     generator_params = {'batch_size': args.batch_size,
176 |                         'shuffle': True,
177 |                         'num_workers': args.num_workers,
178 |                         'drop_last': True}
179 |     data_generator = DataLoader(data,
180 |                                 **generator_params,
181 |                                 pin_memory=False)
182 |     n_batches = int(len(data) / args.batch_size)
183 |     if return_stats:
184 |         mean, std = data.extract_stats()
185 |         return data_generator, mean, std, n_batches
186 |     else:
187 |         return data_generator, n_batches
188 | 
189 | 
190 | def concatenate_for_masks(masks, n_sources, batch_size):
191 |     # create 3d masks for each source
192 |     batch_list = []
193 |     for b in torch.arange(batch_size):
194 |         sources_list = []
195 |         for i in torch.arange(n_sources):
196 |             source_mask = masks[b, :, :] == int(i)
197 |             sources_list.append(source_mask)
198 | 
199 |         sources_tensor = torch.stack(sources_list,
200 |                                      dim=n_sources)
201 |         batch_list.append(sources_tensor)
202 |     return torch.stack(batch_list, dim=0)
203 | 
204 | 
205 | def initialize_and_copy_masks(masks, n_sources, batch_size, device):
206 |     new_masks = torch.empty((batch_size,
207 |                              masks.shape[1],
208 |                              masks.shape[2],
209 |                              n_sources),
210 |                             dtype=torch.uint8)
211 |     new_masks.to(device)
212 |     for i in torch.arange(n_sources):
213 |         new_masks[:, :, :, i] = masks[:, :, :] == int(i)
214 | 
215 |     return new_masks
216 | 
217 | 
218 | def example_of_usage(args):
219 |     import time
220 | 
221 |     training_data = PytorchMixtureDataset(**args.__dict__)
222 |     mean, std = training_data.extract_stats()
223 | 
224 |     generator_params = {'batch_size': 128,
225 |                         'shuffle': True,
226 |                         'num_workers': 1,
227 |                         'drop_last': True}
228 |     training_generator = DataLoader(training_data, **generator_params)
229 |     device = torch.device("cuda")
230 | 
231 |     timing_dic = {}
232 | 
233 |     batch_now = time.time()
234 |     # just iterate over the data
235 |     for batch_data in training_generator:
236 |         timing_dic['Loading batch'] = time.time() - batch_now
237 |         batch_now = time.time()
238 | 
239 |         before = time.time()
240 |         (abs_tfs, real_tfs, imag_tfs,
241 |          duet_masks, ground_truth_masks,
242 |          sources_raw, amplitudes, n_sources) = batch_data
243 |         now = time.time()
244 |         timing_dic['Loading from disk'] = now-before
245 | 
246 |         before = time.time()
247 |         input_tf, masks_tf = abs_tfs.to(device), duet_masks.to(device)
248 |         now = time.time()
249 |         timing_dic['Loading to GPU'] = now - before
250 | 
251 | 
252 |         before = time.time()
253 |         duet_stack = concatenate_for_masks(duet_masks,
254 |                                            args.n_sources,
255 |                                            generator_params['batch_size'])
256 |         gt_stack = concatenate_for_masks(ground_truth_masks,
257 |                                          args.n_sources,
258 |                                          generator_params['batch_size'])
259 |         now = time.time()
260 |         timing_dic['Stacking in appropriate dimensions the masks'] = \
261 |             now - before
262 | 
263 |         before = time.time()
264 |         duet_copy = initialize_and_copy_masks(duet_masks,
265 |                                               args.n_sources,
266 |                                               generator_params[
267 |                                                   'batch_size'],
268 |                                               device)
269 | 
270 |         gt_copy = initialize_and_copy_masks(ground_truth_masks,
271 |                                             args.n_sources,
272 |                                             generator_params[
273 |                                               'batch_size'],
274 |                                             device)
275 |         now = time.time()
276 |         timing_dic['Initializing and copying for masks'] = now - before
277 | 
278 |         assert torch.equal(duet_copy, duet_stack)
279 |         assert torch.equal(gt_copy, gt_stack)
280 | 
281 | 
282 |         # torch.cuda.empty_cache()
283 |         pprint(timing_dic)
284 | 
285 | 
286 | def get_args():
287 |     """! Command line parser """
288 |     parser = argparse.ArgumentParser(description='Pytorch Dataset '
289 |                                                  'Loader')
290 |     parser.add_argument("--dataset", type=str,
291 |                         help="Dataset name", default="timit")
292 |     parser.add_argument("--n_sources", type=int,
293 |                         help="How many sources in each mix", default=2)
294 |     parser.add_argument("--n_samples", type=int, nargs='+',
295 |                         help="How many samples do u want to be "
296 |                              "created for train test val",
297 |                         required=True)
298 |     parser.add_argument("--genders", type=str, nargs='+',
299 |                         help="Genders that will correspond to the "
300 |                              "genders in the mixtures",
301 |                         default=['m', 'f'])
302 |     parser.add_argument("-f", "--force_delays", nargs='+', type=int,
303 |                         help="""Whether you want to force integer 
304 |                         delays of +- 1 in the sources e.g.""",
305 |                         default=[-1,1])
306 |     return parser.parse_args()
307 | 
308 | 
309 | if __name__ == "__main__":
310 |     args = get_args()
311 |     example_of_usage(args)
312 | 
313 | 
314 | 


--------------------------------------------------------------------------------
/spatial_two_mics/dnn/utils/fast_dataset_v2.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief A dataset creation which is compatible with pytorch framework
  3 | and much faster in loading time depending on the new version of
  4 | loading only the appropriate files that might be needed
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of illinois at Urbana Champaign
  8 | """
  9 | 
 10 | import torch
 11 | import argparse
 12 | import os
 13 | import sys
 14 | import glob2
 15 | import numpy as np
 16 | from sklearn.externals import joblib
 17 | import scipy.io.wavfile as wavfile
 18 | from torch.utils.data import Dataset, DataLoader
 19 | from pprint import pprint
 20 | 
 21 | root_dir = os.path.join(
 22 |            os.path.dirname(os.path.realpath(__file__)),
 23 |            '../../../')
 24 | sys.path.insert(0, root_dir)
 25 | import spatial_two_mics.utils.audio_mixture_constructor as \
 26 |     mixture_creator
 27 | import spatial_two_mics.config as config
 28 | import spatial_two_mics.data_generator.dataset_storage as \
 29 |     dataset_storage
 30 | 
 31 | 
 32 | class PytorchMixtureDataset(Dataset):
 33 |     """
 34 |     This is a general compatible class for pytorch datasets.
 35 | 
 36 |     @note Each instance of the dataset should be stored using
 37 |     joblib.dump() and this is the way that it would be returned.
 38 |     After some transformations.
 39 | 
 40 |     The path of all datasets should be defined inside config.
 41 |     All datasets should be formatted with appropriate subfolders of
 42 |     train / test and val and under them there should be all the
 43 |     available files.
 44 |     """
 45 |     def __init__(self,
 46 |                  dataset='timit',
 47 |                  partition='train',
 48 |                  n_samples=[512, 128, 256],
 49 |                  n_sources=2,
 50 |                  genders=['f', 'm'],
 51 |                  n_fft=512,
 52 |                  win_len=512,
 53 |                  hop_length=128,
 54 |                  mixture_duration=2.0,
 55 |                  force_delays=[-1, 1],
 56 |                  get_top=None,
 57 |                  labels_mask='duet',
 58 |                  **kwargs):
 59 | 
 60 |         self.dataset_params = {
 61 |             'dataset': dataset,
 62 |             'n_samples': n_samples,
 63 |             'n_sources': n_sources,
 64 |             'genders': genders,
 65 |             'force_delays': force_delays
 66 |         }
 67 | 
 68 |         if labels_mask == 'duet' or labels_mask == 'ground_truth':
 69 |             self.selected_mask = labels_mask
 70 |         else:
 71 |             raise NotImplementedError("There is no available mask "
 72 |                                       "called: {}".format(labels_mask))
 73 |         self.partition = partition
 74 | 
 75 |         dataset_name = dataset_storage.create_dataset_name(
 76 |             self.dataset_params)
 77 | 
 78 |         self.dataset_dirpath = os.path.join(
 79 |             config.DATASETS_DIR,
 80 |             dataset_name,
 81 |             partition)
 82 | 
 83 |         self.dataset_stats_path = self.dataset_dirpath + '_stats'
 84 | 
 85 |         if not os.path.isdir(self.dataset_dirpath):
 86 |             raise IOError("Dataset folder {} not found!".format(
 87 |                 self.dataset_dirpath))
 88 |         else:
 89 |             print("Loading files from {} ...".format(
 90 |                 self.dataset_dirpath))
 91 | 
 92 |         self.mixture_folders = glob2.glob(os.path.join(
 93 |             self.dataset_dirpath, '*'))
 94 |         if get_top is not None:
 95 |             self.mixture_folders = self.mixture_folders[:get_top]
 96 | 
 97 |         self.n_samples = len(self.mixture_folders)
 98 | 
 99 |         # preprocess -- store all absolute spectra values for faster
100 |         # loading during run time
101 |         self.store_directly_abs_spectra()
102 | 
103 |     def __len__(self):
104 |         return self.n_samples
105 | 
106 |     def __getitem__(self, idx):
107 |         """!
108 |         Depending on the selected partition it returns accordingly
109 |         the following objects:
110 | 
111 |         if self.partition == 'train':
112 |             (abs_tfs, selected_mask)
113 |         else if partition == 'test' or 'val'
114 |             (abs_tfs, selected_mask, wavs_list, real_tfs, imag_tfs)"""
115 |         mix_folder = self.mixture_folders[idx]
116 |         try:
117 |             abs_tfs = joblib.load(os.path.join(mix_folder, 'abs_tfs'))
118 |         except:
119 |             raise IOError("Failed to load data from path: {} "
120 |                           "for absolute spectra.".format(mix_folder))
121 | 
122 | 
123 |         try:
124 |             if self.selected_mask == 'duet':
125 |                 mask = joblib.load(os.path.join(mix_folder,
126 |                                                 'soft_labeled_mask'))
127 |             else:
128 |                 mask = joblib.load(os.path.join(mix_folder,
129 |                                                 'ground_truth_mask'))
130 |         except:
131 |             raise IOError("Failed to load data from path: {} "
132 |                           "for tf label masks".format(mix_folder))
133 | 
134 |         if self.partition == 'train':
135 |             return abs_tfs, mask
136 | 
137 |         try:
138 |             real_p = os.path.join(mix_folder, 'real_tfs')
139 |             imag_p = os.path.join(mix_folder, 'imag_tfs')
140 |             wavs_p= os.path.join(mix_folder, 'wavs')
141 |             real_tfs = joblib.load(real_p)
142 |             imag_tfs = joblib.load(imag_p)
143 |             wavs_list = joblib.load(wavs_p)
144 |             wavs_list = np.array(wavs_list)
145 |         except:
146 |             raise IOError("Failed to load data from path: {} "
147 |                           "for real, imag tf of the mixture and "
148 |                           "wavs".format(mix_folder))
149 | 
150 |         return abs_tfs, mask, wavs_list, real_tfs, imag_tfs
151 | 
152 |     def store_directly_abs_spectra(self):
153 |         for mix_folder in self.mixture_folders:
154 |             abs_p = os.path.join(mix_folder, 'abs_tfs')
155 |             if os.path.lexists(abs_p):
156 |                 continue
157 | 
158 |             try:
159 |                 real_p = os.path.join(mix_folder, 'real_tfs')
160 |                 imag_p = os.path.join(mix_folder, 'imag_tfs')
161 |                 real_tfs = joblib.load(real_p)
162 |                 imag_tfs = joblib.load(imag_p)
163 |             except:
164 |                 raise IOError("Failed to load data from path: {} "
165 |                               "using joblib.".format(mix_folder))
166 |             abs_tfs = np.abs(real_tfs + 1j * imag_tfs)
167 |             try:
168 |                 joblib.dump(abs_tfs, abs_p, compress=0)
169 |             except:
170 |                 raise IOError("Failed to save absolute value of "
171 |                               "spectra in path: {}".format(abs_p))
172 | 
173 |     def extract_stats(self):
174 |         if not os.path.lexists(self.dataset_stats_path):
175 |             mean = 0.
176 |             std = 0.
177 |             for mix_folder in self.mixture_folders:
178 |                 try:
179 |                     abs_p = os.path.join(mix_folder, 'abs_tfs')
180 |                     abs_tfs = joblib.load(abs_p)
181 |                 except:
182 |                     raise IOError("Failed to load absolute tf "
183 |                                   "representation from path: {} "
184 |                                   "using joblib.".format(abs_p))
185 | 
186 |                 mean += np.mean(np.mean(abs_tfs))
187 |                 std += np.std(abs_tfs)
188 |             mean /= self.__len__()
189 |             std /= self.__len__()
190 | 
191 |             #     store them for later usage
192 |             joblib.dump((mean, std), self.dataset_stats_path)
193 |             print("Saving dataset mean and variance in: {}".format(
194 |                 self.dataset_stats_path))
195 |         else:
196 |             mean, std = joblib.load(self.dataset_stats_path)
197 | 
198 |         return mean, std
199 | 
200 | 
201 | def get_data_generator(args,
202 |                        return_stats=False,
203 |                        get_top=None):
204 |     data = PytorchMixtureDataset(**args.__dict__,
205 |                                  get_top=get_top)
206 |     generator_params = {'batch_size': args.batch_size,
207 |                         'shuffle': True,
208 |                         'num_workers': args.num_workers,
209 |                         'drop_last': True}
210 |     data_generator = DataLoader(data,
211 |                                 **generator_params,
212 |                                 pin_memory=False)
213 |     n_batches = int(len(data) / args.batch_size)
214 |     if return_stats:
215 |         mean, std = data.extract_stats()
216 |         return data_generator, mean, std, n_batches
217 |     else:
218 |         return data_generator, n_batches
219 | 
220 | 
221 | def concatenate_for_masks(masks, n_sources, batch_size):
222 |     # create 3d masks for each source
223 |     batch_list = []
224 |     for b in torch.arange(batch_size):
225 |         sources_list = []
226 |         for i in torch.arange(n_sources):
227 |             source_mask = masks[b, :, :] == int(i)
228 |             sources_list.append(source_mask)
229 | 
230 |         sources_tensor = torch.stack(sources_list,
231 |                                      dim=n_sources)
232 |         batch_list.append(sources_tensor)
233 |     return torch.stack(batch_list, dim=0)
234 | 
235 | 
236 | def initialize_and_copy_masks(masks, n_sources, batch_size, device):
237 |     new_masks = torch.empty((batch_size,
238 |                              masks.shape[1],
239 |                              masks.shape[2],
240 |                              n_sources),
241 |                             dtype=torch.uint8)
242 |     new_masks.to(device)
243 |     for i in torch.arange(n_sources):
244 |         new_masks[:, :, :, i] = masks[:, :, :] == int(i)
245 | 
246 |     return new_masks
247 | 
248 | 
249 | def example_of_usage(args):
250 |     import time
251 | 
252 |     training_data = PytorchMixtureDataset(**args.__dict__)
253 |     mean, std = training_data.extract_stats()
254 |     generator_params = {'batch_size': 128,
255 |                         'shuffle': True,
256 |                         'num_workers': 1,
257 |                         'drop_last': True}
258 |     training_generator = DataLoader(training_data, **generator_params)
259 |     device = torch.device("cuda")
260 | 
261 |     timing_dic = {}
262 |     n_sources = 2
263 | 
264 |     batch_now = time.time()
265 |     # just iterate over the data
266 |     for batch_data in training_generator:
267 |         timing_dic['Loading batch'] = time.time() - batch_now
268 |         batch_now = time.time()
269 | 
270 |         before = time.time()
271 |         (abs_tfs, masks) = batch_data
272 |         now = time.time()
273 |         timing_dic['Loading from disk'] = now-before
274 | 
275 |         before = time.time()
276 |         input_tf, masks_tf = abs_tfs.to(device), masks.to(device)
277 |         now = time.time()
278 |         timing_dic['Loading to GPU'] = now - before
279 | 
280 | 
281 |         before = time.time()
282 |         duet_stack = concatenate_for_masks(masks,
283 |                                            n_sources,
284 |                                            generator_params['batch_size'])
285 |         now = time.time()
286 |         timing_dic['Stacking in appropriate dimensions the masks'] = \
287 |             now - before
288 | 
289 |         before = time.time()
290 |         duet_copy = initialize_and_copy_masks(masks,
291 |                                               n_sources,
292 |                                               generator_params[
293 |                                                   'batch_size'],
294 |                                               device)
295 |         now = time.time()
296 |         timing_dic['Initializing and copying for masks'] = now - before
297 | 
298 |         pprint(timing_dic)
299 |         batch_now = time.time()
300 | 
301 | 
302 | def get_args():
303 |     """! Command line parser """
304 |     parser = argparse.ArgumentParser(description='Pytorch Fast Dataset '
305 |                                                  'Loader')
306 |     parser.add_argument("--dataset", type=str,
307 |                         help="Dataset name", default="timit")
308 |     parser.add_argument("--n_sources", type=int,
309 |                         help="How many sources in each mix", default=2)
310 |     parser.add_argument("--n_samples", type=int, nargs='+',
311 |                         help="How many samples do u want to be "
312 |                              "created for train test val",
313 |                         required=True)
314 |     parser.add_argument("--genders", type=str, nargs='+',
315 |                         help="Genders that will correspond to the "
316 |                              "genders in the mixtures",
317 |                         default=['m', 'f'])
318 |     parser.add_argument("-f", "--force_delays", nargs='+', type=int,
319 |                         help="""Whether you want to force integer 
320 |                         delays of +- 1 in the sources e.g.""",
321 |                         default=[-1,1])
322 |     return parser.parse_args()
323 | 
324 | 
325 | if __name__ == "__main__":
326 |     args = get_args()
327 |     example_of_usage(args)
328 | 
329 | 
330 | 


--------------------------------------------------------------------------------
/spatial_two_mics/utils/audio_mixture_constructor.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief This utility serves as a level of abstraction in order to
  3 | construct audio mixtures
  4 | 
  5 | 
  6 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  7 | @copyright University of Illinois at Urbana Champaign
  8 | """
  9 | 
 10 | from librosa.core import stft
 11 | from pprint import pprint
 12 | import numpy as np
 13 | import scipy.io.wavfile as wavfile
 14 | 
 15 | 
 16 | class AudioMixtureConstructor(object):
 17 |     def __init__(self,
 18 |                  n_fft=1024,
 19 |                  win_len=None,
 20 |                  hop_len=None,
 21 |                  force_delays=None,
 22 |                  normalize_audio_by_std=True,
 23 |                  mixture_duration=2.0,
 24 |                  precision=0.01,
 25 |                  freqs_included=5):
 26 |         """
 27 |         :param fs: sampling rate
 28 |         :param n_fft: FFT window size
 29 |         :param win_len: The window will be of length win_length and
 30 |         then padded with zeros to match n_fft.
 31 |         If unspecified, defaults to win_length = n_fft.
 32 |         :param hop_len: number audio of frames between STFT columns.
 33 |         If unspecified, defaults win_length / 4.
 34 |         :param force_delays: list of delays to be forced in the
 35 |         source signals -1 or 1 integer delay for the microphones
 36 |         mixtures, if is [0, 0] then no delay would be forced
 37 |         :param normalize_audio_by_std: if the loaded wavs would be
 38 |         normalized by their std values
 39 |         :param mixture_duration: the duration on which the mixture
 40 |         would be created (in seconds)
 41 |         :param precision: The precision as a floating number e.g. 0.01
 42 |         if you are using floating point delays between your source
 43 |         signals for each mixture
 44 |         :param freqs_included: How many frequencies should be
 45 |         included in the sinc function before convolving it with the
 46 |         true signal in order to upsample it (1/precision) times more
 47 |         and shift it in order to get the truly delayed signal.
 48 |         """
 49 |         self.mixture_duration = mixture_duration
 50 |         self.n_fft = n_fft
 51 |         self.win_len = win_len
 52 |         self.hop_len = hop_len
 53 |         self.normalize_audio_by_std = normalize_audio_by_std
 54 |         self.force_delays = force_delays
 55 |         self.precision = precision
 56 |         self.freqs_included = freqs_included
 57 | 
 58 |         xs = np.linspace(-self.freqs_included,
 59 |                          self.freqs_included,
 60 |                          2. * self.freqs_included / self.precision)
 61 |         self.windowed_sinc = np.sinc(xs)
 62 | 
 63 |     @staticmethod
 64 |     def load_wav(source_info):
 65 |         return wavfile.read(source_info['wav_path'])
 66 | 
 67 |     def get_stft(self,
 68 |                  signal):
 69 | 
 70 |         return stft(signal,
 71 |                     n_fft=self.n_fft,
 72 |                     win_length=self.win_len,
 73 |                     hop_length=self.hop_len)
 74 | 
 75 |     def force_delay_on_signal(self,
 76 |                               signal,
 77 |                               delay):
 78 |         if delay >= 0:
 79 |             return signal[delay:]
 80 |         else:
 81 |             return signal[:delay]
 82 | 
 83 |     def enforce_float_delays(self,
 84 |                              source_signals,
 85 |                              delays_for_sources,
 86 |                              fs):
 87 |         """!
 88 |         For 2 microphone enforce a floating point number delay with some
 89 |         selected precision and apply that for all sources that would
 90 |         be given. Also make sure that the required to be returned
 91 |         wavs have to have a length equal to the duration"""
 92 |         upsampling_rate = int(1. / self.precision)
 93 |         duration_in_samples = int(self.mixture_duration * fs) - 1
 94 |         decimals = int(np.log10(upsampling_rate))
 95 |         n_augmentation_zeros = upsampling_rate - 1
 96 | 
 97 |         rounded_taus = np.around(delays_for_sources, decimals=decimals)
 98 |         taus_samples = upsampling_rate * rounded_taus
 99 |         taus_samples = taus_samples.astype(int)
100 | 
101 |         mic_signals = {'m1': [], 'm2': []}
102 |         for src_id, source_sig in enumerate(source_signals):
103 |             sig_len = source_sig.shape[0]
104 |             augmented_signal = np.zeros(
105 |                 sig_len + (sig_len - 1) * n_augmentation_zeros)
106 |             augmented_signal[::upsampling_rate] = source_sig
107 |             est_augmented_sig = np.convolve(augmented_signal,
108 |                                             self.windowed_sinc,
109 |                                             mode='valid')
110 | 
111 |             tau_in_samples = taus_samples[src_id]
112 |             if tau_in_samples > 0:
113 |                 source_in_mic1 = est_augmented_sig[
114 |                                  tau_in_samples:][::upsampling_rate]
115 |                 source_in_mic2 = est_augmented_sig[
116 |                                  :-tau_in_samples][::upsampling_rate]
117 |             elif tau_in_samples < 0:
118 |                 source_in_mic1 = est_augmented_sig[
119 |                                  :tau_in_samples][::upsampling_rate]
120 |                 source_in_mic2 = est_augmented_sig[
121 |                                  -tau_in_samples:][::upsampling_rate]
122 |             else:
123 |                 source_in_mic1 = est_augmented_sig[::upsampling_rate]
124 |                 source_in_mic2 = est_augmented_sig[::upsampling_rate]
125 | 
126 |             # check the duration which is very important
127 |             if (len(source_in_mic1) < duration_in_samples or
128 |                     len(source_in_mic2) < duration_in_samples):
129 |                 raise ValueError("Duration given: {} could "
130 |                                  "not be sufficed before the gven source"
131 |                                  " signal has a lesser duration of {} "
132 |                                  "after the float delay.".format(
133 |                     duration_in_samples, len(source_in_mic1)))
134 | 
135 |             mic_signals['m1'].append(
136 |                         source_in_mic1[:duration_in_samples])
137 |             mic_signals['m2'].append(
138 |                         source_in_mic2[:duration_in_samples])
139 | 
140 |         return mic_signals
141 | 
142 |     def construct_mic_signals(self,
143 |                               source_signals,
144 |                               delays_for_sources):
145 |         """!
146 |         This function might extend to any real delay by interpolation
147 |         of the source signals or just forcing a delay over the sources.
148 |         After that it returns a dictionary containing a list of signals
149 |         for each microphone. also cropped to the duration specified.
150 | 
151 |         :return mic_signals ={ 'm1': [s1, s2, ..., sn], 'm2': same }
152 |         """
153 | 
154 |         fs = source_signals[0][1]
155 |         assert all([sr == fs for (s, sr) in source_signals]), 'When ' \
156 |                'trying to enforce the delays over the source signals ' \
157 |                'the fs should be the same for all sources!'
158 | 
159 |         if self.force_delays is None:
160 |             mic_signals = self.enforce_float_delays(
161 |                            [s for (s, sr) in source_signals],
162 |                            delays_for_sources,
163 |                            fs)
164 | 
165 |         else:
166 |         # naive way in order to force a delay for DUET algorithm
167 |             m1_delays = self.force_delays
168 |             m2_delays = self.force_delays[::-1]
169 | 
170 |             cropped_signals = [s[:int(self.mixture_duration * fs)]
171 |                                for (s, sr) in source_signals]
172 | 
173 |             mic_signals = {
174 |                 'm1': [self.force_delay_on_signal(s, m1_delays[i])
175 |                        for (i, s) in enumerate(cropped_signals)],
176 |                 'm2': [self.force_delay_on_signal(s, m2_delays[i])
177 |                        for (i, s) in enumerate(cropped_signals)]
178 |             }
179 | 
180 |         return mic_signals
181 | 
182 |     def get_tf_representations(self,
183 |                                mixture_info):
184 |         """!
185 |         This function constructs the mixture for each mic (m1,
186 |         m2) in the following way:
187 |         m1(t) = a1*s1(t) + ... + an*sn(t)
188 |         m2(t) = a1*s1(t+d1) + ... + an*sn(t+dn)
189 | 
190 |         by also cutting them off to self.min_samples
191 | 
192 |         :return
193 |         mixture_info = {
194 |             'm1_raw': numpy array containing the raw m1 signal,
195 |             'm2_raw': numpy array containing the raw m2 signal,
196 |             'm1_tf': numpy array containing the m1 TF representation,
197 |             'm2_tf': numpy array containing the m2 TF representation,
198 |             'sources_raw': a list of numpy 1d vectors containing the
199 |             sources ,
200 |             'sources_tf': a list of numpy 2d vectors containing the
201 |              TF represeantations of the sources ,
202 |             'amplitudes': the weights that each source contributes to
203 |             the mixture of the second microphone
204 |         }
205 |         """
206 |         positions = mixture_info['positions']
207 |         source_signals = [(s['wav'], s['fs'])
208 |                           for s in mixture_info['sources_ids']]
209 |         n_sources = len(source_signals)
210 | 
211 |         mic_signals = self.construct_mic_signals(source_signals,
212 |                                                  positions['taus'])
213 | 
214 |         m1 = sum([positions['amplitudes'][i] * mic_signals['m1'][i]
215 |                   for i in np.arange(n_sources)])
216 | 
217 |         m2 = sum([positions['amplitudes'][i] * mic_signals['m2'][i]
218 |                   for i in np.arange(n_sources)])
219 | 
220 |         sources_spectra = [self.get_stft(s) for s in mic_signals['m1']]
221 | 
222 |         m1_tf = self.get_stft(m1)
223 |         m2_tf = self.get_stft(m2)
224 | 
225 |         mixture_info = {
226 |             'm1_raw': m1,
227 |             'm2_raw': m2,
228 |             'm1_tf': m1_tf,
229 |             'm2_tf': m2_tf,
230 |             'sources_raw': mic_signals['m1'],
231 |             'sources_tf': sources_spectra,
232 |             'amplitudes': positions['amplitudes']
233 |         }
234 | 
235 |         return mixture_info
236 | 
237 |     def construct_mixture(self,
238 |                           mixture_info):
239 |         """! The whole processing for getting the mixture signals for
240 |         the two mics and the positions is done here.
241 | 
242 |         :param mixture_info
243 |         {
244 |             'positions': example
245 |             'sources_ids':
246 |             [       {
247 |                         'gender': combination_info.gender
248 |                         'sentence_id': combination_info.sentence_id
249 |                         'speaker_id': combination_info.speaker_id
250 |                         'wav_path': the wav_path for the file
251 |                     } ... ]
252 |         }
253 | 
254 | 
255 |         :return tf_representations = {
256 |             'm1_raw': numpy array containing the raw m1 signal,
257 |             'm2_raw': numpy array containing the raw m2 signal,
258 |             'm1_tf': numpy array containing the m1 TF representation,
259 |             'm2_tf': numpy array containing the m2 TF representation,
260 |             'sources_raw': a list of numpy 1d vectors containing the
261 |             sources ,
262 |             'sources_tf': a list of numpy 2d vectors containing the
263 |              TF represeantations of the sources ,
264 |             'amplitudes': the weights that each source contributes to
265 |             the mixture of the second microphone
266 |         }
267 |         """
268 | 
269 |         for i, source_info in enumerate(mixture_info['sources_ids']):
270 |             fs, wav = self.load_wav(source_info)
271 |             if self.normalize_audio_by_std:
272 |                 wav = wav / np.std(wav)
273 |             mixture_info['sources_ids'][i]['fs'] = int(fs)
274 |             mixture_info['sources_ids'][i]['wav'] = wav
275 | 
276 |         tf_representations = self.get_tf_representations(mixture_info)
277 | 
278 |         return tf_representations
279 | 
280 | 
281 | def example_of_usage():
282 |     """!
283 |     How the class of Audio mixtures should be called"""
284 | 
285 |     import os
286 |     import sys
287 |     root_dir = os.path.join(
288 |         os.path.dirname(os.path.realpath(__file__)),
289 |         '../../')
290 |     sys.path.insert(0, root_dir)
291 |     import spatial_two_mics.examples.mixture_example as me
292 | 
293 |     mixture_creator = AudioMixtureConstructor(n_fft=1024,
294 |                                               win_len=1024,
295 |                                               hop_len=512,
296 |                                               mixture_duration=2.0,
297 |                                               force_delays=[-1, 1])
298 | 
299 |     mixture_info = me.mixture_info_example()
300 | 
301 |     import spatial_two_mics.data_generator.source_position_generator \
302 |         as  position_generator
303 | 
304 |     # add some randomness in the generation of the positions
305 |     random_positioner = position_generator.RandomCirclePositioner()
306 |     positions_info = random_positioner.get_sources_locations(2)
307 |     mixture_info['positions'] = positions_info
308 | 
309 |     tf_mixtures = mixture_creator.construct_mixture(mixture_info)
310 | 
311 |     pprint(tf_mixtures)
312 | 
313 | if __name__ == "__main__":
314 |     example_of_usage()


--------------------------------------------------------------------------------
/spatial_two_mics/data_loaders/wham.py:
--------------------------------------------------------------------------------
  1 | """!
  2 | @brief Pytorch dataloader for wham dataset for multiple gender combinations.
  3 | 
  4 | @author Efthymios Tzinis {etzinis2@illinois.edu}
  5 | @copyright University of illinois at Urbana Champaign
  6 | """
  7 | 
  8 | import torch
  9 | import os
 10 | import numpy as np
 11 | import pickle
 12 | import glob2
 13 | import sys
 14 | 
 15 | current_dir = os.path.dirname(os.path.abspath('__file__'))
 16 | root_dir = os.path.abspath(os.path.join(current_dir, '../../'))
 17 | sys.path.append(root_dir)
 18 | import approx_ensembles.separation.dataset_loader.abstract_dataset as \
 19 |     abstract_dataset
 20 | from scipy.io import wavfile
 21 | import warnings
 22 | from tqdm import tqdm
 23 | from time import time
 24 | 
 25 | EPS = 1e-8
 26 | enh_single = {'mixture': 'mix_single',
 27 |               'sources': ['s1', 'noise'],
 28 |               'n_sources': 1}
 29 | enh_single_white_noise = {
 30 |               'mixture': 'source_with_white_noise',
 31 |               'sources': ['s1', 'white_noise'],
 32 |               'n_sources': 1}
 33 | enh_both = {'mixture': 'mix_both',
 34 |             'sources': ['mix_clean', 'noise'],
 35 |             'n_sources': 1}
 36 | sep_clean = {'mixture': 'mix_clean',
 37 |              'sources': ['s1', 's2'],
 38 |              'n_sources': 2}
 39 | sep_noisy = {'mixture': 'mix_both',
 40 |              'sources': ['s1', 's2', 'noise'],
 41 |              'n_sources': 2}
 42 | 
 43 | VALID_GENDER_COMBS = set(['ff', 'mm', 'fm', 'mf'])
 44 | 
 45 | WHAM_TASKS = {'enhance_single_white_noise': enh_single_white_noise,
 46 |               'enhance_single': enh_single,
 47 |               'enhance_both': enh_both,
 48 |               'sep_clean': sep_clean,
 49 |               'sep_noisy': sep_noisy}
 50 | WHAM_TASKS['enh_single'] = WHAM_TASKS['enhance_single']
 51 | WHAM_TASKS['enh_both'] = WHAM_TASKS['enhance_both']
 52 | 
 53 | 
 54 | def normalize_tensor_wav(wav_tensor, eps=1e-8, std=None):
 55 |     mean = wav_tensor.mean(-1, keepdim=True)
 56 |     if std is None:
 57 |         std = wav_tensor.std(-1, keepdim=True)
 58 |     return (wav_tensor - mean) / (std + eps)
 59 | 
 60 | 
 61 | class Dataset(torch.utils.data.Dataset, abstract_dataset.Dataset):
 62 |     """ Dataset class for WHAM source separation and speech enhancement tasks.
 63 | 
 64 |     Example of kwargs:
 65 |         root_dirpath='/mnt/data/wham', task='enh_single',
 66 |         split='tr', sample_rate=8000, timelength=4.0,
 67 |         normalize_audio=False, n_samples=0, zero_pad=False
 68 |     """
 69 |     def __init__(self, **kwargs):
 70 |         super(Dataset, self).__init__()
 71 |         warnings.filterwarnings("ignore")
 72 |         self.kwargs = kwargs
 73 | 
 74 |         self.task = self.get_arg_and_check_validness(
 75 |             'task', known_type=str, choices=WHAM_TASKS.keys())
 76 | 
 77 |         self.zero_pad = self.get_arg_and_check_validness(
 78 |             'zero_pad', known_type=bool)
 79 | 
 80 |         self.augment = self.get_arg_and_check_validness(
 81 |             'augment', known_type=bool)
 82 | 
 83 |         # Gender combination priors for combinations
 84 |         # ff, mm, fm/mf
 85 |         self.gender_combination_priors = self.get_arg_and_check_validness(
 86 |             'gender_combination_priors', known_type=float,
 87 |             dict_check={'ff': float, 'mm': float, 'fm': float, 'mf':float},
 88 |             extra_lambda_checks=[lambda x: [0 <= y <= 1 for y in x.values()]])
 89 | 
 90 |         self.normalize_audio = self.get_arg_and_check_validness(
 91 |             'normalize_audio', known_type=bool)
 92 | 
 93 |         self.min_or_max = self.get_arg_and_check_validness(
 94 |             'min_or_max', known_type=str, choices=['min', 'max'])
 95 | 
 96 |         self.split = self.get_arg_and_check_validness(
 97 |             'split', known_type=str, choices=['cv', 'tr', 'tt'])
 98 | 
 99 |         self.n_samples = self.get_arg_and_check_validness(
100 |             'n_samples', known_type=int, extra_lambda_checks=[lambda x: x >= 0])
101 | 
102 |         self.sample_rate = self.get_arg_and_check_validness('sample_rate',
103 |                                                             known_type=int)
104 |         self.root_path = self.get_arg_and_check_validness(
105 |             'root_dirpath', known_type=str,
106 |             extra_lambda_checks=[lambda y: os.path.lexists(y)])
107 |         self.dataset_dirpath = self.get_path()
108 | 
109 |         self.mixtures_info_metadata_path = os.path.join(
110 |             self.dataset_dirpath, 'metadata_v2')
111 | 
112 |         self.timelength = self.get_arg_and_check_validness(
113 |             'timelength', known_type=float)
114 | 
115 |         self.time_samples = int(self.sample_rate * self.timelength)
116 | 
117 |         # Create the indexing for the dataset
118 |         mix_folder_path = os.path.join(self.dataset_dirpath,
119 |                                        WHAM_TASKS[self.task]['mixture'])
120 |         self.file_names = []
121 |         self.available_mixtures = glob2.glob(mix_folder_path + '/*.wav')
122 | 
123 |         self.mixtures_info = []
124 |         print('Parsing Dataset found at: {}...'.format(self.dataset_dirpath))
125 |         if not os.path.lexists(self.mixtures_info_metadata_path):
126 |             # Parse gender information.
127 |             gender_info_path = os.path.join(
128 |                 os.path.dirname(os.path.abspath('__file__')),
129 |                 'wham_speaker_info.txt')
130 |             gender_dic = {}
131 |             if os.path.lexists(gender_info_path):
132 |                 with open(gender_info_path, 'rb') as filehandle:
133 |                     gender_dic = dict([tuple([x.decode() for x in l.split()])
134 |                                        for l in filehandle.readlines()])
135 | 
136 |             for file_path in tqdm(self.available_mixtures):
137 |                 sample_rate, waveform = wavfile.read(file_path)
138 |                 assert sample_rate == self.sample_rate
139 |                 numpy_wav = np.array(waveform)
140 | 
141 |                 speaker_info = os.path.basename(file_path).split('.wav')[0]
142 |                 speaker_info = [x[:3] for x in speaker_info.split('_')[::2]]
143 | 
144 |                 this_gender_comb = ''
145 |                 for speaker in speaker_info:
146 |                     if speaker not in gender_dic:
147 |                         raise ValueError('Speaker with id: {} not '
148 |                                          'found!'.format(speaker))
149 |                     else:
150 |                         this_gender_comb += gender_dic[speaker].lower()
151 | 
152 |                 self.mixtures_info.append([os.path.basename(file_path),
153 |                                            numpy_wav.shape[0],
154 |                                            this_gender_comb])
155 | 
156 |             print('Dumping metadata in: {}'.format(
157 |                 self.mixtures_info_metadata_path))
158 |             with open(self.mixtures_info_metadata_path, 'wb') as filehandle:
159 |                 pickle.dump(self.mixtures_info, filehandle)
160 | 
161 |         if os.path.lexists(self.mixtures_info_metadata_path):
162 |             with open(self.mixtures_info_metadata_path, 'rb') as filehandle:
163 |                 self.mixtures_info = pickle.load(filehandle)
164 |                 print('Loaded metadata from: {}'.format(
165 |                     self.mixtures_info_metadata_path))
166 | 
167 |         self.file_names_g_comb = dict([(g, []) for g in VALID_GENDER_COMBS])
168 |         for path, n_samples, gender_comb in self.mixtures_info:
169 |             if (n_samples >= self.time_samples or self.zero_pad):
170 |                 self.file_names_g_comb[gender_comb].append((path, n_samples))
171 | 
172 |         self.file_names = []
173 |         # Apply the priors
174 |         for gender_comb in self.file_names_g_comb:
175 |             percentage = self.gender_combination_priors[gender_comb]
176 |             length = len(self.file_names_g_comb[gender_comb])
177 |             n_requested = int(length * percentage)
178 |             self.file_names += self.file_names_g_comb[gender_comb][:n_requested]
179 |         if self.n_samples > 0:
180 |             self.file_names = self.file_names[:self.n_samples]
181 | 
182 |         max_time_samples = max([n_s for (_, n_s) in self.file_names])
183 |         self.file_names = [x for (x, _) in self.file_names]
184 | 
185 |         # for the case that we need the whole audio input
186 |         if self.time_samples <= 0.:
187 |             self.time_samples = max_time_samples
188 | 
189 |     def get_path(self):
190 |         path = os.path.join(self.root_path,
191 |                             'wav{}k'.format(int(self.sample_rate / 1000)),
192 |                             self.min_or_max, self.split)
193 |         if os.path.lexists(path):
194 |             return path
195 |         else:
196 |             raise IOError('Dataset path: {} not found!'.format(path))
197 | 
198 |     def safe_pad(self, tensor_wav):
199 |         if self.zero_pad and tensor_wav.shape[0] < self.time_samples:
200 |             appropriate_shape = tensor_wav.shape
201 |             padded_wav = torch.zeros(
202 |                 list(appropriate_shape[:-1]) + [self.time_samples],
203 |                 dtype=torch.float32)
204 |             padded_wav[:tensor_wav.shape[0]] = tensor_wav
205 |             return padded_wav[:self.time_samples]
206 |         else:
207 |             return tensor_wav[:self.time_samples]
208 | 
209 |     def __len__(self):
210 |         return len(self.file_names)
211 | 
212 |     def __getitem__(self, idx):
213 |         if self.augment:
214 |             the_time = int(np.modf(time())[0] * 100000000)
215 |             np.random.seed(the_time)
216 | 
217 |         filename = self.file_names[idx]
218 | 
219 |         mixture_path = os.path.join(self.dataset_dirpath,
220 |                                     WHAM_TASKS[self.task]['mixture'],
221 |                                     filename)
222 |         _, waveform = wavfile.read(mixture_path)
223 |         max_len = len(waveform)
224 |         rand_start = 0
225 |         if self.augment and max_len > self.time_samples:
226 |             rand_start = np.random.randint(0, max_len - self.time_samples)
227 |             waveform = waveform[rand_start:rand_start+self.time_samples]
228 |         mixture_wav = np.array(waveform)
229 |         mixture_wav = torch.tensor(mixture_wav, dtype=torch.float32)
230 |         # First normalize the mixture and then pad
231 |         if self.normalize_audio:
232 |             mixture_wav = normalize_tensor_wav(mixture_wav)
233 |         mixture_wav = self.safe_pad(mixture_wav)
234 | 
235 |         sources_list = []
236 |         for source_name in WHAM_TASKS[self.task]['sources']:
237 |             source_path = os.path.join(self.dataset_dirpath,
238 |                                        source_name, filename)
239 |             try:
240 |                 _, waveform = wavfile.read(source_path)
241 |             except Exception as e:
242 |                 print(e)
243 |                 raise IOError('could not load file from: {}'.format(source_path))
244 |             waveform = waveform[rand_start:rand_start + self.time_samples]
245 |             numpy_wav = np.array(waveform)
246 |             source_wav = torch.tensor(numpy_wav, dtype=torch.float32)
247 |             # First normalize the mixture and then pad
248 |             if self.normalize_audio:
249 |                 source_wav = normalize_tensor_wav(source_wav)
250 |             source_wav = self.safe_pad(source_wav)
251 |             sources_list.append(source_wav)
252 | 
253 |         if self.normalize_audio:
254 |             mix_std = mixture_wav.detach().cpu().numpy().std()
255 |             mixture_wav = normalize_tensor_wav(mixture_wav, std=mix_std)
256 |             sources_list = [normalize_tensor_wav(s, std=mix_std)
257 |                             for s in sources_list]
258 |         sources_wavs = torch.stack(sources_list, dim=0)
259 | 
260 |         return mixture_wav, sources_wavs
261 | 
262 |     def get_generator(self, batch_size=4, shuffle=True, num_workers=4):
263 |         generator_params = {'batch_size': batch_size,
264 |                             'shuffle': shuffle,
265 |                             'num_workers': num_workers,
266 |                             'drop_last': True}
267 |         return torch.utils.data.DataLoader(self, **generator_params,
268 |                                            pin_memory=True)
269 | 
270 | 
271 | def test_generator():
272 |     wham_root_p = '/mnt/data/wham'
273 |     batch_size = 1
274 |     sample_rate = 8000
275 |     timelength = 4.0
276 |     gender_combination_priors = {
277 |         'ff': 0., 'mm': 0.05, 'fm': 0., 'mf': 0.02
278 |     }
279 |     time_samples = int(sample_rate * timelength)
280 |     data_loader = Dataset(
281 |         root_dirpath=wham_root_p, task='sep_clean',
282 |         gender_combination_priors=gender_combination_priors,
283 |         split='tt', sample_rate=sample_rate, timelength=timelength,
284 |         zero_pad=True, min_or_max='min', augment=True,
285 |         normalize_audio=False, n_samples=10)
286 |     generator = data_loader.get_generator(batch_size=batch_size, num_workers=1)
287 | 
288 |     for mixture, sources in generator:
289 |         assert mixture.shape == (batch_size, time_samples)
290 |         assert sources.shape == (batch_size, 2, time_samples)
291 | 
292 | 
293 |     # test the testing set with batch size 1 only
294 |     data_loader = Dataset(
295 |         root_dirpath=wham_root_p, task='sep_clean',
296 |         gender_combination_priors=gender_combination_priors,
297 |         split='tt', sample_rate=sample_rate, timelength=-1.,
298 |         zero_pad=False, min_or_max='min', augment=False,
299 |         normalize_audio=False, n_samples=10)
300 |     generator = data_loader.get_generator(batch_size=1, num_workers=1)
301 | 
302 |     for mixture, sources in generator:
303 |         assert mixture.shape[-1] == sources.shape[-1]
304 | 
305 | if __name__ == "__main__":
306 |     test_generator()
307 | 


--------------------------------------------------------------------------------