├── .gitignore ├── README.md ├── callcenter ├── __init__.py ├── dataset_split.ipynb └── pytorchloader │ ├── __init__.py │ └── callcenter_dataset.py ├── esc ├── __init__.py ├── esc_gen.py ├── pytorchloader │ ├── __init__.py │ └── datasets │ │ ├── __init__.py │ │ └── esc_dataset.py └── tfrecord │ ├── esc_reader.py │ ├── esc_to_tfrecords.py │ ├── esc_utils.py │ └── example.ipynb ├── examples ├── callcenter.ipynb ├── esc.ipynb ├── gtzan.ipynb ├── librispeech.ipynb └── nsynth.ipynb ├── gtzan ├── __init__.py ├── gtzan_gen.py └── torch_readers │ ├── __init__.py │ ├── datasets │ ├── __init__.py │ └── gtzan.py │ └── gtzan_dataset.py ├── librispeech ├── README.md ├── __init__.py ├── tfrecord │ ├── __init__.py │ ├── example.ipynb │ ├── librispeech_reader.py │ └── librispeech_to_tfrecords.py └── torch_readers │ ├── __init__.py │ ├── constants.py │ ├── dataloader_tfrecord.py │ ├── dataset_h5py.py │ ├── dataset_tfrecord.py │ └── librispeech_gen.py ├── misc ├── __init__.py ├── basic_dataset.py ├── data_loader.py ├── transforms.py └── utils.py └── nsynth ├── __init__.py ├── constants.py ├── nsynth_gen.py ├── tfrecord ├── __init__.py ├── example.ipynb ├── nsynth_reader.py └── nsynth_utils.py ├── torch_readers ├── __init__.py ├── basic_dataset.py ├── dataloader_tfrecord.py ├── dataset_h5py.py └── dataset_tfrecord.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.mod 4 | *.a 5 | *.o 6 | *.DS_Store 7 | .idea 8 | *.egg 9 | *.egg-info 10 | .ipynb_checkpoints 11 | __pycache__ 12 | __pycache__/* 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dataloaders 2 | PyTorch and TFRecords data loaders for several audio datasets 3 | 4 | **Datasets** 5 | 1. [ESC](https://github.com/karoldvl/ESC-50) - dataset of environmental sounds 6 | - [x] [ESC Downloader](https://github.com/juliagusak/dataloaders/blob/master/esc/esc_gen.py) 7 | - [x] [PyTorch DataSet](https://github.com/juliagusak/dataloaders/blob/master/esc/pytorchloader/datasets/esc_dataset.py) 8 | - [x] [TFRecords Loader](https://github.com/juliagusak/dataloaders/blob/master/esc/tfrecord/esc_reader.py) 9 | 10 | 2. [LibriSpeech](http://www.openslr.org/12/) - corpus of read English speech 11 | - [x] [LibriSpeech downloader for PyTorch](https://github.com/juliagusak/dataloaders/blob/master/librispeech/torch_readers/librispeech_gen.py) 12 | - [x] [PyTorch DataSet](https://github.com/juliagusak/dataloaders/blob/master/librispeech/torch_readers/dataset_h5py.py) 13 | - [x] [PyTorch DataSet for TFRecord](https://github.com/juliagusak/dataloaders/blob/master/librispeech/torch_readers/dataset_tfrecord.py) 14 | - [x] [PyTorch DataLoaders for TFRecord](https://github.com/juliagusak/dataloaders/blob/master/librispeech/torch_readers/dataloader_tfrecord.py) 15 | - [x] [TFRecords Loader](https://github.com/juliagusak/dataloaders/blob/master/librispeech/tfrecord/librispeech_reader.py) 16 | - [x] [TFRecords Generator](https://github.com/juliagusak/dataloaders/blob/master/librispeech/tfrecord/librispeech_to_tfrecords.py) 17 | 3. 
[NSynth](https://magenta.tensorflow.org/datasets/nsynth) - dataset of annotated musical notes 18 | - [x] [NSynth downloader and generator of *.h5py and *.tfrecord formats](https://github.com/juliagusak/dataloaders/blob/master/nsynth/nsynth_gen.py) 19 | - [x] [TFRecord reader](https://github.com/juliagusak/dataloaders/blob/master/nsynth/tfrecord/nsynth_reader.py) 20 | - [x] [PyTorch Dataset](https://github.com/juliagusak/dataloaders/blob/master/nsynth/torch_readers/dataset_h5py.py) 21 | - [x] [PyTorch Dataset for TFRecord](https://github.com/juliagusak/dataloaders/blob/master/nsynth/torch_readers/dataset_tfrecord.py) 22 | - [x] [PyTorch DataLoaders for TFRecord](https://github.com/juliagusak/dataloaders/blob/master/nsynth/torch_readers/dataloader_tfrecord.py) 23 | 4. [VoxCeleb2](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/) - human speech, extracted from YouTube interview videos 24 | - [ ] PyTorch loader 25 | - [ ] TFRecords loader 26 | 5. [GTZAN](http://marsyasweb.appspot.com/download/data_sets/) - audio tracks from a variety of sources annotated with genre class 27 | - [x] [GTZAN Downloader](https://github.com/juliagusak/dataloaders/blob/master/gtzan/gtzan_gen.py) 28 | - [x] [PyTorch DataSet](https://github.com/juliagusak/dataloaders/blob/master/gtzan/torch_readers/gtzan_dataset.py) 29 | 6. CallCenter - audio tracks with human and non-human speech 30 | - [x] [PyTorch DataSet](https://github.com/juliagusak/dataloaders/blob/master/callcenter/pytorchloader/callcenter_dataset.py) 31 | 32 | For validation we frequently use the following scheme: 33 | 1. Read 10 random crops from a file; 34 | 2. Predict a class for each crop; 35 | 3. Average the results. 36 | 37 | For this scheme we've implemented additional DataLoaders for PyTorch: 38 | 39 | - [DataLoader for ESC, GTZAN, LibriSpeech](https://github.com/juliagusak/dataloaders/blob/master/misc/data_loader.py) 40 | - [DataLoader for LibriSpeech from TFRecords](https://github.com/juliagusak/dataloaders/blob/master/librispeech/torch_readers/dataloader_tfrecord.py) 41 | - [DataLoaders for NSynth](https://github.com/juliagusak/dataloaders/blob/master/nsynth/torch_readers/dataloader_tfrecord.py) 42 | -------------------------------------------------------------------------------- /callcenter/__init__.py: -------------------------------------------------------------------------------- 1 | from .pytorchloader import * -------------------------------------------------------------------------------- /callcenter/pytorchloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .callcenter_dataset import * -------------------------------------------------------------------------------- /callcenter/pytorchloader/callcenter_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import librosa 4 | 5 | from torch.utils import data 6 | 7 | from misc.transforms import get_train_transform, get_test_transform 8 | from misc.utils import FEATURES, LABEL, numpy_one_hot, mix, tensor_to_numpy 9 | 10 | 11 | 12 | 13 | 14 | class CallCenterDataset(data.Dataset): 15 | def __init__(self, data_path, 16 | csv_local_path, 17 | sr =8000, 18 | is_train=True, 19 | signal_length=2**16, 20 | mix=False, 21 | precision=np.float32, 22 | n_files = None, 23 | upsample_factor = 1): 24 | 25 | self.signal_length = signal_length 26 | 27 | if is_train: 28 | self.transform = get_train_transform(length=signal_length) 29 | else: 30 | self.transform = 
get_test_transform(length=signal_length) 31 | 32 | self.sr = sr 33 | self.mix = mix 34 | self.precision = precision 35 | self.n_files = n_files 36 | self.upsample_factor = upsample_factor 37 | 38 | df = pd.read_csv(data_path + csv_local_path) 39 | 40 | if self.n_files is not None: 41 | df = df.sample(self.n_files) 42 | 43 | df['file_name'] = df['file_name'].apply(lambda x: '{}/{}'.format(data_path, 44 | '/'.join(['callCenterDataset', 45 | x.split('callCenterDataset/')[1]]))) 46 | 47 | self.X = [] 48 | self.y = [] 49 | 50 | for idx, row in df.iterrows(): 51 | v_start,v_end = row['v_start'],row['v_end'] 52 | 53 | signal, sr = librosa.load(row['file_name'], sr = self.sr, 54 | res_type = 'kaiser_fast') 55 | assert (len(signal[int(v_start*sr):int(v_end*sr)]) > 0) 56 | 57 | self.X.append(signal[int(v_start*sr):int(v_end*sr)]) 58 | self.y.append(int(row['is_human'])) 59 | 60 | self.n_classes = len(set(self.y)) 61 | 62 | 63 | def __len__(self): 64 | ''' 65 | Denotes the total number of samples 66 | ''' 67 | return len(self.y) 68 | 69 | def __do_transform(self, signal): 70 | signal = signal.astype(self.precision) 71 | if self.transform: 72 | signal = tensor_to_numpy(self.transform(signal.reshape((1, -1, 1)))) 73 | 74 | signal = np.repeat(signal, repeats=self.upsample_factor, axis = -1) 75 | 76 | return signal 77 | 78 | def __mix_samples(self, sample1, sample2): 79 | r = np.random.uniform() 80 | 81 | sound1 = sample1[FEATURES].reshape((-1)) 82 | sound2 = sample2[FEATURES].reshape((-1)) 83 | 84 | sound = mix(sound1, sound2, r, self.sr*self.upsample_factor) 85 | label = r * sample1[LABEL] + (1.0 - r) * sample2[LABEL] 86 | 87 | sound = sound.reshape((1, 1, -1)) 88 | 89 | return {FEATURES: sound, LABEL: label} 90 | 91 | def __getitem__(self, index): 92 | if self.mix: 93 | idx1, idx2 = np.random.choice(len(self), 2, replace=False) 94 | 95 | sample1 = {FEATURES: self.__do_transform(self.X[idx1]), 96 | LABEL: numpy_one_hot(self.y[idx1], num_classes=self.n_classes)} 97 | sample2 = {FEATURES: self.__do_transform(self.X[idx2]), 98 | LABEL: numpy_one_hot(self.y[idx2], num_classes=self.n_classes)} 99 | 100 | sample = self.__mix_samples(sample1, sample2) 101 | 102 | else: 103 | sample = {FEATURES: self.__do_transform(self.X[index]), 104 | LABEL: numpy_one_hot(self.y[index], num_classes=self.n_classes)} 105 | 106 | return sample -------------------------------------------------------------------------------- /esc/__init__.py: -------------------------------------------------------------------------------- 1 | from .pytorchloader import * -------------------------------------------------------------------------------- /esc/esc_gen.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset preparation code for ESC-50 and ESC-10 [Piczak, 2015]. 3 | Usage: python esc_gen.py [path] 4 | FFmpeg should be installed. 
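Example invocation (any existing output root works; the path below is only an illustration):
    python esc_gen.py /workspace/datasets
The script downloads ESC-50 from GitHub, trims leading/trailing silence, converts the audio to 16 kHz mono with FFmpeg (the original 44.1 kHz files are kept as well), and writes per-fold 'sounds'/'labels' arrays to wav16.npz and wav44.npz under both esc50/ and esc10/.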
5 | 6 | """ 7 | 8 | import sys 9 | import os 10 | import subprocess 11 | 12 | import glob 13 | import numpy as np 14 | import wavio 15 | 16 | 17 | def main(): 18 | esc50_path = os.path.join(sys.argv[1], 'esc50') 19 | esc10_path = os.path.join(sys.argv[1], 'esc10') 20 | os.mkdir(esc50_path) 21 | os.mkdir(esc10_path) 22 | fs_list = [16000, 44100] # EnvNet and EnvNet-v2, respectively 23 | 24 | # Download ESC-50 25 | subprocess.call('wget -P {} https://github.com/karoldvl/ESC-50/archive/master.zip'.format( 26 | esc50_path), shell=True) 27 | subprocess.call('unzip -d {} {}'.format( 28 | esc50_path, os.path.join(esc50_path, 'master.zip')), shell=True) 29 | os.remove(os.path.join(esc50_path, 'master.zip')) 30 | 31 | # Convert sampling rate 32 | for fs in fs_list: 33 | if fs == 44100: 34 | continue 35 | else: 36 | convert_fs(os.path.join(esc50_path, 'ESC-50-master', 'audio'), 37 | os.path.join(esc50_path, 'wav{}'.format(fs // 1000)), 38 | fs) 39 | 40 | # Create npz files 41 | for fs in fs_list: 42 | if fs == 44100: 43 | src_path = os.path.join(esc50_path, 'ESC-50-master', 'audio') 44 | else: 45 | src_path = os.path.join(esc50_path, 'wav{}'.format(fs // 1000)) 46 | 47 | create_dataset(src_path, 48 | os.path.join(esc50_path, 'wav{}.npz'.format(fs // 1000)), 49 | os.path.join(esc10_path, 'wav{}.npz'.format(fs // 1000))) 50 | 51 | 52 | def convert_fs(src_path, dst_path, fs): 53 | print('* {} -> {}'.format(src_path, dst_path)) 54 | os.mkdir(dst_path) 55 | for src_file in sorted(glob.glob(os.path.join(src_path, '*.wav'))): 56 | dst_file = src_file.replace(src_path, dst_path) 57 | subprocess.call('ffmpeg -i {} -ac 1 -ar {} -loglevel error -y {}'.format( 58 | src_file, fs, dst_file), shell=True) 59 | 60 | 61 | def create_dataset(src_path, esc50_dst_path, esc10_dst_path): 62 | print('* {} -> {}'.format(src_path, esc50_dst_path)) 63 | print('* {} -> {}'.format(src_path, esc10_dst_path)) 64 | esc10_classes = [0, 10, 11, 20, 38, 21, 40, 41, 1, 12] # ESC-10 is a subset of ESC-50 65 | esc50_dataset = {} 66 | esc10_dataset = {} 67 | 68 | for fold in range(1, 6): 69 | esc50_dataset['fold{}'.format(fold)] = {} 70 | esc50_sounds = [] 71 | esc50_labels = [] 72 | esc10_dataset['fold{}'.format(fold)] = {} 73 | esc10_sounds = [] 74 | esc10_labels = [] 75 | 76 | for wav_file in sorted(glob.glob(os.path.join(src_path, '{}-*.wav'.format(fold)))): 77 | sound = wavio.read(wav_file).data.T[0] 78 | start = sound.nonzero()[0].min() 79 | end = sound.nonzero()[0].max() 80 | sound = sound[start: end + 1] # Remove silent sections 81 | label = int(os.path.splitext(wav_file)[0].split('-')[-1]) 82 | esc50_sounds.append(sound) 83 | esc50_labels.append(label) 84 | if label in esc10_classes: 85 | esc10_sounds.append(sound) 86 | esc10_labels.append(esc10_classes.index(label)) 87 | 88 | esc50_dataset['fold{}'.format(fold)]['sounds'] = esc50_sounds 89 | esc50_dataset['fold{}'.format(fold)]['labels'] = esc50_labels 90 | esc10_dataset['fold{}'.format(fold)]['sounds'] = esc10_sounds 91 | esc10_dataset['fold{}'.format(fold)]['labels'] = esc10_labels 92 | 93 | np.savez(esc50_dst_path, **esc50_dataset) 94 | np.savez(esc10_dst_path, **esc10_dataset) 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /esc/pytorchloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import * -------------------------------------------------------------------------------- /esc/pytorchloader/datasets/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .esc_dataset import ESCDatasets 2 | -------------------------------------------------------------------------------- /esc/pytorchloader/datasets/esc_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from torch.utils import data 5 | 6 | from misc.transforms import get_train_transform, get_test_transform 7 | from misc.utils import FEATURES, LABEL, numpy_one_hot, mix, tensor_to_numpy 8 | 9 | 10 | class ESCDatasets(data.Dataset): 11 | def __init__(self, data_path, dataset_name, 12 | sr, exclude, 13 | is_train=True, 14 | signal_length=2 ** 16, 15 | mix=False, precision=np.float32): 16 | 17 | self.signal_length = signal_length 18 | 19 | if is_train: 20 | self.transform = get_train_transform(length=signal_length) 21 | else: 22 | self.transform = get_test_transform(length=signal_length) 23 | 24 | self.sr = sr 25 | self.mix = mix 26 | self.precision = precision 27 | data_set = np.load(os.path.join(data_path, dataset_name, 'wav{}.npz'.format(sr // 1000))) 28 | 29 | self.X = [] 30 | self.y = [] 31 | for fold_name in data_set.keys(): 32 | if int(fold_name[4:]) in exclude: 33 | continue 34 | 35 | sounds = data_set[fold_name].item()['sounds'] 36 | labels = data_set[fold_name].item()['labels'] 37 | 38 | self.X.extend(sounds) 39 | self.y.extend(labels) 40 | 41 | self.n_classes = len(set(self.y)) 42 | 43 | def __len__(self): 44 | 'Denotes the total number of samples' 45 | return len(self.y) 46 | 47 | def __do_transform(self, signal): 48 | signal = signal.astype(self.precision) 49 | if self.transform: 50 | signal = tensor_to_numpy(self.transform(signal.reshape((1, -1, 1)))) 51 | 52 | return signal 53 | 54 | def __mix_samples(self, sample1, sample2): 55 | r = np.random.uniform() 56 | 57 | sound1 = sample1[FEATURES].reshape((-1)) 58 | sound2 = sample2[FEATURES].reshape((-1)) 59 | 60 | sound = mix(sound1, sound2, r, self.sr) 61 | label = r * sample1[LABEL] + (1.0 - r) * sample2[LABEL] 62 | 63 | sound = sound.reshape((1, 1, -1)) 64 | 65 | return {FEATURES: sound, LABEL: label} 66 | 67 | def __getitem__(self, index): 68 | if self.mix: 69 | idx1, idx2 = np.random.choice(len(self), 2, replace=False) 70 | 71 | sample1 = {FEATURES: self.__do_transform(self.X[idx1]), 72 | LABEL: numpy_one_hot(self.y[idx1], num_classes=self.n_classes)} 73 | sample2 = {FEATURES: self.__do_transform(self.X[idx2]), 74 | LABEL: numpy_one_hot(self.y[idx2], num_classes=self.n_classes)} 75 | 76 | sample = self.__mix_samples(sample1, sample2) 77 | 78 | else: 79 | # sample = {FEATURES: self.__do_transform(self.X[index]), 80 | # LABEL: self.y[index]} 81 | sample = {FEATURES: self.__do_transform(self.X[index]), 82 | LABEL: numpy_one_hot(self.y[index], num_classes=self.n_classes)} 83 | 84 | return sample 85 | 86 | 87 | # if __name__ == "__main__": 88 | # data_path = "/home/julia/DeepVoice_data/ESC" 89 | # dataset_name = "esc10" 90 | # sr = 16000 91 | # exclude = [5] 92 | # dataset = BCDatasets(data_path, dataset_name, sr, exclude, scattering_time_transform=False) 93 | # print(dataset[0]) 94 | -------------------------------------------------------------------------------- /esc/tfrecord/esc_reader.py: -------------------------------------------------------------------------------- 1 | """Module to load the Dataset.""" 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | # internal imports 8 | 
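# NOTE: ESCDataset below uses the TF1 queue-based input pipeline
# (tf.TFRecordReader, tf.train.input_producer, tf.train.shuffle_batch),
# so batches only flow after tf.train.start_queue_runners is called
# (see example.ipynb in this directory). For reference, a rough tf.data
# sketch of the same parsing step (not part of this module; the names
# tfrecord_path and batch_size are assumed to be in scope) would be:
#
#   feature_spec = {'signal_raw': tf.FixedLenFeature([], tf.string),
#                   'sr': tf.FixedLenFeature([], tf.int64),
#                   'speaker': tf.FixedLenFeature([], tf.int64),
#                   'label': tf.FixedLenFeature([], tf.int64)}
#   ds = (tf.data.TFRecordDataset(tfrecord_path)
#         .map(lambda ex: tf.parse_single_example(ex, feature_spec))
#         .shuffle(1000)
#         .batch(batch_size))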
import numpy as np 9 | import tensorflow as tf 10 | 11 | 12 | class ESCDataset(object): 13 | '''Dataset object to help manage the TFRecord loading.''' 14 | 15 | def __init__(self, tfrecord_path, is_training = True): 16 | self.is_training = is_training 17 | self.record_path = tfrecord_path 18 | 19 | def get_example(self, batch_size): 20 | """Get a single example from the tfrecord file. 21 | Args: 22 | batch_size: Int, minibatch size. 23 | Returns: 24 | tf.Example protobuf parsed from tfrecord. 25 | """ 26 | reader = tf.TFRecordReader() 27 | num_epochs = None if self.is_training else 1 28 | capacity = batch_size 29 | 30 | path_queue = tf.train.input_producer( 31 | [self.record_path], 32 | num_epochs = num_epochs, 33 | shuffle = self.is_training, 34 | capacity = capacity) 35 | 36 | _, serialized_example = reader.read(path_queue) 37 | features = { 38 | 'signal_raw': tf.FixedLenFeature([], tf.string), 39 | 'sr': tf.FixedLenFeature([], tf.int64), 40 | 'speaker': tf.FixedLenFeature([], tf.int64), 41 | 'label': tf.FixedLenFeature([], tf.int64) 42 | } 43 | example = tf.parse_single_example(serialized_example, features) 44 | return example 45 | 46 | def get_wavenet_batch(self, batch_size, length = 40000): 47 | '''Get the Tensor expression from the reader. 48 | Args: 49 | batch_size: The integer batch size. 50 | length: Number of timesteps of a cropped sample to produce. 51 | Returns: 52 | A dict of key:tensor pairs. This includes "speaker", "label", "wav", and "sr". 53 | ''' 54 | example = self.get_example(batch_size) 55 | 56 | signal = tf.decode_raw(example['signal_raw'], tf.float32) 57 | sr = tf.cast(example['sr'], tf.int32) 58 | speaker = tf.cast(example['speaker'], tf.int32) 59 | label = tf.cast(example['label'], tf.int32) 60 | 61 | annotation = (sr, speaker, label) 62 | 63 | if self.is_training: 64 | # random crop 65 | crop = tf.random_crop(signal, [length]) 66 | crop = tf.reshape(crop, [1, length]) 67 | 68 | else: 69 | # fixed center crop 70 | offset = (40000 - length) // 2 # 24320 71 | crop = tf.slice(signal, [offset], [length]) 72 | crop = tf.reshape(crop, [1, length]) 73 | 74 | crops, annotations = tf.train.shuffle_batch( 75 | [crop, annotation], 76 | batch_size, 77 | num_threads=4, 78 | capacity=500 * batch_size, 79 | min_after_dequeue=200 * batch_size) 80 | 81 | crops = tf.reshape(tf.cast(crops, tf.float32), [batch_size, length]) 82 | 83 | return {"wav": crops, "sr": annotations[:,0], "speaker": annotations[:,1], "label": annotations[:,2]} 84 | -------------------------------------------------------------------------------- /esc/tfrecord/esc_to_tfrecords.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import esc_utils as U 4 | 5 | 6 | def _bytes_features(value): 7 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 8 | 9 | 10 | def _int64_features(value): 11 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 12 | 13 | 14 | def write_tfrecords(tfrecord_path, sounds, labels, fs=16000): 15 | with tf.python_io.TFRecordWriter(tfrecord_path) as writer: 16 | for sound, label in zip(sounds, labels): 17 | sound_raw = sound.tostring() 18 | 19 | example = tf.train.Example(features=tf.train.Features( 20 | feature={ 21 | 'signal_raw': _bytes_features(sound_raw), 22 | 'sr': _int64_features(fs), 23 | 'speaker': _int64_features(label), 24 | 'label': _int64_features(label) 25 | })) 26 | writer.write(example.SerializeToString()) 27 | 28 | 29 | def create_tfrecords(npz_path, 
tfrecord_pathes, 30 | split=4, fs=16000, 31 | augment_factor=0, strong=False): 32 | # tfrecord_pathes = pathes for train, val tfrecords 33 | with np.load(npz_path) as dataset: 34 | 35 | train_sounds, train_labels = [], [] 36 | val_sounds, val_labels = [], [] 37 | 38 | for i, fold in enumerate(dataset.files): 39 | sounds = dataset[fold].item()['sounds'] 40 | labels = dataset[fold].item()['labels'] 41 | 42 | # we'll add to dataset only samples with length >= 40000 43 | idxs = list(filter(lambda i: len(sounds[i]) >= 40000, 44 | range(len(sounds)))) 45 | sounds = list(np.array(sounds)[idxs]) 46 | labels = list(np.array(labels)[idxs]) 47 | 48 | print('Preprocessing sounds...') 49 | sounds = [U.preprocess_sound(sound) for sound in sounds] 50 | # print(len(sounds), len(labels)) 51 | 52 | print('Augmenting data...') 53 | if augment_factor: 54 | augmented_sounds, augmented_labels = [], [] 55 | for sound, label in zip(sounds, labels): 56 | augmented_sounds.extend([U.augment_sound(sound, strong=strong) for _ in range(augment_factor)]) 57 | augmented_labels.extend([label] * augment_factor) 58 | 59 | sounds.extend(augmented_sounds) 60 | labels.extend(augmented_labels) 61 | 62 | if i == split: 63 | val_sounds.extend(sounds) 64 | val_labels.extend(labels) 65 | else: 66 | train_sounds.extend(sounds) 67 | train_labels.extend(labels) 68 | 69 | print(len(train_sounds), len(train_labels)) 70 | print(len(val_sounds), len(val_labels)) 71 | 72 | print('Writing tfrecords...') 73 | train_tfrecord_path, val_tfrecord_path = tfrecord_pathes 74 | 75 | write_tfrecords(train_tfrecord_path, train_sounds, train_labels, fs=fs) 76 | write_tfrecords(val_tfrecord_path, val_sounds, val_labels, fs=fs) 77 | 78 | 79 | if __name__ == "__main__": 80 | FS = 16000 81 | SPLIT = 4 82 | AUGMENT = 9 83 | STRONG = True 84 | 85 | esc_path = '/workspace/data/ESC/esc10/' 86 | 87 | npz_path = '{}wav{}.npz'.format(esc_path, FS // 1000) 88 | tfrecord_pathes = ['{}wav{}_train.tfrecord'.format(esc_path, FS // 1000), 89 | '{}wav{}_val.tfrecord'.format(esc_path, FS // 1000)] 90 | 91 | create_tfrecords(npz_path, tfrecord_pathes, 92 | split=SPLIT, fs=FS, 93 | augment_factor=AUGMENT, strong=STRONG) 94 | -------------------------------------------------------------------------------- /esc/tfrecord/esc_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | INPUT_LENGTH = 40000 5 | FACTOR = 32768.0 6 | 7 | 8 | # Default data augmentation 9 | def padding(pad): 10 | def f(sound): 11 | return np.pad(sound, pad, 'constant') 12 | 13 | return f 14 | 15 | 16 | def random_crop(size): 17 | def f(sound): 18 | org_size = len(sound) 19 | start = random.randint(0, org_size - size) 20 | return sound[start: start + size] 21 | 22 | return f 23 | 24 | 25 | def normalize(factor): 26 | def f(sound): 27 | return sound / factor 28 | 29 | return f 30 | 31 | 32 | # For strong data augmentation 33 | # Scale audio signal (compress/decompress in time domain) 34 | # For augmentation use scale from [0.8, 1.25] 35 | def random_scale(max_scale, interpolate='Linear'): 36 | def f(sound): 37 | scale = np.power(max_scale, random.uniform(-1, 1)) 38 | output_size = int(len(sound) * scale) 39 | ref = np.arange(output_size) / scale 40 | if interpolate == 'Linear': 41 | ref1 = ref.astype(np.int32) 42 | ref2 = np.minimum(ref1 + 1, len(sound) - 1) 43 | r = ref - ref1 44 | scaled_sound = sound[ref1] * (1 - r) + sound[ref2] * r 45 | elif interpolate == 'Nearest': 46 | scaled_sound = sound[ref.astype(np.int32)] 47 
| else: 48 | raise Exception('Invalid interpolation mode {}'.format(interpolate)) 49 | 50 | return scaled_sound 51 | 52 | return f 53 | 54 | 55 | # Make audio louder / quieter 56 | # For augmentation use db=6 57 | def random_gain(db): 58 | def f(sound): 59 | return sound * np.power(10, random.uniform(-db, db) / 20.0) 60 | 61 | return f 62 | 63 | 64 | def preprocess_sound(sound): 65 | sound = padding(INPUT_LENGTH // 2)(sound) 66 | sound = random_crop(INPUT_LENGTH)(sound) 67 | sound = normalize(FACTOR)(sound) 68 | 69 | return sound.astype(np.float32) 70 | 71 | 72 | def augment_sound(sound, strong=True): 73 | sound = random_scale(1.25)(sound) 74 | sound = preprocess_sound(sound) 75 | 76 | if strong: 77 | sound = random_gain(6)(sound) 78 | 79 | return sound.astype(np.float32) 80 | -------------------------------------------------------------------------------- /esc/tfrecord/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from esc_to_tfrecords import *\n", 10 | "from esc_reader import*" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "##### Create tfrecords" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Preprocessing sounds...\n", 30 | "Augmenting data...\n", 31 | "Preprocessing sounds...\n", 32 | "Augmenting data...\n", 33 | "Preprocessing sounds...\n", 34 | "Augmenting data...\n", 35 | "Preprocessing sounds...\n", 36 | "Augmenting data...\n", 37 | "Preprocessing sounds...\n", 38 | "Augmenting data...\n", 39 | "2870 2870\n", 40 | "740 740\n", 41 | "Writing tfrecords...\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "FS = 16000\n", 47 | "SPLIT = 4\n", 48 | "AUGMENT = 9\n", 49 | "STRONG = True\n", 50 | "\n", 51 | "esc_path = '/workspace/esc/esc10/'\n", 52 | "\n", 53 | "npz_path = '{}wav{}.npz'.format(esc_path, FS//1000)\n", 54 | "tfrecord_pathes = ['{}wav{}_train.tfrecord'.format(esc_path, FS//1000),\n", 55 | " '{}wav{}_val.tfrecord'.format(esc_path, FS//1000)]\n", 56 | "\n", 57 | "\n", 58 | "create_tfrecords(npz_path, tfrecord_pathes,\n", 59 | " split = SPLIT, fs = FS,\n", 60 | " augment_factor = AUGMENT, strong = STRONG)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "##### Read tfrecords without defining a graph\n", 68 | "Create generator to iterate through tfrecords " 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "7 7 16000\n", 81 | "(160000,)\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "tfrecord_path = tfrecord_pathes[0]\n", 87 | "record_iterator = tf.python_io.tf_record_iterator(path=tfrecord_path)\n", 88 | "\n", 89 | "for string_record in record_iterator:\n", 90 | " example = tf.train.Example()\n", 91 | " example.ParseFromString(string_record)\n", 92 | " \n", 93 | " \n", 94 | " label = example.features.feature['label'].int64_list.value[0]\n", 95 | " speaker = example.features.feature['speaker'].int64_list.value[0]\n", 96 | " sr = example.features.feature['sr'].int64_list.value[0]\n", 97 | "\n", 98 | " signal_string = example.features.feature['signal_raw'].bytes_list.value[0]\n", 99 | " signal = 
np.frombuffer(signal_string, dtype = np.uint8)\n", 100 | " \n", 101 | " print(label, speaker, sr)\n", 102 | " print(signal.shape)\n", 103 | " break" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "##### Create dataset" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "dataset = ESCDataset(tfrecord_path=tfrecord_path)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "##### Get dataset batch" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "dict_keys(['label', 'signal_raw', 'speaker', 'sr'])\n" 139 | ] 140 | }, 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "{'label': ,\n", 145 | " 'signal_raw': ,\n", 146 | " 'speaker': ,\n", 147 | " 'sr': }" 148 | ] 149 | }, 150 | "execution_count": 8, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "# get a batch in the following format: tf.Example protobuf parsed from tfrecord\n", 157 | "batch = dataset.get_example(batch_size = 10)\n", 158 | "print(batch.keys())\n", 159 | "\n", 160 | "batch" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 9, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "dict_keys(['wav', 'sr', 'speaker', 'label'])\n" 173 | ] 174 | }, 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "{'wav': ,\n", 179 | " 'sr': ,\n", 180 | " 'speaker': ,\n", 181 | " 'label': }" 182 | ] 183 | }, 184 | "execution_count": 9, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "#get a batch in the following format: {key:tensor} \n", 191 | "batch = dataset.get_wavenet_batch(batch_size = 10)\n", 192 | "print(batch.keys())\n", 193 | "\n", 194 | "batch" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "##### Define graph to read tfrecords and iterate through batches" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 11, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "(50, 40000) (50,)\n", 214 | "(50, 40000) (50,)\n", 215 | "(50, 40000) (50,)\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "dataset = ESCDataset(tfrecord_path=tfrecord_path)\n", 221 | "\n", 222 | "LENGTH = 40000\n", 223 | "batch = dataset.get_wavenet_batch(batch_size = 50, length = LENGTH) \n", 224 | "\n", 225 | "# The op for initializing the variables.\n", 226 | "init_op = tf.group(tf.global_variables_initializer(),\n", 227 | " tf.local_variables_initializer())\n", 228 | "\n", 229 | "with tf.Session() as sess:\n", 230 | " \n", 231 | " sess.run(init_op)\n", 232 | " \n", 233 | " coord = tf.train.Coordinator()\n", 234 | " threads = tf.train.start_queue_runners(coord = coord)\n", 235 | " \n", 236 | " for i in range(3):\n", 237 | " \n", 238 | " batch_np = sess.run(batch)\n", 239 | " features, labels = batch_np['wav'], batch_np['label']\n", 240 | "\n", 241 | " print(features.shape, labels.shape)\n", 242 | " \n", 243 | " coord.request_stop()\n", 244 | " coord.join(threads)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": 
{}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.6.5" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 2 276 | } 277 | -------------------------------------------------------------------------------- /examples/callcenter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Create dataset\n", 8 | "Usually we create different data sets for traininig and testing steps" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys\n", 18 | "sys.path.append('/workspace/jgusak/dataloaders/')\n", 19 | "\n", 20 | "from callcenter import CallCenterDataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 8, 26 | "metadata": { 27 | "scrolled": false 28 | }, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "10\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "data_path = '/workspace/datasets/callCenterDataset'\n", 40 | "csv_local_path = \"/callCenterDataset/train.csv\"\n", 41 | "\n", 42 | "# sampling rate and index of the testing data folder \n", 43 | "sr=8000\n", 44 | "\n", 45 | "dataset = CallCenterDataset(data_path, csv_local_path, sr=sr,\n", 46 | " is_train = True, mix = True,\n", 47 | " n_files = 10,\n", 48 | " upsample_factor = 2,\n", 49 | " signal_length = 12000)\n", 50 | "print(len(dataset))" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "### Get dataset item" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 9, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "dict_keys(['features', 'label'])\n", 70 | "(1, 1, 24000) (2,)\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "item = dataset[0]\n", 76 | "print(item.keys())\n", 77 | "\n", 78 | "print(item['features'].shape, item['label'].shape)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Create generator to iterate through batches" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "from torch.utils import data" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 9, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "params = {'batch_size': 64,\n", 104 | " 'shuffle': True,\n", 105 | " 'num_workers': 1}\n", 106 | "loader = data.DataLoader(dataset, **params)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 10, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "torch.Size([10, 1, 1, 65536]) torch.Size([10, 2])\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "for batch in loader:\n", 124 | " features, labels = batch['features'], batch['label']\n", 125 | " \n", 126 | " print(features.shape, 
labels.shape)\n", 127 | " break" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.6.6" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /examples/esc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Create dataset\n", 8 | "Usually we create different data sets for traininig and testing steps" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys\n", 18 | "sys.path.append('/workspace/jgusak/dataloaders/')\n", 19 | "\n", 20 | "from esc import ESCDatasets" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "320\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "data_path = \"/workspace/datasets/esc\"\n", 38 | "dataset_name=\"esc10\"\n", 39 | "\n", 40 | "# sampling rate and index of the testing data folder \n", 41 | "sr=16000\n", 42 | "exclude=[5]\n", 43 | "\n", 44 | "dataset = ESCDatasets(data_path, dataset_name, sr, exclude, mix = True)\n", 45 | "print(len(dataset))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Get dataset item" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "dict_keys(['features', 'label'])\n" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "((65536,), (10,))" 71 | ] 72 | }, 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "item = dataset[0]\n", 80 | "print(item.keys())\n", 81 | "\n", 82 | "item['features'].shape, item['label'].shape" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Create generator to iterate through batches" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "from torch.utils import data" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "params = {'batch_size': 50,\n", 108 | " 'shuffle': True,\n", 109 | " 'num_workers': 1}\n", 110 | "data_generator = data.DataLoader(dataset, **params)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "torch.Size([50, 65536]) torch.Size([50, 10])\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "for batch in data_generator:\n", 128 | " features, 
labels = batch['features'], batch['label']\n", 129 | " \n", 130 | " print(features.shape, labels.shape)\n", 131 | " break" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python 3", 145 | "language": "python", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.6.6" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /examples/gtzan.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "sys.path.append(os.path.abspath(\"..\"))\n", 12 | "\n", 13 | "\n", 14 | "from gtzan import GTANZDataset, GTZAN\n", 15 | "from misc import get_train_transform, get_test_transform\n", 16 | "\n", 17 | "from torch.utils import data" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### First way" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [ 34 | { 35 | "ename": "BadZipFile", 36 | "evalue": "File is not a zip file", 37 | "output_type": "error", 38 | "traceback": [ 39 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 40 | "\u001b[0;31mBadZipFile\u001b[0m Traceback (most recent call last)", 41 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m dataset = GTANZDataset(\"{}/genres16_train.npz\".format(PATH_TO_DATASET),\n\u001b[1;32m 3\u001b[0m \u001b[0mtransforms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mget_train_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlength\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;34m**\u001b[0m \u001b[0;36m14\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m one_hot_labels=True)\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 42 | "\u001b[0;32m/workspace/jgusak/dataloaders/gtzan/torch_readers/gtzan_dataset.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataset_path, transforms, one_hot_labels)\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mGTANZDataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdataset_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransforms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mone_hot_labels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransforms\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransforms\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"X\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 43 | "\u001b[0;32m/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/numpy/lib/npyio.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding)\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0mown_fid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 413\u001b[0m return NpzFile(fid, own_fid=tmp, allow_pickle=allow_pickle,\n\u001b[0;32m--> 414\u001b[0;31m pickle_kwargs=pickle_kwargs)\n\u001b[0m\u001b[1;32m 415\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmagic\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMAGIC_PREFIX\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 416\u001b[0m \u001b[0;31m# .npy file\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 44 | "\u001b[0;32m/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/numpy/lib/npyio.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, fid, own_fid, allow_pickle, pickle_kwargs)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[0;31m# Import is postponed to here since zipfile depends on gzip, an\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[0;31m# optional component of the so-called standard library.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 173\u001b[0;31m \u001b[0m_zip\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzipfile_factory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 174\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_zip\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnamelist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfiles\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 45 | "\u001b[0;32m/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/numpy/lib/npyio.py\u001b[0m in \u001b[0;36mzipfile_factory\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allowZip64'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m 
\u001b[0;32mreturn\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 46 | "\u001b[0;32m/opt/conda/envs/pytorch-py3.6/lib/python3.6/zipfile.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, file, mode, compression, allowZip64)\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1107\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1108\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_RealGetContents\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1109\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'w'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'x'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1110\u001b[0m \u001b[0;31m# set the modified flag so central directory gets written\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 47 | "\u001b[0;32m/opt/conda/envs/pytorch-py3.6/lib/python3.6/zipfile.py\u001b[0m in \u001b[0;36m_RealGetContents\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1173\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mBadZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"File is not a zip file\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1174\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mendrec\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1175\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mBadZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"File is not a zip file\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1176\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mendrec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 48 | "\u001b[0;31mBadZipFile\u001b[0m: File is not a zip file" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "PATH_TO_DATASET = \"/workspace/datasets/gtzan\"\n", 54 | "dataset = GTANZDataset(\"{}/genres16_train.npz\".format(PATH_TO_DATASET),\n", 55 | " transforms=get_train_transform(length=2 ** 14),\n", 56 | " one_hot_labels=True)\n", 57 | "print(len(dataset))\n", 58 | "print(dataset[5])" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "params = {'batch_size': 64,\n", 68 | " 'shuffle': True,\n", 69 | " 'num_workers': 1}\n", 70 | "dataset = GTANZDataset(\"../genres16_test.npz\",\n", 71 | " transforms=get_test_transform(length=2 ** 14),\n", 72 | " one_hot_labels=True)\n", 73 | "test_generator = ValidationDataLoader(dataset, **params)\n", 74 | "for batch in test_generator:\n", 75 | " print(batch['sound'].shape)\n", 76 | " 
print(batch)\n", 77 | " break" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### Second way" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stderr", 94 | "output_type": "stream", 95 | "text": [ 96 | "100%|██████████| 1000/1000 [01:14<00:00, 13.36it/s]\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "PATH_TO_DATA = \"/workspace/datasets/gtzan/genres/\"\n", 102 | "\n", 103 | "dataset = GTZAN(PATH_TO_DATA, verbose=1, n_jobs=8, sr=16000)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [ 113 | { 114 | "ename": "AttributeError", 115 | "evalue": "Traceback (most recent call last):\n File \"/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/utils/data/dataloader.py\", line 106, in _worker_loop\n samples = collate_fn([dataset[i] for i in batch_indices])\n File \"/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/utils/data/dataloader.py\", line 106, in \n samples = collate_fn([dataset[i] for i in batch_indices])\n File \"/workspace/jgusak/dataloaders/gtzan/torch_readers/datasets/gtzan.py\", line 66, in __getitem__\n y_enc = self.le(np.array(self.y[index]).reshape((-1, 1))).toarray()[0, :]\nAttributeError: 'numpy.ndarray' object has no attribute 'toarray'\n", 116 | "output_type": "error", 117 | "traceback": [ 118 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 119 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 120 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mloader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataLoader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mloader\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'features'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 121 | "\u001b[0;32m/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreorder_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_next_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m__next__\u001b[0m \u001b[0;31m# Python 2 compatibility\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 122 | "\u001b[0;32m/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_process_next_batch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_put_indices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExceptionWrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_msg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 123 | "\u001b[0;31mAttributeError\u001b[0m: Traceback (most recent call last):\n File \"/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/utils/data/dataloader.py\", line 106, in _worker_loop\n samples = collate_fn([dataset[i] for i in batch_indices])\n File \"/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/utils/data/dataloader.py\", line 106, in \n samples = collate_fn([dataset[i] for i in batch_indices])\n File \"/workspace/jgusak/dataloaders/gtzan/torch_readers/datasets/gtzan.py\", line 66, in __getitem__\n y_enc = self.le(np.array(self.y[index]).reshape((-1, 1))).toarray()[0, :]\nAttributeError: 'numpy.ndarray' object has no attribute 'toarray'\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "params = {'batch_size': 256,\n", 129 | " 'shuffle': True,\n", 130 | " 'num_workers': 1}\n", 131 | "\n", 132 | "loader = data.DataLoader(dataset, **params)\n", 133 | "\n", 134 | "for batch in loader:\n", 135 | " print(batch['features'][0], batch['label'][0], sep='\\n')\n", 136 | " break" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python (pytorch-py3.6)", 143 | "language": "python", 144 | "name": "pytorch-py3.6" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.6.6" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 2 161 | } 162 | -------------------------------------------------------------------------------- /examples/librispeech.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append(\"../\")\n", 11 | "\n", 12 | "from misc import get_train_transform, get_test_transform\n", 13 | "from 
librispeech import H5PyDataset\n", 14 | "\n", 15 | "from torch.utils import data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Create dataset" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 9, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "/Users/rayne/Documents/workplace/dataloaders/examples\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "!pwd" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 15, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "'{}/librispeach/test-clean-100.hdf5/../librispeech'" 51 | ] 52 | }, 53 | "execution_count": 15, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "import os\n", 60 | "PATH_TO_DATASET = \"../librispeech\"\n", 61 | "os.path.join(PATH_TO_DATASET)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 13, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [ 71 | { 72 | "ename": "OSError", 73 | "evalue": "Unable to open file (unable to open file: name = '../librispeach/librispeach/test-clean-100.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)", 74 | "output_type": "error", 75 | "traceback": [ 76 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 77 | "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", 78 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0msr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m16000\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mone_hot_utterance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m in_memory=False)\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Dataset Len\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 79 | "\u001b[0;32m~/Documents/workplace/dataloaders/librispeech/torch_readers/dataset_h5py.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataset_path, transforms, sr, signal_length, precision, one_hot_all, one_hot_speaker, one_hot_chapter, one_hot_utterance, encode_cat, in_memory)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhpy_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5py\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 30\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspeaker\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchapter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutterance\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mSPEAKER\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mCHAPTER\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mUTTERANCE\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0min_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 80 | "\u001b[0;32m/anaconda2/envs/py36/lib/python3.6/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, **kwds)\u001b[0m\n\u001b[1;32m 310\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mphil\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[0mfapl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_fapl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 312\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_fid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muserblock_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mswmr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mswmr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 313\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mswmr_support\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 81 | "\u001b[0;32m/anaconda2/envs/py36/lib/python3.6/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36mmake_fid\u001b[0;34m(name, mode, userblock_size, fapl, fcpl, swmr)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mswmr\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mswmr_support\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[0mflags\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_SWMR_READ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 142\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 143\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r+'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 144\u001b[0m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_RDWR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 82 | 
"\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", 83 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n", 84 | "\u001b[0;32mh5py/h5f.pyx\u001b[0m in \u001b[0;36mh5py.h5f.open\u001b[0;34m()\u001b[0m\n", 85 | "\u001b[0;31mOSError\u001b[0m: Unable to open file (unable to open file: name = '../librispeach/librispeach/test-clean-100.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "PATH_TO_DATASET = \"../librispeach\"\n", 91 | "path_to_hdf5 = \"{}/librispeach/test-clean-100.hdf5\".format(PATH_TO_DATASET)\n", 92 | "\n", 93 | "test_transforms = get_test_transform(length=2 ** 14)\n", 94 | "test_dataset = H5PyDataset(path_to_hdf5,\n", 95 | " transforms=test_transforms,\n", 96 | " sr=16000,\n", 97 | " one_hot_utterance=True,\n", 98 | " in_memory=False)\n", 99 | "\n", 100 | "print(\"Dataset Len\", len(dataset))\n", 101 | "print(\"item 0\", dataset[0])" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Create dataloader" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "test_dataset = test_dataset.instance_dataset(path_to_hdf5, test_transforms, False)\n", 118 | "\n", 119 | "params = {'batch_size': 64,\n", 120 | " 'shuffle': True,\n", 121 | " 'num_workers': 1}\n", 122 | "test_loader = data.DataLoader(test_dataset, **params)\n", 123 | "\n", 124 | "for batch in test_loader:\n", 125 | " print(batch['sound'].shape)\n", 126 | " print(batch['utterance'])\n", 127 | " break" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.6.5" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /examples/nsynth.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append(\"/workspace/jgusak/dataloaders\")\n", 11 | "\n", 12 | "from misc import get_train_transform, get_test_transform\n", 13 | "from nsynth import NSynthH5PyDataset, AUDIO, PITCH\n", 14 | "\n", 15 | "from torch.utils import data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Create dataset" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "[100 100 100 ... 
76 76 76]\n", 35 | "Dataset Len 289205\n", 36 | "item 0 {'audio': array([[[-3.7326345e-08, -9.8574212e-08, -8.0184151e-08, ...,\n", 37 | " 0.0000000e+00, 0.0000000e+00, 0.0000000e+00]]], dtype=float32), 'pitch': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 38 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 39 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 40 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 41 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 42 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,\n", 43 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]), 'velocity': 3, 'instrument_source': 2, 'instrument_family': 0, 'qualities': array([0, 0, 1, 1, 0, 0, 0, 1, 0, 0])}\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "PATH_TO_DATASET = '/workspace/datasets/nsynth'\n", 49 | "\n", 50 | "train_transforms = get_train_transform(length=2 ** 14)\n", 51 | "train_dataset = NSynthH5PyDataset(\"{}/nsynth-train.hdf5\".format(PATH_TO_DATASET),\n", 52 | " one_hot_pitch=True,\n", 53 | " encode_cat=True,\n", 54 | " transforms=train_transforms,\n", 55 | " sr=16000,\n", 56 | " in_memory=True)\n", 57 | "\n", 58 | "print(\"Dataset Len\", len(train_dataset))\n", 59 | "print(\"item 0\", train_dataset[0])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Create dataloader" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "torch.Size([64, 1, 1, 16384])\n", 79 | "torch.Size([64, 1, 112])\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "params = {'batch_size': 64,\n", 85 | " 'shuffle': True,\n", 86 | " 'num_workers': 4}\n", 87 | "train_loader = data.DataLoader(train_dataset, **params)\n", 88 | "\n", 89 | "for batch in train_loader:\n", 90 | " print(batch[AUDIO].shape)\n", 91 | " print(batch[PITCH].shape)\n", 92 | " break" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [] 101 | } 102 | ], 103 | "metadata": { 104 | "kernelspec": { 105 | "display_name": "Python 3", 106 | "language": "python", 107 | "name": "python3" 108 | }, 109 | "language_info": { 110 | "codemirror_mode": { 111 | "name": "ipython", 112 | "version": 3 113 | }, 114 | "file_extension": ".py", 115 | "mimetype": "text/x-python", 116 | "name": "python", 117 | "nbconvert_exporter": "python", 118 | "pygments_lexer": "ipython3", 119 | "version": "3.6.6" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | -------------------------------------------------------------------------------- /gtzan/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch_readers import * 2 | -------------------------------------------------------------------------------- /gtzan/gtzan_gen.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import subprocess 5 | from random import shuffle 6 | 7 | import librosa 8 | import numpy as np 9 | 10 | from joblib import Parallel, delayed 11 | from sklearn.model_selection import train_test_split 12 | 13 | 14 | # add path to the directory with misc folder 15 | import sys 16 | sys.path.append(os.path.abspath("..")) 17 | 18 | 19 | from 
misc.utils import LabelsEncoder 20 | 21 | GTZAN_SPEECH_URL = "http://opihi.cs.uvic.ca/sound/genres.tar.gz" 22 | DEFAULT_BIT_RATE = 22050 23 | 24 | TAR_FILE = "genres.tar.gz" 25 | FOLDER_NAME = TAR_FILE[:-7] 26 | TRAIN_SUFFIX = "train.npz" 27 | TEST_SUFFIX = "test.npz" 28 | VAL_SUFFIX = "val.npz" 29 | 30 | 31 | def parse_args(): 32 | parser = argparse.ArgumentParser(description='GTZAN') 33 | 34 | # General settings 35 | parser.add_argument('--path', required=True, help="Where to store results") 36 | parser.add_argument('--train', default=1.0, type=float, help="What fraction take for training") 37 | parser.add_argument('--val', default=0.0, type=float, help="Where fraction take for testing") 38 | parser.add_argument('--force_download', action='store_true', help="Force downloading from website.") 39 | parser.add_argument('--force_extraction', action='store_true', help="Forcing extraction from tar.gz file.") 40 | parser.add_argument('--force_npz', action='store_true', help="Forcing convertation to wav") 41 | parser.add_argument('--force_h5py', action='store_true', help="Forcing storing to h5py_utils") 42 | parser.add_argument('--sr', default=16000, type=int, help="Sample rate for wav. Default is 16kHz") 43 | parser.add_argument('--n_jobs', default=4, type=int, help="Number of threads for reading audio samples") 44 | 45 | return parser.parse_args() 46 | 47 | 48 | def save_npz(X, y, z, save_to): 49 | data = {"X": X, "y": y, "label_name": z} 50 | #np.savez(save_to, **data) 51 | np.savez_compressed(save_to, **data) 52 | 53 | def read_file(file_name, sr, verbose=0): 54 | if verbose: 55 | print("Read file:", file_name) 56 | label = file_name.split('/')[-2] 57 | audio, sr = librosa.core.load(file_name, sr, res_type='kaiser_fast') 58 | return audio, label 59 | 60 | 61 | if __name__ == "__main__": 62 | opt = parse_args() 63 | tar_gz_path = os.path.join(opt.path, TAR_FILE) 64 | extracted_path = os.path.join(opt.path, FOLDER_NAME) 65 | if not os.path.exists(tar_gz_path) or opt.force_download: 66 | if opt.force_download and os.path.exists(tar_gz_path): 67 | print("Force download. 
{} file will be replaced.".format(tar_gz_path)) 68 | os.remove(tar_gz_path) 69 | 70 | print("Download *.tar.gz file to", tar_gz_path) 71 | subprocess.run("wget {} -P {}".format(GTZAN_SPEECH_URL, opt.path), shell=True, check=True) 72 | else: 73 | print("The dataset has been already downloaded to {}".format(tar_gz_path)) 74 | 75 | if not os.path.exists(extracted_path): 76 | print("Extract data to", extracted_path) 77 | subprocess.run("tar xvzf {} -C {}".format(tar_gz_path, opt.path), shell=True, check=True) 78 | else: 79 | print("The dataset has been already extracted to {}".format(extracted_path)) 80 | 81 | print("Read in memory") 82 | X = [] 83 | y = [] 84 | 85 | file_names = glob.glob(extracted_path + '/**/*.au') 86 | shuffle(file_names) 87 | file_names = file_names 88 | result = Parallel(n_jobs=opt.n_jobs, verbose=0)(delayed(read_file)(file_name, opt.sr, 1) for file_name in file_names) 89 | X, y = zip(*result) 90 | 91 | X = np.array(X) 92 | y = np.array(y) 93 | 94 | encoder = LabelsEncoder(y) 95 | z = encoder(y) 96 | 97 | print("Finish") 98 | 99 | if opt.train < 1.0: 100 | X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(X, y, z, test_size=1-opt.train) 101 | else: 102 | X_train, y_train, z_train = X, y, z 103 | 104 | save_npz(X_train, z_train, y_train, 105 | os.path.join(opt.path, FOLDER_NAME) + "{}_{}".format(int(opt.sr // 1000), TRAIN_SUFFIX)) 106 | 107 | if opt.train < 1.0: 108 | if opt.val == 0: 109 | save_npz(X_test, z_test, y_test, 110 | os.path.join(opt.path, FOLDER_NAME) + "{}_{}".format(int(opt.sr // 1000), TEST_SUFFIX)) 111 | else: 112 | X_test, X_val, y_test, y_val, z_test, z_val = train_test_split(X_test, y_test, z_test, test_size=opt.val) 113 | save_npz(X_test, z_test, y_test, 114 | os.path.join(opt.path, FOLDER_NAME) + "{}_{}".format(int(opt.sr // 1000), TEST_SUFFIX)) 115 | save_npz(X_val, z_val, y_val, 116 | os.path.join(opt.path, FOLDER_NAME) + "{}_{}".format(int(opt.sr // 1000), VAL_SUFFIX)) 117 | 118 | -------------------------------------------------------------------------------- /gtzan/torch_readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .gtzan_dataset import * 2 | from .datasets import * -------------------------------------------------------------------------------- /gtzan/torch_readers/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .gtzan import GTZAN -------------------------------------------------------------------------------- /gtzan/torch_readers/datasets/gtzan.py: -------------------------------------------------------------------------------- 1 | from torch.utils import data 2 | import glob 3 | import librosa 4 | import numpy as np 5 | from tqdm import tqdm 6 | from joblib import Parallel, delayed 7 | from misc.transforms import get_train_transform, get_test_transform 8 | from misc.utils import FEATURES, LABEL, tensor_to_numpy, LabelsToOneHot 9 | 10 | 11 | class GTZAN(data.Dataset): 12 | def __init__(self, 13 | root_dir, 14 | sr=16000, 15 | precision=np.float32, 16 | is_train=True, 17 | seed=42, 18 | n_jobs=8, 19 | signal_length=2 ** 16, 20 | verbose=0): 21 | self.root_dir = root_dir 22 | self.file_list = sorted(glob.glob(self.root_dir + '/**/*.*')) 23 | self.X = [] 24 | self.y = [] 25 | self.is_train = is_train 26 | self.n_jobs = n_jobs 27 | self.seed = seed 28 | self.sr = sr 29 | self.precision = precision 30 | 31 | self.signal_length = signal_length 32 | if is_train: 33 | self.transform = 
get_train_transform(length=signal_length)
34 |         else:
35 |             self.transform = get_test_transform(length=signal_length)
36 | 
37 |         ## parallel reading reduces load time from ~10 minutes to ~42 seconds
38 |         if verbose:
39 |             iterable = tqdm(self.file_list)
40 |         else:
41 |             iterable = self.file_list
42 |         parres = Parallel(n_jobs=self.n_jobs, verbose=0)(delayed(self.__reader__)(f) for f in iterable)
43 |         for wave, label in parres:
44 |             self.X.append(wave)
45 |             self.y.append(label)
46 |         self.X = np.array(self.X)
47 |         self.y = np.array(self.y)
48 |         self.le = LabelsToOneHot(self.y)
49 |         assert len(self.X) == len(self.y)
50 | 
51 |     def __len__(self):
52 |         return len(self.X)
53 | 
54 |     def __reader__(self, f):
55 |         label = f.split('/')[-2]
56 |         wave, sr = librosa.core.load(f, self.sr, res_type='kaiser_fast')
57 |         return (wave, label)
58 | 
59 |     def __do_transform(self, signal):
60 |         signal = signal.astype(self.precision)
61 |         if self.transform:
62 |             signal = tensor_to_numpy(self.transform(signal.reshape((1, -1, 1))))
63 |         return signal
64 | 
65 |     def __getitem__(self, index):
66 |         y_enc = self.le(np.array(self.y[index]).reshape((-1, 1)))[0, :]  # LabelsToOneHot already returns a dense array
67 |         X_trans = self.__do_transform(self.X[index])
68 |         sample = {FEATURES: X_trans, LABEL: y_enc}
69 |         return sample
70 | 
71 | 
72 | if __name__=="__main__":
73 |     pass
74 | 
-------------------------------------------------------------------------------- /gtzan/torch_readers/gtzan_dataset.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch.utils import data
3 | 
4 | from misc.data_loader import ValidationDataLoader
5 | from misc.transforms import get_train_transform, get_test_transform
6 | from misc.utils import LabelsToOneHot, tensor_to_numpy
7 | 
8 | 
9 | class GTANZDataset(data.Dataset):
10 |     def __init__(self, dataset_path, transforms=None, one_hot_labels=False):
11 |         data = np.load(dataset_path)
12 |         self.transforms = transforms
13 |         self.X = data["X"]
14 |         self.y = data["y"]
15 |         self.label_name = data["label_name"]
16 | 
17 |         self.n = self.X.shape[0]
18 | 
19 |         self.one_hot_labels = one_hot_labels
20 |         if one_hot_labels:
21 |             self.one_hot_encoder = LabelsToOneHot(self.y)
22 |         else:
23 |             self.one_hot_encoder = None
24 | 
25 |     def instance_dataset(self, dataset_path, transforms):
26 |         new_dataset = GTANZDataset(dataset_path, transforms=transforms, one_hot_labels=False)
27 |         if self.one_hot_labels:
28 |             new_dataset.one_hot_labels = True
29 |             new_dataset.one_hot_encoder = self.one_hot_encoder
30 |         return new_dataset
31 | 
32 |     def __len__(self):
33 |         return self.n
34 | 
35 |     def __getitem__(self, index):
36 |         X, y, label_name = self.X[index], self.y[index], self.label_name[index]
37 | 
38 |         if self.transforms:
39 |             X = tensor_to_numpy(self.transforms(X.reshape((1, -1, 1))))
40 | 
41 |         if self.one_hot_labels:
42 |             y = self.one_hot_encoder(y)[0, :]
43 | 
44 |         return {"sound": X, "class": y, "class_label": label_name}
45 | 
46 | 
47 | if __name__=="__main__":
48 |     dataset = GTANZDataset("../genres16_test.npz",
49 |                            transforms=get_train_transform(length=2 ** 14),
50 |                            one_hot_labels=True)
51 |     print(len(dataset))
52 |     print(dataset[5])
53 | 
54 |     params = {'batch_size': 64,
55 |               'shuffle': True,
56 |               'num_workers': 1}
57 |     dataset = GTANZDataset("../genres16_test.npz",
58 |                            transforms=get_test_transform(length=2 ** 14),
59 |                            one_hot_labels=True)
60 |     test_generator = ValidationDataLoader(dataset, **params)
61 |     for batch in test_generator:
62 |         print(batch['sound'].shape)
63 |         print(batch)
64 |         break
65 | 
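66 | 
67 | # A minimal usage sketch for the folder-based GTZAN dataset defined in
68 | # gtzan/torch_readers/datasets/gtzan.py (its __main__ block is currently empty).
69 | # It assumes the raw "genres" folder has already been downloaded and extracted
70 | # by gtzan/gtzan_gen.py; the path below is illustrative, not a repository default:
71 | #
72 | #     from torch.utils import data
73 | #     from gtzan.torch_readers.datasets import GTZAN
74 | #
75 | #     dataset = GTZAN("/path/to/genres", sr=16000, is_train=True, n_jobs=4, verbose=1)
76 | #     loader = data.DataLoader(dataset, batch_size=16, shuffle=True, num_workers=1)
77 | #     for batch in loader:
78 | #         print(batch['features'].shape, batch['label'].shape)
79 | #         break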
-------------------------------------------------------------------------------- /librispeech/README.md: --------------------------------------------------------------------------------
1 | LibriSpeech Readers
2 | ============================
3 | To generate an hdf5 file with [LibriSpeech](http://www.openslr.org/12/) data, run the following command:
4 | 
5 |     python librispeech_gen.py --dataset train-clean-100.tar.gz --path ./librispeech --force_h5py
6 | 
7 | For more options run:
8 | 
9 |     python librispeech_gen.py -h
10 | 
11 | To create a PyTorch dataset from the generated *.hdf5 file, use the *H5PyDataset* class from *torch_readers/dataset_h5py.py*.
12 | For validation-style loading, use *ValidationDataLoader* from *misc/data_loader.py* (or *LibriSpeechTFRecordTestDataLoader* from *torch_readers/dataloader_tfrecord.py* when reading TFRecords).
13 | You can find usage examples in *torch_readers/dataset_h5py.py*, *misc/data_loader.py* and *examples/librispeech.ipynb*.
-------------------------------------------------------------------------------- /librispeech/__init__.py: --------------------------------------------------------------------------------
1 | from .torch_readers import *
-------------------------------------------------------------------------------- /librispeech/tfrecord/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/juliagusak/dataloaders/210604349b3bfa51d62cacc9e7668f53dcfe265d/librispeech/tfrecord/__init__.py
-------------------------------------------------------------------------------- /librispeech/tfrecord/example.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from librispeech_reader import *\n",
10 | "from librispeech_to_tfrecords import *"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "##### Create tfrecords"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "# path to the folder with wav files\n",
27 | "part = 'test'\n",
28 | "wav_path = '/workspace/jgusak/Data/LibriSpeech_to_classify/{}'.format(part)\n",
29 | "\n",
30 | "# file name with saved tfrecords\n",
31 | "tfrecord_path = '{}/wavs.tfrecord'.format(wav_path)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "write_tfrecords(wav_path, tfrecord_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "##### Read tfrecords without defining a graph\n",
48 | "Create generator to iterate through tfrecords "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 4,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | "33 237 16000\n",
61 | "(160000,)\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "record_iterator = tf.python_io.tf_record_iterator(path=tfrecord_path)\n",
67 | "\n",
68 | "for string_record in record_iterator:\n",
69 | "    example = tf.train.Example()\n",
70 | "    example.ParseFromString(string_record)\n",
71 | "    \n",
72 | "    \n",
73 | "    label = example.features.feature['label'].int64_list.value[0]\n",
74 | "    speaker = example.features.feature['speaker'].int64_list.value[0]\n",
75 | "    sr = example.features.feature['sr'].int64_list.value[0]\n",
76 | "\n",
77 | "    signal_string = example.features.feature['signal_raw'].bytes_list.value[0]\n",
78 | "    signal = 
np.frombuffer(signal_string, dtype = np.uint8)\n", 79 | " \n", 80 | " print(label, speaker, sr)\n", 81 | " print(signal.shape)\n", 82 | " break" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "##### Create dataset" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "dataset = LibriSpeechDataset(tfrecord_path=tfrecord_path)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "##### Get dataset batch" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 15, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "dict_keys(['label', 'signal_raw', 'speaker', 'sr'])\n" 118 | ] 119 | }, 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "{'label': ,\n", 124 | " 'signal_raw': ,\n", 125 | " 'speaker': ,\n", 126 | " 'sr': }" 127 | ] 128 | }, 129 | "execution_count": 15, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "# get a batch in the following format: tf.Example protobuf parsed from tfrecord\n", 136 | "batch = dataset.get_example(batch_size = 10)\n", 137 | "print(batch.keys())\n", 138 | "\n", 139 | "batch" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 16, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "dict_keys(['wav', 'sr', 'speaker', 'label'])\n" 152 | ] 153 | }, 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "{'wav': ,\n", 158 | " 'sr': ,\n", 159 | " 'speaker': ,\n", 160 | " 'label': }" 161 | ] 162 | }, 163 | "execution_count": 16, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "#get a batch in the following format: {key:tensor} \n", 170 | "batch = dataset.get_wavenet_batch(batch_size = 10)\n", 171 | "print(batch.keys())\n", 172 | "\n", 173 | "batch" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "##### Define graph to read tfrecords and iterate through batches" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 17, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "(50, 40000) (50,)\n", 193 | "(50, 40000) (50,)\n", 194 | "(50, 40000) (50,)\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "dataset = LibriSpeechDataset(tfrecord_path=tfrecord_path)\n", 200 | "\n", 201 | "LENGTH = 40000\n", 202 | "batch = dataset.get_wavenet_batch(batch_size = 50, length = LENGTH) \n", 203 | "\n", 204 | "# The op for initializing the variables.\n", 205 | "init_op = tf.group(tf.global_variables_initializer(),\n", 206 | " tf.local_variables_initializer())\n", 207 | "\n", 208 | "with tf.Session() as sess:\n", 209 | " \n", 210 | " sess.run(init_op)\n", 211 | " \n", 212 | " coord = tf.train.Coordinator()\n", 213 | " threads = tf.train.start_queue_runners(coord = coord)\n", 214 | " \n", 215 | " for i in range(3):\n", 216 | " \n", 217 | " batch_np = sess.run(batch)\n", 218 | " features, labels = batch_np['wav'], batch_np['label']\n", 219 | "\n", 220 | " print(features.shape, labels.shape)\n", 221 | " \n", 222 | " coord.request_stop()\n", 223 | " coord.join(threads)" 224 | ] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": "Python 3", 
230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.6.5" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 2 248 | } 249 | -------------------------------------------------------------------------------- /librispeech/tfrecord/librispeech_reader.py: -------------------------------------------------------------------------------- 1 | """Module to load the Dataset.""" 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | # internal imports 8 | import numpy as np 9 | import tensorflow as tf 10 | 11 | 12 | class LibriSpeechDataset(object): 13 | '''Dataset object to help manage the TFRecord loading.''' 14 | 15 | def __init__(self, tfrecord_path, is_training = True): 16 | self.is_training = is_training 17 | self.record_path = tfrecord_path 18 | 19 | def get_example(self, batch_size): 20 | """Get a single example from the tfrecord file. 21 | Args: 22 | batch_size: Int, minibatch size. 23 | Returns: 24 | tf.Example protobuf parsed from tfrecord. 25 | """ 26 | reader = tf.TFRecordReader() 27 | num_epochs = None if self.is_training else 1 28 | capacity = batch_size 29 | 30 | path_queue = tf.train.input_producer( 31 | [self.record_path], 32 | num_epochs = num_epochs, 33 | shuffle = self.is_training, 34 | capacity = capacity) 35 | 36 | _, serialized_example = reader.read(path_queue) 37 | features = { 38 | 'signal_raw': tf.FixedLenFeature([], tf.string), 39 | 'sr': tf.FixedLenFeature([], tf.int64), 40 | 'speaker': tf.FixedLenFeature([], tf.int64), 41 | 'label': tf.FixedLenFeature([], tf.int64) 42 | } 43 | example = tf.parse_single_example(serialized_example, features) 44 | return example 45 | 46 | def get_wavenet_batch(self, batch_size, length = 40000): 47 | '''Get the Tensor expression from the reader. 48 | Args: 49 | batch_size: The integer batch size. 50 | length: Number of timesteps of a cropped sample to produce. 51 | Returns: 52 | A dict of key:tensor pairs. This includes "speaker", "label", "wav", and "sr". 
53 | ''' 54 | example = self.get_example(batch_size) 55 | 56 | signal = tf.decode_raw(example['signal_raw'], tf.float32) 57 | sr = tf.cast(example['sr'], tf.int32) 58 | speaker = tf.cast(example['speaker'], tf.int32) 59 | label = tf.cast(example['label'], tf.int32) 60 | 61 | annotation = (sr, speaker, label) 62 | 63 | if self.is_training: 64 | # random crop 65 | crop = tf.random_crop(signal, [length]) 66 | crop = tf.reshape(crop, [1, length]) 67 | 68 | else: 69 | # fixed center crop 70 | offset = (40000 - length) // 2 # 24320 71 | crop = tf.slice(signal, [offset], [length]) 72 | crop = tf.reshape(crop, [1, length]) 73 | 74 | crops, annotations = tf.train.shuffle_batch( 75 | [crop, annotation], 76 | batch_size, 77 | num_threads=4, 78 | capacity=500 * batch_size, 79 | min_after_dequeue=200 * batch_size) 80 | 81 | crops = tf.reshape(tf.cast(crops, tf.float32), [batch_size, length]) 82 | 83 | return {"wav": crops, "sr": annotations[:,0], "speaker": annotations[:,1], "label": annotations[:,2]} 84 | -------------------------------------------------------------------------------- /librispeech/tfrecord/librispeech_to_tfrecords.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from glob import glob 3 | import numpy as np 4 | import librosa 5 | import scipy 6 | 7 | 8 | # create .tfrecords file with signals and annonation info 9 | 10 | def _bytes_features(value): 11 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 12 | 13 | 14 | def _int64_features(value): 15 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 16 | 17 | 18 | def convert_speakers_to_labels(labels, convertion_dict=None): 19 | return np.array([convertion_dict[l] for l in labels], dtype=np.int32) 20 | 21 | 22 | def write_tfrecords(wav_path, tfrecord_path, signal_length=40000, sr=16000): 23 | wav_files = glob('{}/**/*.wav'.format(wav_path), recursive=True) 24 | 25 | speakers = [int(file.split('/')[-1].split('-')[0]) for file in wav_files] 26 | speaker_to_label = {v: k for k, v in enumerate(set(speakers))} 27 | 28 | labels = convert_speakers_to_labels(speakers, convertion_dict=speaker_to_label) 29 | 30 | tfrecords_filename = tfrecord_path 31 | 32 | with tf.python_io.TFRecordWriter(tfrecords_filename) as writer: 33 | 34 | original_signals = [] 35 | 36 | for wav_file, speaker, label in zip(wav_files, speakers, labels): 37 | print(wav_file) 38 | # sr, wav = scipy.io.wavfile.read(wav_file) 39 | wav, sr = librosa.core.load(wav_file, sr=sr, dtype=np.float32) 40 | 41 | if len(wav) < signal_length: 42 | continue 43 | else: 44 | wav = wav[:signal_length] 45 | 46 | annotation = (sr, speaker, label) 47 | original_signals.append((wav, annotation)) 48 | 49 | # encode to bytes 50 | wav_raw = wav.tostring() 51 | 52 | example = tf.train.Example(features=tf.train.Features( 53 | feature={ 54 | 'signal_raw': _bytes_features(wav_raw), 55 | 'sr': _int64_features(sr), 56 | 'speaker': _int64_features(speaker), 57 | 'label': _int64_features(label) 58 | })) 59 | writer.write(example.SerializeToString()) 60 | 61 | 62 | if __name__ == "__main__": 63 | for folder in ['train']: 64 | wav_path = '/workspace/data/LibriSpeech_to_classify/{}'.format(folder) 65 | tfrecord_path = '{}/wavs.tfrecord'.format(wav_path) 66 | 67 | write_tfrecords(wav_path, tfrecord_path) 68 | -------------------------------------------------------------------------------- /librispeech/torch_readers/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .dataloader_tfrecord import * 2 | from .dataset_h5py import * 3 | from .dataset_tfrecord import * 4 | from .constants import * -------------------------------------------------------------------------------- /librispeech/torch_readers/constants.py: -------------------------------------------------------------------------------- 1 | SOUND = 'sound' 2 | SPEAKER = "speaker" 3 | CHAPTER = "chapter" 4 | UTTERANCE = "utterance" 5 | SR = "sr" 6 | LABEL = "label" 7 | -------------------------------------------------------------------------------- /librispeech/torch_readers/dataloader_tfrecord.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import DataLoader 3 | 4 | 5 | class LibriSpeechTFRecordDataLoader(DataLoader): 6 | def __init__(self, dataset, **kwargs): 7 | super(LibriSpeechTFRecordDataLoader, self).__init__(dataset, **kwargs) 8 | 9 | def __iter__(self): 10 | sound = [] 11 | speaker = [] 12 | label = [] 13 | sr = [] 14 | for idx in range(len(self.dataset)): 15 | 16 | elem = self.dataset[idx] 17 | sound.append(elem["sound"]) 18 | speaker.append(elem["speaker"]) 19 | label.append(elem["label"]) 20 | sr.append(elem["sr"]) 21 | 22 | if (idx + 1) % self.batch_size == 0: 23 | yield {"sound": np.vstack(sound), "speaker": np.hstack(speaker), 24 | "label": np.hstack(label), "sr": np.hstack(sr)} 25 | 26 | sound.clear() 27 | speaker.clear() 28 | label.clear() 29 | sr.clear() 30 | 31 | batch = {"sound": np.vstack(sound), "speaker": np.hstack(speaker), 32 | "label": np.hstack(label), "sr": np.hstack(sr)} 33 | yield batch 34 | 35 | 36 | class LibriSpeechTFRecordTestDataLoader(DataLoader): 37 | def __init__(self, dataset, **kwargs): 38 | kwargs['batch_size'] = 1 39 | super(LibriSpeechTFRecordTestDataLoader, self).__init__(dataset, **kwargs) 40 | 41 | def __iter__(self): 42 | for idx in range(len(self.dataset)): 43 | elem = self.dataset[idx] 44 | result = {"sound": elem["sound"], "speaker": elem["speaker"], 45 | "label": elem["label"], "sr": elem["sr"]} 46 | yield result 47 | 48 | 49 | if __name__ == "__main__": 50 | from misc.transforms import get_train_transform, get_test_transform 51 | from librispeech.torch_readers.dataset_tfrecord import TFRecordDataset 52 | 53 | params = {'batch_size': 64, 54 | 'shuffle': False, 55 | 'num_workers': 1} 56 | 57 | dataset = TFRecordDataset("../librispeach/test-clean-100_wav16.tfrecord", 58 | get_train_transform(16000), 16000, in_memory=False) 59 | test_generator = LibriSpeechTFRecordDataLoader(dataset, **params) 60 | for batch in test_generator: 61 | print(batch['sound'].shape) 62 | print(batch) 63 | break 64 | 65 | params = {'batch_size': 64, 66 | 'shuffle': False, 67 | 'num_workers': 1} 68 | 69 | dataset = TFRecordDataset("../librispeach/test-clean-100_wav16.tfrecord", 70 | get_test_transform(16000), 16000, in_memory=False) 71 | test_generator = LibriSpeechTFRecordTestDataLoader(dataset, **params) 72 | for batch in test_generator: 73 | print(batch['sound'].shape) 74 | print(batch) 75 | break 76 | -------------------------------------------------------------------------------- /librispeech/torch_readers/dataset_h5py.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | 3 | import numpy as np 4 | 5 | from torch.utils import data 6 | 7 | from misc.basic_dataset import BasicDataset 8 | from misc.utils import LabelsToOneHot, LabelsEncoder 9 | from 
librispeech.torch_readers.constants import SPEAKER, SOUND, CHAPTER, UTTERANCE 10 | 11 | 12 | class H5PyDataset(BasicDataset): 13 | def __init__(self, 14 | dataset_path, 15 | transforms, 16 | sr, 17 | signal_length=2 ** 16, 18 | precision=np.float32, 19 | one_hot_all=False, 20 | one_hot_speaker=False, 21 | one_hot_chapter=False, 22 | one_hot_utterance=False, 23 | encode_cat=False, 24 | in_memory=True): 25 | super(H5PyDataset, self).__init__(transforms, sr, signal_length, precision, 26 | one_hot_all, encode_cat, in_memory) 27 | 28 | self.hpy_file = None 29 | f = h5py.File(dataset_path, 'r') 30 | self.speaker, self.chapter, self.utterance = f[SPEAKER][:], f[CHAPTER][:], f[UTTERANCE][:] 31 | if self.in_memory: 32 | self.sound = f[SOUND][:] 33 | f.close() 34 | else: 35 | self.hpy_file = f 36 | self.sound = f[SOUND] 37 | 38 | self.n = self.speaker.shape[0] 39 | 40 | self.one_hot_speaker = one_hot_speaker 41 | self.one_hot_chapter = one_hot_chapter 42 | self.one_hot_utterance = one_hot_utterance 43 | 44 | if self.encode_cat: 45 | self.speaker_encode = LabelsEncoder(self.speaker) 46 | self.chapter_encode = LabelsEncoder(self.chapter) 47 | self.utterance_encode = LabelsEncoder(self.utterance) 48 | 49 | self.speaker = self.speaker_encode(self.speaker) 50 | self.chapter = self.chapter_encode(self.chapter) 51 | self.utterance = self.utterance_encode(self.utterance) 52 | else: 53 | self.speaker_encode = None 54 | self.chapter_encode = None 55 | self.utterance_encode = None 56 | 57 | if self.one_hot_speaker or self.one_hot_all: 58 | self.speaker_one_hot = LabelsToOneHot(self.speaker) 59 | else: 60 | self.speaker_one_hot = None 61 | 62 | if self.one_hot_chapter or self.one_hot_all: 63 | self.chapter_one_hot = LabelsToOneHot(self.chapter) 64 | else: 65 | self.chapter_one_hot = None 66 | 67 | if self.one_hot_utterance or self.one_hot_all: 68 | self.utterance_one_hot = LabelsToOneHot(self.utterance) 69 | else: 70 | self.utterance_one_hot = None 71 | 72 | def instance_dataset(self, dataset_path, transforms, in_memory): 73 | new_dataset = self.__class__(dataset_path, 74 | transforms, 75 | sr=self.sr, 76 | signal_length=self.signal_length, 77 | precision=self.precision, 78 | one_hot_all=False, 79 | one_hot_speaker=False, 80 | one_hot_chapter=False, 81 | one_hot_utterance=False, 82 | encode_cat=False, 83 | in_memory=in_memory 84 | ) 85 | 86 | new_dataset.one_hot_all = self.one_hot_all 87 | 88 | if self.one_hot_speaker or self.one_hot_all: 89 | new_dataset.one_hot_speaker = True 90 | new_dataset.pitch_one_hot = self.speaker_one_hot 91 | if self.one_hot_chapter or self.one_hot_all: 92 | new_dataset.one_hot_chapter = True 93 | new_dataset.chapter_one_hot = self.chapter_one_hot 94 | if self.one_hot_utterance or self.one_hot_all: 95 | new_dataset.one_hot_utterance = True 96 | new_dataset.utterance_one_hot = self.utterance_one_hot 97 | 98 | if self.encode_cat: 99 | new_dataset.speaker_encode = self.speaker_encode 100 | new_dataset.chapter_encode = self.chapter_encode 101 | new_dataset.utterance_encode = self.utterance_encode 102 | 103 | return new_dataset 104 | 105 | def __exit__(self, exc_type, exc_value, traceback): 106 | if self.hpy_file is not None: 107 | self.hpy_file.close() 108 | 109 | def __getitem__(self, index): 110 | sound, speaker, chapter, utterance = self.sound[index], self.speaker[index], \ 111 | self.chapter[index], self.utterance[index] 112 | sound = self.do_transform(sound) 113 | 114 | if self.encode_cat: 115 | speaker = self.speaker_encode(speaker) 116 | chapter = self.chapter_encode(chapter) 117 | 
utterance = self.utterance_encode(utterance) 118 | 119 | if self.one_hot_speaker or self.one_hot_all: 120 | speaker = self.speaker_one_hot(speaker) 121 | if self.one_hot_chapter or self.one_hot_all: 122 | chapter = self.chapter_one_hot(chapter) 123 | if self.one_hot_utterance or self.one_hot_all: 124 | utterance = self.utterance_one_hot(utterance) 125 | 126 | return {SOUND: sound, SPEAKER: speaker, CHAPTER: chapter, UTTERANCE: utterance} 127 | 128 | 129 | if __name__ == "__main__": 130 | from misc.transforms import get_train_transform 131 | 132 | train_transforms = get_train_transform(length=2 ** 14) 133 | dataset = H5PyDataset("../librispeach/train-clean-100.hdf5", 134 | transforms=train_transforms, 135 | sr=16000, 136 | one_hot_utterance=True, 137 | in_memory=False) 138 | print("Dataset Len", len(dataset)) 139 | print("item 0", dataset[0]) 140 | 141 | dataset = dataset.instance_dataset("../librispeach/train-clean-100.hdf5", train_transforms, False) 142 | 143 | params = {'batch_size': 64, 144 | 'shuffle': True, 145 | 'num_workers': 1} 146 | training_generator = data.DataLoader(dataset, **params) 147 | 148 | for batch in training_generator: 149 | print(batch['sound'].shape) 150 | print(batch['utterance']) 151 | break 152 | -------------------------------------------------------------------------------- /librispeech/torch_readers/dataset_tfrecord.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from librispeech.torch_readers.constants import * 5 | from misc.basic_dataset import BasicDataset 6 | from misc.utils import LabelsToOneHot, LabelsEncoder, itarate_over_tfrecord, configure_tf_dataset 7 | 8 | 9 | def librispeech_features(example): 10 | features = { 11 | 'signal_raw': tf.FixedLenFeature([], tf.string), 12 | 'sr': tf.FixedLenFeature([], tf.int64), 13 | 'speaker': tf.FixedLenFeature([], tf.int64), 14 | 'label': tf.FixedLenFeature([], tf.int64) 15 | } 16 | 17 | parsed_example = tf.parse_single_example(example, features) 18 | 19 | sound = tf.decode_raw(parsed_example['signal_raw'], tf.float32) 20 | sr = tf.cast(parsed_example['sr'], tf.int32) 21 | speaker = tf.cast(parsed_example['speaker'], tf.int32) 22 | label = tf.cast(parsed_example['label'], tf.int32) 23 | 24 | return sound, sr, speaker, label 25 | 26 | 27 | class TFRecordDataset(BasicDataset): 28 | def __init__(self, 29 | dataset_path, 30 | transforms, 31 | sr, 32 | signal_length=2 ** 16, 33 | precision=np.float32, 34 | one_hot_all=False, 35 | one_hot_speaker=False, 36 | one_hot_label=False, 37 | encode_cat=False, 38 | in_memory=True, 39 | batch_size=1, 40 | repeat=1, 41 | buffer_size=10): 42 | super(TFRecordDataset, self).__init__(transforms, sr, signal_length, precision, 43 | one_hot_all, encode_cat, in_memory) 44 | 45 | self.sound = [] 46 | self.speaker = [] 47 | self.label = [] 48 | 49 | self.dataset = configure_tf_dataset(librispeech_features, batch_size, buffer_size, dataset_path, repeat) 50 | 51 | self.sound = [] 52 | self.sr = [] 53 | self.speaker = [] 54 | self.label = [] 55 | self.sess = None 56 | self.iterator = None 57 | 58 | iter = self.dataset.make_one_shot_iterator() 59 | if self.in_memory: 60 | for sound, sr, speaker, label in itarate_over_tfrecord(iter): 61 | self.sound.append(sound) 62 | self.sr.append(sr) 63 | self.speaker.append(speaker) 64 | self.label.append(label) 65 | 66 | self.sound = np.vstack(self.sound) 67 | self.sr = np.hstack(self.sr) 68 | self.speaker = np.hstack(self.speaker) 69 | self.label = 
np.hstack(self.label) 70 | 71 | self.n = self.label.shape[0] 72 | else: 73 | self.sess = tf.Session() 74 | self.n = 0 75 | for sound, sr, speaker, label in itarate_over_tfrecord(iter): 76 | self.speaker.append(speaker[0]) 77 | self.label.append(label[0]) 78 | self.n += 1 79 | self.speaker = np.array(self.speaker) 80 | self.label = np.array(self.label) 81 | 82 | self.one_hot_speaker = one_hot_speaker 83 | self.one_hot_label = one_hot_label 84 | 85 | if self.encode_cat: 86 | self.speaker_encoder = LabelsEncoder(self.speaker) 87 | self.label_encoder = LabelsEncoder(self.label) 88 | 89 | self.speaker = self.speaker_encoder(self.speaker) 90 | self.label = self.label_encoder(self.label) 91 | else: 92 | self.speaker_encoder = None 93 | self.label_encoder = None 94 | 95 | if self.one_hot_speaker or self.one_hot_all: 96 | self.speaker_one_hot = LabelsToOneHot(self.speaker) 97 | else: 98 | self.speaker_one_hot = None 99 | 100 | if self.one_hot_label or self.one_hot_all: 101 | self.label_one_hot = LabelsToOneHot(self.label) 102 | else: 103 | self.label_one_hot = None 104 | 105 | def instance_dataset(self, dataset_path, transforms, in_memory): 106 | new_dataset = self.__class__(dataset_path, 107 | transforms, 108 | sr=self.sr, 109 | signal_length=self.signal_length, 110 | precision=self.precision, 111 | one_hot_all=False, 112 | one_hot_speaker=False, 113 | one_hot_label=False, 114 | encode_cat=False, 115 | in_memory=in_memory 116 | ) 117 | 118 | new_dataset.one_hot_all = self.one_hot_all 119 | 120 | if self.one_hot_speaker or self.one_hot_all: 121 | new_dataset.one_hot_speaker = True 122 | new_dataset.speaker_one_hot = self.speaker_one_hot 123 | if self.one_hot_label or self.one_hot_all: 124 | new_dataset.one_hot_label = True 125 | new_dataset.label_one_hot = self.label_one_hot 126 | 127 | if self.encode_cat: 128 | new_dataset.speaker_encode = self.speaker_encoder 129 | new_dataset.label_encoder = self.label_encoder 130 | 131 | return new_dataset 132 | 133 | def __exit__(self, exc_type, exc_value, traceback): 134 | if self.sess is not None: 135 | self.sess.close() 136 | 137 | def __getitem__(self, index): 138 | if index >= self.n: 139 | raise IndexError 140 | 141 | if self.in_memory: 142 | sound, sr, speaker, label = self.sound[index], self.sr[index], self.speaker[index], self.label[index] 143 | else: 144 | if self.iterator is None: 145 | self.iterator = self.dataset.make_one_shot_iterator() 146 | try: 147 | sound, sr, speaker, label = self.iterator.get_next() 148 | sound, sr, speaker, label = self.sess.run([sound, sr, speaker, label]) 149 | except tf.errors.OutOfRangeError: 150 | self.iterator = self.dataset.make_one_shot_iterator() 151 | sound, sr, speaker, label = self.sess.run(self.iterator.get_next()) 152 | 153 | sound = self.do_transform(sound) 154 | 155 | if self.encode_cat: 156 | speaker = self.speaker_encoder(speaker) 157 | label = self.label_encoder(label) 158 | 159 | if self.one_hot_all or self.one_hot_speaker: 160 | speaker = self.speaker_one_hot(speaker) 161 | if self.one_hot_all or self.one_hot_label: 162 | label = self.label_one_hot(label) 163 | 164 | return {SOUND: sound, SR: sr, SPEAKER: speaker, LABEL: label} 165 | 166 | 167 | if __name__ == "__main__": 168 | from misc.transforms import get_train_transform 169 | 170 | train_transforms = get_train_transform(16000) 171 | dataset = TFRecordDataset("../librispeach/test-clean-100_wav16.tfrecord", 172 | train_transforms, 16000, in_memory=False, encode_cat=True) 173 | print(dataset[3]['sound'].shape) 174 | print(len(dataset)) 175 | i = 0 
176 | for _ in dataset: 177 | i += 1 178 | print(i) 179 | 180 | print("------------------------------") 181 | dataset = dataset.instance_dataset("../librispeach/test-clean-100_wav16.tfrecord", train_transforms, False) 182 | 183 | print(dataset[3]['sound'].shape) 184 | print(len(dataset)) 185 | 186 | -------------------------------------------------------------------------------- /librispeech/torch_readers/librispeech_gen.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import h5py 3 | import numpy as np 4 | import os 5 | import subprocess 6 | import wavio 7 | 8 | from glob import glob 9 | from tqdm import tqdm 10 | from random import shuffle 11 | 12 | #from utils import UTTERANCE, CHAPTER, SPEAKER, SOUND 13 | from constants import UTTERANCE, CHAPTER, SPEAKER, SOUND 14 | 15 | LIBRI_SPEECH_URL = "http://www.openslr.org/12/" 16 | EXTRACTED_FOLDER = "LibriSpeech" 17 | 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser(description='LibriSpeech') 21 | 22 | # General settings 23 | parser.add_argument('--dataset', 24 | required=True, 25 | help="The name of a particular dataset from {}".format(LIBRI_SPEECH_URL)) 26 | parser.add_argument('--url', 27 | default=LIBRI_SPEECH_URL, 28 | help="Where datasets are stored. Default: {}".format(LIBRI_SPEECH_URL)) 29 | parser.add_argument('--path', required=True, help="Where to store results") 30 | parser.add_argument('--force_download', action='store_true', help="Force downloading from website.") 31 | parser.add_argument('--force_extraction', action='store_true', help="Forcing extraction from tar.gz file.") 32 | parser.add_argument('--force_convert', action='store_true', help="Forcing convertation to wav") 33 | parser.add_argument('--force_h5py', action='store_true', help="Forcing storing to h5py_torch") 34 | parser.add_argument('--sr', default=16000, help="Sample rate for wav. Default is 16kHz") 35 | parser.add_argument('--wav_dir', default=EXTRACTED_FOLDER+"Wav", help="Where to store wav files") 36 | parser.add_argument('--rm_flac', default=True, help="Remove or not folder with flac files") 37 | parser.add_argument('--take_random', action='store_true', 38 | help="Take N random wav files for storing in h5py_torch") 39 | 40 | return parser.parse_args() 41 | 42 | 43 | if __name__=="__main__": 44 | opt = parse_args() 45 | 46 | # Download tar 47 | data_url = os.path.join(opt.url, opt.dataset) 48 | tar_path = os.path.join(opt.path, opt.dataset) 49 | extraction_path = os.path.join(opt.path, EXTRACTED_FOLDER, opt.dataset[:-7]) 50 | wav_path = os.path.join(opt.path, opt.dataset[:-7] + "_wav" + str(opt.sr // 1000)) 51 | dataset_path = os.path.join(opt.path, opt.dataset[:-7])+'.hdf5' 52 | 53 | if opt.force_h5py: 54 | print("Force h5py_torch creation. {} file will me replaced.".format(dataset_path)) 55 | subprocess.run("rm -rf {}".format(dataset_path), shell=True, check=True) 56 | 57 | if os.path.exists(dataset_path) and not (opt.force_download or opt.force_extraction or opt.force_convert): 58 | print('Dataset is already downloaded and prepared') 59 | exit() 60 | 61 | # rm folders 62 | if opt.force_download: 63 | if opt.force_download and os.path.exists(tar_path): 64 | print("Force download. {} file will me replaced.".format(tar_path)) 65 | os.remove(tar_path) 66 | 67 | if opt.force_extraction and os.path.exists(extraction_path): 68 | print("Force extraction. 
{} file will me replaced.".format(extraction_path)) 69 | subprocess.run("rm -rf {}".format(extraction_path), shell=True, check=True) 70 | 71 | if opt.force_convert and os.path.exists(wav_path): 72 | print("Force extraction. {} file will me replaced.".format(wav_path)) 73 | subprocess.run("rm -rf {}".format(wav_path), shell=True, check=True) 74 | 75 | if not os.path.exists(tar_path) or opt.force_download: 76 | print("Download tar.gz") 77 | subprocess.run("wget {} -P {}".format(data_url, opt.path), shell=True, check=True) 78 | else: 79 | print("Dataset has already downloaded") 80 | 81 | # Extract tar 82 | if (not os.path.exists(extraction_path) and not os.path.exists(wav_path)) or opt.force_extraction: 83 | print("Extraction path:", extraction_path) 84 | subprocess.run("tar xvzf {} -C {}".format(tar_path, opt.path), shell=True, check=True) 85 | else: 86 | print("Dataset has already extracted") 87 | 88 | # Convert to wav 89 | wav_path = os.path.join(opt.path, opt.dataset[:-7] + "_wav" + str(opt.sr//1000)) 90 | print("wav_path", wav_path) 91 | if not os.path.exists(wav_path) or opt.force_convert or opt.force_extraction: 92 | os.mkdir(wav_path) 93 | flacs = glob('{}/**/*.flac'.format(extraction_path), recursive=True) 94 | for flac in flacs: 95 | wav_file = os.path.join(wav_path, flac.split("/")[-1][:-5] + '.wav') 96 | subprocess.run('ffmpeg -i {} {} -ar {}'.format(flac, wav_file, opt.sr), shell=True, check=True) 97 | else: 98 | print("Dataset has already converted to wav with sr {}".format(opt.sr)) 99 | 100 | if opt.rm_flac and os.path.exists(extraction_path): 101 | print("Flac folder {} will be removed".format(extraction_path)) 102 | subprocess.run("rm -rf {}".format(extraction_path), shell=True, check=True) 103 | 104 | print("Convertation to wav is finished") 105 | 106 | if not os.path.exists(dataset_path): 107 | print("Packing into {} file".format(dataset_path)) 108 | wav_files = os.listdir(wav_path) 109 | shuffle(wav_files) 110 | if opt.take_random is not None: 111 | wav_files = wav_files[:opt.take_random] 112 | 113 | data_len = len(wav_files) 114 | f = h5py.File(dataset_path, 'w') 115 | 116 | dt = h5py.special_dtype(vlen=np.float32) 117 | sound = f.create_dataset(SOUND, (data_len, ), dtype=dt) 118 | speaker = f.create_dataset(SPEAKER, (data_len,), dtype=np.int) 119 | chapter = f.create_dataset(CHAPTER, (data_len,), dtype=np.int) 120 | utterance = f.create_dataset(UTTERANCE, (data_len,), dtype=np.int) 121 | 122 | for i, wav_file in tqdm(enumerate(wav_files), total=data_len): 123 | file_name = wav_file.split("/")[-1][:-4] 124 | sound_wav = wavio.read(os.path.join(wav_path, wav_file)).data.T[0] 125 | speaker_id, chapter_id, utterance_id = map(int, file_name.split("-")) 126 | sound[i] = sound_wav 127 | speaker[i], chapter[i], utterance[i] = speaker_id, chapter_id, utterance_id 128 | 129 | f.flush() 130 | f.close() 131 | else: 132 | print("{} file has been already prepared.".format(dataset_path)) 133 | 134 | -------------------------------------------------------------------------------- /misc/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic_dataset import * 2 | from .data_loader import * 3 | from .transforms import * 4 | from .utils import * -------------------------------------------------------------------------------- /misc/basic_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from torch.utils import data 4 | 5 | from misc.utils import tensor_to_numpy 6 | 
7 | 8 | class BasicDataset(data.Dataset): 9 | def __init__(self, 10 | transforms, 11 | sr, 12 | signal_length=2 ** 16, 13 | precision=np.float32, 14 | one_hot_all=False, 15 | encode_cat=False, 16 | in_memory=True): 17 | self.in_memory = in_memory 18 | self.transforms = transforms 19 | self.sr = sr 20 | self.signal_length = signal_length 21 | self.precision = precision 22 | 23 | self.n = None 24 | 25 | self.one_hot_all = one_hot_all 26 | self.encode_cat = encode_cat 27 | 28 | def do_transform(self, sound): 29 | if self.transforms: 30 | trans_sig = self.transforms(sound.reshape((1, -1, 1))) 31 | sound = tensor_to_numpy(trans_sig) 32 | 33 | return sound 34 | 35 | def __len__(self): 36 | return self.n 37 | 38 | def __getitem__(self, index): 39 | pass 40 | -------------------------------------------------------------------------------- /misc/data_loader.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | 3 | 4 | class ValidationDataLoader(DataLoader): 5 | def __init__(self, dataset, **kwargs): 6 | kwargs['batch_size'] = 1 7 | super(ValidationDataLoader, self).__init__(dataset, **kwargs) 8 | 9 | def __iter__(self): 10 | iterator = super(ValidationDataLoader, self).__iter__() 11 | for batch in iterator: 12 | batch['sound'] = batch['sound'].view(batch['sound'].size()[1:]) 13 | yield batch 14 | 15 | 16 | if __name__ == "__main__": 17 | from misc.transforms import get_test_transform 18 | from librispeech.torch_readers.dataset_h5py import H5PyDataset 19 | 20 | test_transforms = get_test_transform(length=2 ** 14) 21 | test_dataset = H5PyDataset("./librispeach/train-clean-100.hdf5", 22 | transforms=test_transforms, 23 | sr=16000, 24 | one_hot_utterance=True, 25 | in_memory=False) 26 | params = {'batch_size': 64, 27 | 'shuffle': True, 28 | 'num_workers': 1} 29 | test_generator = ValidationDataLoader(test_dataset, **params) 30 | for batch in test_generator: 31 | print(batch['sound'].shape) 32 | print(batch) 33 | break 34 | -------------------------------------------------------------------------------- /misc/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import torchvision 4 | 5 | from torchvision.transforms import ToPILImage, Pad, RandomCrop, ToTensor, TenCrop, Lambda 6 | 7 | # MAX_INT = 32768.0 8 | MAX_INT = 1.0 9 | 10 | 11 | class Centring(object): 12 | def __init__(self, factor): 13 | self.factor = factor 14 | 15 | def __call__(self, img): 16 | return img / self.factor 17 | 18 | 19 | def get_train_transform(length=None): 20 | transforms = [ToPILImage(), 21 | Pad((length // 2, 0)), 22 | RandomCrop((1, length)), 23 | ToTensor(), 24 | Centring(MAX_INT)] 25 | return torchvision.transforms.Compose(transforms) 26 | 27 | 28 | def get_test_transform(length=None): 29 | transforms = [ToPILImage(), 30 | Pad((length // 2, 0)), 31 | TenCrop((1, length)), 32 | Lambda(lambda crops: torch.stack([ToTensor()(crop) for crop in crops])), 33 | Centring(MAX_INT)] 34 | return torchvision.transforms.Compose(transforms) 35 | -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import torch 4 | 5 | from PIL import Image 6 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 7 | 8 | FEATURES = 'features' 9 | LABEL = 'label' 10 | 11 | MAX_INT = 32768.0 12 | 13 | Image.MAX_IMAGE_PIXELS = 
None 14 | 15 | 16 | class LabelsToOneHot: 17 | def __init__(self, data): 18 | self.labels_encoder = LabelEncoder() 19 | self.one_hot_encoder = OneHotEncoder() 20 | 21 | self.labels_encoder.fit(data.reshape(-1, )) 22 | self.one_hot_encoder.fit(self.labels_encoder.transform(data.reshape(-1, )).reshape((-1, 1))) 23 | 24 | def __call__(self, data): 25 | return self.one_hot_encoder.transform(self.labels_encoder.transform(data.reshape(-1, )).reshape((-1, 1))).toarray() 26 | 27 | 28 | class LabelsEncoder: 29 | def __init__(self, data): 30 | self.labels_encoder = LabelEncoder() 31 | self.labels_encoder.fit(data.reshape(-1, )) 32 | 33 | def __call__(self, data): 34 | return self.labels_encoder.transform(data.reshape(-1, )) 35 | 36 | 37 | def configure_tf_dataset(features_extractor, batch_size, buffer_size, dataset_path, repeat): 38 | dataset = tf.data.TFRecordDataset(dataset_path) 39 | dataset = dataset.map(features_extractor) 40 | dataset = dataset.batch(batch_size) 41 | dataset = dataset.shuffle(buffer_size=buffer_size) 42 | return dataset.repeat(repeat) 43 | 44 | 45 | def itarate_over_tfrecord(iter): 46 | iter = iter.get_next() 47 | with tf.Session() as sess: 48 | try: 49 | while True: 50 | yield sess.run(iter) 51 | except tf.errors.OutOfRangeError: 52 | pass 53 | 54 | 55 | def tensor_to_numpy(tensor): 56 | if torch.cuda.is_available(): 57 | return tensor.cpu().numpy() 58 | else: 59 | return tensor.numpy() 60 | 61 | 62 | def numpy_one_hot(label, num_classes=2): 63 | label = np.eye(num_classes)[label] 64 | return label 65 | 66 | 67 | # For BC learning 68 | def a_weight(fs, n_fft, min_db=-80.0): 69 | freq = np.linspace(0, fs // 2, n_fft // 2 + 1) 70 | freq_sq = np.power(freq, 2) 71 | freq_sq[0] = 1.0 72 | weight = 2.0 + 20.0 * (2 * np.log10(12194) + 2 * np.log10(freq_sq) 73 | - np.log10(freq_sq + 12194 ** 2) 74 | - np.log10(freq_sq + 20.6 ** 2) 75 | - 0.5 * np.log10(freq_sq + 107.7 ** 2) 76 | - 0.5 * np.log10(freq_sq + 737.9 ** 2)) 77 | weight = np.maximum(weight, min_db) 78 | 79 | return weight 80 | 81 | 82 | def compute_gain(sound, fs, min_db=-80.0, mode='A_weighting'): 83 | if fs == 16000: 84 | n_fft = 2048 85 | elif fs == 44100: 86 | n_fft = 4096 87 | else: 88 | raise Exception('Invalid fs {}'.format(fs)) 89 | stride = n_fft // 2 90 | 91 | gain = [] 92 | for i in range(0, len(sound) - n_fft + 1, stride): 93 | if mode == 'RMSE': 94 | g = np.mean(sound[i: i + n_fft] ** 2) 95 | elif mode == 'A_weighting': 96 | spec = np.fft.rfft(np.hanning(n_fft + 1)[:-1] * sound[i: i + n_fft]) 97 | power_spec = np.abs(spec) ** 2 98 | a_weighted_spec = power_spec * np.power(10, a_weight(fs, n_fft) / 10) 99 | g = np.sum(a_weighted_spec) 100 | else: 101 | raise Exception('Invalid mode {}'.format(mode)) 102 | gain.append(g) 103 | 104 | gain = np.array(gain) 105 | gain = np.maximum(gain, np.power(10, min_db / 10)) 106 | gain_db = 10 * np.log10(gain) 107 | 108 | return gain_db 109 | 110 | 111 | def mix(sound1, sound2, r, fs): 112 | gain1 = np.max(compute_gain(sound1, fs)) # Decibel 113 | gain2 = np.max(compute_gain(sound2, fs)) 114 | 115 | t = 1.0 / (1 + np.power(10, (gain1 - gain2) / 20.) 
* (1 - r) / r) 116 | sound = ((sound1 * t + sound2 * (1 - t)) / np.sqrt(t ** 2 + (1 - t) ** 2)) 117 | 118 | return sound 119 | 120 | -------------------------------------------------------------------------------- /nsynth/__init__.py: -------------------------------------------------------------------------------- 1 | from .torch_readers import * 2 | from .utils import * 3 | from .constants import * -------------------------------------------------------------------------------- /nsynth/constants.py: -------------------------------------------------------------------------------- 1 | NSYNTH_TRAIN = "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-train.tfrecord" 2 | NSYNTH_TEST = "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-test.tfrecord" 3 | NSYNTH_VAL = "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-valid.tfrecord" 4 | 5 | TRAIN_FILE = NSYNTH_TRAIN.split("/")[-1] 6 | TEST_FILE = NSYNTH_TEST.split("/")[-1] 7 | VAL_FILE = NSYNTH_VAL.split("/")[-1] 8 | 9 | TRAIN_EXAMPLES = 289205 10 | VAL_EXAMPLES = 12678 11 | TEST_EXAMPLES = 4096 12 | 13 | AUDIO_LEN = 64000 14 | QUALITIES_LEN = 10 15 | 16 | NOTE_STR = "note_str" 17 | AUDIO = "audio" 18 | PITCH = "pitch" 19 | VELOCITY = "velocity" 20 | INSTR_SRC = "instrument_source" 21 | INSTR_FAMILY = "instrument_family" 22 | QUALITIES = "qualities" 23 | -------------------------------------------------------------------------------- /nsynth/nsynth_gen.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | 4 | import h5py 5 | import os 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | # add path to the directory with misc folder 11 | import sys 12 | sys.path.append(os.path.abspath("..")) 13 | 14 | from misc.utils import itarate_over_tfrecord 15 | from nsynth.constants import * 16 | from nsynth.utils import nsynth_extract_features 17 | 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser(description='NSynth') 21 | 22 | # General settings 23 | parser.add_argument('--path', required=True, help="Where to store results") 24 | parser.add_argument('--train', action='store_true', help="Download the train dataset") 25 | parser.add_argument('--test', action='store_true', help="Download the train dataset") 26 | parser.add_argument('--val', action='store_true', help="Download the validate dataset") 27 | parser.add_argument('--force_download', action='store_true', help="Force downloading from website.") 28 | parser.add_argument('--force_h5py', action='store_true', help="Force creating h5py file.") 29 | parser.add_argument('--store_h5py', action='store_true', help="Forcing storing to h5py_utils") 30 | parser.add_argument('--batch_size', default=256, type=int, help="How many items read from tfrecord at once") 31 | 32 | return parser.parse_args() 33 | 34 | 35 | def download_dataset(url, path, force_download): 36 | if not os.path.exists(path) or force_download: 37 | if force_download and os.path.exists(path): 38 | print("Force download. 
{} file will me replaced.".format(path)) 39 | os.remove(path) 40 | 41 | print("Download *.tfrecord file to", path) 42 | subprocess.run("wget {} -P {}".format(url, path), shell=True, check=True) 43 | else: 44 | print("The dataset has been already downloaded to {}".format(path)) 45 | 46 | 47 | if __name__ == "__main__": 48 | opt = parse_args() 49 | print(opt) 50 | 51 | process_files = [] 52 | if opt.train: 53 | download_dataset(NSYNTH_TRAIN, opt.path, opt.force_download) 54 | process_files.append((os.path.join(opt.path, TRAIN_FILE), TRAIN_EXAMPLES)) 55 | if opt.test: 56 | download_dataset(NSYNTH_TEST, opt.path, opt.force_download) 57 | process_files.append((os.path.join(opt.path, TEST_FILE), TEST_EXAMPLES)) 58 | if opt.val: 59 | download_dataset(NSYNTH_VAL, opt.path, opt.force_download) 60 | process_files.append((os.path.join(opt.path, VAL_FILE), VAL_EXAMPLES)) 61 | 62 | if opt.store_h5py: 63 | print('hey') 64 | for file_name, num_examples in process_files: 65 | print(file_name) 66 | dataset_path = file_name[:-9] + ".hdf5" 67 | 68 | if opt.force_h5py and os.path.exists(dataset_path): 69 | print("h5py file {} will be removed".format(dataset_path)) 70 | subprocess.run("rm -rf {}".format(dataset_path), shell=True, check=True) 71 | if not opt.force_h5py and os.path.exists(dataset_path): 72 | print("h5py file {} has been already created".format(dataset_path)) 73 | continue 74 | 75 | dataset = tf.data.TFRecordDataset(file_name) 76 | dataset = dataset.map(nsynth_extract_features) 77 | dataset = dataset.batch(opt.batch_size) 78 | dataset = dataset.repeat(1) 79 | 80 | iter = dataset.make_one_shot_iterator() 81 | 82 | f = h5py.File(dataset_path, 'w') 83 | 84 | dt = h5py.special_dtype(vlen=np.float32) 85 | audio_ds = f.create_dataset(AUDIO, (num_examples, AUDIO_LEN), dtype=np.float32) 86 | pitch_ds = f.create_dataset(PITCH, (num_examples,), dtype=np.int) 87 | velocity_ds = f.create_dataset(VELOCITY, (num_examples,), dtype=np.int) 88 | instr_src_ds = f.create_dataset(INSTR_SRC, (num_examples,), dtype=np.int) 89 | instr_fml_ds = f.create_dataset(INSTR_FAMILY, (num_examples,), dtype=np.int) 90 | qualities_ds = f.create_dataset(QUALITIES, (num_examples, QUALITIES_LEN), dtype=np.int) 91 | 92 | idx = 0 93 | for audio, pitch, velocity, instrument_source, instrument_family, qualities in itarate_over_tfrecord(iter): 94 | curr_batch_size = audio.shape[0] 95 | start = idx 96 | end = idx + curr_batch_size 97 | idx = end 98 | 99 | audio_ds[start:end, :] = audio.reshape((audio.shape[0], -1)) 100 | pitch_ds[start:end] = pitch.reshape((-1)) 101 | velocity_ds[start:end] = velocity.reshape((-1)) 102 | instr_src_ds[start:end] = instrument_source.reshape((-1)) 103 | instr_fml_ds[start:end] = instrument_family.reshape((-1)) 104 | qualities_ds[start:end, :] = qualities.reshape((-1, QUALITIES_LEN)) 105 | f.close() 106 | print("Complete converting: {} to {}".format(file_name, dataset_path)) 107 | -------------------------------------------------------------------------------- /nsynth/tfrecord/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/juliagusak/dataloaders/210604349b3bfa51d62cacc9e7668f53dcfe265d/nsynth/tfrecord/__init__.py -------------------------------------------------------------------------------- /nsynth/tfrecord/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | 
"source": [ 9 | "from nsynth_reader import *" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "data_path = \"/workspace/jgusak/Data\"\n", 19 | "tfrecord_path = \"{}/nsynth/nsynth-train.tfrecord\".format(data_path)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "##### Read tfrecords without defining a graph\n", 27 | "Create generator to iterate through tfrecords " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 36, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "0 2 100 0 100\n", 40 | "[ 98 97 115 115 95 115 121 110 116 104 101 116 105 99 95 48 49 56\n", 41 | " 45 49 48 48 45 49 48 48]\n", 42 | "(64000,)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "record_iterator = tf.python_io.tf_record_iterator(path=tfrecord_path)\n", 48 | "\n", 49 | "for string_record in record_iterator:\n", 50 | " example = tf.train.Example()\n", 51 | " example.ParseFromString(string_record)\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " audio = example.features.feature['audio'].float_list.value\n", 56 | " audio = np.array(audio)\n", 57 | "\n", 58 | " instrument_family = example.features.feature['instrument_family'].int64_list.value[0]\n", 59 | " instrument_source = example.features.feature['instrument_source'].int64_list.value[0]\n", 60 | "\n", 61 | " note_str = example.features.feature['note_str'].bytes_list.value[0]\n", 62 | " note = np.frombuffer(note_str, dtype = np.uint8)\n", 63 | " \n", 64 | "\n", 65 | " pitch = example.features.feature['pitch'].int64_list.value[0]\n", 66 | " qualities = example.features.feature['qualities'].int64_list.value[0]\n", 67 | " velocity = example.features.feature['velocity'].int64_list.value[0]\n", 68 | " \n", 69 | " print(instrument_family, instrument_source, pitch, qualities, velocity)\n", 70 | " print(note)\n", 71 | " print(audio.shape)\n", 72 | " break" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "##### Create dataset" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "dataset = NSynthDataset(tfrecord_path, is_training=False)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "##### Get dataset batch" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "dict_keys(['audio', 'instrument_family', 'instrument_source', 'note_str', 'pitch', 'qualities', 'velocity'])\n" 108 | ] 109 | }, 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "{'audio': ,\n", 114 | " 'instrument_family': ,\n", 115 | " 'instrument_source': ,\n", 116 | " 'note_str': ,\n", 117 | " 'pitch': ,\n", 118 | " 'qualities': ,\n", 119 | " 'velocity': }" 120 | ] 121 | }, 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# get a batch in the following format: tf.Example protobuf parsed from tfrecord\n", 129 | "batch = dataset.get_example(batch_size = 10)\n", 130 | "print(batch.keys())\n", 131 | "\n", 132 | "batch" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | 
"output_type": "stream", 143 | "text": [ 144 | "dict_keys(['pitch', 'wav', 'key'])\n" 145 | ] 146 | }, 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "{'pitch': ,\n", 151 | " 'wav': ,\n", 152 | " 'key': }" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "#get a batch in the following format: {key:tensor} \n", 162 | "batch = dataset.get_wavenet_batch(batch_size = 10)\n", 163 | "print(batch.keys())\n", 164 | "\n", 165 | "batch" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "##### Define graph to read tfrecords and iterate through batches" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 13, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "(50, 40000) (50,)\n", 185 | "(50, 40000) (50,)\n", 186 | "(50, 40000) (50,)\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "dataset = NSynthDataset(tfrecord_path=tfrecord_path)\n", 192 | "\n", 193 | "LENGTH = 40000\n", 194 | "batch = dataset.get_wavenet_batch(batch_size = 50, length = LENGTH) \n", 195 | "\n", 196 | "# The op for initializing the variables.\n", 197 | "init_op = tf.group(tf.global_variables_initializer(),\n", 198 | " tf.local_variables_initializer())\n", 199 | "\n", 200 | "with tf.Session() as sess:\n", 201 | " \n", 202 | " sess.run(init_op)\n", 203 | " \n", 204 | " coord = tf.train.Coordinator()\n", 205 | " threads = tf.train.start_queue_runners(coord = coord)\n", 206 | " \n", 207 | " for i in range(3):\n", 208 | " \n", 209 | " batch_np = sess.run(batch)\n", 210 | " features, labels = batch_np['wav'], batch_np['pitch']\n", 211 | "\n", 212 | " print(features.shape, labels.shape)\n", 213 | " \n", 214 | " coord.request_stop()\n", 215 | " coord.join(threads)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.6.5" 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /nsynth/tfrecord/nsynth_reader.py: -------------------------------------------------------------------------------- 1 | # Code is borrowed from https://github.com/tensorflow/magenta/blob/master/magenta/models/nsynth/reader.py 2 | """ Module to load the Dataset.""" 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | # internal imports 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | # from magenta.models.nsynth import utils 13 | import nsynth_utils 14 | 15 | 16 | # FFT Specgram Shapes 17 | SPECGRAM_REGISTRY = { 18 | (nfft, hop): shape for nfft, hop, shape in zip( 19 | [256, 256, 512, 512, 1024, 1024], 20 | [64, 128, 128, 256, 256, 512], 21 | [[129, 1001, 2], [129, 501, 2], [257, 501, 2], 22 | [257, 251, 2], [513, 251, 2], [513, 126, 2]]) 23 | } 24 | 25 | 26 | class NSynthDataset(object): 27 | 
"""Dataset object to help manage the TFRecord loading.""" 28 | 29 | def __init__(self, tfrecord_path, is_training=True): 30 | self.is_training = is_training 31 | self.record_path = tfrecord_path 32 | 33 | def get_example(self, batch_size): 34 | """Get a single example from the tfrecord file. 35 | Args: 36 | batch_size: Int, minibatch size. 37 | Returns: 38 | tf.Example protobuf parsed from tfrecord. 39 | """ 40 | reader = tf.TFRecordReader() 41 | num_epochs = None if self.is_training else 1 42 | capacity = batch_size 43 | path_queue = tf.train.input_producer( 44 | [self.record_path], 45 | num_epochs=num_epochs, 46 | shuffle=self.is_training, 47 | capacity=capacity) 48 | unused_key, serialized_example = reader.read(path_queue) 49 | features = { 50 | "note_str": tf.FixedLenFeature([], dtype=tf.string), 51 | "pitch": tf.FixedLenFeature([1], dtype=tf.int64), 52 | "velocity": tf.FixedLenFeature([1], dtype=tf.int64), 53 | "audio": tf.FixedLenFeature([64000], dtype=tf.float32), 54 | "qualities": tf.FixedLenFeature([10], dtype=tf.int64), 55 | "instrument_source": tf.FixedLenFeature([1], dtype=tf.int64), 56 | "instrument_family": tf.FixedLenFeature([1], dtype=tf.int64), 57 | } 58 | example = tf.parse_single_example(serialized_example, features) 59 | return example 60 | 61 | def get_wavenet_batch(self, batch_size, length=64000): 62 | """Get the Tensor expressions from the reader. 63 | Args: 64 | batch_size: The integer batch size. 65 | length: Number of timesteps of a cropped sample to produce. 66 | Returns: 67 | A dict of key:tensor pairs. This includes "pitch", "wav", and "key". 68 | """ 69 | example = self.get_example(batch_size) 70 | wav = example["audio"] 71 | wav = tf.slice(wav, [0], [64000]) 72 | pitch = tf.squeeze(example["pitch"]) 73 | key = tf.squeeze(example["note_str"]) 74 | 75 | if self.is_training: 76 | # random crop 77 | crop = tf.random_crop(wav, [length]) 78 | crop = tf.reshape(crop, [1, length]) 79 | key, crop, pitch = tf.train.shuffle_batch( 80 | [key, crop, pitch], 81 | batch_size, 82 | num_threads=4, 83 | capacity=500 * batch_size, 84 | min_after_dequeue=200 * batch_size) 85 | else: 86 | # fixed center crop 87 | offset = (64000 - length) // 2 # 24320 88 | crop = tf.slice(wav, [offset], [length]) 89 | crop = tf.reshape(crop, [1, length]) 90 | key, crop, pitch = tf.train.shuffle_batch( 91 | [key, crop, pitch], 92 | batch_size, 93 | num_threads=4, 94 | capacity=500 * batch_size, 95 | min_after_dequeue=200 * batch_size) 96 | 97 | crop = tf.reshape(tf.cast(crop, tf.float32), [batch_size, length]) 98 | pitch = tf.cast(pitch, tf.int32) 99 | return {"pitch": pitch, "wav": crop, "key": key} 100 | 101 | def get_baseline_batch(self, hparams): 102 | """Get the Tensor expressions from the reader. 103 | Args: 104 | hparams: Hyperparameters object with specgram parameters. 105 | Returns: 106 | A dict of key:tensor pairs. This includes "pitch", "wav", and "key". 
107 | """ 108 | example = self.get_example(hparams.batch_size) 109 | audio = tf.slice(example["audio"], [0], [64000]) 110 | audio = tf.reshape(audio, [1, 64000]) 111 | pitch = tf.slice(example["pitch"], [0], [1]) 112 | velocity = tf.slice(example["velocity"], [0], [1]) 113 | instrument_source = tf.slice(example["instrument_source"], [0], [1]) 114 | instrument_family = tf.slice(example["instrument_family"], [0], [1]) 115 | qualities = tf.slice(example["qualities"], [0], [10]) 116 | qualities = tf.reshape(qualities, [1, 10]) 117 | 118 | # Get Specgrams 119 | hop_length = hparams.hop_length 120 | n_fft = hparams.n_fft 121 | if hop_length and n_fft: 122 | specgram = utils.tf_specgram( 123 | audio, 124 | n_fft=n_fft, 125 | hop_length=hop_length, 126 | mask=hparams.mask, 127 | log_mag=hparams.log_mag, 128 | re_im=hparams.re_im, 129 | dphase=hparams.dphase, 130 | mag_only=hparams.mag_only) 131 | shape = [1] + SPECGRAM_REGISTRY[(n_fft, hop_length)] 132 | if hparams.mag_only: 133 | shape[-1] = 1 134 | specgram = tf.reshape(specgram, shape) 135 | tf.logging.info("SPECGRAM BEFORE PADDING", specgram) 136 | 137 | if hparams.pad: 138 | # Pad and crop specgram to 256x256 139 | num_padding = 2**int(np.ceil(np.log(shape[2]) / np.log(2))) - shape[2] 140 | tf.logging.info("num_pading: %d" % num_padding) 141 | specgram = tf.reshape(specgram, shape) 142 | specgram = tf.pad(specgram, [[0, 0], [0, 0], [0, num_padding], [0, 0]]) 143 | specgram = tf.slice(specgram, [0, 0, 0, 0], [-1, shape[1] - 1, -1, -1]) 144 | tf.logging.info("SPECGRAM AFTER PADDING", specgram) 145 | 146 | # Form a Batch 147 | if self.is_training: 148 | (audio, velocity, pitch, specgram, 149 | instrument_source, instrument_family, 150 | qualities) = tf.train.shuffle_batch( 151 | [ 152 | audio, velocity, pitch, specgram, 153 | instrument_source, instrument_family, qualities 154 | ], 155 | batch_size=hparams.batch_size, 156 | capacity=20 * hparams.batch_size, 157 | min_after_dequeue=10 * hparams.batch_size, 158 | enqueue_many=True) 159 | elif hparams.batch_size > 1: 160 | (audio, velocity, pitch, specgram, 161 | instrument_source, instrument_family, qualities) = tf.train.batch( 162 | [ 163 | audio, velocity, pitch, specgram, 164 | instrument_source, instrument_family, qualities 165 | ], 166 | batch_size=hparams.batch_size, 167 | capacity=10 * hparams.batch_size, 168 | enqueue_many=True) 169 | 170 | audio.set_shape([hparams.batch_size, 64000]) 171 | 172 | batch = dict( 173 | pitch=pitch, 174 | velocity=velocity, 175 | audio=audio, 176 | instrument_source=instrument_source, 177 | instrument_family=instrument_family, 178 | qualities=qualities, 179 | spectrogram=specgram) 180 | 181 | return batch -------------------------------------------------------------------------------- /nsynth/tfrecord/nsynth_utils.py: -------------------------------------------------------------------------------- 1 | # Code is borrowed from https://github.com/tensorflow/magenta/blob/master/magenta/models/nsynth/utils.py 2 | """Utility functions for NSynth.""" 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import importlib 9 | import os 10 | 11 | # internal imports 12 | import librosa 13 | import numpy as np 14 | from six.moves import range # pylint: disable=redefined-builtin 15 | import tensorflow as tf 16 | 17 | slim = tf.contrib.slim 18 | 19 | 20 | def shell_path(path): 21 | return os.path.abspath(os.path.expanduser(os.path.expandvars(path))) 22 | 23 | 24 | 
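# The mu_law / inv_mu_law helpers in this section implement standard 8-bit
# mu-law companding (mu = 255): encode as
#     f(x) = sign(x) * ln(1 + mu*|x|) / ln(1 + mu), quantised via floor(f(x) * 128),
# and decode by inverting that transform. A minimal numpy sketch of the round
# trip (illustrative only; assumes input samples lie in [-1, 1]):
#
#     mu = 255.0
#     x = np.linspace(-1.0, 1.0, 9)
#     codes = np.floor(np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu) * 128)
#     y = (codes + 0.5) * 2.0 / (mu + 1)
#     x_hat = np.sign(y) / mu * ((1.0 + mu) ** np.abs(y) - 1.0)
#     x_hat = np.where(codes == 0, 0.0, x_hat)   # inv_mu_law maps code 0 to exactly 0
#     # x_hat approximately recovers x, up to the 8-bit quantisation error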
#=============================================================================== 25 | # WaveNet Functions 26 | #=============================================================================== 27 | def get_module(module_path): 28 | """Imports module from NSynth directory. 29 | Args: 30 | module_path: Path to module separated by dots. 31 | -> "configs.linear" 32 | Returns: 33 | module: Imported module. 34 | """ 35 | import_path = "magenta.models.nsynth." 36 | module = importlib.import_module(import_path + module_path) 37 | return module 38 | 39 | 40 | def load_audio(path, sample_length=64000, sr=16000): 41 | """Loading of a wave file. 42 | Args: 43 | path: Location of a wave file to load. 44 | sample_length: The truncated total length of the final wave file. 45 | sr: Samples per a second. 46 | Returns: 47 | out: The audio in samples from -1.0 to 1.0 48 | """ 49 | audio, _ = librosa.load(path, sr=sr) 50 | audio = audio[:sample_length] 51 | return audio 52 | 53 | 54 | def mu_law(x, mu=255, int8=False): 55 | """A TF implementation of Mu-Law encoding. 56 | Args: 57 | x: The audio samples to encode. 58 | mu: The Mu to use in our Mu-Law. 59 | int8: Use int8 encoding. 60 | Returns: 61 | out: The Mu-Law encoded int8 data. 62 | """ 63 | out = tf.sign(x) * tf.log(1 + mu * tf.abs(x)) / np.log(1 + mu) 64 | out = tf.floor(out * 128) 65 | if int8: 66 | out = tf.cast(out, tf.int8) 67 | return out 68 | 69 | 70 | def inv_mu_law(x, mu=255): 71 | """A TF implementation of inverse Mu-Law. 72 | Args: 73 | x: The Mu-Law samples to decode. 74 | mu: The Mu we used to encode these samples. 75 | Returns: 76 | out: The decoded data. 77 | """ 78 | x = tf.cast(x, tf.float32) 79 | out = (x + 0.5) * 2. / (mu + 1) 80 | out = tf.sign(out) / mu * ((1 + mu)**tf.abs(out) - 1) 81 | out = tf.where(tf.equal(x, 0), x, out) 82 | return out 83 | 84 | 85 | def inv_mu_law_numpy(x, mu=255.0): 86 | """A numpy implementation of inverse Mu-Law. 87 | Args: 88 | x: The Mu-Law samples to decode. 89 | mu: The Mu we used to encode these samples. 90 | Returns: 91 | out: The decoded data. 92 | """ 93 | x = np.array(x).astype(np.float32) 94 | out = (x + 0.5) * 2. / (mu + 1) 95 | out = np.sign(out) / mu * ((1 + mu)**np.abs(out) - 1) 96 | out = np.where(np.equal(x, 0), x, out) 97 | return out 98 | 99 | 100 | def trim_for_encoding(wav_data, sample_length, hop_length=512): 101 | """Make sure audio is a even multiple of hop_size. 102 | Args: 103 | wav_data: 1-D or 2-D array of floats. 104 | sample_length: Max length of audio data. 105 | hop_length: Pooling size of WaveNet autoencoder. 106 | Returns: 107 | wav_data: Trimmed array. 108 | sample_length: Length of trimmed array. 
109 | """ 110 | if wav_data.ndim == 1: 111 | # Max sample length is the data length 112 | if sample_length > wav_data.size: 113 | sample_length = wav_data.size 114 | # Multiple of hop_length 115 | sample_length = (sample_length // hop_length) * hop_length 116 | # Trim 117 | wav_data = wav_data[:sample_length] 118 | # Assume all examples are the same length 119 | elif wav_data.ndim == 2: 120 | # Max sample length is the data length 121 | if sample_length > wav_data[0].size: 122 | sample_length = wav_data[0].size 123 | # Multiple of hop_length 124 | sample_length = (sample_length // hop_length) * hop_length 125 | # Trim 126 | wav_data = wav_data[:, :sample_length] 127 | 128 | return wav_data, sample_length 129 | 130 | 131 | #=============================================================================== 132 | # Baseline Functions 133 | #=============================================================================== 134 | #--------------------------------------------------- 135 | # Pre/Post-processing 136 | #--------------------------------------------------- 137 | def get_optimizer(learning_rate, hparams): 138 | """Get the tf.train.Optimizer for this optimizer string. 139 | Args: 140 | learning_rate: The learning_rate tensor. 141 | hparams: TF.HParams object with the optimizer and momentum values. 142 | Returns: 143 | optimizer: The tf.train.Optimizer based on the optimizer string. 144 | """ 145 | return { 146 | "rmsprop": 147 | tf.RMSPropOptimizer( 148 | learning_rate, 149 | decay=0.95, 150 | momentum=hparams.momentum, 151 | epsilon=1e-4), 152 | "adam": 153 | tf.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8), 154 | "adagrad": 155 | tf.AdagradOptimizer(learning_rate, initial_accumulator_value=1.0), 156 | "mom": 157 | tf.MomentumOptimizer(learning_rate, momentum=hparams.momentum), 158 | "sgd": 159 | tf.GradientDescentOptimizer(learning_rate) 160 | }.get(hparams.optimizer) 161 | 162 | 163 | def specgram(audio, 164 | n_fft=512, 165 | hop_length=None, 166 | mask=True, 167 | log_mag=True, 168 | re_im=False, 169 | dphase=True, 170 | mag_only=False): 171 | """Spectrogram using librosa. 172 | Args: 173 | audio: 1-D array of float32 sound samples. 174 | n_fft: Size of the FFT. 175 | hop_length: Stride of FFT. Defaults to n_fft/2. 176 | mask: Mask the phase derivative by the magnitude. 177 | log_mag: Use the logamplitude. 178 | re_im: Output Real and Imag. instead of logMag and dPhase. 179 | dphase: Use derivative of phase instead of phase. 180 | mag_only: Don't return phase. 181 | Returns: 182 | specgram: [n_fft/2 + 1, audio.size / hop_length, 2]. The first channel is 183 | the logamplitude and the second channel is the derivative of phase. 184 | """ 185 | if not hop_length: 186 | hop_length = int(n_fft / 2.) 187 | 188 | fft_config = dict( 189 | n_fft=n_fft, win_length=n_fft, hop_length=hop_length, center=True) 190 | 191 | spec = librosa.stft(audio, **fft_config) 192 | 193 | if re_im: 194 | re = spec.real[:, :, np.newaxis] 195 | im = spec.imag[:, :, np.newaxis] 196 | spec_real = np.concatenate((re, im), axis=2) 197 | 198 | else: 199 | mag, phase = librosa.core.magphase(spec) 200 | phase_angle = np.angle(phase) 201 | 202 | # Magnitudes, scaled 0-1 203 | if log_mag: 204 | mag = (librosa.power_to_db( 205 | mag**2, amin=1e-13, top_db=120., ref=np.max) / 120.) 
+ 1 206 | else: 207 | mag /= mag.max() 208 | 209 | if dphase: 210 | # Derivative of phase 211 | phase_unwrapped = np.unwrap(phase_angle) 212 | p = phase_unwrapped[:, 1:] - phase_unwrapped[:, :-1] 213 | p = np.concatenate([phase_unwrapped[:, 0:1], p], axis=1) / np.pi 214 | else: 215 | # Normal phase 216 | p = phase_angle / np.pi 217 | # Mask the phase 218 | if log_mag and mask: 219 | p = mag * p 220 | # Return Mag and Phase 221 | p = p.astype(np.float32)[:, :, np.newaxis] 222 | mag = mag.astype(np.float32)[:, :, np.newaxis] 223 | if mag_only: 224 | spec_real = mag[:, :, np.newaxis] 225 | else: 226 | spec_real = np.concatenate((mag, p), axis=2) 227 | return spec_real 228 | 229 | 230 | def inv_magphase(mag, phase_angle): 231 | phase = np.cos(phase_angle) + 1.j * np.sin(phase_angle) 232 | return mag * phase 233 | 234 | 235 | def griffin_lim(mag, phase_angle, n_fft, hop, num_iters): 236 | """Iterative algorithm for phase retrival from a magnitude spectrogram. 237 | Args: 238 | mag: Magnitude spectrogram. 239 | phase_angle: Initial condition for phase. 240 | n_fft: Size of the FFT. 241 | hop: Stride of FFT. Defaults to n_fft/2. 242 | num_iters: Griffin-Lim iterations to perform. 243 | Returns: 244 | audio: 1-D array of float32 sound samples. 245 | """ 246 | fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop, center=True) 247 | ifft_config = dict(win_length=n_fft, hop_length=hop, center=True) 248 | complex_specgram = inv_magphase(mag, phase_angle) 249 | for i in range(num_iters): 250 | audio = librosa.istft(complex_specgram, **ifft_config) 251 | if i != num_iters - 1: 252 | complex_specgram = librosa.stft(audio, **fft_config) 253 | _, phase = librosa.magphase(complex_specgram) 254 | phase_angle = np.angle(phase) 255 | complex_specgram = inv_magphase(mag, phase_angle) 256 | return audio 257 | 258 | 259 | def ispecgram(spec, 260 | n_fft=512, 261 | hop_length=None, 262 | mask=True, 263 | log_mag=True, 264 | re_im=False, 265 | dphase=True, 266 | mag_only=True, 267 | num_iters=1000): 268 | """Inverse Spectrogram using librosa. 269 | Args: 270 | spec: 3-D specgram array [freqs, time, (mag_db, dphase)]. 271 | n_fft: Size of the FFT. 272 | hop_length: Stride of FFT. Defaults to n_fft/2. 273 | mask: Reverse the mask of the phase derivative by the magnitude. 274 | log_mag: Use the logamplitude. 275 | re_im: Output Real and Imag. instead of logMag and dPhase. 276 | dphase: Use derivative of phase instead of phase. 277 | mag_only: Specgram contains no phase. 278 | num_iters: Number of griffin-lim iterations for mag_only. 279 | Returns: 280 | audio: 1-D array of sound samples. Peak normalized to 1. 
281 | """ 282 | if not hop_length: 283 | hop_length = n_fft // 2 284 | 285 | ifft_config = dict(win_length=n_fft, hop_length=hop_length, center=True) 286 | 287 | if mag_only: 288 | mag = spec[:, :, 0] 289 | phase_angle = np.pi * np.random.rand(*mag.shape) 290 | elif re_im: 291 | spec_real = spec[:, :, 0] + 1.j * spec[:, :, 1] 292 | else: 293 | mag, p = spec[:, :, 0], spec[:, :, 1] 294 | if mask and log_mag: 295 | p /= (mag + 1e-13 * np.random.randn(*mag.shape)) 296 | if dphase: 297 | # Roll up phase 298 | phase_angle = np.cumsum(p * np.pi, axis=1) 299 | else: 300 | phase_angle = p * np.pi 301 | 302 | # Magnitudes 303 | if log_mag: 304 | mag = (mag - 1.0) * 120.0 305 | mag = 10**(mag / 20.0) 306 | phase = np.cos(phase_angle) + 1.j * np.sin(phase_angle) 307 | spec_real = mag * phase 308 | 309 | if mag_only: 310 | audio = griffin_lim( 311 | mag, phase_angle, n_fft, hop_length, num_iters=num_iters) 312 | else: 313 | audio = librosa.core.istft(spec_real, **ifft_config) 314 | return np.squeeze(audio / audio.max()) 315 | 316 | 317 | def batch_specgram(audio, 318 | n_fft=512, 319 | hop_length=None, 320 | mask=True, 321 | log_mag=True, 322 | re_im=False, 323 | dphase=True, 324 | mag_only=False): 325 | assert len(audio.shape) == 2 326 | batch_size = audio.shape[0] 327 | res = [] 328 | for b in range(batch_size): 329 | res.append( 330 | specgram(audio[b], n_fft, hop_length, mask, log_mag, re_im, dphase, 331 | mag_only)) 332 | return np.array(res) 333 | 334 | 335 | def batch_ispecgram(spec, 336 | n_fft=512, 337 | hop_length=None, 338 | mask=True, 339 | log_mag=True, 340 | re_im=False, 341 | dphase=True, 342 | mag_only=False, 343 | num_iters=1000): 344 | assert len(spec.shape) == 4 345 | batch_size = spec.shape[0] 346 | res = [] 347 | for b in range(batch_size): 348 | res.append( 349 | ispecgram(spec[b, :, :, :], n_fft, hop_length, mask, log_mag, re_im, 350 | dphase, mag_only, num_iters)) 351 | return np.array(res) 352 | 353 | 354 | def tf_specgram(audio, 355 | n_fft=512, 356 | hop_length=None, 357 | mask=True, 358 | log_mag=True, 359 | re_im=False, 360 | dphase=True, 361 | mag_only=False): 362 | return tf.py_func(batch_specgram, [ 363 | audio, n_fft, hop_length, mask, log_mag, re_im, dphase, mag_only 364 | ], tf.float32) 365 | 366 | 367 | def tf_ispecgram(spec, 368 | n_fft=512, 369 | hop_length=None, 370 | mask=True, 371 | pad=True, 372 | log_mag=True, 373 | re_im=False, 374 | dphase=True, 375 | mag_only=False, 376 | num_iters=1000): 377 | dims = spec.get_shape().as_list() 378 | # Add back in nyquist frequency 379 | x = spec if not pad else tf.concat( 380 | [spec, tf.zeros([dims[0], 1, dims[2], dims[3]])], 1) 381 | audio = tf.py_func(batch_ispecgram, [ 382 | x, n_fft, hop_length, mask, log_mag, re_im, dphase, mag_only, num_iters 383 | ], tf.float32) 384 | return audio 385 | 386 | 387 | #--------------------------------------------------- 388 | # Summaries 389 | #--------------------------------------------------- 390 | def form_image_grid(input_tensor, grid_shape, image_shape, num_channels): 391 | """Arrange a minibatch of images into a grid to form a single image. 392 | Args: 393 | input_tensor: Tensor. Minibatch of images to format, either 4D 394 | ([batch size, height, width, num_channels]) or flattened 395 | ([batch size, height * width * num_channels]). 396 | grid_shape: Sequence of int. The shape of the image grid, 397 | formatted as [grid_height, grid_width]. 398 | image_shape: Sequence of int. The shape of a single image, 399 | formatted as [image_height, image_width]. 400 | num_channels: int. 
The number of channels in an image. 401 | Returns: 402 | Tensor representing a single image in which the input images have been 403 | arranged into a grid. 404 | Raises: 405 | ValueError: The grid shape and minibatch size don't match, or the image 406 | shape and number of channels are incompatible with the input tensor. 407 | """ 408 | if grid_shape[0] * grid_shape[1] != int(input_tensor.get_shape()[0]): 409 | raise ValueError("Grid shape incompatible with minibatch size.") 410 | if len(input_tensor.get_shape()) == 2: 411 | num_features = image_shape[0] * image_shape[1] * num_channels 412 | if int(input_tensor.get_shape()[1]) != num_features: 413 | raise ValueError("Image shape and number of channels incompatible with " 414 | "input tensor.") 415 | elif len(input_tensor.get_shape()) == 4: 416 | if (int(input_tensor.get_shape()[1]) != image_shape[0] or 417 | int(input_tensor.get_shape()[2]) != image_shape[1] or 418 | int(input_tensor.get_shape()[3]) != num_channels): 419 | raise ValueError("Image shape and number of channels incompatible with " 420 | "input tensor.") 421 | else: 422 | raise ValueError("Unrecognized input tensor format.") 423 | height, width = grid_shape[0] * image_shape[0], grid_shape[1] * image_shape[1] 424 | input_tensor = tf.reshape(input_tensor, 425 | grid_shape + image_shape + [num_channels]) 426 | input_tensor = tf.transpose(input_tensor, [0, 1, 3, 2, 4]) 427 | input_tensor = tf.reshape( 428 | input_tensor, [grid_shape[0], width, image_shape[0], num_channels]) 429 | input_tensor = tf.transpose(input_tensor, [0, 2, 1, 3]) 430 | input_tensor = tf.reshape(input_tensor, [1, height, width, num_channels]) 431 | return input_tensor 432 | 433 | 434 | def specgram_summaries(spec, 435 | name, 436 | hparams, 437 | rows=4, 438 | columns=4, 439 | image=True, 440 | phase=True, 441 | audio=True): 442 | """Post summaries of a specgram (Image and Audio). 443 | For image summaries, creates a rows x columns composite image from the batch. 444 | Also can create audio summaries for raw audio, but hparams.raw_audio must be 445 | True. 446 | Args: 447 | spec: Batch of spectrograms. 448 | name: String prepended to summaries. 449 | hparams: Hyperparamenters. 450 | rows: Int, number of rows in image. 451 | columns: Int, number of columns in image. 452 | image: Bool, create image summary. 453 | phase: Bool, create image summary from second channel in the batch. 454 | audio: Bool, create audio summaries for each spectrogram in the batch. 
455 | """ 456 | batch_size, n_freq, n_time, unused_channels = spec.get_shape().as_list() 457 | # Must divide minibatch evenly 458 | b = min(batch_size, rows * columns) 459 | 460 | if hparams.raw_audio: 461 | spec = tf.squeeze(spec) 462 | spec /= tf.expand_dims(tf.reduce_max(spec, axis=1), axis=1) 463 | tf.summary.audio( 464 | name, tf.squeeze(spec), hparams.samples_per_second, max_outputs=b) 465 | else: 466 | if image: 467 | if b % columns != 0: 468 | rows = np.floor(np.sqrt(b)) 469 | columns = rows 470 | else: 471 | rows = b / columns 472 | tf.summary.image("Mag/%s" % name, 473 | form_image_grid(spec[:b, :, :, :1], [rows, columns], 474 | [n_freq, n_time], 1)) 475 | if phase: 476 | tf.summary.image("Phase/%s" % name, 477 | form_image_grid(spec[:b, :, :, 1:], [rows, columns], 478 | [n_freq, n_time], 1)) 479 | if audio: 480 | tf.summary.audio( 481 | name, 482 | tf_ispecgram( 483 | spec, 484 | n_fft=hparams.n_fft, 485 | hop_length=hparams.hop_length, 486 | mask=hparams.mask, 487 | log_mag=hparams.log_mag, 488 | pad=hparams.pad, 489 | re_im=hparams.re_im, 490 | dphase=hparams.dphase, 491 | mag_only=hparams.mag_only), 492 | hparams.samples_per_second, 493 | max_outputs=b) 494 | 495 | 496 | def calculate_softmax_and_summaries(logits, one_hot_labels, name): 497 | """Calculate the softmax cross entropy loss and associated summaries. 498 | Args: 499 | logits: Tensor of logits, first dimension is batch size. 500 | one_hot_labels: Tensor of one hot encoded categorical labels. First 501 | dimension is batch size. 502 | name: Name to use as prefix for summaries. 503 | Returns: 504 | loss: Dimensionless tensor representing the mean negative 505 | log-probability of the true class. 506 | """ 507 | loss = tf.nn.softmax_cross_entropy_with_logits( 508 | logits=logits, labels=one_hot_labels) 509 | loss = tf.reduce_mean(loss) 510 | softmax_summaries(loss, logits, one_hot_labels, name) 511 | return loss 512 | 513 | 514 | def calculate_sparse_softmax_and_summaries(logits, labels, name): 515 | """Calculate the softmax cross entropy loss and associated summaries. 516 | Args: 517 | logits: Tensor of logits, first dimension is batch size. 518 | labels: Tensor of categorical labels [ints]. First 519 | dimension is batch size. 520 | name: Name to use as prefix for summaries. 521 | Returns: 522 | loss: Dimensionless tensor representing the mean negative 523 | log-probability of the true class. 524 | """ 525 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 526 | logits=logits, labels=labels) 527 | loss = tf.reduce_mean(loss) 528 | softmax_summaries(loss, logits, labels, name) 529 | return loss 530 | 531 | 532 | def softmax_summaries(loss, logits, one_hot_labels, name="softmax"): 533 | """Create the softmax summaries for this cross entropy loss. 534 | Args: 535 | loss: Cross-entropy loss. 536 | logits: The [batch_size, classes] float tensor representing the logits. 537 | one_hot_labels: The float tensor representing actual class ids. If this is 538 | [batch_size, classes], then we take the argmax of it first. 539 | name: Prepended to summary scope. 
540 | """ 541 | tf.summary.scalar(name + "_loss", loss) 542 | 543 | one_hot_labels = tf.cond( 544 | tf.equal(tf.rank(one_hot_labels), 545 | 2), lambda: tf.to_int32(tf.argmax(one_hot_labels, 1)), 546 | lambda: tf.to_int32(one_hot_labels)) 547 | 548 | in_top_1 = tf.nn.in_top_k(logits, one_hot_labels, 1) 549 | tf.summary.scalar(name + "_precision@1", 550 | tf.reduce_mean(tf.to_float(in_top_1))) 551 | in_top_5 = tf.nn.in_top_k(logits, one_hot_labels, 5) 552 | tf.summary.scalar(name + "_precision@5", 553 | tf.reduce_mean(tf.to_float(in_top_5))) 554 | 555 | 556 | def calculate_l2_and_summaries(predicted_vectors, true_vectors, name): 557 | """Calculate L2 loss and associated summaries. 558 | Args: 559 | predicted_vectors: Tensor of predictions, first dimension is batch size. 560 | true_vectors: Tensor of labels, first dimension is batch size. 561 | name: Name to use as prefix for summaries. 562 | Returns: 563 | loss: Dimensionless tensor representing the mean euclidean distance 564 | between true and predicted. 565 | """ 566 | loss = tf.reduce_mean((predicted_vectors - true_vectors)**2) 567 | tf.summary.scalar(name + "_loss", loss, name="loss") 568 | tf.summary.scalar( 569 | name + "_prediction_mean_squared_norm", 570 | tf.reduce_mean(tf.nn.l2_loss(predicted_vectors)), 571 | name=name + "_prediction_mean_squared_norm") 572 | tf.summary.scalar( 573 | name + "_label_mean_squared_norm", 574 | tf.reduce_mean(tf.nn.l2_loss(true_vectors)), 575 | name=name + "_label_mean_squared_norm") 576 | return loss 577 | 578 | 579 | def frequency_weighted_cost_mask(peak=10.0, hz_flat=1000, sr=16000, n_fft=512): 580 | """Calculates a mask to weight lower frequencies higher. 581 | Piecewise linear approximation. Assumes magnitude is in log scale. 582 | Args: 583 | peak: Cost increase at 0 Hz. 584 | hz_flat: Hz at which cost increase is 0. 585 | sr: Sample rate. 586 | n_fft: FFT size. 587 | Returns: 588 | Constant tensor [1, N_freq, 1] of cost weighting. 589 | """ 590 | n = int(n_fft / 2) 591 | cutoff = np.where( 592 | librosa.core.fft_frequencies(sr=sr, n_fft=n_fft) >= hz_flat)[0][0] 593 | mask = np.concatenate([np.linspace(peak, 1.0, cutoff), np.ones(n - cutoff)]) 594 | return tf.constant(mask[np.newaxis, :, np.newaxis], dtype=tf.float32) 595 | 596 | 597 | #--------------------------------------------------- 598 | # Neural Nets 599 | #--------------------------------------------------- 600 | def pitch_embeddings(batch, 601 | timesteps=1, 602 | n_pitches=128, 603 | dim_embedding=128, 604 | reuse=False): 605 | """Get a embedding of each pitch note. 606 | Args: 607 | batch: NSynthDataset batch dictionary. 608 | timesteps: Number of timesteps to replicate across. 609 | n_pitches: Number of one-hot embeddings. 610 | dim_embedding: Dimension of linear projection of one-hot encoding. 611 | reuse: Reuse variables. 612 | Returns: 613 | embedding: A tensor of shape [batch_size, 1, timesteps, dim_embedding]. 
614 | """ 615 | batch_size = batch["pitch"].get_shape().as_list()[0] 616 | with tf.variable_scope("PitchEmbedding", reuse=reuse): 617 | w = tf.get_variable( 618 | name="embedding_weights", 619 | shape=[n_pitches, dim_embedding], 620 | initializer=tf.random_normal_initializer()) 621 | one_hot_pitch = tf.reshape(batch["pitch"], [batch_size]) 622 | one_hot_pitch = tf.one_hot(one_hot_pitch, depth=n_pitches) 623 | embedding = tf.matmul(one_hot_pitch, w) 624 | embedding = tf.reshape(embedding, [batch_size, 1, 1, dim_embedding]) 625 | if timesteps > 1: 626 | embedding = tf.tile(embedding, [1, 1, timesteps, 1]) 627 | return embedding 628 | 629 | 630 | def slim_batchnorm_arg_scope(is_training, activation_fn=None): 631 | """Create a scope for applying BatchNorm in slim. 632 | This scope also applies Glorot initializiation to convolutional weights. 633 | Args: 634 | is_training: Whether this is a training run. 635 | activation_fn: Whether we apply an activation_fn to the convolution result. 636 | Returns: 637 | scope: Use this scope to automatically apply BatchNorm and Xavier Init to 638 | slim.conv2d and slim.fully_connected. 639 | """ 640 | batch_norm_params = { 641 | "is_training": is_training, 642 | "decay": 0.999, 643 | "epsilon": 0.001, 644 | "variables_collections": { 645 | "beta": None, 646 | "gamma": None, 647 | "moving_mean": "moving_vars", 648 | "moving_variance": "moving_vars", 649 | } 650 | } 651 | 652 | with slim.arg_scope( 653 | [slim.conv2d, slim.fully_connected, slim.conv2d_transpose], 654 | weights_initializer=slim.initializers.xavier_initializer(), 655 | activation_fn=activation_fn, 656 | normalizer_fn=slim.batch_norm, 657 | normalizer_params=batch_norm_params) as scope: 658 | return scope 659 | 660 | 661 | def conv2d(x, 662 | kernel_size, 663 | stride, 664 | channels, 665 | is_training, 666 | scope="conv2d", 667 | batch_norm=False, 668 | residual=False, 669 | gated=False, 670 | activation_fn=tf.nn.relu, 671 | resize=False, 672 | transpose=False, 673 | stacked_layers=1): 674 | """2D-Conv with optional batch_norm, gating, residual. 675 | Args: 676 | x: Tensor input [MB, H, W, CH]. 677 | kernel_size: List [H, W]. 678 | stride: List [H, W]. 679 | channels: Int, output channels. 680 | is_training: Whether to collect stats for BatchNorm. 681 | scope: Enclosing scope name. 682 | batch_norm: Apply batch normalization 683 | residual: Residual connections, have stacked_layers >= 2. 684 | gated: Gating ala Wavenet. 685 | activation_fn: Nonlinearity function. 686 | resize: On transposed convolution, do ImageResize instead of conv_transpose. 687 | transpose: Use conv_transpose instead of conv. 688 | stacked_layers: Number of layers before a residual connection. 689 | Returns: 690 | x: Tensor output. 
691 | """ 692 | # For residual 693 | x0 = x 694 | # Choose convolution function 695 | conv_fn = slim.conv2d_transpose if transpose else slim.conv2d 696 | # Double output channels for gates 697 | num_outputs = channels * 2 if gated else channels 698 | normalizer_fn = slim.batch_norm if batch_norm else None 699 | 700 | with tf.variable_scope(scope + "_Layer"): 701 | # Apply a stack of convolutions Before adding residual 702 | for layer_idx in range(stacked_layers): 703 | with slim.arg_scope( 704 | slim_batchnorm_arg_scope(is_training, activation_fn=None)): 705 | # Use interpolation to upsample instead of conv_transpose 706 | if transpose and resize: 707 | unused_mb, h, w, unused_ch = x.get_shape().as_list() 708 | x = tf.image.resize_images( 709 | x, size=[h * stride[0], w * stride[1]], method=0) 710 | stride_conv = [1, 1] 711 | else: 712 | stride_conv = stride 713 | 714 | x = conv_fn( 715 | inputs=x, 716 | stride=stride_conv, 717 | kernel_size=kernel_size, 718 | num_outputs=num_outputs, 719 | normalizer_fn=normalizer_fn, 720 | biases_initializer=tf.zeros_initializer(), 721 | scope=scope) 722 | 723 | if gated: 724 | with tf.variable_scope("Gated"): 725 | x1, x2 = x[:, :, :, :channels], x[:, :, :, channels:] 726 | if activation_fn: 727 | x1, x2 = activation_fn(x1), tf.sigmoid(x2) 728 | else: 729 | x2 = tf.sigmoid(x2) 730 | x = x1 * x2 731 | 732 | # Apply residual to last layer before the last nonlinearity 733 | if residual and (layer_idx == stacked_layers - 1): 734 | with tf.variable_scope("Residual"): 735 | # Don't upsample residual in time 736 | if stride[0] == 1 and stride[1] == 1: 737 | channels_in = x0.get_shape().as_list()[-1] 738 | # Make n_channels match for residual 739 | if channels != channels_in: 740 | x0 = slim.conv2d( 741 | inputs=x0, 742 | stride=[1, 1], 743 | kernel_size=[1, 1], 744 | num_outputs=channels, 745 | normalizer_fn=None, 746 | activation_fn=None, 747 | biases_initializer=tf.zeros_initializer, 748 | scope=scope + "_residual") 749 | x += x0 750 | else: 751 | x += x0 752 | if activation_fn and not gated: 753 | x = activation_fn(x) 754 | return x 755 | 756 | 757 | def leaky_relu(leak=0.1): 758 | """Leaky ReLU activation function. 759 | Args: 760 | leak: float. Slope for the negative part of the leaky ReLU function. 761 | Defaults to 0.1. 762 | Returns: 763 | A lambda computing the leaky ReLU function with the specified slope. 764 | """ 765 | return lambda x: tf.maximum(x, leak * x) 766 | 767 | 768 | def causal_linear(x, n_inputs, n_outputs, name, filter_length, rate, 769 | batch_size): 770 | """Applies dilated convolution using queues. 771 | Assumes a filter_length of 3. 772 | Args: 773 | x: The [mb, time, channels] tensor input. 774 | n_inputs: The input number of channels. 775 | n_outputs: The output number of channels. 776 | name: The variable scope to provide to W and biases. 777 | filter_length: The length of the convolution, assumed to be 3. 778 | rate: The rate or dilation 779 | batch_size: Non-symbolic value for batch_size. 
780 | Returns: 781 | y: The output of the operation 782 | (init_1, init_2): Initialization operations for the queues 783 | (push_1, push_2): Push operations for the queues 784 | """ 785 | assert filter_length == 3 786 | 787 | # create queue 788 | q_1 = tf.FIFOQueue(rate, dtypes=tf.float32, shapes=(batch_size, 1, n_inputs)) 789 | q_2 = tf.FIFOQueue(rate, dtypes=tf.float32, shapes=(batch_size, 1, n_inputs)) 790 | init_1 = q_1.enqueue_many(tf.zeros((rate, batch_size, 1, n_inputs))) 791 | init_2 = q_2.enqueue_many(tf.zeros((rate, batch_size, 1, n_inputs))) 792 | state_1 = q_1.dequeue() 793 | push_1 = q_1.enqueue(x) 794 | state_2 = q_2.dequeue() 795 | push_2 = q_2.enqueue(state_1) 796 | 797 | # get pretrained weights 798 | w = tf.get_variable( 799 | name=name + "/W", 800 | shape=[1, filter_length, n_inputs, n_outputs], 801 | dtype=tf.float32) 802 | b = tf.get_variable( 803 | name=name + "/biases", shape=[n_outputs], dtype=tf.float32) 804 | w_q_2 = tf.slice(w, [0, 0, 0, 0], [-1, 1, -1, -1]) 805 | w_q_1 = tf.slice(w, [0, 1, 0, 0], [-1, 1, -1, -1]) 806 | w_x = tf.slice(w, [0, 2, 0, 0], [-1, 1, -1, -1]) 807 | 808 | # perform op w/ cached states 809 | y = tf.nn.bias_add( 810 | tf.matmul(state_2[:, 0, :], w_q_2[0][0]) + tf.matmul( 811 | state_1[:, 0, :], w_q_1[0][0]) + tf.matmul(x[:, 0, :], w_x[0][0]), b) 812 | 813 | y = tf.expand_dims(y, 1) 814 | return y, (init_1, init_2), (push_1, push_2) 815 | 816 | 817 | def linear(x, n_inputs, n_outputs, name): 818 | """Simple linear layer. 819 | Args: 820 | x: The [mb, time, channels] tensor input. 821 | n_inputs: The input number of channels. 822 | n_outputs: The output number of channels. 823 | name: The variable scope to provide to W and biases. 824 | Returns: 825 | y: The output of the operation. 826 | """ 827 | w = tf.get_variable( 828 | name=name + "/W", shape=[1, 1, n_inputs, n_outputs], dtype=tf.float32) 829 | b = tf.get_variable( 830 | name=name + "/biases", shape=[n_outputs], dtype=tf.float32) 831 | y = tf.nn.bias_add(tf.matmul(x[:, 0, :], w[0][0]), b) 832 | y = tf.expand_dims(y, 1) 833 | return y -------------------------------------------------------------------------------- /nsynth/torch_readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic_dataset import * 2 | from .dataset_h5py import * 3 | from .dataset_tfrecord import * -------------------------------------------------------------------------------- /nsynth/torch_readers/basic_dataset.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | 5 | from misc.basic_dataset import BasicDataset 6 | from misc.utils import LabelsEncoder, LabelsToOneHot 7 | from nsynth.constants import * 8 | 9 | 10 | class NSynthBasicDataset(BasicDataset): 11 | def __init__(self, 12 | dataset_path, 13 | transforms, 14 | sr, 15 | signal_length=2 ** 16, 16 | precision=np.float32, 17 | one_hot_all=False, 18 | one_hot_pitch=False, 19 | one_hot_velocity=False, 20 | one_hot_instr_src=False, 21 | one_hot_instr_family=False, 22 | encode_cat=False, 23 | in_memory=True): 24 | super(NSynthBasicDataset, self).__init__(transforms, sr, signal_length, precision, 25 | one_hot_all, encode_cat, in_memory) 26 | self.one_hot_pitch = one_hot_pitch 27 | self.one_hot_velocity = one_hot_velocity 28 | self.one_hot_instr_src = one_hot_instr_src 29 | self.one_hot_instr_family = one_hot_instr_family 30 | 31 | self.audio = [] 32 | self.pitch = [] 33 | self.velocity = [] 34 | self.instr_src = 
[] 35 | self.instr_fml = [] 36 | self.qualities = [] 37 | 38 | self.read_file(dataset_path) 39 | 40 | print(self.pitch) 41 | self.n = self.pitch.shape[0] 42 | 43 | if self.encode_cat: 44 | self.pitch_encoder = LabelsEncoder(self.pitch) 45 | self.velocity_encoder = LabelsEncoder(self.velocity) 46 | self.instr_src_encoder = LabelsEncoder(self.instr_src) 47 | self.instr_fml_encoder = LabelsEncoder(self.instr_fml) 48 | 49 | self.pitch = self.pitch_encoder(self.pitch) 50 | self.velocity = self.velocity_encoder(self.velocity) 51 | self.instr_src = self.instr_src_encoder(self.instr_src) 52 | self.instr_fml = self.instr_fml_encoder(self.instr_fml) 53 | 54 | if self.one_hot_pitch or self.one_hot_all: 55 | self.pitch_one_hot = LabelsToOneHot(self.pitch) 56 | else: 57 | self.pitch_one_hot = None 58 | 59 | if self.one_hot_velocity or self.one_hot_all: 60 | self.velocity_one_hot = LabelsToOneHot(self.velocity) 61 | else: 62 | self.velocity_one_hot = None 63 | 64 | if self.one_hot_instr_src or self.one_hot_all: 65 | self.instr_src_one_hot = LabelsToOneHot(self.instr_src) 66 | else: 67 | self.instr_src_one_hot = None 68 | 69 | if self.one_hot_instr_family or self.one_hot_all: 70 | self.instr_fml_one_hot = LabelsToOneHot(self.instr_fml) 71 | else: 72 | self.instr_fml_one_hot = None 73 | 74 | @abstractmethod 75 | def read_file(self, dataset_path): 76 | pass 77 | 78 | @abstractmethod 79 | def read_elem(self, index): 80 | return None 81 | 82 | def instance_dataset(self, dataset_path, transforms, in_memory): 83 | new_dataset = self.__class__(dataset_path, 84 | transforms, 85 | sr=self.sr, 86 | signal_length=self.signal_length, 87 | precision=self.precision, 88 | one_hot_all=False, 89 | one_hot_pitch=False, 90 | one_hot_velocity=False, 91 | one_hot_instr_src=False, 92 | one_hot_instr_family=False, 93 | encode_cat=False, 94 | in_memory=in_memory 95 | ) 96 | 97 | new_dataset.one_hot_all = self.one_hot_all 98 | if self.one_hot_pitch or self.one_hot_all: 99 | new_dataset.one_hot_pitch = True 100 | new_dataset.pitch_one_hot = self.pitch_one_hot 101 | 102 | if self.one_hot_velocity or self.one_hot_all: 103 | new_dataset.one_hot_velocity = True 104 | new_dataset.velocity_one_hot = self.velocity_one_hot 105 | 106 | if self.one_hot_instr_src or self.one_hot_all: 107 | new_dataset.one_hot_instr_src = True 108 | new_dataset.instr_src_one_hot = self.instr_src_one_hot 109 | 110 | if self.one_hot_instr_family or self.one_hot_all: 111 | new_dataset.one_hot_instr_family = True 112 | new_dataset.instr_fml_one_hot = self.instr_fml_one_hot 113 | 114 | new_dataset.encode_cat = self.encode_cat 115 | if self.encode_cat: 116 | new_dataset.pitch_encoder = self.pitch_encoder 117 | new_dataset.velocity_encoder = self.velocity_encoder 118 | new_dataset.instr_src_encoder = self.instr_src_encoder 119 | new_dataset.instr_fml_encoder = self.instr_fml_encoder 120 | 121 | return new_dataset 122 | 123 | def __getitem__(self, index): 124 | audio, pitch, velocity, instrument_source, instrument_family, qualities = self.read_elem(index) 125 | 126 | audio = self.do_transform(audio) 127 | 128 | if self.encode_cat and not self.in_memory: 129 | pitch = self.pitch_encoder(pitch) 130 | velocity = self.velocity_encoder(velocity) 131 | instrument_source = self.instr_src_encoder(instrument_source) 132 | instrument_family = self.instr_fml_encoder(instrument_family) 133 | 134 | if self.one_hot_pitch or self.one_hot_all: 135 | pitch = self.pitch_one_hot(pitch) 136 | if self.one_hot_velocity or self.one_hot_all: 137 | velocity = self.velocity_one_hot(velocity) 
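        # Note: each LabelsToOneHot call above and below returns a
        # (1, num_classes) numpy array for the single label of this item.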
138 | if self.one_hot_instr_src or self.one_hot_all: 139 | instrument_source = self.instr_src_one_hot(instrument_source) 140 | if self.one_hot_instr_family or self.one_hot_all: 141 | instrument_family = self.instr_fml_one_hot(instrument_family) 142 | 143 | return {AUDIO: audio, PITCH: pitch, VELOCITY: velocity, 144 | INSTR_SRC: instrument_source, INSTR_FAMILY: instrument_family, QUALITIES: qualities} 145 | 146 | 147 | if __name__ == "__main__": 148 | pass 149 | -------------------------------------------------------------------------------- /nsynth/torch_readers/dataloader_tfrecord.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | 3 | from nsynth import * 4 | 5 | 6 | class NSynthTFRecordDataLoader(DataLoader): 7 | def __init__(self, dataset, **kwargs): 8 | super(NSynthTFRecordDataLoader, self).__init__(dataset, **kwargs) 9 | 10 | def __iter__(self): 11 | audio = [] 12 | pitch = [] 13 | velocity = [] 14 | instr_src = [] 15 | instr_fml = [] 16 | qualities = [] 17 | 18 | for idx in range(len(self.dataset)): 19 | elem = self.dataset[idx] 20 | audio.append(elem[AUDIO]) 21 | pitch.append(elem[PITCH]) 22 | velocity.append(elem[VELOCITY]) 23 | instr_src.append(elem[INSTR_SRC]) 24 | instr_fml.append(elem[INSTR_FAMILY]) 25 | qualities.append(elem[QUALITIES]) 26 | 27 | if (idx + 1) % self.batch_size == 0: 28 | yield {AUDIO: np.vstack(audio), PITCH: np.hstack(pitch), 29 | VELOCITY: np.hstack(velocity), INSTR_SRC: np.hstack(instr_src), 30 | INSTR_FAMILY: np.hstack(instr_fml), QUALITIES: np.hstack(qualities)} 31 | 32 | audio.clear() 33 | pitch.clear() 34 | velocity.clear() 35 | instr_src.clear() 36 | instr_fml.clear() 37 | qualities.clear() 38 | 39 | return {AUDIO: np.vstack(audio), PITCH: np.hstack(pitch), 40 | VELOCITY: np.hstack(velocity), INSTR_SRC: np.hstack(instr_src), 41 | INSTR_FAMILY: np.hstack(instr_fml), QUALITIES: np.hstack(qualities)} 42 | 43 | 44 | class NSynthTFRecordTestDataLoader(DataLoader): 45 | def __init__(self, dataset, **kwargs): 46 | kwargs['batch_size'] = 1 47 | super(NSynthTFRecordTestDataLoader, self).__init__(dataset, **kwargs) 48 | 49 | def __iter__(self): 50 | for idx in range(len(self.dataset)): 51 | elem = self.dataset[idx] 52 | result = {AUDIO: elem[AUDIO], PITCH: elem[PITCH], 53 | VELOCITY: elem[VELOCITY], INSTR_SRC: elem[INSTR_SRC], 54 | INSTR_FAMILY: elem[INSTR_FAMILY], QUALITIES: elem[QUALITIES]} 55 | yield result 56 | 57 | 58 | if __name__ == "__main__": 59 | from misc.transforms import get_train_transform, get_test_transform 60 | 61 | params = {'batch_size': 64, 62 | 'shuffle': False, 63 | 'num_workers': 1} 64 | 65 | train_transforms = get_train_transform(length=2 ** 14) 66 | dataset = NSynthTFRecordDataset("../nsynth-test.tfrecord", 67 | one_hot_pitch=True, 68 | encode_cat=True, 69 | transforms=train_transforms, 70 | sr=16000, 71 | in_memory=False) 72 | test_generator = NSynthTFRecordDataLoader(dataset, **params) 73 | for batch in test_generator: 74 | print(batch['audio'].shape) 75 | print(batch) 76 | break 77 | 78 | print("--------------------------") 79 | params = {'batch_size': 64, 80 | 'shuffle': False, 81 | 'num_workers': 1} 82 | 83 | test_transforms = get_test_transform(length=2 ** 14) 84 | dataset = NSynthTFRecordDataset("../nsynth-test.tfrecord", 85 | one_hot_pitch=True, 86 | encode_cat=True, 87 | transforms=test_transforms, 88 | sr=16000, 89 | in_memory=False) 90 | test_generator = NSynthTFRecordTestDataLoader(dataset, **params) 91 | for batch in test_generator: 92 | 
        print(batch['audio'].shape)
        print(batch)
        break

--------------------------------------------------------------------------------
/nsynth/torch_readers/dataset_h5py.py:
--------------------------------------------------------------------------------
import h5py

import numpy as np

from nsynth.constants import *
from nsynth.torch_readers.basic_dataset import NSynthBasicDataset


class NSynthH5PyDataset(NSynthBasicDataset):
    def __init__(self,
                 dataset_path,
                 transforms,
                 sr,
                 signal_length=2 ** 16,
                 precision=np.float32,
                 one_hot_all=False,
                 one_hot_pitch=False,
                 one_hot_velocity=False,
                 one_hot_instr_src=False,
                 one_hot_instr_family=False,
                 encode_cat=False,
                 in_memory=True):
        super(NSynthH5PyDataset, self).__init__(dataset_path,
                                                transforms,
                                                sr,
                                                signal_length=signal_length,
                                                precision=precision,
                                                one_hot_all=one_hot_all,
                                                one_hot_pitch=one_hot_pitch,
                                                one_hot_velocity=one_hot_velocity,
                                                one_hot_instr_src=one_hot_instr_src,
                                                one_hot_instr_family=one_hot_instr_family,
                                                encode_cat=encode_cat,
                                                in_memory=in_memory)
        self.hpy_file = None

    def read_file(self, dataset_path):
        f = h5py.File(dataset_path, 'r')
        self.pitch = f[PITCH][:]
        self.velocity = f[VELOCITY][:]
        self.instr_src = f[INSTR_SRC][:]
        self.instr_fml = f[INSTR_FAMILY][:]
        self.qualities = f[QUALITIES][:]
        if self.in_memory:
            self.audio = f[AUDIO][:]
            f.close()
        else:
            self.hpy_file = f
            self.audio = f[AUDIO]

    def __exit__(self, exc_type, exc_value, traceback):
        if self.hpy_file is not None:
            self.hpy_file.close()

    def read_elem(self, index):
        audio, pitch, velocity = self.audio[index], self.pitch[index], self.velocity[index]
        instrument_source, instrument_family = self.instr_src[index], self.instr_fml[index]
        qualities = self.qualities[index]

        return audio, pitch, velocity, instrument_source, instrument_family, qualities


if __name__ == "__main__":
    from misc.transforms import get_train_transform
    from torch.utils import data

    train_transforms = get_train_transform(length=2 ** 14)
    dataset = NSynthH5PyDataset("../nsynth-test.hdf5",
                                one_hot_pitch=True,
                                encode_cat=True,
                                transforms=train_transforms,
                                sr=16000,
                                in_memory=True)
    print("Dataset Len", len(dataset))
    print("item 0", dataset[0])

    dataset = dataset.instance_dataset("../nsynth-test.hdf5", train_transforms, False)

    params = {'batch_size': 64,
              'shuffle': True,
              'num_workers': 1}
    training_generator = data.DataLoader(dataset, **params)

    for batch in training_generator:
        print(batch[AUDIO].shape)
        print(batch[PITCH])
        break

--------------------------------------------------------------------------------
/nsynth/torch_readers/dataset_tfrecord.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf  # explicit import: tf.Session and tf.errors are used below, otherwise tf is only reachable through the wildcard import

from misc import configure_tf_dataset, itarate_over_tfrecord
from nsynth.torch_readers.basic_dataset import NSynthBasicDataset
from nsynth.utils import *


class NSynthTFRecordDataset(NSynthBasicDataset):
    def __init__(self,
                 dataset_path,
                 transforms,
                 sr,
                 signal_length=2 ** 16,
                 precision=np.float32,
                 one_hot_all=False,
                 one_hot_pitch=False,
                 one_hot_velocity=False,
                 one_hot_instr_src=False,
                 one_hot_instr_family=False,
                 encode_cat=False,
                 in_memory=True,
                 batch_size=1,
                 repeat=1,
                 buffer_size=10):
        # self.sess = tf.Session()
        self.sess = None
        self.iterator = None

        self.dataset = configure_tf_dataset(nsynth_extract_features, batch_size, buffer_size, dataset_path, repeat)

        super(NSynthTFRecordDataset, self).__init__(dataset_path,
                                                    transforms,
                                                    sr,
                                                    signal_length=signal_length,
                                                    precision=precision,
                                                    one_hot_all=one_hot_all,
                                                    one_hot_pitch=one_hot_pitch,
                                                    one_hot_velocity=one_hot_velocity,
                                                    one_hot_instr_src=one_hot_instr_src,
                                                    one_hot_instr_family=one_hot_instr_family,
                                                    encode_cat=encode_cat,
                                                    in_memory=in_memory)

    def read_file(self, dataset_path):
        iter = self.dataset.make_one_shot_iterator()
        self.n = 0
        for audio, pitch, velocity, instrument_source, instrument_family, qualities in itarate_over_tfrecord(iter):
            # Keep the decoded audio only when the whole dataset is held in memory;
            # read_elem() indexes self.audio in that case and falls back to the TF session otherwise.
            if self.in_memory:
                self.audio.append(audio)
            self.n += 1
            self.pitch.append(pitch)
            self.velocity.append(velocity)
            self.instr_src.append(instrument_source)
            self.instr_fml.append(instrument_family)
            self.qualities.append(qualities)

        self.audio = np.array(self.audio)
        self.pitch = np.array(self.pitch)
        self.velocity = np.array(self.velocity)
        self.instr_src = np.array(self.instr_src)
        self.instr_fml = np.array(self.instr_fml)
        self.qualities = np.array(self.qualities)

        if not self.in_memory:
            self.sess = tf.Session()

    def __exit__(self, exc_type, exc_value, traceback):
        if self.sess is not None:
            self.sess.close()

    def read_elem(self, index):
        if self.in_memory:
            audio, pitch, velocity = self.audio[index], self.pitch[index], self.velocity[index]
            instrument_source, instrument_family = self.instr_src[index], self.instr_fml[index]
            qualities = self.qualities[index]
        else:
            if self.iterator is None:
                self.iterator = self.dataset.make_one_shot_iterator()
            try:
                audio, pitch, velocity, instrument_source, instrument_family, qualities = self.sess.run(
                    self.iterator.get_next())
            except tf.errors.OutOfRangeError:
                self.iterator = self.dataset.make_one_shot_iterator()
                audio, pitch, velocity, instrument_source, instrument_family, qualities = self.sess.run(
                    self.iterator.get_next())

        return audio, pitch, velocity, instrument_source, instrument_family, qualities


if __name__ == "__main__":
    from misc.transforms import get_train_transform

    train_transforms = get_train_transform(length=2 ** 14)
    dataset = NSynthTFRecordDataset("../nsynth-test.tfrecord",
                                    one_hot_pitch=True,
                                    encode_cat=True,
                                    transforms=train_transforms,
                                    sr=16000,
                                    in_memory=False)
    print("Dataset Len", len(dataset))
    print("item 0", dataset[0])

    dataset = dataset.instance_dataset("../nsynth-test.tfrecord", train_transforms, False)

    print("Dataset Len", len(dataset))
    print("item 0", dataset[0])

--------------------------------------------------------------------------------
/nsynth/utils.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from nsynth.constants import *


def nsynth_extract_features(example):
    features = {
        NOTE_STR: tf.FixedLenFeature([], dtype=tf.string),
        PITCH: tf.FixedLenFeature([1], dtype=tf.int64),
        VELOCITY:
            tf.FixedLenFeature([1], dtype=tf.int64),
        AUDIO: tf.FixedLenFeature([64000], dtype=tf.float32),
        QUALITIES: tf.FixedLenFeature([10], dtype=tf.int64),
        INSTR_SRC: tf.FixedLenFeature([1], dtype=tf.int64),
        INSTR_FAMILY: tf.FixedLenFeature([1], dtype=tf.int64),
    }

    parsed_example = tf.parse_single_example(example, features)

    audio = tf.reshape(tf.cast(parsed_example[AUDIO], tf.float32), [1, 64000])
    pitch = tf.cast(parsed_example[PITCH], tf.int64)
    velocity = tf.cast(parsed_example[VELOCITY], tf.int64)
    instrument_source = tf.cast(parsed_example[INSTR_SRC], tf.int64)
    instrument_family = tf.cast(parsed_example[INSTR_FAMILY], tf.int64)
    qualities = tf.reshape(tf.cast(parsed_example[QUALITIES], tf.int64), [1, 10])
    return audio, pitch, velocity, instrument_source, instrument_family, qualities
--------------------------------------------------------------------------------
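As a quick illustration of how `nsynth_extract_features` plugs into an input pipeline, here is a minimal sketch that is not part of the repository: the batch size is arbitrary, the TFRecord path only mirrors the test paths used in the `__main__` blocks above, and it uses the same TF 1.x graph-mode API as the readers. `misc.configure_tf_dataset` is assumed to build a similar pipeline internally.

```python
import tensorflow as tf

from nsynth.utils import nsynth_extract_features

# Parse each serialized Example with nsynth_extract_features, then batch the result.
dataset = tf.data.TFRecordDataset("../nsynth-test.tfrecord")
dataset = dataset.map(nsynth_extract_features)
dataset = dataset.batch(4)

iterator = dataset.make_one_shot_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
    audio, pitch, velocity, instr_src, instr_fml, qualities = sess.run(next_batch)
    # With the parser above: audio (4, 1, 64000), pitch (4, 1), qualities (4, 1, 10)
    print(audio.shape, pitch.shape, qualities.shape)
```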