├── lib
│   ├── data
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── audio.py
│   │   ├── scd.py
│   │   ├── batch.py
│   │   ├── dcs.py
│   │   └── mtt.py
│   ├── model_config.py
│   ├── utils.py
│   ├── initialization.py
│   └── model.py
├── eval.py
├── .gitignore
├── train.py
└── README.md

/lib/data/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import config
 2 | from . import mtt
 3 | from . import scd
 4 | from . import dcs
 5 | from . import audio
 6 | 
--------------------------------------------------------------------------------
/lib/model_config.py:
--------------------------------------------------------------------------------
 1 | from lib.model import *
 2 | 
 3 | 
 4 | class ModelConfig:
 5 |     """
 6 |     The default setting is for MTT with se-multi.
 7 |     """
 8 | 
 9 |     def __init__(self, block='se', multi=True, num_blocks=9, init_features=128, num_convs=1,
10 |                  amplifying_ratio=0.125, dropout=0.5, activation='sigmoid', num_classes=50, weight_decay=0.):
11 | 
12 |         # Configure block specific settings.
13 |         if block == 'basic':
14 |             block_fn = basic_block
15 |         elif block.startswith('rese'):
16 |             num_convs = int(block[-1])
17 |             block_fn = rese_block
18 |         elif block.startswith('res'):
19 |             num_convs = int(block[-1])
20 |             amplifying_ratio = None
21 |             block_fn = rese_block
22 |         elif block == 'se':
23 |             block_fn = se_block
24 |         else:
25 |             raise Exception(f'Unknown block name: {block}')
26 | 
27 |         # Overall architecture configurations.
28 |         self.multi = multi
29 |         self.init_features = init_features
30 | 
31 |         # Block configurations.
32 |         self.block = block
33 |         self.block_fn = block_fn
34 |         self.num_blocks = num_blocks
35 |         self.amplifying_ratio = amplifying_ratio
36 |         self.num_convs = num_convs
37 | 
38 |         # Training related configurations.
39 |         self.dropout = dropout
40 |         self.activation = activation
41 |         self.num_classes = num_classes
42 |         self.weight_decay = weight_decay
43 | 
44 |     def get_signature(self):
45 |         s = self.block
46 |         if self.multi:
47 |             s += '_multi'
48 |         return s
49 | 
50 |     def print_summary(self):
51 |         print(f'''=> {self.get_signature()} properties:
52 |     block            : {self.block}
53 |     multi            : {self.multi}
54 |     num_blocks       : {self.num_blocks}
55 |     amplifying_ratio : {self.amplifying_ratio}
56 |     dropout          : {self.dropout}
57 |     activation       : {self.activation}
58 |     num_classes      : {self.num_classes}''')
59 | 
--------------------------------------------------------------------------------
/lib/data/config.py:
--------------------------------------------------------------------------------
 1 | class DatasetConfig:
 2 | 
 3 |     def __init__(self, num_blocks, num_samples, sr, len_audio, num_audios_per_shard,
 4 |                  num_classes, loss, metrics, activation, mean, std,
 5 |                  num_train_audios, num_test_audios, num_val_audios, threshold=0.5):
 6 |         self.num_blocks = num_blocks
 7 |         self.num_samples = num_samples
 8 |         self.sr = sr
 9 |         self.len_audio = len_audio
10 |         self.num_segments = len_audio * sr // num_samples
11 |         self.num_audios_per_shard = num_audios_per_shard
12 | 
13 |         self.num_train_audios = num_train_audios
14 |         self.num_val_audios = num_val_audios
15 |         self.num_test_audios = num_test_audios
16 |         self.num_train_segs = num_train_audios * self.num_segments
17 |         self.num_val_segs = num_val_audios * self.num_segments
18 |         self.num_test_segs = num_test_audios * self.num_segments
19 | 
20 |         self.num_classes = num_classes
21 |         self.loss = loss
22 |         self.metrics = metrics
23 |         self.activation = activation
24 |         self.threshold = threshold
25 | 
26 |         self.mean = mean
27 |         self.std = std
28 | 
29 | 
30 | MTT_CONFIG = 
DatasetConfig(num_blocks=9, num_samples=59049, sr=22050, len_audio=29, num_audios_per_shard=100,
31 |                           num_train_audios=15250, num_val_audios=1529, num_test_audios=4332,
32 |                           loss='binary_crossentropy', metrics=None, activation='sigmoid', num_classes=50,
33 |                           mean=-0.0001650025078561157, std=0.1551193743944168)
34 | 
35 | SCD_CONFIG = DatasetConfig(num_blocks=8, num_samples=22050, sr=22050, len_audio=1, num_audios_per_shard=1000,
36 |                            num_train_audios=84843, num_val_audios=9981, num_test_audios=11005,
37 |                            loss='categorical_crossentropy', metrics=['accuracy'], activation='softmax', num_classes=35,
38 |                            mean=-8.520474e-05, std=0.18)
39 | 
40 | DCS_CONFIG = DatasetConfig(num_blocks=8, num_samples=22050, sr=22050, len_audio=10, num_audios_per_shard=300,
41 |                            num_train_audios=46042, num_val_audios=5618, num_test_audios=1103,
42 |                            loss='binary_crossentropy', metrics=['accuracy'], activation='sigmoid', num_classes=17,
43 |                            mean=-0.0003320679534226656, std=0.20514629781246185, threshold=0.1)
44 | 
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def mkpath(*paths):
 5 |     """Join path components and return the resolved absolute path."""
 6 |     path = os.path.join(*[str(path) for path in paths])
 7 |     path = os.path.realpath(path)
 8 |     return path
 9 | 
10 | # def apk(actual, predicted, k=3):
11 | #     """
12 | #     Computes the average precision at k.
13 | #     This function computes the average precision at k between two lists of
14 | #     items.
15 | #     Parameters
16 | #     ----------
17 | #     actual : list
18 | #         A list of elements that are to be predicted (order doesn't matter)
19 | #     predicted : list
20 | #         A list of predicted elements (order does matter)
21 | #     k : int, optional
22 | #         The maximum number of predicted elements
23 | #     Returns
24 | #     -------
25 | #     score : double
26 | #         The average precision at k over the input lists
27 | #
28 | #     Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
29 | #     """
30 | #     if len(predicted) > k:
31 | #         predicted = predicted[:k]
32 | #
33 | #     score = 0.0
34 | #     num_hits = 0.0
35 | #
36 | #     for i, p in enumerate(predicted):
37 | #         if p in actual and p not in predicted[:i]:
38 | #             num_hits += 1.0
39 | #             score += num_hits / (i + 1.0)
40 | #
41 | #     if not actual:
42 | #         return 0.0
43 | #
44 | #     return score / min(len(actual), k)
45 | #
46 | #
47 | # def mapk(y_true, y_pred, k=3):
48 | #     """
49 | #     Computes the mean average precision at k.
50 | #     This function computes the mean average precision at k between two lists
51 | #     of lists of items.
52 | #     Parameters
53 | #     ----------
54 | #     y_true : list
55 | #         A list of lists of elements that are to be predicted
56 | #         (order doesn't matter in the lists)
57 | #     y_pred : list
58 | #         A list of lists of predicted elements
59 | #         (order matters in the lists)
60 | #     k : int, optional
61 | #         The maximum number of predicted elements
62 | #     Returns
63 | #     -------
64 | #     score : double
65 | #         The mean average precision at k over the input lists
66 | #
67 | #     Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
68 | #     """
69 | #     actual = np.argmax(y_true, axis=1).reshape(-1, 1).tolist()
70 | #     pred_topk = np.argsort(y_pred, axis=1)[:, ::-1][:, :k].tolist()
71 | #     return np.mean([apk(a, p, k) for a, p in zip(actual, pred_topk)])
72 | 
--------------------------------------------------------------------------------
/lib/data/audio.py:
--------------------------------------------------------------------------------
 1 | import librosa
 2 | import tensorflow as tf
 3 | import numpy as np
 4 | 
 5 | 
 6 | def to_tfrecord_examples(row, config, sequence):
 7 |     audio_path, label = row['path'], row['label']
 8 |     sr, num_samples, num_segments, len_audio = config.sr, config.num_samples, config.num_segments, config.len_audio
 9 | 
10 |     audio = load_audio(audio_path, sr, len_audio)
11 |     segments = [audio[i * num_samples:(i + 1) * num_samples] for i in range(num_segments)]
12 | 
13 |     if sequence:
14 |         examples = [segments_to_sequence_example(segments, label)]
15 |     else:
16 |         examples = [segment_to_example(segment, label) for segment in segments]
17 | 
18 |     return examples
19 | 
20 | 
21 | def segment_to_example(segment, label):
22 |     raw_segment = np.array(segment, dtype=np.float32).reshape(-1).tobytes()
23 |     raw_label = np.array(label, dtype=np.uint8).reshape(-1).tobytes()
24 | 
25 |     example = tf.train.Example(features=tf.train.Features(feature={
26 |         'label': bytes_feature(raw_label),     # array: dtype=uint8, shape=(num_classes,)
27 |         'segment': bytes_feature(raw_segment)  # array: dtype=float32, shape=(num_samples,)
28 |     }))
29 | 
30 |     return example
31 | 
32 | 
33 | def segments_to_sequence_example(segments, label):
34 |     raw_segments = [np.array(segment, dtype=np.float32).reshape(-1).tobytes() for segment in segments]
35 |     raw_label = np.array(label, dtype=np.uint8).reshape(-1).tobytes()
36 | 
37 |     sequence_example = tf.train.SequenceExample(
38 |         context=tf.train.Features(feature={
39 |             'label': bytes_feature(raw_label)  # uint8 Tensor (50,)
40 |         }),
41 |         feature_lists=tf.train.FeatureLists(feature_list={
42 |             'segments': bytes_feature_list(raw_segments)  # list of float32 Tensor (num_samples,)
43 |         }))
44 | 
45 |     return sequence_example
46 | 
47 | 
48 | def load_audio(path, sr, len_audio):
49 |     audio, _ = librosa.load(path, sr=sr, mono=True, duration=len_audio, dtype=np.float32, res_type='kaiser_best')
50 | 
51 |     total_samples = sr * len_audio
52 |     if len(audio) < total_samples:
53 |         audio = np.tile(audio, total_samples // len(audio) + 1)[:total_samples]  # loop the clip (np.tile repeats the whole signal; np.repeat would duplicate each individual sample)
54 | 
55 |     return audio
56 | 
57 | 
58 | def bytes_feature(value):
59 |     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
60 | 
61 | 
62 | def bytes_feature_list(values):
63 |     return tf.train.FeatureList(feature=[bytes_feature(v) for v in values])
64 | 
65 | 
66 | def int64_feature(value):
67 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
68 | 
69 | 
70 | def float_feature(value):
71 |     return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
72 | 
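For reference, a minimal sketch of how the helpers above might be driven to write one TFRecord shard; `rows`, `config`, and `out_path` are hypothetical stand-ins for the rows produced by the per-dataset `make_dataset_info()` and the corresponding `DatasetConfig`, and `tf.python_io.TFRecordWriter` is the TF 1.x writer API:

```python
import tensorflow as tf
from lib.data.audio import to_tfrecord_examples

def write_shard(rows, config, out_path, sequence=False):
    # One Example per segment (train/val) or one SequenceExample per audio
    # (test), matching the parsers in lib/data/batch.py.
    with tf.python_io.TFRecordWriter(out_path) as writer:
        for row in rows:  # each row needs 'path' and 'label' keys
            for example in to_tfrecord_examples(row, config, sequence):
                writer.write(example.SerializeToString())
```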
--------------------------------------------------------------------------------
/lib/data/scd.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import tensorflow as tf
 4 | from glob import glob
 5 | from sklearn.utils import shuffle
 6 | from lib.utils import mkpath
 7 | 
 8 | CLASSES = ['backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy',
 9 |            'house', 'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six',
10 |            'stop', 'three', 'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero']
11 | IDX2NAME = {i: name for i, name in enumerate(CLASSES)}
12 | NAME2IDX = {name: i for i, name in enumerate(CLASSES)}
13 | 
14 | 
15 | def load_audio_paths(dataset_dir):
16 |     audio_paths = glob(mkpath(dataset_dir, 'raw/*/*.wav'))
17 |     noise_paths = glob(mkpath(dataset_dir, 'raw/_background_noise_/*.wav'))
18 | 
19 |     with open(mkpath(dataset_dir, 'raw/validation_list.txt')) as f:
20 |         val_paths = f.read().splitlines()
21 |         val_paths = [mkpath(dataset_dir, 'raw', path) for path in val_paths]
22 | 
23 |     with open(mkpath(dataset_dir, 'raw/testing_list.txt')) as f:
24 |         test_paths = f.read().splitlines()
25 |         test_paths = [mkpath(dataset_dir, 'raw', path) for path in test_paths]
26 | 
27 |     # Remove validation, test set, and noises from the training set.
28 |     train_paths = list(set(audio_paths) - set(val_paths) - set(test_paths) - set(noise_paths))
29 | 
30 |     # Sort paths.
31 |     train_paths.sort(); val_paths.sort(); test_paths.sort()
32 | 
33 |     return train_paths, val_paths, test_paths
34 | 
35 | 
36 | def make_dataset_info(dataset_dir, num_audios_per_shard):
37 |     train_paths, val_paths, test_paths = load_audio_paths(dataset_dir)
38 | 
39 |     paths = train_paths + val_paths + test_paths
40 |     ids = ['/'.join(p.split('/')[-2:]) for p in paths]
41 |     labels = [tf.keras.utils.to_categorical(NAME2IDX[id.split('/')[0]], num_classes=len(CLASSES)) for id in ids]
42 |     splits = ['train'] * len(train_paths) + ['val'] * len(val_paths) + ['test'] * len(test_paths)
43 | 
44 |     df = pd.DataFrame({'id': ids, 'label': labels, 'split': splits, 'path': paths})
45 | 
46 |     # Shuffle and shard.
47 |     df = shuffle(df, random_state=123)
48 |     for split in ['train', 'val', 'test']:
49 |         num_audios = sum(df['split'] == split)
50 |         num_shards = num_audios // num_audios_per_shard
51 |         num_remainders = num_audios % num_audios_per_shard
52 | 
53 |         shards = np.tile(np.arange(num_shards), num_audios_per_shard)
54 |         shards = np.concatenate([shards, np.arange(num_remainders) % num_shards])
55 |         shards = np.random.permutation(shards)
56 | 
57 |         df.loc[df['split'] == split, 'shard'] = shards
58 | 
59 |     df['shard'] = df['shard'].astype(int)
60 | 
61 |     return df
62 | 
--------------------------------------------------------------------------------
/lib/initialization.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import tensorflow as tf
 3 | from tensorflow.keras.initializers import Initializer
 4 | 
 5 | 
 6 | def _compute_audio_fans(shape):
 7 |     assert len(shape) == 3, 'This initialization is for Conv1D.'
 8 | 
 9 |     len_filter, in_channels, out_channels = shape
10 | 
11 |     receptive_field_size = len_filter * in_channels  # NOTE: originally this should be just len_filter!!
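    # A worked example (hypothetical shapes): a Conv1D kernel of shape
    # (len_filter=3, in_channels=128, out_channels=256) gives
    # receptive_field_size = 3 * 128 = 384, fan_in = 128 * 384 = 49152 and
    # fan_out = 256 * 384 = 98304, i.e. both fans are inflated by a factor of
    # in_channels compared to standard VarianceScaling, which uses len_filter alone.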
12 | fan_in = in_channels * receptive_field_size 13 | fan_out = out_channels * receptive_field_size 14 | 15 | return fan_in, fan_out 16 | 17 | 18 | class AudioVarianceScaling(Initializer): 19 | """VarianceScaling for Audio""" 20 | 21 | def __init__(self, 22 | scale=1.0, 23 | mode="fan_in", 24 | distribution="truncated_normal", 25 | seed=None, 26 | dtype=tf.float32): 27 | if scale <= 0.: 28 | raise ValueError("`scale` must be positive float.") 29 | if mode not in {"fan_in", "fan_out", "fan_avg"}: 30 | raise ValueError("Invalid `mode` argument:", mode) 31 | distribution = distribution.lower() 32 | if distribution not in {"uniform", "truncated_normal", "untruncated_normal"}: 33 | raise ValueError("Invalid `distribution` argument:", distribution) 34 | self.scale = scale 35 | self.mode = mode 36 | self.distribution = distribution 37 | self.seed = seed 38 | self.dtype = tf.as_dtype(dtype) 39 | 40 | def __call__(self, shape, dtype=None, partition_info=None): 41 | if dtype is None: 42 | dtype = self.dtype 43 | scale = self.scale 44 | scale_shape = shape 45 | if partition_info is not None: 46 | scale_shape = partition_info.full_shape 47 | fan_in, fan_out = _compute_audio_fans(scale_shape) 48 | if self.mode == "fan_in": 49 | scale /= max(1., fan_in) 50 | elif self.mode == "fan_out": 51 | scale /= max(1., fan_out) 52 | else: 53 | scale /= max(1., (fan_in + fan_out) / 2.) 54 | if self.distribution == "normal" or self.distribution == "truncated_normal": 55 | # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) 56 | stddev = math.sqrt(scale) / .87962566103423978 57 | return tf.truncated_normal( 58 | shape, 0.0, stddev, dtype, seed=self.seed) 59 | elif self.distribution == "untruncated_normal": 60 | stddev = math.sqrt(scale) 61 | return tf.random_normal( 62 | shape, 0.0, stddev, dtype, seed=self.seed) 63 | else: 64 | limit = math.sqrt(3.0 * scale) 65 | return tf.random_uniform( 66 | shape, -limit, limit, dtype, seed=self.seed) 67 | 68 | def get_config(self): 69 | return { 70 | "scale": self.scale, 71 | "mode": self.mode, 72 | "distribution": self.distribution, 73 | "seed": self.seed, 74 | "dtype": self.dtype.name 75 | } 76 | 77 | 78 | def taejun_uniform(scale=2., seed=None): 79 | return AudioVarianceScaling(scale=scale, mode='fan_in', distribution='uniform', seed=seed) 80 | -------------------------------------------------------------------------------- /lib/data/batch.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from glob import glob 3 | 4 | 5 | def tfrecord_parser(config): 6 | def parse_fn(example): 7 | features = tf.parse_single_example(example, features={ 8 | 'label': tf.FixedLenFeature([], tf.string), 9 | 'segment': tf.FixedLenFeature([], tf.string) 10 | }) 11 | 12 | segment = tf.decode_raw(features['segment'], tf.float32) 13 | segment = (segment - config.mean) / config.std # standardization 14 | segment = tf.expand_dims(segment, axis=-1) 15 | 16 | label = tf.decode_raw(features['label'], tf.uint8) 17 | label = tf.cast(label, tf.float32) 18 | 19 | return segment, label 20 | 21 | return parse_fn 22 | 23 | 24 | def tfrecord_parser_sequence(config): 25 | def parse_fn(sequence_example): 26 | context, sequence = tf.parse_single_sequence_example( 27 | sequence_example, 28 | context_features={ 29 | 'label': tf.FixedLenFeature([], tf.string) 30 | }, 31 | sequence_features={ 32 | 'segments': tf.FixedLenSequenceFeature([], tf.string) 33 | }) 34 | 35 | segments = tf.decode_raw(sequence['segments'], tf.float32) 36 | 
segments = (segments - config.mean) / config.std # standardization 37 | segments = tf.expand_dims(segments, axis=-1) 38 | 39 | label = tf.decode_raw(context['label'], tf.uint8) 40 | label = tf.cast(label, tf.float32) 41 | 42 | return segments, label 43 | 44 | return parse_fn 45 | 46 | 47 | def create_datasets(tfrecord_path, batch_size, num_readers, config, only_test=False): 48 | batch_size_test = max(1, batch_size // config.num_segments) 49 | filenames_test = glob(tfrecord_path + '/test-*.tfrecord') 50 | dataset_test = tf.data.TFRecordDataset(filenames_test) 51 | dataset_test = dataset_test.map(tfrecord_parser_sequence(config), num_parallel_calls=num_readers) 52 | dataset_test = dataset_test.batch(batch_size_test) 53 | dataset_test = dataset_test.prefetch(8 * batch_size_test) 54 | 55 | if only_test: 56 | return dataset_test 57 | else: 58 | filenames_train = glob(tfrecord_path + '/train-*.tfrecord') 59 | dataset_train = tf.data.TFRecordDataset(filenames_train) 60 | dataset_train = dataset_train.map(tfrecord_parser(config), num_parallel_calls=num_readers) 61 | dataset_train = dataset_train.shuffle(buffer_size=10000) 62 | dataset_train = dataset_train.batch(batch_size) 63 | dataset_train = dataset_train.repeat() 64 | dataset_train = dataset_train.prefetch(8 * batch_size) 65 | 66 | filenames_val = glob(tfrecord_path + '/val-*.tfrecord') 67 | dataset_val = tf.data.TFRecordDataset(filenames_val) 68 | dataset_val = dataset_val.map(tfrecord_parser(config), num_parallel_calls=num_readers) 69 | # NOTE: Do not shuffle validation set. 70 | dataset_val = dataset_val.batch(batch_size) 71 | dataset_val = dataset_val.repeat() 72 | dataset_val = dataset_val.prefetch(8 * batch_size) 73 | 74 | return dataset_train, dataset_val, dataset_test 75 | -------------------------------------------------------------------------------- /lib/data/dcs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.utils import shuffle 4 | from lib.utils import mkpath 5 | 6 | CLASSES = ['Train horn', 'Air horn, truck horn', 'Car alarm', 'Reversing beeps', 'Ambulance (siren)', 7 | 'Police car (siren)', 'Fire engine, fire truck (siren)', 'Civil defense siren', 'Screaming', 'Bicycle', 8 | 'Skateboard', 'Car', 'Car passing by', 'Bus', 'Truck', 'Motorcycle', 'Train'] 9 | 10 | C2I = {c: i for i, c in enumerate(CLASSES)} 11 | 12 | DIR_TRAIN = 'unbalanced_train_segments_training_set_audio_formatted_and_segmented_downloads' 13 | DIR_TEST = 'unbalanced_train_segments_testing_set_audio_formatted_and_segmented_downloads' 14 | DIR_EVAL = 'evaluation_set_formatted_audio_segments' 15 | 16 | 17 | def make_dataset_info(dataset_dir, num_audios_per_shard): 18 | df_train = read_csv(mkpath(dataset_dir, 'raw/groundtruth_weak_label_training_set.csv')) 19 | df_test = read_csv(mkpath(dataset_dir, 'raw/groundtruth_weak_label_testing_set.csv')) 20 | df_eval = read_csv(mkpath(dataset_dir, 'raw/groundtruth_weak_label_evaluation_set.csv')) 21 | 22 | df_train['path'] = [mkpath(dataset_dir, f'raw/{DIR_TRAIN}/Y{f}') for f in df_train['file']] 23 | df_test['path'] = [mkpath(dataset_dir, f'raw/{DIR_TEST}/Y{f}') for f in df_test['file']] 24 | df_eval['path'] = [mkpath(dataset_dir, f'raw/{DIR_EVAL}/Y{f}') for f in df_eval['file']] 25 | 26 | df_train = pd.concat([df_train, df_test]) 27 | 28 | # Split validation set. 
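    # Note: roughly 10% of the files of each class are sampled below; since
    # clips are multi-label, a file drawn for one class can also carry other
    # classes, hence the set() deduplication before building the mask.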
29 |     val_files = []
30 |     for c in CLASSES:
31 |         df_class = df_train[df_train['label'] == c]
32 |         val_files += df_class.sample(frac=0.1, random_state=123)['file'].tolist()
33 |     val_files = list(set(val_files))
34 | 
35 |     is_val = df_train['file'].isin(val_files)
36 |     df_val = df_train[is_val].assign(split='val')
37 |     df_train = df_train[~is_val].assign(split='train')
38 |     df_eval = df_eval.assign(split='test')
39 | 
40 |     df = pd.concat([df_train, df_val, df_eval])
41 | 
42 |     # Encode labels.
43 |     label = df.groupby('file')['label'].apply(list)
44 |     label.iloc[:] = [encode(l) for l in label]
45 |     label = label.to_frame().reset_index()
46 |     df = df.drop_duplicates('file').drop('label', axis=1).merge(label, on='file')
47 | 
48 |     # Shuffle and shard.
49 |     df = shuffle(df, random_state=123)
50 |     for split in ['train', 'val', 'test']:
51 |         num_audios = sum(df['split'] == split)
52 |         num_shards = num_audios // num_audios_per_shard
53 |         num_remainders = num_audios % num_audios_per_shard
54 | 
55 |         shards = np.tile(np.arange(num_shards), num_audios_per_shard)
56 |         shards = np.concatenate([shards, np.arange(num_remainders) % num_shards])
57 |         shards = np.random.permutation(shards)
58 | 
59 |         df.loc[df['split'] == split, 'shard'] = shards
60 | 
61 |     df['shard'] = df['shard'].astype(int)
62 | 
63 |     return df
64 | 
65 | 
66 | def read_csv(path):
67 |     df = pd.read_csv(path, delimiter='\t', names=['file', 'start', 'end', 'label'])
68 |     return df
69 | 
70 | 
71 | def encode(label):
72 |     x = np.zeros(shape=len(CLASSES), dtype=np.float32)
73 |     x[[C2I[l] for l in label]] = 1.
74 |     return x
75 | 
--------------------------------------------------------------------------------
/lib/data/mtt.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from sklearn.utils import shuffle
 4 | 
 5 | CLASSES = ['choral', 'female voice', 'metal', 'country', 'weird', 'no voice', 'cello', 'harp', 'beats', 'female vocal',
 6 |            'male voice', 'dance', 'new age', 'voice', 'choir', 'classic', 'man', 'solo', 'sitar', 'soft', 'pop',
 7 |            'no vocal', 'male vocal', 'woman', 'flute', 'quiet', 'loud', 'harpsichord', 'no vocals', 'vocals', 'singing',
 8 |            'male', 'opera', 'indian', 'female', 'synth', 'vocal', 'violin', 'beat', 'ambient', 'piano', 'fast', 'rock',
 9 |            'electronic', 'drums', 'strings', 'techno', 'slow', 'classical', 'guitar']
10 | 
11 | 
12 | def make_dataset_info(dataset_dir, num_audios_per_shard=100, num_top=50):
13 |     """Reads the annotation file, takes the top N tags, and splits the data samples.
14 | 
15 |     Results in 54 columns (top-50 tags + [clip_id, mp3_path, split, shard]):
16 | 
17 |     ['choral', 'female voice', 'metal', 'country', 'weird', 'no voice',
18 |      'cello', 'harp', 'beats', 'female vocal', 'male voice', 'dance',
19 |      'new age', 'voice', 'choir', 'classic', 'man', 'solo', 'sitar', 'soft',
20 |      'pop', 'no vocal', 'male vocal', 'woman', 'flute', 'quiet', 'loud',
21 |      'harpsichord', 'no vocals', 'vocals', 'singing', 'male', 'opera',
22 |      'indian', 'female', 'synth', 'vocal', 'violin', 'beat', 'ambient',
23 |      'piano', 'fast', 'rock', 'electronic', 'drums', 'strings', 'techno',
24 |      'slow', 'classical', 'guitar', 'clip_id', 'mp3_path', 'split', 'shard']
25 | 
26 |     NOTE: This will exclude audios which have only zero-tags. Therefore, the number of
27 |     audios in each split will be 15250 / 1529 / 4332 (training / validation / test).
28 | 
29 |     Args:
30 |       dataset_dir: A path to the dataset directory containing the annotation CSV file.
31 |       num_top: Number of the most popular tags to take.
32 |       num_audios_per_shard: Number of audios per shard.
33 | 
34 |     Returns:
35 |       A DataFrame containing information about the audios.
36 | 
37 |     Schema:
38 |       <tag>: 0 or 1
39 |       clip_id: clip_id of the original dataset
40 |       mp3_path: A path to a mp3 audio file.
41 |       split: A split of dataset (training / validation / test).
42 |         The split is determined by its directory (0, 1, ..., f).
43 |         First 12 directories (0 ~ b) are used for training,
44 |         1 (c) for validation, and 3 (d ~ f) for test.
45 |       shard: A shard index of the audio.
46 |     """
47 |     df = pd.read_csv(dataset_dir + '/raw/annotations_final.csv', delimiter='\t')
48 | 
49 |     # Calculate TOP 50 tags.
50 |     top50 = (df.drop(['clip_id', 'mp3_path'], axis=1)
51 |              .sum()
52 |              .sort_values()
53 |              .tail(num_top)
54 |              .index
55 |              .tolist())
56 | 
57 |     # Select TOP 50 columns.
58 |     df = df[top50 + ['clip_id', 'mp3_path']]
59 | 
60 |     # Select rows which have at least one label.
61 |     df = df.loc[df.iloc[:, :num_top].any(axis=1)]
62 | 
63 |     def split_by_directory(mp3_path):
64 |         directory = mp3_path.split('/')[0]
65 |         part = int(directory, 16)
66 | 
67 |         if part in range(12):
68 |             return 'train'
69 |         elif part == 12:  # `is` comparison on ints is a bug; use `==`.
70 |             return 'val'
71 |         elif part in range(13, 16):
72 |             return 'test'
73 | 
74 |     # Split by directories.
75 |     df['split'] = df['mp3_path'].apply(lambda mp3_path: split_by_directory(mp3_path))
76 | 
77 |     df = shuffle(df)
78 |     for split in ['train', 'val', 'test']:
79 |         num_audios = sum(df['split'] == split)
80 |         num_shards = num_audios // num_audios_per_shard
81 |         num_remainders = num_audios % num_audios_per_shard
82 | 
83 |         shards = np.tile(np.arange(num_shards), num_audios_per_shard)
84 |         shards = np.concatenate([shards, np.arange(num_remainders) % num_shards])
85 |         shards = np.random.permutation(shards)
86 | 
87 |         df.loc[df['split'] == split, 'shard'] = shards
88 | 
89 |     df['shard'] = df['shard'].astype(int)
90 | 
91 |     # To unified format.
92 |     paths = [f'{dataset_dir}/raw/mp3/{p}' for p in df['mp3_path']]
93 |     labels = [label for label in df.iloc[:, :num_top].values]
94 | 
95 |     df = pd.DataFrame({'id': df['clip_id'], 'label': labels, 'split': df['split'], 'shard': df['shard'], 'path': paths})
96 | 
97 |     return df
98 | 
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import numpy as np
 3 | import tensorflow as tf
 4 | import lib.data as data
 5 | from sklearn import metrics
 6 | from lib.data.batch import create_datasets
 7 | from lib.data.config import *
 8 | from lib.initialization import AudioVarianceScaling
 9 | from lib.utils import mkpath
10 | 
11 | 
12 | def main(args):
13 |     args.model_path = mkpath(args.model_path)
14 |     args.dataset = args.dataset or args.model_path.split('/')[-2].split('-')[1]  # extract dataset name from train_dir.
15 | 
16 |     if args.dataset == 'mtt':
17 |         config = MTT_CONFIG
18 |         classes = data.mtt.CLASSES
19 |     elif args.dataset == 'scd':
20 |         config = SCD_CONFIG
21 |         classes = data.scd.CLASSES
22 |     elif args.dataset == 'dcs':
23 |         config = DCS_CONFIG
24 |         classes = data.dcs.CLASSES
25 |     else:
26 |         raise Exception('Not implemented.')
27 | 
28 |     # Create training, validation, and test datasets.
29 |     dataset_path = mkpath(args.data_dir, args.dataset, 'tfrecord')
30 |     dataset_test = create_datasets(dataset_path, args.batch_size, args.num_readers, config, only_test=True)
31 | 
32 |     # Load the trained model.
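    # Keras can only rebuild objects it knows about, so the custom
    # AudioVarianceScaling initializer (and the bare `tf` module, in case any
    # saved layer references it) must be supplied through `custom_objects`.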
33 | model = tf.keras.models.load_model(args.model_path, 34 | custom_objects={'AudioVarianceScaling': AudioVarianceScaling, 'tf': tf}) 35 | 36 | # Evaluate 37 | evaluate(model, dataset_test, config, classes=classes) 38 | 39 | 40 | def evaluate(model, dataset_test, config, classes=None): 41 | # Create the iterator. 42 | iterator = dataset_test.make_one_shot_iterator() 43 | seg, label = iterator.get_next() 44 | 45 | # Get dynamic shapes. 46 | seg_shape = tf.shape(seg) 47 | batch_size, num_segments, num_samples = seg_shape[0], seg_shape[1], seg_shape[2] 48 | num_classes = tf.shape(label)[1] 49 | 50 | seg = tf.reshape(seg, shape=(batch_size * num_segments, num_samples, 1)) 51 | pred_segs = model(seg) # predict all segments 52 | pred_segs = tf.reshape(pred_segs, shape=(batch_size, num_segments, num_classes)) 53 | pred = tf.reduce_mean(pred_segs, axis=1) # Average segments for each audio 54 | 55 | y_true, y_prob = [], [] 56 | sess = tf.keras.backend.get_session() 57 | while True: 58 | try: 59 | label_batch, pred_batch = sess.run([label, pred], feed_dict={tf.keras.backend.learning_phase(): 0}) 60 | y_true.append(label_batch) 61 | y_prob.append(pred_batch) 62 | except tf.errors.OutOfRangeError: 63 | break 64 | 65 | y_true, y_prob = np.concatenate(y_true), np.concatenate(y_prob) 66 | rocauc = metrics.roc_auc_score(y_true, y_prob, average='macro') 67 | prauc = metrics.average_precision_score(y_true, y_prob, average='macro') 68 | 69 | y_pred = (y_prob > config.threshold).astype(np.float32) 70 | acc = metrics.accuracy_score(y_true, y_pred) 71 | f1 = metrics.f1_score(y_true, y_pred, average='samples') 72 | 73 | if classes is not None: 74 | print(f'\n=> Individual scores of {len(classes)} classes') 75 | for i, cls in enumerate(classes): 76 | cls_rocauc = metrics.roc_auc_score(y_true[:, i], y_prob[:, i]) 77 | cls_prauc = metrics.average_precision_score(y_true[:, i], y_prob[:, i]) 78 | cls_acc = metrics.accuracy_score(y_true[:, i], y_pred[:, i]) 79 | cls_f1 = metrics.f1_score(y_true[:, i], y_pred[:, i]) 80 | print(f'[{i:2} {cls:30}] rocauc={cls_rocauc:.4f} prauc={cls_prauc:.4f} acc={cls_acc:.4f} f1={cls_f1:.4f}') 81 | print() 82 | 83 | print(f'=> Test scores: rocauc={rocauc:.6f}\tprauc={prauc:.6f}\tacc={acc:.6f}\tf1={f1:.6f}') 84 | return rocauc, prauc, acc, f1 85 | 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser(description='Evaluate a SampleCNN.') 89 | parser.add_argument('dataset', type=str, metavar='DATASET', 90 | choices=['mtt', 'scd', 'dcs'], help='Dataset for training: {mtt|scd|dcs}') 91 | parser.add_argument('model_path', type=str, metavar='PATH', help='Path to the saved model.') 92 | parser.add_argument('--data-dir', type=str, default='./data', metavar='PATH') 93 | parser.add_argument('--batch-size', type=int, default=23, metavar='N', help='Mini-batch size.') 94 | parser.add_argument('--num-readers', type=int, default=8, metavar='N', help='Number of TFRecord readers.') 95 | 96 | args = parser.parse_args() 97 | 98 | main(args) 99 | print('\n=> Done.\n') 100 | -------------------------------------------------------------------------------- /lib/model.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import Model 2 | from tensorflow.keras.layers import (Conv1D, MaxPool1D, BatchNormalization, GlobalAvgPool1D, Multiply, GlobalMaxPool1D, 3 | Dense, Dropout, Activation, Reshape, Concatenate, Add, Input) 4 | from tensorflow.keras.regularizers import l2 5 | from lib.initialization import taejun_uniform 6 | 7 | 8 
| def squeeze_excitation(x, amplifying_ratio, name): 9 | num_features = x.shape[-1].value 10 | x = GlobalAvgPool1D(name=f'{name}_squeeze')(x) 11 | x = Reshape((1, num_features), name=f'{name}_reshape')(x) 12 | x = Dense(num_features * amplifying_ratio, activation='relu', 13 | kernel_initializer='glorot_uniform', name=f'{name}_ex0')(x) 14 | x = Dense(num_features, activation='sigmoid', kernel_initializer='glorot_uniform', name=f'{name}_ex1')(x) 15 | return x 16 | 17 | 18 | def basic_block(x, num_features, cfg, name): 19 | """Block for basic models.""" 20 | x = Conv1D(num_features, kernel_size=3, padding='same', use_bias=True, 21 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer=taejun_uniform(), name=f'{name}_conv')(x) 22 | x = BatchNormalization(name=f'{name}_norm')(x) 23 | x = Activation('relu', name=f'{name}_relu')(x) 24 | x = MaxPool1D(pool_size=3, name=f'{name}_pool')(x) 25 | return x 26 | 27 | 28 | def se_block(x, num_features, cfg, name): 29 | """Block for SE models.""" 30 | x = basic_block(x, num_features, cfg, name) 31 | x = Multiply(name=f'{name}_scale')([x, squeeze_excitation(x, cfg.amplifying_ratio, name)]) 32 | return x 33 | 34 | 35 | def rese_block(x, num_features, cfg, name): 36 | """Block for Res-N & ReSE-N models.""" 37 | if num_features != x.shape[-1].value: 38 | shortcut = Conv1D(num_features, kernel_size=1, padding='same', use_bias=True, name=f'{name}_scut_conv', 39 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer='glorot_uniform')(x) 40 | shortcut = BatchNormalization(name=f'{name}_scut_norm')(shortcut) 41 | else: 42 | shortcut = x 43 | 44 | for i in range(cfg.num_convs): 45 | if i > 0: 46 | x = Activation('relu', name=f'{name}_relu{i-1}')(x) 47 | x = Dropout(0.2, name=f'{name}_drop{i-1}')(x) 48 | x = Conv1D(num_features, kernel_size=3, padding='same', use_bias=True, 49 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer=taejun_uniform(), name=f'{name}_conv{i}')(x) 50 | x = BatchNormalization(name=f'{name}_norm{i}')(x) 51 | 52 | # Add SE if it is ReSE block. 53 | if cfg.amplifying_ratio: 54 | x = Multiply(name=f'{name}_scale')([x, squeeze_excitation(x, cfg.amplifying_ratio, name)]) 55 | 56 | x = Add(name=f'{name}_scut')([shortcut, x]) 57 | x = Activation('relu', name=f'{name}_relu1')(x) 58 | x = MaxPool1D(pool_size=3, name=f'{name}_pool')(x) 59 | return x 60 | 61 | 62 | def SampleCNN(cfg): 63 | """Build a SampleCNN model.""" 64 | # Variable-length input for feature visualization. 65 | x_in = Input(shape=(None, 1), name='input') 66 | 67 | num_features = cfg.init_features 68 | x = Conv1D(num_features, kernel_size=3, strides=3, padding='same', use_bias=True, 69 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer=taejun_uniform(scale=1.), name='conv0')(x_in) 70 | x = BatchNormalization(name='norm0')(x) 71 | x = Activation('relu', name='relu0')(x) 72 | 73 | # Stack convolutional blocks. 74 | layer_outputs = [] 75 | for i in range(cfg.num_blocks): 76 | num_features *= 2 if (i == 2 or i == (cfg.num_blocks - 1)) else 1 77 | x = cfg.block_fn(x, num_features, cfg, f'block{i}') 78 | layer_outputs.append(x) 79 | 80 | if cfg.multi: # Use multi-level feature aggregation or not. 81 | x = Concatenate(name='multi')([GlobalMaxPool1D(name=f'final_pool{i}')(output) 82 | for i, output in enumerate(layer_outputs[-3:])]) 83 | else: 84 | x = GlobalMaxPool1D(name='final_pool')(x) 85 | 86 | # The final two FCs. 
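    # With the default MTT configuration (init_features=128, num_blocks=9), the
    # block widths are 128, 128, 256, ..., 256, 512, so the 'multi' concatenation
    # of the last three outputs is 256 + 256 + 512 = 1024-dimensional; the first
    # FC below keeps that width via x.shape[-1].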
87 | x = Dense(x.shape[-1].value, kernel_initializer='glorot_uniform', name='final_fc')(x) 88 | x = BatchNormalization(name='final_norm')(x) 89 | x = Activation('relu', name='final_relu')(x) 90 | if cfg.dropout > 0.: 91 | x = Dropout(cfg.dropout, name='final_drop')(x) 92 | x = Dense(cfg.num_classes, kernel_initializer='glorot_uniform', name='logit')(x) 93 | x = Activation(cfg.activation, name='pred')(x) 94 | 95 | return Model(inputs=[x_in], outputs=[x], name='sample_cnn') 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /log 2 | /data 3 | /.idea 4 | /out 5 | 6 | # Created by .ignore support plugin (hsz.mobi) 7 | ### Python template 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | ### macOS template 112 | # General 113 | .DS_Store 114 | .AppleDouble 115 | .LSOverride 116 | 117 | # Icon must end with two \r 118 | Icon 119 | 120 | # Thumbnails 121 | ._* 122 | 123 | # Files that might appear in the root of a volume 124 | .DocumentRevisions-V100 125 | .fseventsd 126 | .Spotlight-V100 127 | .TemporaryItems 128 | .Trashes 129 | .VolumeIcon.icns 130 | .com.apple.timemachine.donotpresent 131 | 132 | # Directories potentially created on remote AFP share 133 | .AppleDB 134 | .AppleDesktop 135 | Network Trash Folder 136 | Temporary Items 137 | .apdisk 138 | ### JetBrains template 139 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 140 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 141 | 142 | # User-specific stuff 143 | .idea/**/workspace.xml 144 | .idea/**/tasks.xml 145 | .idea/**/usage.statistics.xml 146 | .idea/**/dictionaries 147 | .idea/**/shelf 148 | 149 | # Sensitive or high-churn files 150 | .idea/**/dataSources/ 151 | 
.idea/**/dataSources.ids
152 | .idea/**/dataSources.local.xml
153 | .idea/**/sqlDataSources.xml
154 | .idea/**/dynamic.xml
155 | .idea/**/uiDesigner.xml
156 | .idea/**/dbnavigator.xml
157 | 
158 | # Gradle
159 | .idea/**/gradle.xml
160 | .idea/**/libraries
161 | 
162 | # Gradle and Maven with auto-import
163 | # When using Gradle or Maven with auto-import, you should exclude module files,
164 | # since they will be recreated, and may cause churn. Uncomment if using
165 | # auto-import.
166 | # .idea/modules.xml
167 | # .idea/*.iml
168 | # .idea/modules
169 | 
170 | # CMake
171 | cmake-build-*/
172 | 
173 | # Mongo Explorer plugin
174 | .idea/**/mongoSettings.xml
175 | 
176 | # File-based project format
177 | *.iws
178 | 
179 | # IntelliJ
180 | out/
181 | 
182 | # mpeltonen/sbt-idea plugin
183 | .idea_modules/
184 | 
185 | # JIRA plugin
186 | atlassian-ide-plugin.xml
187 | 
188 | # Cursive Clojure plugin
189 | .idea/replstate.xml
190 | 
191 | # Crashlytics plugin (for Android Studio and IntelliJ)
192 | com_crashlytics_export_strings.xml
193 | crashlytics.properties
194 | crashlytics-build.properties
195 | fabric.properties
196 | 
197 | # Editor-based Rest Client
198 | .idea/httpRequests
199 | ### Linux template
200 | *~
201 | 
202 | # temporary files which can be created if a process still has a handle open of a deleted file
203 | .fuse_hidden*
204 | 
205 | # KDE directory preferences
206 | .directory
207 | 
208 | # Linux trash folder which might appear on any partition or disk
209 | .Trash-*
210 | 
211 | # .nfs files are created when an open file is removed but is still being accessed
212 | .nfs*
213 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import math
 3 | import os
 4 | import tensorflow as tf
 5 | import tensorflow.keras.backend as K
 6 | from datetime import datetime
 7 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
 8 | from lib.model import SampleCNN
 9 | from lib.model_config import ModelConfig
10 | from lib.data.batch import create_datasets
11 | from lib.data.config import *
12 | from lib.initialization import AudioVarianceScaling
13 | from lib.utils import mkpath
14 | from eval import evaluate
15 | 
16 | 
17 | def main(args):
18 |     print(f'=> Dataset: {args.dataset}')
19 |     if args.dataset == 'mtt':
20 |         config = MTT_CONFIG
21 |     elif args.dataset == 'scd':
22 |         config = SCD_CONFIG
23 |     elif args.dataset == 'dcs':
24 |         config = DCS_CONFIG
25 |     else:
26 |         raise Exception(f'Not implemented dataset: {args.dataset}')
27 | 
28 |     dataset_path = mkpath(args.data_dir, args.dataset)
29 |     tfrecord_path = f'{dataset_path}/tfrecord'
30 | 
31 |     # Configure the model. (weight_decay is passed through; the original call omitted it, silently ignoring --weight-decay.)
32 |     model_config = ModelConfig(block=args.block, amplifying_ratio=args.amplifying_ratio, multi=args.multi,
33 |                                num_blocks=config.num_blocks, dropout=args.dropout, activation=config.activation,
34 |                                num_classes=config.num_classes, weight_decay=args.weight_decay)
35 | 
36 |     # Set the training directory.
37 |     args.train_dir = mkpath(args.log_dir, datetime.now().strftime('%Y%m%d_%H%M%S') + f'-{args.dataset}')
38 |     if args.name is None:
39 |         args.name = model_config.get_signature()
40 |     args.train_dir += '-' + args.name
41 |     os.makedirs(args.train_dir, exist_ok=False)
42 |     print('=> Training directory: ' + args.train_dir)
43 | 
44 |     # Create training, validation, and test datasets.
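    # create_datasets() yields shuffled per-segment batches for train/val, while
    # the test set is batched per audio as a sequence of segments so that
    # evaluate() can average the segment predictions of each clip.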
45 | dataset_train, dataset_val, dataset_test = create_datasets(tfrecord_path, args.batch_size, args.num_readers, config) 46 | 47 | model = SampleCNN(model_config) 48 | model_config.print_summary() 49 | 50 | num_params = int(sum([K.count_params(p) for p in set(model.trainable_weights)])) 51 | print(f'=> #params: {num_params:,}') 52 | 53 | for stage in range(args.num_stages): 54 | print(f'=> Stage {stage}') 55 | # Set the learning rate of current stage 56 | lr = args.lr * (args.lr_decay ** stage) 57 | # Train the network. 58 | train(model, lr, dataset_train, dataset_val, config, args) 59 | # Load the best model. 60 | model = tf.keras.models.load_model(f'{args.train_dir}/best.h5', 61 | custom_objects={'AudioVarianceScaling': AudioVarianceScaling, 'tf': tf}) 62 | # Evaluate. 63 | rocauc, prauc, acc, f1 = evaluate(model, dataset_test, config) 64 | 65 | # Change the file name of the best checkpoint with the scores. 66 | os.rename(f'{args.train_dir}/best.h5', f'{args.train_dir}/final-auc_{rocauc:.6f}-acc_{acc:.6f}-f1_{f1:.6f}.h5') 67 | # Report the final scores. 68 | print(f'=> FINAL SCORES [{args.dataset}] {args.name}: ' 69 | f'rocauc={rocauc:.6f}, acc={acc:.6f}, f1={f1:.6f}, prauc={prauc:.6f}') 70 | 71 | model_config.print_summary() 72 | 73 | return rocauc, prauc, acc, f1 74 | 75 | 76 | def train(model, lr, dataset_train, dataset_val, config, args): 77 | # Define a optimizer and compile the model. 78 | optimizer = tf.keras.optimizers.SGD(lr=lr, momentum=args.momentum, decay=1e-6, nesterov=True) 79 | model.compile(optimizer, loss=config.loss, metrics=config.metrics) 80 | 81 | # Setup callbacks. 82 | early_stopping = EarlyStopping(monitor='val_loss', patience=args.patience) 83 | checkpointer_best = ModelCheckpoint(f'{args.train_dir}/best.h5', monitor='val_loss', save_best_only=True) 84 | 85 | # Train! 
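    # num_train_segs = num_train_audios * num_segments (see DatasetConfig), so
    # one epoch below visits every training segment once.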
86 |     steps_train = int(math.ceil(config.num_train_segs / args.batch_size))
87 |     steps_val = int(math.ceil(config.num_val_segs / args.batch_size))
88 |     model.fit(dataset_train, epochs=100, steps_per_epoch=steps_train,
89 |               validation_data=dataset_val, validation_steps=steps_val,
90 |               callbacks=[early_stopping, checkpointer_best])
91 | 
92 | 
93 | def parse_args():
94 |     parser = argparse.ArgumentParser(description='Train a SampleCNN.')
95 |     parser.add_argument('dataset', type=str, metavar='DATASET',
96 |                         choices=['mtt', 'scd', 'dcs'], help='Dataset for training: {mtt|scd|dcs}')
97 |     parser.add_argument('name', type=str, metavar='NAME', nargs='?', help='Name of log directory.')
98 |     parser.add_argument('--data-dir', type=str, default='./data', metavar='PATH')
99 |     parser.add_argument('--log-dir', type=str, default='./log', metavar='PATH',
100 |                         help='Directory where to write event logs and models.')
101 | 
102 |     parser.add_argument('--block', type=str, default='se', choices=['basic', 'se', 'res1', 'res2', 'rese1', 'rese2'],
103 |                         help='Convolutional block to build a model (default: se, options: basic/se/res1/res2/rese1/rese2).')
104 |     parser.add_argument('--amplifying-ratio', type=float, default=0.125, metavar='N')
105 |     parser.add_argument('--multi', action='store_true', help='Use multi-level feature aggregation.')
106 | 
107 |     parser.add_argument('--batch-size', type=int, default=23, metavar='N', help='Mini-batch size.')
108 |     parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='Momentum for SGD.')
109 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='Learning rate.')
110 |     parser.add_argument('--lr-decay', type=float, default=0.2, metavar='DC', help='Learning rate decay rate.')
111 | 
112 |     parser.add_argument('--dropout', type=float, default=0.5, metavar='DO', help='Dropout rate.')
113 |     parser.add_argument('--weight-decay', type=float, default=0., metavar='WD', help='Weight decay.')
114 | 
115 |     parser.add_argument('--num-stages', type=int, default=5, metavar='N', help='Number of stages to train.')
116 |     parser.add_argument('--patience', type=int, default=2, metavar='N', help='Stop training stage after #patiences.')
117 | 
118 |     parser.add_argument('--num-readers', type=int, default=8, metavar='N', help='Number of TFRecord readers.')
119 | 
120 |     return parser.parse_args()
121 | 
122 | 
123 | if __name__ == '__main__':
124 |     args = parse_args()
125 | 
126 |     main(args)
127 | 
128 |     print('\n=> Done.\n')
129 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SampleCNNs for Audio Classification
  2 | This repository contains the code used for the publication below:
  3 | > Taejun Kim, Jongpil Lee, and Juhan Nam, "Comparison and Analysis of SampleCNN Architectures for Audio Classification"
  4 | in IEEE Journal of Selected Topics in Signal Processing (JSTSP), 2019.
  5 | 
  6 | 
  7 | 
  8 | Contents:
  9 | * Install Dependencies
 10 | * Building Datasets
 11 |   * Music auto-tagging: MagnaTagATune
 12 |   * Keyword spotting: Speech Commands
 13 |   * Acoustic scene tagging: DCASE 2017 Task 4
 14 | * Training a SampleCNN
 15 | 
 16 | ## Install Dependencies
 17 | NOTE: The code in this repository is written and tested on **Python 3.6**.
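If you want to confirm the interpreter and TensorFlow versions before proceeding, a quick check (a suggested snippet, not part of the original setup) is:
```sh
python -c "import sys; print(sys.version.split()[0])"
python -c "import tensorflow as tf; print(tf.__version__)"
```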
 18 | 
 19 | * tensorflow 1.10.X (use of 1.10.X is strongly recommended because of version compatibility)
 20 | * librosa
 21 | * ffmpeg
 22 | * pandas
 23 | * numpy
 24 | * scikit-learn
 25 | * h5py
 26 | 
 27 | To install the required python packages using conda, run the command below:
 28 | ```sh
 29 | conda install tensorflow-gpu=1.10.0 ffmpeg pandas numpy scikit-learn h5py
 30 | conda install -c conda-forge librosa
 31 | ```
 32 | 
 33 | 
 34 | ## Building Datasets
 35 | Download and preprocess the dataset that you want to train a model on.
 36 | 
 37 | ### Music auto-tagging: [MagnaTagATune][2]
 38 | > Edith Law, Kris West, Michael Mandel, Mert Bay and J. Stephen Downie (2009).
 39 | [Evaluation of algorithms using games: the case of music annotation.][1]
 40 | In Proceedings of the 10th International Conference on Music Information Retrieval (ISMIR).
 41 | 
 42 | Create a directory for the dataset and download the required files (one `.csv` file and three `.zip` files) into the directory `data/mtt/raw`:
 43 | ```sh
 44 | mkdir -p data/mtt/raw
 45 | cd data/mtt/raw
 46 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/annotations_final.csv
 47 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/mp3.zip.001
 48 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/mp3.zip.002
 49 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/mp3.zip.003
 50 | ```
 51 | 
 52 | After downloading the files, merge and expand the three `.zip` files:
 53 | ```sh
 54 | cat mp3.zip.* > mp3_all.zip
 55 | unzip mp3_all.zip -d mp3
 56 | ```
 57 | 
 58 | Your directory structure should look like this:
 59 | ```sh
 60 | data
 61 | └── mtt
 62 |     └── raw
 63 |         ├── annotations_final.csv
 64 |         └── mp3
 65 |             ├── 0
 66 |             ├── ...
 67 |             └── f
 68 | ```
 69 | 
 70 | Finally, segment and convert the audios to TFRecords using the following command:
 71 | ```sh
 72 | python build_dataset.py mtt
 73 | ```
 74 | 
 75 | 
 76 | ### Keyword spotting: [Speech Commands][3]
 77 | > Pete Warden (2018).
 78 | [Speech commands: A dataset for limited-vocabulary speech recognition.][4]
 79 | arXiv:1804.03209.
 80 | 
 81 | After creating a directory for the dataset, download and expand the dataset in the directory `data/scd/raw`:
 82 | ```sh
 83 | mkdir -p data/scd/raw
 84 | cd data/scd/raw
 85 | wget http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
 86 | tar zxvf speech_commands_v0.02.tar.gz
 87 | ```
 88 | 
 89 | 
 90 | Finally, segment and convert the audios to TFRecords using the following command:
 91 | ```sh
 92 | python build_dataset.py scd
 93 | ```
 94 | 
 95 | 
 96 | ### Acoustic scene tagging: [DCASE 2017 Task 4][5]
 97 | > Annamaria Mesaros, Toni Heittola, Aleksandr Diment, Benjamin Elizalde, Ankit Shah, Emmanuel Vincent, Bhiksha Raj and Tuomas Virtanen (2017).
 98 | [DCASE 2017 challenge setup: tasks, datasets and baseline system.][6]
 99 | In Proceedings of the Detection and Classification of Acoustic Scenes and Events 2017 Workshop (DCASE2017).
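
Create a directory for the dataset, then download the audio archives and the weak-label ground-truth CSVs into `data/dcs/raw` and expand them (the training, testing, and evaluation archives are password-protected, as reflected by the `-P` flags below):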
100 | 
101 | ```sh
102 | mkdir -p data/dcs/raw
103 | cd data/dcs/raw
104 | 
105 | wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1HOQaUHbTgCRsS6Sr9I9uE6uCjiNPC3d3' -O Task_4_DCASE_2017_training_set.zip
106 | wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1GfP5JATSmCqD8p3CBIkk1J90mfJuPI-k' -O Task_4_DCASE_2017_testing_set.zip
107 | wget https://dl.dropboxusercontent.com/s/bbgqfd47cudwe9y/DCASE_2017_evaluation_set_audio_files.zip
108 | 
109 | unzip -P DCASE_2017_training_set Task_4_DCASE_2017_training_set.zip
110 | unzip -P DCASE_2017_testing_set Task_4_DCASE_2017_testing_set.zip
111 | unzip -P DCASE_2017_evaluation_set DCASE_2017_evaluation_set_audio_files.zip
112 | 
113 | wget https://github.com/ankitshah009/Task-4-Large-scale-weakly-supervised-sound-event-detection-for-smart-cars/raw/master/groundtruth_release/groundtruth_weak_label_training_set.csv
114 | wget https://github.com/ankitshah009/Task-4-Large-scale-weakly-supervised-sound-event-detection-for-smart-cars/raw/master/groundtruth_release/groundtruth_weak_label_testing_set.csv
115 | wget https://github.com/ankitshah009/Task-4-Large-scale-weakly-supervised-sound-event-detection-for-smart-cars/raw/master/groundtruth_release/groundtruth_weak_label_evaluation_set.csv
116 | ```
117 | 
118 | Finally, segment and convert the audios to TFRecords using the following command:
119 | ```sh
120 | python build_dataset.py dcs
121 | ```
122 | 
123 | ## Training a SampleCNN
124 | You can train a SampleCNN with any block on any of the datasets.
125 | Here are several examples of how to run training:
126 | ```sh
127 | # Train a SampleCNN with SE block (default) on the MagnaTagATune dataset (music auto-tagging)
128 | python train.py mtt
129 | 
130 | # Train a SampleCNN with ReSE-2 block on the Speech Commands dataset (keyword spotting)
131 | python train.py scd --block rese2
132 | 
133 | # Train a SampleCNN with basic block on the DCASE 2017 Task 4 dataset (acoustic scene tagging)
134 | python train.py dcs --block basic
135 | ```
136 | Trained models are saved under the `log` directory, named with the datetime at which training started.
137 | Here is an example of a saved model:
138 | ```sh
139 | log/
140 | └── 20190424_213449-scd-se/
141 |     └── final-auc_0.XXXXXX-acc_0.XXXXXX-f1_0.XXXXXX.h5
142 | ```
143 | 
144 | You can see the available options for training using the command below:
145 | ```sh
146 | $ python train.py -h
147 | 
148 | usage: train.py [-h] [--data-dir PATH] [--log-dir PATH]
149 |                 [--block {basic,se,res1,res2,rese1,rese2}]
150 |                 [--amplifying-ratio N] [--multi] [--batch-size N]
151 |                 [--momentum M] [--lr LR] [--lr-decay DC] [--dropout DO]
152 |                 [--weight-decay WD] [--num-stages N] [--patience N]
153 |                 [--num-readers N]
154 |                 DATASET [NAME]
155 | 
156 | Train a SampleCNN.
157 | 
158 | positional arguments:
159 |   DATASET               Dataset for training: {mtt|scd|dcs}
160 |   NAME                  Name of log directory.
161 | 
162 | optional arguments:
163 |   -h, --help            show this help message and exit
164 |   --data-dir PATH
165 |   --log-dir PATH        Directory where to write event logs and models.
166 |   --block {basic,se,res1,res2,rese1,rese2}
167 |                         Convolutional block to build a model (default: se,
168 |                         options: basic/se/res1/res2/rese1/rese2).
169 |   --amplifying-ratio N
170 |   --multi               Use multi-level feature aggregation.
171 |   --batch-size N        Mini-batch size.
172 |   --momentum M          Momentum for SGD.
173 |   --lr LR               Learning rate.
174 |   --lr-decay DC         Learning rate decay rate.
175 |   --dropout DO          Dropout rate.
176 |   --weight-decay WD     Weight decay.
177 | --num-stages N Number of stages to train. 178 | --patience N Stop training stage after #patiences. 179 | --num-readers N Number of TFRecord readers. 180 | ``` 181 | 182 | 183 | [1]: http://ismir2009.ismir.net/proceedings/OS5-5.pdf 184 | [2]: http://mirg.city.ac.uk/codeapps/the-magnatagatune-dataset 185 | [3]: https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html 186 | [4]: https://arxiv.org/pdf/1804.03209.pdf 187 | [5]: http://dcase.community/challenge2017/task-large-scale-sound-event-detection 188 | [6]: http://dcase.community/documents/workshop2017/proceedings/DCASE2017Workshop_Mesaros_100.pdf 189 | --------------------------------------------------------------------------------