├── lib
│   ├── data
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── audio.py
│   │   ├── scd.py
│   │   ├── batch.py
│   │   ├── dcs.py
│   │   └── mtt.py
│   ├── model_config.py
│   ├── utils.py
│   ├── initialization.py
│   └── model.py
├── eval.py
├── .gitignore
├── train.py
└── README.md

/lib/data/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import config
 2 | from . import mtt
 3 | from . import scd
 4 | from . import dcs
 5 | from . import audio
 6 | 
--------------------------------------------------------------------------------
/lib/model_config.py:
--------------------------------------------------------------------------------
 1 | from lib.model import *
 2 | 
 3 | 
 4 | class ModelConfig:
 5 |     """
 6 |     The default setting is for MTT with se-multi.
 7 |     """
 8 | 
 9 |     def __init__(self, block='se', multi=True, num_blocks=9, init_features=128, num_convs=1,
10 |                  amplifying_ratio=0.125, dropout=0.5, activation='sigmoid', num_classes=50, weight_decay=0.):
11 | 
12 |         # Configure block specific settings.
13 |         if block == 'basic':
14 |             block_fn = basic_block
15 |         elif block.startswith('rese'):
16 |             num_convs = int(block[-1])
17 |             block_fn = rese_block
18 |         elif block.startswith('res'):
19 |             num_convs = int(block[-1])
20 |             amplifying_ratio = None
21 |             block_fn = rese_block
22 |         elif block == 'se':
23 |             block_fn = se_block
24 |         else:
25 |             raise Exception(f'Unknown block name: {block}')
26 | 
27 |         # Overall architecture configurations.
28 |         self.multi = multi
29 |         self.init_features = init_features
30 | 
31 |         # Block configurations.
32 |         self.block = block
33 |         self.block_fn = block_fn
34 |         self.num_blocks = num_blocks
35 |         self.amplifying_ratio = amplifying_ratio
36 |         self.num_convs = num_convs
37 | 
38 |         # Training related configurations.
39 |         self.dropout = dropout
40 |         self.activation = activation
41 |         self.num_classes = num_classes
42 |         self.weight_decay = weight_decay
43 | 
44 |     def get_signature(self):
45 |         s = self.block
46 |         if self.multi:
47 |             s += '_multi'
48 |         return s
49 | 
50 |     def print_summary(self):
51 |         print(f'''=> {self.get_signature()} properties:
52 |     block            : {self.block}
53 |     multi            : {self.multi}
54 |     num_blocks       : {self.num_blocks}
55 |     amplifying_ratio : {self.amplifying_ratio}
56 |     dropout          : {self.dropout}
57 |     activation       : {self.activation}
58 |     num_classes      : {self.num_classes}''')
59 | 
--------------------------------------------------------------------------------
/lib/data/config.py:
--------------------------------------------------------------------------------
 1 | class DatasetConfig:
 2 | 
 3 |     def __init__(self, num_blocks, num_samples, sr, len_audio, num_audios_per_shard,
 4 |                  num_classes, loss, metrics, activation, mean, std,
 5 |                  num_train_audios, num_test_audios, num_val_audios, threshold=0.5):
 6 |         self.num_blocks = num_blocks
 7 |         self.num_samples = num_samples
 8 |         self.sr = sr
 9 |         self.len_audio = len_audio
10 |         self.num_segments = len_audio * sr // num_samples
11 |         self.num_audios_per_shard = num_audios_per_shard
12 | 
13 |         self.num_train_audios = num_train_audios
14 |         self.num_val_audios = num_val_audios
15 |         self.num_test_audios = num_test_audios
16 |         self.num_train_segs = num_train_audios * self.num_segments
17 |         self.num_val_segs = num_val_audios * self.num_segments
18 |         self.num_test_segs = num_test_audios * self.num_segments
19 | 
20 |         self.num_classes = num_classes
21 |         self.loss = loss
22 |         self.metrics = metrics
23 |         self.activation = activation
24 |         self.threshold = threshold
25 | 
26 |         self.mean = mean
27 |         self.std = std
28 | 
29 | 
30 | MTT_CONFIG = 
DatasetConfig(num_blocks=9, num_samples=59049, sr=22050, len_audio=29, num_audios_per_shard=100,
31 |                           num_train_audios=15250, num_val_audios=1529, num_test_audios=4332,
32 |                           loss='binary_crossentropy', metrics=None, activation='sigmoid', num_classes=50,
33 |                           mean=-0.0001650025078561157, std=0.1551193743944168)
34 | 
35 | SCD_CONFIG = DatasetConfig(num_blocks=8, num_samples=22050, sr=22050, len_audio=1, num_audios_per_shard=1000,
36 |                            num_train_audios=84843, num_val_audios=9981, num_test_audios=11005,
37 |                            loss='categorical_crossentropy', metrics=['accuracy'], activation='softmax', num_classes=35,
38 |                            mean=-8.520474e-05, std=0.18)
39 | 
40 | DCS_CONFIG = DatasetConfig(num_blocks=8, num_samples=22050, sr=22050, len_audio=10, num_audios_per_shard=300,
41 |                            num_train_audios=46042, num_val_audios=5618, num_test_audios=1103,
42 |                            loss='binary_crossentropy', metrics=['accuracy'], activation='sigmoid', num_classes=17,
43 |                            mean=-0.0003320679534226656, std=0.20514629781246185, threshold=0.1)
44 | 
--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def mkpath(*paths):
 5 |     """Join path components and return the resolved absolute path."""
 6 |     path = os.path.join(*[str(path) for path in paths])
 7 |     path = os.path.realpath(path)
 8 |     return path
 9 | 
10 | # def apk(actual, predicted, k=3):
11 | #     """
12 | #     Computes the average precision at k.
13 | #     This function computes the average precision at k between two lists of
14 | #     items.
15 | #     Parameters
16 | #     ----------
17 | #     actual : list
18 | #         A list of elements that are to be predicted (order doesn't matter)
19 | #     predicted : list
20 | #         A list of predicted elements (order does matter)
21 | #     k : int, optional
22 | #         The maximum number of predicted elements
23 | #     Returns
24 | #     -------
25 | #     score : double
26 | #         The average precision at k over the input lists
27 | #
28 | #     Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
29 | #     """
30 | #     if len(predicted) > k:
31 | #         predicted = predicted[:k]
32 | #
33 | #     score = 0.0
34 | #     num_hits = 0.0
35 | #
36 | #     for i, p in enumerate(predicted):
37 | #         if p in actual and p not in predicted[:i]:
38 | #             num_hits += 1.0
39 | #             score += num_hits / (i + 1.0)
40 | #
41 | #     if not actual:
42 | #         return 0.0
43 | #
44 | #     return score / min(len(actual), k)
45 | #
46 | #
47 | # def mapk(y_true, y_pred, k=3):
48 | #     """
49 | #     Computes the mean average precision at k.
50 | #     This function computes the mean average precision at k between two lists
51 | #     of lists of items.
52 | #     Parameters
53 | #     ----------
54 | #     y_true : list
55 | #         A list of lists of elements that are to be predicted
56 | #         (order doesn't matter in the lists)
57 | #     y_pred : list
58 | #         A list of lists of predicted elements
59 | #         (order matters in the lists)
60 | #     k : int, optional
61 | #         The maximum number of predicted elements
62 | #     Returns
63 | #     -------
64 | #     score : double
65 | #         The mean average precision at k over the input lists
66 | #
67 | #     Source: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
68 | #     """
69 | #     actual = np.argmax(y_true, axis=1).reshape(-1, 1).tolist()
70 | #     pred_topk = np.argsort(y_pred, axis=1)[:, ::-1][:, :k].tolist()
71 | #     return np.mean([apk(a, p, k) for a, p in zip(actual, pred_topk)])
72 | 
--------------------------------------------------------------------------------
/lib/data/audio.py:
--------------------------------------------------------------------------------
 1 | import librosa
 2 | import tensorflow as tf
 3 | import numpy as np
 4 | 
 5 | 
 6 | def to_tfrecord_examples(row, config, sequence):
 7 |     audio_path, label = row['path'], row['label']
 8 |     sr, num_samples, num_segments, len_audio = config.sr, config.num_samples, config.num_segments, config.len_audio
 9 | 
10 |     audio = load_audio(audio_path, sr, len_audio)
11 |     segments = [audio[i * num_samples:(i + 1) * num_samples] for i in range(num_segments)]
12 | 
13 |     if sequence:
14 |         examples = [segments_to_sequence_example(segments, label)]
15 |     else:
16 |         examples = [segment_to_example(segment, label) for segment in segments]
17 | 
18 |     return examples
19 | 
20 | 
21 | def segment_to_example(segment, label):
22 |     raw_segment = np.array(segment, dtype=np.float32).reshape(-1).tobytes()
23 |     raw_label = np.array(label, dtype=np.uint8).reshape(-1).tobytes()
24 | 
25 |     example = tf.train.Example(features=tf.train.Features(feature={
26 |         'label': bytes_feature(raw_label),     # array: dtype=uint8, shape=(num_classes,)
27 |         'segment': bytes_feature(raw_segment)  # array: dtype=float32, shape=(num_samples,)
28 |     }))
29 | 
30 |     return example
31 | 
32 | 
33 | def segments_to_sequence_example(segments, label):
34 |     raw_segments = [np.array(segment, dtype=np.float32).reshape(-1).tobytes() for segment in segments]
35 |     raw_label = np.array(label, dtype=np.uint8).reshape(-1).tobytes()
36 | 
37 |     sequence_example = tf.train.SequenceExample(
38 |         context=tf.train.Features(feature={
39 |             'label': bytes_feature(raw_label)  # uint8 Tensor (50,)
40 |         }),
41 |         feature_lists=tf.train.FeatureLists(feature_list={
42 |             'segments': bytes_feature_list(raw_segments)  # list of float32 Tensor (num_samples,)
43 |         }))
44 | 
45 |     return sequence_example
46 | 
47 | 
48 | def load_audio(path, sr, len_audio):
49 |     audio, _ = librosa.load(path, sr=sr, mono=True, duration=len_audio, dtype=np.float32, res_type='kaiser_best')
50 | 
51 |     total_samples = sr * len_audio
52 |     if len(audio) < total_samples:
53 |         audio = np.tile(audio, total_samples // len(audio) + 1)[:total_samples]  # loop the clip (np.tile repeats the whole signal; np.repeat would duplicate each individual sample)
54 | 
55 |     return audio
56 | 
57 | 
58 | def bytes_feature(value):
59 |     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
60 | 
61 | 
62 | def bytes_feature_list(values):
63 |     return tf.train.FeatureList(feature=[bytes_feature(v) for v in values])
64 | 
65 | 
66 | def int64_feature(value):
67 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
68 | 
69 | 
70 | def float_feature(value):
71 |     return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
72 | 
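For reference, a minimal sketch of how the helpers above might be driven to write one TFRecord shard; `rows`, `config`, and `out_path` are hypothetical stand-ins for the rows produced by the per-dataset `make_dataset_info()` and the corresponding `DatasetConfig`, and `tf.python_io.TFRecordWriter` is the TF 1.x writer API:

```python
import tensorflow as tf
from lib.data.audio import to_tfrecord_examples

def write_shard(rows, config, out_path, sequence=False):
    # One Example per segment (train/val) or one SequenceExample per audio
    # (test), matching the parsers in lib/data/batch.py.
    with tf.python_io.TFRecordWriter(out_path) as writer:
        for row in rows:  # each row needs 'path' and 'label' keys
            for example in to_tfrecord_examples(row, config, sequence):
                writer.write(example.SerializeToString())
```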
--------------------------------------------------------------------------------
/lib/data/scd.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import tensorflow as tf
 4 | from glob import glob
 5 | from sklearn.utils import shuffle
 6 | from lib.utils import mkpath
 7 | 
 8 | CLASSES = ['backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy',
 9 |            'house', 'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six',
10 |            'stop', 'three', 'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero']
11 | IDX2NAME = {i: name for i, name in enumerate(CLASSES)}
12 | NAME2IDX = {name: i for i, name in enumerate(CLASSES)}
13 | 
14 | 
15 | def load_audio_paths(dataset_dir):
16 |     audio_paths = glob(mkpath(dataset_dir, 'raw/*/*.wav'))
17 |     noise_paths = glob(mkpath(dataset_dir, 'raw/_background_noise_/*.wav'))
18 | 
19 |     with open(mkpath(dataset_dir, 'raw/validation_list.txt')) as f:
20 |         val_paths = f.read().splitlines()
21 |         val_paths = [mkpath(dataset_dir, 'raw', path) for path in val_paths]
22 | 
23 |     with open(mkpath(dataset_dir, 'raw/testing_list.txt')) as f:
24 |         test_paths = f.read().splitlines()
25 |         test_paths = [mkpath(dataset_dir, 'raw', path) for path in test_paths]
26 | 
27 |     # Remove validation, test set, and noises from the training set.
28 |     train_paths = list(set(audio_paths) - set(val_paths) - set(test_paths) - set(noise_paths))
29 | 
30 |     # Sort paths.
31 |     train_paths.sort(); val_paths.sort(); test_paths.sort()
32 | 
33 |     return train_paths, val_paths, test_paths
34 | 
35 | 
36 | def make_dataset_info(dataset_dir, num_audios_per_shard):
37 |     train_paths, val_paths, test_paths = load_audio_paths(dataset_dir)
38 | 
39 |     paths = train_paths + val_paths + test_paths
40 |     ids = ['/'.join(p.split('/')[-2:]) for p in paths]
41 |     labels = [tf.keras.utils.to_categorical(NAME2IDX[id.split('/')[0]], num_classes=len(CLASSES)) for id in ids]
42 |     splits = ['train'] * len(train_paths) + ['val'] * len(val_paths) + ['test'] * len(test_paths)
43 | 
44 |     df = pd.DataFrame({'id': ids, 'label': labels, 'split': splits, 'path': paths})
45 | 
46 |     # Shuffle and shard.
47 |     df = shuffle(df, random_state=123)
48 |     for split in ['train', 'val', 'test']:
49 |         num_audios = sum(df['split'] == split)
50 |         num_shards = num_audios // num_audios_per_shard
51 |         num_remainders = num_audios % num_audios_per_shard
52 | 
53 |         shards = np.tile(np.arange(num_shards), num_audios_per_shard)
54 |         shards = np.concatenate([shards, np.arange(num_remainders) % num_shards])
55 |         shards = np.random.permutation(shards)
56 | 
57 |         df.loc[df['split'] == split, 'shard'] = shards
58 | 
59 |     df['shard'] = df['shard'].astype(int)
60 | 
61 |     return df
62 | 
--------------------------------------------------------------------------------
/lib/initialization.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import tensorflow as tf
 3 | from tensorflow.keras.initializers import Initializer
 4 | 
 5 | 
 6 | def _compute_audio_fans(shape):
 7 |     assert len(shape) == 3, 'This initialization is for Conv1D.'
 8 | 
 9 |     len_filter, in_channels, out_channels = shape
10 | 
11 |     receptive_field_size = len_filter * in_channels  # NOTE: originally this should be just len_filter!!
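    # A worked example (hypothetical shapes): a Conv1D kernel of shape
    # (len_filter=3, in_channels=128, out_channels=256) gives
    # receptive_field_size = 3 * 128 = 384, fan_in = 128 * 384 = 49152 and
    # fan_out = 256 * 384 = 98304, i.e. both fans are inflated by a factor of
    # in_channels compared to standard VarianceScaling, which uses len_filter alone.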
12 | fan_in = in_channels * receptive_field_size 13 | fan_out = out_channels * receptive_field_size 14 | 15 | return fan_in, fan_out 16 | 17 | 18 | class AudioVarianceScaling(Initializer): 19 | """VarianceScaling for Audio""" 20 | 21 | def __init__(self, 22 | scale=1.0, 23 | mode="fan_in", 24 | distribution="truncated_normal", 25 | seed=None, 26 | dtype=tf.float32): 27 | if scale <= 0.: 28 | raise ValueError("`scale` must be positive float.") 29 | if mode not in {"fan_in", "fan_out", "fan_avg"}: 30 | raise ValueError("Invalid `mode` argument:", mode) 31 | distribution = distribution.lower() 32 | if distribution not in {"uniform", "truncated_normal", "untruncated_normal"}: 33 | raise ValueError("Invalid `distribution` argument:", distribution) 34 | self.scale = scale 35 | self.mode = mode 36 | self.distribution = distribution 37 | self.seed = seed 38 | self.dtype = tf.as_dtype(dtype) 39 | 40 | def __call__(self, shape, dtype=None, partition_info=None): 41 | if dtype is None: 42 | dtype = self.dtype 43 | scale = self.scale 44 | scale_shape = shape 45 | if partition_info is not None: 46 | scale_shape = partition_info.full_shape 47 | fan_in, fan_out = _compute_audio_fans(scale_shape) 48 | if self.mode == "fan_in": 49 | scale /= max(1., fan_in) 50 | elif self.mode == "fan_out": 51 | scale /= max(1., fan_out) 52 | else: 53 | scale /= max(1., (fan_in + fan_out) / 2.) 54 | if self.distribution == "normal" or self.distribution == "truncated_normal": 55 | # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) 56 | stddev = math.sqrt(scale) / .87962566103423978 57 | return tf.truncated_normal( 58 | shape, 0.0, stddev, dtype, seed=self.seed) 59 | elif self.distribution == "untruncated_normal": 60 | stddev = math.sqrt(scale) 61 | return tf.random_normal( 62 | shape, 0.0, stddev, dtype, seed=self.seed) 63 | else: 64 | limit = math.sqrt(3.0 * scale) 65 | return tf.random_uniform( 66 | shape, -limit, limit, dtype, seed=self.seed) 67 | 68 | def get_config(self): 69 | return { 70 | "scale": self.scale, 71 | "mode": self.mode, 72 | "distribution": self.distribution, 73 | "seed": self.seed, 74 | "dtype": self.dtype.name 75 | } 76 | 77 | 78 | def taejun_uniform(scale=2., seed=None): 79 | return AudioVarianceScaling(scale=scale, mode='fan_in', distribution='uniform', seed=seed) 80 | -------------------------------------------------------------------------------- /lib/data/batch.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from glob import glob 3 | 4 | 5 | def tfrecord_parser(config): 6 | def parse_fn(example): 7 | features = tf.parse_single_example(example, features={ 8 | 'label': tf.FixedLenFeature([], tf.string), 9 | 'segment': tf.FixedLenFeature([], tf.string) 10 | }) 11 | 12 | segment = tf.decode_raw(features['segment'], tf.float32) 13 | segment = (segment - config.mean) / config.std # standardization 14 | segment = tf.expand_dims(segment, axis=-1) 15 | 16 | label = tf.decode_raw(features['label'], tf.uint8) 17 | label = tf.cast(label, tf.float32) 18 | 19 | return segment, label 20 | 21 | return parse_fn 22 | 23 | 24 | def tfrecord_parser_sequence(config): 25 | def parse_fn(sequence_example): 26 | context, sequence = tf.parse_single_sequence_example( 27 | sequence_example, 28 | context_features={ 29 | 'label': tf.FixedLenFeature([], tf.string) 30 | }, 31 | sequence_features={ 32 | 'segments': tf.FixedLenSequenceFeature([], tf.string) 33 | }) 34 | 35 | segments = tf.decode_raw(sequence['segments'], tf.float32) 36 | 
segments = (segments - config.mean) / config.std # standardization 37 | segments = tf.expand_dims(segments, axis=-1) 38 | 39 | label = tf.decode_raw(context['label'], tf.uint8) 40 | label = tf.cast(label, tf.float32) 41 | 42 | return segments, label 43 | 44 | return parse_fn 45 | 46 | 47 | def create_datasets(tfrecord_path, batch_size, num_readers, config, only_test=False): 48 | batch_size_test = max(1, batch_size // config.num_segments) 49 | filenames_test = glob(tfrecord_path + '/test-*.tfrecord') 50 | dataset_test = tf.data.TFRecordDataset(filenames_test) 51 | dataset_test = dataset_test.map(tfrecord_parser_sequence(config), num_parallel_calls=num_readers) 52 | dataset_test = dataset_test.batch(batch_size_test) 53 | dataset_test = dataset_test.prefetch(8 * batch_size_test) 54 | 55 | if only_test: 56 | return dataset_test 57 | else: 58 | filenames_train = glob(tfrecord_path + '/train-*.tfrecord') 59 | dataset_train = tf.data.TFRecordDataset(filenames_train) 60 | dataset_train = dataset_train.map(tfrecord_parser(config), num_parallel_calls=num_readers) 61 | dataset_train = dataset_train.shuffle(buffer_size=10000) 62 | dataset_train = dataset_train.batch(batch_size) 63 | dataset_train = dataset_train.repeat() 64 | dataset_train = dataset_train.prefetch(8 * batch_size) 65 | 66 | filenames_val = glob(tfrecord_path + '/val-*.tfrecord') 67 | dataset_val = tf.data.TFRecordDataset(filenames_val) 68 | dataset_val = dataset_val.map(tfrecord_parser(config), num_parallel_calls=num_readers) 69 | # NOTE: Do not shuffle validation set. 70 | dataset_val = dataset_val.batch(batch_size) 71 | dataset_val = dataset_val.repeat() 72 | dataset_val = dataset_val.prefetch(8 * batch_size) 73 | 74 | return dataset_train, dataset_val, dataset_test 75 | -------------------------------------------------------------------------------- /lib/data/dcs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.utils import shuffle 4 | from lib.utils import mkpath 5 | 6 | CLASSES = ['Train horn', 'Air horn, truck horn', 'Car alarm', 'Reversing beeps', 'Ambulance (siren)', 7 | 'Police car (siren)', 'Fire engine, fire truck (siren)', 'Civil defense siren', 'Screaming', 'Bicycle', 8 | 'Skateboard', 'Car', 'Car passing by', 'Bus', 'Truck', 'Motorcycle', 'Train'] 9 | 10 | C2I = {c: i for i, c in enumerate(CLASSES)} 11 | 12 | DIR_TRAIN = 'unbalanced_train_segments_training_set_audio_formatted_and_segmented_downloads' 13 | DIR_TEST = 'unbalanced_train_segments_testing_set_audio_formatted_and_segmented_downloads' 14 | DIR_EVAL = 'evaluation_set_formatted_audio_segments' 15 | 16 | 17 | def make_dataset_info(dataset_dir, num_audios_per_shard): 18 | df_train = read_csv(mkpath(dataset_dir, 'raw/groundtruth_weak_label_training_set.csv')) 19 | df_test = read_csv(mkpath(dataset_dir, 'raw/groundtruth_weak_label_testing_set.csv')) 20 | df_eval = read_csv(mkpath(dataset_dir, 'raw/groundtruth_weak_label_evaluation_set.csv')) 21 | 22 | df_train['path'] = [mkpath(dataset_dir, f'raw/{DIR_TRAIN}/Y{f}') for f in df_train['file']] 23 | df_test['path'] = [mkpath(dataset_dir, f'raw/{DIR_TEST}/Y{f}') for f in df_test['file']] 24 | df_eval['path'] = [mkpath(dataset_dir, f'raw/{DIR_EVAL}/Y{f}') for f in df_eval['file']] 25 | 26 | df_train = pd.concat([df_train, df_test]) 27 | 28 | # Split validation set. 
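    # Note: roughly 10% of the files of each class are sampled below; since
    # clips are multi-label, a file drawn for one class can also carry other
    # classes, hence the set() deduplication before building the mask.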
29 |     val_files = []
30 |     for c in CLASSES:
31 |         df_class = df_train[df_train['label'] == c]
32 |         val_files += df_class.sample(frac=0.1, random_state=123)['file'].tolist()
33 |     val_files = list(set(val_files))
34 | 
35 |     is_val = df_train['file'].isin(val_files)
36 |     df_val = df_train[is_val].assign(split='val')
37 |     df_train = df_train[~is_val].assign(split='train')
38 |     df_eval = df_eval.assign(split='test')
39 | 
40 |     df = pd.concat([df_train, df_val, df_eval])
41 | 
42 |     # Encode labels.
43 |     label = df.groupby('file')['label'].apply(list)
44 |     label.iloc[:] = [encode(l) for l in label]
45 |     label = label.to_frame().reset_index()
46 |     df = df.drop_duplicates('file').drop('label', axis=1).merge(label, on='file')
47 | 
48 |     # Shuffle and shard.
49 |     df = shuffle(df, random_state=123)
50 |     for split in ['train', 'val', 'test']:
51 |         num_audios = sum(df['split'] == split)
52 |         num_shards = num_audios // num_audios_per_shard
53 |         num_remainders = num_audios % num_audios_per_shard
54 | 
55 |         shards = np.tile(np.arange(num_shards), num_audios_per_shard)
56 |         shards = np.concatenate([shards, np.arange(num_remainders) % num_shards])
57 |         shards = np.random.permutation(shards)
58 | 
59 |         df.loc[df['split'] == split, 'shard'] = shards
60 | 
61 |     df['shard'] = df['shard'].astype(int)
62 | 
63 |     return df
64 | 
65 | 
66 | def read_csv(path):
67 |     df = pd.read_csv(path, delimiter='\t', names=['file', 'start', 'end', 'label'])
68 |     return df
69 | 
70 | 
71 | def encode(label):
72 |     x = np.zeros(shape=len(CLASSES), dtype=np.float32)
73 |     x[[C2I[l] for l in label]] = 1.
74 |     return x
75 | 
--------------------------------------------------------------------------------
/lib/data/mtt.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from sklearn.utils import shuffle
 4 | 
 5 | CLASSES = ['choral', 'female voice', 'metal', 'country', 'weird', 'no voice', 'cello', 'harp', 'beats', 'female vocal',
 6 |            'male voice', 'dance', 'new age', 'voice', 'choir', 'classic', 'man', 'solo', 'sitar', 'soft', 'pop',
 7 |            'no vocal', 'male vocal', 'woman', 'flute', 'quiet', 'loud', 'harpsichord', 'no vocals', 'vocals', 'singing',
 8 |            'male', 'opera', 'indian', 'female', 'synth', 'vocal', 'violin', 'beat', 'ambient', 'piano', 'fast', 'rock',
 9 |            'electronic', 'drums', 'strings', 'techno', 'slow', 'classical', 'guitar']
10 | 
11 | 
12 | def make_dataset_info(dataset_dir, num_audios_per_shard=100, num_top=50):
13 |     """Reads the annotation file, takes the top N tags, and splits the data samples.
14 | 
15 |     Results in 54 columns (top-50 tags + [clip_id, mp3_path, split, shard]):
16 | 
17 |     ['choral', 'female voice', 'metal', 'country', 'weird', 'no voice',
18 |      'cello', 'harp', 'beats', 'female vocal', 'male voice', 'dance',
19 |      'new age', 'voice', 'choir', 'classic', 'man', 'solo', 'sitar', 'soft',
20 |      'pop', 'no vocal', 'male vocal', 'woman', 'flute', 'quiet', 'loud',
21 |      'harpsichord', 'no vocals', 'vocals', 'singing', 'male', 'opera',
22 |      'indian', 'female', 'synth', 'vocal', 'violin', 'beat', 'ambient',
23 |      'piano', 'fast', 'rock', 'electronic', 'drums', 'strings', 'techno',
24 |      'slow', 'classical', 'guitar', 'clip_id', 'mp3_path', 'split', 'shard']
25 | 
26 |     NOTE: This will exclude audios which have only zero-tags. Therefore, the number of
27 |     audios in each split will be 15250 / 1529 / 4332 (training / validation / test).
28 | 
29 |     Args:
30 |       dataset_dir: A path to the dataset directory containing the annotation CSV file.
31 |       num_top: Number of the most popular tags to take.
32 |       num_audios_per_shard: Number of audios per shard.
33 | 
34 |     Returns:
35 |       A DataFrame containing information about the audios.
36 | 
37 |     Schema:
38 |       <tag>: 0 or 1
39 |       clip_id: clip_id of the original dataset
40 |       mp3_path: A path to a mp3 audio file.
41 |       split: A split of dataset (training / validation / test).
42 |         The split is determined by its directory (0, 1, ..., f).
43 |         First 12 directories (0 ~ b) are used for training,
44 |         1 (c) for validation, and 3 (d ~ f) for test.
45 |       shard: A shard index of the audio.
46 |     """
47 |     df = pd.read_csv(dataset_dir + '/raw/annotations_final.csv', delimiter='\t')
48 | 
49 |     # Calculate TOP 50 tags.
50 |     top50 = (df.drop(['clip_id', 'mp3_path'], axis=1)
51 |              .sum()
52 |              .sort_values()
53 |              .tail(num_top)
54 |              .index
55 |              .tolist())
56 | 
57 |     # Select TOP 50 columns.
58 |     df = df[top50 + ['clip_id', 'mp3_path']]
59 | 
60 |     # Select rows which have at least one label.
61 |     df = df.loc[df.iloc[:, :num_top].any(axis=1)]
62 | 
63 |     def split_by_directory(mp3_path):
64 |         directory = mp3_path.split('/')[0]
65 |         part = int(directory, 16)
66 | 
67 |         if part in range(12):
68 |             return 'train'
69 |         elif part == 12:  # `is` comparison on ints is a bug; use `==`.
70 |             return 'val'
71 |         elif part in range(13, 16):
72 |             return 'test'
73 | 
74 |     # Split by directories.
75 |     df['split'] = df['mp3_path'].apply(lambda mp3_path: split_by_directory(mp3_path))
76 | 
77 |     df = shuffle(df)
78 |     for split in ['train', 'val', 'test']:
79 |         num_audios = sum(df['split'] == split)
80 |         num_shards = num_audios // num_audios_per_shard
81 |         num_remainders = num_audios % num_audios_per_shard
82 | 
83 |         shards = np.tile(np.arange(num_shards), num_audios_per_shard)
84 |         shards = np.concatenate([shards, np.arange(num_remainders) % num_shards])
85 |         shards = np.random.permutation(shards)
86 | 
87 |         df.loc[df['split'] == split, 'shard'] = shards
88 | 
89 |     df['shard'] = df['shard'].astype(int)
90 | 
91 |     # To unified format.
92 |     paths = [f'{dataset_dir}/raw/mp3/{p}' for p in df['mp3_path']]
93 |     labels = [label for label in df.iloc[:, :num_top].values]
94 | 
95 |     df = pd.DataFrame({'id': df['clip_id'], 'label': labels, 'split': df['split'], 'shard': df['shard'], 'path': paths})
96 | 
97 |     return df
98 | 
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import numpy as np
 3 | import tensorflow as tf
 4 | import lib.data as data
 5 | from sklearn import metrics
 6 | from lib.data.batch import create_datasets
 7 | from lib.data.config import *
 8 | from lib.initialization import AudioVarianceScaling
 9 | from lib.utils import mkpath
10 | 
11 | 
12 | def main(args):
13 |     args.model_path = mkpath(args.model_path)
14 |     args.dataset = args.dataset or args.model_path.split('/')[-2].split('-')[1]  # extract dataset name from train_dir.
15 | 
16 |     if args.dataset == 'mtt':
17 |         config = MTT_CONFIG
18 |         classes = data.mtt.CLASSES
19 |     elif args.dataset == 'scd':
20 |         config = SCD_CONFIG
21 |         classes = data.scd.CLASSES
22 |     elif args.dataset == 'dcs':
23 |         config = DCS_CONFIG
24 |         classes = data.dcs.CLASSES
25 |     else:
26 |         raise Exception('Not implemented.')
27 | 
28 |     # Create training, validation, and test datasets.
29 |     dataset_path = mkpath(args.data_dir, args.dataset, 'tfrecord')
30 |     dataset_test = create_datasets(dataset_path, args.batch_size, args.num_readers, config, only_test=True)
31 | 
32 |     # Load the trained model.
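    # Keras can only rebuild objects it knows about, so the custom
    # AudioVarianceScaling initializer (and the bare `tf` module, in case any
    # saved layer references it) must be supplied through `custom_objects`.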
33 | model = tf.keras.models.load_model(args.model_path, 34 | custom_objects={'AudioVarianceScaling': AudioVarianceScaling, 'tf': tf}) 35 | 36 | # Evaluate 37 | evaluate(model, dataset_test, config, classes=classes) 38 | 39 | 40 | def evaluate(model, dataset_test, config, classes=None): 41 | # Create the iterator. 42 | iterator = dataset_test.make_one_shot_iterator() 43 | seg, label = iterator.get_next() 44 | 45 | # Get dynamic shapes. 46 | seg_shape = tf.shape(seg) 47 | batch_size, num_segments, num_samples = seg_shape[0], seg_shape[1], seg_shape[2] 48 | num_classes = tf.shape(label)[1] 49 | 50 | seg = tf.reshape(seg, shape=(batch_size * num_segments, num_samples, 1)) 51 | pred_segs = model(seg) # predict all segments 52 | pred_segs = tf.reshape(pred_segs, shape=(batch_size, num_segments, num_classes)) 53 | pred = tf.reduce_mean(pred_segs, axis=1) # Average segments for each audio 54 | 55 | y_true, y_prob = [], [] 56 | sess = tf.keras.backend.get_session() 57 | while True: 58 | try: 59 | label_batch, pred_batch = sess.run([label, pred], feed_dict={tf.keras.backend.learning_phase(): 0}) 60 | y_true.append(label_batch) 61 | y_prob.append(pred_batch) 62 | except tf.errors.OutOfRangeError: 63 | break 64 | 65 | y_true, y_prob = np.concatenate(y_true), np.concatenate(y_prob) 66 | rocauc = metrics.roc_auc_score(y_true, y_prob, average='macro') 67 | prauc = metrics.average_precision_score(y_true, y_prob, average='macro') 68 | 69 | y_pred = (y_prob > config.threshold).astype(np.float32) 70 | acc = metrics.accuracy_score(y_true, y_pred) 71 | f1 = metrics.f1_score(y_true, y_pred, average='samples') 72 | 73 | if classes is not None: 74 | print(f'\n=> Individual scores of {len(classes)} classes') 75 | for i, cls in enumerate(classes): 76 | cls_rocauc = metrics.roc_auc_score(y_true[:, i], y_prob[:, i]) 77 | cls_prauc = metrics.average_precision_score(y_true[:, i], y_prob[:, i]) 78 | cls_acc = metrics.accuracy_score(y_true[:, i], y_pred[:, i]) 79 | cls_f1 = metrics.f1_score(y_true[:, i], y_pred[:, i]) 80 | print(f'[{i:2} {cls:30}] rocauc={cls_rocauc:.4f} prauc={cls_prauc:.4f} acc={cls_acc:.4f} f1={cls_f1:.4f}') 81 | print() 82 | 83 | print(f'=> Test scores: rocauc={rocauc:.6f}\tprauc={prauc:.6f}\tacc={acc:.6f}\tf1={f1:.6f}') 84 | return rocauc, prauc, acc, f1 85 | 86 | 87 | if __name__ == '__main__': 88 | parser = argparse.ArgumentParser(description='Evaluate a SampleCNN.') 89 | parser.add_argument('dataset', type=str, metavar='DATASET', 90 | choices=['mtt', 'scd', 'dcs'], help='Dataset for training: {mtt|scd|dcs}') 91 | parser.add_argument('model_path', type=str, metavar='PATH', help='Path to the saved model.') 92 | parser.add_argument('--data-dir', type=str, default='./data', metavar='PATH') 93 | parser.add_argument('--batch-size', type=int, default=23, metavar='N', help='Mini-batch size.') 94 | parser.add_argument('--num-readers', type=int, default=8, metavar='N', help='Number of TFRecord readers.') 95 | 96 | args = parser.parse_args() 97 | 98 | main(args) 99 | print('\n=> Done.\n') 100 | -------------------------------------------------------------------------------- /lib/model.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.models import Model 2 | from tensorflow.keras.layers import (Conv1D, MaxPool1D, BatchNormalization, GlobalAvgPool1D, Multiply, GlobalMaxPool1D, 3 | Dense, Dropout, Activation, Reshape, Concatenate, Add, Input) 4 | from tensorflow.keras.regularizers import l2 5 | from lib.initialization import taejun_uniform 6 | 7 | 8 
| def squeeze_excitation(x, amplifying_ratio, name): 9 | num_features = x.shape[-1].value 10 | x = GlobalAvgPool1D(name=f'{name}_squeeze')(x) 11 | x = Reshape((1, num_features), name=f'{name}_reshape')(x) 12 | x = Dense(num_features * amplifying_ratio, activation='relu', 13 | kernel_initializer='glorot_uniform', name=f'{name}_ex0')(x) 14 | x = Dense(num_features, activation='sigmoid', kernel_initializer='glorot_uniform', name=f'{name}_ex1')(x) 15 | return x 16 | 17 | 18 | def basic_block(x, num_features, cfg, name): 19 | """Block for basic models.""" 20 | x = Conv1D(num_features, kernel_size=3, padding='same', use_bias=True, 21 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer=taejun_uniform(), name=f'{name}_conv')(x) 22 | x = BatchNormalization(name=f'{name}_norm')(x) 23 | x = Activation('relu', name=f'{name}_relu')(x) 24 | x = MaxPool1D(pool_size=3, name=f'{name}_pool')(x) 25 | return x 26 | 27 | 28 | def se_block(x, num_features, cfg, name): 29 | """Block for SE models.""" 30 | x = basic_block(x, num_features, cfg, name) 31 | x = Multiply(name=f'{name}_scale')([x, squeeze_excitation(x, cfg.amplifying_ratio, name)]) 32 | return x 33 | 34 | 35 | def rese_block(x, num_features, cfg, name): 36 | """Block for Res-N & ReSE-N models.""" 37 | if num_features != x.shape[-1].value: 38 | shortcut = Conv1D(num_features, kernel_size=1, padding='same', use_bias=True, name=f'{name}_scut_conv', 39 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer='glorot_uniform')(x) 40 | shortcut = BatchNormalization(name=f'{name}_scut_norm')(shortcut) 41 | else: 42 | shortcut = x 43 | 44 | for i in range(cfg.num_convs): 45 | if i > 0: 46 | x = Activation('relu', name=f'{name}_relu{i-1}')(x) 47 | x = Dropout(0.2, name=f'{name}_drop{i-1}')(x) 48 | x = Conv1D(num_features, kernel_size=3, padding='same', use_bias=True, 49 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer=taejun_uniform(), name=f'{name}_conv{i}')(x) 50 | x = BatchNormalization(name=f'{name}_norm{i}')(x) 51 | 52 | # Add SE if it is ReSE block. 53 | if cfg.amplifying_ratio: 54 | x = Multiply(name=f'{name}_scale')([x, squeeze_excitation(x, cfg.amplifying_ratio, name)]) 55 | 56 | x = Add(name=f'{name}_scut')([shortcut, x]) 57 | x = Activation('relu', name=f'{name}_relu1')(x) 58 | x = MaxPool1D(pool_size=3, name=f'{name}_pool')(x) 59 | return x 60 | 61 | 62 | def SampleCNN(cfg): 63 | """Build a SampleCNN model.""" 64 | # Variable-length input for feature visualization. 65 | x_in = Input(shape=(None, 1), name='input') 66 | 67 | num_features = cfg.init_features 68 | x = Conv1D(num_features, kernel_size=3, strides=3, padding='same', use_bias=True, 69 | kernel_regularizer=l2(cfg.weight_decay), kernel_initializer=taejun_uniform(scale=1.), name='conv0')(x_in) 70 | x = BatchNormalization(name='norm0')(x) 71 | x = Activation('relu', name='relu0')(x) 72 | 73 | # Stack convolutional blocks. 74 | layer_outputs = [] 75 | for i in range(cfg.num_blocks): 76 | num_features *= 2 if (i == 2 or i == (cfg.num_blocks - 1)) else 1 77 | x = cfg.block_fn(x, num_features, cfg, f'block{i}') 78 | layer_outputs.append(x) 79 | 80 | if cfg.multi: # Use multi-level feature aggregation or not. 81 | x = Concatenate(name='multi')([GlobalMaxPool1D(name=f'final_pool{i}')(output) 82 | for i, output in enumerate(layer_outputs[-3:])]) 83 | else: 84 | x = GlobalMaxPool1D(name='final_pool')(x) 85 | 86 | # The final two FCs. 
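    # With the default MTT configuration (init_features=128, num_blocks=9), the
    # block widths are 128, 128, 256, ..., 256, 512, so the 'multi' concatenation
    # of the last three outputs is 256 + 256 + 512 = 1024-dimensional; the first
    # FC below keeps that width via x.shape[-1].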
87 | x = Dense(x.shape[-1].value, kernel_initializer='glorot_uniform', name='final_fc')(x) 88 | x = BatchNormalization(name='final_norm')(x) 89 | x = Activation('relu', name='final_relu')(x) 90 | if cfg.dropout > 0.: 91 | x = Dropout(cfg.dropout, name='final_drop')(x) 92 | x = Dense(cfg.num_classes, kernel_initializer='glorot_uniform', name='logit')(x) 93 | x = Activation(cfg.activation, name='pred')(x) 94 | 95 | return Model(inputs=[x_in], outputs=[x], name='sample_cnn') 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /log 2 | /data 3 | /.idea 4 | /out 5 | 6 | # Created by .ignore support plugin (hsz.mobi) 7 | ### Python template 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | ### macOS template 112 | # General 113 | .DS_Store 114 | .AppleDouble 115 | .LSOverride 116 | 117 | # Icon must end with two \r 118 | Icon 119 | 120 | # Thumbnails 121 | ._* 122 | 123 | # Files that might appear in the root of a volume 124 | .DocumentRevisions-V100 125 | .fseventsd 126 | .Spotlight-V100 127 | .TemporaryItems 128 | .Trashes 129 | .VolumeIcon.icns 130 | .com.apple.timemachine.donotpresent 131 | 132 | # Directories potentially created on remote AFP share 133 | .AppleDB 134 | .AppleDesktop 135 | Network Trash Folder 136 | Temporary Items 137 | .apdisk 138 | ### JetBrains template 139 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 140 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 141 | 142 | # User-specific stuff 143 | .idea/**/workspace.xml 144 | .idea/**/tasks.xml 145 | .idea/**/usage.statistics.xml 146 | .idea/**/dictionaries 147 | .idea/**/shelf 148 | 149 | # Sensitive or high-churn files 150 | .idea/**/dataSources/ 151 | 
.idea/**/dataSources.ids
152 | .idea/**/dataSources.local.xml
153 | .idea/**/sqlDataSources.xml
154 | .idea/**/dynamic.xml
155 | .idea/**/uiDesigner.xml
156 | .idea/**/dbnavigator.xml
157 | 
158 | # Gradle
159 | .idea/**/gradle.xml
160 | .idea/**/libraries
161 | 
162 | # Gradle and Maven with auto-import
163 | # When using Gradle or Maven with auto-import, you should exclude module files,
164 | # since they will be recreated, and may cause churn. Uncomment if using
165 | # auto-import.
166 | # .idea/modules.xml
167 | # .idea/*.iml
168 | # .idea/modules
169 | 
170 | # CMake
171 | cmake-build-*/
172 | 
173 | # Mongo Explorer plugin
174 | .idea/**/mongoSettings.xml
175 | 
176 | # File-based project format
177 | *.iws
178 | 
179 | # IntelliJ
180 | out/
181 | 
182 | # mpeltonen/sbt-idea plugin
183 | .idea_modules/
184 | 
185 | # JIRA plugin
186 | atlassian-ide-plugin.xml
187 | 
188 | # Cursive Clojure plugin
189 | .idea/replstate.xml
190 | 
191 | # Crashlytics plugin (for Android Studio and IntelliJ)
192 | com_crashlytics_export_strings.xml
193 | crashlytics.properties
194 | crashlytics-build.properties
195 | fabric.properties
196 | 
197 | # Editor-based Rest Client
198 | .idea/httpRequests
199 | ### Linux template
200 | *~
201 | 
202 | # temporary files which can be created if a process still has a handle open of a deleted file
203 | .fuse_hidden*
204 | 
205 | # KDE directory preferences
206 | .directory
207 | 
208 | # Linux trash folder which might appear on any partition or disk
209 | .Trash-*
210 | 
211 | # .nfs files are created when an open file is removed but is still being accessed
212 | .nfs*
213 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import math
 3 | import os
 4 | import tensorflow as tf
 5 | import tensorflow.keras.backend as K
 6 | from datetime import datetime
 7 | from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
 8 | from lib.model import SampleCNN
 9 | from lib.model_config import ModelConfig
10 | from lib.data.batch import create_datasets
11 | from lib.data.config import *
12 | from lib.initialization import AudioVarianceScaling
13 | from lib.utils import mkpath
14 | from eval import evaluate
15 | 
16 | 
17 | def main(args):
18 |     print(f'=> Dataset: {args.dataset}')
19 |     if args.dataset == 'mtt':
20 |         config = MTT_CONFIG
21 |     elif args.dataset == 'scd':
22 |         config = SCD_CONFIG
23 |     elif args.dataset == 'dcs':
24 |         config = DCS_CONFIG
25 |     else:
26 |         raise Exception(f'Not implemented dataset: {args.dataset}')
27 | 
28 |     dataset_path = mkpath(args.data_dir, args.dataset)
29 |     tfrecord_path = f'{dataset_path}/tfrecord'
30 | 
31 |     # Configure the model. (weight_decay is passed through; the original call omitted it, silently ignoring --weight-decay.)
32 |     model_config = ModelConfig(block=args.block, amplifying_ratio=args.amplifying_ratio, multi=args.multi,
33 |                                num_blocks=config.num_blocks, dropout=args.dropout, activation=config.activation,
34 |                                num_classes=config.num_classes, weight_decay=args.weight_decay)
35 | 
36 |     # Set the training directory.
37 |     args.train_dir = mkpath(args.log_dir, datetime.now().strftime('%Y%m%d_%H%M%S') + f'-{args.dataset}')
38 |     if args.name is None:
39 |         args.name = model_config.get_signature()
40 |     args.train_dir += '-' + args.name
41 |     os.makedirs(args.train_dir, exist_ok=False)
42 |     print('=> Training directory: ' + args.train_dir)
43 | 
44 |     # Create training, validation, and test datasets.
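    # create_datasets() yields shuffled per-segment batches for train/val, while
    # the test set is batched per audio as a sequence of segments so that
    # evaluate() can average the segment predictions of each clip.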
45 | dataset_train, dataset_val, dataset_test = create_datasets(tfrecord_path, args.batch_size, args.num_readers, config) 46 | 47 | model = SampleCNN(model_config) 48 | model_config.print_summary() 49 | 50 | num_params = int(sum([K.count_params(p) for p in set(model.trainable_weights)])) 51 | print(f'=> #params: {num_params:,}') 52 | 53 | for stage in range(args.num_stages): 54 | print(f'=> Stage {stage}') 55 | # Set the learning rate of current stage 56 | lr = args.lr * (args.lr_decay ** stage) 57 | # Train the network. 58 | train(model, lr, dataset_train, dataset_val, config, args) 59 | # Load the best model. 60 | model = tf.keras.models.load_model(f'{args.train_dir}/best.h5', 61 | custom_objects={'AudioVarianceScaling': AudioVarianceScaling, 'tf': tf}) 62 | # Evaluate. 63 | rocauc, prauc, acc, f1 = evaluate(model, dataset_test, config) 64 | 65 | # Change the file name of the best checkpoint with the scores. 66 | os.rename(f'{args.train_dir}/best.h5', f'{args.train_dir}/final-auc_{rocauc:.6f}-acc_{acc:.6f}-f1_{f1:.6f}.h5') 67 | # Report the final scores. 68 | print(f'=> FINAL SCORES [{args.dataset}] {args.name}: ' 69 | f'rocauc={rocauc:.6f}, acc={acc:.6f}, f1={f1:.6f}, prauc={prauc:.6f}') 70 | 71 | model_config.print_summary() 72 | 73 | return rocauc, prauc, acc, f1 74 | 75 | 76 | def train(model, lr, dataset_train, dataset_val, config, args): 77 | # Define a optimizer and compile the model. 78 | optimizer = tf.keras.optimizers.SGD(lr=lr, momentum=args.momentum, decay=1e-6, nesterov=True) 79 | model.compile(optimizer, loss=config.loss, metrics=config.metrics) 80 | 81 | # Setup callbacks. 82 | early_stopping = EarlyStopping(monitor='val_loss', patience=args.patience) 83 | checkpointer_best = ModelCheckpoint(f'{args.train_dir}/best.h5', monitor='val_loss', save_best_only=True) 84 | 85 | # Train! 
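    # num_train_segs = num_train_audios * num_segments (see DatasetConfig), so
    # one epoch below visits every training segment once.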
86 |     steps_train = int(math.ceil(config.num_train_segs / args.batch_size))
87 |     steps_val = int(math.ceil(config.num_val_segs / args.batch_size))
88 |     model.fit(dataset_train, epochs=100, steps_per_epoch=steps_train,
89 |               validation_data=dataset_val, validation_steps=steps_val,
90 |               callbacks=[early_stopping, checkpointer_best])
91 | 
92 | 
93 | def parse_args():
94 |     parser = argparse.ArgumentParser(description='Train a SampleCNN.')
95 |     parser.add_argument('dataset', type=str, metavar='DATASET',
96 |                         choices=['mtt', 'scd', 'dcs'], help='Dataset for training: {mtt|scd|dcs}')
97 |     parser.add_argument('name', type=str, metavar='NAME', nargs='?', help='Name of log directory.')
98 |     parser.add_argument('--data-dir', type=str, default='./data', metavar='PATH')
99 |     parser.add_argument('--log-dir', type=str, default='./log', metavar='PATH',
100 |                         help='Directory where to write event logs and models.')
101 | 
102 |     parser.add_argument('--block', type=str, default='se', choices=['basic', 'se', 'res1', 'res2', 'rese1', 'rese2'],
103 |                         help='Convolutional block to build a model (default: se, options: basic/se/res1/res2/rese1/rese2).')
104 |     parser.add_argument('--amplifying-ratio', type=float, default=0.125, metavar='N')
105 |     parser.add_argument('--multi', action='store_true', help='Use multi-level feature aggregation.')
106 | 
107 |     parser.add_argument('--batch-size', type=int, default=23, metavar='N', help='Mini-batch size.')
108 |     parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='Momentum for SGD.')
109 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='Learning rate.')
110 |     parser.add_argument('--lr-decay', type=float, default=0.2, metavar='DC', help='Learning rate decay rate.')
111 | 
112 |     parser.add_argument('--dropout', type=float, default=0.5, metavar='DO', help='Dropout rate.')
113 |     parser.add_argument('--weight-decay', type=float, default=0., metavar='WD', help='Weight decay.')
114 | 
115 |     parser.add_argument('--num-stages', type=int, default=5, metavar='N', help='Number of stages to train.')
116 |     parser.add_argument('--patience', type=int, default=2, metavar='N', help='Stop training stage after #patiences.')
117 | 
118 |     parser.add_argument('--num-readers', type=int, default=8, metavar='N', help='Number of TFRecord readers.')
119 | 
120 |     return parser.parse_args()
121 | 
122 | 
123 | if __name__ == '__main__':
124 |     args = parse_args()
125 | 
126 |     main(args)
127 | 
128 |     print('\n=> Done.\n')
129 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SampleCNNs for Audio Classification
  2 | This repository contains the code used for the publication below:
  3 | > Taejun Kim, Jongpil Lee, and Juhan Nam, "Comparison and Analysis of SampleCNN Architectures for Audio Classification"
  4 | in IEEE Journal of Selected Topics in Signal Processing (JSTSP), 2019.
  5 | 
  6 | 
  7 | 
  8 | Contents:
  9 | * Install Dependencies
 10 | * Building Datasets
 11 |   * Music auto-tagging: MagnaTagATune
 12 |   * Keyword spotting: Speech Commands
 13 |   * Acoustic scene tagging: DCASE 2017 Task 4
 14 | * Training a SampleCNN
 15 | 
 16 | ## Install Dependencies
 17 | NOTE: The code in this repository is written and tested on **Python 3.6**.
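If you want to confirm the interpreter and TensorFlow versions before proceeding, a quick check (a suggested snippet, not part of the original setup) is:
```sh
python -c "import sys; print(sys.version.split()[0])"
python -c "import tensorflow as tf; print(tf.__version__)"
```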
 18 | 
 19 | * tensorflow 1.10.X (use of 1.10.X is strongly recommended because of version compatibility)
 20 | * librosa
 21 | * ffmpeg
 22 | * pandas
 23 | * numpy
 24 | * scikit-learn
 25 | * h5py
 26 | 
 27 | To install the required python packages using conda, run the command below:
 28 | ```sh
 29 | conda install tensorflow-gpu=1.10.0 ffmpeg pandas numpy scikit-learn h5py
 30 | conda install -c conda-forge librosa
 31 | ```
 32 | 
 33 | 
 34 | ## Building Datasets
 35 | Download and preprocess the dataset that you want to train a model on.
 36 | 
 37 | ### Music auto-tagging: [MagnaTagATune][2]
 38 | > Edith Law, Kris West, Michael Mandel, Mert Bay and J. Stephen Downie (2009).
 39 | [Evaluation of algorithms using games: the case of music annotation.][1]
 40 | In Proceedings of the 10th International Conference on Music Information Retrieval (ISMIR).
 41 | 
 42 | Create a directory for the dataset and download the required files (one `.csv` file and three `.zip` files) into the directory `data/mtt/raw`:
 43 | ```sh
 44 | mkdir -p data/mtt/raw
 45 | cd data/mtt/raw
 46 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/annotations_final.csv
 47 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/mp3.zip.001
 48 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/mp3.zip.002
 49 | wget http://mi.soi.city.ac.uk/datasets/magnatagatune/mp3.zip.003
 50 | ```
 51 | 
 52 | After downloading the files, merge and expand the three `.zip` files:
 53 | ```sh
 54 | cat mp3.zip.* > mp3_all.zip
 55 | unzip mp3_all.zip -d mp3
 56 | ```
 57 | 
 58 | Your directory structure should look like this:
 59 | ```sh
 60 | data
 61 | └── mtt
 62 |     └── raw
 63 |         ├── annotations_final.csv
 64 |         └── mp3
 65 |             ├── 0
 66 |             ├── ...
 67 |             └── f
 68 | ```
 69 | 
 70 | Finally, segment and convert the audios to TFRecords using the following command:
 71 | ```sh
 72 | python build_dataset.py mtt
 73 | ```
 74 | 
 75 | 
 76 | ### Keyword spotting: [Speech Commands][3]
 77 | > Pete Warden (2018).
 78 | [Speech commands: A dataset for limited-vocabulary speech recognition.][4]
 79 | arXiv:1804.03209.
 80 | 
 81 | After creating a directory for the dataset, download and expand the dataset in the directory `data/scd/raw`:
 82 | ```sh
 83 | mkdir -p data/scd/raw
 84 | cd data/scd/raw
 85 | wget http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz
 86 | tar zxvf speech_commands_v0.02.tar.gz
 87 | ```
 88 | 
 89 | 
 90 | Finally, segment and convert the audios to TFRecords using the following command:
 91 | ```sh
 92 | python build_dataset.py scd
 93 | ```
 94 | 
 95 | 
 96 | ### Acoustic scene tagging: [DCASE 2017 Task 4][5]
 97 | > Annamaria Mesaros, Toni Heittola, Aleksandr Diment, Benjamin Elizalde, Ankit Shah, Emmanuel Vincent, Bhiksha Raj and Tuomas Virtanen (2017).
 98 | [DCASE 2017 challenge setup: tasks, datasets and baseline system.][6]
 99 | In Proceedings of the Detection and Classification of Acoustic Scenes and Events 2017 Workshop (DCASE2017).
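
Create a directory for the dataset, then download the audio archives and the weak-label ground-truth CSVs into `data/dcs/raw` and expand them (the training, testing, and evaluation archives are password-protected, as reflected by the `-P` flags below):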
100 | 
101 | ```sh
102 | mkdir -p data/dcs/raw
103 | cd data/dcs/raw
104 | 
105 | wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1HOQaUHbTgCRsS6Sr9I9uE6uCjiNPC3d3' -O Task_4_DCASE_2017_training_set.zip
106 | wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1GfP5JATSmCqD8p3CBIkk1J90mfJuPI-k' -O Task_4_DCASE_2017_testing_set.zip
107 | wget https://dl.dropboxusercontent.com/s/bbgqfd47cudwe9y/DCASE_2017_evaluation_set_audio_files.zip
108 | 
109 | unzip -P DCASE_2017_training_set Task_4_DCASE_2017_training_set.zip
110 | unzip -P DCASE_2017_testing_set Task_4_DCASE_2017_testing_set.zip
111 | unzip -P DCASE_2017_evaluation_set DCASE_2017_evaluation_set_audio_files.zip
112 | 
113 | wget https://github.com/ankitshah009/Task-4-Large-scale-weakly-supervised-sound-event-detection-for-smart-cars/raw/master/groundtruth_release/groundtruth_weak_label_training_set.csv
114 | wget https://github.com/ankitshah009/Task-4-Large-scale-weakly-supervised-sound-event-detection-for-smart-cars/raw/master/groundtruth_release/groundtruth_weak_label_testing_set.csv
115 | wget https://github.com/ankitshah009/Task-4-Large-scale-weakly-supervised-sound-event-detection-for-smart-cars/raw/master/groundtruth_release/groundtruth_weak_label_evaluation_set.csv
116 | ```
117 | 
118 | Finally, segment and convert the audios to TFRecords using the following command:
119 | ```sh
120 | python build_dataset.py dcs
121 | ```
122 | 
123 | ## Training a SampleCNN
124 | You can train a SampleCNN with any block on any of the datasets.
125 | Here are several examples of how to run training:
126 | ```sh
127 | # Train a SampleCNN with SE block (default) on the MagnaTagATune dataset (music auto-tagging)
128 | python train.py mtt
129 | 
130 | # Train a SampleCNN with ReSE-2 block on the Speech Commands dataset (keyword spotting)
131 | python train.py scd --block rese2
132 | 
133 | # Train a SampleCNN with basic block on the DCASE 2017 Task 4 dataset (acoustic scene tagging)
134 | python train.py dcs --block basic
135 | ```
136 | Trained models are saved under the `log` directory, named with the datetime at which training started.
137 | Here is an example of a saved model:
138 | ```sh
139 | log/
140 | └── 20190424_213449-scd-se/
141 |     └── final-auc_0.XXXXXX-acc_0.XXXXXX-f1_0.XXXXXX.h5
142 | ```
143 | 
144 | You can see the available options for training using the command below:
145 | ```sh
146 | $ python train.py -h
147 | 
148 | usage: train.py [-h] [--data-dir PATH] [--log-dir PATH]
149 |                 [--block {basic,se,res1,res2,rese1,rese2}]
150 |                 [--amplifying-ratio N] [--multi] [--batch-size N]
151 |                 [--momentum M] [--lr LR] [--lr-decay DC] [--dropout DO]
152 |                 [--weight-decay WD] [--num-stages N] [--patience N]
153 |                 [--num-readers N]
154 |                 DATASET [NAME]
155 | 
156 | Train a SampleCNN.
157 | 
158 | positional arguments:
159 |   DATASET               Dataset for training: {mtt|scd|dcs}
160 |   NAME                  Name of log directory.
161 | 
162 | optional arguments:
163 |   -h, --help            show this help message and exit
164 |   --data-dir PATH
165 |   --log-dir PATH        Directory where to write event logs and models.
166 |   --block {basic,se,res1,res2,rese1,rese2}
167 |                         Convolutional block to build a model (default: se,
168 |                         options: basic/se/res1/res2/rese1/rese2).
169 |   --amplifying-ratio N
170 |   --multi               Use multi-level feature aggregation.
171 |   --batch-size N        Mini-batch size.
172 |   --momentum M          Momentum for SGD.
173 |   --lr LR               Learning rate.
174 |   --lr-decay DC         Learning rate decay rate.
175 |   --dropout DO          Dropout rate.
176 |   --weight-decay WD     Weight decay.
177 | --num-stages N Number of stages to train. 178 | --patience N Stop training stage after #patiences. 179 | --num-readers N Number of TFRecord readers. 180 | ``` 181 | 182 | 183 | [1]: http://ismir2009.ismir.net/proceedings/OS5-5.pdf 184 | [2]: http://mirg.city.ac.uk/codeapps/the-magnatagatune-dataset 185 | [3]: https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html 186 | [4]: https://arxiv.org/pdf/1804.03209.pdf 187 | [5]: http://dcase.community/challenge2017/task-large-scale-sound-event-detection 188 | [6]: http://dcase.community/documents/workshop2017/proceedings/DCASE2017Workshop_Mesaros_100.pdf 189 | --------------------------------------------------------------------------------