├── .gitignore
├── README.md
├── data_utils.py
├── eval.py
├── get_csv_data.py
├── metrics.py
├── metrics_test.py
├── pipeline.py
├── pipeline_test.py
├── requirements.txt
├── sample_answer.json
├── sj_train.py
├── swa.py
├── trainer.py
├── transforms.py
├── transforms_test.py
├── utils.py
└── utils_test.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | tensorboard_log/
132 | *.h5
133 | *.csv
134 | *.log
135 | *.pickle
136 | *.npy
137 | *.wav


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # challenge
 2 | ready for AI grand challenge
 3 | 
 4 | 
 5 | ## Contributors:
 6 | 
 7 | ### Prof. Jong-hwan Ko, Sungkyunkwan University
 8 | ### Phd. Ji-ho Chang
 9 | ### Tae-soo Kim
10 | ### Daniel Rho
11 | ### Seung-jin Lee
12 | 


--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torchaudio
  3 | import tensorflow as tf
  4 | 
  5 | from utils import EPSILON, safe_div
  6 | from transforms import mask
  7 | 
  8 | 
  9 | def load_wav(wav_fname: str):
 10 |     '''
 11 |     OUTPUT
 12 |     complex_specs: list of complex spectrograms
 13 |                    each complex spectrogram has shape of
 14 |                    [freq, time, chan*2]
 15 |     '''
 16 | 
 17 |     stft = torchaudio.transforms.Spectrogram(512, power=None)
 18 | 
 19 |     wav, r = torchaudio.load(wav_fname)
 20 |     wav = torchaudio.compliance.kaldi.resample_waveform(
 21 |         wav, r, 16000)
 22 |     wav = normalize(wav)
 23 |     wav = stft(wav)
 24 | 
 25 |     # [chan, freq, time, 2] -> [freq, time, chan, 2]
 26 |     wav = wav.numpy().transpose(1, 2, 3, 0)
 27 |     wav = wav.reshape((*wav.shape[:2], -1))
 28 | 
 29 |     return wav
 30 | 
 31 | 
 32 | def normalize(wav):
 33 |     rms = torch.sqrt(torch.mean(torch.pow(wav, 2))) * 10
 34 |     return wav / rms
 35 | 
 36 | 
 37 | def minmax(x, y=None):
 38 |     # batch-wise pre-processing
 39 |     axis = tuple(range(1, len(x.shape)))
 40 | 
 41 |     # MIN-MAX
 42 |     x_max = tf.math.reduce_max(x, axis=axis, keepdims=True)
 43 |     x_min = tf.math.reduce_min(x, axis=axis, keepdims=True)
 44 |     x = safe_div(x-x_min, x_max-x_min)
 45 |     if y is not None:
 46 |         return x, y
 47 |     return x
 48 | 
 49 | 
 50 | def log_on_mel(mel, labels=None):
 51 |     mel = tf.math.log(mel + EPSILON)
 52 | 
 53 |     if labels is not None:
 54 |         return mel, labels
 55 |     return mel
 56 | 
 57 | 
 58 | def augment(specs, labels, time_axis=-2, freq_axis=-3):
 59 |     specs = mask(specs, axis=time_axis, max_mask_size=24, n_mask=6)
 60 |     specs = mask(specs, axis=freq_axis, max_mask_size=16)
 61 |     return specs, labels
 62 | 
 63 | 
 64 | def to_frame_labels(x, y):
 65 |     """
 66 |     :param y: [..., n_voices, n_frames, n_classes]
 67 |     :return: [..., n_frames, n_classes]
 68 |     """
 69 |     y = tf.reduce_sum(y, axis=-3)
 70 |     return x, y
 71 | 
 72 | 
 73 | def mono_chan(x, y=None):
 74 |     if y is not None:
 75 |         return x[..., :1] + x[..., 1:], y
 76 |     return x
 77 | 
 78 | 
 79 | def stereo_mono(x, y=None):
 80 |     if y is None:
 81 |         return tf.concat([x[..., :2], x[..., :1] + x[..., 1:2], x[..., 2:4], x[..., 2:3] + x[..., 3:4]], -1)
 82 |     return tf.concat([x[..., :2], x[..., :1] + x[..., 1:2], x[..., 2:4], x[..., 2:3] + x[..., 3:4]], -1), y
 83 | 
 84 | 
 85 | def label_downsample(resolution=32):
 86 |     def _label_downsample(x, y):
 87 |         if isinstance(y, (list, tuple)):
 88 |             y_ = y[0]
 89 |             y_ = tf.keras.layers.AveragePooling1D(resolution, resolution, padding='same')(y_)
 90 |             y_ = tf.cast(y_ >= 0.5, y_.dtype)[:resolution]
 91 |             y = (y_,) + tuple([*y[1:]])
 92 |         else:
 93 |             y = tf.keras.layers.AveragePooling1D(resolution, resolution, padding='same')(y)
 94 |             y = tf.cast(y >= 0.5, y.dtype)[:resolution]
 95 | 
 96 |         return x, y
 97 |     return _label_downsample
 98 | 
 99 | 
100 | def random_merge_aug(number):
101 |     def _random_merge_aug(x, y=None):
102 |         chan = x.shape[-1] // 2
103 |         if chan != 2:
104 |             raise ValueError('This augment can be used in 2 channel audio')
105 | 
106 |         real = x[...,:chan]
107 |         imag = x[...,chan:]
108 |         
109 |         factor = tf.random.uniform((1, 1, number - chan), 0.1, 0.9)
110 |         aug_real = factor * tf.repeat(real[..., :1], number - chan, -1) + tf.sqrt(1 - factor) * tf.repeat(real[..., 1:], number - chan, -1)
111 |         
112 |         real = tf.concat([real, aug_real], -1)
113 |         imag = tf.concat([imag, tf.repeat(imag[...,:1] + imag[...,1:], number - chan, -1)], -1)
114 |         if y is not None:
115 |             return tf.concat([real, imag], -1), y
116 |         return tf.concat([real, imag], -1)
117 |     return _random_merge_aug
118 | 
119 | 
120 | def multiply_label(multiply_factor):
121 |     def _multiply_label(x, y):
122 |         return x, y * multiply_factor
123 |     return _multiply_label
124 | 
125 | 
126 | def stft_filter(filter_num):
127 |     def _stft_filter(x, y=None):
128 |         mask = tf.concat([tf.ones([1] + [*x.shape[1:]], x.dtype),
129 |                           tf.zeros([filter_num] + [*x.shape[1:]], x.dtype),
130 |                           tf.ones([x.shape[0] - filter_num - 1] + [*x.shape[1:]], x.dtype),
131 |                           ], 0)
132 |         x *= mask
133 |         if y is None:
134 |             return x
135 |         return x, y
136 |     return _stft_filter
137 | 
138 | 
139 | def speech_enhancement_preprocess(x, y=None):
140 |     """
141 |     :param y: ([..., n_voices, n_frames, n_classes], ..., ...)
142 |     :return: [..., n_frames, n_classes]
143 |     """
144 |     x = x[1:,...,:x.shape[-1] // 2]
145 |     if y is None:
146 |         return x
147 |     y = (tf.reduce_sum(y[0], axis=-3), y[1][1:, ...,:x.shape[-1] // 2], y[2][1:, ...,:x.shape[-1] // 2])
148 |     return x, y
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     import glob
153 |     wavs = glob.glob('/codes/2020_track3/t3_audio/*.wav')
154 |     print(wavs)
155 |     stfts = [load_wav(wav) for wav in wavs]
156 | 
157 |     for stft in stfts:
158 |         print(stft.shape)
159 | 
160 | 


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
 1 | from glob import glob
 2 | import tensorflow as tf
 3 | import json
 4 | 
 5 | from transforms import *
 6 | from utils import *
 7 | from data_utils import *
 8 | 
 9 | from sj_train import get_model, ARGS, random_merge_aug, stereo_mono, stft_filter, label_downsample_model
10 | from metrics import Challenge_Metric, output_to_metric, get_er, evaluate
11 |     
12 | 
13 | def minmax_log_on_mel(mel, labels=None):
14 |     # batch-wise pre-processing
15 |     axis = tuple(range(1, len(mel.shape)))
16 | 
17 |     # MIN-MAX
18 |     mel_max = tf.math.reduce_max(mel, axis=axis, keepdims=True)
19 |     mel_min = tf.math.reduce_min(mel, axis=axis, keepdims=True)
20 |     mel = safe_div(mel-mel_min, mel_max-mel_min)
21 | 
22 |     # LOG
23 |     mel = tf.math.log(mel + EPSILON)
24 | 
25 |     if labels is not None:
26 |         return mel, labels
27 |     return mel
28 | 
29 | 
30 | def second2frame(seconds: list, frame_num, resolution):
31 |     # seconds = [[class, start, end], ...]
32 |     frames = np.zeros([frame_num, 3], dtype=np.float32)
33 |     for second in seconds:
34 |         class_num = second[0]
35 |         start = int(np.round(second[1] * resolution))
36 |         end = int(np.round(second[2] * resolution))
37 |         frames[start:end,class_num] += 1
38 |     return tf.convert_to_tensor(frames, dtype=tf.float32)
39 | 
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     config = ARGS()
44 |     config.args.add_argument('--verbose', help='verbose', type=bool, default=True)
45 |     config.args.add_argument('--p', help='parsing name', action='store_true')
46 |     config.args.add_argument('--path', type=str, default='')
47 |     config = config.get()
48 |     if config.p:
49 |         parsed_name = config.name.split('_')
50 |         if parsed_name[0][0] not in ('B', 'v'):
51 |             parsed_name = parsed_name[1:]
52 |         if parsed_name[0] == 'vad':
53 |             config.model_type = 'vad'
54 |             config.model = 1
55 |         else:
56 |             config.model = int(parsed_name[0][-1])
57 |         config.v = int(parsed_name[1][-1])
58 |         config.n_mels = int(parsed_name[6][3:])
59 |         config.n_chan = int(parsed_name[7][-1])
60 |         config.n_frame = int(parsed_name[9].split('framelen')[-1])
61 |     os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus
62 | 
63 |     model = get_model(config)
64 |     model.load_weights(os.path.join(config.path, f'{config.name}.h5'))
65 |     final_score = evaluate(config, model, verbose=config.verbose)
66 | 
67 | 


--------------------------------------------------------------------------------
/get_csv_data.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | from glob import glob
  3 | import os
  4 | from numpy import max, mean
  5 | 
  6 | from tqdm import tqdm
  7 | 
  8 | from sj_train import ARGS, get_model
  9 | from metrics import evaluate
 10 | 
 11 | 
 12 | def main(config):
 13 |     os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus
 14 |     data_path = config.path
 15 |     paths = sorted(glob(os.path.join(data_path, '*.csv')))
 16 |     result_path = os.path.join(data_path, 'result.csv')
 17 |     category = ['이름', '모델', 'version', 'batch', 'lr', 'optimizer', 'loss function', 'input', 'chan', 'output', 'epoch', 'cos_sim', 'er', 'f1_score', 'loss', 'val_cos_sim', 'val_er', 'val_f1_score', 'val_loss', 'test_er', 'swa_test_er', 'sample_test_er']
 18 | 
 19 |     prev_lines = [category]
 20 |     
 21 |     if len(prev_lines) == 0:
 22 |         with open(result_path, 'w') as f:
 23 |             wr = csv.writer(f)
 24 |             wr.writerow(category)
 25 | 
 26 |     for path in tqdm(paths):
 27 |         if path == result_path:
 28 |             continue
 29 | 
 30 |         lines = []
 31 |         with open(path, 'r') as f:
 32 |             data = csv.reader(f)
 33 |             for i, line in enumerate(data):
 34 |                 if i == 0:
 35 |                     continue
 36 |                 lines.append(line)
 37 |         data = lines[max([len(lines)-config.patience, 0])]
 38 |         filename = os.path.splitext(path.split('/')[-1])[0]
 39 |         if 'vad' not in filename:
 40 |             name = filename[filename.find('B'):].split('_')
 41 |         else:
 42 |             name = filename[filename.find('vad'):].split('_')
 43 |         model_name = name[0]
 44 |         version = name[1][1:]
 45 |         lr = name[2][2:]
 46 |         batch = name[3].split('batch')[-1]
 47 |         opt = name[5]
 48 |         n_mel = name[6].split('mel')[-1]
 49 |         chan = name[7].split('chan')[-1]
 50 |         loss = name[8]
 51 |         framelen = name[9].split('framelen')[-1]
 52 |         if 'vad' in name:
 53 |             config.model_type = 'vad'
 54 |         elif 'se' in name:
 55 |             config.model_type = 'se'
 56 |         else:
 57 |             config.model_type = 'eff'
 58 |         evaluation = max([len(lines)-config.patience, 0]) > 5
 59 | 
 60 |         
 61 |         config.model = model_name[1:]
 62 |         config.v = int(version)
 63 |         config.n_mels = int(n_mel)
 64 |         config.n_chan = int(chan)
 65 |         config.n_frame = int(framelen)
 66 |         try:
 67 |             model = get_model(config)
 68 |         except ValueError:
 69 |             continue
 70 |         
 71 |         if config.model_type == 'se':
 72 |             output = str(tuple([i for i in model.output[0].shape[1:]]))
 73 |         else:
 74 |             output = str(tuple([i for i in model.output.shape[1:]]))
 75 |         data = [filename, 'vad' if config.model_type == 'vad' else model_name, version, batch, lr, opt, loss, str(tuple([i for i in model.input.shape[1:-1]])), chan, output] + data
 76 |         if os.path.exists(f'{os.path.splitext(path)[0]}.h5'):
 77 |             if evaluation:
 78 |                 try:
 79 |                     model.load_weights(f'{os.path.splitext(path)[0]}.h5')
 80 |                     score = evaluate(config, model, overlap_hop=int(framelen) // 2, verbose=True)
 81 |                 except:
 82 |                     continue
 83 |             else:
 84 |                 score = 1.0
 85 |             data += [mean(score)]
 86 |         else:
 87 |             data += 'None'
 88 | 
 89 |         if os.path.exists(f'{os.path.splitext(path)[0]}_SWA.h5'):
 90 |             if evaluation:
 91 |                 model.load_weights(f'{os.path.splitext(path)[0]}_SWA.h5')
 92 |                 score = evaluate(config, model, overlap_hop=int(framelen) // 2, verbose=True)
 93 |             else:
 94 |                 score = 1.0
 95 |             data += [mean(score)]
 96 |         else:
 97 |             data += ['None']
 98 | 
 99 |         if os.path.exists(f'{os.path.splitext(path)[0]}_sample.h5'):
100 |             if evaluation:
101 |                 model.load_weights(f'{os.path.splitext(path)[0]}_sample.h5')
102 |                 score = evaluate(config, model, overlap_hop=int(framelen) // 2, verbose=True)
103 |             else:
104 |                 score = 1.0
105 |             data += [mean(score)]
106 |         else:
107 |             data += ['None']
108 | 
109 |         prev_lines.append(data)
110 | 
111 |     with open(result_path, 'w') as f:
112 |         wr = csv.writer(f)
113 |         wr.writerows(prev_lines)
114 | 
115 | 
116 | if __name__ == '__main__':
117 |     args = ARGS()
118 |     args.args.add_argument('--path', type=str, default='')
119 |     main(args.get())
120 | 


--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from glob import glob
  3 | 
  4 | import numpy as np
  5 | import tensorflow as tf
  6 | from tensorflow.keras.callbacks import *
  7 | import tensorflow_addons as tfa
  8 | 
  9 | from utils import *
 10 | from data_utils import *
 11 | from transforms import *
 12 | 
 13 | 
 14 | class eval_callback(tf.keras.callbacks.Callback):
 15 |     def __init__(self, config, NAME):
 16 |         super(eval_callback, self).__init__()
 17 |         self.config = config
 18 |         self.name = NAME
 19 |         self.score = np.inf
 20 |         
 21 |     def on_epoch_end(self, epoch, logs=None):
 22 |         if epoch % 5 == 2:
 23 |             model = tf.keras.models.clone_model(self.model)
 24 |             model.load_weights(self.name)
 25 |             score = tf.reduce_mean(evaluate(self.config, model, verbose=True))
 26 |             if score <= self.score:
 27 |                 self.score = score
 28 |                 tf.keras.models.save_model(model, os.path.splitext(self.name)[0] + '_sample.h5')
 29 |             
 30 | 
 31 | def evaluate(config, model, overlap_hop = 512, verbose: bool = False):
 32 |     final_score = []
 33 |     with open('sample_answer.json') as f:
 34 |         answer_gt = json.load(f)
 35 |     answer_gt = answer_gt['task2_answer']
 36 |     sr = 16000
 37 |     hop = 256
 38 |     metric = Challenge_Metric()
 39 | 
 40 |     for path in sorted(glob('*.wav')):
 41 |         inputs = load_wav(path)
 42 |         if config.n_chan == 1:
 43 |             inputs = mono_chan(inputs)
 44 |         elif config.n_chan == 3:
 45 |             inputs = stereo_mono(inputs)
 46 |         elif config.n_chan > 3:
 47 |             inputs = random_merge_aug(config.n_chan)(inputs, None)
 48 | 
 49 |         if config.model_type != 'se':
 50 |             inputs = stft_filter(int(round(256 * 1000 / 16000)))(inputs)
 51 |             inputs = complex_to_magphase(inputs)
 52 |             inputs = magphase_to_mel(config.n_mels)(inputs)
 53 |             inputs = minmax(inputs)
 54 |             inputs = log_on_mel(inputs)
 55 |         else:
 56 |             # inputs = complex_to_magphase(inputs)
 57 |             inputs = speech_enhancement_preprocess(inputs)
 58 | 
 59 |         frame_len = inputs.shape[-2]
 60 |         inputs = tf.signal.frame(inputs, config.n_frame, overlap_hop, pad_end=True, axis=-2)
 61 |         inputs = tf.transpose(inputs, (1, 0, 2, 3))
 62 |         preds = model.predict(inputs[..., :config.n_chan]) # [batch, time, class]
 63 | 
 64 |         if config.model_type == 'se' and config.v == 9:
 65 |             preds = preds[0]
 66 |         
 67 |         if config.v in label_downsample_model:
 68 |             resolution = config.n_frame / preds.shape[-2]
 69 |             preds = tf.keras.layers.UpSampling1D(resolution)(preds)
 70 |             
 71 |         preds = tf.transpose(preds, [2, 0, 1])
 72 |         total_counts = tf.signal.overlap_and_add(tf.ones_like(preds), overlap_hop)[..., :frame_len]
 73 |         preds = tf.signal.overlap_and_add(preds, overlap_hop)[..., :frame_len]
 74 |         preds /= total_counts
 75 |         preds = tf.transpose(preds, [1, 0])
 76 | 
 77 |         # smoothing
 78 |         smoothing_kernel_size = int(0.5 * sr) // hop # 0.5초 길이의 kernel
 79 |         preds = tf.keras.layers.AveragePooling1D(smoothing_kernel_size, 1, padding='same')(preds[tf.newaxis, ...])[0]
 80 |         preds = tf.keras.layers.MaxPooling1D(smoothing_kernel_size * 4, 1, padding='same')(preds[tf.newaxis, ...])[0]
 81 |         preds = tf.cast(preds >= 0.5, tf.float32)
 82 |         cls0, cls1, cls2 = metric.get_start_end_frame(preds)
 83 |         answer_gt_temp = tf.convert_to_tensor(answer_gt[os.path.basename(path)[:-4]])
 84 |         answer_predict = output_to_metric(hop, sr)(cls0, cls1, cls2)
 85 |         er = get_er(answer_gt_temp, answer_predict)
 86 |         
 87 |         final_score.append(er)
 88 |     if verbose:
 89 |         print('FINAL SCORE:', np.mean(final_score))
 90 |     return final_score
 91 | 
 92 | 
 93 | class Challenge_Metric:
 94 |     def __init__(self, sr=16000, hop=256) -> None:
 95 |         self.reset_state()
 96 |         self.sr = sr
 97 |         self.hop = hop
 98 | 
 99 |     def get_start_end_time(self, data):
100 |         data1, data2, data3 = self.get_start_end_frame(data)
101 |         data1 = tf.cast(tf.round(data1 * self.hop / self.sr), tf.int32)
102 |         data2 = tf.cast(tf.round(data2 * self.hop / self.sr), tf.int32)
103 |         data3 = tf.cast(tf.round(data3 * self.hop / self.sr), tf.int32)
104 |         data1 = tf.gather(data1, np.unique(data1, True, axis=0)[1])
105 |         data2 = tf.gather(data2, np.unique(data2, True, axis=0)[1])
106 |         data3 = tf.gather(data3, np.unique(data3, True, axis=0)[1])
107 |         return data1, data2, data3
108 | 
109 |     def get_start_end_frame(self, data):
110 |         data_temp = tf.concat([tf.zeros([1,3]), data[:-1,:]], 0)
111 |         diff_index = tf.where(data_temp != data)
112 |         class_0 = diff_index[diff_index[:,1] == 0][:,0]
113 |         class_1 = diff_index[diff_index[:,1] == 1][:,0]
114 |         class_2 = diff_index[diff_index[:,1] == 2][:,0]
115 | 
116 |         if (class_0.shape[0] % 2 != 0):
117 |             class_0 = tf.concat((class_0, tf.Variable([len(data)], dtype=tf.int64)),0)
118 | 
119 |         class_0 = tf.reshape(class_0, [-1, 2])
120 |         class_0 = tf.transpose(tf.concat([[class_0[:,0]], [class_0[:,1] -1]], 0))
121 | 
122 |         if (class_1.shape[0] % 2 != 0):
123 |             class_1 = tf.concat((class_1, tf.Variable([len(data)], dtype=tf.int64)),0)
124 | 
125 |         class_1 = tf.reshape(class_1, [-1, 2])
126 |         class_1 = tf.transpose(tf.concat([[class_1[:,0]], [class_1[:,1] -1]], 0))
127 | 
128 |         if (class_2.shape[0]  % 2 != 0):
129 |             class_2 = tf.concat((class_2, tf.Variable([len(data)], dtype=tf.int64)),0)
130 | 
131 |         class_2 = tf.reshape(class_2, [-1, 2])
132 |         class_2 = tf.transpose(tf.concat([[class_2[:,0]], [class_2[:,1] -1]], 0))
133 |         return class_0, class_1, class_2
134 | 
135 |     def get_second_answer(self, data):
136 |         data_second = np.asarray([self.hop*i//self.sr for i in range(len(data))])
137 |         second_true = np.zeros([np.max(data_second), 3])
138 |         for i in range(np.max(data_second)):
139 |             second_true[i, 0] = (tf.reduce_mean(data[:, 0][data_second == i]) > 0.5)
140 |             second_true[i, 1] = (tf.reduce_mean(data[:, 1][data_second == i]) > 0.5)
141 |             second_true[i, 2] = (tf.reduce_mean(data[:, 2][data_second == i]) > 0.5)
142 |         cls0, cls1, cls2 = self.get_1(second_true)
143 |         cls0 = tf.cast(cls0, dtype=tf.int32)
144 |         cls1 = tf.cast(cls1, dtype=tf.int32)
145 |         cls2 = tf.cast(cls2, dtype=tf.int32)
146 |         return cls0, cls1, cls2
147 | 
148 |     def reset_state(self):
149 |         self.arr0 = tf.TensorArray(tf.int64, size=0, dynamic_size=True, clear_after_read=False)
150 |         self.arr1 = tf.TensorArray(tf.int64, size=0, dynamic_size=True, clear_after_read=False)
151 |         self.arr2 = tf.TensorArray(tf.int64, size=0, dynamic_size=True, clear_after_read=False)
152 |         self.tmp0 = tf.TensorArray(tf.int64, size=2, dynamic_size=True, clear_after_read=True)
153 |         self.tmp1 = tf.TensorArray(tf.int64, size=2, dynamic_size=True, clear_after_read=True)
154 |         self.tmp2 = tf.TensorArray(tf.int64, size=2, dynamic_size=True, clear_after_read=True)
155 |         self.ts0 = 0 # tmp size
156 |         self.ts1 = 0 # tmp size
157 |         self.ts2 = 0 # tmp size
158 | 
159 | 
160 | def extract_middle(y_pred):
161 |     # [batch, time, cls]
162 |     pred_starts = tf.clip_by_value(y_pred - tf.pad(y_pred, [[0, 0], [1, 0], [0, 0]])[:, :-1], 0, 1)
163 |     pred_ends = tf.clip_by_value(y_pred - tf.pad(y_pred, [[0, 0], [0, 1], [0, 0]])[:, 1:], 0, 1)
164 |     n_pred = tf.reduce_sum(tf.cast(pred_starts, tf.float32), (1, 2))
165 |     pred_starts = tf.where(pred_starts)
166 |     pred_ends = tf.where(pred_ends)
167 |     pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, -1]), -1)
168 |     pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, 0]), 0)
169 |     pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, -1]), -1)
170 |     pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, 0]), 0)
171 | 
172 |     middle = tf.cast((pred_starts+pred_ends)/2, tf.int64)
173 |     return middle
174 | 
175 | 
176 | def get_er(gt, predict):
177 |     predict_2 = tf.identity(predict)
178 |     predict_2 = tf.gather(predict_2, tf.argsort(predict_2[:,1]))
179 |     gt = tf.gather(gt, tf.argsort(gt[:,1]))
180 |     N = len(predict_2) + len(gt)
181 |     answer = 0
182 |     for gt_item in gt:
183 |         remove = False
184 |         for i, pred_item in enumerate(predict_2):
185 |             if (gt_item[1] <= pred_item[1]) and (pred_item[1] <= gt_item[2]):
186 |                 if gt_item[0] == pred_item[0]:
187 |                     answer += 2 
188 |                     temp = i
189 |                     remove = True
190 |                     break
191 |         if remove:
192 |             predict_2 = tf.concat((predict_2[:i,:], predict_2[i+1:, :]), axis=0)
193 |     return (N - answer) / len(gt)
194 | 
195 | 
196 | def output_to_metric(hop, sr):
197 |     hop = hop
198 |     sr = sr
199 |     def output_to_metric_(cls0, cls1, cls2):
200 |         answer_list = tf.cast(tf.zeros([0,2]), tf.int32)
201 | 
202 |         for item in cls0:
203 |             new_item = tf.cast(tf.stack([0, ((item[0] + item[1]) / 2)*hop/sr], 0), answer_list.dtype)[tf.newaxis, ...]
204 |             answer_list = tf.concat([answer_list, new_item], axis=0)
205 | 
206 |         for item in cls1:
207 |             new_item = tf.cast(tf.stack([1, ((item[0] + item[1]) / 2)*hop/sr], 0), answer_list.dtype)[tf.newaxis, ...]
208 |             answer_list = tf.concat([answer_list, new_item], axis=0)
209 | 
210 |         for item in cls2:
211 |             new_item = tf.cast(tf.stack([2, ((item[0] + item[1]) / 2)*hop/sr], 0), answer_list.dtype)[tf.newaxis, ...]
212 |             answer_list = tf.concat([answer_list, new_item], axis=0)
213 |         return answer_list
214 |     return output_to_metric_
215 | 
216 | 
217 | def er_score(threshold=0.5, smoothing=True):
218 |     threshold = tf.constant(threshold, tf.float32)
219 | 
220 |     def er(y_true, y_pred):
221 |         y_true = tf.cast(y_true >= threshold, tf.int32)
222 |         if smoothing:
223 |             smoothing_kernel_size = int(0.5 * 16000) // 256 # 0.5
224 |             y_pred = tf.keras.layers.AveragePooling1D(smoothing_kernel_size, padding='same')(y_pred)
225 |         y_pred = tf.cast(y_pred >= threshold, tf.int32)
226 | 
227 |         # True values
228 |         # [batch, time, cls]
229 |         true_starts = tf.clip_by_value(
230 |             y_true - tf.pad(y_true, [[0, 0], [1, 0], [0, 0]])[:, :-1], 0, 1)
231 |         true_ends = tf.clip_by_value(
232 |             y_true - tf.pad(y_true, [[0, 0], [0, 1], [0, 0]])[:, 1:], 0, 1)
233 |         n_true = tf.reduce_sum(tf.cast(true_starts, tf.float32), (1, 2))
234 | 
235 |         true_starts = tf.where(true_starts)
236 |         true_ends = tf.where(true_ends)
237 |         true_starts = tf.gather(true_starts, tf.argsort(true_starts[:, -1]), -1)
238 |         true_starts = tf.gather(true_starts, tf.argsort(true_starts[:, 0]), 0)
239 |         true_ends = tf.gather(true_ends, tf.argsort(true_ends[:, -1]), -1)
240 |         true_ends = tf.gather(true_ends, tf.argsort(true_ends[:, 0]), 0)
241 | 
242 |         # prediction values
243 |         pred_starts = tf.clip_by_value(
244 |             y_pred - tf.pad(y_pred, [[0, 0], [1, 0], [0, 0]])[:, :-1], 0, 1)
245 |         pred_ends = tf.clip_by_value(
246 |             y_pred - tf.pad(y_pred, [[0, 0], [0, 1], [0, 0]])[:, 1:], 0, 1)
247 |         n_pred = tf.reduce_sum(tf.cast(pred_starts, tf.float32), (1, 2))
248 | 
249 |         pred_starts = tf.where(pred_starts)
250 |         pred_ends = tf.where(pred_ends)
251 |         pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, -1]), -1)
252 |         pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, 0]), 0)
253 |         pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, -1]), -1)
254 |         pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, 0]), 0)
255 | 
256 |         middle = tf.cast((pred_starts+pred_ends)/2, tf.int64)
257 | 
258 |         # correct: correct batch and cls (true, pred)
259 |         correct = (
260 |             true_starts[:, ::2, None]==tf.transpose(middle, (1, 0))[None, ::2])
261 |         correct = tf.reduce_min(tf.cast(correct, tf.float32), axis=1)
262 | 
263 |         mid_time = tf.transpose(middle[:, 1:2], (1, 0))
264 |         correct *= tf.cast(true_starts[:, 1:2] <= mid_time, tf.float32)
265 |         correct *= tf.cast(true_ends[:, 1:2] >= mid_time, tf.float32)
266 |         correct = tf.reduce_max(tf.pad(correct, [[0, 0], [0, 1]]), -1)
267 | 
268 |         correct_per_sample = tf.reduce_sum(
269 |             tf.one_hot(true_starts[:, 0], tf.shape(y_pred)[0])*correct[:, None],
270 |             0)
271 |         score = n_true + n_pred - 2 * correct_per_sample
272 |         score /= tf.clip_by_value(n_true, 1, tf.reduce_max(n_true))
273 |         return score
274 |     return er
275 | 
276 | 
277 | def cos_sim(y_true, y_pred):
278 |     if isinstance(y_true, tuple):
279 |         y_true = y_true[0]
280 |     if isinstance(y_pred, tuple):
281 |         y_pred = y_pred[0]
282 |     mask = tf.cast(
283 |         tf.reduce_sum(y_true, axis=-2) > 0., tf.float32) # [None, 3]
284 |     mask = safe_div(mask, tf.reduce_sum(mask, axis=-1, keepdims=True))
285 |     return tf.reduce_sum(
286 |         tf.keras.losses.cosine_similarity(y_true, y_pred, axis=-2) * mask, 
287 |         axis=-1)
288 | 
289 | 
290 | def f1_score():
291 |     f1_score_fn = tfa.metrics.F1Score(num_classes=3, threshold=0.5, average='micro')
292 |     def f1_score(y_true, y_pred):
293 |         if isinstance(y_true, tuple):
294 |             y_true = y_true[0]
295 |         if isinstance(y_pred, tuple):
296 |             y_pred = y_pred[0]
297 |         return f1_score_fn(y_true, y_pred)
298 |     return f1_score
299 | 
300 | 


--------------------------------------------------------------------------------
/metrics_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import tensorflow as tf
 4 | from metrics import *
 5 | 
 6 | 
 7 | class MetricsTest(tf.test.TestCase):
 8 |     def setUp(self):
 9 |         self.gt = tf.convert_to_tensor([[0, 0, 10], [2, 0, 20], [1, 15, 30], [2, 31, 40], [1, 32, 35]])
10 |         self.predict = tf.convert_to_tensor([[1, 5], [1, 19], [2, 32], [2, 38], [0, 38]])
11 | 
12 |     def test_er_score(self):
13 |         gt_numpy = self.gt.numpy()
14 |         gt_array = np.zeros([2, 40, 3])
15 |         pred_array = np.zeros([2, 40, 3])
16 |         for item in gt_numpy:
17 |             gt_array[0, item[1]:item[2], item[0]] = 1
18 |             gt_array[1, item[1]:item[2], item[0]] = 1
19 |         for item in self.predict.numpy():
20 |             pred_array[0, item[1]-2:item[1]+2, item[0]] = 1
21 |             pred_array[1, item[1]-2:item[1]+2, item[0]] = 1
22 | 
23 |         er_func = er_score(smoothing=False)
24 |         er = er_func(gt_array, pred_array)
25 |         self.assertEqual(tf.reduce_mean(er), 1.2)
26 | 
27 | 
28 | if __name__ == '__main__':
29 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
30 |     tf.test.main()
31 | 
32 | 


--------------------------------------------------------------------------------
/pipeline.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | from functools import partial
  3 | from utils import list_to_generator
  4 | 
  5 | 
  6 | def merge_complex_specs(background, 
  7 |                         voices_and_labels, 
  8 |                         noises=None,
  9 |                         n_frame=300, 
 10 |                         n_classes=3,
 11 |                         t_axis=1, # time-axis
 12 |                         min_ratio=2/3,
 13 |                         min_noise_ratio=1/2,
 14 |                         snr=-20,
 15 |                         seperate_noise_voice=False):
 16 |     '''
 17 |     OUTPUT:
 18 |         complex_spec: (freq, time, chan2)
 19 |         labels: (n_voices, time, n_classes)
 20 |     '''
 21 |     voices, labels = voices_and_labels
 22 |     output_shape = tuple(
 23 |         [s if i != t_axis else n_frame
 24 |          for i, s in enumerate(background.shape)])
 25 |     n_dims = len(output_shape)
 26 |     axis = tuple(i for i in range(n_dims) if i != t_axis)
 27 | 
 28 |     # background and its label
 29 |     bg_frame = tf.shape(background)[t_axis]
 30 |     background = tf.tile(
 31 |         background, 
 32 |         [1 if i != t_axis else (n_frame+bg_frame-1) // bg_frame 
 33 |          for i in range(n_dims)])
 34 |     # background = tf.pad(background, [[4, 0], [0, 0], [0, 0]])
 35 |     complex_spec = tf.image.random_crop(background, output_shape)
 36 |         
 37 |     only_voice = tf.zeros_like(complex_spec)
 38 |     only_noise = tf.identity(complex_spec)
 39 | 
 40 |     # voices
 41 |     max_voices = tf.shape(voices)[0]
 42 |     if max_voices > 1:
 43 |         n_voices = tf.random.uniform([], minval=1, maxval=max_voices,
 44 |                                      dtype='int32')
 45 |     else:
 46 |         n_voices = 1
 47 |     label = tf.zeros(shape=[max_voices, n_frame, n_classes], dtype='float32')
 48 |     for v in range(n_voices):
 49 |         voice = voices[v]
 50 |         v_ratio = tf.math.pow(10., -tf.random.uniform([], maxval=-snr/10))
 51 |         v_frame = tf.shape(voice)[t_axis]
 52 | 
 53 |         l = labels[v:v+1] # shape=[1, n_classes]
 54 |         l = tf.tile(l, [v_frame, 1]) # [v_frame, n_classes]
 55 |         mask = tf.cast(tf.reduce_max(voice, axis=axis) > 0, tf.float32)
 56 |         l *= tf.expand_dims(mask, axis=-1)
 57 | 
 58 |         v_frame = tf.cast(v_frame, tf.float32)
 59 |         pad_size = n_frame - tf.cast(min_ratio*v_frame, tf.int32)
 60 | 
 61 |         if pad_size > 0:
 62 |             voice = tf.pad(
 63 |                 voice,
 64 |                 [[0, 0] if i != t_axis else [pad_size] * 2
 65 |                  for i in range(n_dims)])
 66 |             l = tf.pad(l, [[pad_size]*2, [0, 0]])
 67 | 
 68 |         maxval = tf.shape(voice)[t_axis] - n_frame
 69 |         offset = tf.random.uniform([], maxval=maxval, dtype=tf.int32)
 70 |         voice = tf.slice(
 71 |             voice, 
 72 |             [0 if i != t_axis else offset for i in range(n_dims)],
 73 |             output_shape)
 74 |         l = tf.slice(l, [offset, 0], [n_frame, n_classes])
 75 |         l = tf.reshape(tf.one_hot(v, max_voices, dtype='float32'), (-1, 1, 1)) \
 76 |             * tf.expand_dims(l, axis=0)
 77 | 
 78 |         no_overlap = tf.cast(tf.reduce_max(tf.reduce_sum(label+l, axis=0)) < 2,
 79 |                              tf.float32)
 80 | 
 81 |         complex_spec += v_ratio * voice * no_overlap
 82 |         if seperate_noise_voice:
 83 |             only_voice += v_ratio * voice * no_overlap
 84 |         label += l * no_overlap
 85 |     
 86 |     if noises is not None:
 87 |         n_noises = tf.random.uniform([], maxval=tf.shape(noises)[0],
 88 |                                      dtype='int32')
 89 | 
 90 |         for n in range(n_noises):
 91 |             noise = noises[n]
 92 | 
 93 |             # SNR 0 ~ -20
 94 |             n_ratio = tf.math.pow(10., -tf.random.uniform([], maxval=2)) 
 95 |             ns_frame = tf.cast(tf.shape(noise)[t_axis], tf.float32)
 96 |             pad_size = n_frame - tf.cast(min_noise_ratio*ns_frame, tf.int32)
 97 | 
 98 |             if pad_size > 0:
 99 |                 noise = tf.pad(
100 |                     noise,
101 |                     [[0, 0] if i != t_axis else [pad_size]*2
102 |                      for i in range(n_dims)])
103 |             noise = tf.image.random_crop(noise, output_shape)
104 |             if seperate_noise_voice:
105 |                 only_noise += n_ratio * noise
106 |             complex_spec += n_ratio * noise
107 |     if seperate_noise_voice:
108 |         label = (label, only_voice, only_noise)
109 | 
110 |     return complex_spec, label
111 | 
112 | 
113 | def make_pipeline(backgrounds, # a list of backgrounds noises
114 |                   voices, # a list of human voicess
115 |                   labels, # a list of labelss of human voicess
116 |                   noises=None, # a list of additional noises
117 |                   n_frame=300, # number of frames per sample
118 |                   max_voices=10,
119 |                   max_noises=10,
120 |                   n_classes=3,
121 |                   **kwargs):
122 |     '''
123 |     OUTPUT
124 |         dataset: tf.data.Dataset
125 |                  it only returns a raw complex spectrogram
126 |                  and its labels
127 |                  you have to apply augmentations (ex. mixup)
128 |                  or preprocessing functions (ex. applying log)
129 |                  you don't have to apply shuffle
130 | 
131 |                  complex spectrogram: [freq_bins, n_frame, chan*2]
132 |                      [..., :chan] = real
133 |                      [..., chan:] = imag
134 |                  labels: [n_frame, n_classes]
135 |     '''
136 |     assert len(backgrounds[0].shape) == 3, 'each spec must be a 3D-tensor'
137 |     assert len(voices) == len(labels)
138 |     assert len(labels[0].shape) == 1 and labels[0].shape[0] == n_classes, \
139 |            'labels must be in the form of [n_samples, n_classes]'
140 | 
141 |     # BACKGROUND NOISE (DRONE)
142 |     freq, _, chan = backgrounds[0].shape
143 |     b_dataset = tf.data.Dataset.from_generator(
144 |         list_to_generator(backgrounds),
145 |         tf.float32,
146 |         tf.TensorShape([freq, None, chan]))
147 |     b_dataset = b_dataset.repeat().shuffle(len(backgrounds))
148 | 
149 |     # HUMAN VOICE
150 |     v_dataset = tf.data.Dataset.from_generator(
151 |         list_to_generator((voices, labels)),
152 |         (tf.float32, tf.float32),
153 |         (tf.TensorShape([freq, None, chan]), tf.TensorShape([n_classes])))
154 |     v_dataset = v_dataset.repeat().shuffle(len(voices))
155 |     v_dataset = v_dataset.padded_batch(
156 |         max_voices, padded_shapes=([freq, None, chan], [n_classes]))
157 | 
158 |     # NOISES
159 |     if noises is not None:
160 |         n_dataset = tf.data.Dataset.from_generator(
161 |             list_to_generator(noises),
162 |             tf.float32,
163 |             tf.TensorShape([freq, None, chan]))
164 |         n_dataset = n_dataset.repeat().shuffle(len(noises))
165 |         n_dataset = n_dataset.padded_batch(
166 |             max_noises, padded_shapes=[freq, None, chan])
167 |         dataset = tf.data.Dataset.zip((b_dataset, v_dataset, n_dataset))
168 |     else:
169 |         dataset = tf.data.Dataset.zip((b_dataset, v_dataset))
170 | 
171 |     dataset = dataset.map(partial(merge_complex_specs,
172 |                                   n_frame=n_frame,
173 |                                   n_classes=n_classes,
174 |                                   **kwargs))
175 |     return dataset
176 | 
177 | 


--------------------------------------------------------------------------------
/pipeline_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import tensorflow as tf
 4 | from pipeline import *
 5 | 
 6 | 
 7 | class PipelineTest(tf.test.TestCase):
 8 |     def setUp(self):
 9 |         self.freq = 257
10 |         self.chan = 4
11 |         self.n_classes = 30
12 | 
13 |     def test_merge_complex_specs(self):
14 |         n_frame = 10
15 | 
16 |         background = np.random.randn(self.freq, 8, self.chan).astype('float32')
17 | 
18 |         n_voices = 4
19 |         voices = np.random.randn(n_voices, self.freq, n_frame, self.chan)
20 |         voices = voices.astype('float32')
21 |         mask = tf.sequence_mask(np.random.randint(1, n_frame, size=n_voices),
22 |                                 n_frame)
23 |         mask = tf.reshape(mask, (n_voices, 1, n_frame, 1))
24 |         voices *= tf.cast(mask, tf.float32)
25 |         labels = np.random.randint(1, n_frame, size=n_voices)
26 |         labels = np.eye(self.n_classes, dtype='float32')[labels]
27 | 
28 |         n_noises = 2
29 |         noises = np.random.randn(n_noises, self.freq, n_frame, self.chan)
30 |         noises = noises.astype('float32')
31 |         mask = tf.sequence_mask(np.random.randint(1, n_frame, size=n_noises),
32 |                                 n_frame)
33 |         mask = tf.reshape(mask, (n_noises, 1, n_frame, 1))
34 |         noises *= tf.cast(mask, tf.float32)
35 | 
36 |         spec, l = merge_complex_specs(background, 
37 |                                       (voices, labels), 
38 |                                       noises,
39 |                                       n_frame=n_frame, 
40 |                                       n_classes=self.n_classes)
41 |         self.assertEqual(spec.shape, [self.freq, n_frame, self.chan]) 
42 |         self.assertEqual(l.shape, [n_voices, n_frame, self.n_classes]) 
43 | 
44 |     def test_make_pipeline(self):
45 |         n_frame = 30
46 | 
47 |         backgrounds = [np.random.randn(self.freq, 
48 |                                        np.random.randint(1, n_frame*2), 
49 |                                        self.chan)
50 |                        for _ in range(30)]
51 |         voices = [np.random.randn(self.freq,
52 |                                   np.random.randint(1, n_frame//2),
53 |                                   self.chan)
54 |                   for _ in range(40)]
55 |         labels = np.random.randint(self.n_classes, size=(40,))
56 |         labels = np.eye(self.n_classes, dtype='float32')[labels]
57 | 
58 |         noises = [np.random.randn(self.freq,
59 |                                   np.random.randint(1, n_frame//2),
60 |                                   self.chan)
61 |                   for _ in range(50)]
62 | 
63 |         pipeline = make_pipeline(backgrounds, 
64 |                                  voices, 
65 |                                  labels, 
66 |                                  noises,
67 |                                  n_frame=n_frame, 
68 |                                  max_voices=4,
69 |                                  max_noises=4,
70 |                                  n_classes=self.n_classes)
71 | 
72 |         for s, l in pipeline.take(3):
73 |             self.assertEqual(s.shape, [self.freq, n_frame, self.chan])
74 |             self.assertEqual(l.shape, [4, n_frame, self.n_classes])
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
79 |     tf.test.main()
80 | 
81 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow-gpu == 2.2.0
2 | tensorflow-probability == 0.10.0
3 | tensorflow_addons
4 | torch # for data processing
5 | torchaudio # for data processing
6 | numpy
7 | efficientnet


--------------------------------------------------------------------------------
/sample_answer.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "task2_answer": {
  3 |         "set01_drone01": [
  4 |             [
  5 |                 0,
  6 |                 210,
  7 |                 213
  8 |             ],
  9 |             [
 10 |                 0,
 11 |                 216,
 12 |                 219
 13 |             ],
 14 |             [
 15 |                 1,
 16 |                 222,
 17 |                 225
 18 |             ],
 19 |             [
 20 |                 2,
 21 |                 74,
 22 |                 78
 23 |             ],
 24 |             [
 25 |                 2,
 26 |                 225,
 27 |                 231
 28 |             ]
 29 |         ],
 30 |         "set01_drone02": [
 31 |             [
 32 |                 0,
 33 |                 168,
 34 |                 171
 35 |             ],
 36 |             [
 37 |                 0,
 38 |                 183,
 39 |                 186
 40 |             ],
 41 |             [
 42 |                 1,
 43 |                 175,
 44 |                 179
 45 |             ],
 46 |             [
 47 |                 2,
 48 |                 165,
 49 |                 168
 50 |             ]
 51 |         ],
 52 |         "set01_drone03": [
 53 |             [
 54 |                 0,
 55 |                 213,
 56 |                 216
 57 |             ],
 58 |             [
 59 |                 0,
 60 |                 220,
 61 |                 224
 62 |             ],
 63 |             [
 64 |                 1,
 65 |                 214,
 66 |                 218
 67 |             ],
 68 |             [
 69 |                 2,
 70 |                 227,
 71 |                 231
 72 |             ]
 73 |         ],
 74 |         "set01_drone01_new": [
 75 |             [
 76 |                 0,
 77 |                 33,
 78 |                 36
 79 |             ],
 80 |             [
 81 |                 0,
 82 |                 60,
 83 |                 64
 84 |             ],
 85 |             [
 86 |                 0,
 87 |                 210,
 88 |                 213
 89 |             ],
 90 |             [
 91 |                 0,
 92 |                 216,
 93 |                 219
 94 |             ],
 95 |             [
 96 |                 1,
 97 |                 91,
 98 |                 94
 99 |             ],
100 |             [
101 |                 1,
102 |                 149,
103 |                 154
104 |             ],
105 |             [
106 |                 1,
107 |                 222,
108 |                 225
109 |             ],
110 |             [
111 |                 2,
112 |                 44,
113 |                 48
114 |             ],
115 |             [
116 |                 2,
117 |                 74,
118 |                 78
119 |             ],
120 |             [
121 |                 2,
122 |                 104,
123 |                 107
124 |             ],
125 |             [
126 |                 2,
127 |                 225,
128 |                 231
129 |             ]
130 |         ],
131 |         "set01_drone02_new": [
132 |             [
133 |                 0,
134 |                 67,
135 |                 70
136 |             ],
137 |             [
138 |                 0,
139 |                 95,
140 |                 98
141 |             ],
142 |             [
143 |                 0,
144 |                 168,
145 |                 171
146 |             ],
147 |             [
148 |                 0,
149 |                 183,
150 |                 186
151 |             ],
152 |             [
153 |                 1,
154 |                 64,
155 |                 68
156 |             ],
157 |             [
158 |                 1,
159 |                 115,
160 |                 118
161 |             ],
162 |             [
163 |                 1,
164 |                 175,
165 |                 179
166 |             ],
167 |             [
168 |                 2,
169 |                 32,
170 |                 35
171 |             ],
172 |             [
173 |                 2,
174 |                 116,
175 |                 121
176 |             ],
177 |             [
178 |                 2,
179 |                 165,
180 |                 168
181 |             ]
182 |         ],
183 |         "set01_drone03_new": [
184 |             [
185 |                 0,
186 |                 18,
187 |                 21
188 |             ],
189 |             [
190 |                 0,
191 |                 105,
192 |                 108
193 |             ],
194 |             [
195 |                 0,
196 |                 213,
197 |                 216
198 |             ],
199 |             [
200 |                 0,
201 |                 220,
202 |                 224
203 |             ],
204 |             [
205 |                 1,
206 |                 55,
207 |                 59
208 |             ],
209 |             [
210 |                 1,
211 |                 131,
212 |                 135
213 |             ],
214 |             [
215 |                 1,
216 |                 214,
217 |                 218
218 |             ],
219 |             [
220 |                 2,
221 |                 57,
222 |                 60
223 |             ],
224 |             [
225 |                 2,
226 |                 154,
227 |                 157
228 |             ],
229 |             [
230 |                 2,
231 |                 227,
232 |                 231
233 |             ]
234 |         ]
235 |     }
236 | }


--------------------------------------------------------------------------------
/sj_train.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | from copy import deepcopy
  4 | 
  5 | import numpy as np
  6 | import tensorflow as tf
  7 | from tensorflow.keras.callbacks import *
  8 | from tensorflow.keras.losses import *
  9 | from tensorflow.keras.metrics import *
 10 | from tensorflow.keras.optimizers import *
 11 | 
 12 | from metrics import *
 13 | from pipeline import *
 14 | from data_utils import *
 15 | from swa import SWA, NO_SWA_ERROR
 16 | from transforms import *
 17 | from utils import *
 18 | 
 19 | 
 20 | class ARGS:
 21 |     def __init__(self) -> None:
 22 |         self.args = argparse.ArgumentParser()
 23 |         self.args.add_argument('--name', type=str, default='')
 24 |         self.args.add_argument('--gpus', type=str, default='-1')
 25 |         self.args.add_argument('--model', type=int, default=0)
 26 |         self.args.add_argument('--model_type', type=str, default='vad', choices=['vad', 'eff', 'se'])
 27 |         self.args.add_argument('--v', type=int, default=1)
 28 |         self.args.add_argument('--pretrain', type=bool, default=False)
 29 |         self.args.add_argument('--n_layers', type=int, default=0)
 30 |         self.args.add_argument('--n_dim', type=int, default=256)
 31 |         self.args.add_argument('--n_chan', type=int, default=2)
 32 |         self.args.add_argument('--n_classes', type=int, default=3)
 33 |         self.args.add_argument('--patience', type=int, default=10)
 34 | 
 35 |         # DATA
 36 |         self.args.add_argument('--mse_multiplier', type=int, default=1)
 37 |         self.args.add_argument('--datapath', type=str, default='/root/datasets/Interspeech2020/generate_wavs/codes')
 38 |         self.args.add_argument('--background_sounds', type=str, default='drone_normed_complex_v4.pickle')
 39 |         self.args.add_argument('--voices', type=str, default='voice_normed_complex_v3.pickle')
 40 |         self.args.add_argument('--labels', type=str, default='voice_labels_mfc_v3.npy')
 41 |         self.args.add_argument('--noises', type=str, default='noises_specs_v2.pickle')
 42 |         self.args.add_argument('--test_background_sounds', type=str,
 43 |                         default='test_drone_normed_complex_v2.pickle')
 44 |         self.args.add_argument('--test_voices', type=str, default='test_voice_normed_complex.pickle')
 45 |         self.args.add_argument('--test_labels', type=str, default='test_voice_labels_mfc.npy')
 46 |         self.args.add_argument('--n_mels', type=int, default=80)
 47 | 
 48 |         # TRAINING
 49 |         self.args.add_argument('--optimizer', type=str, default='adam',
 50 |                         choices=['adam', 'sgd', 'rmsprop', 'adabelief'])
 51 |         self.args.add_argument('--lr', type=float, default=1e-3)
 52 |         self.args.add_argument('--end_lr', type=float, default=1e-4)
 53 |         self.args.add_argument('--lr_power', type=float, default=0.5)
 54 |         self.args.add_argument('--lr_div', type=float, default=2)
 55 |         self.args.add_argument('--clipvalue', type=float, default=0.01)
 56 | 
 57 |         self.args.add_argument('--epochs', type=int, default=300)
 58 |         self.args.add_argument('--batch_size', type=int, default=12)
 59 |         self.args.add_argument('--n_frame', type=int, default=512)
 60 |         self.args.add_argument('--steps_per_epoch', type=int, default=100)
 61 |         self.args.add_argument('--l1', type=float, default=0)
 62 |         self.args.add_argument('--l2', type=float, default=1e-6)
 63 |         self.args.add_argument('--loss', type=str, default='BCE')
 64 | 
 65 |         # AUGMENTATION
 66 |         self.args.add_argument('--snr', type=float, default=-20)
 67 |         self.args.add_argument('--max_voices', type=int, default=7)
 68 |         self.args.add_argument('--max_noises', type=int, default=2)
 69 | 
 70 |     def get(self):
 71 |         return self.args.parse_args()
 72 | 
 73 | 
 74 | def make_dataset(config, training=True, n_classes=3):
 75 |     # Load required datasets
 76 |     if not os.path.exists(config.datapath):
 77 |         config.datapath = ''
 78 |     if training:
 79 |         backgrounds = load_data(os.path.join(config.datapath, config.background_sounds))
 80 |         voices = load_data(os.path.join(config.datapath, config.voices))
 81 |         labels = load_data(os.path.join(config.datapath, config.labels))
 82 |     else:
 83 |         backgrounds = load_data(os.path.join(config.datapath, config.test_background_sounds))
 84 |         voices = load_data(os.path.join(config.datapath, config.test_voices))
 85 |         labels = load_data(os.path.join(config.datapath, config.test_labels))
 86 |     if labels.max() - 1 != config.n_classes:
 87 |         labels //= 10
 88 |     labels = np.eye(n_classes, dtype='float32')[labels] # to one-hot vectors
 89 |     noises = load_data(os.path.join(config.datapath, config.noises))
 90 | 
 91 |     # Make pipeline and process the pipeline
 92 |     pipeline = make_pipeline(backgrounds, 
 93 |                              voices, labels, noises,
 94 |                              n_frame=config.n_frame,
 95 |                              max_voices=config.max_voices,
 96 |                              max_noises=config.max_noises,
 97 |                              n_classes=n_classes,
 98 |                              snr=config.snr,
 99 |                              min_ratio=1,
100 |                              seperate_noise_voice=config.model_type == 'se' and config.v == 9)
101 |     if config.model_type == 'se' and config.v == 9:
102 |         # pipeline = pipeline.map(complex_to_magphase)
103 |         pipeline = pipeline.map(speech_enhancement_preprocess)
104 |         pipeline = pipeline.batch(config.batch_size, drop_remainder=False)
105 |         pipeline = pipeline.map(label_downsample(32))
106 |         return pipeline.prefetch(AUTOTUNE)
107 |     pipeline = pipeline.map(to_frame_labels)
108 |     if training: 
109 |         pipeline = pipeline.map(augment)
110 |     if config.n_chan == 1:
111 |         pipeline = pipeline.map(mono_chan)
112 |     elif config.n_chan == 3:
113 |         pipeline = pipeline.map(stereo_mono)
114 |     elif config.n_chan > 3:
115 |         pipeline = pipeline.map(random_merge_aug(config.n_chan))
116 |     if 'filter' in config.name:
117 |         pipeline = pipeline.map(stft_filter(int(round(200 / (16000 / 256)))))
118 |     pipeline = pipeline.batch(config.batch_size, drop_remainder=False)
119 |     pipeline = pipeline.map(complex_to_magphase)
120 |     pipeline = pipeline.map(magphase_to_mel(config.n_mels))
121 |     if 'nominmax' not in config.name:
122 |         pipeline = pipeline.map(minmax)
123 |     pipeline = pipeline.map(log_on_mel)
124 |     if config.v in label_downsample_model:
125 |         pipeline = pipeline.map(label_downsample(32))
126 |     elif config.v == 5:
127 |         pipeline = pipeline.map(label_downsample(config.n_frame // (config.n_frame * 256 // 16000)))
128 |     if config.loss.upper() in ('MSE', 'MAE'):
129 |         pipeline = pipeline.map(multiply_label(config.mse_multiplier))
130 |     return pipeline.prefetch(AUTOTUNE)
131 | 
132 | 
133 | def custom_scheduler(d_model, warmup_steps=4000, lr_div=2):
134 |     # https://www.tensorflow.org/tutorials/text/transformer#optimizer
135 |     d_model = tf.cast(d_model, tf.float32)
136 | 
137 |     def _scheduler(step):
138 |         step = tf.cast(step+1, tf.float32)
139 |         arg1 = tf.math.rsqrt(step)
140 |         arg2 = step * (warmup_steps ** -1.5)
141 |         return tf.math.rsqrt(d_model) * tf.math.minimum(arg1, arg2) / lr_div
142 |     return _scheduler
143 | 
144 | 
145 | def adaptive_clip_grad(parameters, gradients, clip_factor=0.01,
146 |                        eps=1e-3):
147 |     new_grads = []
148 |     for (params, grads) in zip(parameters, gradients):
149 |         p_norm = unitwise_norm(params)
150 |         max_norm = tf.math.maximum(p_norm, eps) * clip_factor
151 |         grad_norm = unitwise_norm(grads)
152 |         clipped_grad = grads * (max_norm / tf.math.maximum(grad_norm, 1e-6))
153 |         new_grad = tf.where(grad_norm < max_norm, grads, clipped_grad)
154 |         new_grads.append(new_grad)
155 |     return new_grads
156 | 
157 | 
158 | class CustomModel(tf.keras.Model):
159 |     def __init__(self, **kwargs) -> None:
160 |         super(CustomModel, self).__init__(**kwargs)
161 | 
162 |     def train_step(self, data):
163 |         # Unpack the data. Its structure depends on your model and
164 |         # on what you pass to `fit()`.
165 |         x, y = data
166 |         if not isinstance(y, tuple):
167 |             y = (y,)
168 |         with tf.GradientTape() as tape:
169 |             y_pred = self(x, training=True)  # Forward pass
170 |             if not isinstance(y_pred, (tuple, list)):
171 |                 y_pred = (y_pred,)
172 |             # Compute the loss value
173 |             # (the loss function is configured in `compile()`)
174 |             loss = self.compiled_loss(y, y_pred)
175 | 
176 |         # Compute gradients
177 |         trainable_vars = self.trainable_variables
178 |         
179 |         gradients = tape.gradient(loss, trainable_vars)
180 |         gradients = adaptive_clip_grad(self.trainable_variables, gradients)
181 |         # Update weights
182 |         self.optimizer.apply_gradients(zip(gradients, trainable_vars))
183 |         # Update metrics (includes the metric that tracks the loss)
184 |         
185 |         self.compiled_metrics.update_state(y, y_pred[0])
186 | 
187 |         # Return a dict mapping metric names to current value
188 |         return {m.name: m.result() for m in self.metrics}
189 | 
190 | 
191 | def ConvMPBlock(x, num_convs=2, fsize=32, kernel_size=3, pool_size=(2,2), strides=(2,2), BN=False, DO=False, MP=True):
192 |     for i in range(num_convs):
193 |        x = tf.keras.layers.Conv2D(fsize, kernel_size, padding='same')(x)
194 |        if BN:
195 |            x = tf.keras.layers.BatchNormalization()(x)
196 |        if DO:
197 |            x = tf.keras.layers.Dropout(DO)(x)
198 |        x = tf.keras.layers.Activation('relu')(x)
199 |     if MP:
200 |         x = tf.keras.layers.MaxPooling2D(pool_size=pool_size, strides=strides, padding='same')(x)
201 |     return x
202 | 
203 | 
204 | def FullyConnectedLayer(x, nodes=512, act='relu', BN=False, DO=False, name=None):
205 |     x = tf.keras.layers.Dense(nodes)(x)
206 |     if BN:
207 |         x = tf.keras.layers.BatchNormalization()(x)
208 |     if DO:
209 |         x = tf.keras.layers.Dropout(DO)(x)
210 |     x = tf.keras.layers.Activation(act, name=name)(x)
211 |     return x
212 | 
213 | 
214 | def define_keras_model(config=None):
215 |     fsize = 32
216 |     if config.model_type == 'vad' and config.v == 8:
217 |         fsize = 48
218 | 
219 |     td_dim = 1024
220 |     input_tensor = tf.keras.layers.Input(
221 |         shape=(config.n_mels, config.n_frame, config.n_chan))
222 |     x = input_tensor
223 |     x = ConvMPBlock(x, num_convs=2, fsize=fsize, BN=True)
224 |     for i in range(1, 5):
225 |         if config.model_type == 'vad' and config.v == 6:
226 |             seconds = 0.5
227 |             kernel_size = int(round(seconds / (256 * config.n_frame / 16000 / x.shape[-2])))
228 |             x = tf.keras.layers.AveragePooling2D((1,kernel_size,), 1, padding='same')(x)
229 |             x = tf.keras.layers.MaxPooling2D((1,kernel_size * 2,), 1, padding='same')(x)
230 |         if config.model_type == 'vad' and config.v == 7:
231 |             skip = x
232 |             x = tf.keras.layers.Conv2D(skip.shape[-1] // 4, 1, 1, padding='same')(x)
233 |             x = tf.keras.layers.BatchNormalization()(x)
234 |             x = tf.keras.layers.Activation('relu')(x)
235 |             x = tf.keras.layers.Conv2D(skip.shape[-1] // 4, 3, 1, padding='same')(x)
236 |             x = tf.keras.layers.BatchNormalization()(x)
237 |             x = tf.keras.layers.Activation('relu')(x)
238 |             x = tf.keras.layers.Conv2D(skip.shape[-1], 1, 1, padding='same')(x)
239 |             x = tf.keras.layers.BatchNormalization()(x)
240 |             x = tf.keras.layers.Activation('relu')(x)
241 |             x += skip
242 |         x = ConvMPBlock(x, num_convs=3, fsize=fsize * 2**i, BN=True)
243 | 
244 |     x = tf.keras.layers.Permute((2,1,3))(x)
245 |     x = tf.keras.layers.Reshape((x.shape[1], x.shape[2]*x.shape[3]))(x)
246 |     x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(td_dim, activation='relu'))(x)
247 |     if config.model_type == 'vad' and config.v == 9:
248 |         x = FullyConnectedLayer(x, 512, BN=True)
249 |     x = FullyConnectedLayer(x, 256, BN=True)
250 |     x = FullyConnectedLayer(x, 128, BN=True)
251 |     if config.model_type == 'vad' and config.v == 9:
252 |         x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
253 |     x = FullyConnectedLayer(x, 64, BN=True)
254 |     x = FullyConnectedLayer(x, 3, act='sigmoid' if config.model_type == 'vad' else 'relu', name='class' if config.model_type == 'se' else None)
255 |     return CustomModel(inputs=input_tensor, outputs=x)
256 | 
257 | 
258 | def convset(inp, chan=16):
259 |     out = inp
260 |     for _ in range(2):
261 |         out = tf.keras.layers.Conv2D(chan, 3, strides=1, padding='same')(out)
262 |         out = tf.keras.layers.BatchNormalization()(out)
263 |         out = tf.keras.layers.Activation('relu')(out)
264 |     out = tf.keras.layers.MaxPooling2D()(out)
265 |     return out
266 | 
267 | 
268 | def upsampling(inp, chan=64):
269 |     out = inp
270 |     out = tf.keras.layers.Conv2D(chan, 3, strides=1, padding='same')(out)
271 |     out = tf.keras.layers.BatchNormalization()(out)
272 |     out = tf.keras.layers.Activation('relu')(out)
273 |     return tf.keras.layers.Conv2DTranspose(chan, 2, 2, padding='same')(out)
274 | 
275 | 
276 | def speech_enhancement_model(input):
277 |     merge_input = tf.keras.layers.Input(tensor=input[1:])
278 |     inp1 = convset(merge_input, 64)
279 |     inp2 = convset(inp1, 128)
280 |     inp3 = convset(inp2, 256)
281 |     latent = convset(inp3, 512)
282 |     
283 |     speech3 = upsampling(latent, 256)
284 |     speech2 = upsampling(tf.keras.layers.Concatenate(-1)([inp3, speech3]), 128)
285 |     speech1 = upsampling(tf.keras.layers.Concatenate(-1)([inp2, speech2]), 64)
286 |     speech = upsampling(tf.keras.layers.Concatenate(-1)([inp1, speech1]), 2)
287 | 
288 |     noise3 = upsampling(latent, 256)
289 |     noise2 = upsampling(tf.keras.layers.Concatenate(-1)([inp3, noise3]), 128)
290 |     noise1 = upsampling(tf.keras.layers.Concatenate(-1)([inp2, noise2]), 64)
291 |     noise = upsampling(tf.keras.layers.Concatenate(-1)([inp1, noise1]), 2)
292 |     return CustomModel(inputs=merge_input, outputs=[speech, noise])
293 | 
294 | 
295 | def get_model(config):
296 |     input_tensor = tf.keras.layers.Input(
297 |         shape=(config.n_mels, config.n_frame, config.n_chan))
298 | 
299 |     if config.model_type == 'se':
300 |         input_tensor = tf.keras.layers.Input(shape=(256, config.n_frame, config.n_chan))
301 |         merge_input = input_tensor[:, 1:]
302 |         merge_input = tf.transpose(input_tensor, perm=[0, 2, 1, 3])
303 | 
304 |         se_model = speech_enhancement_model(merge_input)
305 |         if not config.pretrain:
306 |             se_model.trainable = False
307 |         speech, noise = se_model(merge_input)
308 |         
309 |         # out = tf.keras.layers.Concatenate(-1)([speech, noise])
310 |         out = speech
311 |         out = tf.transpose(out, perm=[0, 2, 1, 3])
312 |         config.n_mels = out.shape[1]
313 |         tmp_config = deepcopy(config)
314 |         tmp_config.n_chan = out.shape[-1]
315 |         vadmodel = define_keras_model(tmp_config)
316 |         if config.pretrain:
317 |             vadmodel.trainable = False
318 |         out = vadmodel(out)
319 | 
320 |         # backbone = getattr(tf.keras.applications.efficientnet, f'EfficientNetB4')(
321 |         # include_top=False, weights=None, input_tensor=out)
322 |         # out = tf.keras.layers.Permute((2, 1, 3))(backbone.output)
323 |         # out = tf.keras.layers.Reshape((-1, out.shape[-1] * out.shape[-2]))(out)
324 |         # out = tf.keras.layers.Conv1DTranspose(128, 2, 2)(out)
325 |         # out = tf.keras.layers.Activation('relu')(out)
326 |         # out = tf.keras.layers.Conv1DTranspose(64, 2, 2)(out)
327 |         # out = tf.keras.layers.Activation('relu')(out)
328 |         # out = tf.keras.layers.Conv1DTranspose(32, 2, 2)(out)
329 |         # out = tf.keras.layers.Activation('relu')(out)
330 |         # out = tf.keras.layers.Conv1DTranspose(16, 2, 2)(out)
331 |         # out = tf.keras.layers.Activation('relu')(out)
332 |         # out = tf.keras.layers.Conv1DTranspose(8, 2, 2)(out)
333 |         # out = tf.keras.layers.Activation('relu')(out)
334 |         # out = tf.keras.layers.Dense(config.n_classes)(out)
335 |         # out = tf.keras.layers.Activation('sigmoid', name='class')(out)
336 | 
337 |         speech = tf.keras.layers.Permute((2, 1, 3), name='speech')(speech)
338 |         noise = tf.keras.layers.Permute((2, 1, 3), name='noise')(noise)
339 |         return CustomModel(inputs=[input_tensor], outputs=[out, speech, noise])
340 |     elif config.model_type == 'eff':
341 |         backbone = getattr(tf.keras.applications.efficientnet, f'EfficientNetB{config.model}')(
342 |             include_top=False, weights=None, input_tensor=input_tensor)
343 | 
344 |         out = tf.transpose(backbone.output, perm=[0, 2, 1, 3])
345 |         out = tf.keras.layers.Reshape([-1, out.shape[-1]*out.shape[-2]])(out)
346 | 
347 |         for i in range(config.n_layers):
348 |             out = tf.keras.layers.Dense(config.n_dim)(out)
349 |             out = tf.keras.layers.BatchNormalization()(out)
350 |             out = tf.keras.layers.Activation('sigmoid')(out) * out
351 | 
352 |         # v1 -------------------------
353 |         if config.v == 1:
354 |             out = tf.keras.layers.Conv1DTranspose(128, 2, 2)(out)
355 |             out = tf.keras.layers.Activation('relu')(out)
356 |             out = tf.keras.layers.Conv1DTranspose(64, 2, 2)(out)
357 |             out = tf.keras.layers.Activation('relu')(out)
358 |             out = tf.keras.layers.Conv1DTranspose(32, 2, 2)(out)
359 |             out = tf.keras.layers.Activation('relu')(out)
360 |             out = tf.keras.layers.Conv1DTranspose(16, 2, 2)(out)
361 |             out = tf.keras.layers.Activation('relu')(out)
362 |             out = tf.keras.layers.Conv1DTranspose(3, 2, 2)(out)
363 |             out = tf.keras.layers.Activation('relu')(out)
364 |         # v2 -------------------------
365 |         elif config.v == 2:
366 |             raise ValueError('version 2 is deprecated')
367 |             out = tf.keras.layers.Conv1DTranspose(128, 2, 2)(out)
368 |             out = tf.keras.layers.Conv1DTranspose(64, 2, 2)(out)
369 |             out = tf.keras.layers.Conv1DTranspose(32, 2, 2)(out)
370 |             out = tf.keras.layers.Conv1DTranspose(16, 2, 2)(out)
371 |             out = tf.keras.layers.Conv1DTranspose(3, 2, 2)(out)
372 |         elif config.v == 3:
373 |             out = out
374 |         elif config.v == 4:
375 |             raise ValueError('version 4 is deprecated')
376 |             out = tf.keras.layers.Conv1D(config.n_frame, 1, use_bias=False, data_format='channels_first')(out)
377 |         elif config.v == 5:
378 |             if out.shape[1] != config.n_frame * 256 // 16000:
379 |                 out = tf.keras.layers.Conv1D(config.n_frame * 256 // 16000, 1, use_bias=False, data_format='channels_first')(out)
380 |                 out = tf.keras.layers.BatchNormalization()(out)
381 |                 out = tf.keras.layers.Activation('relu')(out)
382 |             out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(out)
383 |         elif config.v == 6:
384 |             out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(out)
385 |             out = FullyConnectedLayer(out, 256, BN=True)
386 |             out = FullyConnectedLayer(out, 128, BN=True)
387 |             out = FullyConnectedLayer(out, 64, BN=True)
388 |         elif config.v == 7:
389 |             out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(out)
390 |             big = tf.keras.layers.Reshape((config.n_mels, -1))(input_tensor)
391 |             big = tf.keras.layers.Conv1D(out.shape[-1], 16, strides=5, padding='same')(big)
392 |             big = tf.keras.layers.Activation('tanh')(big)
393 |             out *= big
394 |         else:
395 |             raise ValueError('wrong version')
396 |             
397 |         out = tf.keras.layers.Dense(config.n_classes)(out)
398 |         # out= tf.keras.layers.Activation('relu')(out)
399 |         # out *= tf.cast(out < 1., out.dtype)
400 |         out = tf.keras.layers.Activation('sigmoid')(out)
401 |         return tf.keras.models.Model(inputs=input_tensor, outputs=out)
402 |     elif config.model_type == 'vad':
403 |         return define_keras_model(config)
404 | 
405 | 
406 | def main():
407 |     config = ARGS().get()
408 |     os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus
409 |     config.loss = config.loss.upper()
410 |     if config.loss != 'MSE':
411 |         config.mse_multiplier = 1
412 |     print(config)
413 | 
414 |     TOTAL_EPOCH = config.epochs
415 |     BATCH_SIZE = config.batch_size
416 |     NAME = (config.name + '_') if config.name != '' else ''
417 |     if config.model_type == 'eff':
418 |         model_first_name = f'B{config.model}'
419 |     elif config.model_type == 'se':
420 |         model_first_name = 'se'
421 |     elif config.model_type == 'vad':
422 |         model_first_name = 'vad'
423 |         
424 |     NAME = NAME + '_'.join([model_first_name, f'v{config.v}', f'lr{config.lr}', 
425 |                             f'batch{config.batch_size}', f'opt_{config.optimizer}', 
426 |                             f'mel{config.n_mels}', f'chan{config.n_chan}', f'{config.loss.upper()}', f'framelen{config.n_frame}'])
427 |     if config.model_type == 'se' and config.v == 9 and config.pretrain:
428 |         NAME += '_weight'
429 |     NAME = NAME if NAME.endswith('.h5') else NAME + '.h5'
430 |     """ MODEL """
431 |     model = get_model(config)
432 | 
433 |     lr = config.lr
434 |     if config.optimizer == 'adam':
435 |         opt = Adam(lr, clipvalue=config.clipvalue)
436 |     elif config.optimizer == 'sgd':
437 |         opt = SGD(lr, momentum=0.9, clipvalue=config.clipvalue)
438 |     elif config.optimizer == 'rmsprop':
439 |         opt = RMSprop(lr, momentum=0.9, clipvalue=config.clipvalue)
440 |     else:
441 |         raise ValueError('adabelief is deprecated')
442 |         opt = AdaBelief(lr, clipvalue=config.clipvalue)
443 |     # if config.l2 > 0:
444 |     #     model = apply_kernel_regularizer(
445 |     #         model, tf.keras.regularizers.l1_l2(config.l1, config.l2))
446 | 
447 |     if config.loss.upper() == 'BCE':
448 |         loss = tf.keras.losses.BinaryCrossentropy()
449 |     elif config.loss.upper() == 'FOCAL':
450 |         loss = sigmoid_focal_crossentropy
451 |     if config.model_type == 'se' and config.v == 9:
452 |         loss = [loss, tf.losses.MAE, tf.losses.MAE]
453 | 
454 |     metrics = [cos_sim,
455 |                f1_score()]
456 |     if config.v != 5:
457 |         metrics.append(er_score(smoothing=False))
458 |     model.compile(optimizer=opt, 
459 |                 #   loss=custom_loss(alpha=config.loss_alpha, l2=config.loss_l2),
460 |                   loss=loss,
461 |                   loss_weights=[1, 10, 10],
462 |                   metrics=metrics)
463 |     setattr(model, 'train_config', config)
464 |     model.summary()
465 |     print(NAME)
466 | 
467 |     if config.model_type == 'se' and config.v == 9 and not config.pretrain:
468 |         model.load_weights(NAME)
469 |         print('loaded pretrained model')
470 | 
471 |     """ DATA """
472 |     train_set = make_dataset(config, training=True)
473 |     test_set = make_dataset(config, training=False)
474 |     
475 |     earlystop_monitor = 'val_loss'
476 |     model_checkpoint_monitor = 'val_class_er' if config.v == 9 and config.model_type == 'eff' else 'val_er'
477 |     if config.model_type == 'se' and config.v == 9:
478 |         if config.pretrain:
479 |             earlystop_monitor = 'val_speech_loss'
480 |             model_checkpoint_monitor = 'val_speech_loss'
481 |         else:
482 |             earlystop_monitor = 'val_class_loss'
483 |             model_checkpoint_monitor = 'val_class_er'
484 |     else:
485 |         earlystop_monitor = 'val_loss'
486 |         model_checkpoint_monitor = 'val_er'
487 | 
488 |     """ TRAINING """
489 |     callbacks = [
490 |         CSVLogger(NAME.replace('.h5', '.csv'), append=True),
491 |         SWA(start_epoch=TOTAL_EPOCH//4, swa_freq=2),
492 |         ModelCheckpoint(NAME, monitor=model_checkpoint_monitor, save_best_only=True, verbose=1),
493 |         TerminateOnNaN(),
494 |         TensorBoard(log_dir=os.path.join('tensorboard_log', NAME.split('.h5')[0])),
495 |         EarlyStopping(monitor=earlystop_monitor, patience=config.patience, restore_best_weights=True),
496 |         eval_callback(config, NAME),
497 |         # LearningRateScheduler(tf.keras.optimizers.schedules.CosineDecayRestarts(config.lr, 5), verbose=1),
498 |         # LearningRateScheduler(lr_schedule, verbose=1),
499 |         # ReduceLROnPlateau(monitor='val_loss', factor=1 / 2**0.5, patience=5, verbose=1, mode='min')
500 |     ]
501 |     callbacks.append(
502 |         LearningRateScheduler(
503 |             custom_scheduler(4096, TOTAL_EPOCH/12, config.lr_div)))
504 | 
505 |     # if not config.pretrain:
506 |     #     callbacks.append(
507 |     #         LearningRateScheduler(
508 |     #             custom_scheduler(4096, TOTAL_EPOCH/12, config.lr_div)))
509 |     # else:
510 |     #     callbacks.append(ReduceLROnPlateau(monitor='val_loss', factor=1 / 2**0.5, patience=5, verbose=1, mode='min'))
511 | 
512 |     try:
513 |         model.fit(train_set,
514 |                 epochs=TOTAL_EPOCH,
515 |                 batch_size=BATCH_SIZE,
516 |                 steps_per_epoch=config.steps_per_epoch,
517 |                 validation_data=test_set,
518 |                 validation_steps=16,
519 |                 callbacks=callbacks)
520 |         print('best model:', NAME.replace('.h5', '_SWA.h5'))
521 |         model.save(NAME.replace('.h5', '_SWA.h5'))
522 |     except NO_SWA_ERROR:
523 |         pass
524 |     print(NAME.split('.h5')[0])
525 |     exit()
526 | 
527 | 
528 | if __name__ == "__main__":
529 |     main()
530 |     
531 | 


--------------------------------------------------------------------------------
/swa.py:
--------------------------------------------------------------------------------
 1 | # https://github.com/simon-larsson/keras-swa/blob/master/swa/keras.py
 2 | import tensorflow as tf
 3 | 
 4 | 
 5 | class NO_SWA_ERROR(Exception):
 6 |     def __init__(self, msg="Didn't use SWA") -> None:
 7 |         self.msg = msg
 8 | 
 9 |     def __str__(self) -> str:
10 |         return self.msg
11 | 
12 | 
13 | class SWA(tf.keras.callbacks.Callback):
14 |     def __init__(self, start_epoch, swa_freq=1, verbose=True):
15 |         super(SWA, self).__init__()
16 |         self.start_epoch = start_epoch - 1
17 |         self.swa_freq = swa_freq
18 |         self.swa_weights = None
19 |         self.cnt = 0
20 |         self.verbose = verbose
21 | 
22 |     def on_epoch_end(self, epoch, logs=None):
23 |         epoch = epoch - self.start_epoch
24 |         if epoch == 0 or (epoch > 0 and epoch % self.swa_freq == 0):
25 |             if self.verbose:
26 |                 print("\nSaving Weights... ", epoch+self.start_epoch)
27 |             self.update_swa_weights()
28 | 
29 |     def on_train_end(self, logs=None):
30 |         print("\nFinal Model Has Been Saved... Please Reset BN")
31 |         try:
32 |             self.model.set_weights(self.swa_weights)
33 |         except TypeError:
34 |             raise NO_SWA_ERROR()
35 |         
36 |     def update_swa_weights(self):
37 |         if self.swa_weights is None:
38 |             self.swa_weights = self.model.get_weights()
39 |         else:
40 |             self.swa_weights = [
41 |                 (swa_w*self.cnt + w) / (self.cnt+1)
42 |                 for swa_w, w in zip(self.swa_weights, self.model.get_weights())]
43 | 
44 |         self.cnt += 1
45 | 
46 | 


--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import os
  4 | import tensorflow as tf
  5 | from tensorflow.keras.callbacks import *
  6 | from tensorflow.keras.losses import *
  7 | from tensorflow.keras.metrics import *
  8 | from tensorflow.keras.optimizers import *
  9 | 
 10 | from metrics import *
 11 | from pipeline import *
 12 | from swa import SWA
 13 | from transforms import *
 14 | from utils import *
 15 | 
 16 | 
 17 | args = argparse.ArgumentParser()
 18 | args.add_argument('--name', type=str, required=True)
 19 | args.add_argument('--model', type=str, default='EfficientNetB4')
 20 | args.add_argument('--pretrain', type=bool, default=False)
 21 | args.add_argument('--n_layers', type=int, default=0)
 22 | args.add_argument('--n_dim', type=int, default=256)
 23 | args.add_argument('--n_chan', type=int, default=1)
 24 | args.add_argument('--n_classes', type=int, default=3)
 25 | 
 26 | # DATA
 27 | args.add_argument('--datapath', type=str, default='/root/datasets/Interspeech2020/generate_wavs/codes')
 28 | args.add_argument('--background_sounds', type=str, default='drone_normed_complex_v3.pickle')
 29 | args.add_argument('--voices', type=str, default='voice_normed_complex_v3.pickle')
 30 | args.add_argument('--labels', type=str, default='voice_labels_mfc_v3.npy')
 31 | args.add_argument('--noises', type=str, default='noises_specs_v2.pickle')
 32 | args.add_argument('--test_background_sounds', type=str,
 33 |                   default='dummy_specs.pickle')
 34 | args.add_argument('--test_voices', type=str, default='dummy_specs.pickle')
 35 | args.add_argument('--test_labels', type=str, default='dummy_labels.npy')
 36 | args.add_argument('--n_mels', type=int, default=80)
 37 | 
 38 | # TRAINING
 39 | args.add_argument('--optimizer', type=str, default='adabelief',
 40 |                   choices=['adam', 'sgd', 'rmsprop', 'adabelief'])
 41 | args.add_argument('--lr', type=float, default=1e-4)
 42 | args.add_argument('--end_lr', type=float, default=1e-4)
 43 | args.add_argument('--lr_power', type=float, default=0.5)
 44 | args.add_argument('--lr_div', type=float, default=2)
 45 | args.add_argument('--clipvalue', type=float, default=0.01)
 46 | 
 47 | args.add_argument('--epochs', type=int, default=500)
 48 | args.add_argument('--batch_size', type=int, default=12)
 49 | args.add_argument('--n_frame', type=int, default=2048)
 50 | args.add_argument('--steps_per_epoch', type=int, default=100)
 51 | args.add_argument('--l1', type=float, default=0)
 52 | args.add_argument('--l2', type=float, default=1e-6)
 53 | args.add_argument('--loss_alpha', type=float, default=0.8)
 54 | args.add_argument('--loss_l2', type=float, default=1.)
 55 | args.add_argument('--multiplier', type=float, default=10)
 56 | 
 57 | # AUGMENTATION
 58 | args.add_argument('--snr', type=float, default=-15)
 59 | args.add_argument('--max_voices', type=int, default=10)
 60 | args.add_argument('--max_noises', type=int, default=6)
 61 | 
 62 | 
 63 | def minmax_log_on_mel(mel, labels=None):
 64 |     # batch-wise pre-processing
 65 |     axis = tuple(range(1, len(mel.shape)))
 66 | 
 67 |     # MIN-MAX
 68 |     mel_max = tf.math.reduce_max(mel, axis=axis, keepdims=True)
 69 |     mel_min = tf.math.reduce_min(mel, axis=axis, keepdims=True)
 70 |     mel = safe_div(mel-mel_min, mel_max-mel_min)
 71 | 
 72 |     # LOG
 73 |     mel = tf.math.log(mel + EPSILON)
 74 | 
 75 |     if labels is not None:
 76 |         return mel, labels
 77 |     return mel
 78 | 
 79 | 
 80 | def augment(specs, labels, time_axis=-2, freq_axis=-3):
 81 |     specs = mask(specs, axis=time_axis, max_mask_size=24, n_mask=6)
 82 |     specs = mask(specs, axis=freq_axis, max_mask_size=16)
 83 |     return specs, labels
 84 | 
 85 | 
 86 | def preprocess_labels(multiplier):
 87 |     def _preprocess(x, y):
 88 |         # process y: [None, time, classes] -> [None, time', classes]
 89 |         for i in range(5):
 90 |             # sum_pool1d
 91 |             y = tf.nn.avg_pool1d(y, 2, strides=2, padding='SAME') * 2
 92 |         y *= multiplier
 93 |         return x, y
 94 |     return _preprocess
 95 | 
 96 | 
 97 | def to_density_labels(x, y):
 98 |     """
 99 |     :param y: [..., n_voices, n_frames, n_classes]
100 |     :return: [..., n_frames, n_classes]
101 |     """
102 |     y = safe_div(y, tf.reduce_sum(y, axis=(-2, -1), keepdims=True))
103 |     y = tf.reduce_sum(y, axis=-3)
104 |     return x, y
105 | 
106 | 
107 | def make_dataset(config, training=True, n_classes=3):
108 |     # Load required datasets
109 |     if not os.path.exists(config.datapath):
110 |        config.datapath = ''
111 |     if training:
112 |         backgrounds = load_data(os.path.join(config.datapath, config.background_sounds))
113 |         voices = load_data(os.path.join(config.datapath, config.voices))
114 |         labels = load_data(os.path.join(config.datapath, config.labels))
115 |     else:
116 |         backgrounds = load_data(os.path.join(config.datapath, config.test_background_sounds))
117 |         voices = load_data(os.path.join(config.datapath, config.test_voices))
118 |         labels = load_data(os.path.join(config.datapath, config.test_labels))
119 |     if labels.max() - 1 != config.n_classes:
120 |         labels //= 10
121 |     labels = np.eye(n_classes, dtype='float32')[labels] # to one-hot vectors
122 |     noises = load_data(os.path.join(config.datapath, config.noises))
123 | 
124 |     # Make pipeline and process the pipeline
125 |     pipeline = make_pipeline(backgrounds, 
126 |                              voices, labels, noises,
127 |                              n_frame=config.n_frame,
128 |                              max_voices=config.max_voices,
129 |                              max_noises=config.max_noises,
130 |                              n_classes=n_classes,
131 |                              snr=config.snr,
132 |                              min_ratio=1)
133 |     pipeline = pipeline.map(to_density_labels)
134 |     if training: 
135 |         pipeline = pipeline.map(augment)
136 |     pipeline = pipeline.batch(config.batch_size, drop_remainder=False)
137 |     pipeline = pipeline.map(complex_to_magphase)
138 |     pipeline = pipeline.map(magphase_to_mel(config.n_mels))
139 |     pipeline = pipeline.map(minmax_log_on_mel)
140 |     pipeline = pipeline.map(preprocess_labels(config.multiplier))
141 |     return pipeline.prefetch(AUTOTUNE)
142 | 
143 | 
144 | def custom_loss(alpha=0.8, l2=1):
145 |     def _custom(y_true, y_pred):
146 |         # y_true, y_pred = [None, time, 30]
147 |         # [None, time, 30] -> [None, time, 3, 10]
148 |         t_true = tf.stack(tf.split(y_true, 3, axis=-1), axis=-2)
149 |         t_pred = tf.stack(tf.split(y_pred, 3, axis=-1), axis=-2)
150 | 
151 |         # [None, time, 10]
152 |         d_y_true = tf.reduce_sum(t_true, axis=-2)
153 |         d_y_pred = tf.reduce_sum(t_pred, axis=-2)
154 | 
155 |         # [None, time, 3]
156 |         c_y_true = tf.reduce_sum(t_true, axis=-1)
157 |         c_y_pred = tf.reduce_sum(t_pred, axis=-1)
158 | 
159 |         loss = alpha * tf.keras.losses.MAE(tf.reduce_sum(d_y_true, axis=1),
160 |                                            tf.reduce_sum(d_y_pred, axis=1)) \
161 |              + (1-alpha) * tf.keras.losses.MAE(tf.reduce_sum(c_y_true, axis=1),
162 |                                                tf.reduce_sum(c_y_pred, axis=1))
163 | 
164 |         # TODO: OT loss
165 |         # TV: total variation loss
166 |         # normed - degrees [None, time, 10]
167 |         n_d_true = safe_div(
168 |             d_y_true, tf.reduce_sum(d_y_true, axis=1, keepdims=True))
169 |         n_d_pred = safe_div(
170 |             d_y_pred, tf.reduce_sum(d_y_pred, axis=1, keepdims=True))
171 | 
172 |         # normed - classes [None, time, 3]
173 |         n_c_true = safe_div(
174 |             c_y_true, tf.reduce_sum(c_y_true, axis=1, keepdims=True))
175 |         n_c_pred = safe_div(
176 |             c_y_pred, tf.reduce_sum(c_y_pred, axis=1, keepdims=True))
177 | 
178 |         tv = alpha * tf.reduce_mean(
179 |                 tf.reduce_sum(tf.math.abs(n_d_true - n_d_pred), axis=1) 
180 |                 * tf.reduce_sum(d_y_true, axis=1), # [None, 10]
181 |                 axis=1)
182 |         tv += (1-alpha) * tf.reduce_mean(
183 |                 tf.reduce_sum(tf.math.abs(n_c_true - n_c_pred), axis=1) 
184 |                 * tf.reduce_sum(c_y_true, axis=1), # [None, 3]
185 |                 axis=1)
186 |         loss += l2 * tv
187 | 
188 |         return loss
189 |     return _custom
190 | 
191 | 
192 | def cos_sim(y_true, y_pred):
193 |     mask = tf.cast(
194 |         tf.reduce_sum(y_true, axis=-2) > 0., tf.float32) # [None, 30]
195 |     mask = safe_div(mask, tf.reduce_sum(mask, axis=-1, keepdims=True))
196 |     return tf.reduce_sum(
197 |         tf.keras.losses.cosine_similarity(y_true, y_pred, axis=-2) * mask, 
198 |         axis=-1)
199 | 
200 | 
201 | def custom_scheduler(d_model, warmup_steps=4000, lr_div=2):
202 |     # https://www.tensorflow.org/tutorials/text/transformer#optimizer
203 |     d_model = tf.cast(d_model, tf.float32)
204 | 
205 |     def _scheduler(step):
206 |         step = tf.cast(step+1, tf.float32)
207 |         arg1 = tf.math.rsqrt(step)
208 |         arg2 = step * (warmup_steps ** -1.5)
209 |         return tf.math.rsqrt(d_model) * tf.math.minimum(arg1, arg2) / lr_div
210 |     return _scheduler
211 | 
212 | 
213 | if __name__ == "__main__":
214 |     config = args.parse_args()
215 |     print(config)
216 | 
217 |     TOTAL_EPOCH = config.epochs
218 |     BATCH_SIZE = config.batch_size
219 |     NAME = config.name if config.name.endswith('.h5') else config.name + '.h5'
220 | 
221 |     """ MODEL """
222 |     input_tensor = tf.keras.layers.Input(
223 |         shape=(config.n_mels, config.n_frame, config.n_chan))
224 |     backbone = getattr(tf.keras.applications.efficientnet, config.model)(
225 |         include_top=False, weights=None, input_tensor=input_tensor)
226 | 
227 |     out = tf.transpose(backbone.output, perm=[0, 2, 1, 3])
228 |     out = tf.keras.layers.Reshape([-1, out.shape[-1]*out.shape[-2]])(out)
229 | 
230 |     for i in range(config.n_layers):
231 |         out = tf.keras.layers.Dense(config.n_dim)(out)
232 |         out = tf.keras.layers.BatchNormalization()(out)
233 |         out = tf.keras.layers.Activation('sigmoid')(out) * out
234 | 
235 |     out = tf.keras.layers.Dense(config.n_classes, activation='relu')(out)
236 |     model = tf.keras.models.Model(inputs=input_tensor, outputs=out)
237 | 
238 |     lr = config.lr
239 |     if config.optimizer == 'adam':
240 |         opt = Adam(lr, clipvalue=config.clipvalue)
241 |     elif config.optimizer == 'sgd':
242 |         opt = SGD(lr, momentum=0.9, clipvalue=config.clipvalue)
243 |     elif config.optimizer == 'rmsprop':
244 |         opt = RMSprop(lr, momentum=0.9, clipvalue=config.clipvalue)
245 |     else:
246 |         opt = AdaBelief(lr, clipvalue=config.clipvalue)
247 | 
248 |     if config.l2 > 0:
249 |         model = apply_kernel_regularizer(
250 |             model, tf.keras.regularizers.l1_l2(config.l1, config.l2))
251 |     model.compile(optimizer=opt, 
252 |                   loss=custom_loss(alpha=config.loss_alpha, l2=config.loss_l2),
253 |                   metrics=[cos_sim])
254 |     # model.summary()
255 | 
256 |     if config.pretrain:
257 |         model.load_weights(NAME)
258 |         print('loaded pretrained model')
259 | 
260 |     """ DATA """
261 |     train_set = make_dataset(config, training=True)
262 |     test_set = make_dataset(config, training=False)
263 |     
264 |     """ TRAINING """
265 |     callbacks = [
266 |         CSVLogger(NAME.replace('.h5', '.log'), append=True),
267 |         SWA(start_epoch=TOTAL_EPOCH//2, swa_freq=2),
268 |         ModelCheckpoint(NAME, monitor='val_loss', save_best_only=True,
269 |                         verbose=1),
270 |         TerminateOnNaN()
271 |     ]
272 | 
273 |     if not config.pretrain:
274 |         callbacks.append(
275 |             LearningRateScheduler(
276 |                 custom_scheduler(4096, TOTAL_EPOCH/12, config.lr_div)))
277 |     else:
278 |         callbacks.append(
279 |             ReduceLROnPlateau(monitor='loss', factor=0.9, patience=5))
280 | 
281 |     model.fit(train_set,
282 |               epochs=TOTAL_EPOCH,
283 |               batch_size=BATCH_SIZE,
284 |               steps_per_epoch=config.steps_per_epoch,
285 |               validation_data=test_set,
286 |               validation_steps=16,
287 |               callbacks=callbacks)
288 | 
289 |     model.save(NAME.replace('.h5', '_SWA.h5'))
290 | 
291 | 


--------------------------------------------------------------------------------
/transforms.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import tensorflow as tf
  3 | from math import log, e
  4 | 
  5 | 
  6 | AUTOTUNE = tf.data.experimental.AUTOTUNE
  7 | EPSILON = 1e-8
  8 | LOG_EPSILON = log(EPSILON) / log(e)
  9 | 
 10 | 
 11 | """ FEATURE INDEPENDENT AUGMENTATIONS """
 12 | def mask(specs, axis, max_mask_size=None, n_mask=1):
 13 |     def make_shape(size):
 14 |         # returns (1, ..., size, ..., 1)
 15 |         shape = [1] * len(specs.shape)
 16 |         shape[axis] = size
 17 |         return tuple(shape)
 18 | 
 19 |     total = specs.shape[axis]
 20 |     mask = tf.ones(make_shape(total), dtype=specs.dtype)
 21 |     if max_mask_size is None:
 22 |         max_mask_size = total
 23 | 
 24 |     def apply_random_mask(mask):
 25 |         size = tf.random.uniform([], maxval=max_mask_size, dtype=tf.int32)
 26 |         offset = tf.random.uniform([], maxval=total-size, dtype=tf.int32)
 27 | 
 28 |         mask *= tf.concat(
 29 |             (tf.ones(shape=make_shape(offset), dtype=mask.dtype),
 30 |              tf.zeros(shape=make_shape(size), dtype=mask.dtype),
 31 |              tf.ones(shape=make_shape(total-size-offset), dtype=mask.dtype)),
 32 |             axis=axis)
 33 |         return mask
 34 | 
 35 |     i = tf.constant(0)
 36 |     cond = lambda i, m: i < n_mask
 37 |     body = lambda i, m: (i+1, apply_random_mask(m))
 38 |     _, mask = tf.while_loop(cond, body, (i, mask))
 39 | 
 40 |     return specs * mask
 41 | 
 42 | 
 43 | def random_shift(specs, axis=0, width=16):
 44 |     new_specs = tf.pad(specs, [[0]*2 if i != axis else [width]*2
 45 |                                for i in range(len(specs.shape))])
 46 |     new_specs = tf.image.random_crop(new_specs, specs.shape)
 47 |     return new_specs
 48 | 
 49 | 
 50 | """ MAGNITUDE-PHASE SPECTROGRAM """
 51 | def magphase_to_mel(num_mel_bins=80,
 52 |                     num_spectrogram_bins=257, 
 53 |                     sample_rate=16000,
 54 |                     **kwargs):
 55 |     mel_matrix = tf.signal.linear_to_mel_weight_matrix(
 56 |         num_mel_bins, num_spectrogram_bins, sample_rate, **kwargs)
 57 | 
 58 |     def _magphase_to_mel(x, y=None):
 59 |         '''
 60 |         x: [batch_size, freq, time, chan2]
 61 | 
 62 |         output: [batch_size, mel_freq, time, chan]
 63 |         '''
 64 |         x = x[..., :tf.shape(x)[-1] // 2] # remove phase
 65 |         x = tf.tensordot(x, mel_matrix, axes=[-3, 0]) # [b, time, chan, mel]
 66 |         
 67 |         if len(x.shape) == 4:
 68 |             x = tf.transpose(x, perm=[0, 3, 1, 2])
 69 |         elif len(x.shape) == 3:
 70 |             x = tf.transpose(x, perm=[2, 0, 1])
 71 |         else:
 72 |             raise ValueError('len(x.shape) must be 3 or 4')
 73 | 
 74 |         if y is None:
 75 |             return x
 76 |         return x, y
 77 |     return _magphase_to_mel 
 78 | 
 79 | 
 80 | def log_magphase(specs, labels=None, n_chan=2):
 81 |     specs = tf.concat(
 82 |             [tf.math.log(specs[..., :n_chan]+EPSILON), specs[..., n_chan:]],
 83 |             axis=-1)
 84 |     if labels is not None:
 85 |         return specs, labels
 86 |     return specs
 87 | 
 88 | 
 89 | def minmax_norm_magphase(specs, labels=None):
 90 |     n_chan = specs.shape[-1] // 2
 91 |     mag = specs[..., :n_chan]
 92 |     phase = specs[..., n_chan:]
 93 |     axis = tuple(range(1, len(specs.shape)))
 94 | 
 95 |     mag_max = tf.math.reduce_max(mag, axis=axis, keepdims=True)
 96 |     mag_min = tf.math.reduce_min(mag, axis=axis, keepdims=True)
 97 |     phase_max = tf.math.reduce_max(phase, axis=axis, keepdims=True)
 98 |     phase_min = tf.math.reduce_min(phase, axis=axis, keepdims=True)
 99 | 
100 |     specs = tf.concat(
101 |         [(mag-mag_min)/(mag_max-mag_min+EPSILON),
102 |          (phase-phase_min)/(phase_max-phase_min+EPSILON)],
103 |         axis=-1)
104 | 
105 |     if labels is not None:
106 |         return specs, labels
107 |     return specs
108 | 
109 | 
110 | """ COMPLEX-SPECTROGRAMS """
111 | def complex_to_magphase(complex_tensor, y=None):
112 |     n_chan = complex_tensor.shape[-1] // 2
113 |     real = complex_tensor[..., :n_chan]
114 |     img = complex_tensor[..., n_chan:]
115 | 
116 |     mag = tf.math.sqrt(real**2 + img**2)
117 |     phase = tf.math.atan2(img, real)
118 | 
119 |     magphase = tf.concat([mag, phase], axis=-1)
120 | 
121 |     if y is None:
122 |         return magphase
123 |     return magphase, y
124 | 
125 | 
126 | def magphase_to_complex(magphase):
127 |     n_chan = magphase.shape[-1] // 2
128 |     mag = magphase[..., :n_chan]
129 |     phase = magphase[..., n_chan:]
130 | 
131 |     real = mag * tf.cos(phase)
132 |     img = mag * tf.sin(phase)
133 | 
134 |     return tf.concat([real, img], axis=-1)
135 | 
136 | 
137 | def phase_vocoder(complex_spec: tf.Tensor,
138 |                   rate: float=1.) -> tf.Tensor:
139 |     """
140 |     https://pytorch.org/audio/_modules/torchaudio/functional.html#phase_vocoder
141 | 
142 |     complex_spec: [freq, time, chan*2] 
143 |                   [..., :chan] = real, [..., chan:] = imag
144 |     rate: float > 0.
145 |     """
146 |     if rate == 1:
147 |         return complex_spec
148 | 
149 |     # shape = tf.shape(complex_spec)
150 |     freq = complex_spec.shape[0]
151 |     hop_length = freq - 1 # n_fft // 2
152 |     n_chan = complex_spec.shape[-1] // 2
153 | 
154 |     def angle(spec):
155 |         return tf.math.atan2(spec[..., n_chan:], spec[..., :n_chan])
156 | 
157 |     phase_advance = tf.linspace(
158 |         0., np.pi * tf.cast(hop_length, 'float32'), freq)
159 |     phase_advance = tf.reshape(phase_advance, (-1, 1, 1))
160 |     time_steps = tf.range(
161 |         0, tf.shape(complex_spec)[1], rate, dtype=complex_spec.dtype)
162 | 
163 |     spec = tf.pad(
164 |         complex_spec,
165 |         [[0, 0] if i != 1 else [0, 2] for i in range(len(complex_spec.shape))])
166 | 
167 |     spec_0 = tf.gather(spec, tf.cast(time_steps, 'int32'), axis=1)
168 |     spec_1 = tf.gather(spec, tf.cast(time_steps+1, 'int32'), axis=1)
169 | 
170 |     angle_0 = angle(spec_0)
171 |     angle_1 = angle(spec_1)
172 | 
173 |     norm_0 = tf.norm(
174 |         tf.transpose(tf.reshape(spec_0, (freq, -1, 2, n_chan)), (0, 1, 3, 2)),
175 |         2, axis=-1)
176 |     norm_1 = tf.norm(
177 |         tf.transpose(tf.reshape(spec_1, (freq, -1, 2, n_chan)), (0, 1, 3, 2)),
178 |         2, axis=-1)
179 | 
180 |     # Compute Phase Accum
181 |     phase_0 = angle(spec[..., :1, :]) # first frame angle
182 |     phase = angle_1 - angle_0 - phase_advance
183 |     phase = phase - 2 * np.pi * tf.math.round(phase / (2 * np.pi))
184 |     phase = phase + phase_advance
185 |     phase = tf.concat([phase_0, phase[:, :-1]], axis=1)
186 |     phase_acc = tf.cumsum(phase, 1)
187 | 
188 |     alphas = tf.reshape(time_steps % 1., (1, -1, 1))
189 |     mag = alphas * norm_1 + (1 - alphas) * norm_0
190 | 
191 |     real = mag * tf.cos(phase_acc)
192 |     imag = mag * tf.sin(phase_acc)
193 | 
194 |     spec = tf.concat([real, imag], axis=-1)
195 |     return spec
196 | 
197 | 


--------------------------------------------------------------------------------
/transforms_test.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import torchaudio
  4 | import numpy as np
  5 | import tensorflow as tf
  6 | from transforms import *
  7 | 
  8 | 
  9 | class TransformsTest(tf.test.TestCase):
 10 |     def test_mask(self):
 11 |         tf.random.set_seed(100)
 12 |         org = np.array([[ 0,  1,  2,  3,  4],
 13 |                         [ 5,  6,  7,  8,  9],
 14 |                         [10, 11, 12, 13, 14],
 15 |                         [15, 16, 17, 18, 19],
 16 |                         [20, 21, 22, 23, 24]])
 17 |         target = np.array([[ 0,  0,  0,  0,  0],
 18 |                            [ 0,  0,  0,  0,  0],
 19 |                            [ 0,  0,  0,  0,  0],
 20 |                            [15, 16, 17, 18, 19],
 21 |                            [20, 21, 22, 23, 24]])
 22 |         self.assertAllEqual(target, 
 23 |                             mask(org, axis=0, max_mask_size=None, n_mask=1))
 24 | 
 25 |         tf.random.set_seed(2020)
 26 |         target = np.array([[ 0,  1,  0,  3,  4],
 27 |                            [ 0,  6,  0,  8,  9],
 28 |                            [ 0, 11,  0, 13, 14],
 29 |                            [ 0, 16,  0, 18, 19],
 30 |                            [ 0, 21,  0, 23, 24]])
 31 |         self.assertAllEqual(target, 
 32 |                             mask(org, axis=1, max_mask_size=3, n_mask=2))
 33 | 
 34 |     def test_random_shift(self):
 35 |         tf.random.set_seed(0)
 36 |         org = np.array([[0, 1, 2],
 37 |                         [3, 4, 5],
 38 |                         [6, 7, 8]])
 39 |         target = np.array([[3, 4, 5],
 40 |                            [6, 7, 8],
 41 |                            [0, 0, 0]])
 42 |         self.assertAllEqual(target, 
 43 |                             random_shift(org, axis=0, width=2))
 44 | 
 45 |     def test_magphase_to_mel(self):
 46 |         # BATCH
 47 |         n_mels = 80
 48 |         magphase = np.random.randn(32, 257, 100, 4).astype('float32')
 49 |         mel = magphase_to_mel(n_mels)(magphase)
 50 |         self.assertEqual(mel.shape, [32, n_mels, 100, 2])
 51 | 
 52 |         # SINGLE SAMPLE
 53 |         magphase = np.random.randn(257, 100, 4).astype('float32')
 54 |         mel = magphase_to_mel(n_mels)(magphase)
 55 |         self.assertEqual(mel.shape, [n_mels, 100, 2])
 56 | 
 57 |     def test_log_magphase(self):
 58 |         specs = np.array([[  1,  10, 100,   0,   1,  -1],
 59 |                           [500,  50,   5,   3,  -3,   0]])
 60 |         t_specs = np.array([[0.      , 2.302585, 4.605170,  0,  1, -1],
 61 |                             [6.214608, 3.912023, 1.609438,  3, -3,  0]])
 62 |         self.assertAllClose(t_specs, log_magphase(specs, n_chan=3))
 63 | 
 64 |     def test_minmax_norm_magphse(self):
 65 |         n_sample, n_feature, n_chan = 5, 10, 2
 66 |         axis = tuple(range(1, 3))
 67 |         mag = np.random.randn(n_sample, n_feature, n_chan)
 68 |         phase = np.random.rand(n_sample, n_feature, n_chan)
 69 |         phase = (2*phase - 1) * np.pi
 70 |         magphase = np.concatenate([mag, phase], axis=-1)
 71 | 
 72 |         minmax_normed = minmax_norm_magphase(magphase)
 73 |         mins = tf.math.reduce_min(minmax_normed, axis=axis)
 74 |         maxs = tf.math.reduce_max(minmax_normed, axis=axis)
 75 | 
 76 |         self.assertAllClose(mins, tf.zeros_like(mins))
 77 |         self.assertAllClose(maxs, tf.ones_like(maxs))
 78 | 
 79 |     def test_complex_to_magphase(self):
 80 |         complex_tensor = np.array(
 81 |             [[1, 0], [0, 1], [-1, 0], [0, -1]], dtype='float32')
 82 |         magphase = np.array(
 83 |             [[1, 0], [1, np.pi/2], [1, np.pi], [1, -np.pi/2]],
 84 |             dtype='float32')
 85 | 
 86 |         self.assertAllClose(magphase,
 87 |                             complex_to_magphase(complex_tensor))
 88 | 
 89 |     def test_magphase_to_complex(self):
 90 |         magphase = np.array(
 91 |             [[1, 0], [1, np.pi/2], [1, np.pi], [1, -np.pi/2]],
 92 |             dtype='float32')
 93 |         complex_tensor = np.array(
 94 |             [[1, 0], [0, 1], [-1, 0], [0, -1]], dtype='float32')
 95 | 
 96 |         self.assertAllClose(complex_tensor, magphase_to_complex(magphase))
 97 | 
 98 |     def test_phase_vocoder(self):
 99 |         n_freq, time, chan2 = 257, 100, 6
100 |         complex_spec = tf.random.normal([n_freq, time, chan2])
101 | 
102 |         self.assertAllEqual(complex_spec,
103 |                             phase_vocoder(complex_spec, 1.))
104 |         
105 |         for rate in [1.2, 0.8]:
106 |             pv = phase_vocoder(complex_spec, rate=rate)
107 |             self.assertAllEqual([n_freq, int(np.ceil(time/rate)), chan2],
108 |                                 pv.shape)
109 | 
110 | 
111 | if __name__ == '__main__':
112 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
113 |     tf.test.main()
114 | 
115 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | import pickle
  4 | import tensorflow as tf
  5 | 
  6 | EPSILON = 1e-8
  7 | label_downsample_model = (3, 6, 7, 8, 9)
  8 | 
  9 | 
 10 | ''' 
 11 | UTILS FOR FRAMES AND WINDOWS 
 12 | '''
 13 | def seq_to_windows(seq, 
 14 |                    window, 
 15 |                    skip=1,
 16 |                    padding=True, 
 17 |                    **kwargs):
 18 |     '''
 19 |     INPUT:
 20 |         seq: np.ndarray
 21 |         window: array of indices
 22 |             ex) [-3, -1, 0, 1, 3]
 23 |         skip: int
 24 |         padding: bool
 25 |         **kwargs: params for np.pad
 26 | 
 27 |     OUTPUT:
 28 |         windows: [n_windows, window_size, ...]
 29 |     '''
 30 |     window = np.array(window - np.min(window)).astype(np.int32)
 31 |     win_size = max(window) + 1
 32 |     windows = window[np.newaxis, :] \
 33 |             + np.arange(0, len(seq), skip)[:, np.newaxis]
 34 |     if padding:
 35 |         seq = np.pad(
 36 |             seq,
 37 |             [[win_size//2, (win_size-1)//2]] + [[0, 0]]*len(seq.shape[1:]),
 38 |             mode='constant',
 39 |             **kwargs)
 40 | 
 41 |     return np.take(seq, windows, axis=0)
 42 | 
 43 | 
 44 | def windows_to_seq(windows,
 45 |                    window,
 46 |                    skip=1):
 47 |     '''
 48 |     INPUT:
 49 |         windows: np.ndarray (n_windows, window_size, ...)
 50 |         window: array of indices
 51 |         skip: int
 52 | 
 53 |     OUTPUT:
 54 |         seq
 55 |     '''
 56 |     n_window = windows.shape[0]
 57 |     window = np.array(window - np.min(window)).astype(np.int32)
 58 |     win_size = max(window)
 59 | 
 60 |     seq_len = (n_window-1)*skip + 1
 61 |     seq = np.zeros([seq_len, *windows.shape[2:]], dtype=windows.dtype)
 62 |     count = np.zeros(seq_len)
 63 | 
 64 |     for i, w in enumerate(window):
 65 |         indices = np.arange(n_window)*skip - win_size//2 + w
 66 |         select = np.logical_and(0 <= indices, indices < seq_len)
 67 |         seq[indices[select]] += windows[select, i]
 68 |         count[indices[select]] += 1
 69 |     
 70 |     seq = seq / (count + EPSILON)
 71 |     return seq
 72 | 
 73 | 
 74 | '''
 75 | DATASET
 76 | '''
 77 | def list_to_generator(dataset: list):
 78 |     def _gen():
 79 |         if isinstance(dataset, tuple):
 80 |             for z in zip(*dataset):
 81 |                 yield z
 82 |         else:
 83 |             for data in dataset:
 84 |                 yield data
 85 |     return _gen
 86 | 
 87 | 
 88 | def load_data(path):
 89 |     if path.endswith('.pickle'):
 90 |         return pickle.load(open(path, 'rb'))
 91 |     elif path.endswith('.npy'):
 92 |         return np.load(path)
 93 |     else:
 94 |         raise ValueError('invalid file format')
 95 | 
 96 | 
 97 | '''
 98 | MODEL
 99 | '''
100 | def apply_kernel_regularizer(model, kernel_regularizer):
101 |     model = tf.keras.models.clone_model(model)
102 |     layer_types = (tf.keras.layers.Dense, tf.keras.layers.Conv2D)
103 |     for layer in model.layers:
104 |         if isinstance(layer, layer_types):
105 |             layer.kernel_regularizer = kernel_regularizer
106 | 
107 |     model = tf.keras.models.clone_model(model)
108 |     return model
109 | 
110 | 
111 | '''
112 | ETC
113 | '''
114 | def safe_div(x, y, eps=EPSILON):
115 |     # returns safe x / max(y, epsilon)
116 |     return x / tf.maximum(y, eps)
117 | 
118 | 
119 | def predict(model, xs, reverse_and_add=False, vad=False, **kwargs):
120 |     output = model.predict(xs, **kwargs)
121 |     if vad:
122 |         output = output[..., :30] * tf.nn.sigmoid(output[..., 30:])
123 | 
124 |     if reverse_and_add:
125 |         rev_output = model.predict(tf.reverse(xs, [-1]), **kwargs)
126 |         if vad:
127 |             rev_output = rev_output[..., :30] * tf.nn.sigmoid(rev_output[..., 30:])
128 |         shape = rev_output.shape[:-1]
129 |         rev_output = rev_output.reshape(*shape, -1, 10)
130 |         rev_output = np.flip(rev_output, -1)
131 |         rev_output = rev_output.reshape(*shape, -1)
132 |         
133 |         output = (output + rev_output) / 2
134 |     return output
135 | 
136 | 
137 | '''
138 | OPTIMIZER
139 | '''
140 | class AdaBelief(tf.keras.optimizers.Optimizer):
141 |     _HAS_AGGREGATE_GRAD = True
142 | 
143 |     def __init__(self,
144 |                  learning_rate=0.001,
145 |                  beta_1=0.9,
146 |                  beta_2=0.999,
147 |                  epsilon=1e-7,
148 |                  amsgrad=False,
149 |                  name='AdaBelief',
150 |                  **kwargs):
151 |         super(AdaBelief, self).__init__(name, **kwargs)
152 |         self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
153 |         self._set_hyper('decay', self._initial_decay)
154 |         self._set_hyper('beta_1', beta_1)
155 |         self._set_hyper('beta_2', beta_2)
156 |         self.epsilon = epsilon or backend_config.epsilon()
157 |         self.amsgrad = amsgrad
158 | 
159 |     def _create_slots(self, var_list):
160 |         # Create slots for the first and second moments.
161 |         # Separate for-loops to respect the ordering of slot variables from v1.
162 |         for var in var_list:
163 |             self.add_slot(var, 'm')
164 |         for var in var_list:
165 |             self.add_slot(var, 'v')
166 |         if self.amsgrad:
167 |             for var in var_list:
168 |                 self.add_slot(var, 'vhat')
169 | 
170 |     def _prepare_local(self, var_device, var_dtype, apply_state):
171 |         super(AdaBelief, self)._prepare_local(var_device, var_dtype, apply_state)
172 | 
173 |         local_step = tf.cast(self.iterations + 1, var_dtype)
174 |         beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype))
175 |         beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype))
176 |         beta_1_power = tf.math.pow(beta_1_t, local_step)
177 |         beta_2_power = tf.math.pow(beta_2_t, local_step)
178 |         lr = (apply_state[(var_device, var_dtype)]['lr_t'] *
179 |                     (tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
180 |         apply_state[(var_device, var_dtype)].update(
181 |             dict(
182 |                 lr=lr,
183 |                 epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
184 |                 beta_1_t=beta_1_t,
185 |                 beta_1_power=beta_1_power,
186 |                 one_minus_beta_1_t=1 - beta_1_t,
187 |                 beta_2_t=beta_2_t,
188 |                 beta_2_power=beta_2_power,
189 |                 one_minus_beta_2_t=1 - beta_2_t))
190 | 
191 |     def set_weights(self, weights):
192 |         params = self.weights
193 |         num_vars = int((len(params) - 1) / 2)
194 |         if len(weights) == 3 * num_vars + 1:
195 |             weights = weights[:len(params)]
196 |         super(AdaBelief, self).set_weights(weights)
197 | 
198 |     def _resource_apply_dense(self, grad, var, apply_state=None):
199 |         var_device, var_dtype = var.device, var.dtype.base_dtype
200 |         coefficients = ((apply_state or {}).get((var_device, var_dtype))
201 |                         or self._fallback_apply_state(var_device, var_dtype))
202 | 
203 |         # m_t = beta1 * m + (1 - beta1) * g_t
204 |         m = self.get_slot(var, 'm')
205 |         m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
206 |         m_t = tf.compat.v1.assign(m, 
207 |                                m * coefficients['beta_1_t'] + m_scaled_g_values,
208 |                                use_locking=self._use_locking)
209 | 
210 |         # v_t = beta2 * v + (1 - beta2) * ((g_t-m_t) * (g_t-m_t))
211 |         v = self.get_slot(var, 'v')
212 |         grad_dev = grad - m_t 
213 |         v_scaled_g_values = (grad_dev * grad_dev) * coefficients['one_minus_beta_2_t']
214 |         v_t = tf.compat.v1.assign(v, 
215 |                                   v * coefficients['beta_2_t'] + v_scaled_g_values,
216 |                                   use_locking=self._use_locking)
217 | 
218 |         if not self.amsgrad:
219 |             v_sqrt = tf.math.sqrt(v_t)
220 |             var_update = tf.compat.v1.assign_sub(
221 |                 var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']),
222 |                 use_locking=self._use_locking)
223 |             return tf.group(*[var_update, m_t, v_t])
224 |         else:
225 |             v_hat = self.get_slot(var, 'vhat')
226 |             v_hat_t = tf.math.maximum(v_hat, v_t)
227 |             with ops.control_dependencies([v_hat_t]):
228 |                 v_hat_t = tf.compat.v1.assign(
229 |                     v_hat, v_hat_t, use_locking=self._use_locking)
230 |             v_hat_sqrt = tf.math.sqrt(v_hat_t)
231 |             var_update = tf.compat.v1.assign_sub(
232 |                 var,
233 |                 coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']),
234 |                 use_locking=self._use_locking)
235 |             return tf.group(*[var_update, m_t, v_t, v_hat_t])
236 | 
237 |     def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
238 |         var_device, var_dtype = var.device, var.dtype.base_dtype
239 |         coefficients = ((apply_state or {}).get((var_device, var_dtype))
240 |                         or self._fallback_apply_state(var_device, var_dtype))
241 | 
242 |         # m_t = beta1 * m + (1 - beta1) * g_t
243 |         m = self.get_slot(var, 'm')
244 |         m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
245 |         m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'],
246 |                                                      use_locking=self._use_locking)
247 |         with ops.control_dependencies([m_t]):
248 |             m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
249 | 
250 |         # v_t = beta2 * v + (1 - beta2) * ((g_t-m_t) * (g_t-m_t))
251 |         v = self.get_slot(var, 'v')
252 |         grad_dev = grad - m_t 
253 |         v_scaled_g_values = (grad_dev * grad_dev) * coefficients['one_minus_beta_2_t']
254 |         v_t = tf.compat.v1.assign(v, v * coefficients['beta_2_t'],
255 |                                use_locking=self._use_locking)
256 |         with ops.control_dependencies([v_t]):
257 |             v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
258 | 
259 |         if not self.amsgrad:
260 |             v_sqrt = tf.math.sqrt(v_t)
261 |             var_update = tf.compat.v1.assign_sub(
262 |                 var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']),
263 |                 use_locking=self._use_locking)
264 |             return tf.group(*[var_update, m_t, v_t])
265 |         else:
266 |             v_hat = self.get_slot(var, 'vhat')
267 |             v_hat_t = tf.math.maximum(v_hat, v_t)
268 |             with ops.control_dependencies([v_hat_t]):
269 |                 v_hat_t = tf.compat.v1.assign(
270 |                     v_hat, v_hat_t, use_locking=self._use_locking)
271 |             v_hat_sqrt = tf.math.sqrt(v_hat_t)
272 |             var_update = tf.compat.v1.assign_sub(
273 |                 var,
274 |                 coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']),
275 |                 use_locking=self._use_locking)
276 |             return tf.group(*[var_update, m_t, v_t, v_hat_t])
277 | 
278 |     def get_config(self):
279 |         config = super(AdaBelief, self).get_config()
280 |         config.update({
281 |             'learning_rate': self._serialize_hyperparameter('learning_rate'),
282 |             'decay': self._serialize_hyperparameter('decay'),
283 |             'beta_1': self._serialize_hyperparameter('beta_1'),
284 |             'beta_2': self._serialize_hyperparameter('beta_2'),
285 |             'epsilon': self.epsilon,
286 |             'amsgrad': self.amsgrad,
287 |         })
288 |         return config
289 | 
290 | 
291 | def sigmoid_focal_crossentropy(
292 |     y_true,
293 |     y_pred,
294 |     alpha = 0.25,
295 |     gamma = 2.0,
296 |     from_logits: bool = False,
297 | ) -> tf.Tensor:
298 |     """Implements the focal loss function.
299 | 
300 |     Focal loss was first introduced in the RetinaNet paper
301 |     (https://arxiv.org/pdf/1708.02002.pdf). Focal loss is extremely useful for
302 |     classification when you have highly imbalanced classes. It down-weights
303 |     well-classified examples and focuses on hard examples. The loss value is
304 |     much high for a sample which is misclassified by the classifier as compared
305 |     to the loss value corresponding to a well-classified example. One of the
306 |     best use-cases of focal loss is its usage in object detection where the
307 |     imbalance between the background class and other classes is extremely high.
308 | 
309 |     Args:
310 |         y_true: true targets tensor.
311 |         y_pred: predictions tensor.
312 |         alpha: balancing factor.
313 |         gamma: modulating factor.
314 | 
315 |     Returns:
316 |         Weighted loss float `Tensor`. If `reduction` is `NONE`,this has the
317 |         same shape as `y_true`; otherwise, it is scalar.
318 |     """
319 |     if gamma and gamma < 0:
320 |         raise ValueError("Value of gamma should be greater than or equal to zero.")
321 | 
322 |     y_pred = tf.convert_to_tensor(y_pred)
323 |     y_true = tf.cast(y_true, dtype=y_pred.dtype)
324 | 
325 |     # Get the cross_entropy for each entry
326 |     ce = tf.keras.backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits)
327 | 
328 |     # If logits are provided then convert the predictions into probabilities
329 |     if from_logits:
330 |         pred_prob = tf.sigmoid(y_pred)
331 |     else:
332 |         pred_prob = y_pred
333 | 
334 |     p_t = (y_true * pred_prob) + ((1 - y_true) * (1 - pred_prob))
335 |     alpha_factor = 1.0
336 |     modulating_factor = 1.0
337 | 
338 |     if alpha:
339 |         alpha = tf.cast(alpha, dtype=y_true.dtype)
340 |         alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha)
341 | 
342 |     if gamma:
343 |         gamma = tf.cast(gamma, dtype=y_true.dtype)
344 |         modulating_factor = tf.pow((1.0 - p_t), gamma)
345 | 
346 |     # compute the final loss and return
347 |     return tf.reduce_mean(tf.reduce_sum(alpha_factor * modulating_factor * ce, axis=-1), axis=-1)
348 | 
349 | 
350 | def unitwise_norm(x):
351 |     if len(x.get_shape()) <= 1:  # Scalars and vectors
352 |         axis = None
353 |         keepdims = False
354 |     elif len(x.get_shape()) in [2, 3]:  # Linear layers of shape IO or multihead linear
355 |         axis = 0
356 |         keepdims = True
357 |     elif len(x.get_shape()) == 4:  # Conv kernels of shape HWIO
358 |         axis = [0, 1, 2,]
359 |         keepdims = True
360 |     else:
361 |         raise ValueError(f"Got a parameter with shape not in [1, 2, 4]! {x}")
362 |     return compute_norm(x, axis, keepdims)
363 | 
364 | 
365 | def compute_norm(x, axis, keepdims):
366 |     return tf.math.reduce_sum(x ** 2, axis=axis, keepdims=keepdims) ** 0.5
367 | 
368 |     


--------------------------------------------------------------------------------
/utils_test.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import unittest
  3 | from utils import *
  4 | 
  5 | 
  6 | class UtilsTest(unittest.TestCase):
  7 |     def test_seq_to_windows(self):
  8 |         seq = np.array([1, 2, 3, 4, 5])
  9 |         window = np.array([-3, -1, 0, 1, 3])
 10 | 
 11 |         target = np.array([[0, 0, 1, 2, 4],
 12 |                            [0, 1, 2, 3, 5],
 13 |                            [0, 2, 3, 4, 0],
 14 |                            [1, 3, 4, 5, 0],
 15 |                            [2, 4, 5, 0, 0]])
 16 |         self.assertEqual(target.tolist(), 
 17 |                          seq_to_windows(seq, window).tolist())
 18 |         self.assertEqual(target[::2].tolist(),
 19 |                          seq_to_windows(seq, window, 2).tolist())
 20 | 
 21 |     def test_windows_to_seq(self):
 22 |         windows = np.array([[0, 0, 1, 2, 4],
 23 |                             [0, 1, 2, 3, 5],
 24 |                             [0, 2, 3, 4, 0],
 25 |                             [1, 3, 4, 5, 0],
 26 |                             [2, 4, 5, 0, 0]])
 27 |         window = np.array([-3, -1, 0, 1, 3])
 28 | 
 29 |         target = np.array([1, 2, 3, 4, 5])
 30 |         self.assertTrue(
 31 |             np.allclose(target, windows_to_seq(windows, window)))
 32 |         self.assertTrue(
 33 |             np.allclose(target, windows_to_seq(windows[::2], window, skip=2)))
 34 | 
 35 |     def test_list_to_generator(self):
 36 |         n_samples = 4
 37 |         x = np.random.randn(n_samples, 30)
 38 |         y = np.random.randn(n_samples)
 39 | 
 40 |         x_gen = list_to_generator(x)
 41 |         self.assertTrue(callable(x_gen))
 42 |         for i, x_ in enumerate(x_gen()):
 43 |             self.assertEqual(x[i].tolist(), x_.tolist())
 44 | 
 45 |         xy_gen = list_to_generator((x, y))
 46 |         self.assertTrue(callable(xy_gen))
 47 |         for i, (x_, y_) in enumerate(xy_gen()):
 48 |             self.assertEqual(x[i].tolist(), x_.tolist())
 49 |             self.assertEqual(y[i], y_)
 50 | 
 51 |     def test_load_data(self):
 52 |         raise NotImplemented('TODO: not yet implemented')
 53 | 
 54 |     def test_apply_kernel_regularizer(self):
 55 |         n_samples, in_shape, out_shape = 128, 4, 4
 56 |         x = np.random.randn(n_samples, in_shape)
 57 |         y = np.random.randint(out_shape, size=n_samples)
 58 | 
 59 |         # model without regularizer
 60 |         tf.random.set_seed(0)
 61 |         model = tf.keras.models.Sequential()
 62 |         model.add(tf.keras.layers.Input(shape=(in_shape,)))
 63 |         model.add(tf.keras.layers.Dense(out_shape, activation='softmax'))
 64 |         model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
 65 | 
 66 |         model.fit(x, y, verbose=False)
 67 |         base_weights = model.weights[:]
 68 | 
 69 |         # model with regularizer
 70 |         tf.random.set_seed(0)
 71 |         model = tf.keras.models.Sequential()
 72 |         model.add(tf.keras.layers.Input(shape=(in_shape,)))
 73 |         model.add(tf.keras.layers.Dense(out_shape, activation='softmax'))
 74 | 
 75 |         model = apply_kernel_regularizer(model, tf.keras.regularizers.l2(0.1))
 76 |         model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
 77 | 
 78 |         model.fit(x, y, verbose=False)
 79 |         new_weights = model.weights[:]
 80 | 
 81 |         for b, n in zip(base_weights, new_weights):
 82 |             self.assertNotEqual(b.numpy().tolist(), n.numpy().tolist())
 83 | 
 84 |     def test_safe_div(self):
 85 |         raise NotImplemented('TODO: not yet implemented')
 86 | 
 87 |     def test_predict(self):
 88 |         raise NotImplemented('TODO: not yet implemented')
 89 | 
 90 |     def test_adabelief(self):
 91 |         n_samples, in_shape, out_shape = 128, 4, 4
 92 |         x = np.random.randn(n_samples, in_shape)
 93 |         y = np.random.randint(out_shape, size=n_samples)
 94 | 
 95 |         # AdaBelief
 96 |         tf.random.set_seed(0)
 97 |         model = tf.keras.models.Sequential()
 98 |         model.add(tf.keras.layers.Input(shape=(in_shape,)))
 99 |         model.add(tf.keras.layers.Dense(out_shape, activation='softmax'))
100 |         model.compile(optimizer=AdaBelief(), loss='sparse_categorical_crossentropy')
101 |         model.fit(x, y, epochs=32, verbose=True)
102 |         print()
103 | 
104 |         # Adam
105 |         tf.random.set_seed(0)
106 |         model = tf.keras.models.Sequential()
107 |         model.add(tf.keras.layers.Input(shape=(in_shape,)))
108 |         model.add(tf.keras.layers.Dense(out_shape, activation='softmax'))
109 |         model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
110 |         model.fit(x, y, epochs=32, verbose=True)
111 | 
112 | if __name__ == '__main__':
113 |     os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
114 |     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
115 |     unittest.main()
116 | 


--------------------------------------------------------------------------------