├── .gitignore ├── README.md ├── data_utils.py ├── eval.py ├── get_csv_data.py ├── metrics.py ├── metrics_test.py ├── pipeline.py ├── pipeline_test.py ├── requirements.txt ├── sample_answer.json ├── sj_train.py ├── swa.py ├── trainer.py ├── transforms.py ├── transforms_test.py ├── utils.py └── utils_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | tensorboard_log/ 132 | *.h5 133 | *.csv 134 | *.log 135 | *.pickle 136 | *.npy 137 | *.wav -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # challenge 2 | ready for AI grand challenge 3 | 4 | 5 | ## Contributors: 6 | 7 | ### Prof. Jong-hwan Ko, Sungkyunkwan University 8 | ### Phd. Ji-ho Chang 9 | ### Tae-soo Kim 10 | ### Daniel Rho 11 | ### Seung-jin Lee 12 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | import tensorflow as tf 4 | 5 | from utils import EPSILON, safe_div 6 | from transforms import mask 7 | 8 | 9 | def load_wav(wav_fname: str): 10 | ''' 11 | OUTPUT 12 | complex_specs: list of complex spectrograms 13 | each complex spectrogram has shape of 14 | [freq, time, chan*2] 15 | ''' 16 | 17 | stft = torchaudio.transforms.Spectrogram(512, power=None) 18 | 19 | wav, r = torchaudio.load(wav_fname) 20 | wav = torchaudio.compliance.kaldi.resample_waveform( 21 | wav, r, 16000) 22 | wav = normalize(wav) 23 | wav = stft(wav) 24 | 25 | # [chan, freq, time, 2] -> [freq, time, chan, 2] 26 | wav = wav.numpy().transpose(1, 2, 3, 0) 27 | wav = wav.reshape((*wav.shape[:2], -1)) 28 | 29 | return wav 30 | 31 | 32 | def normalize(wav): 33 | rms = torch.sqrt(torch.mean(torch.pow(wav, 2))) * 10 34 | return wav / rms 35 | 36 | 37 | def minmax(x, y=None): 38 | # batch-wise pre-processing 39 | axis = tuple(range(1, len(x.shape))) 40 | 41 | # MIN-MAX 42 | x_max = tf.math.reduce_max(x, axis=axis, keepdims=True) 43 | x_min = tf.math.reduce_min(x, axis=axis, keepdims=True) 44 | x = safe_div(x-x_min, x_max-x_min) 45 | if y is not None: 46 | return x, y 47 | return x 48 | 49 | 50 | def log_on_mel(mel, labels=None): 51 | mel = tf.math.log(mel + EPSILON) 52 | 53 | if labels is not None: 54 | return mel, labels 55 | return mel 56 | 57 | 58 | def augment(specs, labels, time_axis=-2, freq_axis=-3): 59 | specs = mask(specs, axis=time_axis, max_mask_size=24, n_mask=6) 60 | specs = mask(specs, axis=freq_axis, max_mask_size=16) 61 | return specs, labels 62 | 63 | 64 | def to_frame_labels(x, y): 65 | """ 66 | :param y: [..., n_voices, n_frames, n_classes] 67 | :return: [..., n_frames, n_classes] 68 | """ 69 | y = tf.reduce_sum(y, axis=-3) 70 | return x, y 71 | 72 | 73 | def mono_chan(x, y=None): 74 | if y is not None: 75 | return x[..., :1] + x[..., 1:], y 76 | return x 77 | 78 | 79 | def stereo_mono(x, y=None): 80 | if y is None: 81 | return tf.concat([x[..., :2], x[..., :1] + x[..., 1:2], x[..., 2:4], x[..., 2:3] + x[..., 3:4]], -1) 82 | return tf.concat([x[..., :2], x[..., :1] + x[..., 1:2], x[..., 2:4], x[..., 2:3] + x[..., 3:4]], -1), y 83 | 84 | 85 | def label_downsample(resolution=32): 86 | def _label_downsample(x, y): 87 | if isinstance(y, (list, tuple)): 88 | y_ = y[0] 89 | y_ = tf.keras.layers.AveragePooling1D(resolution, resolution, padding='same')(y_) 90 | y_ = tf.cast(y_ >= 0.5, y_.dtype)[:resolution] 91 | y = (y_,) + tuple([*y[1:]]) 92 | else: 93 | y = tf.keras.layers.AveragePooling1D(resolution, resolution, padding='same')(y) 94 | y = tf.cast(y >= 0.5, y.dtype)[:resolution] 95 | 96 | return x, y 97 | return _label_downsample 98 | 99 | 100 | def random_merge_aug(number): 101 | def _random_merge_aug(x, y=None): 102 | chan = x.shape[-1] // 2 103 | if chan != 2: 104 | raise ValueError('This augment can be used in 2 channel audio') 105 | 106 | real = x[...,:chan] 107 | imag = x[...,chan:] 108 | 109 | factor = tf.random.uniform((1, 1, number - chan), 0.1, 0.9) 110 | aug_real = factor * tf.repeat(real[..., :1], number - chan, -1) + tf.sqrt(1 - factor) * tf.repeat(real[..., 1:], number - chan, -1) 111 | 112 | real = tf.concat([real, aug_real], -1) 113 | imag = tf.concat([imag, tf.repeat(imag[...,:1] + imag[...,1:], number - chan, -1)], -1) 114 | if y is not None: 115 | return tf.concat([real, imag], -1), y 116 | return tf.concat([real, imag], -1) 117 | return _random_merge_aug 118 | 119 | 120 | def multiply_label(multiply_factor): 121 | def _multiply_label(x, y): 122 | return x, y * multiply_factor 123 | return _multiply_label 124 | 125 | 126 | def stft_filter(filter_num): 127 | def _stft_filter(x, y=None): 128 | mask = tf.concat([tf.ones([1] + [*x.shape[1:]], x.dtype), 129 | tf.zeros([filter_num] + [*x.shape[1:]], x.dtype), 130 | tf.ones([x.shape[0] - filter_num - 1] + [*x.shape[1:]], x.dtype), 131 | ], 0) 132 | x *= mask 133 | if y is None: 134 | return x 135 | return x, y 136 | return _stft_filter 137 | 138 | 139 | def speech_enhancement_preprocess(x, y=None): 140 | """ 141 | :param y: ([..., n_voices, n_frames, n_classes], ..., ...) 142 | :return: [..., n_frames, n_classes] 143 | """ 144 | x = x[1:,...,:x.shape[-1] // 2] 145 | if y is None: 146 | return x 147 | y = (tf.reduce_sum(y[0], axis=-3), y[1][1:, ...,:x.shape[-1] // 2], y[2][1:, ...,:x.shape[-1] // 2]) 148 | return x, y 149 | 150 | 151 | if __name__ == '__main__': 152 | import glob 153 | wavs = glob.glob('/codes/2020_track3/t3_audio/*.wav') 154 | print(wavs) 155 | stfts = [load_wav(wav) for wav in wavs] 156 | 157 | for stft in stfts: 158 | print(stft.shape) 159 | 160 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | import tensorflow as tf 3 | import json 4 | 5 | from transforms import * 6 | from utils import * 7 | from data_utils import * 8 | 9 | from sj_train import get_model, ARGS, random_merge_aug, stereo_mono, stft_filter, label_downsample_model 10 | from metrics import Challenge_Metric, output_to_metric, get_er, evaluate 11 | 12 | 13 | def minmax_log_on_mel(mel, labels=None): 14 | # batch-wise pre-processing 15 | axis = tuple(range(1, len(mel.shape))) 16 | 17 | # MIN-MAX 18 | mel_max = tf.math.reduce_max(mel, axis=axis, keepdims=True) 19 | mel_min = tf.math.reduce_min(mel, axis=axis, keepdims=True) 20 | mel = safe_div(mel-mel_min, mel_max-mel_min) 21 | 22 | # LOG 23 | mel = tf.math.log(mel + EPSILON) 24 | 25 | if labels is not None: 26 | return mel, labels 27 | return mel 28 | 29 | 30 | def second2frame(seconds: list, frame_num, resolution): 31 | # seconds = [[class, start, end], ...] 32 | frames = np.zeros([frame_num, 3], dtype=np.float32) 33 | for second in seconds: 34 | class_num = second[0] 35 | start = int(np.round(second[1] * resolution)) 36 | end = int(np.round(second[2] * resolution)) 37 | frames[start:end,class_num] += 1 38 | return tf.convert_to_tensor(frames, dtype=tf.float32) 39 | 40 | 41 | 42 | if __name__ == "__main__": 43 | config = ARGS() 44 | config.args.add_argument('--verbose', help='verbose', type=bool, default=True) 45 | config.args.add_argument('--p', help='parsing name', action='store_true') 46 | config.args.add_argument('--path', type=str, default='') 47 | config = config.get() 48 | if config.p: 49 | parsed_name = config.name.split('_') 50 | if parsed_name[0][0] not in ('B', 'v'): 51 | parsed_name = parsed_name[1:] 52 | if parsed_name[0] == 'vad': 53 | config.model_type = 'vad' 54 | config.model = 1 55 | else: 56 | config.model = int(parsed_name[0][-1]) 57 | config.v = int(parsed_name[1][-1]) 58 | config.n_mels = int(parsed_name[6][3:]) 59 | config.n_chan = int(parsed_name[7][-1]) 60 | config.n_frame = int(parsed_name[9].split('framelen')[-1]) 61 | os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus 62 | 63 | model = get_model(config) 64 | model.load_weights(os.path.join(config.path, f'{config.name}.h5')) 65 | final_score = evaluate(config, model, verbose=config.verbose) 66 | 67 | -------------------------------------------------------------------------------- /get_csv_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from glob import glob 3 | import os 4 | from numpy import max, mean 5 | 6 | from tqdm import tqdm 7 | 8 | from sj_train import ARGS, get_model 9 | from metrics import evaluate 10 | 11 | 12 | def main(config): 13 | os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus 14 | data_path = config.path 15 | paths = sorted(glob(os.path.join(data_path, '*.csv'))) 16 | result_path = os.path.join(data_path, 'result.csv') 17 | category = ['이름', '모델', 'version', 'batch', 'lr', 'optimizer', 'loss function', 'input', 'chan', 'output', 'epoch', 'cos_sim', 'er', 'f1_score', 'loss', 'val_cos_sim', 'val_er', 'val_f1_score', 'val_loss', 'test_er', 'swa_test_er', 'sample_test_er'] 18 | 19 | prev_lines = [category] 20 | 21 | if len(prev_lines) == 0: 22 | with open(result_path, 'w') as f: 23 | wr = csv.writer(f) 24 | wr.writerow(category) 25 | 26 | for path in tqdm(paths): 27 | if path == result_path: 28 | continue 29 | 30 | lines = [] 31 | with open(path, 'r') as f: 32 | data = csv.reader(f) 33 | for i, line in enumerate(data): 34 | if i == 0: 35 | continue 36 | lines.append(line) 37 | data = lines[max([len(lines)-config.patience, 0])] 38 | filename = os.path.splitext(path.split('/')[-1])[0] 39 | if 'vad' not in filename: 40 | name = filename[filename.find('B'):].split('_') 41 | else: 42 | name = filename[filename.find('vad'):].split('_') 43 | model_name = name[0] 44 | version = name[1][1:] 45 | lr = name[2][2:] 46 | batch = name[3].split('batch')[-1] 47 | opt = name[5] 48 | n_mel = name[6].split('mel')[-1] 49 | chan = name[7].split('chan')[-1] 50 | loss = name[8] 51 | framelen = name[9].split('framelen')[-1] 52 | if 'vad' in name: 53 | config.model_type = 'vad' 54 | elif 'se' in name: 55 | config.model_type = 'se' 56 | else: 57 | config.model_type = 'eff' 58 | evaluation = max([len(lines)-config.patience, 0]) > 5 59 | 60 | 61 | config.model = model_name[1:] 62 | config.v = int(version) 63 | config.n_mels = int(n_mel) 64 | config.n_chan = int(chan) 65 | config.n_frame = int(framelen) 66 | try: 67 | model = get_model(config) 68 | except ValueError: 69 | continue 70 | 71 | if config.model_type == 'se': 72 | output = str(tuple([i for i in model.output[0].shape[1:]])) 73 | else: 74 | output = str(tuple([i for i in model.output.shape[1:]])) 75 | data = [filename, 'vad' if config.model_type == 'vad' else model_name, version, batch, lr, opt, loss, str(tuple([i for i in model.input.shape[1:-1]])), chan, output] + data 76 | if os.path.exists(f'{os.path.splitext(path)[0]}.h5'): 77 | if evaluation: 78 | try: 79 | model.load_weights(f'{os.path.splitext(path)[0]}.h5') 80 | score = evaluate(config, model, overlap_hop=int(framelen) // 2, verbose=True) 81 | except: 82 | continue 83 | else: 84 | score = 1.0 85 | data += [mean(score)] 86 | else: 87 | data += 'None' 88 | 89 | if os.path.exists(f'{os.path.splitext(path)[0]}_SWA.h5'): 90 | if evaluation: 91 | model.load_weights(f'{os.path.splitext(path)[0]}_SWA.h5') 92 | score = evaluate(config, model, overlap_hop=int(framelen) // 2, verbose=True) 93 | else: 94 | score = 1.0 95 | data += [mean(score)] 96 | else: 97 | data += ['None'] 98 | 99 | if os.path.exists(f'{os.path.splitext(path)[0]}_sample.h5'): 100 | if evaluation: 101 | model.load_weights(f'{os.path.splitext(path)[0]}_sample.h5') 102 | score = evaluate(config, model, overlap_hop=int(framelen) // 2, verbose=True) 103 | else: 104 | score = 1.0 105 | data += [mean(score)] 106 | else: 107 | data += ['None'] 108 | 109 | prev_lines.append(data) 110 | 111 | with open(result_path, 'w') as f: 112 | wr = csv.writer(f) 113 | wr.writerows(prev_lines) 114 | 115 | 116 | if __name__ == '__main__': 117 | args = ARGS() 118 | args.args.add_argument('--path', type=str, default='') 119 | main(args.get()) 120 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | from glob import glob 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from tensorflow.keras.callbacks import * 7 | import tensorflow_addons as tfa 8 | 9 | from utils import * 10 | from data_utils import * 11 | from transforms import * 12 | 13 | 14 | class eval_callback(tf.keras.callbacks.Callback): 15 | def __init__(self, config, NAME): 16 | super(eval_callback, self).__init__() 17 | self.config = config 18 | self.name = NAME 19 | self.score = np.inf 20 | 21 | def on_epoch_end(self, epoch, logs=None): 22 | if epoch % 5 == 2: 23 | model = tf.keras.models.clone_model(self.model) 24 | model.load_weights(self.name) 25 | score = tf.reduce_mean(evaluate(self.config, model, verbose=True)) 26 | if score <= self.score: 27 | self.score = score 28 | tf.keras.models.save_model(model, os.path.splitext(self.name)[0] + '_sample.h5') 29 | 30 | 31 | def evaluate(config, model, overlap_hop = 512, verbose: bool = False): 32 | final_score = [] 33 | with open('sample_answer.json') as f: 34 | answer_gt = json.load(f) 35 | answer_gt = answer_gt['task2_answer'] 36 | sr = 16000 37 | hop = 256 38 | metric = Challenge_Metric() 39 | 40 | for path in sorted(glob('*.wav')): 41 | inputs = load_wav(path) 42 | if config.n_chan == 1: 43 | inputs = mono_chan(inputs) 44 | elif config.n_chan == 3: 45 | inputs = stereo_mono(inputs) 46 | elif config.n_chan > 3: 47 | inputs = random_merge_aug(config.n_chan)(inputs, None) 48 | 49 | if config.model_type != 'se': 50 | inputs = stft_filter(int(round(256 * 1000 / 16000)))(inputs) 51 | inputs = complex_to_magphase(inputs) 52 | inputs = magphase_to_mel(config.n_mels)(inputs) 53 | inputs = minmax(inputs) 54 | inputs = log_on_mel(inputs) 55 | else: 56 | # inputs = complex_to_magphase(inputs) 57 | inputs = speech_enhancement_preprocess(inputs) 58 | 59 | frame_len = inputs.shape[-2] 60 | inputs = tf.signal.frame(inputs, config.n_frame, overlap_hop, pad_end=True, axis=-2) 61 | inputs = tf.transpose(inputs, (1, 0, 2, 3)) 62 | preds = model.predict(inputs[..., :config.n_chan]) # [batch, time, class] 63 | 64 | if config.model_type == 'se' and config.v == 9: 65 | preds = preds[0] 66 | 67 | if config.v in label_downsample_model: 68 | resolution = config.n_frame / preds.shape[-2] 69 | preds = tf.keras.layers.UpSampling1D(resolution)(preds) 70 | 71 | preds = tf.transpose(preds, [2, 0, 1]) 72 | total_counts = tf.signal.overlap_and_add(tf.ones_like(preds), overlap_hop)[..., :frame_len] 73 | preds = tf.signal.overlap_and_add(preds, overlap_hop)[..., :frame_len] 74 | preds /= total_counts 75 | preds = tf.transpose(preds, [1, 0]) 76 | 77 | # smoothing 78 | smoothing_kernel_size = int(0.5 * sr) // hop # 0.5초 길이의 kernel 79 | preds = tf.keras.layers.AveragePooling1D(smoothing_kernel_size, 1, padding='same')(preds[tf.newaxis, ...])[0] 80 | preds = tf.keras.layers.MaxPooling1D(smoothing_kernel_size * 4, 1, padding='same')(preds[tf.newaxis, ...])[0] 81 | preds = tf.cast(preds >= 0.5, tf.float32) 82 | cls0, cls1, cls2 = metric.get_start_end_frame(preds) 83 | answer_gt_temp = tf.convert_to_tensor(answer_gt[os.path.basename(path)[:-4]]) 84 | answer_predict = output_to_metric(hop, sr)(cls0, cls1, cls2) 85 | er = get_er(answer_gt_temp, answer_predict) 86 | 87 | final_score.append(er) 88 | if verbose: 89 | print('FINAL SCORE:', np.mean(final_score)) 90 | return final_score 91 | 92 | 93 | class Challenge_Metric: 94 | def __init__(self, sr=16000, hop=256) -> None: 95 | self.reset_state() 96 | self.sr = sr 97 | self.hop = hop 98 | 99 | def get_start_end_time(self, data): 100 | data1, data2, data3 = self.get_start_end_frame(data) 101 | data1 = tf.cast(tf.round(data1 * self.hop / self.sr), tf.int32) 102 | data2 = tf.cast(tf.round(data2 * self.hop / self.sr), tf.int32) 103 | data3 = tf.cast(tf.round(data3 * self.hop / self.sr), tf.int32) 104 | data1 = tf.gather(data1, np.unique(data1, True, axis=0)[1]) 105 | data2 = tf.gather(data2, np.unique(data2, True, axis=0)[1]) 106 | data3 = tf.gather(data3, np.unique(data3, True, axis=0)[1]) 107 | return data1, data2, data3 108 | 109 | def get_start_end_frame(self, data): 110 | data_temp = tf.concat([tf.zeros([1,3]), data[:-1,:]], 0) 111 | diff_index = tf.where(data_temp != data) 112 | class_0 = diff_index[diff_index[:,1] == 0][:,0] 113 | class_1 = diff_index[diff_index[:,1] == 1][:,0] 114 | class_2 = diff_index[diff_index[:,1] == 2][:,0] 115 | 116 | if (class_0.shape[0] % 2 != 0): 117 | class_0 = tf.concat((class_0, tf.Variable([len(data)], dtype=tf.int64)),0) 118 | 119 | class_0 = tf.reshape(class_0, [-1, 2]) 120 | class_0 = tf.transpose(tf.concat([[class_0[:,0]], [class_0[:,1] -1]], 0)) 121 | 122 | if (class_1.shape[0] % 2 != 0): 123 | class_1 = tf.concat((class_1, tf.Variable([len(data)], dtype=tf.int64)),0) 124 | 125 | class_1 = tf.reshape(class_1, [-1, 2]) 126 | class_1 = tf.transpose(tf.concat([[class_1[:,0]], [class_1[:,1] -1]], 0)) 127 | 128 | if (class_2.shape[0] % 2 != 0): 129 | class_2 = tf.concat((class_2, tf.Variable([len(data)], dtype=tf.int64)),0) 130 | 131 | class_2 = tf.reshape(class_2, [-1, 2]) 132 | class_2 = tf.transpose(tf.concat([[class_2[:,0]], [class_2[:,1] -1]], 0)) 133 | return class_0, class_1, class_2 134 | 135 | def get_second_answer(self, data): 136 | data_second = np.asarray([self.hop*i//self.sr for i in range(len(data))]) 137 | second_true = np.zeros([np.max(data_second), 3]) 138 | for i in range(np.max(data_second)): 139 | second_true[i, 0] = (tf.reduce_mean(data[:, 0][data_second == i]) > 0.5) 140 | second_true[i, 1] = (tf.reduce_mean(data[:, 1][data_second == i]) > 0.5) 141 | second_true[i, 2] = (tf.reduce_mean(data[:, 2][data_second == i]) > 0.5) 142 | cls0, cls1, cls2 = self.get_1(second_true) 143 | cls0 = tf.cast(cls0, dtype=tf.int32) 144 | cls1 = tf.cast(cls1, dtype=tf.int32) 145 | cls2 = tf.cast(cls2, dtype=tf.int32) 146 | return cls0, cls1, cls2 147 | 148 | def reset_state(self): 149 | self.arr0 = tf.TensorArray(tf.int64, size=0, dynamic_size=True, clear_after_read=False) 150 | self.arr1 = tf.TensorArray(tf.int64, size=0, dynamic_size=True, clear_after_read=False) 151 | self.arr2 = tf.TensorArray(tf.int64, size=0, dynamic_size=True, clear_after_read=False) 152 | self.tmp0 = tf.TensorArray(tf.int64, size=2, dynamic_size=True, clear_after_read=True) 153 | self.tmp1 = tf.TensorArray(tf.int64, size=2, dynamic_size=True, clear_after_read=True) 154 | self.tmp2 = tf.TensorArray(tf.int64, size=2, dynamic_size=True, clear_after_read=True) 155 | self.ts0 = 0 # tmp size 156 | self.ts1 = 0 # tmp size 157 | self.ts2 = 0 # tmp size 158 | 159 | 160 | def extract_middle(y_pred): 161 | # [batch, time, cls] 162 | pred_starts = tf.clip_by_value(y_pred - tf.pad(y_pred, [[0, 0], [1, 0], [0, 0]])[:, :-1], 0, 1) 163 | pred_ends = tf.clip_by_value(y_pred - tf.pad(y_pred, [[0, 0], [0, 1], [0, 0]])[:, 1:], 0, 1) 164 | n_pred = tf.reduce_sum(tf.cast(pred_starts, tf.float32), (1, 2)) 165 | pred_starts = tf.where(pred_starts) 166 | pred_ends = tf.where(pred_ends) 167 | pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, -1]), -1) 168 | pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, 0]), 0) 169 | pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, -1]), -1) 170 | pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, 0]), 0) 171 | 172 | middle = tf.cast((pred_starts+pred_ends)/2, tf.int64) 173 | return middle 174 | 175 | 176 | def get_er(gt, predict): 177 | predict_2 = tf.identity(predict) 178 | predict_2 = tf.gather(predict_2, tf.argsort(predict_2[:,1])) 179 | gt = tf.gather(gt, tf.argsort(gt[:,1])) 180 | N = len(predict_2) + len(gt) 181 | answer = 0 182 | for gt_item in gt: 183 | remove = False 184 | for i, pred_item in enumerate(predict_2): 185 | if (gt_item[1] <= pred_item[1]) and (pred_item[1] <= gt_item[2]): 186 | if gt_item[0] == pred_item[0]: 187 | answer += 2 188 | temp = i 189 | remove = True 190 | break 191 | if remove: 192 | predict_2 = tf.concat((predict_2[:i,:], predict_2[i+1:, :]), axis=0) 193 | return (N - answer) / len(gt) 194 | 195 | 196 | def output_to_metric(hop, sr): 197 | hop = hop 198 | sr = sr 199 | def output_to_metric_(cls0, cls1, cls2): 200 | answer_list = tf.cast(tf.zeros([0,2]), tf.int32) 201 | 202 | for item in cls0: 203 | new_item = tf.cast(tf.stack([0, ((item[0] + item[1]) / 2)*hop/sr], 0), answer_list.dtype)[tf.newaxis, ...] 204 | answer_list = tf.concat([answer_list, new_item], axis=0) 205 | 206 | for item in cls1: 207 | new_item = tf.cast(tf.stack([1, ((item[0] + item[1]) / 2)*hop/sr], 0), answer_list.dtype)[tf.newaxis, ...] 208 | answer_list = tf.concat([answer_list, new_item], axis=0) 209 | 210 | for item in cls2: 211 | new_item = tf.cast(tf.stack([2, ((item[0] + item[1]) / 2)*hop/sr], 0), answer_list.dtype)[tf.newaxis, ...] 212 | answer_list = tf.concat([answer_list, new_item], axis=0) 213 | return answer_list 214 | return output_to_metric_ 215 | 216 | 217 | def er_score(threshold=0.5, smoothing=True): 218 | threshold = tf.constant(threshold, tf.float32) 219 | 220 | def er(y_true, y_pred): 221 | y_true = tf.cast(y_true >= threshold, tf.int32) 222 | if smoothing: 223 | smoothing_kernel_size = int(0.5 * 16000) // 256 # 0.5 224 | y_pred = tf.keras.layers.AveragePooling1D(smoothing_kernel_size, padding='same')(y_pred) 225 | y_pred = tf.cast(y_pred >= threshold, tf.int32) 226 | 227 | # True values 228 | # [batch, time, cls] 229 | true_starts = tf.clip_by_value( 230 | y_true - tf.pad(y_true, [[0, 0], [1, 0], [0, 0]])[:, :-1], 0, 1) 231 | true_ends = tf.clip_by_value( 232 | y_true - tf.pad(y_true, [[0, 0], [0, 1], [0, 0]])[:, 1:], 0, 1) 233 | n_true = tf.reduce_sum(tf.cast(true_starts, tf.float32), (1, 2)) 234 | 235 | true_starts = tf.where(true_starts) 236 | true_ends = tf.where(true_ends) 237 | true_starts = tf.gather(true_starts, tf.argsort(true_starts[:, -1]), -1) 238 | true_starts = tf.gather(true_starts, tf.argsort(true_starts[:, 0]), 0) 239 | true_ends = tf.gather(true_ends, tf.argsort(true_ends[:, -1]), -1) 240 | true_ends = tf.gather(true_ends, tf.argsort(true_ends[:, 0]), 0) 241 | 242 | # prediction values 243 | pred_starts = tf.clip_by_value( 244 | y_pred - tf.pad(y_pred, [[0, 0], [1, 0], [0, 0]])[:, :-1], 0, 1) 245 | pred_ends = tf.clip_by_value( 246 | y_pred - tf.pad(y_pred, [[0, 0], [0, 1], [0, 0]])[:, 1:], 0, 1) 247 | n_pred = tf.reduce_sum(tf.cast(pred_starts, tf.float32), (1, 2)) 248 | 249 | pred_starts = tf.where(pred_starts) 250 | pred_ends = tf.where(pred_ends) 251 | pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, -1]), -1) 252 | pred_starts = tf.gather(pred_starts, tf.argsort(pred_starts[:, 0]), 0) 253 | pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, -1]), -1) 254 | pred_ends = tf.gather(pred_ends, tf.argsort(pred_ends[:, 0]), 0) 255 | 256 | middle = tf.cast((pred_starts+pred_ends)/2, tf.int64) 257 | 258 | # correct: correct batch and cls (true, pred) 259 | correct = ( 260 | true_starts[:, ::2, None]==tf.transpose(middle, (1, 0))[None, ::2]) 261 | correct = tf.reduce_min(tf.cast(correct, tf.float32), axis=1) 262 | 263 | mid_time = tf.transpose(middle[:, 1:2], (1, 0)) 264 | correct *= tf.cast(true_starts[:, 1:2] <= mid_time, tf.float32) 265 | correct *= tf.cast(true_ends[:, 1:2] >= mid_time, tf.float32) 266 | correct = tf.reduce_max(tf.pad(correct, [[0, 0], [0, 1]]), -1) 267 | 268 | correct_per_sample = tf.reduce_sum( 269 | tf.one_hot(true_starts[:, 0], tf.shape(y_pred)[0])*correct[:, None], 270 | 0) 271 | score = n_true + n_pred - 2 * correct_per_sample 272 | score /= tf.clip_by_value(n_true, 1, tf.reduce_max(n_true)) 273 | return score 274 | return er 275 | 276 | 277 | def cos_sim(y_true, y_pred): 278 | if isinstance(y_true, tuple): 279 | y_true = y_true[0] 280 | if isinstance(y_pred, tuple): 281 | y_pred = y_pred[0] 282 | mask = tf.cast( 283 | tf.reduce_sum(y_true, axis=-2) > 0., tf.float32) # [None, 3] 284 | mask = safe_div(mask, tf.reduce_sum(mask, axis=-1, keepdims=True)) 285 | return tf.reduce_sum( 286 | tf.keras.losses.cosine_similarity(y_true, y_pred, axis=-2) * mask, 287 | axis=-1) 288 | 289 | 290 | def f1_score(): 291 | f1_score_fn = tfa.metrics.F1Score(num_classes=3, threshold=0.5, average='micro') 292 | def f1_score(y_true, y_pred): 293 | if isinstance(y_true, tuple): 294 | y_true = y_true[0] 295 | if isinstance(y_pred, tuple): 296 | y_pred = y_pred[0] 297 | return f1_score_fn(y_true, y_pred) 298 | return f1_score 299 | 300 | -------------------------------------------------------------------------------- /metrics_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from metrics import * 5 | 6 | 7 | class MetricsTest(tf.test.TestCase): 8 | def setUp(self): 9 | self.gt = tf.convert_to_tensor([[0, 0, 10], [2, 0, 20], [1, 15, 30], [2, 31, 40], [1, 32, 35]]) 10 | self.predict = tf.convert_to_tensor([[1, 5], [1, 19], [2, 32], [2, 38], [0, 38]]) 11 | 12 | def test_er_score(self): 13 | gt_numpy = self.gt.numpy() 14 | gt_array = np.zeros([2, 40, 3]) 15 | pred_array = np.zeros([2, 40, 3]) 16 | for item in gt_numpy: 17 | gt_array[0, item[1]:item[2], item[0]] = 1 18 | gt_array[1, item[1]:item[2], item[0]] = 1 19 | for item in self.predict.numpy(): 20 | pred_array[0, item[1]-2:item[1]+2, item[0]] = 1 21 | pred_array[1, item[1]-2:item[1]+2, item[0]] = 1 22 | 23 | er_func = er_score(smoothing=False) 24 | er = er_func(gt_array, pred_array) 25 | self.assertEqual(tf.reduce_mean(er), 1.2) 26 | 27 | 28 | if __name__ == '__main__': 29 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 30 | tf.test.main() 31 | 32 | -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from functools import partial 3 | from utils import list_to_generator 4 | 5 | 6 | def merge_complex_specs(background, 7 | voices_and_labels, 8 | noises=None, 9 | n_frame=300, 10 | n_classes=3, 11 | t_axis=1, # time-axis 12 | min_ratio=2/3, 13 | min_noise_ratio=1/2, 14 | snr=-20, 15 | seperate_noise_voice=False): 16 | ''' 17 | OUTPUT: 18 | complex_spec: (freq, time, chan2) 19 | labels: (n_voices, time, n_classes) 20 | ''' 21 | voices, labels = voices_and_labels 22 | output_shape = tuple( 23 | [s if i != t_axis else n_frame 24 | for i, s in enumerate(background.shape)]) 25 | n_dims = len(output_shape) 26 | axis = tuple(i for i in range(n_dims) if i != t_axis) 27 | 28 | # background and its label 29 | bg_frame = tf.shape(background)[t_axis] 30 | background = tf.tile( 31 | background, 32 | [1 if i != t_axis else (n_frame+bg_frame-1) // bg_frame 33 | for i in range(n_dims)]) 34 | # background = tf.pad(background, [[4, 0], [0, 0], [0, 0]]) 35 | complex_spec = tf.image.random_crop(background, output_shape) 36 | 37 | only_voice = tf.zeros_like(complex_spec) 38 | only_noise = tf.identity(complex_spec) 39 | 40 | # voices 41 | max_voices = tf.shape(voices)[0] 42 | if max_voices > 1: 43 | n_voices = tf.random.uniform([], minval=1, maxval=max_voices, 44 | dtype='int32') 45 | else: 46 | n_voices = 1 47 | label = tf.zeros(shape=[max_voices, n_frame, n_classes], dtype='float32') 48 | for v in range(n_voices): 49 | voice = voices[v] 50 | v_ratio = tf.math.pow(10., -tf.random.uniform([], maxval=-snr/10)) 51 | v_frame = tf.shape(voice)[t_axis] 52 | 53 | l = labels[v:v+1] # shape=[1, n_classes] 54 | l = tf.tile(l, [v_frame, 1]) # [v_frame, n_classes] 55 | mask = tf.cast(tf.reduce_max(voice, axis=axis) > 0, tf.float32) 56 | l *= tf.expand_dims(mask, axis=-1) 57 | 58 | v_frame = tf.cast(v_frame, tf.float32) 59 | pad_size = n_frame - tf.cast(min_ratio*v_frame, tf.int32) 60 | 61 | if pad_size > 0: 62 | voice = tf.pad( 63 | voice, 64 | [[0, 0] if i != t_axis else [pad_size] * 2 65 | for i in range(n_dims)]) 66 | l = tf.pad(l, [[pad_size]*2, [0, 0]]) 67 | 68 | maxval = tf.shape(voice)[t_axis] - n_frame 69 | offset = tf.random.uniform([], maxval=maxval, dtype=tf.int32) 70 | voice = tf.slice( 71 | voice, 72 | [0 if i != t_axis else offset for i in range(n_dims)], 73 | output_shape) 74 | l = tf.slice(l, [offset, 0], [n_frame, n_classes]) 75 | l = tf.reshape(tf.one_hot(v, max_voices, dtype='float32'), (-1, 1, 1)) \ 76 | * tf.expand_dims(l, axis=0) 77 | 78 | no_overlap = tf.cast(tf.reduce_max(tf.reduce_sum(label+l, axis=0)) < 2, 79 | tf.float32) 80 | 81 | complex_spec += v_ratio * voice * no_overlap 82 | if seperate_noise_voice: 83 | only_voice += v_ratio * voice * no_overlap 84 | label += l * no_overlap 85 | 86 | if noises is not None: 87 | n_noises = tf.random.uniform([], maxval=tf.shape(noises)[0], 88 | dtype='int32') 89 | 90 | for n in range(n_noises): 91 | noise = noises[n] 92 | 93 | # SNR 0 ~ -20 94 | n_ratio = tf.math.pow(10., -tf.random.uniform([], maxval=2)) 95 | ns_frame = tf.cast(tf.shape(noise)[t_axis], tf.float32) 96 | pad_size = n_frame - tf.cast(min_noise_ratio*ns_frame, tf.int32) 97 | 98 | if pad_size > 0: 99 | noise = tf.pad( 100 | noise, 101 | [[0, 0] if i != t_axis else [pad_size]*2 102 | for i in range(n_dims)]) 103 | noise = tf.image.random_crop(noise, output_shape) 104 | if seperate_noise_voice: 105 | only_noise += n_ratio * noise 106 | complex_spec += n_ratio * noise 107 | if seperate_noise_voice: 108 | label = (label, only_voice, only_noise) 109 | 110 | return complex_spec, label 111 | 112 | 113 | def make_pipeline(backgrounds, # a list of backgrounds noises 114 | voices, # a list of human voicess 115 | labels, # a list of labelss of human voicess 116 | noises=None, # a list of additional noises 117 | n_frame=300, # number of frames per sample 118 | max_voices=10, 119 | max_noises=10, 120 | n_classes=3, 121 | **kwargs): 122 | ''' 123 | OUTPUT 124 | dataset: tf.data.Dataset 125 | it only returns a raw complex spectrogram 126 | and its labels 127 | you have to apply augmentations (ex. mixup) 128 | or preprocessing functions (ex. applying log) 129 | you don't have to apply shuffle 130 | 131 | complex spectrogram: [freq_bins, n_frame, chan*2] 132 | [..., :chan] = real 133 | [..., chan:] = imag 134 | labels: [n_frame, n_classes] 135 | ''' 136 | assert len(backgrounds[0].shape) == 3, 'each spec must be a 3D-tensor' 137 | assert len(voices) == len(labels) 138 | assert len(labels[0].shape) == 1 and labels[0].shape[0] == n_classes, \ 139 | 'labels must be in the form of [n_samples, n_classes]' 140 | 141 | # BACKGROUND NOISE (DRONE) 142 | freq, _, chan = backgrounds[0].shape 143 | b_dataset = tf.data.Dataset.from_generator( 144 | list_to_generator(backgrounds), 145 | tf.float32, 146 | tf.TensorShape([freq, None, chan])) 147 | b_dataset = b_dataset.repeat().shuffle(len(backgrounds)) 148 | 149 | # HUMAN VOICE 150 | v_dataset = tf.data.Dataset.from_generator( 151 | list_to_generator((voices, labels)), 152 | (tf.float32, tf.float32), 153 | (tf.TensorShape([freq, None, chan]), tf.TensorShape([n_classes]))) 154 | v_dataset = v_dataset.repeat().shuffle(len(voices)) 155 | v_dataset = v_dataset.padded_batch( 156 | max_voices, padded_shapes=([freq, None, chan], [n_classes])) 157 | 158 | # NOISES 159 | if noises is not None: 160 | n_dataset = tf.data.Dataset.from_generator( 161 | list_to_generator(noises), 162 | tf.float32, 163 | tf.TensorShape([freq, None, chan])) 164 | n_dataset = n_dataset.repeat().shuffle(len(noises)) 165 | n_dataset = n_dataset.padded_batch( 166 | max_noises, padded_shapes=[freq, None, chan]) 167 | dataset = tf.data.Dataset.zip((b_dataset, v_dataset, n_dataset)) 168 | else: 169 | dataset = tf.data.Dataset.zip((b_dataset, v_dataset)) 170 | 171 | dataset = dataset.map(partial(merge_complex_specs, 172 | n_frame=n_frame, 173 | n_classes=n_classes, 174 | **kwargs)) 175 | return dataset 176 | 177 | -------------------------------------------------------------------------------- /pipeline_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from pipeline import * 5 | 6 | 7 | class PipelineTest(tf.test.TestCase): 8 | def setUp(self): 9 | self.freq = 257 10 | self.chan = 4 11 | self.n_classes = 30 12 | 13 | def test_merge_complex_specs(self): 14 | n_frame = 10 15 | 16 | background = np.random.randn(self.freq, 8, self.chan).astype('float32') 17 | 18 | n_voices = 4 19 | voices = np.random.randn(n_voices, self.freq, n_frame, self.chan) 20 | voices = voices.astype('float32') 21 | mask = tf.sequence_mask(np.random.randint(1, n_frame, size=n_voices), 22 | n_frame) 23 | mask = tf.reshape(mask, (n_voices, 1, n_frame, 1)) 24 | voices *= tf.cast(mask, tf.float32) 25 | labels = np.random.randint(1, n_frame, size=n_voices) 26 | labels = np.eye(self.n_classes, dtype='float32')[labels] 27 | 28 | n_noises = 2 29 | noises = np.random.randn(n_noises, self.freq, n_frame, self.chan) 30 | noises = noises.astype('float32') 31 | mask = tf.sequence_mask(np.random.randint(1, n_frame, size=n_noises), 32 | n_frame) 33 | mask = tf.reshape(mask, (n_noises, 1, n_frame, 1)) 34 | noises *= tf.cast(mask, tf.float32) 35 | 36 | spec, l = merge_complex_specs(background, 37 | (voices, labels), 38 | noises, 39 | n_frame=n_frame, 40 | n_classes=self.n_classes) 41 | self.assertEqual(spec.shape, [self.freq, n_frame, self.chan]) 42 | self.assertEqual(l.shape, [n_voices, n_frame, self.n_classes]) 43 | 44 | def test_make_pipeline(self): 45 | n_frame = 30 46 | 47 | backgrounds = [np.random.randn(self.freq, 48 | np.random.randint(1, n_frame*2), 49 | self.chan) 50 | for _ in range(30)] 51 | voices = [np.random.randn(self.freq, 52 | np.random.randint(1, n_frame//2), 53 | self.chan) 54 | for _ in range(40)] 55 | labels = np.random.randint(self.n_classes, size=(40,)) 56 | labels = np.eye(self.n_classes, dtype='float32')[labels] 57 | 58 | noises = [np.random.randn(self.freq, 59 | np.random.randint(1, n_frame//2), 60 | self.chan) 61 | for _ in range(50)] 62 | 63 | pipeline = make_pipeline(backgrounds, 64 | voices, 65 | labels, 66 | noises, 67 | n_frame=n_frame, 68 | max_voices=4, 69 | max_noises=4, 70 | n_classes=self.n_classes) 71 | 72 | for s, l in pipeline.take(3): 73 | self.assertEqual(s.shape, [self.freq, n_frame, self.chan]) 74 | self.assertEqual(l.shape, [4, n_frame, self.n_classes]) 75 | 76 | 77 | if __name__ == '__main__': 78 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 79 | tf.test.main() 80 | 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu == 2.2.0 2 | tensorflow-probability == 0.10.0 3 | tensorflow_addons 4 | torch # for data processing 5 | torchaudio # for data processing 6 | numpy 7 | efficientnet -------------------------------------------------------------------------------- /sample_answer.json: -------------------------------------------------------------------------------- 1 | { 2 | "task2_answer": { 3 | "set01_drone01": [ 4 | [ 5 | 0, 6 | 210, 7 | 213 8 | ], 9 | [ 10 | 0, 11 | 216, 12 | 219 13 | ], 14 | [ 15 | 1, 16 | 222, 17 | 225 18 | ], 19 | [ 20 | 2, 21 | 74, 22 | 78 23 | ], 24 | [ 25 | 2, 26 | 225, 27 | 231 28 | ] 29 | ], 30 | "set01_drone02": [ 31 | [ 32 | 0, 33 | 168, 34 | 171 35 | ], 36 | [ 37 | 0, 38 | 183, 39 | 186 40 | ], 41 | [ 42 | 1, 43 | 175, 44 | 179 45 | ], 46 | [ 47 | 2, 48 | 165, 49 | 168 50 | ] 51 | ], 52 | "set01_drone03": [ 53 | [ 54 | 0, 55 | 213, 56 | 216 57 | ], 58 | [ 59 | 0, 60 | 220, 61 | 224 62 | ], 63 | [ 64 | 1, 65 | 214, 66 | 218 67 | ], 68 | [ 69 | 2, 70 | 227, 71 | 231 72 | ] 73 | ], 74 | "set01_drone01_new": [ 75 | [ 76 | 0, 77 | 33, 78 | 36 79 | ], 80 | [ 81 | 0, 82 | 60, 83 | 64 84 | ], 85 | [ 86 | 0, 87 | 210, 88 | 213 89 | ], 90 | [ 91 | 0, 92 | 216, 93 | 219 94 | ], 95 | [ 96 | 1, 97 | 91, 98 | 94 99 | ], 100 | [ 101 | 1, 102 | 149, 103 | 154 104 | ], 105 | [ 106 | 1, 107 | 222, 108 | 225 109 | ], 110 | [ 111 | 2, 112 | 44, 113 | 48 114 | ], 115 | [ 116 | 2, 117 | 74, 118 | 78 119 | ], 120 | [ 121 | 2, 122 | 104, 123 | 107 124 | ], 125 | [ 126 | 2, 127 | 225, 128 | 231 129 | ] 130 | ], 131 | "set01_drone02_new": [ 132 | [ 133 | 0, 134 | 67, 135 | 70 136 | ], 137 | [ 138 | 0, 139 | 95, 140 | 98 141 | ], 142 | [ 143 | 0, 144 | 168, 145 | 171 146 | ], 147 | [ 148 | 0, 149 | 183, 150 | 186 151 | ], 152 | [ 153 | 1, 154 | 64, 155 | 68 156 | ], 157 | [ 158 | 1, 159 | 115, 160 | 118 161 | ], 162 | [ 163 | 1, 164 | 175, 165 | 179 166 | ], 167 | [ 168 | 2, 169 | 32, 170 | 35 171 | ], 172 | [ 173 | 2, 174 | 116, 175 | 121 176 | ], 177 | [ 178 | 2, 179 | 165, 180 | 168 181 | ] 182 | ], 183 | "set01_drone03_new": [ 184 | [ 185 | 0, 186 | 18, 187 | 21 188 | ], 189 | [ 190 | 0, 191 | 105, 192 | 108 193 | ], 194 | [ 195 | 0, 196 | 213, 197 | 216 198 | ], 199 | [ 200 | 0, 201 | 220, 202 | 224 203 | ], 204 | [ 205 | 1, 206 | 55, 207 | 59 208 | ], 209 | [ 210 | 1, 211 | 131, 212 | 135 213 | ], 214 | [ 215 | 1, 216 | 214, 217 | 218 218 | ], 219 | [ 220 | 2, 221 | 57, 222 | 60 223 | ], 224 | [ 225 | 2, 226 | 154, 227 | 157 228 | ], 229 | [ 230 | 2, 231 | 227, 232 | 231 233 | ] 234 | ] 235 | } 236 | } -------------------------------------------------------------------------------- /sj_train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from copy import deepcopy 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow.keras.callbacks import * 8 | from tensorflow.keras.losses import * 9 | from tensorflow.keras.metrics import * 10 | from tensorflow.keras.optimizers import * 11 | 12 | from metrics import * 13 | from pipeline import * 14 | from data_utils import * 15 | from swa import SWA, NO_SWA_ERROR 16 | from transforms import * 17 | from utils import * 18 | 19 | 20 | class ARGS: 21 | def __init__(self) -> None: 22 | self.args = argparse.ArgumentParser() 23 | self.args.add_argument('--name', type=str, default='') 24 | self.args.add_argument('--gpus', type=str, default='-1') 25 | self.args.add_argument('--model', type=int, default=0) 26 | self.args.add_argument('--model_type', type=str, default='vad', choices=['vad', 'eff', 'se']) 27 | self.args.add_argument('--v', type=int, default=1) 28 | self.args.add_argument('--pretrain', type=bool, default=False) 29 | self.args.add_argument('--n_layers', type=int, default=0) 30 | self.args.add_argument('--n_dim', type=int, default=256) 31 | self.args.add_argument('--n_chan', type=int, default=2) 32 | self.args.add_argument('--n_classes', type=int, default=3) 33 | self.args.add_argument('--patience', type=int, default=10) 34 | 35 | # DATA 36 | self.args.add_argument('--mse_multiplier', type=int, default=1) 37 | self.args.add_argument('--datapath', type=str, default='/root/datasets/Interspeech2020/generate_wavs/codes') 38 | self.args.add_argument('--background_sounds', type=str, default='drone_normed_complex_v4.pickle') 39 | self.args.add_argument('--voices', type=str, default='voice_normed_complex_v3.pickle') 40 | self.args.add_argument('--labels', type=str, default='voice_labels_mfc_v3.npy') 41 | self.args.add_argument('--noises', type=str, default='noises_specs_v2.pickle') 42 | self.args.add_argument('--test_background_sounds', type=str, 43 | default='test_drone_normed_complex_v2.pickle') 44 | self.args.add_argument('--test_voices', type=str, default='test_voice_normed_complex.pickle') 45 | self.args.add_argument('--test_labels', type=str, default='test_voice_labels_mfc.npy') 46 | self.args.add_argument('--n_mels', type=int, default=80) 47 | 48 | # TRAINING 49 | self.args.add_argument('--optimizer', type=str, default='adam', 50 | choices=['adam', 'sgd', 'rmsprop', 'adabelief']) 51 | self.args.add_argument('--lr', type=float, default=1e-3) 52 | self.args.add_argument('--end_lr', type=float, default=1e-4) 53 | self.args.add_argument('--lr_power', type=float, default=0.5) 54 | self.args.add_argument('--lr_div', type=float, default=2) 55 | self.args.add_argument('--clipvalue', type=float, default=0.01) 56 | 57 | self.args.add_argument('--epochs', type=int, default=300) 58 | self.args.add_argument('--batch_size', type=int, default=12) 59 | self.args.add_argument('--n_frame', type=int, default=512) 60 | self.args.add_argument('--steps_per_epoch', type=int, default=100) 61 | self.args.add_argument('--l1', type=float, default=0) 62 | self.args.add_argument('--l2', type=float, default=1e-6) 63 | self.args.add_argument('--loss', type=str, default='BCE') 64 | 65 | # AUGMENTATION 66 | self.args.add_argument('--snr', type=float, default=-20) 67 | self.args.add_argument('--max_voices', type=int, default=7) 68 | self.args.add_argument('--max_noises', type=int, default=2) 69 | 70 | def get(self): 71 | return self.args.parse_args() 72 | 73 | 74 | def make_dataset(config, training=True, n_classes=3): 75 | # Load required datasets 76 | if not os.path.exists(config.datapath): 77 | config.datapath = '' 78 | if training: 79 | backgrounds = load_data(os.path.join(config.datapath, config.background_sounds)) 80 | voices = load_data(os.path.join(config.datapath, config.voices)) 81 | labels = load_data(os.path.join(config.datapath, config.labels)) 82 | else: 83 | backgrounds = load_data(os.path.join(config.datapath, config.test_background_sounds)) 84 | voices = load_data(os.path.join(config.datapath, config.test_voices)) 85 | labels = load_data(os.path.join(config.datapath, config.test_labels)) 86 | if labels.max() - 1 != config.n_classes: 87 | labels //= 10 88 | labels = np.eye(n_classes, dtype='float32')[labels] # to one-hot vectors 89 | noises = load_data(os.path.join(config.datapath, config.noises)) 90 | 91 | # Make pipeline and process the pipeline 92 | pipeline = make_pipeline(backgrounds, 93 | voices, labels, noises, 94 | n_frame=config.n_frame, 95 | max_voices=config.max_voices, 96 | max_noises=config.max_noises, 97 | n_classes=n_classes, 98 | snr=config.snr, 99 | min_ratio=1, 100 | seperate_noise_voice=config.model_type == 'se' and config.v == 9) 101 | if config.model_type == 'se' and config.v == 9: 102 | # pipeline = pipeline.map(complex_to_magphase) 103 | pipeline = pipeline.map(speech_enhancement_preprocess) 104 | pipeline = pipeline.batch(config.batch_size, drop_remainder=False) 105 | pipeline = pipeline.map(label_downsample(32)) 106 | return pipeline.prefetch(AUTOTUNE) 107 | pipeline = pipeline.map(to_frame_labels) 108 | if training: 109 | pipeline = pipeline.map(augment) 110 | if config.n_chan == 1: 111 | pipeline = pipeline.map(mono_chan) 112 | elif config.n_chan == 3: 113 | pipeline = pipeline.map(stereo_mono) 114 | elif config.n_chan > 3: 115 | pipeline = pipeline.map(random_merge_aug(config.n_chan)) 116 | if 'filter' in config.name: 117 | pipeline = pipeline.map(stft_filter(int(round(200 / (16000 / 256))))) 118 | pipeline = pipeline.batch(config.batch_size, drop_remainder=False) 119 | pipeline = pipeline.map(complex_to_magphase) 120 | pipeline = pipeline.map(magphase_to_mel(config.n_mels)) 121 | if 'nominmax' not in config.name: 122 | pipeline = pipeline.map(minmax) 123 | pipeline = pipeline.map(log_on_mel) 124 | if config.v in label_downsample_model: 125 | pipeline = pipeline.map(label_downsample(32)) 126 | elif config.v == 5: 127 | pipeline = pipeline.map(label_downsample(config.n_frame // (config.n_frame * 256 // 16000))) 128 | if config.loss.upper() in ('MSE', 'MAE'): 129 | pipeline = pipeline.map(multiply_label(config.mse_multiplier)) 130 | return pipeline.prefetch(AUTOTUNE) 131 | 132 | 133 | def custom_scheduler(d_model, warmup_steps=4000, lr_div=2): 134 | # https://www.tensorflow.org/tutorials/text/transformer#optimizer 135 | d_model = tf.cast(d_model, tf.float32) 136 | 137 | def _scheduler(step): 138 | step = tf.cast(step+1, tf.float32) 139 | arg1 = tf.math.rsqrt(step) 140 | arg2 = step * (warmup_steps ** -1.5) 141 | return tf.math.rsqrt(d_model) * tf.math.minimum(arg1, arg2) / lr_div 142 | return _scheduler 143 | 144 | 145 | def adaptive_clip_grad(parameters, gradients, clip_factor=0.01, 146 | eps=1e-3): 147 | new_grads = [] 148 | for (params, grads) in zip(parameters, gradients): 149 | p_norm = unitwise_norm(params) 150 | max_norm = tf.math.maximum(p_norm, eps) * clip_factor 151 | grad_norm = unitwise_norm(grads) 152 | clipped_grad = grads * (max_norm / tf.math.maximum(grad_norm, 1e-6)) 153 | new_grad = tf.where(grad_norm < max_norm, grads, clipped_grad) 154 | new_grads.append(new_grad) 155 | return new_grads 156 | 157 | 158 | class CustomModel(tf.keras.Model): 159 | def __init__(self, **kwargs) -> None: 160 | super(CustomModel, self).__init__(**kwargs) 161 | 162 | def train_step(self, data): 163 | # Unpack the data. Its structure depends on your model and 164 | # on what you pass to `fit()`. 165 | x, y = data 166 | if not isinstance(y, tuple): 167 | y = (y,) 168 | with tf.GradientTape() as tape: 169 | y_pred = self(x, training=True) # Forward pass 170 | if not isinstance(y_pred, (tuple, list)): 171 | y_pred = (y_pred,) 172 | # Compute the loss value 173 | # (the loss function is configured in `compile()`) 174 | loss = self.compiled_loss(y, y_pred) 175 | 176 | # Compute gradients 177 | trainable_vars = self.trainable_variables 178 | 179 | gradients = tape.gradient(loss, trainable_vars) 180 | gradients = adaptive_clip_grad(self.trainable_variables, gradients) 181 | # Update weights 182 | self.optimizer.apply_gradients(zip(gradients, trainable_vars)) 183 | # Update metrics (includes the metric that tracks the loss) 184 | 185 | self.compiled_metrics.update_state(y, y_pred[0]) 186 | 187 | # Return a dict mapping metric names to current value 188 | return {m.name: m.result() for m in self.metrics} 189 | 190 | 191 | def ConvMPBlock(x, num_convs=2, fsize=32, kernel_size=3, pool_size=(2,2), strides=(2,2), BN=False, DO=False, MP=True): 192 | for i in range(num_convs): 193 | x = tf.keras.layers.Conv2D(fsize, kernel_size, padding='same')(x) 194 | if BN: 195 | x = tf.keras.layers.BatchNormalization()(x) 196 | if DO: 197 | x = tf.keras.layers.Dropout(DO)(x) 198 | x = tf.keras.layers.Activation('relu')(x) 199 | if MP: 200 | x = tf.keras.layers.MaxPooling2D(pool_size=pool_size, strides=strides, padding='same')(x) 201 | return x 202 | 203 | 204 | def FullyConnectedLayer(x, nodes=512, act='relu', BN=False, DO=False, name=None): 205 | x = tf.keras.layers.Dense(nodes)(x) 206 | if BN: 207 | x = tf.keras.layers.BatchNormalization()(x) 208 | if DO: 209 | x = tf.keras.layers.Dropout(DO)(x) 210 | x = tf.keras.layers.Activation(act, name=name)(x) 211 | return x 212 | 213 | 214 | def define_keras_model(config=None): 215 | fsize = 32 216 | if config.model_type == 'vad' and config.v == 8: 217 | fsize = 48 218 | 219 | td_dim = 1024 220 | input_tensor = tf.keras.layers.Input( 221 | shape=(config.n_mels, config.n_frame, config.n_chan)) 222 | x = input_tensor 223 | x = ConvMPBlock(x, num_convs=2, fsize=fsize, BN=True) 224 | for i in range(1, 5): 225 | if config.model_type == 'vad' and config.v == 6: 226 | seconds = 0.5 227 | kernel_size = int(round(seconds / (256 * config.n_frame / 16000 / x.shape[-2]))) 228 | x = tf.keras.layers.AveragePooling2D((1,kernel_size,), 1, padding='same')(x) 229 | x = tf.keras.layers.MaxPooling2D((1,kernel_size * 2,), 1, padding='same')(x) 230 | if config.model_type == 'vad' and config.v == 7: 231 | skip = x 232 | x = tf.keras.layers.Conv2D(skip.shape[-1] // 4, 1, 1, padding='same')(x) 233 | x = tf.keras.layers.BatchNormalization()(x) 234 | x = tf.keras.layers.Activation('relu')(x) 235 | x = tf.keras.layers.Conv2D(skip.shape[-1] // 4, 3, 1, padding='same')(x) 236 | x = tf.keras.layers.BatchNormalization()(x) 237 | x = tf.keras.layers.Activation('relu')(x) 238 | x = tf.keras.layers.Conv2D(skip.shape[-1], 1, 1, padding='same')(x) 239 | x = tf.keras.layers.BatchNormalization()(x) 240 | x = tf.keras.layers.Activation('relu')(x) 241 | x += skip 242 | x = ConvMPBlock(x, num_convs=3, fsize=fsize * 2**i, BN=True) 243 | 244 | x = tf.keras.layers.Permute((2,1,3))(x) 245 | x = tf.keras.layers.Reshape((x.shape[1], x.shape[2]*x.shape[3]))(x) 246 | x = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(td_dim, activation='relu'))(x) 247 | if config.model_type == 'vad' and config.v == 9: 248 | x = FullyConnectedLayer(x, 512, BN=True) 249 | x = FullyConnectedLayer(x, 256, BN=True) 250 | x = FullyConnectedLayer(x, 128, BN=True) 251 | if config.model_type == 'vad' and config.v == 9: 252 | x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x) 253 | x = FullyConnectedLayer(x, 64, BN=True) 254 | x = FullyConnectedLayer(x, 3, act='sigmoid' if config.model_type == 'vad' else 'relu', name='class' if config.model_type == 'se' else None) 255 | return CustomModel(inputs=input_tensor, outputs=x) 256 | 257 | 258 | def convset(inp, chan=16): 259 | out = inp 260 | for _ in range(2): 261 | out = tf.keras.layers.Conv2D(chan, 3, strides=1, padding='same')(out) 262 | out = tf.keras.layers.BatchNormalization()(out) 263 | out = tf.keras.layers.Activation('relu')(out) 264 | out = tf.keras.layers.MaxPooling2D()(out) 265 | return out 266 | 267 | 268 | def upsampling(inp, chan=64): 269 | out = inp 270 | out = tf.keras.layers.Conv2D(chan, 3, strides=1, padding='same')(out) 271 | out = tf.keras.layers.BatchNormalization()(out) 272 | out = tf.keras.layers.Activation('relu')(out) 273 | return tf.keras.layers.Conv2DTranspose(chan, 2, 2, padding='same')(out) 274 | 275 | 276 | def speech_enhancement_model(input): 277 | merge_input = tf.keras.layers.Input(tensor=input[1:]) 278 | inp1 = convset(merge_input, 64) 279 | inp2 = convset(inp1, 128) 280 | inp3 = convset(inp2, 256) 281 | latent = convset(inp3, 512) 282 | 283 | speech3 = upsampling(latent, 256) 284 | speech2 = upsampling(tf.keras.layers.Concatenate(-1)([inp3, speech3]), 128) 285 | speech1 = upsampling(tf.keras.layers.Concatenate(-1)([inp2, speech2]), 64) 286 | speech = upsampling(tf.keras.layers.Concatenate(-1)([inp1, speech1]), 2) 287 | 288 | noise3 = upsampling(latent, 256) 289 | noise2 = upsampling(tf.keras.layers.Concatenate(-1)([inp3, noise3]), 128) 290 | noise1 = upsampling(tf.keras.layers.Concatenate(-1)([inp2, noise2]), 64) 291 | noise = upsampling(tf.keras.layers.Concatenate(-1)([inp1, noise1]), 2) 292 | return CustomModel(inputs=merge_input, outputs=[speech, noise]) 293 | 294 | 295 | def get_model(config): 296 | input_tensor = tf.keras.layers.Input( 297 | shape=(config.n_mels, config.n_frame, config.n_chan)) 298 | 299 | if config.model_type == 'se': 300 | input_tensor = tf.keras.layers.Input(shape=(256, config.n_frame, config.n_chan)) 301 | merge_input = input_tensor[:, 1:] 302 | merge_input = tf.transpose(input_tensor, perm=[0, 2, 1, 3]) 303 | 304 | se_model = speech_enhancement_model(merge_input) 305 | if not config.pretrain: 306 | se_model.trainable = False 307 | speech, noise = se_model(merge_input) 308 | 309 | # out = tf.keras.layers.Concatenate(-1)([speech, noise]) 310 | out = speech 311 | out = tf.transpose(out, perm=[0, 2, 1, 3]) 312 | config.n_mels = out.shape[1] 313 | tmp_config = deepcopy(config) 314 | tmp_config.n_chan = out.shape[-1] 315 | vadmodel = define_keras_model(tmp_config) 316 | if config.pretrain: 317 | vadmodel.trainable = False 318 | out = vadmodel(out) 319 | 320 | # backbone = getattr(tf.keras.applications.efficientnet, f'EfficientNetB4')( 321 | # include_top=False, weights=None, input_tensor=out) 322 | # out = tf.keras.layers.Permute((2, 1, 3))(backbone.output) 323 | # out = tf.keras.layers.Reshape((-1, out.shape[-1] * out.shape[-2]))(out) 324 | # out = tf.keras.layers.Conv1DTranspose(128, 2, 2)(out) 325 | # out = tf.keras.layers.Activation('relu')(out) 326 | # out = tf.keras.layers.Conv1DTranspose(64, 2, 2)(out) 327 | # out = tf.keras.layers.Activation('relu')(out) 328 | # out = tf.keras.layers.Conv1DTranspose(32, 2, 2)(out) 329 | # out = tf.keras.layers.Activation('relu')(out) 330 | # out = tf.keras.layers.Conv1DTranspose(16, 2, 2)(out) 331 | # out = tf.keras.layers.Activation('relu')(out) 332 | # out = tf.keras.layers.Conv1DTranspose(8, 2, 2)(out) 333 | # out = tf.keras.layers.Activation('relu')(out) 334 | # out = tf.keras.layers.Dense(config.n_classes)(out) 335 | # out = tf.keras.layers.Activation('sigmoid', name='class')(out) 336 | 337 | speech = tf.keras.layers.Permute((2, 1, 3), name='speech')(speech) 338 | noise = tf.keras.layers.Permute((2, 1, 3), name='noise')(noise) 339 | return CustomModel(inputs=[input_tensor], outputs=[out, speech, noise]) 340 | elif config.model_type == 'eff': 341 | backbone = getattr(tf.keras.applications.efficientnet, f'EfficientNetB{config.model}')( 342 | include_top=False, weights=None, input_tensor=input_tensor) 343 | 344 | out = tf.transpose(backbone.output, perm=[0, 2, 1, 3]) 345 | out = tf.keras.layers.Reshape([-1, out.shape[-1]*out.shape[-2]])(out) 346 | 347 | for i in range(config.n_layers): 348 | out = tf.keras.layers.Dense(config.n_dim)(out) 349 | out = tf.keras.layers.BatchNormalization()(out) 350 | out = tf.keras.layers.Activation('sigmoid')(out) * out 351 | 352 | # v1 ------------------------- 353 | if config.v == 1: 354 | out = tf.keras.layers.Conv1DTranspose(128, 2, 2)(out) 355 | out = tf.keras.layers.Activation('relu')(out) 356 | out = tf.keras.layers.Conv1DTranspose(64, 2, 2)(out) 357 | out = tf.keras.layers.Activation('relu')(out) 358 | out = tf.keras.layers.Conv1DTranspose(32, 2, 2)(out) 359 | out = tf.keras.layers.Activation('relu')(out) 360 | out = tf.keras.layers.Conv1DTranspose(16, 2, 2)(out) 361 | out = tf.keras.layers.Activation('relu')(out) 362 | out = tf.keras.layers.Conv1DTranspose(3, 2, 2)(out) 363 | out = tf.keras.layers.Activation('relu')(out) 364 | # v2 ------------------------- 365 | elif config.v == 2: 366 | raise ValueError('version 2 is deprecated') 367 | out = tf.keras.layers.Conv1DTranspose(128, 2, 2)(out) 368 | out = tf.keras.layers.Conv1DTranspose(64, 2, 2)(out) 369 | out = tf.keras.layers.Conv1DTranspose(32, 2, 2)(out) 370 | out = tf.keras.layers.Conv1DTranspose(16, 2, 2)(out) 371 | out = tf.keras.layers.Conv1DTranspose(3, 2, 2)(out) 372 | elif config.v == 3: 373 | out = out 374 | elif config.v == 4: 375 | raise ValueError('version 4 is deprecated') 376 | out = tf.keras.layers.Conv1D(config.n_frame, 1, use_bias=False, data_format='channels_first')(out) 377 | elif config.v == 5: 378 | if out.shape[1] != config.n_frame * 256 // 16000: 379 | out = tf.keras.layers.Conv1D(config.n_frame * 256 // 16000, 1, use_bias=False, data_format='channels_first')(out) 380 | out = tf.keras.layers.BatchNormalization()(out) 381 | out = tf.keras.layers.Activation('relu')(out) 382 | out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(out) 383 | elif config.v == 6: 384 | out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(out) 385 | out = FullyConnectedLayer(out, 256, BN=True) 386 | out = FullyConnectedLayer(out, 128, BN=True) 387 | out = FullyConnectedLayer(out, 64, BN=True) 388 | elif config.v == 7: 389 | out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(out) 390 | big = tf.keras.layers.Reshape((config.n_mels, -1))(input_tensor) 391 | big = tf.keras.layers.Conv1D(out.shape[-1], 16, strides=5, padding='same')(big) 392 | big = tf.keras.layers.Activation('tanh')(big) 393 | out *= big 394 | else: 395 | raise ValueError('wrong version') 396 | 397 | out = tf.keras.layers.Dense(config.n_classes)(out) 398 | # out= tf.keras.layers.Activation('relu')(out) 399 | # out *= tf.cast(out < 1., out.dtype) 400 | out = tf.keras.layers.Activation('sigmoid')(out) 401 | return tf.keras.models.Model(inputs=input_tensor, outputs=out) 402 | elif config.model_type == 'vad': 403 | return define_keras_model(config) 404 | 405 | 406 | def main(): 407 | config = ARGS().get() 408 | os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus 409 | config.loss = config.loss.upper() 410 | if config.loss != 'MSE': 411 | config.mse_multiplier = 1 412 | print(config) 413 | 414 | TOTAL_EPOCH = config.epochs 415 | BATCH_SIZE = config.batch_size 416 | NAME = (config.name + '_') if config.name != '' else '' 417 | if config.model_type == 'eff': 418 | model_first_name = f'B{config.model}' 419 | elif config.model_type == 'se': 420 | model_first_name = 'se' 421 | elif config.model_type == 'vad': 422 | model_first_name = 'vad' 423 | 424 | NAME = NAME + '_'.join([model_first_name, f'v{config.v}', f'lr{config.lr}', 425 | f'batch{config.batch_size}', f'opt_{config.optimizer}', 426 | f'mel{config.n_mels}', f'chan{config.n_chan}', f'{config.loss.upper()}', f'framelen{config.n_frame}']) 427 | if config.model_type == 'se' and config.v == 9 and config.pretrain: 428 | NAME += '_weight' 429 | NAME = NAME if NAME.endswith('.h5') else NAME + '.h5' 430 | """ MODEL """ 431 | model = get_model(config) 432 | 433 | lr = config.lr 434 | if config.optimizer == 'adam': 435 | opt = Adam(lr, clipvalue=config.clipvalue) 436 | elif config.optimizer == 'sgd': 437 | opt = SGD(lr, momentum=0.9, clipvalue=config.clipvalue) 438 | elif config.optimizer == 'rmsprop': 439 | opt = RMSprop(lr, momentum=0.9, clipvalue=config.clipvalue) 440 | else: 441 | raise ValueError('adabelief is deprecated') 442 | opt = AdaBelief(lr, clipvalue=config.clipvalue) 443 | # if config.l2 > 0: 444 | # model = apply_kernel_regularizer( 445 | # model, tf.keras.regularizers.l1_l2(config.l1, config.l2)) 446 | 447 | if config.loss.upper() == 'BCE': 448 | loss = tf.keras.losses.BinaryCrossentropy() 449 | elif config.loss.upper() == 'FOCAL': 450 | loss = sigmoid_focal_crossentropy 451 | if config.model_type == 'se' and config.v == 9: 452 | loss = [loss, tf.losses.MAE, tf.losses.MAE] 453 | 454 | metrics = [cos_sim, 455 | f1_score()] 456 | if config.v != 5: 457 | metrics.append(er_score(smoothing=False)) 458 | model.compile(optimizer=opt, 459 | # loss=custom_loss(alpha=config.loss_alpha, l2=config.loss_l2), 460 | loss=loss, 461 | loss_weights=[1, 10, 10], 462 | metrics=metrics) 463 | setattr(model, 'train_config', config) 464 | model.summary() 465 | print(NAME) 466 | 467 | if config.model_type == 'se' and config.v == 9 and not config.pretrain: 468 | model.load_weights(NAME) 469 | print('loaded pretrained model') 470 | 471 | """ DATA """ 472 | train_set = make_dataset(config, training=True) 473 | test_set = make_dataset(config, training=False) 474 | 475 | earlystop_monitor = 'val_loss' 476 | model_checkpoint_monitor = 'val_class_er' if config.v == 9 and config.model_type == 'eff' else 'val_er' 477 | if config.model_type == 'se' and config.v == 9: 478 | if config.pretrain: 479 | earlystop_monitor = 'val_speech_loss' 480 | model_checkpoint_monitor = 'val_speech_loss' 481 | else: 482 | earlystop_monitor = 'val_class_loss' 483 | model_checkpoint_monitor = 'val_class_er' 484 | else: 485 | earlystop_monitor = 'val_loss' 486 | model_checkpoint_monitor = 'val_er' 487 | 488 | """ TRAINING """ 489 | callbacks = [ 490 | CSVLogger(NAME.replace('.h5', '.csv'), append=True), 491 | SWA(start_epoch=TOTAL_EPOCH//4, swa_freq=2), 492 | ModelCheckpoint(NAME, monitor=model_checkpoint_monitor, save_best_only=True, verbose=1), 493 | TerminateOnNaN(), 494 | TensorBoard(log_dir=os.path.join('tensorboard_log', NAME.split('.h5')[0])), 495 | EarlyStopping(monitor=earlystop_monitor, patience=config.patience, restore_best_weights=True), 496 | eval_callback(config, NAME), 497 | # LearningRateScheduler(tf.keras.optimizers.schedules.CosineDecayRestarts(config.lr, 5), verbose=1), 498 | # LearningRateScheduler(lr_schedule, verbose=1), 499 | # ReduceLROnPlateau(monitor='val_loss', factor=1 / 2**0.5, patience=5, verbose=1, mode='min') 500 | ] 501 | callbacks.append( 502 | LearningRateScheduler( 503 | custom_scheduler(4096, TOTAL_EPOCH/12, config.lr_div))) 504 | 505 | # if not config.pretrain: 506 | # callbacks.append( 507 | # LearningRateScheduler( 508 | # custom_scheduler(4096, TOTAL_EPOCH/12, config.lr_div))) 509 | # else: 510 | # callbacks.append(ReduceLROnPlateau(monitor='val_loss', factor=1 / 2**0.5, patience=5, verbose=1, mode='min')) 511 | 512 | try: 513 | model.fit(train_set, 514 | epochs=TOTAL_EPOCH, 515 | batch_size=BATCH_SIZE, 516 | steps_per_epoch=config.steps_per_epoch, 517 | validation_data=test_set, 518 | validation_steps=16, 519 | callbacks=callbacks) 520 | print('best model:', NAME.replace('.h5', '_SWA.h5')) 521 | model.save(NAME.replace('.h5', '_SWA.h5')) 522 | except NO_SWA_ERROR: 523 | pass 524 | print(NAME.split('.h5')[0]) 525 | exit() 526 | 527 | 528 | if __name__ == "__main__": 529 | main() 530 | 531 | -------------------------------------------------------------------------------- /swa.py: -------------------------------------------------------------------------------- 1 | # https://github.com/simon-larsson/keras-swa/blob/master/swa/keras.py 2 | import tensorflow as tf 3 | 4 | 5 | class NO_SWA_ERROR(Exception): 6 | def __init__(self, msg="Didn't use SWA") -> None: 7 | self.msg = msg 8 | 9 | def __str__(self) -> str: 10 | return self.msg 11 | 12 | 13 | class SWA(tf.keras.callbacks.Callback): 14 | def __init__(self, start_epoch, swa_freq=1, verbose=True): 15 | super(SWA, self).__init__() 16 | self.start_epoch = start_epoch - 1 17 | self.swa_freq = swa_freq 18 | self.swa_weights = None 19 | self.cnt = 0 20 | self.verbose = verbose 21 | 22 | def on_epoch_end(self, epoch, logs=None): 23 | epoch = epoch - self.start_epoch 24 | if epoch == 0 or (epoch > 0 and epoch % self.swa_freq == 0): 25 | if self.verbose: 26 | print("\nSaving Weights... ", epoch+self.start_epoch) 27 | self.update_swa_weights() 28 | 29 | def on_train_end(self, logs=None): 30 | print("\nFinal Model Has Been Saved... Please Reset BN") 31 | try: 32 | self.model.set_weights(self.swa_weights) 33 | except TypeError: 34 | raise NO_SWA_ERROR() 35 | 36 | def update_swa_weights(self): 37 | if self.swa_weights is None: 38 | self.swa_weights = self.model.get_weights() 39 | else: 40 | self.swa_weights = [ 41 | (swa_w*self.cnt + w) / (self.cnt+1) 42 | for swa_w, w in zip(self.swa_weights, self.model.get_weights())] 43 | 44 | self.cnt += 1 45 | 46 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | from tensorflow.keras.callbacks import * 6 | from tensorflow.keras.losses import * 7 | from tensorflow.keras.metrics import * 8 | from tensorflow.keras.optimizers import * 9 | 10 | from metrics import * 11 | from pipeline import * 12 | from swa import SWA 13 | from transforms import * 14 | from utils import * 15 | 16 | 17 | args = argparse.ArgumentParser() 18 | args.add_argument('--name', type=str, required=True) 19 | args.add_argument('--model', type=str, default='EfficientNetB4') 20 | args.add_argument('--pretrain', type=bool, default=False) 21 | args.add_argument('--n_layers', type=int, default=0) 22 | args.add_argument('--n_dim', type=int, default=256) 23 | args.add_argument('--n_chan', type=int, default=1) 24 | args.add_argument('--n_classes', type=int, default=3) 25 | 26 | # DATA 27 | args.add_argument('--datapath', type=str, default='/root/datasets/Interspeech2020/generate_wavs/codes') 28 | args.add_argument('--background_sounds', type=str, default='drone_normed_complex_v3.pickle') 29 | args.add_argument('--voices', type=str, default='voice_normed_complex_v3.pickle') 30 | args.add_argument('--labels', type=str, default='voice_labels_mfc_v3.npy') 31 | args.add_argument('--noises', type=str, default='noises_specs_v2.pickle') 32 | args.add_argument('--test_background_sounds', type=str, 33 | default='dummy_specs.pickle') 34 | args.add_argument('--test_voices', type=str, default='dummy_specs.pickle') 35 | args.add_argument('--test_labels', type=str, default='dummy_labels.npy') 36 | args.add_argument('--n_mels', type=int, default=80) 37 | 38 | # TRAINING 39 | args.add_argument('--optimizer', type=str, default='adabelief', 40 | choices=['adam', 'sgd', 'rmsprop', 'adabelief']) 41 | args.add_argument('--lr', type=float, default=1e-4) 42 | args.add_argument('--end_lr', type=float, default=1e-4) 43 | args.add_argument('--lr_power', type=float, default=0.5) 44 | args.add_argument('--lr_div', type=float, default=2) 45 | args.add_argument('--clipvalue', type=float, default=0.01) 46 | 47 | args.add_argument('--epochs', type=int, default=500) 48 | args.add_argument('--batch_size', type=int, default=12) 49 | args.add_argument('--n_frame', type=int, default=2048) 50 | args.add_argument('--steps_per_epoch', type=int, default=100) 51 | args.add_argument('--l1', type=float, default=0) 52 | args.add_argument('--l2', type=float, default=1e-6) 53 | args.add_argument('--loss_alpha', type=float, default=0.8) 54 | args.add_argument('--loss_l2', type=float, default=1.) 55 | args.add_argument('--multiplier', type=float, default=10) 56 | 57 | # AUGMENTATION 58 | args.add_argument('--snr', type=float, default=-15) 59 | args.add_argument('--max_voices', type=int, default=10) 60 | args.add_argument('--max_noises', type=int, default=6) 61 | 62 | 63 | def minmax_log_on_mel(mel, labels=None): 64 | # batch-wise pre-processing 65 | axis = tuple(range(1, len(mel.shape))) 66 | 67 | # MIN-MAX 68 | mel_max = tf.math.reduce_max(mel, axis=axis, keepdims=True) 69 | mel_min = tf.math.reduce_min(mel, axis=axis, keepdims=True) 70 | mel = safe_div(mel-mel_min, mel_max-mel_min) 71 | 72 | # LOG 73 | mel = tf.math.log(mel + EPSILON) 74 | 75 | if labels is not None: 76 | return mel, labels 77 | return mel 78 | 79 | 80 | def augment(specs, labels, time_axis=-2, freq_axis=-3): 81 | specs = mask(specs, axis=time_axis, max_mask_size=24, n_mask=6) 82 | specs = mask(specs, axis=freq_axis, max_mask_size=16) 83 | return specs, labels 84 | 85 | 86 | def preprocess_labels(multiplier): 87 | def _preprocess(x, y): 88 | # process y: [None, time, classes] -> [None, time', classes] 89 | for i in range(5): 90 | # sum_pool1d 91 | y = tf.nn.avg_pool1d(y, 2, strides=2, padding='SAME') * 2 92 | y *= multiplier 93 | return x, y 94 | return _preprocess 95 | 96 | 97 | def to_density_labels(x, y): 98 | """ 99 | :param y: [..., n_voices, n_frames, n_classes] 100 | :return: [..., n_frames, n_classes] 101 | """ 102 | y = safe_div(y, tf.reduce_sum(y, axis=(-2, -1), keepdims=True)) 103 | y = tf.reduce_sum(y, axis=-3) 104 | return x, y 105 | 106 | 107 | def make_dataset(config, training=True, n_classes=3): 108 | # Load required datasets 109 | if not os.path.exists(config.datapath): 110 | config.datapath = '' 111 | if training: 112 | backgrounds = load_data(os.path.join(config.datapath, config.background_sounds)) 113 | voices = load_data(os.path.join(config.datapath, config.voices)) 114 | labels = load_data(os.path.join(config.datapath, config.labels)) 115 | else: 116 | backgrounds = load_data(os.path.join(config.datapath, config.test_background_sounds)) 117 | voices = load_data(os.path.join(config.datapath, config.test_voices)) 118 | labels = load_data(os.path.join(config.datapath, config.test_labels)) 119 | if labels.max() - 1 != config.n_classes: 120 | labels //= 10 121 | labels = np.eye(n_classes, dtype='float32')[labels] # to one-hot vectors 122 | noises = load_data(os.path.join(config.datapath, config.noises)) 123 | 124 | # Make pipeline and process the pipeline 125 | pipeline = make_pipeline(backgrounds, 126 | voices, labels, noises, 127 | n_frame=config.n_frame, 128 | max_voices=config.max_voices, 129 | max_noises=config.max_noises, 130 | n_classes=n_classes, 131 | snr=config.snr, 132 | min_ratio=1) 133 | pipeline = pipeline.map(to_density_labels) 134 | if training: 135 | pipeline = pipeline.map(augment) 136 | pipeline = pipeline.batch(config.batch_size, drop_remainder=False) 137 | pipeline = pipeline.map(complex_to_magphase) 138 | pipeline = pipeline.map(magphase_to_mel(config.n_mels)) 139 | pipeline = pipeline.map(minmax_log_on_mel) 140 | pipeline = pipeline.map(preprocess_labels(config.multiplier)) 141 | return pipeline.prefetch(AUTOTUNE) 142 | 143 | 144 | def custom_loss(alpha=0.8, l2=1): 145 | def _custom(y_true, y_pred): 146 | # y_true, y_pred = [None, time, 30] 147 | # [None, time, 30] -> [None, time, 3, 10] 148 | t_true = tf.stack(tf.split(y_true, 3, axis=-1), axis=-2) 149 | t_pred = tf.stack(tf.split(y_pred, 3, axis=-1), axis=-2) 150 | 151 | # [None, time, 10] 152 | d_y_true = tf.reduce_sum(t_true, axis=-2) 153 | d_y_pred = tf.reduce_sum(t_pred, axis=-2) 154 | 155 | # [None, time, 3] 156 | c_y_true = tf.reduce_sum(t_true, axis=-1) 157 | c_y_pred = tf.reduce_sum(t_pred, axis=-1) 158 | 159 | loss = alpha * tf.keras.losses.MAE(tf.reduce_sum(d_y_true, axis=1), 160 | tf.reduce_sum(d_y_pred, axis=1)) \ 161 | + (1-alpha) * tf.keras.losses.MAE(tf.reduce_sum(c_y_true, axis=1), 162 | tf.reduce_sum(c_y_pred, axis=1)) 163 | 164 | # TODO: OT loss 165 | # TV: total variation loss 166 | # normed - degrees [None, time, 10] 167 | n_d_true = safe_div( 168 | d_y_true, tf.reduce_sum(d_y_true, axis=1, keepdims=True)) 169 | n_d_pred = safe_div( 170 | d_y_pred, tf.reduce_sum(d_y_pred, axis=1, keepdims=True)) 171 | 172 | # normed - classes [None, time, 3] 173 | n_c_true = safe_div( 174 | c_y_true, tf.reduce_sum(c_y_true, axis=1, keepdims=True)) 175 | n_c_pred = safe_div( 176 | c_y_pred, tf.reduce_sum(c_y_pred, axis=1, keepdims=True)) 177 | 178 | tv = alpha * tf.reduce_mean( 179 | tf.reduce_sum(tf.math.abs(n_d_true - n_d_pred), axis=1) 180 | * tf.reduce_sum(d_y_true, axis=1), # [None, 10] 181 | axis=1) 182 | tv += (1-alpha) * tf.reduce_mean( 183 | tf.reduce_sum(tf.math.abs(n_c_true - n_c_pred), axis=1) 184 | * tf.reduce_sum(c_y_true, axis=1), # [None, 3] 185 | axis=1) 186 | loss += l2 * tv 187 | 188 | return loss 189 | return _custom 190 | 191 | 192 | def cos_sim(y_true, y_pred): 193 | mask = tf.cast( 194 | tf.reduce_sum(y_true, axis=-2) > 0., tf.float32) # [None, 30] 195 | mask = safe_div(mask, tf.reduce_sum(mask, axis=-1, keepdims=True)) 196 | return tf.reduce_sum( 197 | tf.keras.losses.cosine_similarity(y_true, y_pred, axis=-2) * mask, 198 | axis=-1) 199 | 200 | 201 | def custom_scheduler(d_model, warmup_steps=4000, lr_div=2): 202 | # https://www.tensorflow.org/tutorials/text/transformer#optimizer 203 | d_model = tf.cast(d_model, tf.float32) 204 | 205 | def _scheduler(step): 206 | step = tf.cast(step+1, tf.float32) 207 | arg1 = tf.math.rsqrt(step) 208 | arg2 = step * (warmup_steps ** -1.5) 209 | return tf.math.rsqrt(d_model) * tf.math.minimum(arg1, arg2) / lr_div 210 | return _scheduler 211 | 212 | 213 | if __name__ == "__main__": 214 | config = args.parse_args() 215 | print(config) 216 | 217 | TOTAL_EPOCH = config.epochs 218 | BATCH_SIZE = config.batch_size 219 | NAME = config.name if config.name.endswith('.h5') else config.name + '.h5' 220 | 221 | """ MODEL """ 222 | input_tensor = tf.keras.layers.Input( 223 | shape=(config.n_mels, config.n_frame, config.n_chan)) 224 | backbone = getattr(tf.keras.applications.efficientnet, config.model)( 225 | include_top=False, weights=None, input_tensor=input_tensor) 226 | 227 | out = tf.transpose(backbone.output, perm=[0, 2, 1, 3]) 228 | out = tf.keras.layers.Reshape([-1, out.shape[-1]*out.shape[-2]])(out) 229 | 230 | for i in range(config.n_layers): 231 | out = tf.keras.layers.Dense(config.n_dim)(out) 232 | out = tf.keras.layers.BatchNormalization()(out) 233 | out = tf.keras.layers.Activation('sigmoid')(out) * out 234 | 235 | out = tf.keras.layers.Dense(config.n_classes, activation='relu')(out) 236 | model = tf.keras.models.Model(inputs=input_tensor, outputs=out) 237 | 238 | lr = config.lr 239 | if config.optimizer == 'adam': 240 | opt = Adam(lr, clipvalue=config.clipvalue) 241 | elif config.optimizer == 'sgd': 242 | opt = SGD(lr, momentum=0.9, clipvalue=config.clipvalue) 243 | elif config.optimizer == 'rmsprop': 244 | opt = RMSprop(lr, momentum=0.9, clipvalue=config.clipvalue) 245 | else: 246 | opt = AdaBelief(lr, clipvalue=config.clipvalue) 247 | 248 | if config.l2 > 0: 249 | model = apply_kernel_regularizer( 250 | model, tf.keras.regularizers.l1_l2(config.l1, config.l2)) 251 | model.compile(optimizer=opt, 252 | loss=custom_loss(alpha=config.loss_alpha, l2=config.loss_l2), 253 | metrics=[cos_sim]) 254 | # model.summary() 255 | 256 | if config.pretrain: 257 | model.load_weights(NAME) 258 | print('loaded pretrained model') 259 | 260 | """ DATA """ 261 | train_set = make_dataset(config, training=True) 262 | test_set = make_dataset(config, training=False) 263 | 264 | """ TRAINING """ 265 | callbacks = [ 266 | CSVLogger(NAME.replace('.h5', '.log'), append=True), 267 | SWA(start_epoch=TOTAL_EPOCH//2, swa_freq=2), 268 | ModelCheckpoint(NAME, monitor='val_loss', save_best_only=True, 269 | verbose=1), 270 | TerminateOnNaN() 271 | ] 272 | 273 | if not config.pretrain: 274 | callbacks.append( 275 | LearningRateScheduler( 276 | custom_scheduler(4096, TOTAL_EPOCH/12, config.lr_div))) 277 | else: 278 | callbacks.append( 279 | ReduceLROnPlateau(monitor='loss', factor=0.9, patience=5)) 280 | 281 | model.fit(train_set, 282 | epochs=TOTAL_EPOCH, 283 | batch_size=BATCH_SIZE, 284 | steps_per_epoch=config.steps_per_epoch, 285 | validation_data=test_set, 286 | validation_steps=16, 287 | callbacks=callbacks) 288 | 289 | model.save(NAME.replace('.h5', '_SWA.h5')) 290 | 291 | -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from math import log, e 4 | 5 | 6 | AUTOTUNE = tf.data.experimental.AUTOTUNE 7 | EPSILON = 1e-8 8 | LOG_EPSILON = log(EPSILON) / log(e) 9 | 10 | 11 | """ FEATURE INDEPENDENT AUGMENTATIONS """ 12 | def mask(specs, axis, max_mask_size=None, n_mask=1): 13 | def make_shape(size): 14 | # returns (1, ..., size, ..., 1) 15 | shape = [1] * len(specs.shape) 16 | shape[axis] = size 17 | return tuple(shape) 18 | 19 | total = specs.shape[axis] 20 | mask = tf.ones(make_shape(total), dtype=specs.dtype) 21 | if max_mask_size is None: 22 | max_mask_size = total 23 | 24 | def apply_random_mask(mask): 25 | size = tf.random.uniform([], maxval=max_mask_size, dtype=tf.int32) 26 | offset = tf.random.uniform([], maxval=total-size, dtype=tf.int32) 27 | 28 | mask *= tf.concat( 29 | (tf.ones(shape=make_shape(offset), dtype=mask.dtype), 30 | tf.zeros(shape=make_shape(size), dtype=mask.dtype), 31 | tf.ones(shape=make_shape(total-size-offset), dtype=mask.dtype)), 32 | axis=axis) 33 | return mask 34 | 35 | i = tf.constant(0) 36 | cond = lambda i, m: i < n_mask 37 | body = lambda i, m: (i+1, apply_random_mask(m)) 38 | _, mask = tf.while_loop(cond, body, (i, mask)) 39 | 40 | return specs * mask 41 | 42 | 43 | def random_shift(specs, axis=0, width=16): 44 | new_specs = tf.pad(specs, [[0]*2 if i != axis else [width]*2 45 | for i in range(len(specs.shape))]) 46 | new_specs = tf.image.random_crop(new_specs, specs.shape) 47 | return new_specs 48 | 49 | 50 | """ MAGNITUDE-PHASE SPECTROGRAM """ 51 | def magphase_to_mel(num_mel_bins=80, 52 | num_spectrogram_bins=257, 53 | sample_rate=16000, 54 | **kwargs): 55 | mel_matrix = tf.signal.linear_to_mel_weight_matrix( 56 | num_mel_bins, num_spectrogram_bins, sample_rate, **kwargs) 57 | 58 | def _magphase_to_mel(x, y=None): 59 | ''' 60 | x: [batch_size, freq, time, chan2] 61 | 62 | output: [batch_size, mel_freq, time, chan] 63 | ''' 64 | x = x[..., :tf.shape(x)[-1] // 2] # remove phase 65 | x = tf.tensordot(x, mel_matrix, axes=[-3, 0]) # [b, time, chan, mel] 66 | 67 | if len(x.shape) == 4: 68 | x = tf.transpose(x, perm=[0, 3, 1, 2]) 69 | elif len(x.shape) == 3: 70 | x = tf.transpose(x, perm=[2, 0, 1]) 71 | else: 72 | raise ValueError('len(x.shape) must be 3 or 4') 73 | 74 | if y is None: 75 | return x 76 | return x, y 77 | return _magphase_to_mel 78 | 79 | 80 | def log_magphase(specs, labels=None, n_chan=2): 81 | specs = tf.concat( 82 | [tf.math.log(specs[..., :n_chan]+EPSILON), specs[..., n_chan:]], 83 | axis=-1) 84 | if labels is not None: 85 | return specs, labels 86 | return specs 87 | 88 | 89 | def minmax_norm_magphase(specs, labels=None): 90 | n_chan = specs.shape[-1] // 2 91 | mag = specs[..., :n_chan] 92 | phase = specs[..., n_chan:] 93 | axis = tuple(range(1, len(specs.shape))) 94 | 95 | mag_max = tf.math.reduce_max(mag, axis=axis, keepdims=True) 96 | mag_min = tf.math.reduce_min(mag, axis=axis, keepdims=True) 97 | phase_max = tf.math.reduce_max(phase, axis=axis, keepdims=True) 98 | phase_min = tf.math.reduce_min(phase, axis=axis, keepdims=True) 99 | 100 | specs = tf.concat( 101 | [(mag-mag_min)/(mag_max-mag_min+EPSILON), 102 | (phase-phase_min)/(phase_max-phase_min+EPSILON)], 103 | axis=-1) 104 | 105 | if labels is not None: 106 | return specs, labels 107 | return specs 108 | 109 | 110 | """ COMPLEX-SPECTROGRAMS """ 111 | def complex_to_magphase(complex_tensor, y=None): 112 | n_chan = complex_tensor.shape[-1] // 2 113 | real = complex_tensor[..., :n_chan] 114 | img = complex_tensor[..., n_chan:] 115 | 116 | mag = tf.math.sqrt(real**2 + img**2) 117 | phase = tf.math.atan2(img, real) 118 | 119 | magphase = tf.concat([mag, phase], axis=-1) 120 | 121 | if y is None: 122 | return magphase 123 | return magphase, y 124 | 125 | 126 | def magphase_to_complex(magphase): 127 | n_chan = magphase.shape[-1] // 2 128 | mag = magphase[..., :n_chan] 129 | phase = magphase[..., n_chan:] 130 | 131 | real = mag * tf.cos(phase) 132 | img = mag * tf.sin(phase) 133 | 134 | return tf.concat([real, img], axis=-1) 135 | 136 | 137 | def phase_vocoder(complex_spec: tf.Tensor, 138 | rate: float=1.) -> tf.Tensor: 139 | """ 140 | https://pytorch.org/audio/_modules/torchaudio/functional.html#phase_vocoder 141 | 142 | complex_spec: [freq, time, chan*2] 143 | [..., :chan] = real, [..., chan:] = imag 144 | rate: float > 0. 145 | """ 146 | if rate == 1: 147 | return complex_spec 148 | 149 | # shape = tf.shape(complex_spec) 150 | freq = complex_spec.shape[0] 151 | hop_length = freq - 1 # n_fft // 2 152 | n_chan = complex_spec.shape[-1] // 2 153 | 154 | def angle(spec): 155 | return tf.math.atan2(spec[..., n_chan:], spec[..., :n_chan]) 156 | 157 | phase_advance = tf.linspace( 158 | 0., np.pi * tf.cast(hop_length, 'float32'), freq) 159 | phase_advance = tf.reshape(phase_advance, (-1, 1, 1)) 160 | time_steps = tf.range( 161 | 0, tf.shape(complex_spec)[1], rate, dtype=complex_spec.dtype) 162 | 163 | spec = tf.pad( 164 | complex_spec, 165 | [[0, 0] if i != 1 else [0, 2] for i in range(len(complex_spec.shape))]) 166 | 167 | spec_0 = tf.gather(spec, tf.cast(time_steps, 'int32'), axis=1) 168 | spec_1 = tf.gather(spec, tf.cast(time_steps+1, 'int32'), axis=1) 169 | 170 | angle_0 = angle(spec_0) 171 | angle_1 = angle(spec_1) 172 | 173 | norm_0 = tf.norm( 174 | tf.transpose(tf.reshape(spec_0, (freq, -1, 2, n_chan)), (0, 1, 3, 2)), 175 | 2, axis=-1) 176 | norm_1 = tf.norm( 177 | tf.transpose(tf.reshape(spec_1, (freq, -1, 2, n_chan)), (0, 1, 3, 2)), 178 | 2, axis=-1) 179 | 180 | # Compute Phase Accum 181 | phase_0 = angle(spec[..., :1, :]) # first frame angle 182 | phase = angle_1 - angle_0 - phase_advance 183 | phase = phase - 2 * np.pi * tf.math.round(phase / (2 * np.pi)) 184 | phase = phase + phase_advance 185 | phase = tf.concat([phase_0, phase[:, :-1]], axis=1) 186 | phase_acc = tf.cumsum(phase, 1) 187 | 188 | alphas = tf.reshape(time_steps % 1., (1, -1, 1)) 189 | mag = alphas * norm_1 + (1 - alphas) * norm_0 190 | 191 | real = mag * tf.cos(phase_acc) 192 | imag = mag * tf.sin(phase_acc) 193 | 194 | spec = tf.concat([real, imag], axis=-1) 195 | return spec 196 | 197 | -------------------------------------------------------------------------------- /transforms_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torchaudio 4 | import numpy as np 5 | import tensorflow as tf 6 | from transforms import * 7 | 8 | 9 | class TransformsTest(tf.test.TestCase): 10 | def test_mask(self): 11 | tf.random.set_seed(100) 12 | org = np.array([[ 0, 1, 2, 3, 4], 13 | [ 5, 6, 7, 8, 9], 14 | [10, 11, 12, 13, 14], 15 | [15, 16, 17, 18, 19], 16 | [20, 21, 22, 23, 24]]) 17 | target = np.array([[ 0, 0, 0, 0, 0], 18 | [ 0, 0, 0, 0, 0], 19 | [ 0, 0, 0, 0, 0], 20 | [15, 16, 17, 18, 19], 21 | [20, 21, 22, 23, 24]]) 22 | self.assertAllEqual(target, 23 | mask(org, axis=0, max_mask_size=None, n_mask=1)) 24 | 25 | tf.random.set_seed(2020) 26 | target = np.array([[ 0, 1, 0, 3, 4], 27 | [ 0, 6, 0, 8, 9], 28 | [ 0, 11, 0, 13, 14], 29 | [ 0, 16, 0, 18, 19], 30 | [ 0, 21, 0, 23, 24]]) 31 | self.assertAllEqual(target, 32 | mask(org, axis=1, max_mask_size=3, n_mask=2)) 33 | 34 | def test_random_shift(self): 35 | tf.random.set_seed(0) 36 | org = np.array([[0, 1, 2], 37 | [3, 4, 5], 38 | [6, 7, 8]]) 39 | target = np.array([[3, 4, 5], 40 | [6, 7, 8], 41 | [0, 0, 0]]) 42 | self.assertAllEqual(target, 43 | random_shift(org, axis=0, width=2)) 44 | 45 | def test_magphase_to_mel(self): 46 | # BATCH 47 | n_mels = 80 48 | magphase = np.random.randn(32, 257, 100, 4).astype('float32') 49 | mel = magphase_to_mel(n_mels)(magphase) 50 | self.assertEqual(mel.shape, [32, n_mels, 100, 2]) 51 | 52 | # SINGLE SAMPLE 53 | magphase = np.random.randn(257, 100, 4).astype('float32') 54 | mel = magphase_to_mel(n_mels)(magphase) 55 | self.assertEqual(mel.shape, [n_mels, 100, 2]) 56 | 57 | def test_log_magphase(self): 58 | specs = np.array([[ 1, 10, 100, 0, 1, -1], 59 | [500, 50, 5, 3, -3, 0]]) 60 | t_specs = np.array([[0. , 2.302585, 4.605170, 0, 1, -1], 61 | [6.214608, 3.912023, 1.609438, 3, -3, 0]]) 62 | self.assertAllClose(t_specs, log_magphase(specs, n_chan=3)) 63 | 64 | def test_minmax_norm_magphse(self): 65 | n_sample, n_feature, n_chan = 5, 10, 2 66 | axis = tuple(range(1, 3)) 67 | mag = np.random.randn(n_sample, n_feature, n_chan) 68 | phase = np.random.rand(n_sample, n_feature, n_chan) 69 | phase = (2*phase - 1) * np.pi 70 | magphase = np.concatenate([mag, phase], axis=-1) 71 | 72 | minmax_normed = minmax_norm_magphase(magphase) 73 | mins = tf.math.reduce_min(minmax_normed, axis=axis) 74 | maxs = tf.math.reduce_max(minmax_normed, axis=axis) 75 | 76 | self.assertAllClose(mins, tf.zeros_like(mins)) 77 | self.assertAllClose(maxs, tf.ones_like(maxs)) 78 | 79 | def test_complex_to_magphase(self): 80 | complex_tensor = np.array( 81 | [[1, 0], [0, 1], [-1, 0], [0, -1]], dtype='float32') 82 | magphase = np.array( 83 | [[1, 0], [1, np.pi/2], [1, np.pi], [1, -np.pi/2]], 84 | dtype='float32') 85 | 86 | self.assertAllClose(magphase, 87 | complex_to_magphase(complex_tensor)) 88 | 89 | def test_magphase_to_complex(self): 90 | magphase = np.array( 91 | [[1, 0], [1, np.pi/2], [1, np.pi], [1, -np.pi/2]], 92 | dtype='float32') 93 | complex_tensor = np.array( 94 | [[1, 0], [0, 1], [-1, 0], [0, -1]], dtype='float32') 95 | 96 | self.assertAllClose(complex_tensor, magphase_to_complex(magphase)) 97 | 98 | def test_phase_vocoder(self): 99 | n_freq, time, chan2 = 257, 100, 6 100 | complex_spec = tf.random.normal([n_freq, time, chan2]) 101 | 102 | self.assertAllEqual(complex_spec, 103 | phase_vocoder(complex_spec, 1.)) 104 | 105 | for rate in [1.2, 0.8]: 106 | pv = phase_vocoder(complex_spec, rate=rate) 107 | self.assertAllEqual([n_freq, int(np.ceil(time/rate)), chan2], 108 | pv.shape) 109 | 110 | 111 | if __name__ == '__main__': 112 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 113 | tf.test.main() 114 | 115 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import pickle 4 | import tensorflow as tf 5 | 6 | EPSILON = 1e-8 7 | label_downsample_model = (3, 6, 7, 8, 9) 8 | 9 | 10 | ''' 11 | UTILS FOR FRAMES AND WINDOWS 12 | ''' 13 | def seq_to_windows(seq, 14 | window, 15 | skip=1, 16 | padding=True, 17 | **kwargs): 18 | ''' 19 | INPUT: 20 | seq: np.ndarray 21 | window: array of indices 22 | ex) [-3, -1, 0, 1, 3] 23 | skip: int 24 | padding: bool 25 | **kwargs: params for np.pad 26 | 27 | OUTPUT: 28 | windows: [n_windows, window_size, ...] 29 | ''' 30 | window = np.array(window - np.min(window)).astype(np.int32) 31 | win_size = max(window) + 1 32 | windows = window[np.newaxis, :] \ 33 | + np.arange(0, len(seq), skip)[:, np.newaxis] 34 | if padding: 35 | seq = np.pad( 36 | seq, 37 | [[win_size//2, (win_size-1)//2]] + [[0, 0]]*len(seq.shape[1:]), 38 | mode='constant', 39 | **kwargs) 40 | 41 | return np.take(seq, windows, axis=0) 42 | 43 | 44 | def windows_to_seq(windows, 45 | window, 46 | skip=1): 47 | ''' 48 | INPUT: 49 | windows: np.ndarray (n_windows, window_size, ...) 50 | window: array of indices 51 | skip: int 52 | 53 | OUTPUT: 54 | seq 55 | ''' 56 | n_window = windows.shape[0] 57 | window = np.array(window - np.min(window)).astype(np.int32) 58 | win_size = max(window) 59 | 60 | seq_len = (n_window-1)*skip + 1 61 | seq = np.zeros([seq_len, *windows.shape[2:]], dtype=windows.dtype) 62 | count = np.zeros(seq_len) 63 | 64 | for i, w in enumerate(window): 65 | indices = np.arange(n_window)*skip - win_size//2 + w 66 | select = np.logical_and(0 <= indices, indices < seq_len) 67 | seq[indices[select]] += windows[select, i] 68 | count[indices[select]] += 1 69 | 70 | seq = seq / (count + EPSILON) 71 | return seq 72 | 73 | 74 | ''' 75 | DATASET 76 | ''' 77 | def list_to_generator(dataset: list): 78 | def _gen(): 79 | if isinstance(dataset, tuple): 80 | for z in zip(*dataset): 81 | yield z 82 | else: 83 | for data in dataset: 84 | yield data 85 | return _gen 86 | 87 | 88 | def load_data(path): 89 | if path.endswith('.pickle'): 90 | return pickle.load(open(path, 'rb')) 91 | elif path.endswith('.npy'): 92 | return np.load(path) 93 | else: 94 | raise ValueError('invalid file format') 95 | 96 | 97 | ''' 98 | MODEL 99 | ''' 100 | def apply_kernel_regularizer(model, kernel_regularizer): 101 | model = tf.keras.models.clone_model(model) 102 | layer_types = (tf.keras.layers.Dense, tf.keras.layers.Conv2D) 103 | for layer in model.layers: 104 | if isinstance(layer, layer_types): 105 | layer.kernel_regularizer = kernel_regularizer 106 | 107 | model = tf.keras.models.clone_model(model) 108 | return model 109 | 110 | 111 | ''' 112 | ETC 113 | ''' 114 | def safe_div(x, y, eps=EPSILON): 115 | # returns safe x / max(y, epsilon) 116 | return x / tf.maximum(y, eps) 117 | 118 | 119 | def predict(model, xs, reverse_and_add=False, vad=False, **kwargs): 120 | output = model.predict(xs, **kwargs) 121 | if vad: 122 | output = output[..., :30] * tf.nn.sigmoid(output[..., 30:]) 123 | 124 | if reverse_and_add: 125 | rev_output = model.predict(tf.reverse(xs, [-1]), **kwargs) 126 | if vad: 127 | rev_output = rev_output[..., :30] * tf.nn.sigmoid(rev_output[..., 30:]) 128 | shape = rev_output.shape[:-1] 129 | rev_output = rev_output.reshape(*shape, -1, 10) 130 | rev_output = np.flip(rev_output, -1) 131 | rev_output = rev_output.reshape(*shape, -1) 132 | 133 | output = (output + rev_output) / 2 134 | return output 135 | 136 | 137 | ''' 138 | OPTIMIZER 139 | ''' 140 | class AdaBelief(tf.keras.optimizers.Optimizer): 141 | _HAS_AGGREGATE_GRAD = True 142 | 143 | def __init__(self, 144 | learning_rate=0.001, 145 | beta_1=0.9, 146 | beta_2=0.999, 147 | epsilon=1e-7, 148 | amsgrad=False, 149 | name='AdaBelief', 150 | **kwargs): 151 | super(AdaBelief, self).__init__(name, **kwargs) 152 | self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) 153 | self._set_hyper('decay', self._initial_decay) 154 | self._set_hyper('beta_1', beta_1) 155 | self._set_hyper('beta_2', beta_2) 156 | self.epsilon = epsilon or backend_config.epsilon() 157 | self.amsgrad = amsgrad 158 | 159 | def _create_slots(self, var_list): 160 | # Create slots for the first and second moments. 161 | # Separate for-loops to respect the ordering of slot variables from v1. 162 | for var in var_list: 163 | self.add_slot(var, 'm') 164 | for var in var_list: 165 | self.add_slot(var, 'v') 166 | if self.amsgrad: 167 | for var in var_list: 168 | self.add_slot(var, 'vhat') 169 | 170 | def _prepare_local(self, var_device, var_dtype, apply_state): 171 | super(AdaBelief, self)._prepare_local(var_device, var_dtype, apply_state) 172 | 173 | local_step = tf.cast(self.iterations + 1, var_dtype) 174 | beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) 175 | beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) 176 | beta_1_power = tf.math.pow(beta_1_t, local_step) 177 | beta_2_power = tf.math.pow(beta_2_t, local_step) 178 | lr = (apply_state[(var_device, var_dtype)]['lr_t'] * 179 | (tf.math.sqrt(1 - beta_2_power) / (1 - beta_1_power))) 180 | apply_state[(var_device, var_dtype)].update( 181 | dict( 182 | lr=lr, 183 | epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), 184 | beta_1_t=beta_1_t, 185 | beta_1_power=beta_1_power, 186 | one_minus_beta_1_t=1 - beta_1_t, 187 | beta_2_t=beta_2_t, 188 | beta_2_power=beta_2_power, 189 | one_minus_beta_2_t=1 - beta_2_t)) 190 | 191 | def set_weights(self, weights): 192 | params = self.weights 193 | num_vars = int((len(params) - 1) / 2) 194 | if len(weights) == 3 * num_vars + 1: 195 | weights = weights[:len(params)] 196 | super(AdaBelief, self).set_weights(weights) 197 | 198 | def _resource_apply_dense(self, grad, var, apply_state=None): 199 | var_device, var_dtype = var.device, var.dtype.base_dtype 200 | coefficients = ((apply_state or {}).get((var_device, var_dtype)) 201 | or self._fallback_apply_state(var_device, var_dtype)) 202 | 203 | # m_t = beta1 * m + (1 - beta1) * g_t 204 | m = self.get_slot(var, 'm') 205 | m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] 206 | m_t = tf.compat.v1.assign(m, 207 | m * coefficients['beta_1_t'] + m_scaled_g_values, 208 | use_locking=self._use_locking) 209 | 210 | # v_t = beta2 * v + (1 - beta2) * ((g_t-m_t) * (g_t-m_t)) 211 | v = self.get_slot(var, 'v') 212 | grad_dev = grad - m_t 213 | v_scaled_g_values = (grad_dev * grad_dev) * coefficients['one_minus_beta_2_t'] 214 | v_t = tf.compat.v1.assign(v, 215 | v * coefficients['beta_2_t'] + v_scaled_g_values, 216 | use_locking=self._use_locking) 217 | 218 | if not self.amsgrad: 219 | v_sqrt = tf.math.sqrt(v_t) 220 | var_update = tf.compat.v1.assign_sub( 221 | var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']), 222 | use_locking=self._use_locking) 223 | return tf.group(*[var_update, m_t, v_t]) 224 | else: 225 | v_hat = self.get_slot(var, 'vhat') 226 | v_hat_t = tf.math.maximum(v_hat, v_t) 227 | with ops.control_dependencies([v_hat_t]): 228 | v_hat_t = tf.compat.v1.assign( 229 | v_hat, v_hat_t, use_locking=self._use_locking) 230 | v_hat_sqrt = tf.math.sqrt(v_hat_t) 231 | var_update = tf.compat.v1.assign_sub( 232 | var, 233 | coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']), 234 | use_locking=self._use_locking) 235 | return tf.group(*[var_update, m_t, v_t, v_hat_t]) 236 | 237 | def _resource_apply_sparse(self, grad, var, indices, apply_state=None): 238 | var_device, var_dtype = var.device, var.dtype.base_dtype 239 | coefficients = ((apply_state or {}).get((var_device, var_dtype)) 240 | or self._fallback_apply_state(var_device, var_dtype)) 241 | 242 | # m_t = beta1 * m + (1 - beta1) * g_t 243 | m = self.get_slot(var, 'm') 244 | m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] 245 | m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'], 246 | use_locking=self._use_locking) 247 | with ops.control_dependencies([m_t]): 248 | m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) 249 | 250 | # v_t = beta2 * v + (1 - beta2) * ((g_t-m_t) * (g_t-m_t)) 251 | v = self.get_slot(var, 'v') 252 | grad_dev = grad - m_t 253 | v_scaled_g_values = (grad_dev * grad_dev) * coefficients['one_minus_beta_2_t'] 254 | v_t = tf.compat.v1.assign(v, v * coefficients['beta_2_t'], 255 | use_locking=self._use_locking) 256 | with ops.control_dependencies([v_t]): 257 | v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) 258 | 259 | if not self.amsgrad: 260 | v_sqrt = tf.math.sqrt(v_t) 261 | var_update = tf.compat.v1.assign_sub( 262 | var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']), 263 | use_locking=self._use_locking) 264 | return tf.group(*[var_update, m_t, v_t]) 265 | else: 266 | v_hat = self.get_slot(var, 'vhat') 267 | v_hat_t = tf.math.maximum(v_hat, v_t) 268 | with ops.control_dependencies([v_hat_t]): 269 | v_hat_t = tf.compat.v1.assign( 270 | v_hat, v_hat_t, use_locking=self._use_locking) 271 | v_hat_sqrt = tf.math.sqrt(v_hat_t) 272 | var_update = tf.compat.v1.assign_sub( 273 | var, 274 | coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']), 275 | use_locking=self._use_locking) 276 | return tf.group(*[var_update, m_t, v_t, v_hat_t]) 277 | 278 | def get_config(self): 279 | config = super(AdaBelief, self).get_config() 280 | config.update({ 281 | 'learning_rate': self._serialize_hyperparameter('learning_rate'), 282 | 'decay': self._serialize_hyperparameter('decay'), 283 | 'beta_1': self._serialize_hyperparameter('beta_1'), 284 | 'beta_2': self._serialize_hyperparameter('beta_2'), 285 | 'epsilon': self.epsilon, 286 | 'amsgrad': self.amsgrad, 287 | }) 288 | return config 289 | 290 | 291 | def sigmoid_focal_crossentropy( 292 | y_true, 293 | y_pred, 294 | alpha = 0.25, 295 | gamma = 2.0, 296 | from_logits: bool = False, 297 | ) -> tf.Tensor: 298 | """Implements the focal loss function. 299 | 300 | Focal loss was first introduced in the RetinaNet paper 301 | (https://arxiv.org/pdf/1708.02002.pdf). Focal loss is extremely useful for 302 | classification when you have highly imbalanced classes. It down-weights 303 | well-classified examples and focuses on hard examples. The loss value is 304 | much high for a sample which is misclassified by the classifier as compared 305 | to the loss value corresponding to a well-classified example. One of the 306 | best use-cases of focal loss is its usage in object detection where the 307 | imbalance between the background class and other classes is extremely high. 308 | 309 | Args: 310 | y_true: true targets tensor. 311 | y_pred: predictions tensor. 312 | alpha: balancing factor. 313 | gamma: modulating factor. 314 | 315 | Returns: 316 | Weighted loss float `Tensor`. If `reduction` is `NONE`,this has the 317 | same shape as `y_true`; otherwise, it is scalar. 318 | """ 319 | if gamma and gamma < 0: 320 | raise ValueError("Value of gamma should be greater than or equal to zero.") 321 | 322 | y_pred = tf.convert_to_tensor(y_pred) 323 | y_true = tf.cast(y_true, dtype=y_pred.dtype) 324 | 325 | # Get the cross_entropy for each entry 326 | ce = tf.keras.backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits) 327 | 328 | # If logits are provided then convert the predictions into probabilities 329 | if from_logits: 330 | pred_prob = tf.sigmoid(y_pred) 331 | else: 332 | pred_prob = y_pred 333 | 334 | p_t = (y_true * pred_prob) + ((1 - y_true) * (1 - pred_prob)) 335 | alpha_factor = 1.0 336 | modulating_factor = 1.0 337 | 338 | if alpha: 339 | alpha = tf.cast(alpha, dtype=y_true.dtype) 340 | alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha) 341 | 342 | if gamma: 343 | gamma = tf.cast(gamma, dtype=y_true.dtype) 344 | modulating_factor = tf.pow((1.0 - p_t), gamma) 345 | 346 | # compute the final loss and return 347 | return tf.reduce_mean(tf.reduce_sum(alpha_factor * modulating_factor * ce, axis=-1), axis=-1) 348 | 349 | 350 | def unitwise_norm(x): 351 | if len(x.get_shape()) <= 1: # Scalars and vectors 352 | axis = None 353 | keepdims = False 354 | elif len(x.get_shape()) in [2, 3]: # Linear layers of shape IO or multihead linear 355 | axis = 0 356 | keepdims = True 357 | elif len(x.get_shape()) == 4: # Conv kernels of shape HWIO 358 | axis = [0, 1, 2,] 359 | keepdims = True 360 | else: 361 | raise ValueError(f"Got a parameter with shape not in [1, 2, 4]! {x}") 362 | return compute_norm(x, axis, keepdims) 363 | 364 | 365 | def compute_norm(x, axis, keepdims): 366 | return tf.math.reduce_sum(x ** 2, axis=axis, keepdims=keepdims) ** 0.5 367 | 368 | -------------------------------------------------------------------------------- /utils_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | from utils import * 4 | 5 | 6 | class UtilsTest(unittest.TestCase): 7 | def test_seq_to_windows(self): 8 | seq = np.array([1, 2, 3, 4, 5]) 9 | window = np.array([-3, -1, 0, 1, 3]) 10 | 11 | target = np.array([[0, 0, 1, 2, 4], 12 | [0, 1, 2, 3, 5], 13 | [0, 2, 3, 4, 0], 14 | [1, 3, 4, 5, 0], 15 | [2, 4, 5, 0, 0]]) 16 | self.assertEqual(target.tolist(), 17 | seq_to_windows(seq, window).tolist()) 18 | self.assertEqual(target[::2].tolist(), 19 | seq_to_windows(seq, window, 2).tolist()) 20 | 21 | def test_windows_to_seq(self): 22 | windows = np.array([[0, 0, 1, 2, 4], 23 | [0, 1, 2, 3, 5], 24 | [0, 2, 3, 4, 0], 25 | [1, 3, 4, 5, 0], 26 | [2, 4, 5, 0, 0]]) 27 | window = np.array([-3, -1, 0, 1, 3]) 28 | 29 | target = np.array([1, 2, 3, 4, 5]) 30 | self.assertTrue( 31 | np.allclose(target, windows_to_seq(windows, window))) 32 | self.assertTrue( 33 | np.allclose(target, windows_to_seq(windows[::2], window, skip=2))) 34 | 35 | def test_list_to_generator(self): 36 | n_samples = 4 37 | x = np.random.randn(n_samples, 30) 38 | y = np.random.randn(n_samples) 39 | 40 | x_gen = list_to_generator(x) 41 | self.assertTrue(callable(x_gen)) 42 | for i, x_ in enumerate(x_gen()): 43 | self.assertEqual(x[i].tolist(), x_.tolist()) 44 | 45 | xy_gen = list_to_generator((x, y)) 46 | self.assertTrue(callable(xy_gen)) 47 | for i, (x_, y_) in enumerate(xy_gen()): 48 | self.assertEqual(x[i].tolist(), x_.tolist()) 49 | self.assertEqual(y[i], y_) 50 | 51 | def test_load_data(self): 52 | raise NotImplemented('TODO: not yet implemented') 53 | 54 | def test_apply_kernel_regularizer(self): 55 | n_samples, in_shape, out_shape = 128, 4, 4 56 | x = np.random.randn(n_samples, in_shape) 57 | y = np.random.randint(out_shape, size=n_samples) 58 | 59 | # model without regularizer 60 | tf.random.set_seed(0) 61 | model = tf.keras.models.Sequential() 62 | model.add(tf.keras.layers.Input(shape=(in_shape,))) 63 | model.add(tf.keras.layers.Dense(out_shape, activation='softmax')) 64 | model.compile(optimizer='adam', loss='sparse_categorical_crossentropy') 65 | 66 | model.fit(x, y, verbose=False) 67 | base_weights = model.weights[:] 68 | 69 | # model with regularizer 70 | tf.random.set_seed(0) 71 | model = tf.keras.models.Sequential() 72 | model.add(tf.keras.layers.Input(shape=(in_shape,))) 73 | model.add(tf.keras.layers.Dense(out_shape, activation='softmax')) 74 | 75 | model = apply_kernel_regularizer(model, tf.keras.regularizers.l2(0.1)) 76 | model.compile(optimizer='adam', loss='sparse_categorical_crossentropy') 77 | 78 | model.fit(x, y, verbose=False) 79 | new_weights = model.weights[:] 80 | 81 | for b, n in zip(base_weights, new_weights): 82 | self.assertNotEqual(b.numpy().tolist(), n.numpy().tolist()) 83 | 84 | def test_safe_div(self): 85 | raise NotImplemented('TODO: not yet implemented') 86 | 87 | def test_predict(self): 88 | raise NotImplemented('TODO: not yet implemented') 89 | 90 | def test_adabelief(self): 91 | n_samples, in_shape, out_shape = 128, 4, 4 92 | x = np.random.randn(n_samples, in_shape) 93 | y = np.random.randint(out_shape, size=n_samples) 94 | 95 | # AdaBelief 96 | tf.random.set_seed(0) 97 | model = tf.keras.models.Sequential() 98 | model.add(tf.keras.layers.Input(shape=(in_shape,))) 99 | model.add(tf.keras.layers.Dense(out_shape, activation='softmax')) 100 | model.compile(optimizer=AdaBelief(), loss='sparse_categorical_crossentropy') 101 | model.fit(x, y, epochs=32, verbose=True) 102 | print() 103 | 104 | # Adam 105 | tf.random.set_seed(0) 106 | model = tf.keras.models.Sequential() 107 | model.add(tf.keras.layers.Input(shape=(in_shape,))) 108 | model.add(tf.keras.layers.Dense(out_shape, activation='softmax')) 109 | model.compile(optimizer='adam', loss='sparse_categorical_crossentropy') 110 | model.fit(x, y, epochs=32, verbose=True) 111 | 112 | if __name__ == '__main__': 113 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 114 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 115 | unittest.main() 116 | --------------------------------------------------------------------------------