├── main.sh
├── .gitignore
├── test.sh
├── utils
│   ├── extract.sh
│   ├── loss.py
│   ├── setup.py
│   ├── check_data_len.py
│   ├── utils.py
│   ├── vad.py
│   ├── utilities.py
│   ├── resnet.py
│   └── feature_extractor.py
├── README.md
├── config
│   └── config.yml
├── LICENSE.md
├── main.py
├── model_utilities.py
├── datasets.py
├── trainer.py
└── model.py

/main.sh:
--------------------------------------------------------------------------------
1 | python3 main.py -b './' -c './config/' -d 'config' -m 'Train'
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.zip
2 | *.idea
3 | __pycache__/
4 | *.pt
5 | *.swp
6 | *.log
7 | ./out_train
8 | *.wav
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | #python3 main.py -b './' -c './config/' -d 'config' -m 'Train'
2 | python3 main.py -b './' -c './config/' -d 'config' -m 'Test'
3 | 
--------------------------------------------------------------------------------
/utils/extract.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Data directory
4 | DATASET_DIR='/home/nas/DB/AI_grand_challenge_2020/2020/t3_audio/'
5 | 
6 | # Feature directory
7 | FEATURE_DIR='/home/minseok/Audio/AI_Challenge/features'
8 | 
9 | # Workspace
10 | WORKSPACE='/home/minseok/Audio/AI_Challenge'
11 | cd $WORKSPACE
12 | 
13 | ########### Hyper-parameters ###########
14 | FEATURE_TYPE='logmelgccintensity'   # 'logmel' | 'logmelgcc' | 'logmelintensity' | 'logmelgccintensity'
15 | AUDIO_TYPE='mic'                    # 'mic' | 'foa' | 'foa&mic'
16 | 
17 | ############ Extract Features ############
18 | # dev
19 | python utils/feature_extractor.py --dataset_dir=$DATASET_DIR --feature_dir=$FEATURE_DIR --feature_type=$FEATURE_TYPE --data_type='dev' --audio_type=$AUDIO_TYPE
20 | 
21 | # eval
22 | python utils/feature_extractor.py --dataset_dir=$DATASET_DIR --feature_dir=$FEATURE_DIR --feature_type=$FEATURE_TYPE --data_type='eval' --audio_type=$AUDIO_TYPE
23 | 
24 | 
25 | 
26 | 
--------------------------------------------------------------------------------
/utils/loss.py:
--------------------------------------------------------------------------------
1 | from torch.nn import functional as F
2 | from torch import nn
3 | import torch
4 | import pdb
5 | 
6 | def create_criterion(loss_name):
7 | 
8 |     if loss_name == 'CTCLoss':
9 |         criterion = nn.CTCLoss(blank=2, zero_infinity=True)
10 |     elif loss_name == 'CrossEntropy':
11 |         criterion = nn.CrossEntropyLoss()
12 |     elif loss_name == 'BCEWithLogits':
13 |         criterion = nn.BCEWithLogitsLoss()
14 |     elif loss_name == 'regression':
15 |         criterion = mean_error
16 |     elif loss_name == "BCE":
17 |         criterion = nn.BCELoss()
18 |     return criterion
19 | 
20 | 
21 | def mean_error(output, target, loss_type='MSE'):
22 | 
23 |     # Align the time_steps of output and target
24 |     #pdb.set_trace()
25 |     N = min(output.shape[1], target.shape[1])
26 | 
27 |     output = output[:, 0: N, :]
28 |     target = target[:, 0: N, :]
29 | 
30 |     out = torch.sqrt(torch.sum((output - target)**2))
31 | 
32 |     return out
33 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sound-Source-Localization
2 | Sound Source Localization study for AI Grand Challenge 2021 (sponsored by NC Soft Vision Lab)
3 | 
4 | ## Preparation
5 | ### 1. Create the environment.
6 | ```
7 | $ cd Sound-Source-Localization/
8 | $ conda create -y -n varco python=3.8
9 | $ conda activate varco
10 | 
11 | #### select the command matching your CUDA version: https://pytorch.org/ ####
12 | $ conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c nvidia
13 | 
14 | $ conda install -y pandas h5py scipy
15 | $ conda install -y pysoundfile librosa youtube-dl tqdm -c conda-forge
16 | $ pip install PyYAML
17 | $ pip install tensorboard
18 | ```
19 | 
20 | ### 2. Place the csv file listing the data in the path below.
21 | ```
22 | Sound-Source-Localization/
23 | └── dataset/
24 |     └── dataset.csv
25 | ```
26 | 
27 | ### 3. Run main.sh for training
28 | ```
29 | $ cd Sound-Source-Localization/
30 | $ sh main.sh
31 | ```
32 | 
33 | ### 4. Run test.sh for testing
34 | ```
35 | $ cd Sound-Source-Localization/
36 | $ sh test.sh
37 | ```
38 | 
39 | ## Acknowledgement
40 | 이 데이터는 2021년도 정부(과학기술정보통신부)의 재원으로 정보통신기획평가원의 지원을 받아 수행된 연구의 결과물임 (No.171125972, 인명 구조용 드론을 위한 영상/음성 인지 기술 개발)
41 | 
42 | This work was supported by Institute of Information & communications Technology Planning & Evaluation (IITP) grant funded by the Korea government(MSIT) (No.171125972, Audio-Visual Perception for Autonomous Rescue Drones)
43 | 
44 | 
--------------------------------------------------------------------------------
/config/config.yml:
--------------------------------------------------------------------------------
1 | use_tb_logger: true
2 | 
3 | #### datasets
4 | datasets:
5 | 
6 |   csv: ./dataset/dataset.csv
7 |   test: ./test_dataset/
8 | 
9 | dataloader:
10 |   train:
11 |     batch_size: 32
12 |     shuffle: true
13 |     # pin_memory: true
14 |     #num_workers: 20
15 |     num_workers: 20
16 | 
17 |   valid:
18 |     batch_size: 32
19 |     shuffle: true
20 |     # pin_memory: true
21 |     num_workers: 20
22 | 
23 |   test:
24 |     batch_size: 1
25 |     shuffle: false
26 |     # pin_memory: true
27 |     num_workers: 0
28 | 
29 | #### network structures
30 | MYNET:
31 |   embed_size: 8
32 |   sequence_size: 16 # Temporal duration of input clips
33 |   encoder: resnet50
34 |   n_classes: 2
35 |   input_size: 224
36 |   pretrained: true
37 |   num_layers: 1
38 |   bidirectional: false
39 | 
40 | #### training settings: learning rate scheme, loss
41 | trainer:
42 |   epochs: 10000
43 |   device: 1
44 |   save_path: '''PATH WHERE THE MODEL WILL BE SAVED'''
45 |   #ckpt_path: '''PATH OF THE MODEL TO BE LOADED'''
46 |   comment: no comment
47 | 
48 | tester:
49 |   ckpt_path: '''PATH OF THE MODEL TO BE LOADED'''
50 |   device: 2
51 | 
52 | 
53 | criterion:
54 |   #name: regression
55 |   #name: BCEWithLogits
56 |   name: BCE
57 | 
58 | #### Optimizer settings
59 | # optimizer:
60 | #   name: Adam ### Adam, RMSprop, SGD
61 | #   lr: !!float 1e-3
62 | #   weight_decay: 0
63 | #   eps: !!float 1e-3
64 | optimizer:
65 |   name: Adam ## Adam, RMSprop, SGD
66 |   lr: !!float 0.0001
67 |   # betas: (0.9, 0.999)
68 |   eps: !!float 1e-5
69 |   weight_decay: !!float 1e-3
70 | 
71 | 
72 | #### scheduler settings
73 | scheduler:
74 |   name: plateau
75 |   min_lr: !!float 1e-8
76 |   patience: 2
77 |   factor: 0.5
78 | 
--------------------------------------------------------------------------------
/utils/setup.py:
--------------------------------------------------------------------------------
1 | from torch import optim
2 | from torch.optim import lr_scheduler
3 | import torch
4 | 
5 | def setup_solver(parameters, config):
6 |     if config['optimizer']['name'] == 'Adam':
7 |         optimizer = optim.Adam(parameters, lr=config['optimizer']['lr'], weight_decay=config['optimizer']['weight_decay'],
8 |                                eps=config['optimizer']['eps'])
9 |                                # eps=1e-3 if
args.fp16 else 1e-8) 10 | elif config['optimizer']['name'] == 'SGD': 11 | optimizer = optim.SGD(parameters, lr=config['optimizer']['lr'], weight_decay=config['optimizer']['weight_decay'], 12 | momentum=0.9, nesterov=True) 13 | 14 | 15 | if config['scheduler']['name'] == 'plateau': 16 | scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=config['scheduler']['patience'], threshold=0.01, cooldown=0, 17 | threshold_mode='abs', mode='max', factor=config['scheduler']['factor']) 18 | 19 | 20 | elif config['scheduler']['name'] == 'step': 21 | scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9, last_epoch=-1) 22 | 23 | elif config['scheduler']['name'] == 'exp': 24 | scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.99) 25 | 26 | 27 | 28 | elif config['scheduler']['name'] == 'cycle': 29 | scheduler = torch.optim.lr_scheduler.OneCycleLR( 30 | optimizer, 31 | max_lr=0.001, 32 | total_steps=10000, 33 | pct_start=0.3, 34 | base_momentum=0.9*0.95, 35 | max_momentum=0.95, 36 | final_div_factor=1/0.0001, 37 | ) 38 | 39 | # else: # args.scheduler == 'step': 40 | # scheduler = lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.gamma) 41 | 42 | return optimizer, scheduler 43 | 44 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | (c) 2022 NCSOFT Corporation & Sogang University. All Rights Reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | 13 | Any questions about our licensed work can be sent to opensource@ncsoft.com. 14 | ________________________________________ 15 | 16 | This software uses Open Source Software (OSS). You can find the link for the source code of these open source projects, along with applicable license information, below. 17 | 18 | DCASE2019-TASK3
19 | https://github.com/yinkalario/DCASE2019-TASK3
20 | Copyright (c) 2019 Yin Cao
21 | MIT License
22 | See license text at https://github.com/yinkalario/DCASE2019-TASK3/blob/master/Licenses/MIT_LICENSE 23 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | import sys 4 | sys.path.append('./') 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import pdb 9 | import yaml 10 | import numpy as np 11 | from torch.utils.data import DataLoader 12 | import os 13 | import pickle 14 | from pathlib import Path 15 | from trainer import ModelTrainer, ModelTester 16 | from utils.setup import setup_solver 17 | from utils.loss import create_criterion 18 | from utils.utils import tr_val_split 19 | from datasets import Audio_Reader, Audio_Collate, Test_Reader 20 | from model import pretrained_Gated_CRNN8 21 | 22 | def train(config): 23 | 24 | #pdb.set_trace() 25 | '''Dataset Preparation''' 26 | train_list, val_list = tr_val_split(config['datasets']['csv']) 27 | 28 | '''Data loader''' 29 | train_dataset = Audio_Reader(train_list) 30 | train_loader = DataLoader(dataset=train_dataset, batch_size=config['dataloader']['train']['batch_size'], shuffle=True, collate_fn=lambda x: Audio_Collate(x), num_workers=config['dataloader']['train']['num_workers']) 31 | valid_dataset = Audio_Reader(val_list) 32 | valid_loader = DataLoader(dataset=valid_dataset, batch_size=config['dataloader']['valid']['batch_size'], shuffle=True, collate_fn=lambda x: Audio_Collate(x), num_workers=config['dataloader']['valid']['num_workers']) 33 | 34 | '''Model / Loss Criterion / Optimizer/ Scheduler''' 35 | SSL_model = pretrained_Gated_CRNN8(10) 36 | criterion = create_criterion(config['criterion']['name']) 37 | optimizer, scheduler = setup_solver(SSL_model.parameters(), config) 38 | 39 | '''Trainer''' 40 | trainer = ModelTrainer(SSL_model, train_loader, valid_loader, criterion, optimizer, scheduler, config, **config['trainer']) 41 | trainer.train() 42 | 43 | def test(config): 44 | 45 | test_dataset = Test_Reader(config['datasets']['test']) 46 | test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False, pin_memory = True, num_workers=0) 47 | 48 | SSL_model = pretrained_Gated_CRNN8(10) 49 | 50 | tester = ModelTester(SSL_model, test_loader, config['tester']['ckpt_path'], config['tester']['device']) 51 | tester.test() 52 | 53 | if __name__ == '__main__': 54 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('-b', '--base_dir', type=str, default='.', help='Root directory') 57 | parser.add_argument('-c', '--config', type=str, help='Path to option YAML file.') 58 | parser.add_argument('-d', '--dataset', type=str, help='Dataset') 59 | parser.add_argument('-m', '--mode', type=str, help='Train or Test') 60 | args = parser.parse_args() 61 | 62 | '''Load Config''' 63 | with open(os.path.join(args.config, args.dataset + '.yml'), mode='r') as f: 64 | config = yaml.load(f,Loader=yaml.FullLoader) 65 | 66 | if args.mode == 'Train': 67 | train(config) 68 | elif args.mode == 'Test': 69 | test(config) 70 | -------------------------------------------------------------------------------- /utils/check_data_len.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from torch import nn 4 | import pdb 5 | import os 6 | import numpy as np 7 | import csv 8 | import random 9 | import librosa 10 | 11 | 
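# Descriptive overview of this standalone script (added annotation; everything
# stated here is taken from the code below): it rebuilds the seeded, stratified
# 90/10 train/valid split of the emotion data (classes negative / neutral /
# positive derived from the 'mul_emotion' column, filtered by correct.csv and
# capped at 7113 items per class), then loads every selected wav at 16 kHz with
# librosa and prints how many clips fall into each duration bucket
# (<5 s, 5-10 s, 10-15 s, 15-20 s, >20 s).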
split_ratio = 0.9 12 | seed_num = 100 13 | random.seed(seed_num) 14 | 15 | correct_csv_file = '../correct.csv' 16 | with open(correct_csv_file, newline='') as f: 17 | reader = csv.reader(f) 18 | tmp = list(reader) 19 | correct_list = [x[-1].split('/')[-1] for x in tmp] 20 | f.close() 21 | 22 | data_path = '../dataset' 23 | csv_path = '../csv' 24 | csv_list = [f for f in os.listdir(csv_path) if f.split('.')[-1] == 'csv'] 25 | 26 | total_list = [] 27 | for fi in csv_list: 28 | csv_file = os.path.join(csv_path,fi) 29 | #with open(csv_file, newline='',encoding='cp949') as f: 30 | with open(csv_file, newline='') as f: 31 | reader = csv.reader(f) 32 | tmp = list(reader) 33 | data_name_list = tmp[0] 34 | total_list += tmp[1:] 35 | f.close() 36 | 37 | t_audio, v_audio = [], [] 38 | t_text, v_text = [], [] 39 | t_label, v_label = [], [] 40 | class_dict = {'negative': [],'neutral': [],'positive': []} 41 | data_label_idx = data_name_list.index('mul_emotion') 42 | class_dict['negative'] = [(x,'negative') for x in total_list if x[data_label_idx] != 'happy' and x[data_label_idx] != 'neutral' and x[data_label_idx] != 'surprise' and x[0].split('/')[-1] in correct_list] 43 | class_dict['neutral'] = [(x,'neutral') for x in total_list if x[data_label_idx] == 'neutral' and x[0].split('/')[-1] in correct_list] 44 | class_dict['positive'] = [(x,'positive') for x in total_list if x[data_label_idx] == 'happy' and x[0].split('/')[-1] in correct_list] 45 | 46 | 47 | train_list,val_list = [],[] 48 | data_num = 7113 49 | for class_name in class_dict.keys(): 50 | temp_list = class_dict[class_name] 51 | random.shuffle(temp_list) 52 | if len(temp_list) < data_num: 53 | split_idx = int(split_ratio*len(temp_list)) 54 | train_list += temp_list[:split_idx] 55 | val_list += temp_list[split_idx:len(temp_list)] 56 | else: 57 | split_idx = int(split_ratio*data_num) 58 | train_list += temp_list[:split_idx] 59 | val_list += temp_list[split_idx:data_num] 60 | 61 | 62 | '''Train Data''' 63 | for data in train_list: 64 | t_label.append(data[1].replace("\ufeff","")) 65 | t_audio.append(os.path.join(data_path,data[0][0])) 66 | t_text.append(data[0][1]) 67 | 68 | '''Valid Data''' 69 | for data in val_list: 70 | v_label.append(data[1].replace("\ufeff","")) 71 | v_audio.append(os.path.join(data_path,data[0][0])) 72 | v_text.append(data[0][1]) 73 | 74 | 75 | count_dict = {} 76 | count_dict["over_twenty"] = 0 77 | count_dict["over_fifteen"] = 0 78 | count_dict["over_ten"] = 0 79 | count_dict["over_five"] = 0 80 | count_dict["below_five"] = 0 81 | 82 | pdb.set_trace() 83 | for audio_path in t_audio: 84 | audio, _ = librosa.load(audio_path, sr=16000, dtype=np.float32) 85 | audio_len = len(audio) / 16000 86 | print("{}: {}".format(audio_path,audio_len)) 87 | if audio_len > 20: 88 | count_dict["over_twenty"] +=1 89 | elif audio_len > 15: 90 | count_dict["over_fifteen"] +=1 91 | elif audio_len > 10: 92 | count_dict["over_ten"] +=1 93 | elif audio_len > 5: 94 | count_dict["over_five"] +=1 95 | else: 96 | count_dict["below_five"] +=1 97 | 98 | for audio_path in v_audio: 99 | audio, _ = librosa.load(audio_path, sr=16000, dtype=np.float32) 100 | audio_len = len(audio) / 16000 101 | print("{}: {}".format(audio_path,audio_len)) 102 | if audio_len > 20: 103 | count_dict["over_twenty"] +=1 104 | elif audio_len > 15: 105 | count_dict["over_fifteen"] +=1 106 | elif audio_len > 10: 107 | count_dict["over_ten"] +=1 108 | elif audio_len > 5: 109 | count_dict["over_five"] +=1 110 | else: 111 | count_dict["below_five"] +=1 112 | 113 | 114 | for class_name in 
count_dict.keys(): 115 | print('{}: {}'.format(class_name, count_dict[class_name])) -------------------------------------------------------------------------------- /model_utilities.py: -------------------------------------------------------------------------------- 1 | ''' 2 | MIT License 3 | 4 | Copyright (c) 2019 Yin Cao 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | ''' 24 | 25 | import math 26 | import pdb 27 | 28 | import numpy as np 29 | import torch 30 | import torch.nn as nn 31 | import torch.nn.functional as F 32 | 33 | 34 | def interpolate(x, ratio): 35 | ''' 36 | Interpolate the x to have equal time steps as targets 37 | Input: 38 | x: (batch_size, time_steps, class_num) 39 | Output: 40 | out: (batch_size, time_steps*ratio, class_num) 41 | ''' 42 | (batch_size, time_steps, classes_num) = x.shape 43 | upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1) 44 | upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num) 45 | 46 | return upsampled 47 | 48 | 49 | def init_layer(layer, nonlinearity='leaky_relu'): 50 | ''' 51 | Initialize a layer 52 | ''' 53 | classname = layer.__class__.__name__ 54 | if (classname.find('Conv') != -1) or (classname.find('Linear') != -1): 55 | nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity) 56 | if hasattr(layer, 'bias'): 57 | if layer.bias is not None: 58 | nn.init.constant_(layer.bias, 0.0) 59 | elif classname.find('BatchNorm') != -1: 60 | nn.init.normal_(layer.weight, 1.0, 0.02) 61 | nn.init.constant_(layer.bias, 0.0) 62 | 63 | 64 | def init_mask(layer, nonlinearity='leaky_relu'): 65 | ''' 66 | Initialize a layer 67 | ''' 68 | classname = layer.__class__.__name__ 69 | if (classname.find('Conv') != -1) or (classname.find('Linear') != -1): 70 | nn.init.constant_(layer.weight, 1.0) 71 | if hasattr(layer, 'bias'): 72 | if layer.bias is not None: 73 | nn.init.constant_(layer.bias, 0.0) 74 | elif classname.find('BatchNorm') != -1: 75 | nn.init.normal_(layer.weight, 1.0, 0.02) 76 | nn.init.constant_(layer.bias, 0.0) 77 | 78 | 79 | def init_gru(rnn): 80 | """Initialize a GRU layer. 
""" 81 | 82 | def _concat_init(tensor, init_funcs): 83 | (length, fan_out) = tensor.shape 84 | fan_in = length // len(init_funcs) 85 | 86 | for (i, init_func) in enumerate(init_funcs): 87 | init_func(tensor[i * fan_in : (i + 1) * fan_in, :]) 88 | 89 | def _inner_uniform(tensor): 90 | fan_in = nn.init._calculate_correct_fan(tensor, 'fan_in') 91 | nn.init.uniform_(tensor, -math.sqrt(3 / fan_in), math.sqrt(3 / fan_in)) 92 | 93 | for i in range(rnn.num_layers): 94 | _concat_init( 95 | getattr(rnn, 'weight_ih_l{}'.format(i)), 96 | [_inner_uniform, _inner_uniform, _inner_uniform] 97 | ) 98 | torch.nn.init.constant_(getattr(rnn, 'bias_ih_l{}'.format(i)), 0) 99 | 100 | _concat_init( 101 | getattr(rnn, 'weight_hh_l{}'.format(i)), 102 | [_inner_uniform, _inner_uniform, nn.init.orthogonal_] 103 | ) 104 | torch.nn.init.constant_(getattr(rnn, 'bias_hh_l{}'.format(i)), 0) 105 | 106 | 107 | class ConvBlock(nn.Module): 108 | def __init__(self, in_channels, out_channels, 109 | kernel_size=(3,3), stride=(1,1), padding=(1,1)): 110 | 111 | super().__init__() 112 | 113 | self.conv1 = nn.Conv2d(in_channels=in_channels, 114 | out_channels=out_channels, 115 | kernel_size=kernel_size, stride=stride, 116 | padding=padding, bias=False) 117 | 118 | self.conv2 = nn.Conv2d(in_channels=out_channels, 119 | out_channels=out_channels, 120 | kernel_size=kernel_size, stride=stride, 121 | padding=padding, bias=False) 122 | 123 | self.bn1 = nn.BatchNorm2d(out_channels) 124 | self.bn2 = nn.BatchNorm2d(out_channels) 125 | 126 | self.init_weights() 127 | 128 | def init_weights(self): 129 | 130 | init_layer(self.conv1) 131 | init_layer(self.conv2) 132 | init_layer(self.bn1) 133 | init_layer(self.bn2) 134 | 135 | def forward(self, x, pool_type='avg', pool_size=(2, 2)): 136 | 137 | x = F.relu_(self.bn1(self.conv1(x))) 138 | x = F.relu_(self.bn2(self.conv2(x))) 139 | if pool_type == 'avg': 140 | x = F.avg_pool2d(x, kernel_size=pool_size) 141 | elif pool_type == 'max': 142 | x = F.max_pool2d(x, kernel_size=pool_size) 143 | elif pool_type == 'frac': 144 | fractional_maxpool2d = nn.FractionalMaxPool2d(kernel_size=pool_size, output_ratio=1/np.sqrt(2)) 145 | x = fractional_maxpool2d(x) 146 | 147 | return x 148 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from torch import nn 4 | from utils.resnet import resnet18, resnet50, resnet101 5 | import pdb 6 | import random 7 | import os 8 | import csv 9 | Encoder = namedtuple('Encoder', ('model', 'features', 'features_shape')) 10 | 11 | 12 | 13 | def tr_val_split(data_csv): 14 | split_ratio = 0.9 15 | seed_num = 100 16 | random.seed(seed_num) 17 | 18 | #pdb.set_trace() 19 | csv_path = data_csv 20 | 21 | total_list = [] 22 | csv_file = os.path.join(csv_path) 23 | #with open(csv_file, newline='',encoding='cp949') as f: 24 | with open(csv_file, newline='') as f: 25 | reader = csv.reader(f) 26 | tmp = list(reader) 27 | total_list += tmp 28 | f.close() 29 | 30 | #pdb.set_trace() 31 | class_dict = {} 32 | 33 | for data in total_list: 34 | data = data[0] 35 | class_name = data.split('/')[-2] 36 | 37 | if class_name not in class_dict.keys(): 38 | class_dict[class_name] = [] 39 | class_dict[class_name].append(data) 40 | 41 | #pdb.set_trace() 42 | 43 | train_list,val_list = [],[] 44 | 45 | for class_name in class_dict.keys(): 46 | temp_list = class_dict[class_name] 47 | random.shuffle(temp_list) 48 | 49 | 
split_idx = int(split_ratio*len(temp_list)) 50 | train_list += temp_list[:split_idx] 51 | val_list += temp_list[split_idx:len(temp_list)] 52 | 53 | #pdb.set_trace() 54 | 55 | return train_list, val_list 56 | 57 | 58 | 59 | def make_encoder(name, input_size=224, input_channels=3, pretrained=True, pretrain_path=None): 60 | """Make encoder (backbone) with a given name and parameters""" 61 | 62 | features_size = input_size // 32 63 | num_features = 2048 64 | # if name.startswith('resnet'): 65 | if name == 'resnet50': 66 | model = resnet50(pretrained=True) 67 | features = nn.Sequential(*list(model.children())[:-2]) 68 | # features[0] = nn.Conv2d(in_channels=4, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False) 69 | num_features = 512 if int(name[6:]) < 50 else 2048 70 | 71 | features_shape = (num_features, features_size, features_size) 72 | return Encoder(model, features, features_shape) 73 | 74 | elif name == 'resnet101': 75 | model = resnet101(pretrained=True) 76 | features = nn.Sequential(*list(model.children())[:-2]) 77 | #num_features = 512 if int(name[6:]) < 50 else 2048 78 | #features_shape = (num_features, features_size, features_size) 79 | return features#Encoder(model, features, features_shape) 80 | 81 | 82 | elif name == 'resnet18': 83 | print('resnet18') 84 | model = resnet18(pretrained=True) 85 | features = nn.Sequential(*list(model.children())[:-3]) 86 | # features_shape = (num_features, features_size, features_size) 87 | return features #Encoder(model, features, features_shape) 88 | 89 | 90 | 91 | 92 | elif name == 'resnet2p1d': 93 | model = resnet2p1d.generate_model(model_depth=18) 94 | # n_classes=opt.n_classes, 95 | # n_input_channels=opt.n_input_channels, 96 | # shortcut_type='B', 97 | # conv1_t_size=opt.conv1_t_size, 98 | # conv1_t_stride=opt.conv1_t_stride, 99 | # no_max_pool=opt.no_max_pool, 100 | # widen_factor=opt.resnet_widen_factor) 101 | 102 | # model_without_last = nn.Sequential(*list(model.children())[:-1]).state_dict() 103 | #model_dict = model.state_dict() 104 | #pretrained_dict = torch.load(pretrain_path, map_location='cpu')['state_dict'] 105 | #pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'fc' not in k} 106 | #model_dict.update(pretrained_dict) 107 | #model.load_state_dict(model_dict) 108 | features = nn.Sequential(*list(model.children())[:-3]) 109 | #features[0] = nn.Conv3d(4, 110, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False) 110 | return features 111 | # features = model 112 | 113 | else: 114 | raise KeyError("Unknown model name: {}".format(name)) 115 | 116 | 117 | 118 | # elif name.startswith('mobilenetv2'): 119 | # model = mobilenetv2.MobileNetV2(input_size=input_size, pretrained=None) 120 | # features = model.features 121 | # num_features = 1280 122 | # elif name.startswith('rmnet'): 123 | # model = rmnet.RMNetClassifier(1000, pretrained=None) 124 | # features = nn.Sequential(*list(model.children())[:-2]) 125 | # num_features = 512 126 | 127 | # elif name.startswith('se_res'): 128 | # model = load_from_pretrainedmodels(name)(pretrained='imagenet' if pretrained else None) 129 | # features = nn.Sequential(*list(model.children())[:-2]) 130 | 131 | 132 | 133 | 134 | def load_from_pretrainedmodels(model_name): 135 | import pretrainedmodels 136 | return getattr(pretrainedmodels, model_name) 137 | 138 | 139 | 140 | def squash_dims(tensor, dims): 141 | """ 142 | Squashes dimension, given in dims into one, which equals to product of given. 
143 | 144 | Args: 145 | tensor (Tensor): input tensor 146 | dims: dimensions over which tensor should be squashed 147 | 148 | """ 149 | assert len(dims) >= 2, "Expected two or more dims to be squashed" 150 | 151 | size = tensor.size() 152 | 153 | squashed_dim = size[dims[0]] 154 | for i in range(1, len(dims)): 155 | assert dims[i] == dims[i - 1] + 1, "Squashed dims should be consecutive" 156 | squashed_dim *= size[dims[i]] 157 | 158 | result_dims = size[:dims[0]] + (squashed_dim,) + size[dims[-1] + 1:] 159 | return tensor.contiguous().view(*result_dims) 160 | 161 | 162 | def unsquash_dim(tensor, dim, res_dim): 163 | """ 164 | Unsquashes dimension, given in dim into separate dimensions given is res_dim 165 | Args: 166 | tensor (Tensor): input tensor 167 | dim (int): dimension that should be unsquashed 168 | res_dim (tuple): list of dimensions, that given dim should be unfolded to 169 | 170 | """ 171 | size = tensor.size() 172 | result_dim = size[:dim] + res_dim + size[dim + 1:] 173 | return tensor.view(*result_dim) 174 | -------------------------------------------------------------------------------- /utils/vad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | import librosa 4 | import os 5 | import soundfile 6 | import random 7 | import pickle 8 | import torch 9 | import torchaudio 10 | import math 11 | 12 | wav_path = "../pickle/1_enhanced/" 13 | out_path = "../pickle/2_vad/" 14 | os.makedirs(out_path,exist_ok=True) 15 | 16 | wav_list = [f for f in os.listdir(wav_path)] 17 | wav_list = sorted(wav_list) 18 | frame_size = 512 19 | hop_size = 128 20 | RISING_TERM = 30 # 15*4 21 | LEAST_TERM = 80 #25*4 22 | power_list = [] 23 | save_wav_count=0 24 | eps = 1e-5 25 | 26 | def stft(_wav_path): 27 | window=torch.hann_window(window_length=512, periodic=True, dtype=None, layout=torch.strided, device=None, requires_grad=False) 28 | data_wav,_ = librosa.load(_wav_path,sr=16000,mono=False) 29 | #pdb.set_trace() 30 | data_wav = torch.from_numpy(data_wav) 31 | spec_noi1 = torchaudio.functional.spectrogram(waveform=data_wav, pad=0, window=window, n_fft=512, hop_length=128, win_length=512, power=None, normalized=False) 32 | input_wav_real1 =spec_noi1[:,:,:,0] 33 | input_wav_imag1 = spec_noi1[:,:,:,1] 34 | phase = torch.atan(input_wav_imag1/(input_wav_real1+1e-8)) 35 | input_wav_magnitude = torch.sqrt(input_wav_real1**2 + input_wav_imag1**2) 36 | return input_wav_magnitude, phase 37 | 38 | #def spec2audio_tensor(power_mag,phase,window,length,nfft): 39 | # window = window 40 | # length = length 41 | # mag = power_mag #[1 F T] 42 | # phase = phase #[1 F T] 43 | # sqrt_mag = torch.sqrt(mag) 44 | # cos_phase = torch.cos(phase) 45 | # sin_phase = torch.sin(phase) 46 | # real = sqrt_mag * cos_phase 47 | # imagine = sqrt_mag * sin_phase 48 | # real = real.unsqueeze(3) 49 | # imagine = imagine.unsqueeze(3) 50 | # complex_ri = torch.cat((real,imagine),3) 51 | # audio = torch.istft(input = complex_ri, n_fft=int(nfft), hop_length=int(0.25*nfft), win_length=int(nfft), window=window, center=True, normalized=False, onesided=True, length=length) 52 | # return audio 53 | 54 | #pdb.set_trace() 55 | for pkl in wav_list: 56 | time_list=[] 57 | left_or_right_list=[] 58 | 59 | with open(os.path.join(wav_path,pkl),'rb') as f: 60 | data = pickle.load(f) 61 | wav = data["output_path"] 62 | #wav = data["audio_path"] 63 | 64 | power_list=[] 65 | active_cnt = 0 66 | tmp_dummy_frame = 0 67 | dummy_frame = 0 68 | inactive_cnt = 0 69 | state= 0 70 | i=0 71 | 
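    # Annotation describing the loop below: it runs a simple energy-based VAD
    # as a five-state machine, evaluated once per 128-sample hop (8 ms at 16 kHz):
    #   0 idle -> 1 possible onset -> 2 active -> 3 possible offset -> 4 segment closed
    # A frame counts as active when the mean STFT magnitude over bins 20+ exceeds
    # one tenth of the file-wide mean; an onset must stay active for RISING_TERM
    # frames and an offset must stay quiet for LEAST_TERM frames. When a segment
    # closes, its [start, end] time in seconds is appended to time_list and the
    # louder of the two channels sets the left/right flag in left_or_right_list.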
num=0 72 | #pdb.set_trace() 73 | (total_audio,fs) = soundfile.read(wav) 74 | if len(total_audio.shape) == 1: 75 | pdb.set_trace() 76 | if fs != 16000: 77 | pdb.set_trace() 78 | tmp0 = librosa.resample(total_audio[:,0],fs,16000) 79 | tmp1 = librosa.resample(total_audio[:,1],fs,16000) 80 | total_audio = np.stack((tmp1,tmp2),axis=1) 81 | fs = 16000 82 | else: 83 | tmp0 = total_audio[:,0] 84 | tmp1 = total_audio[:,1] 85 | 86 | 87 | frame_idx_list = range(0,len(total_audio)-hop_size+1,hop_size) 88 | input_wav_mag,phase = stft(wav) 89 | 90 | mean_power = abs(input_wav_mag[:,20:,:]).mean() 91 | thre = mean_power / 10 92 | 93 | 94 | for frame_idx in frame_idx_list: 95 | num+=1 96 | if abs(input_wav_mag[:,20:,frame_idx//hop_size]).mean() > thre: 97 | if state == 0: 98 | active_cnt = 1 99 | tmp_dummy_frame = 1 100 | rising_idx = frame_idx 101 | state =1 102 | 103 | elif state == 1: 104 | active_cnt+=1 105 | tmp_dummy_frame+=1 106 | if active_cnt == RISING_TERM: 107 | state=2 108 | 109 | elif state == 2: 110 | active_cnt+=1 111 | 112 | elif state == 3: 113 | inactive_cnt=0 114 | active_cnt+=1 115 | state = 2 116 | 117 | elif state == 4: 118 | active_cnt =1 119 | tmp_dummy_frame = 1 120 | rising_idx = frame_idx 121 | state = 1 122 | 123 | else: 124 | 125 | if state == 0: 126 | dummy_frame+=1 127 | state = 0 128 | 129 | elif state == 1: 130 | active_cnt = 0 131 | dummy_frame+=tmp_dummy_frame 132 | tmp_dummy_frame = 0 133 | state=0 134 | 135 | elif state == 2: 136 | inactive_cnt =1 137 | active_cnt+=1 138 | state = 3 139 | 140 | elif state == 3: 141 | inactive_cnt+=1 142 | active_cnt+=1 143 | if inactive_cnt == LEAST_TERM: 144 | state = 4 145 | 146 | elif state == 4: 147 | dummy_frame = 1 148 | state = 0 149 | 150 | # save VAD chunk here in wav 151 | if state == 4 or (num == len(frame_idx_list) and active_cnt > RISING_TERM): 152 | falling_idx = frame_idx 153 | if rising_idx-hop_size < 0: 154 | rising_idx = 128 155 | rising_idx = (rising_idx-hop_size) 156 | if state == 4: 157 | falling_idx = (falling_idx-(LEAST_TERM-2)*hop_size) 158 | else: 159 | falling_idx = (falling_idx-(inactive_cnt-2)*hop_size) 160 | tmp0_power = np.sum(np.abs(tmp0[rising_idx:falling_idx])) 161 | tmp1_power = np.sum(np.abs(tmp1[rising_idx:falling_idx])) 162 | if tmp0_power > tmp1_power: 163 | left_or_right_list.append(0) 164 | else: 165 | left_or_right_list.append(1) 166 | 167 | rising_idx = rising_idx/fs 168 | falling_idx = falling_idx/fs 169 | time_list.append([rising_idx,falling_idx]) 170 | save_wav_count +=1 171 | #save chunk for another channel 172 | i+=1 173 | state = 4 174 | active_cnt = 0 175 | inactive_cnt = 0 176 | tmp_dummy_frame = 0 177 | dummy_frame = 0 178 | 179 | #pdb.set_trace() 180 | data["time"] = time_list 181 | data["LR"] = left_or_right_list 182 | wav_total_mean_power = np.mean(np.abs((tmp0+tmp1)/2)) 183 | if time_list == [] or wav_total_mean_power < eps: 184 | pdb.set_trace() 185 | wav = data["input_path"] 186 | (total_audio,fs) = soundfile.read(wav) 187 | 188 | if fs != 16000: 189 | tmp0 = librosa.resample(total_audio[:,0],fs,16000) 190 | tmp1 = librosa.resample(total_audio[:,1],fs,16000) 191 | total_audio = np.stack((tmp1,tmp2),axis=1) 192 | fs = 16000 193 | else: 194 | tmp0 = total_audio[:,0] 195 | tmp1 = total_audio[:,1] 196 | 197 | tmp0_power = np.sum(np.abs(tmp0)) 198 | tmp1_power = np.sum(np.abs(tmp1)) 199 | if tmp0_power > tmp1_power: 200 | left_or_right_list.append(0) 201 | else: 202 | left_or_right_list.append(1) 203 | 204 | data["LR"] = left_or_right_list 205 | 206 | with 
open(os.path.join(out_path,pkl),"wb") as fw: 207 | pickle.dump(data,fw) 208 | print("pickle dumped!!: {}".format(pkl)) -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | sys.path.append('..') 4 | from torch.utils.data.dataset import Dataset 5 | from pathlib import Path 6 | import pickle 7 | import pdb 8 | import torch 9 | import numpy as np 10 | import argparse 11 | import os 12 | import sys 13 | # import h5py 14 | import librosa 15 | import numpy as np 16 | # import pandas as pd 17 | import scipy.io as sio 18 | from scipy import signal 19 | from tqdm import tqdm 20 | import warnings 21 | warnings.filterwarnings("ignore") 22 | 23 | class Audio_Reader(Dataset): 24 | def __init__(self, datalist): 25 | super(Audio_Reader, self).__init__() 26 | self.datalist = datalist 27 | self.classlist = ['0','20','40','60','80','100','120','140','160','180'] 28 | self.nfft = 512 29 | self.hopsize = self.nfft // 4 30 | self.window = 'hann' 31 | 32 | def __len__(self): 33 | return len(self.datalist) 34 | 35 | def LogMelGccExtractor(self, sig): 36 | def logmel(sig): 37 | 38 | #pdb.set_trace() 39 | S = np.abs(librosa.stft(y=sig, 40 | n_fft=self.nfft, 41 | hop_length=self.hopsize, 42 | center=True, 43 | window=self.window, 44 | pad_mode='reflect'))**2 45 | 46 | # S_mel = np.dot(self.melW, S).T 47 | S = librosa.power_to_db(S**2, ref=1.0, amin=1e-10, top_db=None) 48 | S = np.expand_dims(S, axis=0) 49 | 50 | return S 51 | 52 | def gcc_phat(sig, refsig): 53 | 54 | #pdb.set_trace() 55 | Px = librosa.stft(y=sig, 56 | n_fft=self.nfft, 57 | hop_length=self.hopsize, 58 | center=True, 59 | window=self.window, 60 | pad_mode='reflect') 61 | 62 | Px_ref = librosa.stft(y=refsig, 63 | n_fft=self.nfft, 64 | hop_length=self.hopsize, 65 | center=True, 66 | window=self.window, 67 | pad_mode='reflect') 68 | 69 | R = Px*np.conj(Px_ref) 70 | return R 71 | 72 | def transform(audio): 73 | 74 | channel_num = audio.shape[0] 75 | feature_logmel = [] 76 | feature_gcc_phat = [] 77 | for n in range(channel_num): 78 | feature_logmel.append(logmel(audio[n])) 79 | for m in range(n+1, channel_num): 80 | feature_gcc_phat.append( 81 | gcc_phat(sig=audio[m], refsig=audio[n])) 82 | 83 | #pdb.set_trace() 84 | feature_logmel = np.concatenate(feature_logmel, axis=0) 85 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0) 86 | feature = np.concatenate([feature_logmel, np.expand_dims(feature_gcc_phat, axis=0)]) 87 | 88 | return feature 89 | 90 | return transform(sig) 91 | 92 | def __getitem__(self, idx): 93 | 94 | audio_path = self.datalist[idx] 95 | class_name = audio_path.split('/')[-2].strip('degree') 96 | class_num = self.classlist.index(class_name) 97 | audio, _ = librosa.load(audio_path, sr=16000, mono=False, dtype=np.float32) 98 | if audio.shape[1] >80000: 99 | audio = audio[:,:80000] 100 | 101 | feature = self.LogMelGccExtractor(audio) 102 | #pdb.set_trace() 103 | return torch.FloatTensor(feature).transpose(1,2), np.array([class_num]) 104 | 105 | 106 | def Audio_Collate(batch): 107 | 108 | #pdb.set_trace() 109 | data, class_num = list(zip(*batch)) 110 | data_len = torch.LongTensor(np.array([x.size(1) for x in data if x.size(1)!=1])) 111 | #if len(data_len) == 0: 112 | # return -1 113 | 114 | max_len = max(data_len) 115 | wrong_indices = [] 116 | 117 | #for i, a_ in enumerate(class_num): 118 | # if a_[0] == -1: 119 | # wrong_indices.append(i) 120 | 121 | B = len(data) 122 | 
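    # Annotation for the collate step below: every clip in the batch is
    # zero-padded to the longest time length and the angle labels are one-hot
    # encoded. inputs is (B, 3, max_len, 257) because each feature stacks two
    # log-spectrogram channels and one GCC channel computed with nfft=512
    # (257 frequency bins), and labels is (B, 10) for the ten angle classes
    # from 0 to 180 degrees in 20-degree steps.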
#pdb.set_trace() 123 | #inputs = torch.zeros(B-len(wrong_indices), 1, max_len, 10) 124 | #labels = torch.zeros(B-len(wrong_indices), 2) 125 | inputs = torch.zeros(B, 3, max_len, 257) 126 | labels = torch.zeros(B, 10) 127 | j = 0 128 | #pdb.set_trace() 129 | '''zero pad''' 130 | for i in range(B): 131 | #if i in wrong_indices: 132 | # continue 133 | 134 | inputs[j, : , :data[i].size(1),:] = data[i] 135 | labels[j, class_num[i]] = 1.0 136 | j += 1 137 | 138 | #pdb.set_trace() 139 | #data = (inputs, labels, data_len) 140 | data = (inputs, labels) 141 | return data 142 | 143 | 144 | class Test_Reader(Dataset): 145 | def __init__(self, datalist): 146 | super(Audio_Reader, self).__init__() 147 | self.datalist = datalist 148 | self.classlist = ['0','20','40','60','80','100','120','140','160','180'] 149 | self.nfft = 512 150 | self.hopsize = self.nfft // 4 151 | self.window = 'hann' 152 | 153 | def __len__(self): 154 | return len(self.datalist) 155 | 156 | def LogMelGccExtractor(self, sig): 157 | def logmel(sig): 158 | 159 | #pdb.set_trace() 160 | S = np.abs(librosa.stft(y=sig, 161 | n_fft=self.nfft, 162 | hop_length=self.hopsize, 163 | center=True, 164 | window=self.window, 165 | pad_mode='reflect'))**2 166 | 167 | # S_mel = np.dot(self.melW, S).T 168 | S = librosa.power_to_db(S**2, ref=1.0, amin=1e-10, top_db=None) 169 | S = np.expand_dims(S, axis=0) 170 | 171 | return S 172 | 173 | def gcc_phat(sig, refsig): 174 | 175 | #pdb.set_trace() 176 | Px = librosa.stft(y=sig, 177 | n_fft=self.nfft, 178 | hop_length=self.hopsize, 179 | center=True, 180 | window=self.window, 181 | pad_mode='reflect') 182 | 183 | Px_ref = librosa.stft(y=refsig, 184 | n_fft=self.nfft, 185 | hop_length=self.hopsize, 186 | center=True, 187 | window=self.window, 188 | pad_mode='reflect') 189 | 190 | R = Px*np.conj(Px_ref) 191 | return R 192 | 193 | def transform(audio): 194 | 195 | channel_num = audio.shape[0] 196 | feature_logmel = [] 197 | feature_gcc_phat = [] 198 | for n in range(channel_num): 199 | feature_logmel.append(logmel(audio[n])) 200 | for m in range(n+1, channel_num): 201 | feature_gcc_phat.append( 202 | gcc_phat(sig=audio[m], refsig=audio[n])) 203 | 204 | #pdb.set_trace() 205 | feature_logmel = np.concatenate(feature_logmel, axis=0) 206 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0) 207 | feature = np.concatenate([feature_logmel, np.expand_dims(feature_gcc_phat, axis=0)]) 208 | 209 | return feature 210 | 211 | return transform(sig) 212 | 213 | def __getitem__(self, idx): 214 | 215 | audio_path = self.datalist[idx] 216 | class_name = audio_path.split('/')[-2].strip('degree') 217 | class_num = self.classlist.index(class_name) 218 | audio, _ = librosa.load(audio_path, sr=16000, mono=False, dtype=np.float32) 219 | if audio.shape[1] >80000: 220 | audio = audio[:,:80000] 221 | 222 | feature = self.LogMelGccExtractor(audio) 223 | #pdb.set_trace() 224 | return torch.FloatTensor(feature).transpose(1,2), np.array([class_num]) 225 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import datetime 6 | 7 | import pickle as pkl 8 | 9 | from pathlib import Path 10 | import torch 11 | import pdb 12 | from tqdm import tqdm 13 | from datetime import datetime 14 | 15 | import torch 16 | from torch.utils.tensorboard import SummaryWriter 17 | from torch.optim.lr_scheduler import ReduceLROnPlateau 18 | from 
torch.distributions.multivariate_normal import MultivariateNormal 19 | import torch.nn.functional as F 20 | 21 | import logging 22 | import json 23 | from multiprocessing import Pool 24 | import time 25 | import warnings 26 | warnings.filterwarnings("ignore") 27 | 28 | class ModelTrainer: 29 | 30 | def __init__(self, model, train_loader, valid_loader, criterion, optimizer, scheduler, config, epochs, device, save_path, ckpt_path=None, comment=None, fold=2): 31 | 32 | self.device = torch.device('cuda:{}'.format(device)) 33 | #self.model = model.to(self.device) 34 | self.model = model.cuda() 35 | 36 | self.train_loader = train_loader 37 | self.valid_loader = valid_loader 38 | self.criterion = criterion 39 | self.optimizer = optimizer 40 | self.scheduler = scheduler 41 | 42 | self.exp_path = Path(os.path.join(save_path, datetime.now().strftime('%d%B_%0l%0M'))) #21November_0430 43 | self.exp_path.mkdir(exist_ok=True, parents=True) 44 | 45 | # Set logger 46 | self.logger = logging.getLogger('') 47 | self.logger.setLevel(logging.INFO) 48 | fh = logging.FileHandler(os.path.join(self.exp_path, 'training.log')) 49 | sh = logging.StreamHandler(sys.stdout) 50 | self.logger.addHandler(fh) 51 | self.logger.addHandler(sh) 52 | 53 | #Dump hyper-parameters 54 | with open(str(self.exp_path.joinpath('config.json')), 'w') as f: 55 | json.dump(config, f, indent=2) 56 | 57 | if comment != None: 58 | self.logger.info(comment) 59 | 60 | self.writter = SummaryWriter(self.exp_path.joinpath('logs')) 61 | self.epochs = epochs 62 | self.best_acc = 0.0 63 | self.best_epoch = 0 64 | 65 | if ckpt_path != None: 66 | self.load_checkpoint(ckpt_path) 67 | self.optimizer.param_groups[0]['lr'] = 0.0001 68 | 69 | def train(self): 70 | for epoch in tqdm(range(self.epochs)): 71 | start = time.time() 72 | train_loss, t_accuracy= self.train_single_epoch(epoch) 73 | valid_loss, v_accuracy = self.inference() 74 | duration = time.time() - start 75 | 76 | if v_accuracy > self.best_acc: 77 | self.best_acc = v_accuracy 78 | self.best_epoch = epoch 79 | 80 | self.scheduler.step(v_accuracy) 81 | self.logger.info("epoch: {} --- t_loss : {:0.3f}, train_acc = {}%, v_loss: {:0.3f}, val_acc: {}%, best_acc: {}%, best_epoch: {}, time: {:0.2f}s, lr: {}"\ 82 | .format(epoch, train_loss, t_accuracy, valid_loss, v_accuracy, self.best_acc, self.best_epoch, duration,self.optimizer.param_groups[0]['lr'])) 83 | 84 | self.save_checkpoint(epoch, v_accuracy) 85 | 86 | self.writter.add_scalar('data/Train_Loss', train_loss, epoch) 87 | self.writter.add_scalar('data/Valid_Loss', valid_loss, epoch) 88 | self.writter.add_scalar('data/Train_Accuracy', t_accuracy, epoch) 89 | self.writter.add_scalar('data/Valid_Accuracy', v_accuracy, epoch) 90 | 91 | self.writter.close() 92 | 93 | 94 | def train_single_epoch(self, epoch): 95 | self.model.train() 96 | 97 | total_loss = 0.0 98 | accuracy = 0.0 99 | correct_cnt = 0 100 | tot_cnt = 0 101 | batch_size = len(self.train_loader) 102 | 103 | for b, batch in (enumerate(self.train_loader)): 104 | 105 | inputs, labels = batch 106 | B, C, T, Freq = inputs.size() 107 | inputs = inputs.cuda() 108 | labels = labels.cuda() 109 | 110 | self.optimizer.zero_grad() 111 | outputs = self.model(inputs) 112 | scores = outputs.mean(1) 113 | best_prediction = scores.max(-1)[1] 114 | 115 | for i in range(B): 116 | if labels[i, best_prediction[i]] == 1.0: 117 | correct_cnt += 1 118 | 119 | batch_loss = self.criterion(scores, labels) 120 | batch_loss.backward() 121 | total_loss += batch_loss.item() 122 | self.optimizer.step() 123 | tot_cnt += 
B 124 | 125 | print("{}/{}: {}/{}".format(b, batch_size, correct_cnt, tot_cnt), end='\r') 126 | 127 | mean_loss = total_loss / tot_cnt 128 | return mean_loss, (correct_cnt/tot_cnt)*100 129 | 130 | 131 | def inference(self): 132 | self.model.eval() 133 | 134 | total_loss = 0.0 135 | accuracy = 0.0 136 | correct_cnt = 0 137 | tot_cnt = 0 138 | batch_size = len(self.valid_loader) 139 | with torch.no_grad(): 140 | for b, batch in enumerate(self.valid_loader): 141 | 142 | inputs, labels = batch 143 | B, C, T, Freq = inputs.size() 144 | inputs = inputs.cuda() 145 | labels = labels.cuda() 146 | outputs = self.model(inputs) 147 | 148 | scores = outputs.mean(1) 149 | best_prediction = scores.max(-1)[1] 150 | 151 | for i in range(B): 152 | if labels[i, best_prediction[i]] == 1.0: 153 | correct_cnt += 1 154 | 155 | batch_loss = self.criterion(scores, labels) 156 | total_loss += batch_loss.item() 157 | tot_cnt += B 158 | 159 | print("{}/{}: {}/{}".format(b, batch_size, correct_cnt, tot_cnt), end='\r') 160 | 161 | mean_loss = total_loss / tot_cnt 162 | return mean_loss, (correct_cnt/tot_cnt)*100 163 | 164 | 165 | def load_checkpoint(self, ckpt): 166 | self.logger.info("Loading checkpoint from {ckpt}") 167 | print('Loading checkpoint : {}'.format(ckpt)) 168 | checkpoint = torch.load(ckpt) 169 | self.model.load_state_dict(checkpoint['model_state_dict'], strict=False) 170 | self.optimizer.load_state_dict(checkpoint['optimizer'])#, strict=False) 171 | 172 | 173 | def save_checkpoint(self, epoch, vacc, best=True): 174 | 175 | state_dict = { 176 | 'epoch': epoch, 177 | 'model_state_dict': self.model.state_dict(), 178 | 'optimizer': self.optimizer.state_dict() 179 | } 180 | 181 | self.exp_path.joinpath('ckpt').mkdir(exist_ok=True, parents=True) 182 | save_path = "{}/ckpt/{}_{:0.4f}.pt".format(self.exp_path, epoch, vacc) 183 | torch.save(state_dict, save_path) 184 | 185 | 186 | class ModelTester: 187 | def __init__(self, model, test_loader, ckpt_path, device): 188 | 189 | # Essential parts 190 | self.device = torch.device('cuda:{}'.format(device)) 191 | #self.model = model.to(self.device) 192 | self.model = model.cuda() 193 | self.test_loader = test_loader 194 | # Set logger 195 | self.logger = logging.getLogger('') 196 | self.logger.setLevel(logging.INFO) 197 | sh = logging.StreamHandler(sys.stdout) 198 | self.logger.addHandler(sh) 199 | 200 | self.load_checkpoint(ckpt_path) 201 | 202 | 203 | def load_checkpoint(self, ckpt): 204 | self.logger.info(f"Loading checkpoint from {ckpt}") 205 | # print('Loading checkpoint : {}'.format(ckpt)) 206 | checkpoint = torch.load(ckpt) 207 | self.model.load_state_dict(checkpoint['model_state_dict'], strict=False) 208 | 209 | 210 | 211 | def test(self): 212 | """ 213 | images : [B x T x C x H x W] 214 | labels : [B x T] 215 | """ 216 | self.model.eval() 217 | result = ['FA','MA'] 218 | batch_size = len(self.test_loader) 219 | final = open('/home/ygchoi/gender_detection/result.csv', 'w') 220 | final.write('filename'+'\t'+'prediction'+'\n') 221 | 222 | with torch.no_grad(): 223 | for b, batch in tqdm(enumerate(self.test_loader), total=len(self.test_loader)): 224 | 225 | inputs, audio_path = batch 226 | inputs = torch.unsqueeze(inputs,1) 227 | B, C, T, Freq = inputs.size() 228 | inputs = inputs.cuda() 229 | outputs = self.model(inputs) 230 | best_prediction = outputs.max(2)[1].mode()[0] 231 | final.write(audio_path[0]+'\t'+result[best_prediction.item()]+'\n') 232 | 233 | final.close() 234 | -------------------------------------------------------------------------------- 
/utils/utilities.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import itertools 4 | import logging 5 | import os 6 | import sys 7 | 8 | import librosa 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import torch 13 | from librosa.display import specshow 14 | from torch.backends import cudnn 15 | from tqdm import tqdm 16 | 17 | event_labels = ['knock', 'drawer', 'clearthroat', 'phone', 'keysDrop',\ 18 | 'speech', 'keyboard', 'pageturn', 'cough', 'doorslam', 'laughter'] 19 | lb_to_ix = {lb: i for i, lb in enumerate(event_labels)} 20 | ix_to_lb = {i: lb for i, lb in enumerate(event_labels)} 21 | 22 | azimuths = range(-180, 171, 10) 23 | elevations = range(-40, 41, 10) 24 | doa = [azimuths, elevations] 25 | doa_labels = list(itertools.product(*doa)) 26 | doa_to_ix = {doa: i for i, doa in enumerate(doa_labels)} 27 | ix_to_doa = {i: doa for i, doa in enumerate(doa_labels)} 28 | 29 | train_splits_dict = {1: [2,3,4], 2: [1,3,4], 3: [1,2,4], 4: [1,2,3], -1: [1,2,3,4]} 30 | valid_split_dict = {1: [1], 2: [2], 3: [3], 4: [4], -1: []} 31 | test_split_dict = {1: [1], 2: [2], 3: [3], 4: [4], -1: []} 32 | 33 | 34 | def get_doas(indexes): 35 | ''' 36 | Get multiple doas from indexes 37 | ''' 38 | doas = [] 39 | for idx in indexes: 40 | doas.append(ix_to_doa[idx]) 41 | return doas 42 | 43 | 44 | def calculate_scalar(features): 45 | 46 | mean = [] 47 | std = [] 48 | 49 | channels = features.shape[0] 50 | for channel in range(channels): 51 | feat = features[channel, :, :] 52 | mean.append(np.mean(feat, axis=0)) 53 | std.append(np.std(feat, axis=0)) 54 | 55 | mean = np.array(mean) 56 | std = np.array(std) 57 | mean = np.expand_dims(mean, axis=0) 58 | std = np.expand_dims(std, axis=0) 59 | mean = np.expand_dims(mean, axis=2) 60 | std = np.expand_dims(std, axis=2) 61 | 62 | return mean, std 63 | 64 | 65 | def one_hot_encode(target, length): 66 | """Convert batches of class indices to classes of one-hot vectors.""" 67 | target = np.array(target) 68 | if len(target.shape) == 0: 69 | one_hot_vec = np.zeros((1, length)) 70 | one_hot_vec[0, target] = 1.0 71 | else: 72 | batch_s = target.shape[0] 73 | one_hot_vec = np.zeros((batch_s, length)) 74 | for i in range(batch_s): 75 | one_hot_vec[i, target[i].astype(int)] = 1.0 76 | 77 | return one_hot_vec 78 | 79 | 80 | def get_filename(path): 81 | path = os.path.realpath(path) 82 | na_ext = path.split('/')[-1] 83 | na = os.path.splitext(na_ext)[0] 84 | return na 85 | 86 | 87 | class TqdmLoggingHandler(logging.Handler): 88 | def __init__(self, level=logging.NOTSET): 89 | super().__init__(level) 90 | 91 | def emit(self, record): 92 | try: 93 | msg = self.format(record) 94 | tqdm.write(msg) 95 | self.flush() 96 | except: 97 | self.handleError(record) 98 | 99 | 100 | def create_logging(log_dir, filemode): 101 | 102 | os.makedirs(log_dir, exist_ok=True) 103 | 104 | i1 = 0 105 | 106 | while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))): 107 | i1 += 1 108 | 109 | log_path = os.path.join(log_dir, '{:04d}.log'.format(i1)) 110 | logging.basicConfig( 111 | level=logging.DEBUG, 112 | format='%(filename)s[line:%(lineno)d] %(levelname)s %(message)s', 113 | # format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', 114 | datefmt='%a, %d %b %Y %H:%M:%S', 115 | filename=log_path, 116 | filemode=filemode) 117 | 118 | # Print to console 119 | console = logging.StreamHandler() 120 | console.setLevel(logging.INFO) 121 | formatter = 
logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') 122 | console.setFormatter(formatter) 123 | # logging.getLogger('').addHandler(console) 124 | logging.getLogger('').addHandler(TqdmLoggingHandler()) 125 | 126 | logging.info(datetime.datetime.now()) 127 | logging.info('\n') 128 | 129 | return logging 130 | 131 | 132 | def to_torch(x, cuda): 133 | 134 | if 'float' in str(x.dtype): 135 | x = torch.Tensor(x) 136 | elif 'int' in str(x.dtype): 137 | x = torch.LongTensor(x) 138 | else: 139 | raise Exception("Error!") 140 | 141 | if cuda: 142 | x = x.cuda() 143 | 144 | return x 145 | 146 | 147 | def to_np(x): 148 | """ 149 | Convert values of the model parameters to numpy.array. 150 | """ 151 | return x.cpu().data.numpy() 152 | 153 | 154 | def move_model_to_gpu(model): 155 | ''' 156 | Move model to GPU 157 | ''' 158 | logging.info('\nUtilize GPUs for computation') 159 | logging.info('\nNumber of GPU available: {}'.format(torch.cuda.device_count())) 160 | if torch.cuda.device_count() > 1: 161 | Multi_GPU = True 162 | else: 163 | Multi_GPU = False 164 | model.cuda() 165 | cudnn.benchmark = False # for cuda 10.0 166 | model = torch.nn.DataParallel(model) 167 | 168 | return model, Multi_GPU 169 | 170 | def logging_and_writer(data_type, metrics, logging, writer=[], batch_idx=0): 171 | ''' 172 | Logging to tqdm, and write to tensorboard 173 | 174 | Input: 175 | data_type: 'train' | 'valid' | 'test' 176 | metrics: output from evaluate function, including loss and other metrics 177 | logging: logging 178 | writer: tensorboard writer 179 | batch_idx: batch iteration index, only for 'train' and 'valid' 180 | ''' 181 | 182 | if data_type == 'train': 183 | 184 | [tr_loss, tr_sed_mAP, tr_sed_scores, tr_doa_er_metric, 185 | tr_seld_metric] = metrics 186 | 187 | logging.info('Train SELD loss: {:.3f}, Train SED loss: {:.3f}, Train DOA loss: {:.3f}, ' 188 | 'Train SED mAP(micro): {:.3f}, Train SED mAP(macro): {:.3f}'.format( 189 | tr_loss[0], tr_loss[1], tr_loss[2], tr_sed_mAP[0], tr_sed_mAP[1])) 190 | writer.add_scalar('train/SELD_loss', tr_loss[0], batch_idx) 191 | writer.add_scalar('train/SED_loss', tr_loss[1], batch_idx) 192 | writer.add_scalar('train/DOA_loss', tr_loss[2], batch_idx) 193 | writer.add_scalar('train/SED_mAP_micro', tr_sed_mAP[0], batch_idx) 194 | writer.add_scalar('train/SED_mAP_macro', tr_sed_mAP[1], batch_idx) 195 | 196 | logging.info('Train ER: {:.3f}, Train F-score: {:.3f}, Train DOA error: {:.3f}, Train DOA frame recall: {:.3f}, Train SELD error: {:.3f}'.format( 197 | tr_sed_scores[0], tr_sed_scores[1], tr_doa_er_metric[0], tr_doa_er_metric[1], tr_seld_metric)) 198 | writer.add_scalar('train/ER', tr_sed_scores[0], batch_idx) 199 | writer.add_scalar('train/F_score', tr_sed_scores[1], batch_idx) 200 | writer.add_scalar('train/DOA_error', tr_doa_er_metric[0], batch_idx) 201 | writer.add_scalar('train/DOA_frame_recall', tr_doa_er_metric[1], batch_idx) 202 | writer.add_scalar('train/SELD_error', tr_seld_metric, batch_idx) 203 | 204 | elif data_type == 'valid': 205 | 206 | [train_metrics, valid_metrics] = metrics 207 | 208 | [tr_loss, tr_sed_mAP, tr_sed_scores, tr_doa_er_metric, 209 | tr_seld_metric] = train_metrics 210 | 211 | [va_loss, va_sed_mAP, va_sed_scores, va_doa_er_metric, 212 | va_seld_metric] = valid_metrics 213 | 214 | logging.info('Train SELD loss: {:.3f}, Train SED loss: {:.3f}, Train DOA loss: {:.3f}, ' 215 | 'Train SED mAP(micro): {:.3f}, Train SED mAP(macro): {:.3f}'.format( 216 | tr_loss[0], tr_loss[1], tr_loss[2], tr_sed_mAP[0], tr_sed_mAP[1])) 217 | 
writer.add_scalar('train/SELD_loss', tr_loss[0], batch_idx) 218 | writer.add_scalar('train/SED_loss', tr_loss[1], batch_idx) 219 | writer.add_scalar('train/DOA_loss', tr_loss[2], batch_idx) 220 | writer.add_scalar('train/SED_mAP_micro', tr_sed_mAP[0], batch_idx) 221 | writer.add_scalar('train/SED_mAP_macro', tr_sed_mAP[1], batch_idx) 222 | 223 | logging.info('Valid SELD loss: {:.3f}, Valid SED loss: {:.3f}, Valid DOA loss: {:.3f}, ' 224 | 'Valid SED mAP(micro): {:.3f}, Valid SED mAP(macro): {:.3f}'.format( 225 | va_loss[0], va_loss[1], va_loss[2], va_sed_mAP[0], va_sed_mAP[1])) 226 | writer.add_scalar('valid/SELD_loss', va_loss[0], batch_idx) 227 | writer.add_scalar('valid/SED_loss', va_loss[1], batch_idx) 228 | writer.add_scalar('valid/DOA_loss', va_loss[2], batch_idx) 229 | writer.add_scalar('valid/SED_mAP_micro', va_sed_mAP[0], batch_idx) 230 | writer.add_scalar('valid/SED_mAP_macro', va_sed_mAP[1], batch_idx) 231 | 232 | logging.info('Train ER: {:.3f}, Train F-score: {:.3f}, Train DOA error: {:.3f}, Train DOA frame recall: {:.3f}, Train SELD error: {:.3f}'.format( 233 | tr_sed_scores[0], tr_sed_scores[1], tr_doa_er_metric[0], tr_doa_er_metric[1], tr_seld_metric)) 234 | writer.add_scalar('train/ER', tr_sed_scores[0], batch_idx) 235 | writer.add_scalar('train/F_score', tr_sed_scores[1], batch_idx) 236 | writer.add_scalar('train/DOA_error', tr_doa_er_metric[0], batch_idx) 237 | writer.add_scalar('train/DOA_frame_recall', tr_doa_er_metric[1], batch_idx) 238 | writer.add_scalar('train/SELD_error', tr_seld_metric, batch_idx) 239 | 240 | logging.info('Valid ER: {:.3f}, Valid F-score: {:.3f}, Valid DOA error: {:.3f}, Valid DOA frame recall: {:.3f}, Valid SELD error: {:.3f}'.format( 241 | va_sed_scores[0], va_sed_scores[1], va_doa_er_metric[0], va_doa_er_metric[1], va_seld_metric)) 242 | writer.add_scalar('valid/ER', va_sed_scores[0], batch_idx) 243 | writer.add_scalar('valid/F_score', va_sed_scores[1], batch_idx) 244 | writer.add_scalar('valid/DOA_error', va_doa_er_metric[0], batch_idx) 245 | writer.add_scalar('valid/DOA_frame_recall', va_doa_er_metric[1], batch_idx) 246 | writer.add_scalar('valid/SELD_error', va_seld_metric, batch_idx) 247 | 248 | elif data_type == 'test': 249 | 250 | [te_loss, te_sed_mAP, te_sed_scores, te_doa_er_metric, 251 | te_seld_metric] = metrics 252 | 253 | logging.info('Test SELD loss: {:.3f}, Test SED loss: {:.3f}, Test DOA loss: {:.3f}, ' 254 | 'Test SED mAP(micro): {:.3f}, Test SED mAP(macro): {:.3f}'.format( 255 | te_loss[0], te_loss[1], te_loss[2], te_sed_mAP[0], te_sed_mAP[1])) 256 | 257 | logging.info('Test ER: {:.3f}, Test F-score: {:.3f}, Test DOA error: {:.3f}, Test DOA frame recall: {:.3f}, Test SELD error: {:.3f}'.format( 258 | te_sed_scores[0], te_sed_scores[1], te_doa_er_metric[0], te_doa_er_metric[1], te_seld_metric)) 259 | 260 | 261 | def print_evaluation(metrics): 262 | 263 | [te_loss, te_sed_mAP, te_sed_scores, te_doa_er_metric, 264 | te_seld_metric] = metrics 265 | 266 | print('Test SELD loss: {:.3f}, Test SED loss: {:.3f}, Test DOA loss: {:.3f}, ' 267 | 'Test SED mAP(micro): {:.3f}, Test SED mAP(macro): {:.3f}'.format( 268 | te_loss[0], te_loss[1], te_loss[2], te_sed_mAP[0], te_sed_mAP[1])) 269 | 270 | print('Test ER: {:.3f}, Test F-score: {:.3f}, Test DOA error: {:.3f}, Test DOA frame recall: {:.3f}, Test SELD error: {:.3f}'.format( 271 | te_sed_scores[0], te_sed_scores[1], te_doa_er_metric[0], te_doa_er_metric[1], te_seld_metric)) 272 | 273 | 274 | def str2bool(v): 275 | if isinstance(v, bool): 276 | return v 277 | if v.lower() in ('yes', 
'true', 't', 'y', '1'): 278 | return True 279 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 280 | return False 281 | else: 282 | raise argparse.ArgumentTypeError('Boolean value expected.') 283 | -------------------------------------------------------------------------------- /utils/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.utils.model_zoo import load_url as load_state_dict_from_url 4 | import torch.utils.model_zoo as model_zoo 5 | 6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 7 | 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 8 | 'wide_resnet50_2', 'wide_resnet101_2'] 9 | 10 | 11 | model_urls = { 12 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 13 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 14 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 15 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 16 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 17 | 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', 18 | 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', 19 | 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', 20 | 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', 21 | } 22 | 23 | 24 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 25 | """3x3 convolution with padding""" 26 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 27 | padding=dilation, groups=groups, bias=False, dilation=dilation) 28 | 29 | 30 | def conv1x1(in_planes, out_planes, stride=1): 31 | """1x1 convolution""" 32 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 33 | 34 | 35 | class BasicBlock(nn.Module): 36 | expansion = 1 37 | 38 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 39 | base_width=64, dilation=1, norm_layer=None): 40 | super(BasicBlock, self).__init__() 41 | if norm_layer is None: 42 | norm_layer = nn.BatchNorm2d 43 | if groups != 1 or base_width != 64: 44 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 45 | if dilation > 1: 46 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 47 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 48 | self.conv1 = conv3x3(inplanes, planes, stride) 49 | self.bn1 = norm_layer(planes) 50 | self.relu = nn.ReLU(inplace=True) 51 | self.conv2 = conv3x3(planes, planes) 52 | self.bn2 = norm_layer(planes) 53 | self.downsample = downsample 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | identity = x 58 | 59 | out = self.conv1(x) 60 | out = self.bn1(out) 61 | out = self.relu(out) 62 | 63 | out = self.conv2(out) 64 | out = self.bn2(out) 65 | 66 | if self.downsample is not None: 67 | identity = self.downsample(x) 68 | 69 | out += identity 70 | out = self.relu(out) 71 | 72 | return out 73 | 74 | 75 | class Bottleneck(nn.Module): 76 | # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) 77 | # while original implementation places the stride at the first 1x1 convolution(self.conv1) 78 | # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. 
79 | # This variant is also known as ResNet V1.5 and improves accuracy according to 80 | # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 81 | 82 | expansion = 4 83 | 84 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 85 | base_width=64, dilation=1, norm_layer=None): 86 | super(Bottleneck, self).__init__() 87 | if norm_layer is None: 88 | norm_layer = nn.BatchNorm2d 89 | width = int(planes * (base_width / 64.)) * groups 90 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1 91 | self.conv1 = conv1x1(inplanes, width) 92 | self.bn1 = norm_layer(width) 93 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 94 | self.bn2 = norm_layer(width) 95 | self.conv3 = conv1x1(width, planes * self.expansion) 96 | self.bn3 = norm_layer(planes * self.expansion) 97 | self.relu = nn.ReLU(inplace=True) 98 | self.downsample = downsample 99 | self.stride = stride 100 | 101 | def forward(self, x): 102 | identity = x 103 | 104 | out = self.conv1(x) 105 | out = self.bn1(out) 106 | out = self.relu(out) 107 | 108 | out = self.conv2(out) 109 | out = self.bn2(out) 110 | out = self.relu(out) 111 | 112 | out = self.conv3(out) 113 | out = self.bn3(out) 114 | 115 | if self.downsample is not None: 116 | identity = self.downsample(x) 117 | 118 | out += identity 119 | out = self.relu(out) 120 | 121 | return out 122 | 123 | 124 | class ResNet(nn.Module): 125 | 126 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, 127 | groups=1, width_per_group=64, replace_stride_with_dilation=None, 128 | norm_layer=None): 129 | super(ResNet, self).__init__() 130 | if norm_layer is None: 131 | norm_layer = nn.BatchNorm2d 132 | self._norm_layer = norm_layer 133 | 134 | self.inplanes = 64 135 | self.dilation = 1 136 | if replace_stride_with_dilation is None: 137 | # each element in the tuple indicates if we should replace 138 | # the 2x2 stride with a dilated convolution instead 139 | replace_stride_with_dilation = [False, False, False] 140 | if len(replace_stride_with_dilation) != 3: 141 | raise ValueError("replace_stride_with_dilation should be None " 142 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 143 | self.groups = groups 144 | self.base_width = width_per_group 145 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, 146 | bias=False) 147 | self.bn1 = norm_layer(self.inplanes) 148 | self.relu = nn.ReLU(inplace=True) 149 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 150 | self.layer1 = self._make_layer(block, 64, layers[0]) 151 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 152 | dilate=replace_stride_with_dilation[0]) 153 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 154 | dilate=replace_stride_with_dilation[1]) 155 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 156 | dilate=replace_stride_with_dilation[2]) 157 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 158 | self.fc = nn.Linear(512 * block.expansion, num_classes) 159 | 160 | for m in self.modules(): 161 | if isinstance(m, nn.Conv2d): 162 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 163 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 164 | nn.init.constant_(m.weight, 1) 165 | nn.init.constant_(m.bias, 0) 166 | 167 | # Zero-initialize the last BN in each residual branch, 168 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
169 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 170 | if zero_init_residual: 171 | for m in self.modules(): 172 | if isinstance(m, Bottleneck): 173 | nn.init.constant_(m.bn3.weight, 0) 174 | elif isinstance(m, BasicBlock): 175 | nn.init.constant_(m.bn2.weight, 0) 176 | 177 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 178 | norm_layer = self._norm_layer 179 | downsample = None 180 | previous_dilation = self.dilation 181 | if dilate: 182 | self.dilation *= stride 183 | stride = 1 184 | if stride != 1 or self.inplanes != planes * block.expansion: 185 | downsample = nn.Sequential( 186 | conv1x1(self.inplanes, planes * block.expansion, stride), 187 | norm_layer(planes * block.expansion), 188 | ) 189 | 190 | layers = [] 191 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 192 | self.base_width, previous_dilation, norm_layer)) 193 | self.inplanes = planes * block.expansion 194 | for _ in range(1, blocks): 195 | layers.append(block(self.inplanes, planes, groups=self.groups, 196 | base_width=self.base_width, dilation=self.dilation, 197 | norm_layer=norm_layer)) 198 | 199 | return nn.Sequential(*layers) 200 | 201 | def _forward_impl(self, x): 202 | # See note [TorchScript super()] 203 | x = self.conv1(x) 204 | x = self.bn1(x) 205 | x = self.relu(x) 206 | x = self.maxpool(x) 207 | 208 | x = self.layer1(x) 209 | x = self.layer2(x) 210 | x = self.layer3(x) 211 | x = self.layer4(x) 212 | 213 | x = self.avgpool(x) 214 | x = torch.flatten(x, 1) 215 | x = self.fc(x) 216 | 217 | return x 218 | 219 | def forward(self, x): 220 | return self._forward_impl(x) 221 | 222 | 223 | def _resnet(arch, block, layers, pretrained, progress, **kwargs): 224 | model = ResNet(block, layers, **kwargs) 225 | if pretrained: 226 | state_dict = load_state_dict_from_url(model_urls[arch], 227 | progress=progress) 228 | model.load_state_dict(state_dict) 229 | return model 230 | 231 | 232 | def resnet18(pretrained=False, progress=True, **kwargs): 233 | r"""ResNet-18 model from 234 | `"Deep Residual Learning for Image Recognition" `_ 235 | Args: 236 | pretrained (bool): If True, returns a model pre-trained on ImageNet 237 | progress (bool): If True, displays a progress bar of the download to stderr 238 | """ 239 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, 240 | **kwargs) 241 | 242 | 243 | def resnet34(pretrained=False, progress=True, **kwargs): 244 | r"""ResNet-34 model from 245 | `"Deep Residual Learning for Image Recognition" `_ 246 | Args: 247 | pretrained (bool): If True, returns a model pre-trained on ImageNet 248 | progress (bool): If True, displays a progress bar of the download to stderr 249 | """ 250 | return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, 251 | **kwargs) 252 | 253 | 254 | def resnet50(pretrained=False, progress=True, **kwargs): 255 | r"""ResNet-50 model from 256 | `"Deep Residual Learning for Image Recognition" `_ 257 | Args: 258 | pretrained (bool): If True, returns a model pre-trained on ImageNet 259 | progress (bool): If True, displays a progress bar of the download to stderr 260 | """ 261 | return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, 262 | **kwargs) 263 | 264 | 265 | def resnet101(pretrained=False, progress=True, **kwargs): 266 | r"""ResNet-101 model from 267 | `"Deep Residual Learning for Image Recognition" `_ 268 | Args: 269 | pretrained (bool): If True, returns a model pre-trained on ImageNet 270 | progress (bool): If 
True, displays a progress bar of the download to stderr 271 | """ 272 | return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, 273 | **kwargs) 274 | 275 | 276 | def resnet152(pretrained=False, progress=True, **kwargs): 277 | r"""ResNet-152 model from 278 | `"Deep Residual Learning for Image Recognition" `_ 279 | Args: 280 | pretrained (bool): If True, returns a model pre-trained on ImageNet 281 | progress (bool): If True, displays a progress bar of the download to stderr 282 | """ 283 | return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, 284 | **kwargs) 285 | 286 | 287 | def resnext50_32x4d(pretrained=False, progress=True, **kwargs): 288 | r"""ResNeXt-50 32x4d model from 289 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 290 | Args: 291 | pretrained (bool): If True, returns a model pre-trained on ImageNet 292 | progress (bool): If True, displays a progress bar of the download to stderr 293 | """ 294 | kwargs['groups'] = 32 295 | kwargs['width_per_group'] = 4 296 | return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], 297 | pretrained, progress, **kwargs) 298 | 299 | 300 | def resnext101_32x8d(pretrained=False, progress=True, **kwargs): 301 | r"""ResNeXt-101 32x8d model from 302 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 303 | Args: 304 | pretrained (bool): If True, returns a model pre-trained on ImageNet 305 | progress (bool): If True, displays a progress bar of the download to stderr 306 | """ 307 | kwargs['groups'] = 32 308 | kwargs['width_per_group'] = 8 309 | return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], 310 | pretrained, progress, **kwargs) 311 | 312 | 313 | def wide_resnet50_2(pretrained=False, progress=True, **kwargs): 314 | r"""Wide ResNet-50-2 model from 315 | `"Wide Residual Networks" `_ 316 | The model is the same as ResNet except for the bottleneck number of channels 317 | which is twice larger in every block. The number of channels in outer 1x1 318 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 319 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 320 | Args: 321 | pretrained (bool): If True, returns a model pre-trained on ImageNet 322 | progress (bool): If True, displays a progress bar of the download to stderr 323 | """ 324 | kwargs['width_per_group'] = 64 * 2 325 | return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], 326 | pretrained, progress, **kwargs) 327 | 328 | 329 | def wide_resnet101_2(pretrained=False, progress=True, **kwargs): 330 | r"""Wide ResNet-101-2 model from 331 | `"Wide Residual Networks" `_ 332 | The model is the same as ResNet except for the bottleneck number of channels 333 | which is twice larger in every block. The number of channels in outer 1x1 334 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 335 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
336 | Args: 337 | pretrained (bool): If True, returns a model pre-trained on ImageNet 338 | progress (bool): If True, displays a progress bar of the download to stderr 339 | """ 340 | kwargs['width_per_group'] = 64 * 2 341 | return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], 342 | pretrained, progress, **kwargs) -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | MIT License 3 | 4 | Copyright (c) 2019 Yin Cao 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | ''' 24 | 25 | import torch 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | import pdb 29 | from utils.utils import make_encoder 30 | 31 | import pdb 32 | 33 | import numpy as np 34 | import torch 35 | import torch.nn as nn 36 | import torch.nn.functional as F 37 | 38 | from model_utilities import ConvBlock, init_gru, init_layer, interpolate 39 | 40 | 41 | class CRNN9(nn.Module): 42 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None): 43 | 44 | super().__init__() 45 | self.class_num = class_num 46 | self.pool_type = pool_type 47 | self.pool_size = pool_size 48 | self.interp_ratio = 8 49 | 50 | self.conv_block1 = ConvBlock(in_channels=3, out_channels=128) # 1: 7, 128 2: 7, 64 51 | self.conv_block2 = ConvBlock(in_channels=128, out_channels=256) # 1: 128, 256 2: 64, 256 52 | self.conv_block3 = ConvBlock(in_channels=256, out_channels=512) 53 | 54 | #self.gru = nn.GRU(input_size=512, hidden_size=256, 55 | # num_layers=2, dropout=0.3, batch_first=True, bidirectional=True) 56 | 57 | self.azimuth_fc = nn.Linear(512, class_num, bias=True) 58 | 59 | 60 | self.init_weights() 61 | 62 | def init_weights(self): 63 | 64 | #init_gru(self.gru) 65 | init_layer(self.azimuth_fc) 66 | 67 | 68 | def forward(self, x): 69 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 70 | 71 | x = self.conv_block1(x, self.pool_type, pool_size=self.pool_size) 72 | x = self.conv_block2(x, self.pool_type, pool_size=self.pool_size) 73 | x = self.conv_block3(x, self.pool_type, pool_size=self.pool_size) 74 | '''(batch_size, feature_maps, time_steps, mel_bins)''' 75 | 76 | if self.pool_type == 'avg': 77 | x = torch.mean(x, dim=3) 78 | elif self.pool_type == 'max': 79 | (x, _) = torch.max(x, dim=3) 80 | '''(batch_size, feature_maps, time_steps)''' 81 | 82 | x = x.transpose(1,2) 83 | ''' (batch_size, time_steps, feature_maps):''' 
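# (annotation, not part of model.py) Both the GRU declared in CRNN9.__init__ and its call just below
# are commented out, so the pooled (batch_size, time_steps, 512) features feed azimuth_fc directly;
# interpolate() from model_utilities then stretches the frame-wise outputs by interp_ratio=8, undoing
# the 2x time pooling applied by each of the three ConvBlocks.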
84 | # 85 | # self.gru.flatten_parameters() 86 | 87 | '''if pack padded''' 88 | # '''else''' 89 | #(x, _) = self.gru(x) 90 | 91 | azimuth_output = self.azimuth_fc(x) 92 | # Interpolate 93 | output = interpolate(azimuth_output, self.interp_ratio) 94 | 95 | return output 96 | 97 | 98 | class pretrained_CRNN8(CRNN9): 99 | 100 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None): 101 | 102 | super().__init__(class_num, pool_type, pool_size, pretrained_path=pretrained_path) 103 | if pretrained_path: 104 | self.load_weights(pretrained_path) 105 | 106 | self.gru = nn.GRU(input_size=512, hidden_size=256, 107 | num_layers=1, batch_first=True, bidirectional=True) 108 | 109 | init_gru(self.gru) 110 | init_layer(self.azimuth_fc) 111 | 112 | def load_weights(self, pretrained_path): 113 | 114 | model = CRNN9(self.class_num, self.pool_type, self.pool_size) 115 | checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 116 | model.load_state_dict(checkpoint['model_state_dict']) 117 | 118 | self.conv_block1 = model.conv_block1 119 | self.conv_block2 = model.conv_block2 120 | self.conv_block3 = model.conv_block3 121 | 122 | 123 | 124 | class CRNN11(nn.Module): 125 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None): 126 | 127 | super().__init__() 128 | 129 | self.class_num = class_num 130 | self.pool_type = pool_type 131 | self.pool_size = pool_size 132 | self.interp_ratio = 16 133 | 134 | self.conv_block1 = ConvBlock(in_channels=3, out_channels=64) 135 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) 136 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) 137 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) 138 | 139 | self.gru = nn.GRU(input_size=512, hidden_size=256, 140 | num_layers=2, dropout=0.3, batch_first=True, bidirectional=True) 141 | 142 | self.azimuth_fc = nn.Linear(512, class_num, bias=True) 143 | self.init_weights() 144 | 145 | def init_weights(self): 146 | 147 | init_gru(self.gru) 148 | init_layer(self.azimuth_fc) 149 | 150 | def forward(self, x): 151 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 152 | 153 | x = self.conv_block1(x, self.pool_type, pool_size=self.pool_size) 154 | x = self.conv_block2(x, self.pool_type, pool_size=self.pool_size) 155 | x = self.conv_block3(x, self.pool_type, pool_size=self.pool_size) 156 | x = self.conv_block4(x, self.pool_type, pool_size=self.pool_size) 157 | '''(batch_size, feature_maps, time_steps, mel_bins)''' 158 | 159 | if self.pool_type == 'avg': 160 | x = torch.mean(x, dim=3) 161 | elif self.pool_type == 'max': 162 | (x, _) = torch.max(x, dim=3) 163 | '''(batch_size, feature_maps, time_steps)''' 164 | 165 | x = x.transpose(1,2) 166 | ''' (batch_size, time_steps, feature_maps):''' 167 | 168 | # self.gru.flatten_parameters() 169 | (x, _) = self.gru(x) 170 | 171 | azimuth_output = self.azimuth_fc(x) 172 | '''(batch_size, time_steps, class_num)''' 173 | 174 | # Interpolate 175 | azimuth_output = interpolate(azimuth_output, self.interp_ratio) 176 | return azimuth_output 177 | 178 | 179 | class pretrained_CRNN10(CRNN11): 180 | 181 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None): 182 | 183 | super().__init__(class_num, pool_type, pool_size, pretrained_path=pretrained_path) 184 | 185 | if pretrained_path: 186 | self.load_weights(pretrained_path) 187 | 188 | self.gru = nn.GRU(input_size=512, hidden_size=256, 189 | num_layers=1, batch_first=True, 
bidirectional=True) 190 | 191 | init_gru(self.gru) 192 | init_layer(self.azimuth_fc) 193 | 194 | def load_weights(self, pretrained_path): 195 | 196 | model = CRNN11(self.class_num, self.pool_type, self.pool_size) 197 | checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 198 | model.load_state_dict(checkpoint['model_state_dict']) 199 | 200 | self.conv_block1 = model.conv_block1 201 | self.conv_block2 = model.conv_block2 202 | self.conv_block3 = model.conv_block3 203 | self.conv_block4 = model.conv_block4 204 | 205 | 206 | 207 | class Gated_CRNN9(nn.Module): 208 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None): 209 | 210 | super().__init__() 211 | 212 | self.class_num = class_num 213 | self.pool_type = pool_type 214 | self.pool_size = pool_size 215 | self.interp_ratio = 8 216 | 217 | self.conv_block1 = ConvBlock(in_channels=3, out_channels=64) # 1: 7, 128 2: 7, 64 218 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=256) # 1: 128, 256 2: 64, 256 219 | self.conv_block3 = ConvBlock(in_channels=256, out_channels=512) 220 | 221 | self.gate_block1 = ConvBlock(in_channels=3, out_channels=64) 222 | self.gate_block2 = ConvBlock(in_channels=64, out_channels=256) 223 | self.gate_block3 = ConvBlock(in_channels=256, out_channels=512) 224 | 225 | self.gru = nn.GRU(input_size=512, hidden_size=256, 226 | num_layers=2, dropout=0.3, batch_first=True, bidirectional=True) 227 | 228 | #self.azimuth_fc = nn.Linear(512, class_num, bias=True) 229 | self.azimuth_fc1 = nn.Linear(512, 128, bias=True) 230 | self.azimuth_fc2 = nn.Linear(128, class_num, bias=True) 231 | 232 | self.init_weights() 233 | 234 | def init_weights(self): 235 | 236 | init_gru(self.gru) 237 | #init_layer(self.azimuth_fc) 238 | init_layer(self.azimuth_fc1) 239 | init_layer(self.azimuth_fc2) 240 | 241 | def forward(self, x): 242 | #pdb.set_trace() 243 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 244 | gate = self.gate_block1(x, self.pool_type, pool_size=self.pool_size) 245 | x = self.conv_block1(x, self.pool_type, pool_size=self.pool_size) 246 | x = x * torch.sigmoid(gate) 247 | 248 | gate = self.gate_block2(x, self.pool_type, pool_size=self.pool_size) 249 | x = self.conv_block2(x, self.pool_type, pool_size=self.pool_size) 250 | x = x * torch.sigmoid(gate) 251 | 252 | gate = self.gate_block3(x, self.pool_type, pool_size=self.pool_size) 253 | x = self.conv_block3(x, self.pool_type, pool_size=self.pool_size) 254 | x = x * torch.sigmoid(gate) 255 | '''(batch_size, feature_maps, time_steps, mel_bins)''' 256 | 257 | if self.pool_type == 'avg': 258 | x = torch.mean(x, dim=3) 259 | elif self.pool_type == 'max': 260 | (x, _) = torch.max(x, dim=3) 261 | '''(batch_size, feature_maps, time_steps)''' 262 | 263 | x = x.transpose(1,2) 264 | ''' (batch_size, time_steps, feature_maps):''' 265 | 266 | self.gru.flatten_parameters() 267 | (x, _) = self.gru(x) 268 | 269 | x = self.azimuth_fc1(x) 270 | azimuth_output = self.azimuth_fc2(x) 271 | #azimuth_output = self.azimuth_fc(x) 272 | '''(batch_size, time_steps, class_num)''' 273 | 274 | # Interpolate 275 | azimuth_output = interpolate(azimuth_output, self.interp_ratio) 276 | azimuth_output = F.sigmoid(azimuth_output) 277 | 278 | return azimuth_output 279 | 280 | 281 | class pretrained_Gated_CRNN8(Gated_CRNN9): 282 | 283 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None): 284 | 285 | super().__init__(class_num, pool_type, pool_size, pretrained_path=pretrained_path) 286 | 287 | if 
pretrained_path: 288 | self.load_weights(pretrained_path) 289 | 290 | self.gru = nn.GRU(input_size=512, hidden_size=256, 291 | num_layers=1, batch_first=True, bidirectional=True) 292 | 293 | init_gru(self.gru) 294 | #init_layer(self.azimuth_fc) 295 | init_layer(self.azimuth_fc1) 296 | init_layer(self.azimuth_fc2) 297 | 298 | def load_weights(self, pretrained_path): 299 | 300 | model = Gated_CRNN9(self.class_num, self.pool_type, self.pool_size) 301 | checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 302 | model.load_state_dict(checkpoint['model_state_dict']) 303 | 304 | self.conv_block1 = model.conv_block1 305 | self.conv_block2 = model.conv_block2 306 | self.conv_block3 = model.conv_block3 307 | 308 | class JUNGMIN(nn.Module): 309 | def __init__(self): 310 | super().__init__() 311 | 312 | in_channel = 514 313 | out_channel = 514 314 | 315 | self.layer1 = nn.Sequential( 316 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 317 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 318 | nn.BatchNorm1d(out_channel), 319 | nn.ReLU(), 320 | ) 321 | 322 | 323 | 324 | self.layer2 = nn.Sequential( 325 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 326 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 327 | nn.BatchNorm1d(out_channel), 328 | nn.ReLU(), 329 | ) 330 | 331 | self.layer3 = nn.Sequential( 332 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 333 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 334 | nn.BatchNorm1d(out_channel), 335 | nn.ReLU(), 336 | ) 337 | 338 | self.fc = nn.Linear(514, 10) 339 | self.init_weights() 340 | 341 | def init_weights(self): 342 | init_layer(self.fc) 343 | 344 | def forward(self, x): 345 | 346 | B,M,T,F = x.size() 347 | # pdb.set_trace() 348 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 349 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T) 350 | 351 | x = self.layer1(x) 352 | x = self.layer2(x) 353 | x = self.layer3(x) 354 | x = x.transpose(1,2) 355 | 356 | # (x, _) = self.gru(x) 357 | return self.fc(x) 358 | 359 | class JUNGMIN2(nn.Module): 360 | def __init__(self): 361 | super().__init__() 362 | 363 | in_channel = 514 364 | out_channel = 514 365 | 366 | self.layer1 = nn.Sequential( 367 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 368 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 369 | nn.BatchNorm1d(out_channel), 370 | nn.ReLU(), 371 | ) 372 | 373 | 374 | 375 | self.layer2 = nn.Sequential( 376 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 377 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 378 | nn.BatchNorm1d(out_channel), 379 | nn.ReLU(), 380 | ) 381 | 382 | self.layer3 = nn.Sequential( 383 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 384 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 385 | nn.BatchNorm1d(out_channel), 386 | nn.ReLU(), 387 | ) 388 | 389 | 390 | self.gru = nn.GRU(input_size=514, hidden_size=256, 391 | num_layers=2, dropout=0.3, batch_first=True, bidirectional=True) 392 | 393 | self.fc = nn.Linear(512, 10) 394 | 395 | self.init_weights() 396 | 397 | def init_weights(self): 398 | 399 | init_gru(self.gru) 400 | init_layer(self.fc) 401 | 402 | def forward(self, x): 403 | 404 | B,M,T,F = x.size() 405 | # pdb.set_trace() 
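# (annotation, not part of model.py) As in the other JUNGMIN variants, the permute/reshape below folds
# the mic-channel and frequency axes into a single channel axis of size M*F, which must equal 514 to
# match the depthwise-separable (groups=in_channel) Conv1d stacks declared in __init__.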
406 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 407 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T) 408 | 409 | x = self.layer1(x) 410 | x = self.layer2(x) 411 | x = self.layer3(x) 412 | x = x.transpose(1,2) 413 | 414 | (x, _) = self.gru(x) 415 | return self.fc(x) 416 | 417 | 418 | class JUNGMIN3(nn.Module): 419 | def __init__(self): 420 | super().__init__() 421 | 422 | in_channel = 514 423 | out_channel = 514 424 | 425 | self.layer1 = nn.Sequential( 426 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 427 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 428 | nn.BatchNorm1d(out_channel), 429 | nn.ReLU(), 430 | ) 431 | 432 | 433 | 434 | self.layer2 = nn.Sequential( 435 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 436 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 437 | nn.BatchNorm1d(out_channel), 438 | nn.ReLU(), 439 | ) 440 | 441 | self.layer3 = nn.Sequential( 442 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 443 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 444 | nn.BatchNorm1d(out_channel), 445 | nn.ReLU(), 446 | ) 447 | 448 | self.fc = nn.Sequential( 449 | nn.Linear(514, 256), 450 | nn.ReLU(), 451 | nn.Linear(256, 10) 452 | ) 453 | 454 | 455 | self.init_weights() 456 | 457 | def init_weights(self): 458 | init_layer(self.fc) 459 | 460 | def forward(self, x): 461 | 462 | B,M,T,F = x.size() 463 | # pdb.set_trace() 464 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 465 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T) 466 | 467 | x = self.layer1(x) 468 | x = self.layer2(x) 469 | x = self.layer3(x) 470 | x = x.transpose(1,2) 471 | 472 | # (x, _) = self.gru(x) 473 | return self.fc(x) 474 | 475 | 476 | 477 | 478 | 479 | class JUNGMIN4(nn.Module): 480 | def __init__(self): 481 | super().__init__() 482 | 483 | in_channel = 514 484 | out_channel = 514 485 | 486 | self.layer1 = nn.Sequential( 487 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 488 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 489 | nn.BatchNorm1d(out_channel), 490 | nn.ReLU(), 491 | ) 492 | 493 | 494 | 495 | self.layer2 = nn.Sequential( 496 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 497 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 498 | nn.BatchNorm1d(out_channel), 499 | nn.ReLU(), 500 | ) 501 | 502 | self.layer3 = nn.Sequential( 503 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 504 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 505 | nn.BatchNorm1d(out_channel), 506 | nn.ReLU(), 507 | ) 508 | 509 | 510 | self.layer4 = nn.Sequential( 511 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 512 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 513 | nn.BatchNorm1d(out_channel), 514 | nn.ReLU(), 515 | ) 516 | 517 | 518 | 519 | self.fc = nn.Sequential( 520 | nn.Linear(514, 256), 521 | nn.ReLU(), 522 | nn.Linear(256, 10) 523 | ) 524 | 525 | 526 | self.init_weights() 527 | 528 | def init_weights(self): 529 | init_layer(self.fc) 530 | 531 | def forward(self, x): 532 | 533 | B,M,T,F = x.size() 534 | # pdb.set_trace() 535 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 536 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T) 537 | 538 | x = 
self.layer1(x) 539 | x = self.layer2(x) 540 | x = self.layer3(x) 541 | x = self.layer4(x) 542 | x = x.transpose(1,2) 543 | 544 | # (x, _) = self.gru(x) 545 | return self.fc(x) 546 | 547 | 548 | 549 | 550 | class JUNGMIN5(nn.Module): 551 | def __init__(self): 552 | super().__init__() 553 | 554 | in_channel = 514 555 | out_channel = 514 556 | 557 | self.layer1 = nn.Sequential( 558 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 559 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 560 | nn.BatchNorm1d(out_channel), 561 | nn.ReLU(), 562 | ) 563 | 564 | 565 | 566 | self.layer2 = nn.Sequential( 567 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 568 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 569 | nn.BatchNorm1d(out_channel), 570 | nn.ReLU(), 571 | ) 572 | 573 | self.layer3 = nn.Sequential( 574 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 575 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 576 | nn.BatchNorm1d(out_channel), 577 | nn.ReLU(), 578 | ) 579 | 580 | 581 | self.layer4 = nn.Sequential( 582 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel), 583 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0), 584 | nn.BatchNorm1d(out_channel), 585 | nn.ReLU(), 586 | ) 587 | 588 | 589 | 590 | self.fc = nn.Sequential( 591 | nn.Linear(514, 256), 592 | nn.ReLU(), 593 | nn.Linear(256, 10) 594 | ) 595 | 596 | 597 | self.init_weights() 598 | 599 | def init_weights(self): 600 | init_layer(self.fc) 601 | 602 | def forward(self, x): 603 | 604 | B,M,T,F = x.size() 605 | # pdb.set_trace() 606 | '''input: (batch_size, mic_channels, time_steps, mel_bins)''' 607 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T) 608 | 609 | x = self.layer1(x) 610 | x = self.layer2(x) 611 | x = self.layer3(x) 612 | x = self.layer4(x) 613 | x = x.transpose(1,2) 614 | 615 | # (x, _) = self.gru(x) 616 | return self.fc(x) 617 | -------------------------------------------------------------------------------- /utils/feature_extractor.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pdb 4 | import sys 5 | from timeit import default_timer as timer 6 | 7 | import h5py 8 | import librosa 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import scipy.io as sio 13 | from scipy import signal 14 | from tqdm import tqdm 15 | 16 | from utilities import calculate_scalar, event_labels, lb_to_ix 17 | 18 | fs = 32000 19 | nfft = 1024 20 | hopsize = 320 # 640 for 20 ms 21 | mel_bins = 128 22 | window = 'hann' 23 | fmin = 50 24 | hdf5_folder_name = '{}fs_{}nfft_{}hs_{}melb'.format(fs, nfft, hopsize, mel_bins) 25 | 26 | 27 | class LogMelExtractor(): 28 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin): 29 | 30 | self.nfft = nfft 31 | self.hopsize = hopsize 32 | self.window = window 33 | self.melW = librosa.filters.mel(sr=fs, 34 | n_fft=nfft, 35 | n_mels=mel_bins, 36 | fmin=fmin) 37 | 38 | def transform(self, audio): 39 | 40 | channel_num = audio.shape[0] 41 | feature_logmel = [] 42 | 43 | for n in range(channel_num): 44 | S = np.abs(librosa.stft(y=audio[n], 45 | n_fft=self.nfft, 46 | hop_length=self.hopsize, 47 | center=True, 48 | window=self.window, 49 | pad_mode='reflect'))**2 50 | 51 | S_mel = np.dot(self.melW, S).T 52 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, 
amin=1e-10, top_db=None) 53 | S_logmel = np.expand_dims(S_logmel, axis=0) 54 | feature_logmel.append(S_logmel) 55 | 56 | feature_logmel = np.concatenate(feature_logmel, axis=0) 57 | 58 | return feature_logmel 59 | 60 | 61 | class LogMelGccExtractor(): 62 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin): 63 | 64 | self.nfft = nfft 65 | self.hopsize = hopsize 66 | self.window = window 67 | self.melW = librosa.filters.mel(sr=fs, 68 | n_fft=nfft, 69 | n_mels=mel_bins, 70 | fmin=fmin) 71 | 72 | def logmel(self, sig): 73 | 74 | S = np.abs(librosa.stft(y=sig, 75 | n_fft=self.nfft, 76 | hop_length=self.hopsize, 77 | center=True, 78 | window=self.window, 79 | pad_mode='reflect'))**2 80 | S_mel = np.dot(self.melW, S).T 81 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, amin=1e-10, top_db=None) 82 | S_logmel = np.expand_dims(S_logmel, axis=0) 83 | 84 | return S_logmel 85 | 86 | def gcc_phat(self, sig, refsig): 87 | 88 | ncorr = 2*self.nfft - 1 89 | nfft = int(2**np.ceil(np.log2(np.abs(ncorr)))) 90 | Px = librosa.stft(y=sig, 91 | n_fft=nfft, 92 | hop_length=self.hopsize, 93 | center=True, 94 | window=self.window, 95 | pad_mode='reflect') 96 | Px_ref = librosa.stft(y=refsig, 97 | n_fft=nfft, 98 | hop_length=self.hopsize, 99 | center=True, 100 | window=self.window, 101 | pad_mode='reflect') 102 | 103 | R = Px*np.conj(Px_ref) 104 | 105 | n_frames = R.shape[1] 106 | gcc_phat = [] 107 | for i in range(n_frames): 108 | spec = R[:, i].flatten() 109 | cc = np.fft.irfft(np.exp(1.j*np.angle(spec))) 110 | cc = np.concatenate((cc[-mel_bins//2:], cc[:mel_bins//2])) 111 | gcc_phat.append(cc) 112 | gcc_phat = np.array(gcc_phat) 113 | gcc_phat = gcc_phat[None,:,:] 114 | 115 | return gcc_phat 116 | 117 | def transform(self, audio): 118 | 119 | channel_num = audio.shape[0] 120 | feature_logmel = [] 121 | feature_gcc_phat = [] 122 | for n in range(channel_num): 123 | feature_logmel.append(self.logmel(audio[n])) 124 | for m in range(n+1, channel_num): 125 | feature_gcc_phat.append( 126 | self.gcc_phat(sig=audio[m], refsig=audio[n])) 127 | 128 | feature_logmel = np.concatenate(feature_logmel, axis=0) 129 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0) 130 | feature = np.concatenate([feature_logmel, feature_gcc_phat]) 131 | 132 | return feature 133 | 134 | 135 | class LogMelIntensityExtractor(): 136 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin): 137 | 138 | self.nfft = nfft 139 | self.hopsize = hopsize 140 | self.window = window 141 | self.melW = librosa.filters.mel(sr=fs, 142 | n_fft=nfft, 143 | n_mels=mel_bins, 144 | fmin=fmin) 145 | 146 | def logmel(self, sig): 147 | 148 | S = np.abs(librosa.stft(y=sig, 149 | n_fft=nfft, 150 | hop_length=self.hopsize, 151 | center=True, 152 | window=self.window, 153 | pad_mode='reflect'))**2 154 | S_mel = np.dot(self.melW, S).T 155 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, amin=1e-10, top_db=None) 156 | S_logmel = np.expand_dims(S_logmel, axis=0) 157 | 158 | return S_logmel 159 | 160 | def intensity(self, sig): 161 | 162 | ref = sig[0] 163 | x = sig[1] 164 | y = sig[2] 165 | z = sig[3] 166 | 167 | Pref = librosa.stft(y=ref, 168 | n_fft=nfft, 169 | hop_length=hopsize, 170 | center=True, 171 | window=self.window, 172 | pad_mode='reflect') 173 | Px = librosa.stft(y=x, 174 | n_fft=nfft, 175 | hop_length=hopsize, 176 | center=True, 177 | window=self.window, 178 | pad_mode='reflect') 179 | Py = librosa.stft(y=y, 180 | n_fft=nfft, 181 | hop_length=hopsize, 182 | center=True, 183 | window=self.window, 184 | pad_mode='reflect') 185 | Pz = 
librosa.stft(y=z, 186 | n_fft=nfft, 187 | hop_length=hopsize, 188 | center=True, 189 | window=self.window, 190 | pad_mode='reflect') 191 | 192 | I1 = np.real(np.conj(Pref) * Px) 193 | I2 = np.real(np.conj(Pref) * Py) 194 | I3 = np.real(np.conj(Pref) * Pz) 195 | normal = np.sqrt(I1**2 + I2**2 + I3**2) 196 | I1 = np.dot(self.melW, I1 / normal).T 197 | I2 = np.dot(self.melW, I2 / normal).T 198 | I3 = np.dot(self.melW, I3 / normal).T 199 | intensity = np.array([I1, I2, I3]) 200 | 201 | return intensity 202 | 203 | def transform(self, audio): 204 | 205 | channel_num = audio.shape[0] 206 | feature_logmel = [] 207 | for n in range(0, channel_num): 208 | feature_logmel.append(self.logmel(audio[n])) 209 | feature_intensity = self.intensity(sig=audio) 210 | 211 | feature_logmel = np.concatenate(feature_logmel, axis=0) 212 | feature = np.concatenate([feature_logmel, feature_intensity], axis=0) 213 | 214 | return feature 215 | 216 | 217 | class LogMelGccIntensityExtractor(): 218 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin): 219 | 220 | self.nfft = nfft 221 | self.hopsize = hopsize 222 | self.window = window 223 | self.melW = librosa.filters.mel(sr=fs, 224 | n_fft=nfft, 225 | n_mels=mel_bins, 226 | fmin=fmin) 227 | 228 | def logmel(self, sig): 229 | 230 | S = np.abs(librosa.stft(y=sig, 231 | n_fft=self.nfft, 232 | hop_length=self.hopsize, 233 | center=True, 234 | window=self.window, 235 | pad_mode='reflect'))**2 236 | S_mel = np.dot(self.melW, S).T 237 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, amin=1e-10, top_db=None) 238 | S_logmel = np.expand_dims(S_logmel, axis=0) 239 | 240 | return S_logmel 241 | 242 | def gcc_phat(self, sig, refsig): 243 | 244 | ncorr = 2*self.nfft - 1 245 | nfft = int(2**np.ceil(np.log2(np.abs(ncorr)))) 246 | Px = librosa.stft(y=sig, 247 | n_fft=nfft, 248 | hop_length=self.hopsize, 249 | center=True, 250 | window=self.window, 251 | pad_mode='reflect') 252 | Px_ref = librosa.stft(y=refsig, 253 | n_fft=nfft, 254 | hop_length=self.hopsize, 255 | center=True, 256 | window=self.window, 257 | pad_mode='reflect') 258 | 259 | R = Px*np.conj(Px_ref) 260 | 261 | n_frames = R.shape[1] 262 | gcc_phat = [] 263 | for i in range(n_frames): 264 | spec = R[:, i].flatten() 265 | cc = np.fft.irfft(np.exp(1.j*np.angle(spec))) 266 | cc = np.concatenate((cc[-mel_bins//2:], cc[:mel_bins//2])) 267 | gcc_phat.append(cc) 268 | gcc_phat = np.array(gcc_phat) 269 | gcc_phat = gcc_phat[None,:,:] 270 | 271 | return gcc_phat 272 | 273 | def intensity(self, sig): 274 | 275 | ref = sig[0] 276 | x = sig[1] 277 | y = sig[2] 278 | z = sig[3] 279 | 280 | Pref = librosa.stft(y=ref, 281 | n_fft=nfft, 282 | hop_length=hopsize, 283 | center=True, 284 | window=self.window, 285 | pad_mode='reflect') 286 | Px = librosa.stft(y=x, 287 | n_fft=nfft, 288 | hop_length=hopsize, 289 | center=True, 290 | window=self.window, 291 | pad_mode='reflect') 292 | Py = librosa.stft(y=y, 293 | n_fft=nfft, 294 | hop_length=hopsize, 295 | center=True, 296 | window=self.window, 297 | pad_mode='reflect') 298 | Pz = librosa.stft(y=z, 299 | n_fft=nfft, 300 | hop_length=hopsize, 301 | center=True, 302 | window=self.window, 303 | pad_mode='reflect') 304 | 305 | I1 = np.real(np.conj(Pref) * Px) 306 | I2 = np.real(np.conj(Pref) * Py) 307 | I3 = np.real(np.conj(Pref) * Pz) 308 | normal = np.sqrt(I1**2 + I2**2 + I3**2) 309 | I1 = np.dot(self.melW, I1 / normal).T 310 | I2 = np.dot(self.melW, I2 / normal).T 311 | I3 = np.dot(self.melW, I3 / normal).T 312 | intensity = np.array([I1, I2, I3]) 313 | 314 | return intensity 315 | 316 
| def transform(self, audio): 317 | 318 | feature_logmel = [] 319 | for n in range(0, 4): 320 | feature_logmel.append(self.logmel(audio[n])) 321 | feature_intensity = self.intensity(sig=audio[0:4]) 322 | feature_logmel = np.concatenate(feature_logmel, axis=0) 323 | feature_foa = np.concatenate([feature_logmel, feature_intensity], axis=0) 324 | 325 | feature_logmel = [] 326 | feature_gcc_phat = [] 327 | for n in range(4, 8): 328 | feature_logmel.append(self.logmel(audio[n])) 329 | for m in range(n+1, 8): 330 | feature_gcc_phat.append( 331 | self.gcc_phat(sig=audio[m], refsig=audio[n])) 332 | feature_logmel = np.concatenate(feature_logmel, axis=0) 333 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0) 334 | feature_mic = np.concatenate([feature_logmel, feature_gcc_phat], axis=0) 335 | 336 | feature = np.concatenate([feature_foa, feature_mic], axis=0) 337 | 338 | return feature 339 | 340 | 341 | def RT_preprocessing(extractor, audio): 342 | 343 | '''This step needs to be considered''' 344 | # audio = audio / (np.max(np.abs(audio)) + np.finfo(np.float).eps) 345 | 346 | feature = extractor.transform(audio) 347 | '''(channels, seq_len, mel_bins)''' 348 | '''(channels, time, frequency)''' 349 | 350 | return feature 351 | 352 | def extract_dev_features(args): 353 | """ 354 | Write features and infos of audios to hdf5. 355 | Args: 356 | dataset_dir: dataset path 357 | feature_dir: feature path 358 | audio_type: 'foa' | 'mic' | 'foa&mic' 359 | """ 360 | # extractor 361 | if args.feature_type == 'logmel': 362 | extractor = LogMelExtractor(fs=fs, 363 | nfft=nfft, 364 | hopsize=hopsize, 365 | mel_bins=mel_bins, 366 | window=window, 367 | fmin=fmin) 368 | elif args.feature_type == 'logmelgcc': 369 | extractor = LogMelGccExtractor(fs=fs, 370 | nfft=nfft, 371 | hopsize=hopsize, 372 | mel_bins=mel_bins, 373 | window=window, 374 | fmin=fmin) 375 | elif args.feature_type == 'logmelintensity': 376 | extractor = LogMelIntensityExtractor(fs=fs, 377 | nfft=nfft, 378 | hopsize=hopsize, 379 | mel_bins=mel_bins, 380 | window=window, 381 | fmin=fmin) 382 | elif args.feature_type == 'logmelgccintensity': 383 | extractor = LogMelGccIntensityExtractor(fs=fs, 384 | nfft=nfft, 385 | hopsize=hopsize, 386 | mel_bins=mel_bins, 387 | window=window, 388 | fmin=fmin) 389 | 390 | # Path 391 | if args.feature_type == 'logmelgccintensity': 392 | audio_dir = [os.path.join(args.dataset_dir, 'dev', 'foa_dev'), os.path.join(args.dataset_dir, 'dev', 'mic_dev')] 393 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type, 394 | hdf5_folder_name, 'foa&mic_dev') 395 | os.makedirs(hdf5_dir, exist_ok=True) 396 | else: 397 | audio_dir = [os.path.join(args.dataset_dir, 'dev', args.audio_type + '_dev')] 398 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type, 399 | hdf5_folder_name, args.audio_type + '_dev') 400 | os.makedirs(hdf5_dir, exist_ok=True) 401 | 402 | meta_dir = os.path.join(args.dataset_dir, 'dev', 'metadata_dev') 403 | 404 | begin_time = timer() 405 | audio_count = 0 406 | 407 | print('\n============> Start Extracting Features\n') 408 | 409 | iterator = tqdm(sorted(os.listdir(audio_dir[0])), total=len(os.listdir(audio_dir[0])), unit='it') 410 | 411 | for audio_fn in iterator: 412 | 413 | if audio_fn.endswith('.wav') and not audio_fn.startswith('.'): 414 | 415 | fn = audio_fn.split('.')[0] 416 | if args.feature_type == 'logmelgccintensity': 417 | audio_path = [os.path.join(audio_dir[0], audio_fn), os.path.join(audio_dir[1], audio_fn)] 418 | audio_foa, _ = librosa.load(audio_path[0], sr=fs, mono=False, 
dtype=np.float32) 419 | audio_mic, _ = librosa.load(audio_path[1], sr=fs, mono=False, dtype=np.float32) 420 | audio_len = min(audio_foa.shape[1], audio_mic.shape[1]) 421 | audio = np.concatenate([audio_foa[:, :audio_len], audio_mic[:, :audio_len]], axis=0) 422 | '''(channel_nums, samples)''' 423 | else: 424 | audio_path = os.path.join(audio_dir[0], audio_fn) 425 | audio, _ = librosa.load(audio_path, sr=fs, mono=False, dtype=np.float32) 426 | '''(channel_nums, samples)''' 427 | 428 | audio_count += 1 429 | 430 | if np.sum(np.abs(audio)) < len(audio)*1e-4: 431 | with open("feature_removed.txt", "a+") as text_file: 432 | # print("Purchase Amount: {}".format(TotalAmount), file=text_file) 433 | print(f"Silent file removed in feature extractor: {audio_fn}", 434 | file=text_file) 435 | tqdm.write("Silent file removed in feature extractor: {}".format(audio_fn)) 436 | continue 437 | 438 | # features 439 | feature = RT_preprocessing(extractor, audio) 440 | '''(channels, time, frequency)''' 441 | 442 | meta_fn = fn + '.csv' 443 | df = pd.read_csv(os.path.join(meta_dir, meta_fn)) 444 | 445 | target_event = df['sound_event_recording'].values 446 | target_start_time = df['start_time'].values 447 | target_end_time = df['end_time'].values 448 | target_ele = df['ele'].values 449 | target_azi = df['azi'].values 450 | target_dist = df['dist'].values 451 | 452 | hdf5_path = os.path.join(hdf5_dir, fn + '.h5') 453 | with h5py.File(hdf5_path, 'w') as hf: 454 | 455 | hf.create_dataset('feature', data=feature, dtype=np.float32) 456 | # hf.create_dataset('filename', data=[na.encode() for na in [fn]], dtype='S20') 457 | 458 | hf.create_group('target') 459 | hf['target'].create_dataset('event', data=[e.encode() for e in target_event], dtype='S20') 460 | hf['target'].create_dataset('start_time', data=target_start_time, dtype=np.float32) 461 | hf['target'].create_dataset('end_time', data=target_end_time, dtype=np.float32) 462 | hf['target'].create_dataset('elevation', data=target_ele, dtype=np.float32) 463 | hf['target'].create_dataset('azimuth', data=target_azi, dtype=np.float32) 464 | hf['target'].create_dataset('distance', data=target_dist, dtype=np.float32) 465 | 466 | tqdm.write('{}, {}, {}'.format(audio_count, hdf5_path, feature.shape)) 467 | 468 | iterator.close() 469 | print("Extacting feature finished! Time spent: {:.3f} s".format(timer() - begin_time)) 470 | 471 | 472 | def fit(args): 473 | """ 474 | Calculate scalar. 
475 | Args: 476 | feature_dir: feature path 477 | audio_type: 'foa' | 'mic' | 'foa&mic' 478 | """ 479 | 480 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type, 481 | hdf5_folder_name, args.audio_type + '_dev') 482 | 483 | scalar_path = os.path.join(args.feature_dir, args.feature_type, 484 | hdf5_folder_name, args.audio_type + '_scalar.h5') 485 | 486 | os.makedirs(os.path.dirname(scalar_path), exist_ok=True) 487 | 488 | print('\n============> Start Calculating Scalar.\n') 489 | 490 | load_time = timer() 491 | features = [] 492 | for hdf5_fn in os.listdir(hdf5_dir): 493 | hdf5_path = os.path.join(hdf5_dir, hdf5_fn) 494 | with h5py.File(hdf5_path, 'r') as hf: 495 | features.append(hf['feature'][:]) 496 | print('Load feature time: {:.3f} s'.format(timer() - load_time)) 497 | 498 | features = np.concatenate(features, axis=1) 499 | (mean, std) = calculate_scalar(features) 500 | 501 | with h5py.File(scalar_path, 'w') as hf_scalar: 502 | hf_scalar.create_dataset('mean', data=mean, dtype=np.float32) 503 | hf_scalar.create_dataset('std', data=std, dtype=np.float32) 504 | 505 | print('Features shape: {}'.format(features.shape)) 506 | print('mean {}:\n{}'.format(mean.shape, mean)) 507 | print('std {}:\n{}'.format(std.shape, std)) 508 | print('Write out scalar to {}'.format(scalar_path)) 509 | 510 | 511 | def extract_eval_features(args): 512 | """ 513 | Write features and infos of audios to hdf5. 514 | Args: 515 | dataset_dir: dataset path 516 | feature_dir: feature path 517 | audio_type: 'foa' | 'mic' | 'foa&mic' 518 | """ 519 | # extractor 520 | if args.feature_type == 'logmel': 521 | extractor = LogMelExtractor(fs=fs, 522 | nfft=nfft, 523 | hopsize=hopsize, 524 | mel_bins=mel_bins, 525 | window=window, 526 | fmin=fmin) 527 | elif args.feature_type == 'logmelgcc': 528 | extractor = LogMelGccExtractor(fs=fs, 529 | nfft=nfft, 530 | hopsize=hopsize, 531 | mel_bins=mel_bins, 532 | window=window, 533 | fmin=fmin) 534 | elif args.feature_type == 'logmelintensity': 535 | extractor = LogMelIntensityExtractor(fs=fs, 536 | nfft=nfft, 537 | hopsize=hopsize, 538 | mel_bins=mel_bins, 539 | window=window, 540 | fmin=fmin) 541 | elif args.feature_type == 'logmelgccintensity': 542 | extractor = LogMelGccIntensityExtractor(fs=fs, 543 | nfft=nfft, 544 | hopsize=hopsize, 545 | mel_bins=mel_bins, 546 | window=window, 547 | fmin=fmin) 548 | 549 | # Path 550 | if args.feature_type == 'logmelgccintensity': 551 | audio_dir = [os.path.join(args.dataset_dir, 'eval', 'foa_eval'), os.path.join(args.dataset_dir, 'eval', 'mic_eval')] 552 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type, 553 | hdf5_folder_name, 'foa&mic_eval') 554 | os.makedirs(hdf5_dir, exist_ok=True) 555 | else: 556 | audio_dir = [os.path.join(args.dataset_dir, 'eval', args.audio_type + '_eval')] 557 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type, 558 | hdf5_folder_name, args.audio_type + '_eval') 559 | os.makedirs(hdf5_dir, exist_ok=True) 560 | 561 | begin_time = timer() 562 | audio_count = 0 563 | 564 | print('\n============> Start Extracting Features\n') 565 | pdb.set_trace() 566 | iterator = tqdm(sorted(os.listdir(audio_dir[0])), total=len(os.listdir(audio_dir[0])), unit='it') 567 | 568 | for audio_fn in iterator: 569 | 570 | if audio_fn.endswith('.wav') and not audio_fn.startswith('.'): 571 | 572 | fn = audio_fn.split('.')[0] 573 | if args.feature_type == 'logmelgccintensity': 574 | audio_path = [os.path.join(audio_dir[0], audio_fn), os.path.join(audio_dir[1], audio_fn)] 575 | audio_foa, _ = librosa.load(audio_path[0], 
sr=fs, mono=False, dtype=np.float32) 576 | audio_mic, _ = librosa.load(audio_path[1], sr=fs, mono=False, dtype=np.float32) 577 | audio_len = min(audio_foa.shape[1], audio_mic.shape[1]) 578 | audio = np.concatenate([audio_foa[:, :audio_len], audio_mic[:, :audio_len]], axis=0) 579 | '''(channel_nums, samples)''' 580 | else: 581 | audio_path = os.path.join(audio_dir[0], audio_fn) 582 | audio, _ = librosa.load(audio_path, sr=fs, mono=False, dtype=np.float32) 583 | '''(channel_nums, samples)''' 584 | 585 | audio_count += 1 586 | 587 | if np.sum(np.abs(audio)) < len(audio)*1e-4: 588 | with open("feature_removed.txt", "a+") as text_file: 589 | print(f"Silent file removed in feature extractor: {audio_fn}", 590 | file=text_file) 591 | tqdm.write("Silent file removed in feature extractor: {}".format(audio_fn)) 592 | continue 593 | 594 | # features 595 | feature = RT_preprocessing(extractor, audio) 596 | '''(channels, time, frequency)''' 597 | 598 | hdf5_path = os.path.join(hdf5_dir, fn + '.h5') 599 | with h5py.File(hdf5_path, 'w') as hf: 600 | 601 | hf.create_dataset('feature', data=feature, dtype=np.float32) 602 | 603 | tqdm.write('{}, {}, {}'.format(audio_count, hdf5_path, feature.shape)) 604 | 605 | iterator.close() 606 | print("Extacting feature finished! Time spent: {:.3f} s".format(timer() - begin_time)) 607 | 608 | 609 | if __name__ == '__main__': 610 | parser = argparse.ArgumentParser(description='Extract features from audio file') 611 | 612 | parser.add_argument('--dataset_dir', type=str, required=True) 613 | parser.add_argument('--feature_dir', type=str, required=True) 614 | parser.add_argument('--feature_type', type=str, required=True, 615 | choices=['logmel', 'logmelgcc', 'logmelintensity', 'logmelgccintensity']) 616 | parser.add_argument('--data_type', type=str, required=True, 617 | choices=['dev', 'eval']) 618 | parser.add_argument('--audio_type', type=str, required=True, 619 | choices=['foa', 'mic', 'foa&mic']) 620 | 621 | args = parser.parse_args() 622 | 623 | if args.feature_type == 'logmelgccintensity': 624 | args.audio_type = 'foa&mic' 625 | 626 | if args.data_type == 'dev': 627 | extract_dev_features(args) 628 | fit(args) 629 | elif args.data_type == 'eval': 630 | extract_eval_features(args) --------------------------------------------------------------------------------
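Annotation (not part of the repository): the extractors above are normally driven through the argparse entry point at the bottom of this file (--dataset_dir, --feature_dir, --feature_type, --data_type, --audio_type), presumably via utils/extract.sh. Below is a minimal standalone sketch of the simplest extractor, assuming it is run from inside utils/ so that the module-level constants (fs, nfft, hopsize, mel_bins, window, fmin) resolve; the random array merely stands in for a real multichannel recording.

# Hypothetical standalone use of LogMelExtractor (illustrative only).
import numpy as np
from feature_extractor import LogMelExtractor, fs, nfft, hopsize, mel_bins, window, fmin

extractor = LogMelExtractor(fs=fs, nfft=nfft, hopsize=hopsize,
                            mel_bins=mel_bins, window=window, fmin=fmin)

audio = np.random.randn(4, fs * 2).astype(np.float32)   # (channel_nums, samples): 2 s of 4-channel noise
feature = extractor.transform(audio)
print(feature.shape)                                     # (4, 201, 128) -> (channels, time, mel_bins)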