├── main.sh
├── .gitignore
├── __pycache__
│   ├── model.cpython-37.pyc
│   ├── model.cpython-38.pyc
│   ├── trainer.cpython-37.pyc
│   ├── trainer.cpython-38.pyc
│   ├── datasets.cpython-37.pyc
│   ├── test_data_pkl.cpython-37.pyc
│   ├── test_data_pkl.cpython-38.pyc
│   ├── model_utilities.cpython-37.pyc
│   ├── model_utilities.cpython-38.pyc
│   ├── train_data_mine.cpython-37.pyc
│   └── train_data_mine.cpython-38.pyc
├── test.sh
├── utils
│   ├── __pycache__
│   │   ├── loss.cpython-37.pyc
│   │   ├── loss.cpython-38.pyc
│   │   ├── resnet.cpython-37.pyc
│   │   ├── resnet.cpython-38.pyc
│   │   ├── setup.cpython-37.pyc
│   │   ├── setup.cpython-38.pyc
│   │   ├── utils.cpython-37.pyc
│   │   ├── utils.cpython-38.pyc
│   │   └── utilities.cpython-37.pyc
│   ├── extract.sh
│   ├── loss.py
│   ├── setup.py
│   ├── check_data_len.py
│   ├── utils.py
│   ├── vad.py
│   ├── utilities.py
│   ├── resnet.py
│   └── feature_extractor.py
├── README.md
├── config
│   └── config.yml
├── LICENSE.md
├── main.py
├── model_utilities.py
├── datasets.py
├── trainer.py
└── model.py
/main.sh:
--------------------------------------------------------------------------------
1 | python3 main.py -b './' -c './config/' -d 'config' -m 'Train'
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.zip
2 | *.idea
3 | __pycache__/
4 | *.pt
5 | *.swp
6 | *.log
7 | ./out_train
8 | *.wav
9 |
--------------------------------------------------------------------------------
/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/model.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/model.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/trainer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/trainer.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/trainer.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/trainer.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/datasets.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/datasets.cpython-37.pyc
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | #python3 main.py -b './' -c './config/' -d 'config' -m 'Train'
2 | python3 main.py -b './' -c './config/' -d 'config' -m 'Test'
3 |
--------------------------------------------------------------------------------
/utils/__pycache__/loss.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/loss.cpython-37.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/loss.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/loss.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/resnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/resnet.cpython-37.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/resnet.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/resnet.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/setup.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/setup.cpython-37.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/setup.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/setup.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/utils.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/test_data_pkl.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/test_data_pkl.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/test_data_pkl.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/test_data_pkl.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/model_utilities.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/model_utilities.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/model_utilities.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/model_utilities.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/train_data_mine.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/train_data_mine.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/train_data_mine.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/__pycache__/train_data_mine.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/utilities.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shleee47/Sound-Source-Localization/HEAD/utils/__pycache__/utilities.cpython-37.pyc
--------------------------------------------------------------------------------
/utils/extract.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Data directory
4 | DATASET_DIR='/home/nas/DB/AI_grand_challenge_2020/2020/t3_audio/'
5 |
6 | # Feature directory
7 | FEATURE_DIR='/home/minseok/Audio/AI_Challenge/features'
8 |
9 | # Workspace
10 | WORKSPACE='/home/minseok/Audio/AI_Challenge'
11 | cd $WORKSPACE
12 |
13 | ########### Hyper-parameters ###########
14 | FEATURE_TYPE='logmelgccintensity' # 'logmel' | 'logmelgcc' | 'logmelintensity' | 'logmelgccintensity'
15 | AUDIO_TYPE='mic' # 'mic' | 'foa' | 'foa&mic'
16 |
17 | ############ Extract Features ############
18 | # dev
19 | python utils/feature_extractor.py --dataset_dir=$DATASET_DIR --feature_dir=$FEATURE_DIR --feature_type=$FEATURE_TYPE --data_type='dev' --audio_type=$AUDIO_TYPE
20 |
21 | # eval
22 | python utils/feature_extractor.py --dataset_dir=$DATASET_DIR --feature_dir=$FEATURE_DIR --feature_type=$FEATURE_TYPE --data_type='eval' --audio_type=$AUDIO_TYPE
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/utils/loss.py:
--------------------------------------------------------------------------------
1 | from torch.nn import functional as F
2 | from torch import nn
3 | import torch
4 | import pdb
5 |
6 | def create_criterion(loss_name):
7 |
8 | if loss_name == 'CTCLoss':
9 | criterion = nn.CTCLoss(blank=2, zero_infinity=True)
10 | elif loss_name == 'CrossEntropy':
11 | criterion = nn.CrossEntropyLoss()
12 | elif loss_name == 'BCEWithLogits':
13 | criterion = nn.BCEWithLogitsLoss()
14 | elif loss_name == 'regression':
15 | criterion = mean_error
16 | elif loss_name == "BCE":
17 | criterion = nn.BCELoss()
18 | return criterion
19 |
20 |
21 | def mean_error(output, target, loss_type='MSE'):
22 |
23 | # Align the time_steps of output and target
24 | #pdb.set_trace()
25 | N = min(output.shape[1], target.shape[1])
26 |
27 | output = output[:, 0: N, :]
28 | target = target[:, 0: N, :]
29 |
30 | out = torch.sqrt(torch.sum((output - target)**2))
31 |
32 | return out
33 |
--------------------------------------------------------------------------------
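For reference, a minimal sketch of how `create_criterion` is used by the training loop in `trainer.py` (shapes follow `Audio_Collate`, which emits one-hot labels over 10 classes; the tensors here are random placeholders):

```
import torch
from utils.loss import create_criterion

criterion = create_criterion('BCE')            # nn.BCELoss, the choice made in config/config.yml
scores = torch.sigmoid(torch.randn(4, 10))     # BCELoss expects values in (0, 1)
labels = torch.zeros(4, 10)
labels[torch.arange(4), torch.randint(0, 10, (4,))] = 1.0
print(criterion(scores, labels).item())
```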
/README.md:
--------------------------------------------------------------------------------
1 | # Sound-Source-Localization
2 | A sound source localization study for the AI Grand Challenge 2021 (sponsored by the NCSOFT Vision Lab).
3 |
4 | ## Preparation
5 | ### 1. Create the environment.
6 | ```
7 | $ cd Sound-Source-Localization/
8 | $ conda create -y -n varco python=3.8
9 | $ conda activate varco
10 |
11 | #### select the install command for your CUDA version at https://pytorch.org/ ####
12 | $ conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c nvidia
13 |
14 | $ conda install -y pandas h5py scipy
15 | $ conda install -y pysoundfile librosa youtube-dl tqdm -c conda-forge
16 | $ pip install PyYAML
17 | $ pip install tensorboard
18 | ```
19 |
20 | ### 2. Place the CSV file that lists the training data at the path below.
21 | ```
22 | Sound-Source-Localization/
23 | └── dataset/
24 | └── dataset.csv
25 | ```
26 |
27 | ### 3. Run main.sh for training
28 | ```
29 | $ cd Sound-Source-Localization/
30 | $ sh main.sh
31 | ```
32 |
34 | ### 4. Run test.sh for testing
34 | ```
35 | $ cd Sound-Source-Localization/
36 | $ sh test.sh
37 | ```
38 |
39 | ## Acknowledgement
40 | 이 데이터는 2021년도 정부(과학기술정보통신부)의 재원으로 정보통신기획평가원의 지원을 받아 수행된 연구의 결과물임 (No.171125972, 인명 구조용 드론을 위한 영상/음성 인지 기술 개발)
41 |
42 | This work was supported by Institute of Information & communications Technology Planning & Evaluation (IITP) grant funded by the Korea government(MSIT) (No.171125972, Audio-Visual Perception for Autonomous Rescue Drones)
43 |
44 |
--------------------------------------------------------------------------------
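The README (step 2) does not spell out the layout of `dataset.csv`. Judging from `tr_val_split` in `utils/utils.py` and `Audio_Reader` in `datasets.py`, each row holds a single audio path whose parent directory encodes the azimuth class (`0degree` … `180degree`). A purely hypothetical example:

```
dataset/0degree/sample_0001.wav
dataset/40degree/sample_0002.wav
dataset/180degree/sample_0003.wav
```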
/config/config.yml:
--------------------------------------------------------------------------------
1 | use_tb_logger: true
2 |
3 | #### datasets
4 | datasets:
5 |
6 | csv: ./dataset/dataset.csv
7 | test: ./test_dataset/
8 |
9 | dataloader:
10 | train:
11 | batch_size: 32
12 | shuffle: true
13 | # pin_memory: true
14 | #num_workers: 20
15 | num_workers: 20
16 |
17 | valid:
18 | batch_size: 32
19 | shuffle: true
20 | # pin_memory: true
21 | num_workers: 20
22 |
23 | test:
24 | batch_size: 1
25 | shuffle: false
26 | # pin_memory: true
27 | num_workers: 0
28 |
29 | #### network structures
30 | MYNET:
31 | embed_size: 8
32 | sequence_size: 16 # Temporal duration of input clips
33 | encoder: resnet50
34 | n_classes: 2
35 | input_size: 224
36 | pretrained: true
37 | num_layers: 1
38 | bidirectional: false
39 |
40 | #### training settings: learning rate scheme, loss
41 | trainer:
42 | epochs: 10000
43 | device: 1
44 | save_path: '''PATH WHERE THE MODEL WILL BE SAVED'''
45 | #ckpt_path: '''PATH OF THE MODEL TO BE LOADED'''
46 | comment: no comment
47 |
48 | tester:
49 | ckpt_path: '''PATH OF THE MODEL TO BE LOADED'''
50 | device: 2
51 |
52 |
53 | criterion:
54 | #name: regression
55 | #name: BCEWithLogits
56 | name: BCE
57 |
58 | #### Optimizer settings
59 | # optimizer:
60 | # name: Adam ### Adam, RMSprop, SGD
61 | # lr: !!float 1e-3
62 | # weight_decay: 0
63 | # eps: !!float 1e-3
64 | optimizer:
65 | name: Adam ## Adam, RMSprop, SGD
66 | lr: !!float 0.0001
67 | # betas: (0.9, 0.999)
68 | eps: !!float 1e-5
69 | weight_decay: !!float 1e-3
70 |
71 |
72 | #### scheduler settings
73 | scheduler:
74 | name: plateau
75 | min_lr: !!float 1e-8
76 | patience: 2
77 | factor: 0.5
78 |
--------------------------------------------------------------------------------
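Before running `main.sh` or `test.sh`, the two placeholder paths above (`trainer.save_path`, `tester.ckpt_path`) must be replaced with real paths. A minimal sketch for checking this, assuming it is run from the repository root the same way `main.py` loads the file:

```
import yaml

with open('./config/config.yml') as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)

# main.py passes config['trainer'] as keyword arguments to ModelTrainer and reads
# config['tester']['ckpt_path'] for ModelTester, so both placeholders must be filled in.
for section, key in [('trainer', 'save_path'), ('tester', 'ckpt_path')]:
    value = cfg[section][key]
    assert 'PATH' not in value, "{}.{} still contains the placeholder: {}".format(section, key, value)
```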
/utils/setup.py:
--------------------------------------------------------------------------------
1 | from torch import optim
2 | from torch.optim import lr_scheduler
3 | import torch
4 |
5 | def setup_solver(parameters, config):
6 | if config['optimizer']['name'] == 'Adam':
7 | optimizer = optim.Adam(parameters, lr=config['optimizer']['lr'], weight_decay=config['optimizer']['weight_decay'],
8 | eps=config['optimizer']['eps'])
9 | # eps=1e-3 if args.fp16 else 1e-8)
10 | elif config['optimizer']['name'] == 'SGD':
11 | optimizer = optim.SGD(parameters, lr=config['optimizer']['lr'], weight_decay=config['optimizer']['weight_decay'],
12 | momentum=0.9, nesterov=True)
13 |
14 |
15 | if config['scheduler']['name'] == 'plateau':
16 | scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=config['scheduler']['patience'], threshold=0.01, cooldown=0,
17 | threshold_mode='abs', mode='max', factor=config['scheduler']['factor'])
18 |
19 |
20 | elif config['scheduler']['name'] == 'step':
21 | scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9, last_epoch=-1)
22 |
23 | elif config['scheduler']['name'] == 'exp':
24 | scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
25 |
26 |
27 |
28 | elif config['scheduler']['name'] == 'cycle':
29 | scheduler = torch.optim.lr_scheduler.OneCycleLR(
30 | optimizer,
31 | max_lr=0.001,
32 | total_steps=10000,
33 | pct_start=0.3,
34 | base_momentum=0.9*0.95,
35 | max_momentum=0.95,
36 | final_div_factor=1/0.0001,
37 | )
38 |
39 | # else: # args.scheduler == 'step':
40 | # scheduler = lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.gamma)
41 |
42 | return optimizer, scheduler
43 |
44 |
--------------------------------------------------------------------------------
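For reference, a minimal sketch of driving `setup_solver` with a config shaped like `config/config.yml`; the tiny linear model is a hypothetical stand-in for the real network:

```
import torch.nn as nn
from utils.setup import setup_solver

model = nn.Linear(16, 10)   # placeholder; main.py actually passes pretrained_Gated_CRNN8 parameters
config = {
    'optimizer': {'name': 'Adam', 'lr': 1e-4, 'eps': 1e-5, 'weight_decay': 1e-3},
    'scheduler': {'name': 'plateau', 'patience': 2, 'factor': 0.5},
}
optimizer, scheduler = setup_solver(model.parameters(), config)
scheduler.step(0.75)        # the plateau scheduler runs in mode='max', e.g. on validation accuracy
```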
/LICENSE.md:
--------------------------------------------------------------------------------
1 | (c) 2022 NCSOFT Corporation & Sogang University. All Rights Reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4 |
5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 |
7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 |
9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 |
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 |
13 | Any questions about our licensed work can be sent to opensource@ncsoft.com.
14 | ________________________________________
15 |
16 | This software uses Open Source Software (OSS). You can find the link for the source code of these open source projects, along with applicable license information, below.
17 |
18 | DCASE2019-TASK3
19 | https://github.com/yinkalario/DCASE2019-TASK3
20 | Copyright (c) 2019 Yin Cao
21 | MIT License
22 | See license text at https://github.com/yinkalario/DCASE2019-TASK3/blob/master/Licenses/MIT_LICENSE
23 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | import sys
4 | sys.path.append('./')
5 | import argparse
6 | import torch
7 | import torch.nn as nn
8 | import pdb
9 | import yaml
10 | import numpy as np
11 | from torch.utils.data import DataLoader
12 | import os
13 | import pickle
14 | from pathlib import Path
15 | from trainer import ModelTrainer, ModelTester
16 | from utils.setup import setup_solver
17 | from utils.loss import create_criterion
18 | from utils.utils import tr_val_split
19 | from datasets import Audio_Reader, Audio_Collate, Test_Reader
20 | from model import pretrained_Gated_CRNN8
21 |
22 | def train(config):
23 |
24 | #pdb.set_trace()
25 | '''Dataset Preparation'''
26 | train_list, val_list = tr_val_split(config['datasets']['csv'])
27 |
28 | '''Data loader'''
29 | train_dataset = Audio_Reader(train_list)
30 | train_loader = DataLoader(dataset=train_dataset, batch_size=config['dataloader']['train']['batch_size'], shuffle=True, collate_fn=lambda x: Audio_Collate(x), num_workers=config['dataloader']['train']['num_workers'])
31 | valid_dataset = Audio_Reader(val_list)
32 | valid_loader = DataLoader(dataset=valid_dataset, batch_size=config['dataloader']['valid']['batch_size'], shuffle=True, collate_fn=lambda x: Audio_Collate(x), num_workers=config['dataloader']['valid']['num_workers'])
33 |
34 | '''Model / Loss Criterion / Optimizer/ Scheduler'''
35 | SSL_model = pretrained_Gated_CRNN8(10)
36 | criterion = create_criterion(config['criterion']['name'])
37 | optimizer, scheduler = setup_solver(SSL_model.parameters(), config)
38 |
39 | '''Trainer'''
40 | trainer = ModelTrainer(SSL_model, train_loader, valid_loader, criterion, optimizer, scheduler, config, **config['trainer'])
41 | trainer.train()
42 |
43 | def test(config):
44 |
45 | test_dataset = Test_Reader(config['datasets']['test'])
46 | test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False, pin_memory = True, num_workers=0)
47 |
48 | SSL_model = pretrained_Gated_CRNN8(10)
49 |
50 | tester = ModelTester(SSL_model, test_loader, config['tester']['ckpt_path'], config['tester']['device'])
51 | tester.test()
52 |
53 | if __name__ == '__main__':
54 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
55 | parser = argparse.ArgumentParser()
56 | parser.add_argument('-b', '--base_dir', type=str, default='.', help='Root directory')
57 | parser.add_argument('-c', '--config', type=str, help='Path to option YAML file.')
58 | parser.add_argument('-d', '--dataset', type=str, help='Dataset')
59 | parser.add_argument('-m', '--mode', type=str, help='Train or Test')
60 | args = parser.parse_args()
61 |
62 | '''Load Config'''
63 | with open(os.path.join(args.config, args.dataset + '.yml'), mode='r') as f:
64 | config = yaml.load(f,Loader=yaml.FullLoader)
65 |
66 | if args.mode == 'Train':
67 | train(config)
68 | elif args.mode == 'Test':
69 | test(config)
70 |
--------------------------------------------------------------------------------
/utils/check_data_len.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | import torch
3 | from torch import nn
4 | import pdb
5 | import os
6 | import numpy as np
7 | import csv
8 | import random
9 | import librosa
10 |
11 | split_ratio = 0.9
12 | seed_num = 100
13 | random.seed(seed_num)
14 |
15 | correct_csv_file = '../correct.csv'
16 | with open(correct_csv_file, newline='') as f:
17 | reader = csv.reader(f)
18 | tmp = list(reader)
19 | correct_list = [x[-1].split('/')[-1] for x in tmp]
20 | f.close()
21 |
22 | data_path = '../dataset'
23 | csv_path = '../csv'
24 | csv_list = [f for f in os.listdir(csv_path) if f.split('.')[-1] == 'csv']
25 |
26 | total_list = []
27 | for fi in csv_list:
28 | csv_file = os.path.join(csv_path,fi)
29 | #with open(csv_file, newline='',encoding='cp949') as f:
30 | with open(csv_file, newline='') as f:
31 | reader = csv.reader(f)
32 | tmp = list(reader)
33 | data_name_list = tmp[0]
34 | total_list += tmp[1:]
35 | f.close()
36 |
37 | t_audio, v_audio = [], []
38 | t_text, v_text = [], []
39 | t_label, v_label = [], []
40 | class_dict = {'negative': [],'neutral': [],'positive': []}
41 | data_label_idx = data_name_list.index('mul_emotion')
42 | class_dict['negative'] = [(x,'negative') for x in total_list if x[data_label_idx] != 'happy' and x[data_label_idx] != 'neutral' and x[data_label_idx] != 'surprise' and x[0].split('/')[-1] in correct_list]
43 | class_dict['neutral'] = [(x,'neutral') for x in total_list if x[data_label_idx] == 'neutral' and x[0].split('/')[-1] in correct_list]
44 | class_dict['positive'] = [(x,'positive') for x in total_list if x[data_label_idx] == 'happy' and x[0].split('/')[-1] in correct_list]
45 |
46 |
47 | train_list,val_list = [],[]
48 | data_num = 7113
49 | for class_name in class_dict.keys():
50 | temp_list = class_dict[class_name]
51 | random.shuffle(temp_list)
52 | if len(temp_list) < data_num:
53 | split_idx = int(split_ratio*len(temp_list))
54 | train_list += temp_list[:split_idx]
55 | val_list += temp_list[split_idx:len(temp_list)]
56 | else:
57 | split_idx = int(split_ratio*data_num)
58 | train_list += temp_list[:split_idx]
59 | val_list += temp_list[split_idx:data_num]
60 |
61 |
62 | '''Train Data'''
63 | for data in train_list:
64 | t_label.append(data[1].replace("\ufeff",""))
65 | t_audio.append(os.path.join(data_path,data[0][0]))
66 | t_text.append(data[0][1])
67 |
68 | '''Valid Data'''
69 | for data in val_list:
70 | v_label.append(data[1].replace("\ufeff",""))
71 | v_audio.append(os.path.join(data_path,data[0][0]))
72 | v_text.append(data[0][1])
73 |
74 |
75 | count_dict = {}
76 | count_dict["over_twenty"] = 0
77 | count_dict["over_fifteen"] = 0
78 | count_dict["over_ten"] = 0
79 | count_dict["over_five"] = 0
80 | count_dict["below_five"] = 0
81 |
82 | #pdb.set_trace()
83 | for audio_path in t_audio:
84 | audio, _ = librosa.load(audio_path, sr=16000, dtype=np.float32)
85 | audio_len = len(audio) / 16000
86 | print("{}: {}".format(audio_path,audio_len))
87 | if audio_len > 20:
88 | count_dict["over_twenty"] +=1
89 | elif audio_len > 15:
90 | count_dict["over_fifteen"] +=1
91 | elif audio_len > 10:
92 | count_dict["over_ten"] +=1
93 | elif audio_len > 5:
94 | count_dict["over_five"] +=1
95 | else:
96 | count_dict["below_five"] +=1
97 |
98 | for audio_path in v_audio:
99 | audio, _ = librosa.load(audio_path, sr=16000, dtype=np.float32)
100 | audio_len = len(audio) / 16000
101 | print("{}: {}".format(audio_path,audio_len))
102 | if audio_len > 20:
103 | count_dict["over_twenty"] +=1
104 | elif audio_len > 15:
105 | count_dict["over_fifteen"] +=1
106 | elif audio_len > 10:
107 | count_dict["over_ten"] +=1
108 | elif audio_len > 5:
109 | count_dict["over_five"] +=1
110 | else:
111 | count_dict["below_five"] +=1
112 |
113 |
114 | for class_name in count_dict.keys():
115 | print('{}: {}'.format(class_name, count_dict[class_name]))
--------------------------------------------------------------------------------
/model_utilities.py:
--------------------------------------------------------------------------------
1 | '''
2 | MIT License
3 |
4 | Copyright (c) 2019 Yin Cao
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | '''
24 |
25 | import math
26 | import pdb
27 |
28 | import numpy as np
29 | import torch
30 | import torch.nn as nn
31 | import torch.nn.functional as F
32 |
33 |
34 | def interpolate(x, ratio):
35 | '''
36 | Interpolate the x to have equal time steps as targets
37 | Input:
38 | x: (batch_size, time_steps, class_num)
39 | Output:
40 | out: (batch_size, time_steps*ratio, class_num)
41 | '''
42 | (batch_size, time_steps, classes_num) = x.shape
43 | upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
44 | upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
45 |
46 | return upsampled
47 |
48 |
49 | def init_layer(layer, nonlinearity='leaky_relu'):
50 | '''
51 | Initialize a layer
52 | '''
53 | classname = layer.__class__.__name__
54 | if (classname.find('Conv') != -1) or (classname.find('Linear') != -1):
55 | nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity)
56 | if hasattr(layer, 'bias'):
57 | if layer.bias is not None:
58 | nn.init.constant_(layer.bias, 0.0)
59 | elif classname.find('BatchNorm') != -1:
60 | nn.init.normal_(layer.weight, 1.0, 0.02)
61 | nn.init.constant_(layer.bias, 0.0)
62 |
63 |
64 | def init_mask(layer, nonlinearity='leaky_relu'):
65 | '''
66 | Initialize a layer
67 | '''
68 | classname = layer.__class__.__name__
69 | if (classname.find('Conv') != -1) or (classname.find('Linear') != -1):
70 | nn.init.constant_(layer.weight, 1.0)
71 | if hasattr(layer, 'bias'):
72 | if layer.bias is not None:
73 | nn.init.constant_(layer.bias, 0.0)
74 | elif classname.find('BatchNorm') != -1:
75 | nn.init.normal_(layer.weight, 1.0, 0.02)
76 | nn.init.constant_(layer.bias, 0.0)
77 |
78 |
79 | def init_gru(rnn):
80 | """Initialize a GRU layer. """
81 |
82 | def _concat_init(tensor, init_funcs):
83 | (length, fan_out) = tensor.shape
84 | fan_in = length // len(init_funcs)
85 |
86 | for (i, init_func) in enumerate(init_funcs):
87 | init_func(tensor[i * fan_in : (i + 1) * fan_in, :])
88 |
89 | def _inner_uniform(tensor):
90 | fan_in = nn.init._calculate_correct_fan(tensor, 'fan_in')
91 | nn.init.uniform_(tensor, -math.sqrt(3 / fan_in), math.sqrt(3 / fan_in))
92 |
93 | for i in range(rnn.num_layers):
94 | _concat_init(
95 | getattr(rnn, 'weight_ih_l{}'.format(i)),
96 | [_inner_uniform, _inner_uniform, _inner_uniform]
97 | )
98 | torch.nn.init.constant_(getattr(rnn, 'bias_ih_l{}'.format(i)), 0)
99 |
100 | _concat_init(
101 | getattr(rnn, 'weight_hh_l{}'.format(i)),
102 | [_inner_uniform, _inner_uniform, nn.init.orthogonal_]
103 | )
104 | torch.nn.init.constant_(getattr(rnn, 'bias_hh_l{}'.format(i)), 0)
105 |
106 |
107 | class ConvBlock(nn.Module):
108 | def __init__(self, in_channels, out_channels,
109 | kernel_size=(3,3), stride=(1,1), padding=(1,1)):
110 |
111 | super().__init__()
112 |
113 | self.conv1 = nn.Conv2d(in_channels=in_channels,
114 | out_channels=out_channels,
115 | kernel_size=kernel_size, stride=stride,
116 | padding=padding, bias=False)
117 |
118 | self.conv2 = nn.Conv2d(in_channels=out_channels,
119 | out_channels=out_channels,
120 | kernel_size=kernel_size, stride=stride,
121 | padding=padding, bias=False)
122 |
123 | self.bn1 = nn.BatchNorm2d(out_channels)
124 | self.bn2 = nn.BatchNorm2d(out_channels)
125 |
126 | self.init_weights()
127 |
128 | def init_weights(self):
129 |
130 | init_layer(self.conv1)
131 | init_layer(self.conv2)
132 | init_layer(self.bn1)
133 | init_layer(self.bn2)
134 |
135 | def forward(self, x, pool_type='avg', pool_size=(2, 2)):
136 |
137 | x = F.relu_(self.bn1(self.conv1(x)))
138 | x = F.relu_(self.bn2(self.conv2(x)))
139 | if pool_type == 'avg':
140 | x = F.avg_pool2d(x, kernel_size=pool_size)
141 | elif pool_type == 'max':
142 | x = F.max_pool2d(x, kernel_size=pool_size)
143 | elif pool_type == 'frac':
144 | fractional_maxpool2d = nn.FractionalMaxPool2d(kernel_size=pool_size, output_ratio=1/np.sqrt(2))
145 | x = fractional_maxpool2d(x)
146 |
147 | return x
148 |
--------------------------------------------------------------------------------
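A small shape check (a sketch with arbitrary sizes) for `interpolate` and `ConvBlock` defined above:

```
import torch
from model_utilities import interpolate, ConvBlock

x = torch.randn(2, 5, 10)                    # (batch, time_steps, classes)
print(interpolate(x, ratio=4).shape)         # torch.Size([2, 20, 10]): each frame repeated 4x

block = ConvBlock(in_channels=3, out_channels=64)
feat = torch.randn(2, 3, 64, 257)            # (batch, channels, time, freq)
print(block(feat, pool_type='avg', pool_size=(2, 2)).shape)   # torch.Size([2, 64, 32, 128])
```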
/utils/utils.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | import torch
3 | from torch import nn
4 | from utils.resnet import resnet18, resnet50, resnet101
5 | import pdb
6 | import random
7 | import os
8 | import csv
9 | Encoder = namedtuple('Encoder', ('model', 'features', 'features_shape'))
10 |
11 |
12 |
13 | def tr_val_split(data_csv):
14 | split_ratio = 0.9
15 | seed_num = 100
16 | random.seed(seed_num)
17 |
18 | #pdb.set_trace()
19 | csv_path = data_csv
20 |
21 | total_list = []
22 | csv_file = os.path.join(csv_path)
23 | #with open(csv_file, newline='',encoding='cp949') as f:
24 | with open(csv_file, newline='') as f:
25 | reader = csv.reader(f)
26 | tmp = list(reader)
27 | total_list += tmp
28 | f.close()
29 |
30 | #pdb.set_trace()
31 | class_dict = {}
32 |
33 | for data in total_list:
34 | data = data[0]
35 | class_name = data.split('/')[-2]
36 |
37 | if class_name not in class_dict.keys():
38 | class_dict[class_name] = []
39 | class_dict[class_name].append(data)
40 |
41 | #pdb.set_trace()
42 |
43 | train_list,val_list = [],[]
44 |
45 | for class_name in class_dict.keys():
46 | temp_list = class_dict[class_name]
47 | random.shuffle(temp_list)
48 |
49 | split_idx = int(split_ratio*len(temp_list))
50 | train_list += temp_list[:split_idx]
51 | val_list += temp_list[split_idx:len(temp_list)]
52 |
53 | #pdb.set_trace()
54 |
55 | return train_list, val_list
56 |
57 |
58 |
59 | def make_encoder(name, input_size=224, input_channels=3, pretrained=True, pretrain_path=None):
60 | """Make encoder (backbone) with a given name and parameters"""
61 |
62 | features_size = input_size // 32
63 | num_features = 2048
64 | # if name.startswith('resnet'):
65 | if name == 'resnet50':
66 | model = resnet50(pretrained=True)
67 | features = nn.Sequential(*list(model.children())[:-2])
68 | # features[0] = nn.Conv2d(in_channels=4, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False)
69 | num_features = 512 if int(name[6:]) < 50 else 2048
70 |
71 | features_shape = (num_features, features_size, features_size)
72 | return Encoder(model, features, features_shape)
73 |
74 | elif name == 'resnet101':
75 | model = resnet101(pretrained=True)
76 | features = nn.Sequential(*list(model.children())[:-2])
77 | #num_features = 512 if int(name[6:]) < 50 else 2048
78 | #features_shape = (num_features, features_size, features_size)
79 | return features#Encoder(model, features, features_shape)
80 |
81 |
82 | elif name == 'resnet18':
83 | print('resnet18')
84 | model = resnet18(pretrained=True)
85 | features = nn.Sequential(*list(model.children())[:-3])
86 | # features_shape = (num_features, features_size, features_size)
87 | return features #Encoder(model, features, features_shape)
88 |
89 |
90 |
91 |
92 | elif name == 'resnet2p1d':
93 | model = resnet2p1d.generate_model(model_depth=18)
94 | # n_classes=opt.n_classes,
95 | # n_input_channels=opt.n_input_channels,
96 | # shortcut_type='B',
97 | # conv1_t_size=opt.conv1_t_size,
98 | # conv1_t_stride=opt.conv1_t_stride,
99 | # no_max_pool=opt.no_max_pool,
100 | # widen_factor=opt.resnet_widen_factor)
101 |
102 | # model_without_last = nn.Sequential(*list(model.children())[:-1]).state_dict()
103 | #model_dict = model.state_dict()
104 | #pretrained_dict = torch.load(pretrain_path, map_location='cpu')['state_dict']
105 | #pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'fc' not in k}
106 | #model_dict.update(pretrained_dict)
107 | #model.load_state_dict(model_dict)
108 | features = nn.Sequential(*list(model.children())[:-3])
109 | #features[0] = nn.Conv3d(4, 110, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
110 | return features
111 | # features = model
112 |
113 | else:
114 | raise KeyError("Unknown model name: {}".format(name))
115 |
116 |
117 |
118 | # elif name.startswith('mobilenetv2'):
119 | # model = mobilenetv2.MobileNetV2(input_size=input_size, pretrained=None)
120 | # features = model.features
121 | # num_features = 1280
122 | # elif name.startswith('rmnet'):
123 | # model = rmnet.RMNetClassifier(1000, pretrained=None)
124 | # features = nn.Sequential(*list(model.children())[:-2])
125 | # num_features = 512
126 |
127 | # elif name.startswith('se_res'):
128 | # model = load_from_pretrainedmodels(name)(pretrained='imagenet' if pretrained else None)
129 | # features = nn.Sequential(*list(model.children())[:-2])
130 |
131 |
132 |
133 |
134 | def load_from_pretrainedmodels(model_name):
135 | import pretrainedmodels
136 | return getattr(pretrainedmodels, model_name)
137 |
138 |
139 |
140 | def squash_dims(tensor, dims):
141 | """
142 | Squashes dimension, given in dims into one, which equals to product of given.
143 |
144 | Args:
145 | tensor (Tensor): input tensor
146 | dims: dimensions over which tensor should be squashed
147 |
148 | """
149 | assert len(dims) >= 2, "Expected two or more dims to be squashed"
150 |
151 | size = tensor.size()
152 |
153 | squashed_dim = size[dims[0]]
154 | for i in range(1, len(dims)):
155 | assert dims[i] == dims[i - 1] + 1, "Squashed dims should be consecutive"
156 | squashed_dim *= size[dims[i]]
157 |
158 | result_dims = size[:dims[0]] + (squashed_dim,) + size[dims[-1] + 1:]
159 | return tensor.contiguous().view(*result_dims)
160 |
161 |
162 | def unsquash_dim(tensor, dim, res_dim):
163 | """
164 | Unsquashes dimension, given in dim into separate dimensions given is res_dim
165 | Args:
166 | tensor (Tensor): input tensor
167 | dim (int): dimension that should be unsquashed
168 | res_dim (tuple): list of dimensions, that given dim should be unfolded to
169 |
170 | """
171 | size = tensor.size()
172 | result_dim = size[:dim] + res_dim + size[dim + 1:]
173 | return tensor.view(*result_dim)
174 |
--------------------------------------------------------------------------------
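A quick illustration (sketch) of the reshaping helpers `squash_dims` and `unsquash_dim` defined above:

```
import torch
from utils.utils import squash_dims, unsquash_dim

x = torch.randn(4, 8, 3, 224, 224)           # e.g. (batch, time, channels, H, W)
flat = squash_dims(x, dims=(0, 1))           # merge batch and time -> (32, 3, 224, 224)
restored = unsquash_dim(flat, 0, (4, 8))     # split them back  -> (4, 8, 3, 224, 224)
print(flat.shape, restored.shape)
```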
/utils/vad.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pdb
3 | import librosa
4 | import os
5 | import soundfile
6 | import random
7 | import pickle
8 | import torch
9 | import torchaudio
10 | import math
11 |
12 | wav_path = "../pickle/1_enhanced/"
13 | out_path = "../pickle/2_vad/"
14 | os.makedirs(out_path,exist_ok=True)
15 |
16 | wav_list = [f for f in os.listdir(wav_path)]
17 | wav_list = sorted(wav_list)
18 | frame_size = 512
19 | hop_size = 128
20 | RISING_TERM = 30 # 15*4
21 | LEAST_TERM = 80 #25*4
22 | power_list = []
23 | save_wav_count=0
24 | eps = 1e-5
25 |
26 | def stft(_wav_path):
27 | window=torch.hann_window(window_length=512, periodic=True, dtype=None, layout=torch.strided, device=None, requires_grad=False)
28 | data_wav,_ = librosa.load(_wav_path,sr=16000,mono=False)
29 | #pdb.set_trace()
30 | data_wav = torch.from_numpy(data_wav)
31 | spec_noi1 = torchaudio.functional.spectrogram(waveform=data_wav, pad=0, window=window, n_fft=512, hop_length=128, win_length=512, power=None, normalized=False)
32 | input_wav_real1 =spec_noi1[:,:,:,0]
33 | input_wav_imag1 = spec_noi1[:,:,:,1]
34 | phase = torch.atan(input_wav_imag1/(input_wav_real1+1e-8))
35 | input_wav_magnitude = torch.sqrt(input_wav_real1**2 + input_wav_imag1**2)
36 | return input_wav_magnitude, phase
37 |
38 | #def spec2audio_tensor(power_mag,phase,window,length,nfft):
39 | # window = window
40 | # length = length
41 | # mag = power_mag #[1 F T]
42 | # phase = phase #[1 F T]
43 | # sqrt_mag = torch.sqrt(mag)
44 | # cos_phase = torch.cos(phase)
45 | # sin_phase = torch.sin(phase)
46 | # real = sqrt_mag * cos_phase
47 | # imagine = sqrt_mag * sin_phase
48 | # real = real.unsqueeze(3)
49 | # imagine = imagine.unsqueeze(3)
50 | # complex_ri = torch.cat((real,imagine),3)
51 | # audio = torch.istft(input = complex_ri, n_fft=int(nfft), hop_length=int(0.25*nfft), win_length=int(nfft), window=window, center=True, normalized=False, onesided=True, length=length)
52 | # return audio
53 |
54 | #pdb.set_trace()
55 | for pkl in wav_list:
56 | time_list=[]
57 | left_or_right_list=[]
58 |
59 | with open(os.path.join(wav_path,pkl),'rb') as f:
60 | data = pickle.load(f)
61 | wav = data["output_path"]
62 | #wav = data["audio_path"]
63 |
64 | power_list=[]
65 | active_cnt = 0
66 | tmp_dummy_frame = 0
67 | dummy_frame = 0
68 | inactive_cnt = 0
69 | state= 0
70 | i=0
71 | num=0
72 | #pdb.set_trace()
73 | (total_audio,fs) = soundfile.read(wav)
74 | if len(total_audio.shape) == 1:
75 | pdb.set_trace()
76 | if fs != 16000:
77 | pdb.set_trace()
78 | tmp0 = librosa.resample(total_audio[:,0],fs,16000)
79 | tmp1 = librosa.resample(total_audio[:,1],fs,16000)
80 | total_audio = np.stack((tmp0,tmp1),axis=1)
81 | fs = 16000
82 | else:
83 | tmp0 = total_audio[:,0]
84 | tmp1 = total_audio[:,1]
85 |
86 |
87 | frame_idx_list = range(0,len(total_audio)-hop_size+1,hop_size)
88 | input_wav_mag,phase = stft(wav)
89 |
90 | mean_power = abs(input_wav_mag[:,20:,:]).mean()
91 | thre = mean_power / 10
92 |
93 |
94 | for frame_idx in frame_idx_list:
95 | num+=1
96 | if abs(input_wav_mag[:,20:,frame_idx//hop_size]).mean() > thre:
97 | if state == 0:
98 | active_cnt = 1
99 | tmp_dummy_frame = 1
100 | rising_idx = frame_idx
101 | state =1
102 |
103 | elif state == 1:
104 | active_cnt+=1
105 | tmp_dummy_frame+=1
106 | if active_cnt == RISING_TERM:
107 | state=2
108 |
109 | elif state == 2:
110 | active_cnt+=1
111 |
112 | elif state == 3:
113 | inactive_cnt=0
114 | active_cnt+=1
115 | state = 2
116 |
117 | elif state == 4:
118 | active_cnt =1
119 | tmp_dummy_frame = 1
120 | rising_idx = frame_idx
121 | state = 1
122 |
123 | else:
124 |
125 | if state == 0:
126 | dummy_frame+=1
127 | state = 0
128 |
129 | elif state == 1:
130 | active_cnt = 0
131 | dummy_frame+=tmp_dummy_frame
132 | tmp_dummy_frame = 0
133 | state=0
134 |
135 | elif state == 2:
136 | inactive_cnt =1
137 | active_cnt+=1
138 | state = 3
139 |
140 | elif state == 3:
141 | inactive_cnt+=1
142 | active_cnt+=1
143 | if inactive_cnt == LEAST_TERM:
144 | state = 4
145 |
146 | elif state == 4:
147 | dummy_frame = 1
148 | state = 0
149 |
150 | # save VAD chunk here in wav
151 | if state == 4 or (num == len(frame_idx_list) and active_cnt > RISING_TERM):
152 | falling_idx = frame_idx
153 | if rising_idx-hop_size < 0:
154 | rising_idx = 128
155 | rising_idx = (rising_idx-hop_size)
156 | if state == 4:
157 | falling_idx = (falling_idx-(LEAST_TERM-2)*hop_size)
158 | else:
159 | falling_idx = (falling_idx-(inactive_cnt-2)*hop_size)
160 | tmp0_power = np.sum(np.abs(tmp0[rising_idx:falling_idx]))
161 | tmp1_power = np.sum(np.abs(tmp1[rising_idx:falling_idx]))
162 | if tmp0_power > tmp1_power:
163 | left_or_right_list.append(0)
164 | else:
165 | left_or_right_list.append(1)
166 |
167 | rising_idx = rising_idx/fs
168 | falling_idx = falling_idx/fs
169 | time_list.append([rising_idx,falling_idx])
170 | save_wav_count +=1
171 | #save chunk for another channel
172 | i+=1
173 | state = 4
174 | active_cnt = 0
175 | inactive_cnt = 0
176 | tmp_dummy_frame = 0
177 | dummy_frame = 0
178 |
179 | #pdb.set_trace()
180 | data["time"] = time_list
181 | data["LR"] = left_or_right_list
182 | wav_total_mean_power = np.mean(np.abs((tmp0+tmp1)/2))
183 | if time_list == [] or wav_total_mean_power < eps:
184 | pdb.set_trace()
185 | wav = data["input_path"]
186 | (total_audio,fs) = soundfile.read(wav)
187 |
188 | if fs != 16000:
189 | tmp0 = librosa.resample(total_audio[:,0],fs,16000)
190 | tmp1 = librosa.resample(total_audio[:,1],fs,16000)
191 | total_audio = np.stack((tmp0,tmp1),axis=1)
192 | fs = 16000
193 | else:
194 | tmp0 = total_audio[:,0]
195 | tmp1 = total_audio[:,1]
196 |
197 | tmp0_power = np.sum(np.abs(tmp0))
198 | tmp1_power = np.sum(np.abs(tmp1))
199 | if tmp0_power > tmp1_power:
200 | left_or_right_list.append(0)
201 | else:
202 | left_or_right_list.append(1)
203 |
204 | data["LR"] = left_or_right_list
205 |
206 | with open(os.path.join(out_path,pkl),"wb") as fw:
207 | pickle.dump(data,fw)
208 | print("pickle dumped!!: {}".format(pkl))
--------------------------------------------------------------------------------
/datasets.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | sys.path.append('..')
4 | from torch.utils.data.dataset import Dataset
5 | from pathlib import Path
6 | import pickle
7 | import pdb
8 | import torch
9 | import numpy as np
10 | import argparse
11 | import os
12 | import sys
13 | # import h5py
14 | import librosa
15 | import numpy as np
16 | # import pandas as pd
17 | import scipy.io as sio
18 | from scipy import signal
19 | from tqdm import tqdm
20 | import warnings
21 | warnings.filterwarnings("ignore")
22 |
23 | class Audio_Reader(Dataset):
24 | def __init__(self, datalist):
25 | super(Audio_Reader, self).__init__()
26 | self.datalist = datalist
27 | self.classlist = ['0','20','40','60','80','100','120','140','160','180']
28 | self.nfft = 512
29 | self.hopsize = self.nfft // 4
30 | self.window = 'hann'
31 |
32 | def __len__(self):
33 | return len(self.datalist)
34 |
35 | def LogMelGccExtractor(self, sig):
36 | def logmel(sig):
37 |
38 | #pdb.set_trace()
39 | S = np.abs(librosa.stft(y=sig,
40 | n_fft=self.nfft,
41 | hop_length=self.hopsize,
42 | center=True,
43 | window=self.window,
44 | pad_mode='reflect'))**2
45 |
46 | # S_mel = np.dot(self.melW, S).T
47 | S = librosa.power_to_db(S**2, ref=1.0, amin=1e-10, top_db=None)
48 | S = np.expand_dims(S, axis=0)
49 |
50 | return S
51 |
52 | def gcc_phat(sig, refsig):
53 |
54 | #pdb.set_trace()
55 | Px = librosa.stft(y=sig,
56 | n_fft=self.nfft,
57 | hop_length=self.hopsize,
58 | center=True,
59 | window=self.window,
60 | pad_mode='reflect')
61 |
62 | Px_ref = librosa.stft(y=refsig,
63 | n_fft=self.nfft,
64 | hop_length=self.hopsize,
65 | center=True,
66 | window=self.window,
67 | pad_mode='reflect')
68 |
69 | R = Px*np.conj(Px_ref)
70 | return R
71 |
72 | def transform(audio):
73 |
74 | channel_num = audio.shape[0]
75 | feature_logmel = []
76 | feature_gcc_phat = []
77 | for n in range(channel_num):
78 | feature_logmel.append(logmel(audio[n]))
79 | for m in range(n+1, channel_num):
80 | feature_gcc_phat.append(
81 | gcc_phat(sig=audio[m], refsig=audio[n]))
82 |
83 | #pdb.set_trace()
84 | feature_logmel = np.concatenate(feature_logmel, axis=0)
85 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0)
86 | feature = np.concatenate([feature_logmel, np.expand_dims(feature_gcc_phat, axis=0)])
87 |
88 | return feature
89 |
90 | return transform(sig)
91 |
92 | def __getitem__(self, idx):
93 |
94 | audio_path = self.datalist[idx]
95 | class_name = audio_path.split('/')[-2].strip('degree')
96 | class_num = self.classlist.index(class_name)
97 | audio, _ = librosa.load(audio_path, sr=16000, mono=False, dtype=np.float32)
98 | if audio.shape[1] >80000:
99 | audio = audio[:,:80000]
100 |
101 | feature = self.LogMelGccExtractor(audio)
102 | #pdb.set_trace()
103 | return torch.FloatTensor(feature).transpose(1,2), np.array([class_num])
104 |
105 |
106 | def Audio_Collate(batch):
107 |
108 | #pdb.set_trace()
109 | data, class_num = list(zip(*batch))
110 | data_len = torch.LongTensor(np.array([x.size(1) for x in data if x.size(1)!=1]))
111 | #if len(data_len) == 0:
112 | # return -1
113 |
114 | max_len = max(data_len)
115 | wrong_indices = []
116 |
117 | #for i, a_ in enumerate(class_num):
118 | # if a_[0] == -1:
119 | # wrong_indices.append(i)
120 |
121 | B = len(data)
122 | #pdb.set_trace()
123 | #inputs = torch.zeros(B-len(wrong_indices), 1, max_len, 10)
124 | #labels = torch.zeros(B-len(wrong_indices), 2)
125 | inputs = torch.zeros(B, 3, max_len, 257)
126 | labels = torch.zeros(B, 10)
127 | j = 0
128 | #pdb.set_trace()
129 | '''zero pad'''
130 | for i in range(B):
131 | #if i in wrong_indices:
132 | # continue
133 |
134 | inputs[j, : , :data[i].size(1),:] = data[i]
135 | labels[j, class_num[i]] = 1.0
136 | j += 1
137 |
138 | #pdb.set_trace()
139 | #data = (inputs, labels, data_len)
140 | data = (inputs, labels)
141 | return data
142 |
143 |
144 | class Test_Reader(Dataset):
145 | def __init__(self, datalist):
146 | super(Test_Reader, self).__init__()
147 | self.datalist = datalist
148 | self.classlist = ['0','20','40','60','80','100','120','140','160','180']
149 | self.nfft = 512
150 | self.hopsize = self.nfft // 4
151 | self.window = 'hann'
152 |
153 | def __len__(self):
154 | return len(self.datalist)
155 |
156 | def LogMelGccExtractor(self, sig):
157 | def logmel(sig):
158 |
159 | #pdb.set_trace()
160 | S = np.abs(librosa.stft(y=sig,
161 | n_fft=self.nfft,
162 | hop_length=self.hopsize,
163 | center=True,
164 | window=self.window,
165 | pad_mode='reflect'))**2
166 |
167 | # S_mel = np.dot(self.melW, S).T
168 | S = librosa.power_to_db(S**2, ref=1.0, amin=1e-10, top_db=None)
169 | S = np.expand_dims(S, axis=0)
170 |
171 | return S
172 |
173 | def gcc_phat(sig, refsig):
174 |
175 | #pdb.set_trace()
176 | Px = librosa.stft(y=sig,
177 | n_fft=self.nfft,
178 | hop_length=self.hopsize,
179 | center=True,
180 | window=self.window,
181 | pad_mode='reflect')
182 |
183 | Px_ref = librosa.stft(y=refsig,
184 | n_fft=self.nfft,
185 | hop_length=self.hopsize,
186 | center=True,
187 | window=self.window,
188 | pad_mode='reflect')
189 |
190 | R = Px*np.conj(Px_ref)
191 | return R
192 |
193 | def transform(audio):
194 |
195 | channel_num = audio.shape[0]
196 | feature_logmel = []
197 | feature_gcc_phat = []
198 | for n in range(channel_num):
199 | feature_logmel.append(logmel(audio[n]))
200 | for m in range(n+1, channel_num):
201 | feature_gcc_phat.append(
202 | gcc_phat(sig=audio[m], refsig=audio[n]))
203 |
204 | #pdb.set_trace()
205 | feature_logmel = np.concatenate(feature_logmel, axis=0)
206 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0)
207 | feature = np.concatenate([feature_logmel, np.expand_dims(feature_gcc_phat, axis=0)])
208 |
209 | return feature
210 |
211 | return transform(sig)
212 |
213 | def __getitem__(self, idx):
214 |
215 | audio_path = self.datalist[idx]
216 | class_name = audio_path.split('/')[-2].strip('degree')
217 | class_num = self.classlist.index(class_name)
218 | audio, _ = librosa.load(audio_path, sr=16000, mono=False, dtype=np.float32)
219 | if audio.shape[1] >80000:
220 | audio = audio[:,:80000]
221 |
222 | feature = self.LogMelGccExtractor(audio)
223 | #pdb.set_trace()
224 | return torch.FloatTensor(feature).transpose(1,2), np.array([class_num])
225 |
--------------------------------------------------------------------------------
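For orientation, a sketch of the feature shape that `Audio_Collate` expects: for a two-channel clip, `LogMelGccExtractor` stacks two log-power spectrograms plus one cross-spectrum, i.e. 3 channels over 257 frequency bins (nfft = 512), which matches the `torch.zeros(B, 3, max_len, 257)` buffer in the collate function. The audio below is random placeholder data:

```
import numpy as np
from datasets import Audio_Reader

reader = Audio_Reader(datalist=[])                       # the extractor itself needs no files
audio = np.random.randn(2, 16000).astype(np.float32)     # 1 s of hypothetical 2-channel audio
feature = reader.LogMelGccExtractor(audio)
print(feature.shape)   # (3, 257, 126): 257 = 512 // 2 + 1 bins, 126 frames at hop 128
```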
/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import numpy as np
5 | import datetime
6 |
7 | import pickle as pkl
8 |
9 | from pathlib import Path
10 | import torch
11 | import pdb
12 | from tqdm import tqdm
13 | from datetime import datetime
14 |
15 | import torch
16 | from torch.utils.tensorboard import SummaryWriter
17 | from torch.optim.lr_scheduler import ReduceLROnPlateau
18 | from torch.distributions.multivariate_normal import MultivariateNormal
19 | import torch.nn.functional as F
20 |
21 | import logging
22 | import json
23 | from multiprocessing import Pool
24 | import time
25 | import warnings
26 | warnings.filterwarnings("ignore")
27 |
28 | class ModelTrainer:
29 |
30 | def __init__(self, model, train_loader, valid_loader, criterion, optimizer, scheduler, config, epochs, device, save_path, ckpt_path=None, comment=None, fold=2):
31 |
32 | self.device = torch.device('cuda:{}'.format(device))
33 | #self.model = model.to(self.device)
34 | self.model = model.cuda()
35 |
36 | self.train_loader = train_loader
37 | self.valid_loader = valid_loader
38 | self.criterion = criterion
39 | self.optimizer = optimizer
40 | self.scheduler = scheduler
41 |
42 | self.exp_path = Path(os.path.join(save_path, datetime.now().strftime('%d%B_%0l%0M'))) #21November_0430
43 | self.exp_path.mkdir(exist_ok=True, parents=True)
44 |
45 | # Set logger
46 | self.logger = logging.getLogger('')
47 | self.logger.setLevel(logging.INFO)
48 | fh = logging.FileHandler(os.path.join(self.exp_path, 'training.log'))
49 | sh = logging.StreamHandler(sys.stdout)
50 | self.logger.addHandler(fh)
51 | self.logger.addHandler(sh)
52 |
53 | #Dump hyper-parameters
54 | with open(str(self.exp_path.joinpath('config.json')), 'w') as f:
55 | json.dump(config, f, indent=2)
56 |
57 | if comment != None:
58 | self.logger.info(comment)
59 |
60 | self.writter = SummaryWriter(self.exp_path.joinpath('logs'))
61 | self.epochs = epochs
62 | self.best_acc = 0.0
63 | self.best_epoch = 0
64 |
65 | if ckpt_path != None:
66 | self.load_checkpoint(ckpt_path)
67 | self.optimizer.param_groups[0]['lr'] = 0.0001
68 |
69 | def train(self):
70 | for epoch in tqdm(range(self.epochs)):
71 | start = time.time()
72 | train_loss, t_accuracy= self.train_single_epoch(epoch)
73 | valid_loss, v_accuracy = self.inference()
74 | duration = time.time() - start
75 |
76 | if v_accuracy > self.best_acc:
77 | self.best_acc = v_accuracy
78 | self.best_epoch = epoch
79 |
80 | self.scheduler.step(v_accuracy)
81 | self.logger.info("epoch: {} --- t_loss : {:0.3f}, train_acc = {}%, v_loss: {:0.3f}, val_acc: {}%, best_acc: {}%, best_epoch: {}, time: {:0.2f}s, lr: {}"\
82 | .format(epoch, train_loss, t_accuracy, valid_loss, v_accuracy, self.best_acc, self.best_epoch, duration,self.optimizer.param_groups[0]['lr']))
83 |
84 | self.save_checkpoint(epoch, v_accuracy)
85 |
86 | self.writter.add_scalar('data/Train_Loss', train_loss, epoch)
87 | self.writter.add_scalar('data/Valid_Loss', valid_loss, epoch)
88 | self.writter.add_scalar('data/Train_Accuracy', t_accuracy, epoch)
89 | self.writter.add_scalar('data/Valid_Accuracy', v_accuracy, epoch)
90 |
91 | self.writter.close()
92 |
93 |
94 | def train_single_epoch(self, epoch):
95 | self.model.train()
96 |
97 | total_loss = 0.0
98 | accuracy = 0.0
99 | correct_cnt = 0
100 | tot_cnt = 0
101 | batch_size = len(self.train_loader)
102 |
103 | for b, batch in (enumerate(self.train_loader)):
104 |
105 | inputs, labels = batch
106 | B, C, T, Freq = inputs.size()
107 | inputs = inputs.cuda()
108 | labels = labels.cuda()
109 |
110 | self.optimizer.zero_grad()
111 | outputs = self.model(inputs)
112 | scores = outputs.mean(1)
113 | best_prediction = scores.max(-1)[1]
114 |
115 | for i in range(B):
116 | if labels[i, best_prediction[i]] == 1.0:
117 | correct_cnt += 1
118 |
119 | batch_loss = self.criterion(scores, labels)
120 | batch_loss.backward()
121 | total_loss += batch_loss.item()
122 | self.optimizer.step()
123 | tot_cnt += B
124 |
125 | print("{}/{}: {}/{}".format(b, batch_size, correct_cnt, tot_cnt), end='\r')
126 |
127 | mean_loss = total_loss / tot_cnt
128 | return mean_loss, (correct_cnt/tot_cnt)*100
129 |
130 |
131 | def inference(self):
132 | self.model.eval()
133 |
134 | total_loss = 0.0
135 | accuracy = 0.0
136 | correct_cnt = 0
137 | tot_cnt = 0
138 | batch_size = len(self.valid_loader)
139 | with torch.no_grad():
140 | for b, batch in enumerate(self.valid_loader):
141 |
142 | inputs, labels = batch
143 | B, C, T, Freq = inputs.size()
144 | inputs = inputs.cuda()
145 | labels = labels.cuda()
146 | outputs = self.model(inputs)
147 |
148 | scores = outputs.mean(1)
149 | best_prediction = scores.max(-1)[1]
150 |
151 | for i in range(B):
152 | if labels[i, best_prediction[i]] == 1.0:
153 | correct_cnt += 1
154 |
155 | batch_loss = self.criterion(scores, labels)
156 | total_loss += batch_loss.item()
157 | tot_cnt += B
158 |
159 | print("{}/{}: {}/{}".format(b, batch_size, correct_cnt, tot_cnt), end='\r')
160 |
161 | mean_loss = total_loss / tot_cnt
162 | return mean_loss, (correct_cnt/tot_cnt)*100
163 |
164 |
165 | def load_checkpoint(self, ckpt):
166 | self.logger.info("Loading checkpoint from {ckpt}")
167 | print('Loading checkpoint : {}'.format(ckpt))
168 | checkpoint = torch.load(ckpt)
169 | self.model.load_state_dict(checkpoint['model_state_dict'], strict=False)
170 | self.optimizer.load_state_dict(checkpoint['optimizer'])#, strict=False)
171 |
172 |
173 | def save_checkpoint(self, epoch, vacc, best=True):
174 |
175 | state_dict = {
176 | 'epoch': epoch,
177 | 'model_state_dict': self.model.state_dict(),
178 | 'optimizer': self.optimizer.state_dict()
179 | }
180 |
181 | self.exp_path.joinpath('ckpt').mkdir(exist_ok=True, parents=True)
182 | save_path = "{}/ckpt/{}_{:0.4f}.pt".format(self.exp_path, epoch, vacc)
183 | torch.save(state_dict, save_path)
184 |
185 |
186 | class ModelTester:
187 | def __init__(self, model, test_loader, ckpt_path, device):
188 |
189 | # Essential parts
190 | self.device = torch.device('cuda:{}'.format(device))
191 | #self.model = model.to(self.device)
192 | self.model = model.cuda()
193 | self.test_loader = test_loader
194 | # Set logger
195 | self.logger = logging.getLogger('')
196 | self.logger.setLevel(logging.INFO)
197 | sh = logging.StreamHandler(sys.stdout)
198 | self.logger.addHandler(sh)
199 |
200 | self.load_checkpoint(ckpt_path)
201 |
202 |
203 | def load_checkpoint(self, ckpt):
204 | self.logger.info(f"Loading checkpoint from {ckpt}")
205 | # print('Loading checkpoint : {}'.format(ckpt))
206 | checkpoint = torch.load(ckpt)
207 | self.model.load_state_dict(checkpoint['model_state_dict'], strict=False)
208 |
209 |
210 |
211 | def test(self):
212 | """
213 | images : [B x T x C x H x W]
214 | labels : [B x T]
215 | """
216 | self.model.eval()
217 | result = ['FA','MA']
218 | batch_size = len(self.test_loader)
219 | final = open('/home/ygchoi/gender_detection/result.csv', 'w')
220 | final.write('filename'+'\t'+'prediction'+'\n')
221 |
222 | with torch.no_grad():
223 | for b, batch in tqdm(enumerate(self.test_loader), total=len(self.test_loader)):
224 |
225 | inputs, audio_path = batch
226 | inputs = torch.unsqueeze(inputs,1)
227 | B, C, T, Freq = inputs.size()
228 | inputs = inputs.cuda()
229 | outputs = self.model(inputs)
230 | best_prediction = outputs.max(2)[1].mode()[0]
231 | final.write(audio_path[0]+'\t'+result[best_prediction.item()]+'\n')
232 |
233 | final.close()
234 |
--------------------------------------------------------------------------------
/utils/utilities.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import datetime
3 | import itertools
4 | import logging
5 | import os
6 | import sys
7 |
8 | import librosa
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | import pandas as pd
12 | import torch
13 | from librosa.display import specshow
14 | from torch.backends import cudnn
15 | from tqdm import tqdm
16 |
17 | event_labels = ['knock', 'drawer', 'clearthroat', 'phone', 'keysDrop',\
18 | 'speech', 'keyboard', 'pageturn', 'cough', 'doorslam', 'laughter']
19 | lb_to_ix = {lb: i for i, lb in enumerate(event_labels)}
20 | ix_to_lb = {i: lb for i, lb in enumerate(event_labels)}
21 |
22 | azimuths = range(-180, 171, 10)
23 | elevations = range(-40, 41, 10)
24 | doa = [azimuths, elevations]
25 | doa_labels = list(itertools.product(*doa))
26 | doa_to_ix = {doa: i for i, doa in enumerate(doa_labels)}
27 | ix_to_doa = {i: doa for i, doa in enumerate(doa_labels)}
28 |
29 | train_splits_dict = {1: [2,3,4], 2: [1,3,4], 3: [1,2,4], 4: [1,2,3], -1: [1,2,3,4]}
30 | valid_split_dict = {1: [1], 2: [2], 3: [3], 4: [4], -1: []}
31 | test_split_dict = {1: [1], 2: [2], 3: [3], 4: [4], -1: []}
32 |
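The two ranges above enumerate 36 azimuths (-180 to 170 degrees in 10-degree steps) and 9 elevations (-40 to 40 degrees), so the Cartesian product yields 36 * 9 = 324 direction-of-arrival classes. A quick check of the mapping, using only the definitions above:

    print(len(doa_labels))           # 324
    print(doa_to_ix[(-180, -40)])    # 0, the first (azimuth, elevation) pair
    print(ix_to_doa[323])            # (170, 40), the last pair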
33 |
34 | def get_doas(indexes):
35 | '''
36 | Get multiple doas from indexes
37 | '''
38 | doas = []
39 | for idx in indexes:
40 | doas.append(ix_to_doa[idx])
41 | return doas
42 |
43 |
44 | def calculate_scalar(features):
45 |
46 | mean = []
47 | std = []
48 |
49 | channels = features.shape[0]
50 | for channel in range(channels):
51 | feat = features[channel, :, :]
52 | mean.append(np.mean(feat, axis=0))
53 | std.append(np.std(feat, axis=0))
54 |
55 | mean = np.array(mean)
56 | std = np.array(std)
57 | mean = np.expand_dims(mean, axis=0)
58 | std = np.expand_dims(std, axis=0)
59 | mean = np.expand_dims(mean, axis=2)
60 | std = np.expand_dims(std, axis=2)
61 |
62 | return mean, std
63 |
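calculate_scalar averages over the time axis per channel and per frequency bin, then inserts singleton axes so the result broadcasts against batched (channels, time, mel_bins) features. A small shape check using the function above (the sizes are illustrative):

    features = np.random.randn(7, 1000, 128).astype(np.float32)  # (channels, time, mel_bins)
    mean, std = calculate_scalar(features)
    print(mean.shape, std.shape)  # (1, 7, 1, 128) each, ready for (batch, 7, time, 128) inputs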
64 |
65 | def one_hot_encode(target, length):
66 | """Convert batches of class indices to classes of one-hot vectors."""
67 | target = np.array(target)
68 | if len(target.shape) == 0:
69 | one_hot_vec = np.zeros((1, length))
70 | one_hot_vec[0, target] = 1.0
71 | else:
72 | batch_s = target.shape[0]
73 | one_hot_vec = np.zeros((batch_s, length))
74 | for i in range(batch_s):
75 | one_hot_vec[i, target[i].astype(int)] = 1.0
76 |
77 | return one_hot_vec
78 |
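one_hot_encode accepts either a single class index or a batch of indices, for example:

    print(one_hot_encode(2, 4))       # [[0. 0. 1. 0.]]
    print(one_hot_encode([0, 3], 4))  # two rows, one-hot at columns 0 and 3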
79 |
80 | def get_filename(path):
81 | path = os.path.realpath(path)
82 | na_ext = path.split('/')[-1]
83 | na = os.path.splitext(na_ext)[0]
84 | return na
85 |
86 |
87 | class TqdmLoggingHandler(logging.Handler):
88 | def __init__(self, level=logging.NOTSET):
89 | super().__init__(level)
90 |
91 | def emit(self, record):
92 | try:
93 | msg = self.format(record)
94 | tqdm.write(msg)
95 | self.flush()
96 | except:
97 | self.handleError(record)
98 |
99 |
100 | def create_logging(log_dir, filemode):
101 |
102 | os.makedirs(log_dir, exist_ok=True)
103 |
104 | i1 = 0
105 |
106 | while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))):
107 | i1 += 1
108 |
109 | log_path = os.path.join(log_dir, '{:04d}.log'.format(i1))
110 | logging.basicConfig(
111 | level=logging.DEBUG,
112 | format='%(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
113 | # format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
114 | datefmt='%a, %d %b %Y %H:%M:%S',
115 | filename=log_path,
116 | filemode=filemode)
117 |
118 | # Print to console
119 | console = logging.StreamHandler()
120 | console.setLevel(logging.INFO)
121 | formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
122 | console.setFormatter(formatter)
123 | # logging.getLogger('').addHandler(console)
124 | logging.getLogger('').addHandler(TqdmLoggingHandler())
125 |
126 | logging.info(datetime.datetime.now())
127 | logging.info('\n')
128 |
129 | return logging
130 |
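create_logging picks the first unused NNNN.log name in log_dir, logs everything at DEBUG level to that file, and mirrors messages to the console through TqdmLoggingHandler so tqdm progress bars are not broken by log output. A minimal usage sketch (the './logs' path is only an example):

    logging = create_logging('./logs', filemode='w')  # e.g. creates ./logs/0000.log
    logging.info('Starting feature extraction')       # written to the file and echoed via tqdm.write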
131 |
132 | def to_torch(x, cuda):
133 |
134 | if 'float' in str(x.dtype):
135 | x = torch.Tensor(x)
136 | elif 'int' in str(x.dtype):
137 | x = torch.LongTensor(x)
138 | else:
139 | raise Exception("Unsupported dtype: {}".format(x.dtype))
140 |
141 | if cuda:
142 | x = x.cuda()
143 |
144 | return x
145 |
146 |
147 | def to_np(x):
148 | """
149 | Convert values of the model parameters to numpy.array.
150 | """
151 | return x.cpu().data.numpy()
152 |
153 |
154 | def move_model_to_gpu(model):
155 | '''
156 | Move model to GPU
157 | '''
158 | logging.info('\nUtilize GPUs for computation')
159 | logging.info('\nNumber of GPU available: {}'.format(torch.cuda.device_count()))
160 | if torch.cuda.device_count() > 1:
161 | Multi_GPU = True
162 | else:
163 | Multi_GPU = False
164 | model.cuda()
165 | cudnn.benchmark = False # for cuda 10.0
166 | model = torch.nn.DataParallel(model)
167 |
168 | return model, Multi_GPU
169 |
170 | def logging_and_writer(data_type, metrics, logging, writer=[], batch_idx=0):
171 | '''
172 | Logging to tqdm, and write to tensorboard
173 |
174 | Input:
175 | data_type: 'train' | 'valid' | 'test'
176 | metrics: output from evaluate function, including loss and other metrics
177 | logging: logging
178 | writer: tensorboard writer
179 | batch_idx: batch iteration index, only for 'train' and 'valid'
180 | '''
181 |
182 | if data_type == 'train':
183 |
184 | [tr_loss, tr_sed_mAP, tr_sed_scores, tr_doa_er_metric,
185 | tr_seld_metric] = metrics
186 |
187 | logging.info('Train SELD loss: {:.3f}, Train SED loss: {:.3f}, Train DOA loss: {:.3f}, '
188 | 'Train SED mAP(micro): {:.3f}, Train SED mAP(macro): {:.3f}'.format(
189 | tr_loss[0], tr_loss[1], tr_loss[2], tr_sed_mAP[0], tr_sed_mAP[1]))
190 | writer.add_scalar('train/SELD_loss', tr_loss[0], batch_idx)
191 | writer.add_scalar('train/SED_loss', tr_loss[1], batch_idx)
192 | writer.add_scalar('train/DOA_loss', tr_loss[2], batch_idx)
193 | writer.add_scalar('train/SED_mAP_micro', tr_sed_mAP[0], batch_idx)
194 | writer.add_scalar('train/SED_mAP_macro', tr_sed_mAP[1], batch_idx)
195 |
196 | logging.info('Train ER: {:.3f}, Train F-score: {:.3f}, Train DOA error: {:.3f}, Train DOA frame recall: {:.3f}, Train SELD error: {:.3f}'.format(
197 | tr_sed_scores[0], tr_sed_scores[1], tr_doa_er_metric[0], tr_doa_er_metric[1], tr_seld_metric))
198 | writer.add_scalar('train/ER', tr_sed_scores[0], batch_idx)
199 | writer.add_scalar('train/F_score', tr_sed_scores[1], batch_idx)
200 | writer.add_scalar('train/DOA_error', tr_doa_er_metric[0], batch_idx)
201 | writer.add_scalar('train/DOA_frame_recall', tr_doa_er_metric[1], batch_idx)
202 | writer.add_scalar('train/SELD_error', tr_seld_metric, batch_idx)
203 |
204 | elif data_type == 'valid':
205 |
206 | [train_metrics, valid_metrics] = metrics
207 |
208 | [tr_loss, tr_sed_mAP, tr_sed_scores, tr_doa_er_metric,
209 | tr_seld_metric] = train_metrics
210 |
211 | [va_loss, va_sed_mAP, va_sed_scores, va_doa_er_metric,
212 | va_seld_metric] = valid_metrics
213 |
214 | logging.info('Train SELD loss: {:.3f}, Train SED loss: {:.3f}, Train DOA loss: {:.3f}, '
215 | 'Train SED mAP(micro): {:.3f}, Train SED mAP(macro): {:.3f}'.format(
216 | tr_loss[0], tr_loss[1], tr_loss[2], tr_sed_mAP[0], tr_sed_mAP[1]))
217 | writer.add_scalar('train/SELD_loss', tr_loss[0], batch_idx)
218 | writer.add_scalar('train/SED_loss', tr_loss[1], batch_idx)
219 | writer.add_scalar('train/DOA_loss', tr_loss[2], batch_idx)
220 | writer.add_scalar('train/SED_mAP_micro', tr_sed_mAP[0], batch_idx)
221 | writer.add_scalar('train/SED_mAP_macro', tr_sed_mAP[1], batch_idx)
222 |
223 | logging.info('Valid SELD loss: {:.3f}, Valid SED loss: {:.3f}, Valid DOA loss: {:.3f}, '
224 | 'Valid SED mAP(micro): {:.3f}, Valid SED mAP(macro): {:.3f}'.format(
225 | va_loss[0], va_loss[1], va_loss[2], va_sed_mAP[0], va_sed_mAP[1]))
226 | writer.add_scalar('valid/SELD_loss', va_loss[0], batch_idx)
227 | writer.add_scalar('valid/SED_loss', va_loss[1], batch_idx)
228 | writer.add_scalar('valid/DOA_loss', va_loss[2], batch_idx)
229 | writer.add_scalar('valid/SED_mAP_micro', va_sed_mAP[0], batch_idx)
230 | writer.add_scalar('valid/SED_mAP_macro', va_sed_mAP[1], batch_idx)
231 |
232 | logging.info('Train ER: {:.3f}, Train F-score: {:.3f}, Train DOA error: {:.3f}, Train DOA frame recall: {:.3f}, Train SELD error: {:.3f}'.format(
233 | tr_sed_scores[0], tr_sed_scores[1], tr_doa_er_metric[0], tr_doa_er_metric[1], tr_seld_metric))
234 | writer.add_scalar('train/ER', tr_sed_scores[0], batch_idx)
235 | writer.add_scalar('train/F_score', tr_sed_scores[1], batch_idx)
236 | writer.add_scalar('train/DOA_error', tr_doa_er_metric[0], batch_idx)
237 | writer.add_scalar('train/DOA_frame_recall', tr_doa_er_metric[1], batch_idx)
238 | writer.add_scalar('train/SELD_error', tr_seld_metric, batch_idx)
239 |
240 | logging.info('Valid ER: {:.3f}, Valid F-score: {:.3f}, Valid DOA error: {:.3f}, Valid DOA frame recall: {:.3f}, Valid SELD error: {:.3f}'.format(
241 | va_sed_scores[0], va_sed_scores[1], va_doa_er_metric[0], va_doa_er_metric[1], va_seld_metric))
242 | writer.add_scalar('valid/ER', va_sed_scores[0], batch_idx)
243 | writer.add_scalar('valid/F_score', va_sed_scores[1], batch_idx)
244 | writer.add_scalar('valid/DOA_error', va_doa_er_metric[0], batch_idx)
245 | writer.add_scalar('valid/DOA_frame_recall', va_doa_er_metric[1], batch_idx)
246 | writer.add_scalar('valid/SELD_error', va_seld_metric, batch_idx)
247 |
248 | elif data_type == 'test':
249 |
250 | [te_loss, te_sed_mAP, te_sed_scores, te_doa_er_metric,
251 | te_seld_metric] = metrics
252 |
253 | logging.info('Test SELD loss: {:.3f}, Test SED loss: {:.3f}, Test DOA loss: {:.3f}, '
254 | 'Test SED mAP(micro): {:.3f}, Test SED mAP(macro): {:.3f}'.format(
255 | te_loss[0], te_loss[1], te_loss[2], te_sed_mAP[0], te_sed_mAP[1]))
256 |
257 | logging.info('Test ER: {:.3f}, Test F-score: {:.3f}, Test DOA error: {:.3f}, Test DOA frame recall: {:.3f}, Test SELD error: {:.3f}'.format(
258 | te_sed_scores[0], te_sed_scores[1], te_doa_er_metric[0], te_doa_er_metric[1], te_seld_metric))
259 |
260 |
261 | def print_evaluation(metrics):
262 |
263 | [te_loss, te_sed_mAP, te_sed_scores, te_doa_er_metric,
264 | te_seld_metric] = metrics
265 |
266 | print('Test SELD loss: {:.3f}, Test SED loss: {:.3f}, Test DOA loss: {:.3f}, '
267 | 'Test SED mAP(micro): {:.3f}, Test SED mAP(macro): {:.3f}'.format(
268 | te_loss[0], te_loss[1], te_loss[2], te_sed_mAP[0], te_sed_mAP[1]))
269 |
270 | print('Test ER: {:.3f}, Test F-score: {:.3f}, Test DOA error: {:.3f}, Test DOA frame recall: {:.3f}, Test SELD error: {:.3f}'.format(
271 | te_sed_scores[0], te_sed_scores[1], te_doa_er_metric[0], te_doa_er_metric[1], te_seld_metric))
272 |
273 |
274 | def str2bool(v):
275 | if isinstance(v, bool):
276 | return v
277 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
278 | return True
279 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
280 | return False
281 | else:
282 | raise argparse.ArgumentTypeError('Boolean value expected.')
283 |
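str2bool is intended as an argparse type so boolean flags can be passed as 'yes'/'no', 'true'/'false', or '1'/'0'. A hedged example (the '--cuda' flag name is illustrative, not taken from this repository's CLI):

    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda', type=str2bool, default=True)
    print(parser.parse_args(['--cuda', 'no']).cuda)  # False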
--------------------------------------------------------------------------------
/utils/resnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.utils.model_zoo import load_url as load_state_dict_from_url
4 | import torch.utils.model_zoo as model_zoo
5 |
6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
7 | 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
8 | 'wide_resnet50_2', 'wide_resnet101_2']
9 |
10 |
11 | model_urls = {
12 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
13 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
14 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
15 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
16 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
17 | 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
18 | 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
19 | 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
20 | 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
21 | }
22 |
23 |
24 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
25 | """3x3 convolution with padding"""
26 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
27 | padding=dilation, groups=groups, bias=False, dilation=dilation)
28 |
29 |
30 | def conv1x1(in_planes, out_planes, stride=1):
31 | """1x1 convolution"""
32 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
33 |
34 |
35 | class BasicBlock(nn.Module):
36 | expansion = 1
37 |
38 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
39 | base_width=64, dilation=1, norm_layer=None):
40 | super(BasicBlock, self).__init__()
41 | if norm_layer is None:
42 | norm_layer = nn.BatchNorm2d
43 | if groups != 1 or base_width != 64:
44 | raise ValueError('BasicBlock only supports groups=1 and base_width=64')
45 | if dilation > 1:
46 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
47 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1
48 | self.conv1 = conv3x3(inplanes, planes, stride)
49 | self.bn1 = norm_layer(planes)
50 | self.relu = nn.ReLU(inplace=True)
51 | self.conv2 = conv3x3(planes, planes)
52 | self.bn2 = norm_layer(planes)
53 | self.downsample = downsample
54 | self.stride = stride
55 |
56 | def forward(self, x):
57 | identity = x
58 |
59 | out = self.conv1(x)
60 | out = self.bn1(out)
61 | out = self.relu(out)
62 |
63 | out = self.conv2(out)
64 | out = self.bn2(out)
65 |
66 | if self.downsample is not None:
67 | identity = self.downsample(x)
68 |
69 | out += identity
70 | out = self.relu(out)
71 |
72 | return out
73 |
74 |
75 | class Bottleneck(nn.Module):
76 | # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
77 | # while original implementation places the stride at the first 1x1 convolution(self.conv1)
78 | # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
79 | # This variant is also known as ResNet V1.5 and improves accuracy according to
80 | # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
81 |
82 | expansion = 4
83 |
84 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
85 | base_width=64, dilation=1, norm_layer=None):
86 | super(Bottleneck, self).__init__()
87 | if norm_layer is None:
88 | norm_layer = nn.BatchNorm2d
89 | width = int(planes * (base_width / 64.)) * groups
90 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1
91 | self.conv1 = conv1x1(inplanes, width)
92 | self.bn1 = norm_layer(width)
93 | self.conv2 = conv3x3(width, width, stride, groups, dilation)
94 | self.bn2 = norm_layer(width)
95 | self.conv3 = conv1x1(width, planes * self.expansion)
96 | self.bn3 = norm_layer(planes * self.expansion)
97 | self.relu = nn.ReLU(inplace=True)
98 | self.downsample = downsample
99 | self.stride = stride
100 |
101 | def forward(self, x):
102 | identity = x
103 |
104 | out = self.conv1(x)
105 | out = self.bn1(out)
106 | out = self.relu(out)
107 |
108 | out = self.conv2(out)
109 | out = self.bn2(out)
110 | out = self.relu(out)
111 |
112 | out = self.conv3(out)
113 | out = self.bn3(out)
114 |
115 | if self.downsample is not None:
116 | identity = self.downsample(x)
117 |
118 | out += identity
119 | out = self.relu(out)
120 |
121 | return out
122 |
123 |
124 | class ResNet(nn.Module):
125 |
126 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
127 | groups=1, width_per_group=64, replace_stride_with_dilation=None,
128 | norm_layer=None):
129 | super(ResNet, self).__init__()
130 | if norm_layer is None:
131 | norm_layer = nn.BatchNorm2d
132 | self._norm_layer = norm_layer
133 |
134 | self.inplanes = 64
135 | self.dilation = 1
136 | if replace_stride_with_dilation is None:
137 | # each element in the tuple indicates if we should replace
138 | # the 2x2 stride with a dilated convolution instead
139 | replace_stride_with_dilation = [False, False, False]
140 | if len(replace_stride_with_dilation) != 3:
141 | raise ValueError("replace_stride_with_dilation should be None "
142 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
143 | self.groups = groups
144 | self.base_width = width_per_group
145 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
146 | bias=False)
147 | self.bn1 = norm_layer(self.inplanes)
148 | self.relu = nn.ReLU(inplace=True)
149 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
150 | self.layer1 = self._make_layer(block, 64, layers[0])
151 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
152 | dilate=replace_stride_with_dilation[0])
153 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
154 | dilate=replace_stride_with_dilation[1])
155 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
156 | dilate=replace_stride_with_dilation[2])
157 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
158 | self.fc = nn.Linear(512 * block.expansion, num_classes)
159 |
160 | for m in self.modules():
161 | if isinstance(m, nn.Conv2d):
162 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
163 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
164 | nn.init.constant_(m.weight, 1)
165 | nn.init.constant_(m.bias, 0)
166 |
167 | # Zero-initialize the last BN in each residual branch,
168 | # so that the residual branch starts with zeros, and each residual block behaves like an identity.
169 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
170 | if zero_init_residual:
171 | for m in self.modules():
172 | if isinstance(m, Bottleneck):
173 | nn.init.constant_(m.bn3.weight, 0)
174 | elif isinstance(m, BasicBlock):
175 | nn.init.constant_(m.bn2.weight, 0)
176 |
177 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
178 | norm_layer = self._norm_layer
179 | downsample = None
180 | previous_dilation = self.dilation
181 | if dilate:
182 | self.dilation *= stride
183 | stride = 1
184 | if stride != 1 or self.inplanes != planes * block.expansion:
185 | downsample = nn.Sequential(
186 | conv1x1(self.inplanes, planes * block.expansion, stride),
187 | norm_layer(planes * block.expansion),
188 | )
189 |
190 | layers = []
191 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
192 | self.base_width, previous_dilation, norm_layer))
193 | self.inplanes = planes * block.expansion
194 | for _ in range(1, blocks):
195 | layers.append(block(self.inplanes, planes, groups=self.groups,
196 | base_width=self.base_width, dilation=self.dilation,
197 | norm_layer=norm_layer))
198 |
199 | return nn.Sequential(*layers)
200 |
201 | def _forward_impl(self, x):
202 | # See note [TorchScript super()]
203 | x = self.conv1(x)
204 | x = self.bn1(x)
205 | x = self.relu(x)
206 | x = self.maxpool(x)
207 |
208 | x = self.layer1(x)
209 | x = self.layer2(x)
210 | x = self.layer3(x)
211 | x = self.layer4(x)
212 |
213 | x = self.avgpool(x)
214 | x = torch.flatten(x, 1)
215 | x = self.fc(x)
216 |
217 | return x
218 |
219 | def forward(self, x):
220 | return self._forward_impl(x)
221 |
222 |
223 | def _resnet(arch, block, layers, pretrained, progress, **kwargs):
224 | model = ResNet(block, layers, **kwargs)
225 | if pretrained:
226 | state_dict = load_state_dict_from_url(model_urls[arch],
227 | progress=progress)
228 | model.load_state_dict(state_dict)
229 | return model
230 |
231 |
232 | def resnet18(pretrained=False, progress=True, **kwargs):
233 | r"""ResNet-18 model from
234 | `"Deep Residual Learning for Image Recognition" `_
235 | Args:
236 | pretrained (bool): If True, returns a model pre-trained on ImageNet
237 | progress (bool): If True, displays a progress bar of the download to stderr
238 | """
239 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
240 | **kwargs)
241 |
242 |
243 | def resnet34(pretrained=False, progress=True, **kwargs):
244 | r"""ResNet-34 model from
245 | `"Deep Residual Learning for Image Recognition" `_
246 | Args:
247 | pretrained (bool): If True, returns a model pre-trained on ImageNet
248 | progress (bool): If True, displays a progress bar of the download to stderr
249 | """
250 | return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
251 | **kwargs)
252 |
253 |
254 | def resnet50(pretrained=False, progress=True, **kwargs):
255 | r"""ResNet-50 model from
256 | `"Deep Residual Learning for Image Recognition" `_
257 | Args:
258 | pretrained (bool): If True, returns a model pre-trained on ImageNet
259 | progress (bool): If True, displays a progress bar of the download to stderr
260 | """
261 | return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
262 | **kwargs)
263 |
264 |
265 | def resnet101(pretrained=False, progress=True, **kwargs):
266 | r"""ResNet-101 model from
267 | `"Deep Residual Learning for Image Recognition" `_
268 | Args:
269 | pretrained (bool): If True, returns a model pre-trained on ImageNet
270 | progress (bool): If True, displays a progress bar of the download to stderr
271 | """
272 | return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
273 | **kwargs)
274 |
275 |
276 | def resnet152(pretrained=False, progress=True, **kwargs):
277 | r"""ResNet-152 model from
278 | `"Deep Residual Learning for Image Recognition" `_
279 | Args:
280 | pretrained (bool): If True, returns a model pre-trained on ImageNet
281 | progress (bool): If True, displays a progress bar of the download to stderr
282 | """
283 | return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
284 | **kwargs)
285 |
286 |
287 | def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
288 | r"""ResNeXt-50 32x4d model from
289 | `"Aggregated Residual Transformation for Deep Neural Networks" `_
290 | Args:
291 | pretrained (bool): If True, returns a model pre-trained on ImageNet
292 | progress (bool): If True, displays a progress bar of the download to stderr
293 | """
294 | kwargs['groups'] = 32
295 | kwargs['width_per_group'] = 4
296 | return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
297 | pretrained, progress, **kwargs)
298 |
299 |
300 | def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
301 | r"""ResNeXt-101 32x8d model from
302 | `"Aggregated Residual Transformation for Deep Neural Networks" `_
303 | Args:
304 | pretrained (bool): If True, returns a model pre-trained on ImageNet
305 | progress (bool): If True, displays a progress bar of the download to stderr
306 | """
307 | kwargs['groups'] = 32
308 | kwargs['width_per_group'] = 8
309 | return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
310 | pretrained, progress, **kwargs)
311 |
312 |
313 | def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
314 | r"""Wide ResNet-50-2 model from
315 | `"Wide Residual Networks" `_
316 | The model is the same as ResNet except for the bottleneck number of channels
317 | which is twice larger in every block. The number of channels in outer 1x1
318 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
319 | channels, and in Wide ResNet-50-2 has 2048-1024-2048.
320 | Args:
321 | pretrained (bool): If True, returns a model pre-trained on ImageNet
322 | progress (bool): If True, displays a progress bar of the download to stderr
323 | """
324 | kwargs['width_per_group'] = 64 * 2
325 | return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
326 | pretrained, progress, **kwargs)
327 |
328 |
329 | def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
330 | r"""Wide ResNet-101-2 model from
331 | `"Wide Residual Networks" `_
332 | The model is the same as ResNet except for the bottleneck number of channels
333 | which is twice larger in every block. The number of channels in outer 1x1
334 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
335 | channels, and in Wide ResNet-50-2 has 2048-1024-2048.
336 | Args:
337 | pretrained (bool): If True, returns a model pre-trained on ImageNet
338 | progress (bool): If True, displays a progress bar of the download to stderr
339 | """
340 | kwargs['width_per_group'] = 64 * 2
341 | return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
342 | pretrained, progress, **kwargs)
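The `_resnet` factory above builds the architecture and, when pretrained=True, pulls ImageNet weights from model_urls; note that combining pretrained=True with a custom num_classes would fail in load_state_dict because the final fc layer changes shape. A hedged usage sketch (the import path assumes the project root is on sys.path):

    from utils.resnet import resnet18

    backbone = resnet18(pretrained=True)                  # 1000-class ImageNet head
    scratch = resnet18(pretrained=False, num_classes=11)  # fresh head, e.g. one unit per event label
    print(scratch.fc)                                     # Linear(in_features=512, out_features=11, bias=True)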
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | '''
2 | MIT License
3 |
4 | Copyright (c) 2019 Yin Cao
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | '''
24 |
25 | import torch
26 | import torch.nn as nn
27 | import torch.nn.functional as F
28 | import pdb
29 | from utils.utils import make_encoder
30 |
31 | import pdb
32 |
33 | import numpy as np
34 | import torch
35 | import torch.nn as nn
36 | import torch.nn.functional as F
37 |
38 | from model_utilities import ConvBlock, init_gru, init_layer, interpolate
39 |
40 |
41 | class CRNN9(nn.Module):
42 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None):
43 |
44 | super().__init__()
45 | self.class_num = class_num
46 | self.pool_type = pool_type
47 | self.pool_size = pool_size
48 | self.interp_ratio = 8
49 |
50 | self.conv_block1 = ConvBlock(in_channels=3, out_channels=128) # 1: 7, 128 2: 7, 64
51 | self.conv_block2 = ConvBlock(in_channels=128, out_channels=256) # 1: 128, 256 2: 64, 256
52 | self.conv_block3 = ConvBlock(in_channels=256, out_channels=512)
53 |
54 | #self.gru = nn.GRU(input_size=512, hidden_size=256,
55 | # num_layers=2, dropout=0.3, batch_first=True, bidirectional=True)
56 |
57 | self.azimuth_fc = nn.Linear(512, class_num, bias=True)
58 |
59 |
60 | self.init_weights()
61 |
62 | def init_weights(self):
63 |
64 | #init_gru(self.gru)
65 | init_layer(self.azimuth_fc)
66 |
67 |
68 | def forward(self, x):
69 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
70 |
71 | x = self.conv_block1(x, self.pool_type, pool_size=self.pool_size)
72 | x = self.conv_block2(x, self.pool_type, pool_size=self.pool_size)
73 | x = self.conv_block3(x, self.pool_type, pool_size=self.pool_size)
74 | '''(batch_size, feature_maps, time_steps, mel_bins)'''
75 |
76 | if self.pool_type == 'avg':
77 | x = torch.mean(x, dim=3)
78 | elif self.pool_type == 'max':
79 | (x, _) = torch.max(x, dim=3)
80 | '''(batch_size, feature_maps, time_steps)'''
81 |
82 | x = x.transpose(1,2)
83 | ''' (batch_size, time_steps, feature_maps):'''
84 | #
85 | # self.gru.flatten_parameters()
86 |
87 | '''if pack padded'''
88 | # '''else'''
89 | #(x, _) = self.gru(x)
90 |
91 | azimuth_output = self.azimuth_fc(x)
92 | # Interpolate
93 | output = interpolate(azimuth_output, self.interp_ratio)
94 |
95 | return output
96 |
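A rough shape trace through CRNN9 with the default pool_size=(2,2): each of the three ConvBlocks halves the time and mel axes, mel bins are then pooled away, the linear head scores each remaining frame, and interpolate stretches the frame axis back by interp_ratio=8 to the input resolution. A hedged sketch, assuming ConvBlock and interpolate (defined in model_utilities.py, not shown here) behave that way:

    model = CRNN9(class_num=10)
    x = torch.randn(4, 3, 128, 128)  # (batch, mic_channels, time_steps, mel_bins)
    out = model(x)
    print(out.shape)                 # expected torch.Size([4, 128, 10]): per-frame azimuth scores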
97 |
98 | class pretrained_CRNN8(CRNN9):
99 |
100 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None):
101 |
102 | super().__init__(class_num, pool_type, pool_size, pretrained_path=pretrained_path)
103 | if pretrained_path:
104 | self.load_weights(pretrained_path)
105 |
106 | self.gru = nn.GRU(input_size=512, hidden_size=256,
107 | num_layers=1, batch_first=True, bidirectional=True)
108 |
109 | init_gru(self.gru)
110 | init_layer(self.azimuth_fc)
111 |
112 | def load_weights(self, pretrained_path):
113 |
114 | model = CRNN9(self.class_num, self.pool_type, self.pool_size)
115 | checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage)
116 | model.load_state_dict(checkpoint['model_state_dict'])
117 |
118 | self.conv_block1 = model.conv_block1
119 | self.conv_block2 = model.conv_block2
120 | self.conv_block3 = model.conv_block3
121 |
122 |
123 |
124 | class CRNN11(nn.Module):
125 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None):
126 |
127 | super().__init__()
128 |
129 | self.class_num = class_num
130 | self.pool_type = pool_type
131 | self.pool_size = pool_size
132 | self.interp_ratio = 16
133 |
134 | self.conv_block1 = ConvBlock(in_channels=3, out_channels=64)
135 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
136 | self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
137 | self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
138 |
139 | self.gru = nn.GRU(input_size=512, hidden_size=256,
140 | num_layers=2, dropout=0.3, batch_first=True, bidirectional=True)
141 |
142 | self.azimuth_fc = nn.Linear(512, class_num, bias=True)
143 | self.init_weights()
144 |
145 | def init_weights(self):
146 |
147 | init_gru(self.gru)
148 | init_layer(self.azimuth_fc)
149 |
150 | def forward(self, x):
151 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
152 |
153 | x = self.conv_block1(x, self.pool_type, pool_size=self.pool_size)
154 | x = self.conv_block2(x, self.pool_type, pool_size=self.pool_size)
155 | x = self.conv_block3(x, self.pool_type, pool_size=self.pool_size)
156 | x = self.conv_block4(x, self.pool_type, pool_size=self.pool_size)
157 | '''(batch_size, feature_maps, time_steps, mel_bins)'''
158 |
159 | if self.pool_type == 'avg':
160 | x = torch.mean(x, dim=3)
161 | elif self.pool_type == 'max':
162 | (x, _) = torch.max(x, dim=3)
163 | '''(batch_size, feature_maps, time_steps)'''
164 |
165 | x = x.transpose(1,2)
166 | ''' (batch_size, time_steps, feature_maps):'''
167 |
168 | # self.gru.flatten_parameters()
169 | (x, _) = self.gru(x)
170 |
171 | azimuth_output = self.azimuth_fc(x)
172 | '''(batch_size, time_steps, class_num)'''
173 |
174 | # Interpolate
175 | azimuth_output = interpolate(azimuth_output, self.interp_ratio)
176 | return azimuth_output
177 |
178 |
179 | class pretrained_CRNN10(CRNN11):
180 |
181 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None):
182 |
183 | super().__init__(class_num, pool_type, pool_size, pretrained_path=pretrained_path)
184 |
185 | if pretrained_path:
186 | self.load_weights(pretrained_path)
187 |
188 | self.gru = nn.GRU(input_size=512, hidden_size=256,
189 | num_layers=1, batch_first=True, bidirectional=True)
190 |
191 | init_gru(self.gru)
192 | init_layer(self.azimuth_fc)
193 |
194 | def load_weights(self, pretrained_path):
195 |
196 | model = CRNN11(self.class_num, self.pool_type, self.pool_size)
197 | checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage)
198 | model.load_state_dict(checkpoint['model_state_dict'])
199 |
200 | self.conv_block1 = model.conv_block1
201 | self.conv_block2 = model.conv_block2
202 | self.conv_block3 = model.conv_block3
203 | self.conv_block4 = model.conv_block4
204 |
205 |
206 |
207 | class Gated_CRNN9(nn.Module):
208 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None):
209 |
210 | super().__init__()
211 |
212 | self.class_num = class_num
213 | self.pool_type = pool_type
214 | self.pool_size = pool_size
215 | self.interp_ratio = 8
216 |
217 | self.conv_block1 = ConvBlock(in_channels=3, out_channels=64) # 1: 7, 128 2: 7, 64
218 | self.conv_block2 = ConvBlock(in_channels=64, out_channels=256) # 1: 128, 256 2: 64, 256
219 | self.conv_block3 = ConvBlock(in_channels=256, out_channels=512)
220 |
221 | self.gate_block1 = ConvBlock(in_channels=3, out_channels=64)
222 | self.gate_block2 = ConvBlock(in_channels=64, out_channels=256)
223 | self.gate_block3 = ConvBlock(in_channels=256, out_channels=512)
224 |
225 | self.gru = nn.GRU(input_size=512, hidden_size=256,
226 | num_layers=2, dropout=0.3, batch_first=True, bidirectional=True)
227 |
228 | #self.azimuth_fc = nn.Linear(512, class_num, bias=True)
229 | self.azimuth_fc1 = nn.Linear(512, 128, bias=True)
230 | self.azimuth_fc2 = nn.Linear(128, class_num, bias=True)
231 |
232 | self.init_weights()
233 |
234 | def init_weights(self):
235 |
236 | init_gru(self.gru)
237 | #init_layer(self.azimuth_fc)
238 | init_layer(self.azimuth_fc1)
239 | init_layer(self.azimuth_fc2)
240 |
241 | def forward(self, x):
242 | #pdb.set_trace()
243 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
244 | gate = self.gate_block1(x, self.pool_type, pool_size=self.pool_size)
245 | x = self.conv_block1(x, self.pool_type, pool_size=self.pool_size)
246 | x = x * torch.sigmoid(gate)
247 |
248 | gate = self.gate_block2(x, self.pool_type, pool_size=self.pool_size)
249 | x = self.conv_block2(x, self.pool_type, pool_size=self.pool_size)
250 | x = x * torch.sigmoid(gate)
251 |
252 | gate = self.gate_block3(x, self.pool_type, pool_size=self.pool_size)
253 | x = self.conv_block3(x, self.pool_type, pool_size=self.pool_size)
254 | x = x * torch.sigmoid(gate)
255 | '''(batch_size, feature_maps, time_steps, mel_bins)'''
256 |
257 | if self.pool_type == 'avg':
258 | x = torch.mean(x, dim=3)
259 | elif self.pool_type == 'max':
260 | (x, _) = torch.max(x, dim=3)
261 | '''(batch_size, feature_maps, time_steps)'''
262 |
263 | x = x.transpose(1,2)
264 | ''' (batch_size, time_steps, feature_maps):'''
265 |
266 | self.gru.flatten_parameters()
267 | (x, _) = self.gru(x)
268 |
269 | x = self.azimuth_fc1(x)
270 | azimuth_output = self.azimuth_fc2(x)
271 | #azimuth_output = self.azimuth_fc(x)
272 | '''(batch_size, time_steps, class_num)'''
273 |
274 | # Interpolate
275 | azimuth_output = interpolate(azimuth_output, self.interp_ratio)
276 | azimuth_output = torch.sigmoid(azimuth_output)
277 |
278 | return azimuth_output
279 |
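Each stage of Gated_CRNN9 runs a parallel gate branch over the same input and multiplies its sigmoid into the content branch, a GLU-style gate that lets the network suppress individual time-frequency regions. The core operation in isolation, with made-up feature maps:

    content = torch.randn(4, 64, 32, 32)   # e.g. output of conv_block1
    gate = torch.randn(4, 64, 32, 32)      # output of gate_block1 on the same input
    gated = content * torch.sigmoid(gate)  # gate values near 0 suppress those bins, near 1 pass them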
280 |
281 | class pretrained_Gated_CRNN8(Gated_CRNN9):
282 |
283 | def __init__(self, class_num, pool_type='avg', pool_size=(2,2), pretrained_path=None):
284 |
285 | super().__init__(class_num, pool_type, pool_size, pretrained_path=pretrained_path)
286 |
287 | if pretrained_path:
288 | self.load_weights(pretrained_path)
289 |
290 | self.gru = nn.GRU(input_size=512, hidden_size=256,
291 | num_layers=1, batch_first=True, bidirectional=True)
292 |
293 | init_gru(self.gru)
294 | #init_layer(self.azimuth_fc)
295 | init_layer(self.azimuth_fc1)
296 | init_layer(self.azimuth_fc2)
297 |
298 | def load_weights(self, pretrained_path):
299 |
300 | model = Gated_CRNN9(self.class_num, self.pool_type, self.pool_size)
301 | checkpoint = torch.load(pretrained_path, map_location=lambda storage, loc: storage)
302 | model.load_state_dict(checkpoint['model_state_dict'])
303 |
304 | self.conv_block1 = model.conv_block1
305 | self.conv_block2 = model.conv_block2
306 | self.conv_block3 = model.conv_block3
307 |
308 | class JUNGMIN(nn.Module):
309 | def __init__(self):
310 | super().__init__()
311 |
312 | in_channel = 514
313 | out_channel = 514
314 |
315 | self.layer1 = nn.Sequential(
316 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
317 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
318 | nn.BatchNorm1d(out_channel),
319 | nn.ReLU(),
320 | )
321 |
322 |
323 |
324 | self.layer2 = nn.Sequential(
325 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
326 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
327 | nn.BatchNorm1d(out_channel),
328 | nn.ReLU(),
329 | )
330 |
331 | self.layer3 = nn.Sequential(
332 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
333 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
334 | nn.BatchNorm1d(out_channel),
335 | nn.ReLU(),
336 | )
337 |
338 | self.fc = nn.Linear(514, 10)
339 | self.init_weights()
340 |
341 | def init_weights(self):
342 | init_layer(self.fc)
343 |
344 | def forward(self, x):
345 |
346 | B,M,T,F = x.size()
347 | # pdb.set_trace()
348 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
349 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T)
350 |
351 | x = self.layer1(x)
352 | x = self.layer2(x)
353 | x = self.layer3(x)
354 | x = x.transpose(1,2)
355 |
356 | # (x, _) = self.gru(x)
357 | return self.fc(x)
358 |
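Each `layer` in the JUNGMIN models is a depthwise-separable 1-D convolution: a kernel-size-5 Conv1d with groups=in_channel filters every channel independently, and the following 1x1 Conv1d mixes channels. Compared with a dense Conv1d of the same receptive field this cuts the weight count by roughly 5x at 514 channels; a quick comparison:

    C = 514
    dense = nn.Conv1d(C, C, kernel_size=5, padding=2)
    depthwise = nn.Conv1d(C, C, kernel_size=5, padding=2, groups=C)
    pointwise = nn.Conv1d(C, C, kernel_size=1)
    print(sum(p.numel() for p in dense.parameters()))           # 1,321,494
    print(sum(p.numel() for p in depthwise.parameters()) +
          sum(p.numel() for p in pointwise.parameters()))       # 267,794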
359 | class JUNGMIN2(nn.Module):
360 | def __init__(self):
361 | super().__init__()
362 |
363 | in_channel = 514
364 | out_channel = 514
365 |
366 | self.layer1 = nn.Sequential(
367 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
368 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
369 | nn.BatchNorm1d(out_channel),
370 | nn.ReLU(),
371 | )
372 |
373 |
374 |
375 | self.layer2 = nn.Sequential(
376 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
377 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
378 | nn.BatchNorm1d(out_channel),
379 | nn.ReLU(),
380 | )
381 |
382 | self.layer3 = nn.Sequential(
383 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
384 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
385 | nn.BatchNorm1d(out_channel),
386 | nn.ReLU(),
387 | )
388 |
389 |
390 | self.gru = nn.GRU(input_size=514, hidden_size=256,
391 | num_layers=2, dropout=0.3, batch_first=True, bidirectional=True)
392 |
393 | self.fc = nn.Linear(512, 10)
394 |
395 | self.init_weights()
396 |
397 | def init_weights(self):
398 |
399 | init_gru(self.gru)
400 | init_layer(self.fc)
401 |
402 | def forward(self, x):
403 |
404 | B,M,T,F = x.size()
405 | # pdb.set_trace()
406 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
407 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T)
408 |
409 | x = self.layer1(x)
410 | x = self.layer2(x)
411 | x = self.layer3(x)
412 | x = x.transpose(1,2)
413 |
414 | (x, _) = self.gru(x)
415 | return self.fc(x)
416 |
417 |
418 | class JUNGMIN3(nn.Module):
419 | def __init__(self):
420 | super().__init__()
421 |
422 | in_channel = 514
423 | out_channel = 514
424 |
425 | self.layer1 = nn.Sequential(
426 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
427 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
428 | nn.BatchNorm1d(out_channel),
429 | nn.ReLU(),
430 | )
431 |
432 |
433 |
434 | self.layer2 = nn.Sequential(
435 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
436 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
437 | nn.BatchNorm1d(out_channel),
438 | nn.ReLU(),
439 | )
440 |
441 | self.layer3 = nn.Sequential(
442 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
443 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
444 | nn.BatchNorm1d(out_channel),
445 | nn.ReLU(),
446 | )
447 |
448 | self.fc = nn.Sequential(
449 | nn.Linear(514, 256),
450 | nn.ReLU(),
451 | nn.Linear(256, 10)
452 | )
453 |
454 |
455 | self.init_weights()
456 |
457 | def init_weights(self):
458 | init_layer(self.fc)
459 |
460 | def forward(self, x):
461 |
462 | B,M,T,F = x.size()
463 | # pdb.set_trace()
464 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
465 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T)
466 |
467 | x = self.layer1(x)
468 | x = self.layer2(x)
469 | x = self.layer3(x)
470 | x = x.transpose(1,2)
471 |
472 | # (x, _) = self.gru(x)
473 | return self.fc(x)
474 |
475 |
476 |
477 |
478 |
479 | class JUNGMIN4(nn.Module):
480 | def __init__(self):
481 | super().__init__()
482 |
483 | in_channel = 514
484 | out_channel = 514
485 |
486 | self.layer1 = nn.Sequential(
487 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
488 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
489 | nn.BatchNorm1d(out_channel),
490 | nn.ReLU(),
491 | )
492 |
493 |
494 |
495 | self.layer2 = nn.Sequential(
496 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
497 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
498 | nn.BatchNorm1d(out_channel),
499 | nn.ReLU(),
500 | )
501 |
502 | self.layer3 = nn.Sequential(
503 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
504 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
505 | nn.BatchNorm1d(out_channel),
506 | nn.ReLU(),
507 | )
508 |
509 |
510 | self.layer4 = nn.Sequential(
511 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
512 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
513 | nn.BatchNorm1d(out_channel),
514 | nn.ReLU(),
515 | )
516 |
517 |
518 |
519 | self.fc = nn.Sequential(
520 | nn.Linear(514, 256),
521 | nn.ReLU(),
522 | nn.Linear(256, 10)
523 | )
524 |
525 |
526 | self.init_weights()
527 |
528 | def init_weights(self):
529 | init_layer(self.fc)
530 |
531 | def forward(self, x):
532 |
533 | B,M,T,F = x.size()
534 | # pdb.set_trace()
535 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
536 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T)
537 |
538 | x = self.layer1(x)
539 | x = self.layer2(x)
540 | x = self.layer3(x)
541 | x = self.layer4(x)
542 | x = x.transpose(1,2)
543 |
544 | # (x, _) = self.gru(x)
545 | return self.fc(x)
546 |
547 |
548 |
549 |
550 | class JUNGMIN5(nn.Module):
551 | def __init__(self):
552 | super().__init__()
553 |
554 | in_channel = 514
555 | out_channel = 514
556 |
557 | self.layer1 = nn.Sequential(
558 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
559 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
560 | nn.BatchNorm1d(out_channel),
561 | nn.ReLU(),
562 | )
563 |
564 |
565 |
566 | self.layer2 = nn.Sequential(
567 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
568 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
569 | nn.BatchNorm1d(out_channel),
570 | nn.ReLU(),
571 | )
572 |
573 | self.layer3 = nn.Sequential(
574 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
575 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
576 | nn.BatchNorm1d(out_channel),
577 | nn.ReLU(),
578 | )
579 |
580 |
581 | self.layer4 = nn.Sequential(
582 | nn.Conv1d(in_channel, in_channel, kernel_size=5,stride=1,padding=2,groups=in_channel),
583 | nn.Conv1d(in_channel, out_channel,kernel_size=1,stride=1,padding=0),
584 | nn.BatchNorm1d(out_channel),
585 | nn.ReLU(),
586 | )
587 |
588 |
589 |
590 | self.fc = nn.Sequential(
591 | nn.Linear(514, 256),
592 | nn.ReLU(),
593 | nn.Linear(256, 10)
594 | )
595 |
596 |
597 | self.init_weights()
598 |
599 | def init_weights(self):
600 | init_layer(self.fc)
601 |
602 | def forward(self, x):
603 |
604 | B,M,T,F = x.size()
605 | # pdb.set_trace()
606 | '''input: (batch_size, mic_channels, time_steps, mel_bins)'''
607 | x = x.permute(0, 1, 3, 2).reshape(B, -1, T)
608 |
609 | x = self.layer1(x)
610 | x = self.layer2(x)
611 | x = self.layer3(x)
612 | x = self.layer4(x)
613 | x = x.transpose(1,2)
614 |
615 | # (x, _) = self.gru(x)
616 | return self.fc(x)
617 |
--------------------------------------------------------------------------------
/utils/feature_extractor.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import pdb
4 | import sys
5 | from timeit import default_timer as timer
6 |
7 | import h5py
8 | import librosa
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | import pandas as pd
12 | import scipy.io as sio
13 | from scipy import signal
14 | from tqdm import tqdm
15 |
16 | from utilities import calculate_scalar, event_labels, lb_to_ix
17 |
18 | fs = 32000
19 | nfft = 1024
20 | hopsize = 320 # 640 for 20 ms
21 | mel_bins = 128
22 | window = 'hann'
23 | fmin = 50
24 | hdf5_folder_name = '{}fs_{}nfft_{}hs_{}melb'.format(fs, nfft, hopsize, mel_bins)
25 |
26 |
27 | class LogMelExtractor():
28 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin):
29 |
30 | self.nfft = nfft
31 | self.hopsize = hopsize
32 | self.window = window
33 | self.melW = librosa.filters.mel(sr=fs,
34 | n_fft=nfft,
35 | n_mels=mel_bins,
36 | fmin=fmin)
37 |
38 | def transform(self, audio):
39 |
40 | channel_num = audio.shape[0]
41 | feature_logmel = []
42 |
43 | for n in range(channel_num):
44 | S = np.abs(librosa.stft(y=audio[n],
45 | n_fft=self.nfft,
46 | hop_length=self.hopsize,
47 | center=True,
48 | window=self.window,
49 | pad_mode='reflect'))**2
50 |
51 | S_mel = np.dot(self.melW, S).T
52 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, amin=1e-10, top_db=None)
53 | S_logmel = np.expand_dims(S_logmel, axis=0)
54 | feature_logmel.append(S_logmel)
55 |
56 | feature_logmel = np.concatenate(feature_logmel, axis=0)
57 |
58 | return feature_logmel
59 |
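LogMelExtractor computes a per-channel power spectrogram, projects it onto the mel filterbank, converts to dB, and stacks channels on the first axis. A usage sketch with synthetic audio (the one-second length is arbitrary):

    extractor = LogMelExtractor(fs=fs, nfft=nfft, hopsize=hopsize,
                                mel_bins=mel_bins, window=window, fmin=fmin)
    audio = np.random.randn(4, fs).astype(np.float32)  # (mic_channels, samples): 4 mics, 1 s
    feature = extractor.transform(audio)
    print(feature.shape)  # (4, time_frames, 128); roughly 101 frames at hopsize=320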
60 |
61 | class LogMelGccExtractor():
62 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin):
63 |
64 | self.nfft = nfft
65 | self.hopsize = hopsize
66 | self.window = window
67 | self.melW = librosa.filters.mel(sr=fs,
68 | n_fft=nfft,
69 | n_mels=mel_bins,
70 | fmin=fmin)
71 |
72 | def logmel(self, sig):
73 |
74 | S = np.abs(librosa.stft(y=sig,
75 | n_fft=self.nfft,
76 | hop_length=self.hopsize,
77 | center=True,
78 | window=self.window,
79 | pad_mode='reflect'))**2
80 | S_mel = np.dot(self.melW, S).T
81 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, amin=1e-10, top_db=None)
82 | S_logmel = np.expand_dims(S_logmel, axis=0)
83 |
84 | return S_logmel
85 |
86 | def gcc_phat(self, sig, refsig):
87 |
88 | ncorr = 2*self.nfft - 1
89 | nfft = int(2**np.ceil(np.log2(np.abs(ncorr))))
90 | Px = librosa.stft(y=sig,
91 | n_fft=nfft,
92 | hop_length=self.hopsize,
93 | center=True,
94 | window=self.window,
95 | pad_mode='reflect')
96 | Px_ref = librosa.stft(y=refsig,
97 | n_fft=nfft,
98 | hop_length=self.hopsize,
99 | center=True,
100 | window=self.window,
101 | pad_mode='reflect')
102 |
103 | R = Px*np.conj(Px_ref)
104 |
105 | n_frames = R.shape[1]
106 | gcc_phat = []
107 | for i in range(n_frames):
108 | spec = R[:, i].flatten()
109 | cc = np.fft.irfft(np.exp(1.j*np.angle(spec)))
110 | cc = np.concatenate((cc[-mel_bins//2:], cc[:mel_bins//2]))
111 | gcc_phat.append(cc)
112 | gcc_phat = np.array(gcc_phat)
113 | gcc_phat = gcc_phat[None,:,:]
114 |
115 | return gcc_phat
116 |
117 | def transform(self, audio):
118 |
119 | channel_num = audio.shape[0]
120 | feature_logmel = []
121 | feature_gcc_phat = []
122 | for n in range(channel_num):
123 | feature_logmel.append(self.logmel(audio[n]))
124 | for m in range(n+1, channel_num):
125 | feature_gcc_phat.append(
126 | self.gcc_phat(sig=audio[m], refsig=audio[n]))
127 |
128 | feature_logmel = np.concatenate(feature_logmel, axis=0)
129 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0)
130 | feature = np.concatenate([feature_logmel, feature_gcc_phat])
131 |
132 | return feature
133 |
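gcc_phat above whitens the cross-power spectrum of a microphone pair (keeping only its phase), inverse-transforms it, and keeps the mel_bins central lags per frame, so the resulting GCC-PHAT map stacks cleanly with the log-mel features; the correlation peak position encodes the inter-channel time delay that carries the spatial cue. The idea reduced to a toy example on a whole signal rather than per STFT frame:

    x = np.random.randn(1024)
    y = np.roll(x, -5)                             # same signal, offset by 5 samples
    R = np.fft.rfft(x) * np.conj(np.fft.rfft(y))   # cross-power spectrum
    cc = np.fft.irfft(np.exp(1j * np.angle(R)))    # PHAT weighting: phase only, magnitude discarded
    print(np.argmax(cc))                           # 5: the peak index reveals the relative delay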
134 |
135 | class LogMelIntensityExtractor():
136 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin):
137 |
138 | self.nfft = nfft
139 | self.hopsize = hopsize
140 | self.window = window
141 | self.melW = librosa.filters.mel(sr=fs,
142 | n_fft=nfft,
143 | n_mels=mel_bins,
144 | fmin=fmin)
145 |
146 | def logmel(self, sig):
147 |
148 | S = np.abs(librosa.stft(y=sig,
149 | n_fft=nfft,
150 | hop_length=self.hopsize,
151 | center=True,
152 | window=self.window,
153 | pad_mode='reflect'))**2
154 | S_mel = np.dot(self.melW, S).T
155 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, amin=1e-10, top_db=None)
156 | S_logmel = np.expand_dims(S_logmel, axis=0)
157 |
158 | return S_logmel
159 |
160 | def intensity(self, sig):
161 |
162 | ref = sig[0]
163 | x = sig[1]
164 | y = sig[2]
165 | z = sig[3]
166 |
167 | Pref = librosa.stft(y=ref,
168 | n_fft=nfft,
169 | hop_length=hopsize,
170 | center=True,
171 | window=self.window,
172 | pad_mode='reflect')
173 | Px = librosa.stft(y=x,
174 | n_fft=nfft,
175 | hop_length=hopsize,
176 | center=True,
177 | window=self.window,
178 | pad_mode='reflect')
179 | Py = librosa.stft(y=y,
180 | n_fft=nfft,
181 | hop_length=hopsize,
182 | center=True,
183 | window=self.window,
184 | pad_mode='reflect')
185 | Pz = librosa.stft(y=z,
186 | n_fft=nfft,
187 | hop_length=hopsize,
188 | center=True,
189 | window=self.window,
190 | pad_mode='reflect')
191 |
192 | I1 = np.real(np.conj(Pref) * Px)
193 | I2 = np.real(np.conj(Pref) * Py)
194 | I3 = np.real(np.conj(Pref) * Pz)
195 | normal = np.sqrt(I1**2 + I2**2 + I3**2)
196 | I1 = np.dot(self.melW, I1 / normal).T
197 | I2 = np.dot(self.melW, I2 / normal).T
198 | I3 = np.dot(self.melW, I3 / normal).T
199 | intensity = np.array([I1, I2, I3])
200 |
201 | return intensity
202 |
203 | def transform(self, audio):
204 |
205 | channel_num = audio.shape[0]
206 | feature_logmel = []
207 | for n in range(0, channel_num):
208 | feature_logmel.append(self.logmel(audio[n]))
209 | feature_intensity = self.intensity(sig=audio)
210 |
211 | feature_logmel = np.concatenate(feature_logmel, axis=0)
212 | feature = np.concatenate([feature_logmel, feature_intensity], axis=0)
213 |
214 | return feature
215 |
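LogMelIntensityExtractor assumes first-order Ambisonics input ordered (W, X, Y, Z): the intensity method takes the real part of the cross-spectra between the omni channel and each directional channel, normalizes the vector to unit length, and projects it onto the mel filterbank, yielding three direction channels stacked after the four log-mel channels. A hedged shape check on synthetic audio:

    extractor = LogMelIntensityExtractor(fs=fs, nfft=nfft, hopsize=hopsize,
                                         mel_bins=mel_bins, window=window, fmin=fmin)
    foa = np.random.randn(4, fs).astype(np.float32)  # (W, X, Y, Z) channels, 1 s each
    feature = extractor.transform(foa)
    print(feature.shape)  # (7, time_frames, 128): 4 log-mel maps + 3 intensity maps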
216 |
217 | class LogMelGccIntensityExtractor():
218 | def __init__(self, fs, nfft, hopsize, mel_bins, window, fmin):
219 |
220 | self.nfft = nfft
221 | self.hopsize = hopsize
222 | self.window = window
223 | self.melW = librosa.filters.mel(sr=fs,
224 | n_fft=nfft,
225 | n_mels=mel_bins,
226 | fmin=fmin)
227 |
228 | def logmel(self, sig):
229 |
230 | S = np.abs(librosa.stft(y=sig,
231 | n_fft=self.nfft,
232 | hop_length=self.hopsize,
233 | center=True,
234 | window=self.window,
235 | pad_mode='reflect'))**2
236 | S_mel = np.dot(self.melW, S).T
237 | S_logmel = librosa.power_to_db(S_mel, ref=1.0, amin=1e-10, top_db=None)
238 | S_logmel = np.expand_dims(S_logmel, axis=0)
239 |
240 | return S_logmel
241 |
242 | def gcc_phat(self, sig, refsig):
243 |
244 | ncorr = 2*self.nfft - 1
245 | nfft = int(2**np.ceil(np.log2(np.abs(ncorr))))
246 | Px = librosa.stft(y=sig,
247 | n_fft=nfft,
248 | hop_length=self.hopsize,
249 | center=True,
250 | window=self.window,
251 | pad_mode='reflect')
252 | Px_ref = librosa.stft(y=refsig,
253 | n_fft=nfft,
254 | hop_length=self.hopsize,
255 | center=True,
256 | window=self.window,
257 | pad_mode='reflect')
258 |
259 | R = Px*np.conj(Px_ref)
260 |
261 | n_frames = R.shape[1]
262 | gcc_phat = []
263 | for i in range(n_frames):
264 | spec = R[:, i].flatten()
265 | cc = np.fft.irfft(np.exp(1.j*np.angle(spec)))
266 | cc = np.concatenate((cc[-mel_bins//2:], cc[:mel_bins//2]))
267 | gcc_phat.append(cc)
268 | gcc_phat = np.array(gcc_phat)
269 | gcc_phat = gcc_phat[None,:,:]
270 |
271 | return gcc_phat
272 |
273 | def intensity(self, sig):
274 |
275 | ref = sig[0]
276 | x = sig[1]
277 | y = sig[2]
278 | z = sig[3]
279 |
280 | Pref = librosa.stft(y=ref,
281 | n_fft=nfft,
282 | hop_length=hopsize,
283 | center=True,
284 | window=self.window,
285 | pad_mode='reflect')
286 | Px = librosa.stft(y=x,
287 | n_fft=nfft,
288 | hop_length=hopsize,
289 | center=True,
290 | window=self.window,
291 | pad_mode='reflect')
292 | Py = librosa.stft(y=y,
293 | n_fft=nfft,
294 | hop_length=hopsize,
295 | center=True,
296 | window=self.window,
297 | pad_mode='reflect')
298 | Pz = librosa.stft(y=z,
299 | n_fft=nfft,
300 | hop_length=hopsize,
301 | center=True,
302 | window=self.window,
303 | pad_mode='reflect')
304 |
305 | I1 = np.real(np.conj(Pref) * Px)
306 | I2 = np.real(np.conj(Pref) * Py)
307 | I3 = np.real(np.conj(Pref) * Pz)
308 | normal = np.sqrt(I1**2 + I2**2 + I3**2)
309 | I1 = np.dot(self.melW, I1 / normal).T
310 | I2 = np.dot(self.melW, I2 / normal).T
311 | I3 = np.dot(self.melW, I3 / normal).T
312 | intensity = np.array([I1, I2, I3])
313 |
314 | return intensity
315 |
316 | def transform(self, audio):
317 |
318 | feature_logmel = []
319 | for n in range(0, 4):
320 | feature_logmel.append(self.logmel(audio[n]))
321 | feature_intensity = self.intensity(sig=audio[0:4])
322 | feature_logmel = np.concatenate(feature_logmel, axis=0)
323 | feature_foa = np.concatenate([feature_logmel, feature_intensity], axis=0)
324 |
325 | feature_logmel = []
326 | feature_gcc_phat = []
327 | for n in range(4, 8):
328 | feature_logmel.append(self.logmel(audio[n]))
329 | for m in range(n+1, 8):
330 | feature_gcc_phat.append(
331 | self.gcc_phat(sig=audio[m], refsig=audio[n]))
332 | feature_logmel = np.concatenate(feature_logmel, axis=0)
333 | feature_gcc_phat = np.concatenate(feature_gcc_phat, axis=0)
334 | feature_mic = np.concatenate([feature_logmel, feature_gcc_phat], axis=0)
335 |
336 | feature = np.concatenate([feature_foa, feature_mic], axis=0)
337 |
338 | return feature
339 |
340 |
341 | def RT_preprocessing(extractor, audio):
342 |
343 | '''This step needs to be considered'''
344 | # audio = audio / (np.max(np.abs(audio)) + np.finfo(np.float).eps)
345 |
346 | feature = extractor.transform(audio)
347 | '''(channels, seq_len, mel_bins)'''
348 | '''(channels, time, frequency)'''
349 |
350 | return feature
351 |
352 | def extract_dev_features(args):
353 | """
354 | Write features and infos of audios to hdf5.
355 | Args:
356 | dataset_dir: dataset path
357 | feature_dir: feature path
358 | audio_type: 'foa' | 'mic' | 'foa&mic'
359 | """
360 | # extractor
361 | if args.feature_type == 'logmel':
362 | extractor = LogMelExtractor(fs=fs,
363 | nfft=nfft,
364 | hopsize=hopsize,
365 | mel_bins=mel_bins,
366 | window=window,
367 | fmin=fmin)
368 | elif args.feature_type == 'logmelgcc':
369 | extractor = LogMelGccExtractor(fs=fs,
370 | nfft=nfft,
371 | hopsize=hopsize,
372 | mel_bins=mel_bins,
373 | window=window,
374 | fmin=fmin)
375 | elif args.feature_type == 'logmelintensity':
376 | extractor = LogMelIntensityExtractor(fs=fs,
377 | nfft=nfft,
378 | hopsize=hopsize,
379 | mel_bins=mel_bins,
380 | window=window,
381 | fmin=fmin)
382 | elif args.feature_type == 'logmelgccintensity':
383 | extractor = LogMelGccIntensityExtractor(fs=fs,
384 | nfft=nfft,
385 | hopsize=hopsize,
386 | mel_bins=mel_bins,
387 | window=window,
388 | fmin=fmin)
389 |
390 | # Path
391 | if args.feature_type == 'logmelgccintensity':
392 | audio_dir = [os.path.join(args.dataset_dir, 'dev', 'foa_dev'), os.path.join(args.dataset_dir, 'dev', 'mic_dev')]
393 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type,
394 | hdf5_folder_name, 'foa&mic_dev')
395 | os.makedirs(hdf5_dir, exist_ok=True)
396 | else:
397 | audio_dir = [os.path.join(args.dataset_dir, 'dev', args.audio_type + '_dev')]
398 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type,
399 | hdf5_folder_name, args.audio_type + '_dev')
400 | os.makedirs(hdf5_dir, exist_ok=True)
401 |
402 | meta_dir = os.path.join(args.dataset_dir, 'dev', 'metadata_dev')
403 |
404 | begin_time = timer()
405 | audio_count = 0
406 |
407 | print('\n============> Start Extracting Features\n')
408 |
409 | iterator = tqdm(sorted(os.listdir(audio_dir[0])), total=len(os.listdir(audio_dir[0])), unit='it')
410 |
411 | for audio_fn in iterator:
412 |
413 | if audio_fn.endswith('.wav') and not audio_fn.startswith('.'):
414 |
415 | fn = audio_fn.split('.')[0]
416 | if args.feature_type == 'logmelgccintensity':
417 | audio_path = [os.path.join(audio_dir[0], audio_fn), os.path.join(audio_dir[1], audio_fn)]
418 | audio_foa, _ = librosa.load(audio_path[0], sr=fs, mono=False, dtype=np.float32)
419 | audio_mic, _ = librosa.load(audio_path[1], sr=fs, mono=False, dtype=np.float32)
420 | audio_len = min(audio_foa.shape[1], audio_mic.shape[1])
421 | audio = np.concatenate([audio_foa[:, :audio_len], audio_mic[:, :audio_len]], axis=0)
422 | '''(channel_nums, samples)'''
423 | else:
424 | audio_path = os.path.join(audio_dir[0], audio_fn)
425 | audio, _ = librosa.load(audio_path, sr=fs, mono=False, dtype=np.float32)
426 | '''(channel_nums, samples)'''
427 |
428 | audio_count += 1
429 |
430 | if np.sum(np.abs(audio)) < len(audio)*1e-4:
431 | with open("feature_removed.txt", "a+") as text_file:
432 | # print("Purchase Amount: {}".format(TotalAmount), file=text_file)
433 | print(f"Silent file removed in feature extractor: {audio_fn}",
434 | file=text_file)
435 | tqdm.write("Silent file removed in feature extractor: {}".format(audio_fn))
436 | continue
437 |
438 | # features
439 | feature = RT_preprocessing(extractor, audio)
440 | '''(channels, time, frequency)'''
441 |
442 | meta_fn = fn + '.csv'
443 | df = pd.read_csv(os.path.join(meta_dir, meta_fn))
444 |
445 | target_event = df['sound_event_recording'].values
446 | target_start_time = df['start_time'].values
447 | target_end_time = df['end_time'].values
448 | target_ele = df['ele'].values
449 | target_azi = df['azi'].values
450 | target_dist = df['dist'].values
451 |
452 | hdf5_path = os.path.join(hdf5_dir, fn + '.h5')
453 | with h5py.File(hdf5_path, 'w') as hf:
454 |
455 | hf.create_dataset('feature', data=feature, dtype=np.float32)
456 | # hf.create_dataset('filename', data=[na.encode() for na in [fn]], dtype='S20')
457 |
458 | hf.create_group('target')
459 | hf['target'].create_dataset('event', data=[e.encode() for e in target_event], dtype='S20')
460 | hf['target'].create_dataset('start_time', data=target_start_time, dtype=np.float32)
461 | hf['target'].create_dataset('end_time', data=target_end_time, dtype=np.float32)
462 | hf['target'].create_dataset('elevation', data=target_ele, dtype=np.float32)
463 | hf['target'].create_dataset('azimuth', data=target_azi, dtype=np.float32)
464 | hf['target'].create_dataset('distance', data=target_dist, dtype=np.float32)
465 |
466 | tqdm.write('{}, {}, {}'.format(audio_count, hdf5_path, feature.shape))
467 |
468 | iterator.close()
469 |     print("Extracting features finished! Time spent: {:.3f} s".format(timer() - begin_time))
470 |
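A note on the HDF5 layout written above: each clip becomes a standalone .h5 file holding a 'feature' dataset of shape (channels, time, frequency) plus a 'target' group with per-event arrays (event, start_time, end_time, elevation, azimuth, distance). A minimal reader sketch follows; the function name load_clip is hypothetical, and only the dataset keys come from the writer above.

import h5py

def load_clip(hdf5_path):
    # Read one clip produced by extract_dev_features: feature tensor plus per-event targets.
    with h5py.File(hdf5_path, 'r') as hf:
        feature = hf['feature'][:]  # (channels, time, frequency), float32
        target = {
            'event': [e.decode() for e in hf['target']['event'][:]],  # stored as fixed-width bytes ('S20')
            'start_time': hf['target']['start_time'][:],
            'end_time': hf['target']['end_time'][:],
            'elevation': hf['target']['elevation'][:],
            'azimuth': hf['target']['azimuth'][:],
            'distance': hf['target']['distance'][:],
        }
    return feature, target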
471 |
472 | def fit(args):
473 | """
474 |     Calculate the normalization scalar (mean and std) over the extracted dev features.
475 | Args:
476 | feature_dir: feature path
477 | audio_type: 'foa' | 'mic' | 'foa&mic'
478 | """
479 |
480 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type,
481 | hdf5_folder_name, args.audio_type + '_dev')
482 |
483 | scalar_path = os.path.join(args.feature_dir, args.feature_type,
484 | hdf5_folder_name, args.audio_type + '_scalar.h5')
485 |
486 | os.makedirs(os.path.dirname(scalar_path), exist_ok=True)
487 |
488 | print('\n============> Start Calculating Scalar.\n')
489 |
490 | load_time = timer()
491 | features = []
492 | for hdf5_fn in os.listdir(hdf5_dir):
493 | hdf5_path = os.path.join(hdf5_dir, hdf5_fn)
494 | with h5py.File(hdf5_path, 'r') as hf:
495 | features.append(hf['feature'][:])
496 | print('Load feature time: {:.3f} s'.format(timer() - load_time))
497 |
498 | features = np.concatenate(features, axis=1)
499 | (mean, std) = calculate_scalar(features)
500 |
501 | with h5py.File(scalar_path, 'w') as hf_scalar:
502 | hf_scalar.create_dataset('mean', data=mean, dtype=np.float32)
503 | hf_scalar.create_dataset('std', data=std, dtype=np.float32)
504 |
505 | print('Features shape: {}'.format(features.shape))
506 | print('mean {}:\n{}'.format(mean.shape, mean))
507 | print('std {}:\n{}'.format(std.shape, std))
508 | print('Write out scalar to {}'.format(scalar_path))
509 |
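calculate_scalar itself is imported from the shared utilities and is not shown in this file. Since the per-clip features are concatenated along axis=1 (the time axis) before the call, a consistent implementation would reduce over time only, keeping one mean/std value per channel and frequency bin. The sketch below is an assumption about that helper, not its actual source:

import numpy as np

def calculate_scalar(features):
    # Assumed behaviour: per-channel, per-frequency statistics over the time axis.
    # features: (channels, total_time, frequency) -> mean/std: (channels, 1, frequency)
    mean = np.mean(features, axis=1, keepdims=True)
    std = np.std(features, axis=1, keepdims=True)
    return mean, std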
510 |
511 | def extract_eval_features(args):
512 | """
513 |     Write features of the eval audio files to hdf5.
514 | Args:
515 | dataset_dir: dataset path
516 | feature_dir: feature path
517 | audio_type: 'foa' | 'mic' | 'foa&mic'
518 | """
519 | # extractor
520 | if args.feature_type == 'logmel':
521 | extractor = LogMelExtractor(fs=fs,
522 | nfft=nfft,
523 | hopsize=hopsize,
524 | mel_bins=mel_bins,
525 | window=window,
526 | fmin=fmin)
527 | elif args.feature_type == 'logmelgcc':
528 | extractor = LogMelGccExtractor(fs=fs,
529 | nfft=nfft,
530 | hopsize=hopsize,
531 | mel_bins=mel_bins,
532 | window=window,
533 | fmin=fmin)
534 | elif args.feature_type == 'logmelintensity':
535 | extractor = LogMelIntensityExtractor(fs=fs,
536 | nfft=nfft,
537 | hopsize=hopsize,
538 | mel_bins=mel_bins,
539 | window=window,
540 | fmin=fmin)
541 | elif args.feature_type == 'logmelgccintensity':
542 | extractor = LogMelGccIntensityExtractor(fs=fs,
543 | nfft=nfft,
544 | hopsize=hopsize,
545 | mel_bins=mel_bins,
546 | window=window,
547 | fmin=fmin)
548 |
549 | # Path
550 | if args.feature_type == 'logmelgccintensity':
551 | audio_dir = [os.path.join(args.dataset_dir, 'eval', 'foa_eval'), os.path.join(args.dataset_dir, 'eval', 'mic_eval')]
552 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type,
553 | hdf5_folder_name, 'foa&mic_eval')
554 | os.makedirs(hdf5_dir, exist_ok=True)
555 | else:
556 | audio_dir = [os.path.join(args.dataset_dir, 'eval', args.audio_type + '_eval')]
557 | hdf5_dir = os.path.join(args.feature_dir, args.feature_type,
558 | hdf5_folder_name, args.audio_type + '_eval')
559 | os.makedirs(hdf5_dir, exist_ok=True)
560 |
561 | begin_time = timer()
562 | audio_count = 0
563 |
564 | print('\n============> Start Extracting Features\n')
565 |     # pdb.set_trace()  # debug breakpoint; disabled so eval extraction can run unattended
566 | iterator = tqdm(sorted(os.listdir(audio_dir[0])), total=len(os.listdir(audio_dir[0])), unit='it')
567 |
568 | for audio_fn in iterator:
569 |
570 | if audio_fn.endswith('.wav') and not audio_fn.startswith('.'):
571 |
572 | fn = audio_fn.split('.')[0]
573 | if args.feature_type == 'logmelgccintensity':
574 | audio_path = [os.path.join(audio_dir[0], audio_fn), os.path.join(audio_dir[1], audio_fn)]
575 | audio_foa, _ = librosa.load(audio_path[0], sr=fs, mono=False, dtype=np.float32)
576 | audio_mic, _ = librosa.load(audio_path[1], sr=fs, mono=False, dtype=np.float32)
577 | audio_len = min(audio_foa.shape[1], audio_mic.shape[1])
578 | audio = np.concatenate([audio_foa[:, :audio_len], audio_mic[:, :audio_len]], axis=0)
579 | '''(channel_nums, samples)'''
580 | else:
581 | audio_path = os.path.join(audio_dir[0], audio_fn)
582 | audio, _ = librosa.load(audio_path, sr=fs, mono=False, dtype=np.float32)
583 | '''(channel_nums, samples)'''
584 |
585 | audio_count += 1
586 |
587 |             if np.sum(np.abs(audio)) < audio.shape[1]*1e-4:  # per-sample threshold: audio is (channels, samples)
588 | with open("feature_removed.txt", "a+") as text_file:
589 | print(f"Silent file removed in feature extractor: {audio_fn}",
590 | file=text_file)
591 | tqdm.write("Silent file removed in feature extractor: {}".format(audio_fn))
592 | continue
593 |
594 | # features
595 | feature = RT_preprocessing(extractor, audio)
596 | '''(channels, time, frequency)'''
597 |
598 | hdf5_path = os.path.join(hdf5_dir, fn + '.h5')
599 | with h5py.File(hdf5_path, 'w') as hf:
600 |
601 | hf.create_dataset('feature', data=feature, dtype=np.float32)
602 |
603 | tqdm.write('{}, {}, {}'.format(audio_count, hdf5_path, feature.shape))
604 |
605 | iterator.close()
606 |     print("Extracting features finished! Time spent: {:.3f} s".format(timer() - begin_time))
607 |
608 |
609 | if __name__ == '__main__':
610 | parser = argparse.ArgumentParser(description='Extract features from audio file')
611 |
612 | parser.add_argument('--dataset_dir', type=str, required=True)
613 | parser.add_argument('--feature_dir', type=str, required=True)
614 | parser.add_argument('--feature_type', type=str, required=True,
615 | choices=['logmel', 'logmelgcc', 'logmelintensity', 'logmelgccintensity'])
616 | parser.add_argument('--data_type', type=str, required=True,
617 | choices=['dev', 'eval'])
618 | parser.add_argument('--audio_type', type=str, required=True,
619 | choices=['foa', 'mic', 'foa&mic'])
620 |
621 | args = parser.parse_args()
622 |
623 | if args.feature_type == 'logmelgccintensity':
624 | args.audio_type = 'foa&mic'
625 |
626 | if args.data_type == 'dev':
627 | extract_dev_features(args)
628 | fit(args)
629 | elif args.data_type == 'eval':
630 | extract_eval_features(args)
--------------------------------------------------------------------------------
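For reference, the __main__ block above can be driven from the command line with the argparse flags it defines, or programmatically by building an equivalent namespace. The sketch below does the latter; the directory paths are placeholders, and the module-level names used throughout (fs, nfft, hopsize, mel_bins, window, fmin, hdf5_folder_name) are assumed to be defined near the top of feature_extractor.py, outside this excerpt.

from argparse import Namespace
import feature_extractor as fx  # assumes the utils directory is on the import path

# Roughly equivalent to:
#   python3 feature_extractor.py --dataset_dir ./dataset --feature_dir ./features \
#       --feature_type logmelintensity --data_type dev --audio_type foa
args = Namespace(dataset_dir='./dataset',      # placeholder paths
                 feature_dir='./features',
                 feature_type='logmelintensity',
                 data_type='dev',
                 audio_type='foa')

fx.extract_dev_features(args)  # write per-clip HDF5 features for the dev split
fx.fit(args)                   # then compute and save the normalization scalar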