├── requirements_ngc.txt
├── postprocess
│   └── pp_dummy.py
├── scripts
│   ├── convert_np_first_last_10_v2.py
│   ├── create_test_fake_gt.py
│   └── create_train_folded_v3c.py
├── metrics
│   ├── default_metric.py
│   └── metric_1.py
├── configs
│   ├── default_config.py
│   ├── cfg_1.py
│   ├── cfg_pl_1.py
│   ├── cfg_3.py
│   ├── cfg_2.py
│   └── augmentations.py
├── README.md
├── models
│   ├── mdl_4.py
│   ├── mdl_3.py
│   ├── mdl_1.py
│   └── mdl_2.py
├── data
│   ├── ds_2.py
│   ├── ds_1.py
│   ├── ds_3.py
│   └── ds_pl_1.py
├── utils.py
└── train.py
--------------------------------------------------------------------------------
/requirements_ngc.txt:
--------------------------------------------------------------------------------
1 | torchaudio==2.2.2
2 | timm==1.0.7
3 | onnxruntime==1.17.3
4 | audiomentations==0.35.0
5 | colorednoise==2.2.0
6 | torch_audiomentations==0.11.1
7 | torcheval==0.0.7
--------------------------------------------------------------------------------
/postprocess/pp_dummy.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import torch
3 | from torch.nn import functional as F
4 | from tqdm import tqdm
5 | 
6 | def post_process_pipeline(cfg, val_data, val_df):
7 |     return val_data
8 | 
--------------------------------------------------------------------------------
/scripts/convert_np_first_last_10_v2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | 
5 | import torchaudio
6 | import numpy as np
7 | import pandas as pd
8 | import os
9 | import glob
10 | import librosa
11 | import multiprocessing as mp
12 | from tqdm import tqdm
13 | 
14 | fns = glob.glob('/mount/birdclef24/data/birdclef-2024/train_audio/*/*.ogg')
15 | 
16 | TARGET_FOLDER = '/mount/birdclef24/data/birdclef-2024/train_audio_npy_10_v2/'
17 | 
18 | 
19 | sub_folders = [item.split('/')[-1] for item in glob.glob('/mount/birdclef24/data/birdclef-2024/train_audio/*')]
20 | for s in sub_folders:
21 |     os.makedirs(TARGET_FOLDER + s,exist_ok=True)
22 | 
23 | SR = 32000
24 | 
25 | 
26 | def do_one(fn):
27 |     fn2 = '/'.join(fn.split('/')[-2:]).replace('.ogg','.npy')
28 |     data = librosa.load(fn, sr=SR)[0].astype(np.float32)
29 |     np.save(TARGET_FOLDER + fn2, data[:10*SR])
30 | 
31 | 
32 | 
33 | with mp.Pool(32) as p:
34 |     res = list(tqdm(p.imap(do_one,fns)))
35 | 
36 | 
37 | 
38 | ## last 10 sec
39 | 
40 | TARGET_FOLDER = '/mount/birdclef24/data/birdclef-2024/train_audio_npy_last10_v2/'
41 | 
42 | for s in sub_folders:
43 |     os.makedirs(TARGET_FOLDER + s,exist_ok=True)
44 | 
45 | def do_one(fn):
46 |     fn2 = '/'.join(fn.split('/')[-2:]).replace('.ogg','.npy')
47 |     data = librosa.load(fn, sr=SR)[0].astype(np.float32)
48 |     np.save(TARGET_FOLDER + fn2, data[-10*SR:])
49 | 
50 | with mp.Pool(32) as p:
51 |     res = list(tqdm(p.imap(do_one,fns)))
--------------------------------------------------------------------------------
/scripts/create_test_fake_gt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | 
7 | import torchaudio
8 | import torch
9 | import pandas as pd
10 | import numpy as np
11 | 
12 | 
13 | # In[2]:
14 | 
15 | 
16 | import glob
17 | import multiprocessing as mp
18 | from tqdm import tqdm
19 | 
20 | 
21 | # In[3]:
22 | 
23 | 
24 | df = pd.read_csv('/mount/birdclef24/data/birdclef-2024/test.csv')
25 | df
26 | 
27 | 
28 | # In[5]:
29 | 
30 | 
31 | def do_one(fn):
32 |     data, sr = torchaudio.load('/mount/birdclef24/data/birdclef-2024/unlabeled_soundscapes/' + fn)
33 |     return data.shape[1] / sr
34 | 
35 | 
36 | # In[7]:
37 | 
38 | 
39 | with
mp.Pool(32) as p: 40 | res = list(tqdm(p.imap(do_one,df['filename'].values))) 41 | 42 | 43 | # In[8]: 44 | 45 | 46 | df['len'] = res 47 | df['len'] = df['len'].clip(0,240) 48 | 49 | 50 | fns=[] 51 | secs = [] 52 | for i in range(df.shape[0]): 53 | fn, l = df.iloc[i] 54 | sec = list(range(5,int(l + 5),5)) 55 | secs += sec 56 | fns += [fn] * len(sec) 57 | 58 | 59 | df2 = pd.DataFrame({'row_id':[str(fn.replace('.ogg','')) + '_' + str(s) for fn, s in zip(fns,secs)]}) 60 | 61 | 62 | df2['asbfly'] = 0 63 | df2.loc[:df2.shape[0]//2,'asbfly'] = 1 64 | 65 | 66 | df2.to_csv('/mount/birdclef24/data/birdclef-2024/test_fake_gt.csv',index=False) 67 | 68 | 69 | # In[5]: 70 | 71 | 72 | fns = sorted(glob.glob('/mount/birdclef24/data/birdclef-2024/unlabeled_soundscapes/*')) 73 | fns[:5] 74 | 75 | 76 | # In[10]: 77 | 78 | 79 | test = pd.DataFrame({'filename':[fn.split('/')[-1] for fn in fns]}) 80 | 81 | 82 | # In[11]: 83 | 84 | 85 | test 86 | 87 | 88 | # In[12]: 89 | 90 | 91 | test.to_csv('/mount/birdclef24/data/birdclef-2024/test.csv',index=False) 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /metrics/default_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import f1_score 3 | import torch 4 | import scipy as sp 5 | 6 | def get_score(y_true, y_pred): 7 | return 0 8 | # score = sp.stats.pearsonr(y_true, y_pred)[0] 9 | # return score 10 | 11 | 12 | 13 | 14 | 15 | def calc_metric(cfg, pp_out, val_df, pre="val"): 16 | return 0 17 | # # if isinstance(pred_df,list): 18 | # # pred_df,gt_df = pred_df 19 | # # else: 20 | # # gt_df = None 21 | 22 | # y_true = val_df['score'].values 23 | # y_pred = val_data['preds'].cpu().numpy() 24 | # score = get_score(y_true.flatten(), y_pred.flatten()) 25 | # # print(score) 26 | 27 | # # df['score'] = df['location'].apply(ast.literal_eval) 28 | # # df['span'] = df['location'].apply(location_to_span) 29 | # # spans_true = df['span'].values 30 | 31 | # # df_pred = pred_df.copy() 32 | # # # df_pred['location'] = df_pred['location'].apply(ast.literal_eval) 33 | # # df_pred['span'] = df_pred['pred_location'].apply(pred_location_to_span) 34 | # # spans_pred = df_pred['span'].values 35 | 36 | # # score = span_micro_f1(spans_pred, spans_true) 37 | 38 | # if hasattr(cfg, "neptune_run"): 39 | # cfg.neptune_run[f"{pre}/score/"].log(score, step=cfg.curr_step) 40 | # print(f"{pre} score: {score:.6}") 41 | # # else: 42 | # # return score 43 | 44 | # # if gt_df is not None: 45 | # # df_pred = gt_df.copy() 46 | # # df_pred['span'] = df_pred['pred_location'].apply(pred_location_to_span) 47 | # # spans_pred = df_pred['span'].values 48 | 49 | # # score = span_micro_f1(spans_pred, spans_true) 50 | 51 | # # if hasattr(cfg, "neptune_run"): 52 | # # cfg.neptune_run[f"{pre}/score_debug/"].log(score, step=cfg.curr_step) 53 | # # # print(f"{pre} score_debug: {score:.6}") 54 | # return score 55 | 56 | -------------------------------------------------------------------------------- /metrics/metric_1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import roc_auc_score 3 | import torch 4 | import scipy as sp 5 | import pandas as pd 6 | 7 | 8 | 9 | from torcheval.metrics.functional import binary_auroc 10 | 11 | import time 12 | 13 | 14 | 15 | def calc_metric(cfg, pp_out, val_df, pre="val"): 16 | 17 | preds = pp_out['logits'].cpu().numpy() 18 | if pre == 'test': 19 | sub_df = 
val_df.copy()
20 |         parts = cfg.test_duration // 5
21 |         n = sub_df.shape[0]
22 |         sub_df = sub_df.loc[sub_df.index.repeat(parts)].copy().reset_index(drop=True)
23 |         sub_df['sec'] = np.arange(5,(parts+1)*5,5)[None,:].repeat(n,axis=0).reshape(-1)
24 |         sub_df['row_id'] = sub_df['filename'].apply(lambda x: x.split('.')[0]).astype(str) + '_' + sub_df['sec'].astype(str)
25 |         preds_df = pd.DataFrame(preds,columns=cfg.birds)
26 |         sub_df = pd.concat([sub_df[['row_id']],preds_df], axis=1)
27 | 
28 |         test_gt = pd.read_csv(cfg.test_gt)
29 | 
30 |         row_ids = np.intersect1d(sub_df['row_id'],test_gt['row_id'])
31 |         sub_df2 = sub_df.set_index('row_id')
32 |         test_gt2 = test_gt.set_index('row_id')
33 | 
34 |         sub_df2 = sub_df2.loc[row_ids]
35 | 
36 |         sub_df2.to_csv(f'{cfg.output_dir}/fold{cfg.fold}/pl_df_{cfg.seed}.csv')
37 |         test_gt2 = test_gt2.loc[row_ids]
38 | 
39 |         preds = sub_df2[test_gt2.columns].values
40 |         if np.isnan(preds).sum() > 0:
41 |             print(f'replacing {np.isnan(preds).sum()} NaNs with 0')
42 |             preds = np.nan_to_num(preds)
43 |         target = test_gt2.values
44 | 
45 |     else:
46 |         target = (pp_out['target'] > 0.5).float().cpu().numpy()
47 | 
48 |     good_idx = target.sum(0) > 0
49 | 
50 |     # s = time.time()
51 |     num_tasks = good_idx.sum()
52 |     if num_tasks > 1:
53 |         score = binary_auroc(torch.from_numpy(preds[:,good_idx].T).cuda().float(),torch.from_numpy(target[:,good_idx].T).cuda(),num_tasks=num_tasks).mean()
54 |     else:
55 |         score = binary_auroc(torch.from_numpy(preds[:,good_idx].T[0]).cuda().float(),torch.from_numpy(target[:,good_idx].T[0]).cuda(),num_tasks=num_tasks).mean()
56 | 
57 |     return score
58 | 
--------------------------------------------------------------------------------
/configs/default_config.py:
--------------------------------------------------------------------------------
1 | from types import SimpleNamespace
2 | from copy import deepcopy
3 | 
4 | cfg = SimpleNamespace(**{})
5 | 
6 | # stages
7 | cfg.train = True
8 | cfg.val = True
9 | cfg.test = True
10 | cfg.train_val = True
11 | 
12 | # dataset
13 | cfg.dataset = "ds_dummy"
14 | cfg.batch_size = 32
15 | cfg.val_df = None
16 | cfg.test_df = None
17 | cfg.batch_size_val = None
18 | cfg.batch_size_test = None
19 | cfg.normalization = None
20 | cfg.train_aug = None
21 | cfg.val_aug = None
22 | cfg.data_sample = -1
23 | 
24 | # model
25 | cfg.pretrained = True
26 | cfg.pretrained_weights = None
27 | cfg.pretrained_weights_strict = True
28 | cfg.model_args = {}
29 | cfg.rename_weights = None
30 | cfg.pop_weights = None
31 | 
32 | # training routine
33 | cfg.fold = 0
34 | cfg.val_fold = -1
35 | cfg.lr = 1e-4
36 | cfg.schedule = "cosine"
37 | cfg.weight_decay = 0
38 | cfg.optimizer = "Adam"
39 | cfg.epochs = 10
40 | cfg.seed = -1
41 | cfg.resume_training = False
42 | cfg.simple_eval = False
43 | cfg.do_test = True
44 | cfg.do_seg = False
45 | cfg.eval_ddp = True
46 | cfg.clip_grad = 0
47 | cfg.debug = False
48 | cfg.save_val_data = True
49 | cfg.gradient_checkpointing = False
50 | cfg.awp = False
51 | cfg.awp_per_step = False
52 | cfg.pseudo_df = None
53 | 
54 | # eval
55 | cfg.calc_metric = True
56 | cfg.calc_metric_epochs = 1
57 | cfg.eval_steps = 0
58 | cfg.post_process_pipeline = "pp_dummy"
59 | cfg.metric = 'default_metric'
60 | cfg.test_epochs = 0
61 | 
62 | # resources
63 | cfg.find_unused_parameters = False
64 | cfg.mixed_precision = True
65 | cfg.grad_accumulation = 1
66 | cfg.syncbn = False
67 | cfg.gpu = 0
68 | cfg.dp = False
69 | cfg.num_workers = 4
70 | cfg.drop_last = True
71 | cfg.save_checkpoint = True
72 | cfg.save_only_last_ckpt = False
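# note: the experiment configs (configs/cfg_*.py) import this namespace via
# `from default_config import basic_cfg` and then override individual fields,
# so every value in this file is only a default.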
73 | cfg.save_weights_only = False
74 | cfg.save_first_batch = False
75 | 
76 | # logging
77 | cfg.neptune_project = None
78 | cfg.neptune_connection_mode = "debug"
79 | cfg.tags = None
80 | cfg.save_first_batch = False
81 | cfg.save_first_batch_preds = False
82 | cfg.sgd_nesterov = True
83 | cfg.sgd_momentum = 0.9
84 | cfg.clip_mode = "norm"
85 | cfg.data_sample = -1
86 | cfg.track_grad_norm = True
87 | cfg.grad_norm_type = 2.
88 | cfg.norm_eps = 1e-4
89 | cfg.disable_tqdm = False
90 | 
91 | 
92 | 
93 | cfg.create_submission = False
94 | 
95 | cfg.loss = "bce"
96 | 
97 | cfg.tta = []
98 | 
99 | cfg.s3_bucket_name = ""
100 | cfg.s3_access_key = ""
101 | cfg.s3_secret_key = ""
102 | 
103 | basic_cfg = cfg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | 
3 | This repository contains Dieter's part of team NVBird's solution to the BirdCLEF24 competition on Kaggle.
4 | 
5 | ## Installation
6 | 
7 | I used the `nvcr.io/nvidia/pytorch:24.03-py3` container from the [ngc catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) to have a consistent environment between team members. You can run it via
8 | 
9 | `docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:24.03-py3`
10 | 
11 | Within the container, clone this repository and install the necessary packages with
12 | ```
13 | git clone https://github.com/ChristofHenkel/kaggle-birdclef24-3rd-place-solution-dieter
14 | cd kaggle-birdclef24-3rd-place-solution-dieter
15 | pip install -r requirements_ngc.txt
16 | ```
17 | 
18 | By default, training is logged via neptune.ai. You need to use your own neptune project, which is set via `cfg.neptune_project` in `configs/cfg_XXX.py`.
19 | 
20 | 
21 | ## Preparations
22 | 
23 | ### npy files
24 | 
25 | We preprocessed the dataset into numpy files for fast loading.
26 | 
27 | After downloading the competition data to `/mount/birdclef24/data/`, run `scripts/convert_np_first_last_10_v2.py` to create npy files containing the wav signal of the first and last 10 seconds of the competition recordings.
28 | 
29 | ### folds and test files
30 | 
31 | Run `scripts/create_train_folded_v3c.py` to process the original competition metadata, handle some duplicates and create folds.
32 | Run `scripts/create_test_fake_gt.py` to enable prediction on the unlabeled soundscapes.
33 | 
34 | ### AVES weights
35 | Download the model weights from [AVES](https://github.com/earthspecies/aves/) and put the torchaudio config and weights of `birdaves-biox-base` into `/mount/birdclef24/data/`.
36 | 
37 | ### background noise
38 | 
39 | Download the background noise from https://www.kaggle.com/datasets/honglihang/background-noise and put it into `/mount/birdclef24/data/background/`.
40 | 
41 | ## Training
42 | 
43 | ### First round
44 | 
45 | Run the following commands to train 5 seeds each for the 3 different model configs:
46 | 
47 | ```
48 | 5x python train.py -C cfg_1 --fold -1
49 | 5x python train.py -C cfg_2 --fold -1
50 | 5x python train.py -C cfg_3 --fold -1
51 | ```
52 | 
53 | This results in 15 csv files `pl_df_SEED.csv` in the respective model folders. Average the predictions of the three configs per seed to create the five blended files `pl_blended_0.csv` to `pl_blended_4.csv` needed for round 2, e.g. as sketched below.
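The blending itself is not shipped as a script in this repository; the following is a minimal sketch, assuming the `fold-1/pl_df_SEED.csv` output layout written by `metrics/metric_1.py` and pairing the i-th run of each config by sorted file name:

```
# minimal blending sketch (assumed paths and run pairing; not part of the repository)
import glob
import pandas as pd

model_dirs = ['/mount/birdclef24/models/cfg_1',
              '/mount/birdclef24/models/cfg_2',
              '/mount/birdclef24/models/cfg_3']

# one pl_df_{seed}.csv per training run in each model folder (fold -1)
per_model = [sorted(glob.glob(f'{d}/fold-1/pl_df_*.csv')) for d in model_dirs]

for i, files in enumerate(zip(*per_model)):
    dfs = [pd.read_csv(f).set_index('row_id') for f in files]
    blended = sum(dfs) / len(dfs)  # element-wise mean of the per-class predictions
    blended.reset_index().to_csv(f'/mount/birdclef24/data/pl_blended_{i}.csv', index=False)
```

Any consistent pairing of the 15 files into 5 triples (one run per config) should work equally well.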
65 | ### Second round
66 | 
67 | 
68 | Run
69 | 
70 | ```
71 | python train.py -C cfg_pl_1 --fold -1 --pl_df pl_blended_0.csv
72 | python train.py -C cfg_pl_1 --fold -1 --pl_df pl_blended_1.csv
73 | python train.py -C cfg_pl_1 --fold -1 --pl_df pl_blended_2.csv
74 | python train.py -C cfg_pl_1 --fold -1 --pl_df pl_blended_3.csv
75 | python train.py -C cfg_pl_1 --fold -1 --pl_df pl_blended_4.csv
76 | ```
77 | 
78 | to get the final model weights.
79 | 
80 | ## Inference
81 | 
82 | See the public notebook.
--------------------------------------------------------------------------------
/models/mdl_4.py:
--------------------------------------------------------------------------------
1 | from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
2 | import torch
3 | from torch import nn
4 | import timm
5 | from torch.cuda.amp import autocast
6 | 
7 | 
8 | 
9 | class Preprocessor(nn.Module):
10 |     def __init__(self, cfg):
11 |         super(Preprocessor, self).__init__()
12 |         self.mel_ampdb = torch.nn.Sequential(MelSpectrogram(**cfg.mel_spec_args),AmplitudeToDB(top_db=cfg.top_db))
13 |         self.m = cfg.norm_ms[0]
14 |         self.s = cfg.norm_ms[1]
15 | 
16 |     def forward(self, x):
17 |         with autocast(enabled=False), torch.no_grad():
18 |             x = x / (torch.std(x, 1, keepdim=True) + 0.01)
19 |             x = x.float()
20 |             x = self.mel_ampdb(x)
21 |             x = (x - self.m) / self.s
22 |         return x
23 | 
24 | def bce_with_mask(preds, targets, mask):
25 |     loss = nn.BCEWithLogitsLoss(reduction='none')(preds, targets)
26 |     loss = loss * mask
27 |     loss = loss.mean()
28 |     return loss
29 | 
30 | class Net(nn.Module):
31 |     def __init__(self, cfg):
32 |         super(Net, self).__init__()
33 |         self.cfg = cfg
34 |         self.preprocessing = Preprocessor(cfg)
35 |         self.backbone = timm.create_model(
36 |             cfg.backbone,
37 |             pretrained=cfg.pretrained,
38 |             drop_rate = 0.1,
39 |             #drop_path_rate = 0.2,
40 |             num_classes=cfg.num_labels,
41 | 
42 |             #global_pool=''
43 |         )
44 |         self.loss_fn = bce_with_mask
45 | 
46 | 
47 |     def forward(self, input_dict):
48 |         x = input_dict['input']
49 |         targets = input_dict['targets']
50 |         secondary_mask = input_dict['secondary_mask']
51 | 
52 |         #if test then flatten bs and parts
53 |         if len(x.shape) == 3:
54 |             bs, parts, seq_len = x.shape
55 |             targets = torch.repeat_interleave(targets[:,None],parts,dim=1)
56 |             secondary_mask = torch.repeat_interleave(secondary_mask[:,None],parts,dim=1)
57 |             x = x.reshape(bs*parts,seq_len)#.unsqueeze(1)
58 |             n_classes = targets.shape[-1]
59 |             targets = targets.reshape(bs*parts,n_classes)
60 |             secondary_mask = secondary_mask.reshape(bs*parts,n_classes)
61 | 
62 |         # return x, targets, secondary_mask
63 | 
64 | 
65 |         x = self.preprocessing(x)
66 |         with torch.no_grad():
67 |             x = x.unsqueeze(1)
68 |             pos = torch.linspace(0., 1., x.size(2)).to(x.device)
69 |             pos = pos.unsqueeze(0).unsqueeze(0).unsqueeze(-1)
70 |             pos = pos.expand(x.size(0), 1, x.size(2), x.size(3))
71 |             x = x.expand(-1, 2, -1, -1)
72 |             x = torch.cat([x, pos], 1)
73 | 
74 |         logits = self.backbone(x)
75 | 
76 | 
77 |         result = {'logits': logits}
78 |         loss = self.loss_fn(logits, targets, secondary_mask)
79 |         result.update({'loss': loss, 'target': targets})
80 | 
81 |         return result
82 | 
--------------------------------------------------------------------------------
/models/mdl_3.py:
--------------------------------------------------------------------------------
1 | from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
2 | import torch
3 | from torch import nn
4 | import timm
5 | from torch.cuda.amp import autocast
6 | import numpy as np
7 | 
8 | from torchaudio.models import wav2vec2_model
9 | from torch import nn
10 | import json
11 | import torch
12 | 
13 | class AvesTorchaudioWrapper(nn.Module):
14 | 
15 |     def __init__(self, config_path, model_path):
16 | 
17 |         super().__init__()
18 | 
19 |         # reference: https://pytorch.org/audio/stable/_modules/torchaudio/models/wav2vec2/utils/import_fairseq.html
20 | 
21 |         self.config = self.load_config(config_path)
22 |         self.model = wav2vec2_model(**self.config, aux_num_out=None)
23 |         self.model.load_state_dict(torch.load(model_path))
24 |         self.model.feature_extractor.requires_grad_(False)
25 | 
26 |     def load_config(self, config_path):
27 |         with open(config_path, 'r') as ff:
28 |             obj = json.load(ff)
29 | 
30 |         return obj
31 | 
32 |     def forward(self, sig):
33 |         # extract_features in the torchaudio version outputs all 12 layers' features; -1 selects the final one
34 |         out = self.model.extract_features(sig)[0][-1]
35 |         out = out.mean(dim=1)
36 |         return out
37 | 
38 | def bce_with_mask(preds, targets):
39 |     loss = nn.BCEWithLogitsLoss(reduction='none')(preds, targets)
40 |     # loss = loss * mask
41 |     loss = loss.mean()
42 |     return loss
43 | 
44 | def count_parameters(model):
45 |     return sum(p.numel() for p in model.parameters() if p.requires_grad)
46 | 
47 | class Net(nn.Module):
48 |     def __init__(self, cfg):
49 |         super(Net, self).__init__()
50 |         self.cfg = cfg
51 |         self.backbone = AvesTorchaudioWrapper(cfg.backbone_config_path, cfg.backbone_model_path)
52 |         self.head = nn.Linear(self.backbone.config['encoder_embed_dim'],cfg.n_classes)
53 | 
54 |         self.loss_fn = bce_with_mask
55 |         print('Net params: ',count_parameters(self))
56 | 
57 |     def forward(self, input_dict):
58 |         x = input_dict['input']
59 |         targets = input_dict['targets']
60 |         # secondary_mask = input_dict['secondary_mask']
61 | 
62 |         if self.training:
63 |             bs = x.shape[0]
64 |             perm = torch.randperm(bs)
65 |             weights = 0.2 + 0.8 * torch.rand(bs,device=x.device)
66 |             x = x + weights[:,None] * x[perm]
67 |             targets = torch.max(targets,targets[perm])
68 |             # secondary_mask = torch.min(secondary_mask,secondary_mask[perm])
69 | 
70 |         #if test then flatten bs and parts
71 |         if len(x.shape) == 3:
72 |             bs, parts, seq_len = x.shape
73 |             targets = torch.repeat_interleave(targets[:,None],parts,dim=1)
74 |             # secondary_mask = torch.repeat_interleave(secondary_mask[:,None],parts,dim=1)
75 |             x = x.reshape(bs*parts,seq_len)#.unsqueeze(1)
76 |             n_classes = targets.shape[-1]
77 |             targets = targets.reshape(bs*parts,n_classes)
78 |             # secondary_mask = secondary_mask.reshape(bs*parts,n_classes)
79 | 
80 |         # return x, targets, secondary_mask
81 | 
82 |         x = self.backbone(x)
83 |         logits = self.head(x)
84 | 
85 |         result = {'logits': logits}
86 |         loss = self.loss_fn(logits, targets)
87 |         result.update({'loss': loss, 'target': targets})
88 | 
89 |         return result
90 | 
91 | 
92 | 
--------------------------------------------------------------------------------
/models/mdl_1.py:
--------------------------------------------------------------------------------
1 | from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
2 | import torch
3 | from torch import nn
4 | import timm
5 | from torch.cuda.amp import autocast
6 | 
7 | 
8 | 
9 | class Preprocessor(nn.Module):
10 |     def __init__(self, cfg):
11 |         super(Preprocessor,
self).__init__() 12 | self.mel_ampdb = torch.nn.Sequential(MelSpectrogram(**cfg.mel_spec_args),AmplitudeToDB(top_db=cfg.top_db)) 13 | self.m = cfg.norm_ms[0] 14 | self.s = cfg.norm_ms[1] 15 | 16 | def forward(self, x): 17 | with autocast(enabled=False), torch.no_grad(): 18 | x = x / torch.std(x, 1, keepdim=True) 19 | x = x.float() 20 | x = self.mel_ampdb(x) 21 | x = (x - self.m) / self.s 22 | return x 23 | 24 | def bce_with_mask(preds, targets, mask): 25 | loss = nn.BCEWithLogitsLoss(reduction='none')(preds, targets) 26 | loss = loss * mask 27 | loss = loss.mean() 28 | return loss 29 | 30 | class Net(nn.Module): 31 | def __init__(self, cfg): 32 | super(Net, self).__init__() 33 | self.cfg = cfg 34 | self.preprocessing = Preprocessor(cfg) 35 | self.backbone = timm.create_model( 36 | cfg.backbone, 37 | pretrained=cfg.pretrained, 38 | drop_rate = 0.1, 39 | #drop_path_rate = 0.2, 40 | num_classes=cfg.num_labels, 41 | 42 | #global_pool='' 43 | ) 44 | self.loss_fn = bce_with_mask 45 | 46 | 47 | def forward(self, input_dict): 48 | x = input_dict['input'] 49 | targets = input_dict['targets'] 50 | 51 | if len(x.shape) == 3: 52 | bs, parts, seq_len = x.shape 53 | targets = torch.repeat_interleave(targets[:,None],parts,dim=1) 54 | # secondary_mask = torch.repeat_interleave(secondary_mask[:,None],parts,dim=1) 55 | x = x.reshape(bs*parts,seq_len)#.unsqueeze(1) 56 | n_classes = targets.shape[-1] 57 | targets = targets.reshape(bs*parts,n_classes) 58 | 59 | x = self.preprocessing(x) 60 | with torch.no_grad(): 61 | x = x.unsqueeze(1) 62 | pos = torch.linspace(0., 1., x.size(2)).to(x.device) 63 | pos = pos.unsqueeze(0).unsqueeze(0).unsqueeze(-1) 64 | pos = pos.expand(x.size(0), 1, x.size(2), x.size(3)) 65 | x = x.expand(-1, 2, -1, -1) 66 | x = torch.cat([x, pos], 1) 67 | 68 | logits = self.backbone(x) 69 | 70 | 71 | result = {'logits': logits} 72 | 73 | secondary_mask = input_dict['secondary_mask'] 74 | loss = self.loss_fn(logits, targets, secondary_mask) 75 | result.update({'loss': loss, 'target': targets}) 76 | 77 | return result 78 | 79 | class TestNet(Net): 80 | 81 | def forward(self,x): 82 | bs, parts, seq_len = x.shape 83 | x = x.reshape(bs*parts,seq_len)#.unsqueeze(1) 84 | x = self.preprocessing(x) 85 | with torch.no_grad(): 86 | x = x.unsqueeze(1) 87 | pos = torch.linspace(0., 1., x.size(2)).to(x.device) 88 | pos = pos.unsqueeze(0).unsqueeze(0).unsqueeze(-1) 89 | pos = pos.expand(x.size(0), 1, x.size(2), x.size(3)) 90 | x = x.expand(-1, 2, -1, -1) 91 | x = torch.cat([x, pos], 1) 92 | logits = self.backbone(x) 93 | 94 | return logits -------------------------------------------------------------------------------- /data/ds_2.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler 2 | import torch 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | import torchaudio 8 | import ast 9 | 10 | from torch import nn 11 | from torchaudio.transforms import MelSpectrogram, AmplitudeToDB 12 | 13 | def batch_to_device(batch,device): 14 | batch_dict = {key:batch[key].to(device) for key in batch} 15 | return batch_dict 16 | 17 | 18 | tr_collate_fn = None 19 | val_collate_fn = None 20 | 21 | 22 | class CustomDataset(Dataset): 23 | 24 | def __init__(self, df, cfg, aug, mode='train'): 25 | 26 | self.cfg = cfg 27 | self.mode = mode 28 | self.df = df.copy() 29 | 30 | self.bird2id = {bird:idx for idx,bird in enumerate(cfg.birds)} 31 | 32 | if self.mode == 'test': 33 | 
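            # test mode reads the raw soundscapes (cfg.test_suffix, e.g. '.ogg') for the
            # full cfg.test_duration instead of the precomputed 10s npy training crops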
self.data_folder = cfg.test_data_folder 34 | self.data_folder2 = cfg.test_data_folder 35 | self.duration = cfg.test_duration 36 | self.suffix = cfg.test_suffix 37 | else: 38 | self.data_folder = cfg.data_folder 39 | self.data_folder2 = cfg.data_folder2 40 | self.duration = cfg.duration 41 | self.suffix = cfg.suffix 42 | 43 | if self.mode == 'train': 44 | self.df = self.df[self.df['rating'] >= self.cfg.min_rating] 45 | self.weights = np.clip(df["rating"].values / df["rating"].max(), 0.1, 1.0) 46 | else: 47 | self.weights = np.ones(len(self.df)) 48 | if 'primary_label' in self.df.columns: 49 | labels = self.df['primary_label'].map(self.bird2id).astype(int).values 50 | self.labels2 = self.df['secondary_labels'].apply(lambda x: self.secondary2target(x)) 51 | self.targets = np.eye(self.cfg.n_classes)[labels] 52 | for i, t in enumerate(self.labels2): 53 | try: 54 | if len(t) > 0: 55 | self.targets[i,t] = 1 56 | except: 57 | print(i) 58 | else: 59 | self.targets = np.zeros((self.df.shape[0],self.cfg.n_classes)) 60 | 61 | #set augs 62 | self.aug_am = aug[0] 63 | 64 | if self.mode == 'test': 65 | # self.preprocessing = torch.nn.Sequential(MelSpectrogram(**cfg.mel_spec_args),AmplitudeToDB(**cfg.db_args)) 66 | # self.norm_by = cfg.norm_by 67 | self.test_parts = self.cfg.wav_crop_len // self.cfg.infer_duration 68 | 69 | 70 | 71 | def crop_or_pad(self,wav): 72 | 73 | expected_length = (self.cfg.wav_crop_len * self.cfg.sample_rate) 74 | if wav.shape[0] < expected_length: 75 | pad = self.cfg.wav_crop_len * self.cfg.sample_rate - wav.shape[0] 76 | 77 | wav_orig = wav.clone() 78 | 79 | l = wav.shape[0] 80 | 81 | if pad >= l: 82 | while wav.shape[0] <= expected_length: 83 | wav = torch.cat([wav, wav_orig], dim=0) 84 | else: 85 | max_offset = l - pad 86 | offset = np.random.randint(max_offset) 87 | wav = torch.cat([wav, wav_orig[offset:offset+pad]], dim=0) 88 | elif wav.shape[0] > expected_length: 89 | start = np.random.randint(0, wav.shape[0] - expected_length) 90 | wav = wav[start : start + expected_length] 91 | 92 | wav = wav[:expected_length] 93 | return wav 94 | 95 | 96 | def __getitem__(self, idx): 97 | 98 | 99 | 100 | row = self.df.iloc[idx] 101 | fn = row['filename'] 102 | wav_tensor = self.load_one(fn) 103 | if self.mode != 'test': 104 | wav_tensor = wav_tensor[:self.cfg.sample_rate*self.cfg.wav_max_len] 105 | target = self.targets[idx] 106 | weight = self.weights[idx] 107 | wav_tensor = self.crop_or_pad(wav_tensor) 108 | 109 | feature_dict = {'target':torch.tensor(target.astype(np.float32)),'weight':torch.tensor(weight.astype(np.float32))} 110 | if self.mode == 'test': 111 | #cut 112 | wav_tensor = wav_tensor.reshape(self.test_parts,wav_tensor.shape[0]//self.test_parts) 113 | feature_dict.update({'input':wav_tensor}) # seq_len 114 | # spec = self.preprocessing(wav_tensor) 115 | # #convert to melspec and norm 116 | # spec = (spec + self.norm_by) / self.norm_by 117 | # feature_dict.update({'input':spec}) # parts, mels, freqs 118 | else: 119 | if self.aug_am is not None: 120 | wav_tensor = torch.from_numpy(self.aug_am(wav_tensor.numpy(),sample_rate=self.cfg.sample_rate)) 121 | feature_dict.update({'input':wav_tensor}) # seq_len 122 | 123 | 124 | return feature_dict 125 | 126 | def __len__(self): 127 | return len(self.df) 128 | 129 | 130 | def load_one(self, id_): 131 | fp = self.data_folder + id_ 132 | f_id = fp.split('.')[0] 133 | try: 134 | if self.suffix == '.npy': 135 | data = torch.from_numpy(np.load(f_id + self.suffix)) 136 | else: 137 | data, rate = torchaudio.load(f_id + self.suffix) 138 | 
data = data[0] 139 | except: 140 | print("FAIL READING rec", fp) 141 | 142 | return data 143 | 144 | def secondary2target(self,secondary_label): 145 | birds = ast.literal_eval(secondary_label) 146 | target = [self.bird2id.get(item) for item in birds if not item == 'nocall'] 147 | target = [t for t in target if not t is None] 148 | return target 149 | -------------------------------------------------------------------------------- /configs/cfg_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from importlib import import_module 4 | import platform 5 | import json 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from default_config import basic_cfg 10 | import glob 11 | 12 | 13 | cfg = basic_cfg 14 | cfg.debug = True 15 | 16 | # paths 17 | 18 | cfg.name = os.path.basename(__file__).split(".")[0] 19 | cfg.output_dir = f"/mount/birdclef24/models/{os.path.basename(__file__).split('.')[0]}" 20 | 21 | cfg.data_folder = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_10_v2/" 22 | cfg.data_folder2 = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_last10_v2/" 23 | cfg.train_df = '/mount/birdclef24/data/train_folded_v3c.csv' 24 | cfg.test_df = '/mount/birdclef24/data/birdclef-2024/test.csv' 25 | cfg.test_gt = '/mount/birdclef24/data/birdclef-2024/test_fake_gt.csv' 26 | cfg.test_data_folder = '/mount/birdclef24/data/birdclef-2024/unlabeled_soundscapes/' 27 | cfg.test_suffix = '.ogg' 28 | cfg.test_duration = 240 29 | # stages 30 | cfg.test = True 31 | cfg.train = True 32 | cfg.train_val = False 33 | cfg.eval_epochs = 1 34 | 35 | #logging 36 | cfg.neptune_project = 'XXX' 37 | cfg.neptune_connection_mode = "async" 38 | cfg.tags = "base" 39 | 40 | #model 41 | cfg.model = "mdl_1" 42 | cfg.mixup = 0.5 43 | cfg.mix_beta=1. 
44 | cfg.mixadd = False 45 | cfg.backbone = "efficientvit_b1.r288_in1k" 46 | cfg.pretrained = True 47 | cfg.in_chans = 1 48 | cfg.resample_train = 10 49 | 50 | cfg.birds = ['asbfly', 'ashdro1', 'ashpri1', 'ashwoo2', 'asikoe2', 'asiope1', 51 | 'aspfly1', 'aspswi1', 'barfly1', 'barswa', 'bcnher', 'bkcbul1', 52 | 'bkrfla1', 'bkskit1', 'bkwsti', 'bladro1', 'blaeag1', 'blakit1', 53 | 'blhori1', 'blnmon1', 'blrwar1', 'bncwoo3', 'brakit1', 'brasta1', 54 | 'brcful1', 'brfowl1', 'brnhao1', 'brnshr', 'brodro1', 'brwjac1', 55 | 'brwowl1', 'btbeat1', 'bwfshr1', 'categr', 'chbeat1', 'cohcuc1', 56 | 'comfla1', 'comgre', 'comior1', 'comkin1', 'commoo3', 'commyn', 57 | 'compea', 'comros', 'comsan', 'comtai1', 'copbar1', 'crbsun2', 58 | 'cregos1', 'crfbar1', 'crseag1', 'dafbab1', 'darter2', 'eaywag1', 59 | 'emedov2', 'eucdov', 'eurbla2', 'eurcoo', 'forwag1', 'gargan', 60 | 'gloibi', 'goflea1', 'graher1', 'grbeat1', 'grecou1', 'greegr', 61 | 'grefla1', 'grehor1', 'grejun2', 'grenig1', 'grewar3', 'grnsan', 62 | 'grnwar1', 'grtdro1', 'gryfra', 'grynig2', 'grywag', 'gybpri1', 63 | 'gyhcaf1', 'heswoo1', 'hoopoe', 'houcro1', 'houspa', 'inbrob1', 64 | 'indpit1', 'indrob1', 'indrol2', 'indtit1', 'ingori1', 'inpher1', 65 | 'insbab1', 'insowl1', 'integr', 'isbduc1', 'jerbus2', 'junbab2', 66 | 'junmyn1', 'junowl1', 'kenplo1', 'kerlau2', 'labcro1', 'laudov1', 67 | 'lblwar1', 'lesyel1', 'lewduc1', 'lirplo', 'litegr', 'litgre1', 68 | 'litspi1', 'litswi1', 'lobsun2', 'maghor2', 'malpar1', 'maltro1', 69 | 'malwoo1', 'marsan', 'mawthr1', 'moipig1', 'nilfly2', 'niwpig1', 70 | 'nutman', 'orihob2', 'oripip1', 'pabflo1', 'paisto1', 'piebus1', 71 | 'piekin1', 'placuc3', 'plaflo1', 'plapri1', 'plhpar1', 'pomgrp2', 72 | 'purher1', 'pursun3', 'pursun4', 'purswa3', 'putbab1', 'redspu1', 73 | 'rerswa1', 'revbul', 'rewbul', 'rewlap1', 'rocpig', 'rorpar', 74 | 'rossta2', 'rufbab3', 'ruftre2', 'rufwoo2', 'rutfly6', 'sbeowl1', 75 | 'scamin3', 'shikra1', 'smamin1', 'sohmyn1', 'spepic1', 'spodov', 76 | 'spoowl1', 'sqtbul1', 'stbkin1', 'sttwoo1', 'thbwar1', 'tibfly3', 77 | 'tilwar1', 'vefnut1', 'vehpar1', 'wbbfly1', 'wemhar1', 'whbbul2', 78 | 'whbsho3', 'whbtre1', 'whbwag1', 'whbwat1', 'whbwoo2', 'whcbar1', 79 | 'whiter2', 'whrmun', 'whtkin2', 'woosan', 'wynlau1', 'yebbab1', 80 | 'yebbul3', 'zitcis1'] 81 | 82 | cfg.labels = np.array(cfg.birds) 83 | cfg.targets = {v : i for i,v in enumerate(cfg.labels)} 84 | cfg.num_labels = len(cfg.labels) 85 | # augmentations 86 | cfg.resample_train = 10 87 | cfg.other_samples = 1 88 | cfg.max_shift = 1 89 | 90 | cfg.n_classes = len(cfg.birds) 91 | cfg.min_rating = 0 92 | cfg.sample_rate = 32000 93 | cfg.sr = cfg.sample_rate 94 | cfg.wav_crop_len = 6 95 | cfg.duration = 5 96 | 97 | cfg.image_height = 288 98 | cfg.image_width = 288 99 | n_fft=2048 100 | win_length=1024 101 | 102 | hop_length = int((cfg.duration * cfg.sr - win_length + n_fft) / (cfg.image_width)) + 1 103 | cfg.mel_spec_args = dict(sample_rate=cfg.sample_rate, 104 | n_fft=n_fft, 105 | win_length=win_length, 106 | hop_length=hop_length, 107 | f_min=90, 108 | f_max=14000, 109 | pad=0, 110 | n_mels=cfg.image_height, 111 | mel_scale='htk', 112 | power=2.,) 113 | 114 | cfg.db_args = dict(stype="power", top_db=None) 115 | cfg.top_db = None 116 | # cfg.gem_p_trainable = True 117 | cfg.norm_ms = (40,80) 118 | 119 | # OPTIMIZATION & SCHEDULE 120 | cfg.fold = 0 121 | cfg.epochs = 20 122 | cfg.lr = 3e-3 123 | cfg.optimizer = "AdamW" 124 | cfg.weight_decay = 0.01 125 | cfg.clip_grad = 10. 
126 | cfg.warmup = 2 127 | cfg.batch_size = 128 128 | cfg.batch_size_test = 1 129 | cfg.mixed_precision = True # True 130 | cfg.pin_memory = False 131 | cfg.grad_accumulation = 1. 132 | cfg.num_workers = 8 133 | 134 | 135 | # DATASET 136 | cfg.dataset = "ds_1" 137 | cfg.suffix = '.npy' 138 | cfg.sample_weights = False 139 | cfg.normalization = 'channel' 140 | 141 | #EVAL 142 | cfg.calc_metric = False 143 | cfg.simple_eval = False 144 | # augs & tta 145 | 146 | # Postprocess 147 | cfg.post_process_pipeline = "pp_dummy" 148 | cfg.metric = "metric_1" 149 | # augs & tta 150 | 151 | #Saving 152 | cfg.save_weights_only = True 153 | cfg.save_only_last_ckpt = True 154 | 155 | 156 | -------------------------------------------------------------------------------- /configs/cfg_pl_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from importlib import import_module 4 | import platform 5 | import json 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from default_config import basic_cfg 10 | import glob 11 | 12 | 13 | cfg = basic_cfg 14 | cfg.debug = True 15 | 16 | # paths 17 | 18 | cfg.name = os.path.basename(__file__).split(".")[0] 19 | cfg.output_dir = f"/mount/birdclef24/models/{os.path.basename(__file__).split('.')[0]}" 20 | 21 | 22 | cfg.data_folder = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_10_v2/" 23 | cfg.data_folder2 = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_last10_v2/" 24 | cfg.train_df = '/mount/birdclef24/data/train_folded_v3c.csv' 25 | cfg.test_df = '/mount/birdclef24/data/birdclef-2024/test.csv' 26 | cfg.test_gt = '/mount/birdclef24/data/birdclef-2024/test_fake_gt.csv' 27 | cfg.test_data_folder = '/mount/birdclef24/data/birdclef-2024/unlabeled_soundscapes/' 28 | cfg.test_duration = 240 29 | cfg.pl_df = '/mount/birdclef24/data/pl_blended.csv' 30 | cfg.pl_data_folder = '/mount/birdclef24/data/birdclef-2024/unlabeled_soundscapes/' 31 | 32 | cfg.birds = ['asbfly', 'ashdro1', 'ashpri1', 'ashwoo2', 'asikoe2', 'asiope1', 33 | 'aspfly1', 'aspswi1', 'barfly1', 'barswa', 'bcnher', 'bkcbul1', 34 | 'bkrfla1', 'bkskit1', 'bkwsti', 'bladro1', 'blaeag1', 'blakit1', 35 | 'blhori1', 'blnmon1', 'blrwar1', 'bncwoo3', 'brakit1', 'brasta1', 36 | 'brcful1', 'brfowl1', 'brnhao1', 'brnshr', 'brodro1', 'brwjac1', 37 | 'brwowl1', 'btbeat1', 'bwfshr1', 'categr', 'chbeat1', 'cohcuc1', 38 | 'comfla1', 'comgre', 'comior1', 'comkin1', 'commoo3', 'commyn', 39 | 'compea', 'comros', 'comsan', 'comtai1', 'copbar1', 'crbsun2', 40 | 'cregos1', 'crfbar1', 'crseag1', 'dafbab1', 'darter2', 'eaywag1', 41 | 'emedov2', 'eucdov', 'eurbla2', 'eurcoo', 'forwag1', 'gargan', 42 | 'gloibi', 'goflea1', 'graher1', 'grbeat1', 'grecou1', 'greegr', 43 | 'grefla1', 'grehor1', 'grejun2', 'grenig1', 'grewar3', 'grnsan', 44 | 'grnwar1', 'grtdro1', 'gryfra', 'grynig2', 'grywag', 'gybpri1', 45 | 'gyhcaf1', 'heswoo1', 'hoopoe', 'houcro1', 'houspa', 'inbrob1', 46 | 'indpit1', 'indrob1', 'indrol2', 'indtit1', 'ingori1', 'inpher1', 47 | 'insbab1', 'insowl1', 'integr', 'isbduc1', 'jerbus2', 'junbab2', 48 | 'junmyn1', 'junowl1', 'kenplo1', 'kerlau2', 'labcro1', 'laudov1', 49 | 'lblwar1', 'lesyel1', 'lewduc1', 'lirplo', 'litegr', 'litgre1', 50 | 'litspi1', 'litswi1', 'lobsun2', 'maghor2', 'malpar1', 'maltro1', 51 | 'malwoo1', 'marsan', 'mawthr1', 'moipig1', 'nilfly2', 'niwpig1', 52 | 'nutman', 'orihob2', 'oripip1', 'pabflo1', 'paisto1', 'piebus1', 53 | 'piekin1', 'placuc3', 'plaflo1', 'plapri1', 'plhpar1', 'pomgrp2', 54 | 'purher1', 'pursun3', 
'pursun4', 'purswa3', 'putbab1', 'redspu1', 55 | 'rerswa1', 'revbul', 'rewbul', 'rewlap1', 'rocpig', 'rorpar', 56 | 'rossta2', 'rufbab3', 'ruftre2', 'rufwoo2', 'rutfly6', 'sbeowl1', 57 | 'scamin3', 'shikra1', 'smamin1', 'sohmyn1', 'spepic1', 'spodov', 58 | 'spoowl1', 'sqtbul1', 'stbkin1', 'sttwoo1', 'thbwar1', 'tibfly3', 59 | 'tilwar1', 'vefnut1', 'vehpar1', 'wbbfly1', 'wemhar1', 'whbbul2', 60 | 'whbsho3', 'whbtre1', 'whbwag1', 'whbwat1', 'whbwoo2', 'whcbar1', 61 | 'whiter2', 'whrmun', 'whtkin2', 'woosan', 'wynlau1', 'yebbab1', 62 | 'yebbul3', 'zitcis1'] 63 | 64 | 65 | cfg.test_suffix = '.ogg' 66 | 67 | 68 | 69 | cfg.parts = cfg.test_duration // 5 70 | cfg.mixup_p = 0.1 71 | # cfg.pl_weight = 192 / (192 + 128) 72 | cfg.db_range = 10.0 73 | 74 | # stages 75 | cfg.test = True 76 | 77 | cfg.train = True 78 | cfg.train_val = False 79 | cfg.eval_epochs = 1 80 | 81 | #logging 82 | cfg.neptune_project = 'XXX' 83 | cfg.neptune_connection_mode = "async" 84 | cfg.tags = "base" 85 | 86 | #model 87 | cfg.model = "mdl_4" 88 | # cfg.mixup = 0.5 89 | # cfg.mix_beta=1. 90 | cfg.mixadd = False 91 | cfg.backbone = "efficientvit_b0.r224_in1k" 92 | cfg.pretrained = True 93 | cfg.in_chans = 1 94 | cfg.resample_train = 10 95 | 96 | 97 | 98 | 99 | cfg.labels = np.array(cfg.birds) 100 | cfg.targets = {v : i for i,v in enumerate(cfg.labels)} 101 | cfg.num_labels = len(cfg.labels) 102 | # augmentations 103 | cfg.resample_train = 10 104 | cfg.other_samples = 1 105 | cfg.max_shift = 1 106 | 107 | cfg.n_classes = len(cfg.birds) 108 | cfg.min_rating = 0 109 | cfg.sample_rate = 32000 110 | cfg.sr = cfg.sample_rate 111 | cfg.wav_crop_len = 6 112 | cfg.duration = 5 113 | 114 | cfg.image_height = 224 115 | cfg.image_width = 224 116 | n_fft=1536 117 | win_length=1024 118 | 119 | hop_length = cfg.duration * cfg.sr // (cfg.image_width - 1) 120 | cfg.mel_spec_args = dict(sample_rate=cfg.sample_rate, 121 | n_fft=n_fft, 122 | win_length=win_length, 123 | hop_length=hop_length, 124 | f_min=90, 125 | f_max=14000, 126 | pad=0, 127 | n_mels=cfg.image_height, 128 | mel_scale='htk', 129 | power=2.,) 130 | 131 | cfg.db_args = dict(stype="power", top_db=None) 132 | cfg.top_db = None 133 | # cfg.gem_p_trainable = True 134 | cfg.norm_ms = (40,80) 135 | 136 | # OPTIMIZATION & SCHEDULE 137 | cfg.fold = -1 138 | cfg.epochs = 20 139 | cfg.lr = 3e-3 140 | cfg.optimizer = "AdamW" 141 | cfg.weight_decay = 0.01 142 | cfg.clip_grad = 10. 143 | cfg.warmup = 2 144 | cfg.batch_size = 2 145 | cfg.batch_size_test = 2 146 | cfg.batch_size_val = 128 147 | cfg.mixed_precision = True # True 148 | cfg.pin_memory = False 149 | cfg.grad_accumulation = 1. 
150 | cfg.num_workers = 2 151 | 152 | 153 | # DATASET 154 | cfg.dataset = "ds_pl_1" 155 | cfg.suffix = '.npy' 156 | cfg.sample_weights = False 157 | cfg.normalization = 'channel' 158 | 159 | #EVAL 160 | cfg.calc_metric = False 161 | cfg.simple_eval = False 162 | cfg.test_epochs = 0 163 | # augs & tta 164 | 165 | # Postprocess 166 | cfg.post_process_pipeline = "pp_dummy" 167 | cfg.metric = "metric_1" 168 | # augs & tta 169 | 170 | #Saving 171 | cfg.save_weights_only = True 172 | cfg.save_only_last_ckpt = True 173 | 174 | -------------------------------------------------------------------------------- /data/ds_1.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler 2 | import numpy as np 3 | import torchaudio 4 | import torch 5 | import librosa 6 | 7 | tr_collate_fn = None 8 | val_collate_fn = None 9 | 10 | 11 | 12 | 13 | 14 | 15 | # def load_one(self, id_): 16 | # fp = self.cfg.data_folder + id_ 17 | # try: 18 | # data, rate = torchaudio.load(fp) 19 | # except: 20 | # print("FAIL READING rec", fp) 21 | 22 | # return data[0] 23 | 24 | class CustomDataset(Dataset): 25 | def __init__(self, train, cfg, aug, mode='train'): 26 | self.cfg = cfg 27 | self.df = train.copy() 28 | print(self.df.head()) 29 | self.mode = mode 30 | self.istrain = mode=='train' 31 | 32 | if self.mode == 'test': 33 | self.data_folder = cfg.test_data_folder 34 | self.data_folder2 = cfg.test_data_folder 35 | self.duration = cfg.test_duration 36 | self.suffix = cfg.test_suffix 37 | else: 38 | self.data_folder = cfg.data_folder 39 | self.data_folder2 = cfg.data_folder2 40 | self.duration = cfg.duration 41 | self.suffix = cfg.suffix 42 | 43 | self.filename = train.filename.values 44 | #self.primary_label = train.primary_label.values 45 | if not 'primary_label' in train.columns: 46 | train['primary_label'] = np.array(['asbfly'] * train.shape[0]) 47 | if 'first_species' not in train.columns: 48 | train['first_species'] = train['primary_label'].values 49 | train['last_species'] = train['primary_label'].values 50 | if 'secondary_labels' in train.columns: 51 | self.secondary_labels = train.secondary_labels.values 52 | self.first_species = train.first_species.values 53 | self.last_species = train.last_species.values 54 | else: 55 | self.secondary_labels = np.array(['asbfly'] * train.shape[0]) 56 | self.first_species = np.array(['asbfly'] * train.shape[0]) 57 | self.last_species = np.array(['asbfly'] * train.shape[0]) 58 | 59 | if self.mode == 'test': 60 | self.test_parts = self.duration // 5 61 | 62 | 63 | 64 | def __len__(self): 65 | return len(self.filename) 66 | 67 | def get_audio(self, idx): 68 | filename = self.filename[idx] 69 | duration = self.cfg.sr * self.duration 70 | if self.istrain: 71 | first = np.random.rand() < 0.5 72 | audio = self.load_audio(filename, first, True, self.cfg) 73 | if len(audio) < duration: 74 | pad_length = np.random.randint(0, duration - len(audio) + 1) 75 | audio = np.pad(audio, 76 | ((pad_length, duration - len(audio) - pad_length),), 77 | mode='constant') 78 | else: 79 | start = np.random.randint(0, len(audio) - duration + 1) 80 | audio = audio[start : start + duration] 81 | else: 82 | audio = self.load_audio(filename, True, False, self.cfg) 83 | audio = audio[:duration] 84 | if len(audio) < duration: 85 | pad_length = (duration - len(audio)) // 2 86 | audio = np.pad(audio, 87 | ((pad_length, duration - len(audio) - pad_length),), 88 | mode='constant') 89 | return audio 90 | 91 | 
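    # __getitem__ builds a multi-hot target from the first/last species and, in train
    # mode, additively mixes in up to cfg.other_samples other clips with random weights
    # (0.2-1.0); classes listed as secondary labels are zeroed in `secondary_mask` so the
    # masked BCE loss ignores them, while the final np.maximum re-enables actual targets.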
def __getitem__(self, idx): 92 | audio = self.get_audio(idx) 93 | targets = np.zeros(len(self.cfg.labels), dtype=np.float32) 94 | targets[self.cfg.targets[self.first_species[idx]]] = 1.0 95 | targets[self.cfg.targets[self.last_species[idx]]] = 1.0 96 | secondary_mask = np.ones(len(self.cfg.labels), dtype=np.float32) 97 | secondary_labels = self.secondary_labels[idx] 98 | if len(secondary_labels) > 0: 99 | for label in secondary_labels: 100 | if label in self.cfg.targets: 101 | secondary_mask[self.cfg.targets[label]] = 0 102 | if self.istrain and self.cfg.other_samples: 103 | num_samples = np.random.randint(0, self.cfg.other_samples + 1) 104 | for _ in range(num_samples): 105 | other_idx = np.random.randint(len(self.filename)) 106 | other_audio = self.get_audio(other_idx) 107 | weight = 0.2 + 0.8 * np.random.rand() 108 | audio += weight * other_audio 109 | targets[self.cfg.targets[self.first_species[other_idx]]] = 1.0 110 | targets[self.cfg.targets[self.last_species[other_idx]]] = 1.0 111 | secondary_labels = self.secondary_labels[other_idx] 112 | if len(secondary_labels) > 0: 113 | for label in secondary_labels: 114 | if label in self.cfg.targets: 115 | secondary_mask[self.cfg.targets[label]] = 0 116 | secondary_mask = np.maximum(secondary_mask, targets) 117 | 118 | wav_tensor = torch.from_numpy(audio) 119 | if self.mode == 'test': 120 | #cut 121 | wav_tensor = wav_tensor.reshape(self.test_parts,wav_tensor.shape[0]//self.test_parts) 122 | 123 | out = { 124 | 'input' : wav_tensor, 125 | 'targets' : torch.from_numpy(targets), 126 | 'secondary_mask' : secondary_mask, 127 | } 128 | return out 129 | 130 | def load_audio(self, filename, first, istrain, cfg): 131 | f_id = filename.split('.')[0] 132 | if istrain: 133 | max_duration = int((self.duration + cfg.max_shift) * cfg.sr) 134 | else: 135 | max_duration = self.duration * cfg.sr 136 | if first: 137 | fp = f'{self.data_folder}/{f_id}{self.suffix}' 138 | if self.suffix == '.wav': 139 | audio, rate = torchaudio.load(fp) 140 | audio = audio[0].numpy() 141 | elif self.suffix == '.ogg': 142 | audio = librosa.load(fp, sr=cfg.sr)[0].astype(np.float32) 143 | else: 144 | audio = np.load(fp) 145 | audio = audio[:max_duration] 146 | else: 147 | fp = f'{self.data_folder2}/{f_id}{self.suffix}' 148 | if cfg.suffix in ['.wav','.ogg']: 149 | audio, rate = torchaudio.load(fp) 150 | audio = audio[0].numpy() 151 | else: 152 | 153 | # filepath = filepath / f"first10_{fname}.npy" 154 | audio = np.load(fp) 155 | audio = audio[-max_duration:] 156 | return audio 157 | 158 | def batch_to_device(batch, device): 159 | return {k:batch[k].to(device, non_blocking=True) for k in batch.keys() if k not in []} -------------------------------------------------------------------------------- /configs/cfg_3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from importlib import import_module 4 | import platform 5 | import json 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from default_config import basic_cfg as cfg 10 | import glob 11 | 12 | # paths 13 | 14 | cfg.name = os.path.basename(__file__).split(".")[0] 15 | cfg.output_dir = f"/mount/birdclef24/models/{os.path.basename(__file__).split('.')[0]}" 16 | 17 | cfg.data_folder = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_10_v2/" 18 | cfg.data_folder2 = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_last10_v2/" 19 | cfg.train_df = '/mount/birdclef24/data/train_folded_v3c.csv' 20 | cfg.test_df = 
'/mount/birdclef24/data/birdclef-2024/test.csv' 21 | cfg.test_gt = '/mount/birdclef24/data/birdclef-2024/test_fake_gt.csv' 22 | cfg.test_data_folder = '/mount/birdclef24/data/birdclef-2024/unlabeled_soundscapes/' 23 | cfg.test_suffix = '.ogg' 24 | cfg.test_duration = 240 25 | cfg.birds = ['asbfly', 'ashdro1', 'ashpri1', 'ashwoo2', 'asikoe2', 'asiope1', 26 | 'aspfly1', 'aspswi1', 'barfly1', 'barswa', 'bcnher', 'bkcbul1', 27 | 'bkrfla1', 'bkskit1', 'bkwsti', 'bladro1', 'blaeag1', 'blakit1', 28 | 'blhori1', 'blnmon1', 'blrwar1', 'bncwoo3', 'brakit1', 'brasta1', 29 | 'brcful1', 'brfowl1', 'brnhao1', 'brnshr', 'brodro1', 'brwjac1', 30 | 'brwowl1', 'btbeat1', 'bwfshr1', 'categr', 'chbeat1', 'cohcuc1', 31 | 'comfla1', 'comgre', 'comior1', 'comkin1', 'commoo3', 'commyn', 32 | 'compea', 'comros', 'comsan', 'comtai1', 'copbar1', 'crbsun2', 33 | 'cregos1', 'crfbar1', 'crseag1', 'dafbab1', 'darter2', 'eaywag1', 34 | 'emedov2', 'eucdov', 'eurbla2', 'eurcoo', 'forwag1', 'gargan', 35 | 'gloibi', 'goflea1', 'graher1', 'grbeat1', 'grecou1', 'greegr', 36 | 'grefla1', 'grehor1', 'grejun2', 'grenig1', 'grewar3', 'grnsan', 37 | 'grnwar1', 'grtdro1', 'gryfra', 'grynig2', 'grywag', 'gybpri1', 38 | 'gyhcaf1', 'heswoo1', 'hoopoe', 'houcro1', 'houspa', 'inbrob1', 39 | 'indpit1', 'indrob1', 'indrol2', 'indtit1', 'ingori1', 'inpher1', 40 | 'insbab1', 'insowl1', 'integr', 'isbduc1', 'jerbus2', 'junbab2', 41 | 'junmyn1', 'junowl1', 'kenplo1', 'kerlau2', 'labcro1', 'laudov1', 42 | 'lblwar1', 'lesyel1', 'lewduc1', 'lirplo', 'litegr', 'litgre1', 43 | 'litspi1', 'litswi1', 'lobsun2', 'maghor2', 'malpar1', 'maltro1', 44 | 'malwoo1', 'marsan', 'mawthr1', 'moipig1', 'nilfly2', 'niwpig1', 45 | 'nutman', 'orihob2', 'oripip1', 'pabflo1', 'paisto1', 'piebus1', 46 | 'piekin1', 'placuc3', 'plaflo1', 'plapri1', 'plhpar1', 'pomgrp2', 47 | 'purher1', 'pursun3', 'pursun4', 'purswa3', 'putbab1', 'redspu1', 48 | 'rerswa1', 'revbul', 'rewbul', 'rewlap1', 'rocpig', 'rorpar', 49 | 'rossta2', 'rufbab3', 'ruftre2', 'rufwoo2', 'rutfly6', 'sbeowl1', 50 | 'scamin3', 'shikra1', 'smamin1', 'sohmyn1', 'spepic1', 'spodov', 51 | 'spoowl1', 'sqtbul1', 'stbkin1', 'sttwoo1', 'thbwar1', 'tibfly3', 52 | 'tilwar1', 'vefnut1', 'vehpar1', 'wbbfly1', 'wemhar1', 'whbbul2', 53 | 'whbsho3', 'whbtre1', 'whbwag1', 'whbwat1', 'whbwoo2', 'whcbar1', 54 | 'whiter2', 'whrmun', 'whtkin2', 'woosan', 'wynlau1', 'yebbab1', 55 | 'yebbul3', 'zitcis1'] 56 | cfg.test_epochs = 0 57 | 58 | # stages 59 | cfg.test = True 60 | cfg.train = True 61 | cfg.train_val = False 62 | cfg.eval_epochs = 1 63 | 64 | #logging 65 | cfg.neptune_project = 'XXX' 66 | cfg.neptune_connection_mode = "async" 67 | cfg.tags = "base" 68 | 69 | #model 70 | cfg.model = "mdl_3" 71 | 72 | cfg.backbone_config_path = '/mount/birdclef24/data/aves/birdaves-biox-base.torchaudio.model_config.json' 73 | cfg.backbone_model_path = '/mount/birdclef24/data/aves/birdaves-biox-base.torchaudio.pt' 74 | cfg.pretrained = True 75 | cfg.in_chans = 1 76 | cfg.resample_train = 10 77 | 78 | cfg.rare_birds = [] 79 | 80 | cfg.labels = np.array(cfg.birds) 81 | cfg.targets = {v : i for i,v in enumerate(cfg.labels)} 82 | cfg.rare_ids = np.array([cfg.targets[b] for b in cfg.rare_birds]) 83 | cfg.num_labels = len(cfg.labels) 84 | # augmentations 85 | cfg.resample_train = 10 86 | cfg.other_samples = 1 87 | cfg.max_shift = 1 88 | 89 | cfg.n_classes = len(cfg.birds) 90 | cfg.sample_rate = 32000 91 | cfg.sr = cfg.sample_rate 92 | cfg.duration = 5 93 | 94 | 95 | # OPTIMIZATION & SCHEDULE 96 | cfg.fold = -1 97 | cfg.epochs = 100 98 | cfg.lr = 
5e-5
99 | cfg.optimizer = "AdamW"
100 | cfg.weight_decay = 0.001
101 | cfg.clip_grad = 10.
102 | cfg.warmup = 2
103 | cfg.batch_size = 48
104 | cfg.batch_size_test = 1
105 | cfg.mixed_precision = True # True
106 | cfg.pin_memory = False
107 | cfg.grad_accumulation = 1.
108 | cfg.num_workers = 8
109 | 
110 | 
111 | # DATASET
112 | cfg.dataset = "ds_3"
113 | cfg.label_secondary = 0.5
114 | cfg.suffix = '.npy'
115 | cfg.eval_epochs = 20
116 | 
117 | cfg.sample_weights = False
118 | cfg.normalization = 'channel'
119 | 
120 | #EVAL
121 | cfg.calc_metric = False
122 | cfg.simple_eval = False
123 | 
124 | # augs & tta
125 | 
126 | # Postprocess
127 | cfg.post_process_pipeline = "pp_dummy"
128 | cfg.metric = "metric_1"
129 | # augs & tta
130 | 
131 | #Saving
132 | cfg.save_weights_only = True
133 | cfg.save_only_last_ckpt = True
134 | 
135 | 
136 | from augmentations import (
137 |     CustomCompose,
138 |     CustomOneOf,
139 |     NoiseInjection,
140 |     GaussianNoise,
141 |     PinkNoise,
142 |     AddGaussianNoise,
143 |     AddGaussianSNR,
144 | )
145 | 
146 | cfg.np_audio_transforms = CustomCompose(
147 |     [
148 |         CustomOneOf(
149 |             [
150 |                 NoiseInjection(p=1, max_noise_level=0.04),
151 |                 GaussianNoise(p=1, min_snr=5, max_snr=20),
152 |                 PinkNoise(p=1, min_snr=5, max_snr=20),
153 |                 AddGaussianNoise(min_amplitude=0.0001, max_amplitude=0.03, p=0.5),
154 |                 AddGaussianSNR(min_snr_in_db=5, max_snr_in_db=15, p=0.5),
155 |             ],
156 |             p=0.3,
157 |         ),
158 |     ]
159 | )
160 | 
161 | from audiomentations import Compose as amCompose
162 | from audiomentations import OneOf as amOneOf
163 | from audiomentations import AddBackgroundNoise, Gain, GainTransition
164 | 
165 | bg_folder = '/mount/birdclef24/data/background/honglihang_background_noise_sec_wav/'
166 | cfg.freefield = glob.glob(f"{bg_folder}freefield/*")
167 | cfg.warblrb = glob.glob(f"{bg_folder}warblrb/*")
168 | cfg.birdvox = glob.glob(f"{bg_folder}birdvox/*")
169 | cfg.rainforest = glob.glob(f"{bg_folder}rainforest/*")
170 | 
171 | cfg.am_audio_transforms = amCompose(
172 |     [
173 | 
174 |         amOneOf([AddBackgroundNoise(
175 |             cfg.rainforest,
176 |             min_snr_in_db=3.,
177 |             max_snr_in_db=30.,
178 |             p=0.7,),
179 |             AddBackgroundNoise(
180 |                 cfg.freefield + cfg.warblrb + cfg.birdvox,
181 |                 min_snr_in_db=3.,
182 |                 max_snr_in_db=30.,
183 |                 p=0.35,
184 |             )
185 |         ],p=0.25),
186 | 
187 |         amOneOf(
188 |             [
189 |                 Gain(min_gain_in_db=-15, max_gain_in_db=15, p=0.8),
190 |                 GainTransition(min_gain_in_db=-15, max_gain_in_db=15, p=0.8),
191 |             ],
192 |         ),
193 |     ]
194 | )
195 | 
196 | 
197 | 
--------------------------------------------------------------------------------
/data/ds_3.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
2 | import numpy as np
3 | import torchaudio
4 | from torchaudio import functional as TF
5 | import torch
6 | import librosa
7 | import pandas as pd
8 | 
9 | tr_collate_fn = None
10 | val_collate_fn = None
11 | 
12 | 
13 | 
14 | 
15 | 
16 | import ast
17 | 
18 | def upsample(df, cfg):
19 |     new_train = []
20 |     for species, sub_df in df.groupby('primary_label'):
21 |         if len(sub_df) < cfg.resample_train:
22 |             n = np.ceil(cfg.resample_train/len(sub_df)).astype(int)
23 |             sub_df = pd.concat([sub_df] * n)
24 |         new_train += [sub_df]
25 |     new_train = pd.concat(new_train).reset_index(drop=True)
26 |     return new_train
27 | 
28 | class CustomDataset(Dataset):
29 |     def __init__(self, train, cfg, aug, mode='train'):
30 |         self.cfg = cfg
31 |         self.mode = mode
32 |         if self.mode == 'train':
33 | 
self.df = upsample(train.copy(), cfg) 34 | else: 35 | self.df = train.copy() 36 | 37 | if self.mode == 'test': 38 | self.data_folder = cfg.test_data_folder 39 | self.data_folder2 = cfg.test_data_folder 40 | self.duration = cfg.test_duration 41 | self.suffix = cfg.test_suffix 42 | else: 43 | self.data_folder = cfg.data_folder 44 | self.data_folder2 = cfg.data_folder2 45 | self.duration = cfg.duration 46 | self.suffix = cfg.suffix 47 | 48 | self.istrain = mode=='train' 49 | 50 | self.filename = train.filename.values 51 | 52 | 53 | #self.primary_label = train.primary_label.values 54 | if 'primary_label' in train.columns: 55 | if 'first_species' not in train.columns: 56 | train['first_species'] = train['primary_label'].values 57 | if 'last_species' not in train.columns: 58 | train['last_species'] = train['primary_label'].values 59 | 60 | 61 | 62 | self.secondary_labels = [ast.literal_eval(item) for item in train.secondary_labels.values] 63 | self.first_species = train.first_species.values 64 | self.last_species = train.last_species.values 65 | else: 66 | self.secondary_labels = np.array([cfg.birds[0]] * train.shape[0]) 67 | self.first_species = np.array([cfg.birds[0]] * train.shape[0]) 68 | self.last_species = np.array([cfg.birds[0]] * train.shape[0]) 69 | 70 | if self.mode == 'test': 71 | self.test_parts = self.duration // 5 72 | 73 | 74 | 75 | def __len__(self): 76 | return len(self.filename) 77 | 78 | def get_audio(self, idx): 79 | filename = self.filename[idx] 80 | duration = self.cfg.sr * self.duration 81 | if self.istrain: 82 | first = np.random.rand() < 0.5 83 | audio = self.load_audio(filename, first, True, self.cfg) 84 | if len(audio) < duration: 85 | pad_length = np.random.randint(0, duration - len(audio) + 1) 86 | audio = np.pad(audio, ((pad_length, duration - len(audio) - pad_length),), mode='constant') 87 | else: 88 | start = np.random.randint(0, len(audio) - duration + 1) 89 | audio = audio[start : start + duration] 90 | 91 | audio = self.cfg.np_audio_transforms(audio) 92 | audio = self.cfg.am_audio_transforms(audio,sample_rate=self.cfg.sr) 93 | else: 94 | audio = self.load_audio(filename, True, False, self.cfg) 95 | audio = audio[:duration] 96 | if len(audio) < duration: 97 | pad_length = (duration - len(audio)) // 2 98 | audio = np.pad(audio, ((pad_length, duration - len(audio) - pad_length),), mode='constant') 99 | return audio 100 | 101 | def __getitem__(self, idx): 102 | audio = self.get_audio(idx) 103 | targets = np.zeros(len(self.cfg.labels), dtype=np.float32) 104 | targets[self.cfg.targets[self.first_species[idx]]] = 1.0 105 | targets[self.cfg.targets[self.last_species[idx]]] = 1.0 106 | secondary_mask = np.ones(len(self.cfg.labels), dtype=np.float32) 107 | secondary_labels = self.secondary_labels[idx] 108 | if len(secondary_labels) > 0: 109 | for label in secondary_labels: 110 | if label in self.cfg.targets: 111 | targets[self.cfg.targets[label]] = self.cfg.label_secondary 112 | if self.istrain and self.cfg.other_samples: 113 | num_samples = np.random.randint(0, self.cfg.other_samples + 1) 114 | for _ in range(num_samples): 115 | other_idx = np.random.randint(len(self.filename)) 116 | other_audio = self.get_audio(other_idx) 117 | weight = 0.2 + 0.8 * np.random.rand() 118 | audio += weight * other_audio 119 | targets[self.cfg.targets[self.first_species[other_idx]]] = 1.0 120 | targets[self.cfg.targets[self.last_species[other_idx]]] = 1.0 121 | secondary_labels = self.secondary_labels[other_idx] 122 | if len(secondary_labels) > 0: 123 | for label in secondary_labels: 124 | 
if label in self.cfg.targets:
125 |                             targets[self.cfg.targets[label]] = self.cfg.label_secondary
        # note: the mask below stays all ones in this dataset; secondary birds are
        # written into `targets` as soft values (cfg.label_secondary) rather than masked out
126 |         secondary_mask = np.maximum(secondary_mask, targets)
127 | 
128 |         wav_tensor = torch.from_numpy(audio)
129 | 
130 |         wav_tensor = TF.resample(wav_tensor, 32000, 16000, resampling_method="sinc_interp_hann")  # downsample 32 kHz -> 16 kHz
131 |         if self.mode == 'test':
132 |             # cut the long soundscape into 5 s parts
133 |             wav_tensor = wav_tensor.reshape(self.test_parts,wav_tensor.shape[0]//self.test_parts)
134 | 
135 |         out = {
136 |             'input' : wav_tensor,
137 |             'targets' : torch.from_numpy(targets),
138 |             'secondary_mask' : torch.from_numpy(secondary_mask),  # tensor, consistent with ds_pl_1.py
139 |         }
140 |         return out
141 | 
142 |     def load_audio(self, filename, first, istrain, cfg):
143 |         f_id = filename.split('.')[0]
144 |         if istrain:
145 |             max_duration = int((self.duration + cfg.max_shift) * cfg.sr)
146 |         else:
147 |             max_duration = self.duration * cfg.sr
148 |         if first:
149 |             fp = f'{self.data_folder}/{f_id}{self.suffix}'
150 |             if self.suffix == '.wav':  # self.suffix is the active folder's extension (cfg.test_suffix in test mode)
151 |                 audio, rate = torchaudio.load(fp)
152 |                 audio = audio[0].numpy()
153 |             elif self.suffix in ['.ogg','.flac']:
154 |                 audio = librosa.load(fp, sr=cfg.sr)[0].astype(np.float32)
155 |             else:
156 |                 try:
157 |                     audio = np.load(fp)
158 |                 except Exception as e:
159 |                     print(e, fp)
                    raise  # `audio` would be unbound below otherwise
160 |             audio = audio[:max_duration]
161 |         else:
162 |             fp = f'{self.data_folder2}/{f_id}{self.suffix}'
163 |             if self.suffix in ['.wav','.ogg']:
164 |                 audio, rate = torchaudio.load(fp)
165 |                 audio = audio[0].numpy()
166 |             else:
167 | 
168 |                 # filepath = filepath / f"first10_{fname}.npy"
169 |                 try:
170 |                     audio = np.load(fp)
171 |                 except Exception as e:
172 |                     print(e, fp)
                        raise  # `audio` would be unbound below otherwise
173 |             audio = audio[-max_duration:]
174 |         return audio
175 | 
176 | def batch_to_device(batch, device):
177 |     return {k:batch[k].to(device, non_blocking=True) for k in batch.keys() if k not in []}
178 | 
--------------------------------------------------------------------------------
/configs/cfg_2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from importlib import import_module
4 | import platform
5 | import json
6 | import numpy as np
7 | import pandas as pd
8 | import glob
9 | 
10 | from default_config import basic_cfg as cfg
11 | 
12 | 
13 | cfg.debug = True
14 | 
15 | # paths
16 | 
17 | cfg.name = os.path.basename(__file__).split(".")[0]
18 | cfg.output_dir = f"/mount/birdclef24/models/{os.path.basename(__file__).split('.')[0]}"
19 | 
20 | 
21 | 
22 | cfg.data_folder = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_10_v2/"
23 | cfg.data_folder2 = f"/mount/birdclef24/data/birdclef-2024/train_audio_npy_last10_v2/"
24 | cfg.train_df = '/mount/birdclef24/data/train_folded_v3c.csv'
25 | cfg.test_df = '/mount/birdclef24/data/birdclef-2024/test.csv'
26 | cfg.test_gt = '/mount/birdclef24/data/birdclef-2024/test_fake_gt.csv'
27 | cfg.test_data_folder = '/mount/birdclef24/data/birdclef-2024/unlabeled_soundscapes/'
28 | cfg.test_suffix = '.ogg'
29 | cfg.test_duration = 240
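# Note: cfg.data_folder / cfg.data_folder2 above hold the first and the last 10 s
# of every training recording (see scripts/convert_np_first_last_10_v2.py);
# data/ds_2.py picks one of the two at random per training sample. With
# test_duration = 240, each unlabeled soundscape is scored as 240 // 5 = 48
# five-second windows.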
30 | cfg.birds = ['asbfly', 'ashdro1', 'ashpri1', 'ashwoo2', 'asikoe2', 'asiope1',
31 |              'aspfly1', 'aspswi1', 'barfly1', 'barswa', 'bcnher', 'bkcbul1',
32 |              'bkrfla1', 'bkskit1', 'bkwsti', 'bladro1', 'blaeag1', 'blakit1',
33 |              'blhori1', 'blnmon1', 'blrwar1', 'bncwoo3', 'brakit1', 'brasta1',
34 |              'brcful1', 'brfowl1', 'brnhao1', 'brnshr', 'brodro1', 'brwjac1',
35 |              'brwowl1', 'btbeat1', 'bwfshr1', 'categr', 'chbeat1', 'cohcuc1',
36 |              'comfla1', 'comgre', 'comior1', 'comkin1', 'commoo3', 'commyn',
37 |              'compea', 'comros', 'comsan', 'comtai1', 'copbar1', 'crbsun2',
38 |              'cregos1', 'crfbar1', 'crseag1', 'dafbab1', 'darter2', 'eaywag1',
39 |              'emedov2', 'eucdov', 'eurbla2', 'eurcoo', 'forwag1', 'gargan',
40 |              'gloibi', 'goflea1', 'graher1', 'grbeat1', 'grecou1', 'greegr',
41 |              'grefla1', 'grehor1', 'grejun2', 'grenig1', 'grewar3', 'grnsan',
42 |              'grnwar1', 'grtdro1', 'gryfra', 'grynig2', 'grywag', 'gybpri1',
43 |              'gyhcaf1', 'heswoo1', 'hoopoe', 'houcro1', 'houspa', 'inbrob1',
44 |              'indpit1', 'indrob1', 'indrol2', 'indtit1', 'ingori1', 'inpher1',
45 |              'insbab1', 'insowl1', 'integr', 'isbduc1', 'jerbus2', 'junbab2',
46 |              'junmyn1', 'junowl1', 'kenplo1', 'kerlau2', 'labcro1', 'laudov1',
47 |              'lblwar1', 'lesyel1', 'lewduc1', 'lirplo', 'litegr', 'litgre1',
48 |              'litspi1', 'litswi1', 'lobsun2', 'maghor2', 'malpar1', 'maltro1',
49 |              'malwoo1', 'marsan', 'mawthr1', 'moipig1', 'nilfly2', 'niwpig1',
50 |              'nutman', 'orihob2', 'oripip1', 'pabflo1', 'paisto1', 'piebus1',
51 |              'piekin1', 'placuc3', 'plaflo1', 'plapri1', 'plhpar1', 'pomgrp2',
52 |              'purher1', 'pursun3', 'pursun4', 'purswa3', 'putbab1', 'redspu1',
53 |              'rerswa1', 'revbul', 'rewbul', 'rewlap1', 'rocpig', 'rorpar',
54 |              'rossta2', 'rufbab3', 'ruftre2', 'rufwoo2', 'rutfly6', 'sbeowl1',
55 |              'scamin3', 'shikra1', 'smamin1', 'sohmyn1', 'spepic1', 'spodov',
56 |              'spoowl1', 'sqtbul1', 'stbkin1', 'sttwoo1', 'thbwar1', 'tibfly3',
57 |              'tilwar1', 'vefnut1', 'vehpar1', 'wbbfly1', 'wemhar1', 'whbbul2',
58 |              'whbsho3', 'whbtre1', 'whbwag1', 'whbwat1', 'whbwoo2', 'whcbar1',
59 |              'whiter2', 'whrmun', 'whtkin2', 'woosan', 'wynlau1', 'yebbab1',
60 |              'yebbul3', 'zitcis1']
61 | 
62 | cfg.test_epochs = 0
63 | 
64 | # stages
65 | cfg.test = True
66 | # cfg.test_data_folder = cfg.data_folder
67 | cfg.train = True
68 | cfg.train_val = False
69 | cfg.eval_epochs = 1
70 | 
71 | #logging
72 | cfg.neptune_project = 'XXX'
73 | cfg.neptune_connection_mode = "async"
74 | cfg.tags = "base"
75 | 
76 | #model
77 | cfg.model = "mdl_2"
78 | cfg.backbone = 'tf_efficientnetv2_s_in21k'
79 | cfg.pretrained = True
80 | cfg.in_chans = 1
81 | 
82 | cfg.mixup = True
83 | cfg.mixup2 = True
84 | cfg.mixup_prob = 0.7
85 | cfg.mixup_double = 0.5
86 | cfg.mixup2_prob = 0.15
87 | cfg.mix_beta = 5
88 | cfg.mix_beta2 = 2
89 | 
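# Mixup schedule implied by the settings above (see Mixup/Mixup2 in models/mdl_2.py):
# with mixup_prob = 0.7 and mixup_double = 0.5, a batch stays unmixed with
# probability 0.3, is summed with one permuted copy with probability
# 0.7 * 0.5 = 0.35, and with two permuted copies with probability 0.35; targets
# are summed and clamped to [0, 1]. Mixup2 then beta-blends whole spectrograms
# with probability mixup2_prob = 0.15, using Beta(2, 2) coefficients.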
122 | cfg.n_classes = len(cfg.birds)
123 | cfg.min_rating = 0
124 | cfg.sample_rate = 32000
125 | cfg.wav_max_len = 10
126 | cfg.wav_crop_len = 5
127 | cfg.infer_duration = 5
128 | cfg.duration = 5
129 | cfg.img_size = 384
130 | hop_length = cfg.infer_duration * 32000 // (cfg.img_size - 1)
131 | cfg.mel_spec_args = dict(sample_rate=cfg.sample_rate,
132 |                          n_fft=2048,
133 |                          win_length=2048,
134 |                          hop_length=hop_length,
135 |                          f_min=0,
136 |                          f_max=16000,
137 |                          pad=0,
138 |                          n_mels=128,
139 |                          power=2.,
140 |                          center=True,
141 |                          pad_mode="constant",
142 |                          norm="slaney",
143 |                          mel_scale="slaney",
144 |                          )
145 | cfg.db_args = dict(stype="power", top_db=80)
146 | cfg.norm_by = 80
147 | cfg.gem_p_trainable = True
148 | 
149 | 
150 | # OPTIMIZATION & SCHEDULE
151 | cfg.fold = -1
152 | cfg.epochs = 50
153 | cfg.lr = 0.001
154 | cfg.optimizer = "Adam"
155 | cfg.weight_decay = 1e-3
156 | cfg.clip_grad = 5000
157 | cfg.warmup = 1
158 | cfg.batch_size = 96
159 | cfg.batch_size_test = 8
160 | cfg.mixed_precision = False # True
161 | cfg.pin_memory = False
162 | cfg.grad_accumulation = 1.
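# Note on the mel settings above: hop_length = 5 * 32000 // (384 - 1) = 417, so a
# 5 s clip at 32 kHz yields floor(160000 / 417) + 1 = 384 time frames (center=True),
# matching cfg.img_size. A minimal sanity check (sketch, assuming torchaudio is
# installed; run standalone rather than as part of this config):
#
#     import torch, torchaudio
#     mel = torchaudio.transforms.MelSpectrogram(**cfg.mel_spec_args)
#     n_frames = mel(torch.zeros(cfg.sample_rate * cfg.infer_duration)).shape[-1]
#     assert n_frames == cfg.img_size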
163 | cfg.num_workers = 8 164 | 165 | 166 | # DATASET 167 | cfg.dataset = "ds_2" 168 | cfg.suffix = '.npy' 169 | cfg.normalization = 'channel' 170 | 171 | #EVAL 172 | cfg.calc_metric = False 173 | cfg.simple_eval = False 174 | # augs & tta 175 | 176 | # Postprocess 177 | cfg.post_process_pipeline = "pp_dummy" 178 | cfg.metric = "metric_1" 179 | # augs & tta 180 | 181 | #Saving 182 | cfg.save_weights_only = True 183 | cfg.save_only_last_ckpt = False 184 | 185 | 186 | cfg.train_aug = [None] 187 | cfg.val_aug = [None] 188 | -------------------------------------------------------------------------------- /scripts/create_train_folded_v3c.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import glob 9 | import numpy as np 10 | 11 | 12 | # In[2]: 13 | 14 | 15 | train = pd.read_csv('/mount/birdclef24/data/birdclef-2024/train_metadata.csv') 16 | train 17 | 18 | 19 | # In[3]: 20 | 21 | 22 | train['species'] = [filename.split('/')[0] for filename in train.filename] 23 | train['record'] = [filename.split('/')[1] for filename in train.filename] 24 | train['secondary_labels'] = [eval(sls) for sls in train['secondary_labels']] 25 | 26 | 27 | # In[4]: 28 | 29 | 30 | df = train.groupby('record').size() 31 | df = df[df > 1] 32 | df 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | df = train.groupby('record').agg({'species' : ['first', 'last'], 39 | 'secondary_labels': ['first', 'last'], 40 | }) 41 | df.columns = ['first_species', 'last_species', 'first_secondary', 'last_secondary'] 42 | df = df.reset_index() 43 | df 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | train = train.merge(df[['record', 'first_species', 'last_species',]], 50 | how='left', 51 | on='record') 52 | 53 | 54 | # In[7]: 55 | 56 | 57 | dups = { 58 | ('asbfly/XC724266.ogg', 'asbfly/XC724148.ogg'), 59 | ('barswa/XC575749.ogg', 'barswa/XC575747.ogg'), 60 | ('bcnher/XC669544.ogg', 'bcnher/XC669542.ogg'), 61 | ('bkskit1/XC350251.ogg', 'bkskit1/XC350249.ogg'), 62 | ('blhori1/XC417215.ogg', 'blhori1/XC417133.ogg'), 63 | ('blhori1/XC743616.ogg', 'blhori1/XC537503.ogg'), 64 | ('blrwar1/XC662286.ogg', 'blrwar1/XC662285.ogg'), 65 | ('brakit1/XC743675.ogg', 'brakit1/XC537471.ogg'), 66 | ('brcful1/XC197746.ogg', 'brcful1/XC157971.ogg'), 67 | ('brnshr/XC510751.ogg', 'brnshr/XC510750.ogg'), 68 | ('btbeat1/XC665307.ogg', 'btbeat1/XC513403.ogg'), 69 | ('btbeat1/XC743618.ogg', 'btbeat1/XC683300.ogg'), 70 | #('btbeat1/XC743619.ogg', 'btbeat1/XC683300.ogg'), 71 | ('btbeat1/XC743618.ogg', 'btbeat1/XC743619.ogg'), 72 | ('categr/XC787914.ogg', 'categr/XC438523.ogg'), 73 | ('cohcuc1/XC253418.ogg', 'cohcuc1/XC241127.ogg'), 74 | ('cohcuc1/XC423422.ogg', 'cohcuc1/XC423419.ogg'), 75 | ('comgre/XC202776.ogg', 'comgre/XC192404.ogg'), 76 | ('comgre/XC602468.ogg', 'comgre/XC175341.ogg'), 77 | ('comgre/XC64628.ogg', 'comgre/XC58586.ogg'), 78 | ('comior1/XC305930.ogg', 'comior1/XC303819.ogg'), 79 | ('comkin1/XC207123.ogg', 'comior1/XC207062.ogg'), 80 | ('comkin1/XC691421.ogg', 'comkin1/XC690633.ogg'), 81 | ('commyn/XC577887.ogg', 'commyn/XC577886.ogg'), 82 | ('commyn/XC652903.ogg', 'commyn/XC652901.ogg'), 83 | ('compea/XC665320.ogg', 'compea/XC644022.ogg'), 84 | ('comsan/XC385909.ogg', 'comsan/XC385908.ogg'), 85 | ('comsan/XC643721.ogg', 'comsan/XC642698.ogg'), 86 | ('comsan/XC667807.ogg', 'comsan/XC667806.ogg'), 87 | ('comtai1/XC126749.ogg', 'comtai1/XC122978.ogg'), 88 | ('comtai1/XC305210.ogg', 'comtai1/XC304811.ogg'), 89 | ('comtai1/XC542375.ogg', 
'comtai1/XC540351.ogg'), 90 | ('comtai1/XC542379.ogg', 'comtai1/XC540352.ogg'), 91 | ('crfbar1/XC615780.ogg', 'crfbar1/XC615778.ogg'), 92 | ('dafbab1/XC188307.ogg', 'dafbab1/XC187059.ogg'), 93 | ('dafbab1/XC188308.ogg', 'dafbab1/XC187068.ogg'), 94 | ('dafbab1/XC188309.ogg', 'dafbab1/XC187069.ogg'), 95 | ('dafbab1/XC197745.ogg', 'dafbab1/XC157972.ogg'), 96 | ('eaywag1/XC527600.ogg', 'eaywag1/XC527598.ogg'), 97 | ('eucdov/XC355153.ogg', 'eucdov/XC355152.ogg'), 98 | ('eucdov/XC360303.ogg', 'eucdov/XC347428.ogg'), 99 | ('eucdov/XC365606.ogg', 'eucdov/XC124694.ogg'), 100 | ('eucdov/XC371039.ogg', 'eucdov/XC368596.ogg'), 101 | ('eucdov/XC747422.ogg', 'eucdov/XC747408.ogg'), 102 | ('eucdov/XC789608.ogg', 'eucdov/XC788267.ogg'), 103 | ('goflea1/XC163901.ogg', 'bladro1/XC163901.ogg'), 104 | ('goflea1/XC208794.ogg', 'bladro1/XC208794.ogg'), 105 | ('goflea1/XC208795.ogg', 'bladro1/XC208795.ogg'), 106 | ('goflea1/XC209203.ogg', 'bladro1/XC209203.ogg'), 107 | ('goflea1/XC209549.ogg', 'bladro1/XC209549.ogg'), 108 | ('goflea1/XC209564.ogg', 'bladro1/XC209564.ogg'), 109 | ('graher1/XC357552.ogg', 'graher1/XC357551.ogg'), 110 | ('graher1/XC590235.ogg', 'graher1/XC590144.ogg'), 111 | ('grbeat1/XC304004.ogg', 'grbeat1/XC303999.ogg'), 112 | ('grecou1/XC365426.ogg', 'grecou1/XC365425.ogg'), 113 | ('greegr/XC247286.ogg', 'categr/XC197438.ogg'), 114 | ('grewar3/XC743681.ogg', 'grewar3/XC537475.ogg'), 115 | ('grnwar1/XC197744.ogg', 'grnwar1/XC157973.ogg'), 116 | ('grtdro1/XC651708.ogg', 'grtdro1/XC613192.ogg'), 117 | ('grywag/XC459760.ogg', 'grywag/XC457124.ogg'), 118 | ('grywag/XC575903.ogg', 'grywag/XC575901.ogg'), 119 | ('grywag/XC650696.ogg', 'grywag/XC592019.ogg'), 120 | ('grywag/XC690448.ogg', 'grywag/XC655063.ogg'), 121 | ('grywag/XC745653.ogg', 'grywag/XC745650.ogg'), 122 | ('grywag/XC812496.ogg', 'grywag/XC812495.ogg'), 123 | ('heswoo1/XC357155.ogg', 'heswoo1/XC357149.ogg'), 124 | ('heswoo1/XC744698.ogg', 'heswoo1/XC665715.ogg'), 125 | ('hoopoe/XC631301.ogg', 'hoopoe/XC365530.ogg'), 126 | ('hoopoe/XC631304.ogg', 'hoopoe/XC252584.ogg'), 127 | ('houcro1/XC744704.ogg', 'houcro1/XC683047.ogg'), 128 | ('houspa/XC326675.ogg', 'houspa/XC326674.ogg'), 129 | ('inbrob1/XC744708.ogg', 'inbrob1/XC744706.ogg'), 130 | ('insowl1/XC305214.ogg', 'insowl1/XC301142.ogg'), 131 | ('junbab2/XC282587.ogg', 'junbab2/XC282586.ogg'), 132 | ('labcro1/XC267645.ogg', 'labcro1/XC265731.ogg'), 133 | ('labcro1/XC345836.ogg', 'labcro1/XC312582.ogg'), 134 | ('labcro1/XC37773.ogg', 'labcro1/XC19736.ogg'), 135 | ('labcro1/XC447036.ogg', 'houcro1/XC447036.ogg'), 136 | ('labcro1/XC823514.ogg', 'gybpri1/XC823527.ogg'), 137 | ('laudov1/XC185511.ogg', 'grewar3/XC185505.ogg'), 138 | ('laudov1/XC405375.ogg', 'laudov1/XC405374.ogg'), 139 | ('laudov1/XC514027.ogg', 'eucdov/XC514027.ogg'), 140 | ('lblwar1/XC197743.ogg', 'lblwar1/XC157974.ogg'), 141 | ('lewduc1/XC261506.ogg', 'lewduc1/XC254813.ogg'), 142 | ('litegr/XC403621.ogg', 'bcnher/XC403621.ogg'), 143 | ('litegr/XC535540.ogg', 'litegr/XC448898.ogg'), 144 | ('litegr/XC535552.ogg', 'litegr/XC447850.ogg'), 145 | ('litgre1/XC630775.ogg', 'litgre1/XC630560.ogg'), 146 | ('litgre1/XC776082.ogg', 'litgre1/XC663244.ogg'), 147 | ('litspi1/XC674522.ogg', 'comtai1/XC674522.ogg'), 148 | ('litspi1/XC722435.ogg', 'litspi1/XC721636.ogg'), 149 | ('litspi1/XC722436.ogg', 'litspi1/XC721637.ogg'), 150 | ('litswi1/XC443070.ogg', 'litswi1/XC440301.ogg'), 151 | ('lobsun2/XC197742.ogg', 'lobsun2/XC157975.ogg'), 152 | ('maghor2/XC197740.ogg', 'maghor2/XC157978.ogg'), 153 | ('maghor2/XC786588.ogg', 
'maghor2/XC786587.ogg'), 154 | ('malpar1/XC197770.ogg', 'malpar1/XC157976.ogg'), 155 | ('marsan/XC383290.ogg', 'marsan/XC383288.ogg'), 156 | ('marsan/XC733175.ogg', 'marsan/XC716673.ogg'), 157 | ('mawthr1/XC455222.ogg', 'mawthr1/XC455211.ogg'), 158 | ('orihob2/XC557991.ogg', 'orihob2/XC557293.ogg'), 159 | ('piebus1/XC165050.ogg', 'piebus1/XC122395.ogg'), 160 | ('piebus1/XC814459.ogg', 'piebus1/XC792272.ogg'), 161 | ('placuc3/XC490344.ogg', 'placuc3/XC486683.ogg'), 162 | ('placuc3/XC572952.ogg', 'placuc3/XC572950.ogg'), 163 | ('plaflo1/XC615781.ogg', 'plaflo1/XC614946.ogg'), 164 | ('purher1/XC467373.ogg', 'graher1/XC467373.ogg'), 165 | ('purher1/XC827209.ogg', 'purher1/XC827207.ogg'), 166 | ('pursun3/XC268375.ogg', 'comtai1/XC241382.ogg'), 167 | ('pursun4/XC514853.ogg', 'pursun4/XC514852.ogg'), 168 | ('putbab1/XC574864.ogg', 'brcful1/XC574864.ogg'), 169 | ('rewbul/XC306398.ogg', 'bkcbul1/XC306398.ogg'), 170 | ('rewbul/XC713308.ogg', 'asbfly/XC713467.ogg'), 171 | ('rewlap1/XC733007.ogg', 'rewlap1/XC732874.ogg'), 172 | ('rorpar/XC199488.ogg', 'rorpar/XC199339.ogg'), 173 | ('rorpar/XC402325.ogg', 'comior1/XC402326.ogg'), 174 | ('rorpar/XC516404.ogg', 'rorpar/XC516402.ogg'), 175 | ('sbeowl1/XC522123.ogg', 'brfowl1/XC522123.ogg'), 176 | ('sohmyn1/XC744700.ogg', 'sohmyn1/XC743682.ogg'), 177 | ('spepic1/XC804432.ogg', 'spepic1/XC804431.ogg'), 178 | ('spodov/XC163930.ogg', 'bladro1/XC163901.ogg'), 179 | ('spodov/XC163930.ogg', 'goflea1/XC163901.ogg'), 180 | ('spoowl1/XC591485.ogg', 'spoowl1/XC591177.ogg'), 181 | ('stbkin1/XC266782.ogg', 'stbkin1/XC266682.ogg'), 182 | ('stbkin1/XC360661.ogg', 'stbkin1/XC199815.ogg'), 183 | ('stbkin1/XC406140.ogg', 'stbkin1/XC406138.ogg'), 184 | ('vefnut1/XC197738.ogg', 'vefnut1/XC157979.ogg'), 185 | ('vefnut1/XC293526.ogg', 'vefnut1/XC289785.ogg'), 186 | ('wemhar1/XC581045.ogg', 'comsan/XC581045.ogg'), 187 | ('wemhar1/XC590355.ogg', 'wemhar1/XC590354.ogg'), 188 | ('whbbul2/XC335671.ogg', 'whbbul2/XC335670.ogg'), 189 | ('whbsho3/XC856465.ogg', 'whbsho3/XC856463.ogg'), 190 | #('whbsho3/XC856468.ogg', 'whbsho3/XC856463.ogg'), 191 | ('whbsho3/XC856465.ogg', 'whbsho3/XC856468.ogg'), 192 | ('whbwat1/XC840073.ogg', 'whbwat1/XC840071.ogg'), 193 | ('whbwoo2/XC239509.ogg', 'rufwoo2/XC239509.ogg'), 194 | ('whcbar1/XC659329.ogg', 'insowl1/XC659329.ogg'), 195 | ('whiter2/XC265271.ogg', 'whiter2/XC265267.ogg'), 196 | ('whtkin2/XC197737.ogg', 'whtkin2/XC157981.ogg'), 197 | ('whtkin2/XC430267.ogg', 'whtkin2/XC430256.ogg'), 198 | ('whtkin2/XC503389.ogg', 'comior1/XC503389.ogg'), 199 | ('whtkin2/XC540094.ogg', 'whtkin2/XC540087.ogg'), 200 | ('woosan/XC184466.ogg', 'marsan/XC184466.ogg'), 201 | ('woosan/XC545316.ogg', 'woosan/XC476064.ogg'), 202 | ('woosan/XC587076.ogg', 'woosan/XC578599.ogg'), 203 | ('woosan/XC742927.ogg', 'woosan/XC740798.ogg'), 204 | ('woosan/XC825766.ogg', 'grnsan/XC825765.ogg'), 205 | ('zitcis1/XC303866.ogg', 'zitcis1/XC302781.ogg'), 206 | } 207 | 208 | 209 | # In[8]: 210 | 211 | 212 | def to_remove(r0, r1): 213 | name0 = r0.split('/')[0] 214 | name1 = r1.split('/')[0] 215 | return name0 == name1 216 | 217 | 218 | # In[9]: 219 | 220 | 221 | dups = [(r0,r1) for (r0,r1) in dups if to_remove(r0, r1)] 222 | len(dups) 223 | 224 | 225 | # In[10]: 226 | 227 | 228 | to_remove = set(r1 for r0,r1 in dups) 229 | len(to_remove) 230 | 231 | 232 | # In[11]: 233 | 234 | 235 | train = train[~train.filename.isin(to_remove)].reset_index(drop=True) 236 | 237 | 238 | # In[12]: 239 | 240 | 241 | train = train.groupby('record').first().reset_index() 242 | 243 | 244 | # In[13]: 245 | 
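# The cells below assign cross-validation folds: a separate 4-fold KFold is run
# inside each species group, which spreads every species evenly across folds
# (per-class stratification). A roughly equivalent sketch using sklearn's
# StratifiedKFold on the species column (an alternative, not what this script does):
#
#     from sklearn.model_selection import StratifiedKFold
#     skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
#     train['fold'] = -1
#     for fold, (_, valid_idx) in enumerate(skf.split(train, train['species'])):
#         train.loc[valid_idx, 'fold'] = fold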
246 | 
247 | from sklearn.model_selection import KFold, GroupKFold
248 | 
249 | 
250 | # In[14]:
251 | 
252 | 
253 | new_train = []
254 | 
255 | kf = KFold(n_splits=4, shuffle=True, random_state=0)
256 | for species, df in train.groupby('species'):
257 |     df = df.reset_index(drop=True)
258 |     df['fold'] = -1
259 |     for fold, (train_index, valid_index) in enumerate(kf.split(df, df.primary_label)):
260 |         df.loc[valid_index, "fold"] = int(fold)
261 |     new_train.append(df)
262 | new_train = pd.concat(new_train).reset_index(drop=True)
263 | new_train.fold.value_counts()
264 | 
265 | 
266 | # In[15]:
267 | 
268 | 
269 | train = new_train.copy()
270 | train
271 | 
272 | 
273 | # In[16]:
274 | 
275 | 
276 | train.to_csv('/mount/birdclef24/data/train_folded_v3c.csv',index=False)
277 | 
278 | 
279 | # In[ ]:
280 | 
281 | 
282 | 
283 | 
284 | 
--------------------------------------------------------------------------------
/data/ds_pl_1.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
2 | import numpy as np
3 | import torchaudio
4 | import torch
5 | import librosa
6 | import pandas as pd
7 | import ast
8 | 
9 | 
10 | def collate_fn(batch):  # concatenates the per-item inner batches along dim 0
11 | 
12 |     new_d = {}
14 |     for k in batch[0].keys():
15 |         new_d[k] = torch.cat([b[k] for b in batch])
16 | 
17 |     return new_d
18 | 
19 | tr_collate_fn = collate_fn
20 | val_collate_fn = None
21 | 
22 | 
23 | 
24 | def upsample(df, cfg):
25 |     new_train = []
26 |     for species, sub_df in df.groupby('primary_label'):
27 |         if len(sub_df) < cfg.resample_train:  # upsample only species with fewer than cfg.resample_train clips
28 |             n = np.ceil(cfg.resample_train/len(sub_df)).astype(int)
29 |             sub_df = pd.concat([sub_df] * n)
30 |         new_train += [sub_df]
31 |     new_train = pd.concat(new_train).reset_index(drop=True)
32 |     return new_train
33 | 
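# Worked example for upsample() above (illustrative numbers only; the real
# cfg.resample_train lives in the config): with resample_train = 300, a species
# with 40 clips is repeated ceil(300 / 40) = 8 times, giving 320 rows, while a
# species with >= 300 clips is kept unchanged.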
34 | class CustomDataset(Dataset):
35 |     def __init__(self, train, cfg, aug, mode='train'):
36 |         self.cfg = cfg
37 |         self.mode = mode
38 |         if self.mode == 'train':
39 |             self.df = upsample(train.copy(), cfg)
            train = self.df  # sample from the upsampled frame so self.w below matches the filename/label arrays
40 | 
41 |             pl_df = pd.read_csv(cfg.pl_df)
42 |             pl_df['file_id'] = pl_df['row_id'].apply(lambda x: '_'.join(x.split('_')[:-1]))
43 |             pl_df['sec'] = pl_df['row_id'].apply(lambda x: int(x.split('_')[-1]))
44 |             vcs = pl_df['file_id'].value_counts()
45 |             file_id60 = vcs[vcs == cfg.parts].index.values
46 |             pl_df = pl_df[pl_df['file_id'].isin(file_id60)]
47 |             self.pl_files = pl_df['file_id'].unique()
48 |             self.pl_df = pl_df.groupby('file_id')
49 |             self.w = torch.ones(len(self.df))/len(self.df)
50 | 
51 |         else:
52 |             self.df = train.copy()
53 |             self.pl_df = None
54 | 
55 |         if self.mode == 'test':
56 |             self.data_folder = cfg.test_data_folder
57 |             self.data_folder2 = cfg.test_data_folder
58 |             self.duration = cfg.test_duration
59 |             self.suffix = cfg.test_suffix
60 |         else:
61 |             self.data_folder = cfg.data_folder
62 |             self.data_folder2 = cfg.data_folder2
63 |             self.duration = cfg.duration
64 |             self.suffix = cfg.suffix
65 | 
66 |         self.istrain = mode=='train'
67 | 
68 |         self.filename = train.filename.values
69 | 
70 | 
71 |         #self.primary_label = train.primary_label.values
72 |         if 'primary_label' in train.columns:
73 |             if 'first_species' not in train.columns:
74 |                 train['first_species'] = train['primary_label'].values
75 |             if 'last_species' not in train.columns:
76 |                 train['last_species'] = train['primary_label'].values
77 | 
78 | 
79 | 
80 |             self.secondary_labels = [ast.literal_eval(item) for item in train.secondary_labels.values]
81 |             self.first_species = train.first_species.values
82 |             self.last_species = train.last_species.values
83 |         else:
84 |             self.secondary_labels = np.array([cfg.birds[0]] * train.shape[0])
85 |             self.first_species = np.array([cfg.birds[0]] * train.shape[0])
86 |             self.last_species = np.array([cfg.birds[0]] * train.shape[0])
87 | 
88 |         if self.mode == 'test':
89 |             self.test_parts = self.duration // 5
90 | 
91 | 
92 | 
93 |     def __len__(self):
94 |         if self.mode == 'train':
95 |             return len(self.filename) // 64  # each item yields an inner batch of 64 clips
96 |         else:
97 |             return len(self.filename)
98 | 
99 |     def get_audio(self, idx):
100 |         filename = self.filename[idx]
101 |         duration = self.cfg.sr * self.duration
102 |         if self.istrain:
103 |             first = np.random.rand() < 0.5
104 |             audio = self.load_audio(filename, first, True, self.cfg)
105 |             if len(audio) < duration:
106 |                 pad_length = np.random.randint(0, duration - len(audio) + 1)
107 |                 audio = np.pad(audio, ((pad_length, duration - len(audio) - pad_length),), mode='constant')
108 |             else:
109 |                 start = np.random.randint(0, len(audio) - duration + 1)
110 |                 audio = audio[start : start + duration]
111 |         else:
112 |             audio = self.load_audio(filename, True, False, self.cfg)
113 |             audio = audio[:duration]
114 |             if len(audio) < duration:
115 |                 pad_length = (duration - len(audio)) // 2
116 |                 audio = np.pad(audio, ((pad_length, duration - len(audio) - pad_length),), mode='constant')
117 |         return audio
118 | 
119 |     def __getitem__(self, idx):
120 | 
121 | 
122 | 
123 | 
124 |         if self.istrain:
125 | 
126 |             inner_bs = 64  # one dataset item is a full inner batch; must match the // 64 in __len__
127 |             idxs = torch.multinomial(self.w,num_samples=inner_bs)
128 |             audio = np.stack([self.get_audio(idx2) for idx2 in idxs])
129 | 
130 |             targets = np.zeros((inner_bs,len(self.cfg.labels)), dtype=np.float32)
131 |             first_species = self.first_species[idxs]
132 |             mask = np.where(self.cfg.labels[None,:] == first_species[:,None])
133 |             targets[mask] = 1
134 | 
135 |             last_species = self.last_species[idxs]
136 |             mask = np.where(self.cfg.labels[None,:] == last_species[:,None])
137 |             targets[mask] = 1
138 | 
139 |             secondary_mask = np.ones((inner_bs,len(self.cfg.labels)), dtype=np.float32)
140 |             secondary_labelss = [self.secondary_labels[j] for j in idxs]
141 |             for i, secondary_labels in enumerate(secondary_labelss):
142 |                 if len(secondary_labels) > 0:
143 |                     for label in secondary_labels:
144 |                         if label in self.cfg.targets:
145 |                             secondary_mask[i,self.cfg.targets[label]] = 0
146 | 
147 |             pl_idx = torch.randint(low=0,high=len(self.pl_files),size=(1,)).item()  # scalar index, so pl_file_id is a single id
148 |             pl_file_id = self.pl_files[pl_idx]
149 |             pl_audio = self.load_pl_audio(pl_file_id,self.cfg)
150 |             pl_target = self.pl_df.get_group(pl_file_id).sort_values('sec')[self.cfg.birds].values.astype(np.float32)
151 |             pl_audio = pl_audio[:5*pl_target.shape[0]*self.cfg.sr]
152 |             if pl_audio.shape[0] < 5*pl_target.shape[0]*self.cfg.sr:
153 |                 pl_audio_padded = np.zeros(5*pl_target.shape[0]*self.cfg.sr, dtype=np.float32)
154 |                 pl_audio_padded[:pl_audio.shape[0]] = pl_audio  # copy the samples and zero-pad the tail
155 |                 pl_audio = pl_audio_padded
156 |             pl_audio = pl_audio.reshape(pl_target.shape[0], -1)
157 |             pl_secondary = np.ones_like(pl_target)
158 | 
159 |             audio = np.concatenate([audio,pl_audio])
160 |             targets = np.concatenate([targets,pl_target])
161 |             secondary_mask = np.concatenate([secondary_mask,pl_secondary])
162 | 
163 |             mixup = (np.random.rand() < self.cfg.mixup_p)
164 |             if mixup:
165 |                 bs = targets.shape[0]
166 |                 perm = torch.randperm(bs)
167 |                 weight = 0.1 ** (self.cfg.db_range * np.random.rand() / 10)
168 |                 audio = audio + weight * audio[perm]
169 |                 secondary_mask = np.minimum(secondary_mask, secondary_mask[perm])
170 |                 targets = np.maximum(targets, targets[perm])
171 |                 secondary_mask =
np.maximum(secondary_mask, targets) 172 | 173 | 174 | # if (torch.rand(1) < cfg.mixup_p): 175 | # pl_idx = torch.randint(len(self.pl_df)) 176 | 177 | # pl_audio = self.load_pl_audio(self.pl_df['row_id'].values[pl_idx], self.cfg) 178 | # pl_target = self.pl_df[self.cfg.birds].values[pl_idx] 179 | # weight = 0.1 ** (self.cfg.db_range * np.random.rand() / 10) 180 | 181 | # audio += weight * pl_audio 182 | # targets = np.maximum(targets, pl_target) 183 | # secondary_mask = torch.maximum(secondary_mask, targets) 184 | 185 | # num_samples = np.random.randint(0, self.cfg.other_samples + 1) 186 | # for _ in range(num_samples): 187 | # other_idx = np.random.randint(len(self.filename)) 188 | # other_audio = self.get_audio(other_idx) 189 | # weight = 0.2 + 0.8 * np.random.rand() 190 | # audio += weight * other_audio 191 | # targets[self.cfg.targets[self.first_species[other_idx]]] = 1.0 192 | # targets[self.cfg.targets[self.last_species[other_idx]]] = 1.0 193 | # secondary_labels = self.secondary_labels[other_idx] 194 | # if len(secondary_labels) > 0: 195 | # for label in secondary_labels: 196 | # if label in self.cfg.targets: 197 | # secondary_mask[self.cfg.targets[label]] = 0 198 | 199 | else: 200 | audio = self.get_audio(idx) 201 | targets = np.zeros(len(self.cfg.labels), dtype=np.float32) 202 | targets[self.cfg.targets[self.first_species[idx]]] = 1.0 203 | targets[self.cfg.targets[self.last_species[idx]]] = 1.0 204 | secondary_mask = np.ones(len(self.cfg.labels), dtype=np.float32) 205 | secondary_labels = self.secondary_labels[idx] 206 | if len(secondary_labels) > 0: 207 | for label in secondary_labels: 208 | if label in self.cfg.targets: 209 | secondary_mask[self.cfg.targets[label]] = 0 210 | 211 | secondary_mask = np.maximum(secondary_mask, targets) 212 | 213 | wav_tensor = torch.from_numpy(audio) 214 | if self.mode == 'test': 215 | #cut 216 | wav_tensor = wav_tensor.reshape(self.test_parts,wav_tensor.shape[0]//self.test_parts) 217 | 218 | out = { 219 | 'input' : wav_tensor, 220 | 'targets' : torch.from_numpy(targets), 221 | 'secondary_mask' : torch.from_numpy(secondary_mask), 222 | } 223 | return out 224 | 225 | def load_audio(self, filename, first, istrain, cfg): 226 | f_id = filename.split('.')[0] 227 | if istrain: 228 | max_duration = int((self.duration + cfg.max_shift) * cfg.sr) 229 | else: 230 | max_duration = self.duration * cfg.sr 231 | if first: 232 | fp = f'{self.data_folder}/{f_id}{self.suffix}' 233 | if self.suffix == '.wav': 234 | audio, rate = torchaudio.load(fp) 235 | audio = audio[0].numpy() 236 | elif self.suffix in ['.ogg','.flac']: 237 | audio = librosa.load(fp, sr=cfg.sr)[0].astype(np.float32) 238 | else: 239 | audio = np.load(fp) 240 | audio = audio[:max_duration] 241 | else: 242 | fp = f'{self.data_folder2}/{f_id}{self.suffix}' 243 | if self.suffix in ['.wav','.ogg']: 244 | audio, rate = torchaudio.load(fp) 245 | audio = audio[0].numpy() 246 | else: 247 | 248 | # filepath = filepath / f"first10_{fname}.npy" 249 | audio = np.load(fp) 250 | audio = audio[-max_duration:] 251 | return audio 252 | 253 | def load_pl_audio(self, filename, cfg): 254 | fp = f'{cfg.pl_data_folder}/{filename}{cfg.test_suffix}' 255 | if cfg.test_suffix in ['.ogg','.flac']: 256 | audio = librosa.load(fp, sr=cfg.sr)[0].astype(np.float32) 257 | else: 258 | audio = np.load(fp) 259 | return audio 260 | 261 | def batch_to_device(batch, device): 262 | return {k:batch[k].to(device, non_blocking=True) for k in batch.keys() if k not in []} 263 | 
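# Illustrative sketch (not part of the training pipeline): in train mode,
# __getitem__ above assembles a whole inner batch itself -- 64 weighted draws via
# torch.multinomial plus one pseudo-labelled soundscape cut into 5 s windows.
# The numbers below (sr=32000, 48 parts, 1000 files) are assumptions for the demo.
if __name__ == "__main__":
    sr, n_parts = 32000, 48
    w_demo = torch.ones(1000) / 1000                        # uniform weights, like self.w
    idxs_demo = torch.multinomial(w_demo, num_samples=64)   # 64 clip indices per item
    soundscape = np.zeros(n_parts * 5 * sr, dtype=np.float32)
    parts = soundscape.reshape(n_parts, 5 * sr)             # one row per 5 s window
    print(idxs_demo.shape, parts.shape)                     # torch.Size([64]) (48, 160000)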
--------------------------------------------------------------------------------
/configs/augmentations.py:
--------------------------------------------------------------------------------
1 | import colorednoise as cn
2 | import numpy as np
3 | import librosa
import torch  # NormalizeMelSpec at the bottom of this file needs the top-level torch module
4 | import torch.nn as nn
5 | 
6 | class AudioTransform:
7 |     def __init__(self, always_apply=False, p=0.5):
8 |         self.always_apply = always_apply
9 |         self.p = p
10 | 
11 |     def __call__(self, y: np.ndarray):
12 |         if self.always_apply:
13 |             return self.apply(y)
14 |         else:
15 |             if np.random.rand() < self.p:
16 |                 return self.apply(y)
17 |             else:
18 |                 return y
19 | 
20 |     def apply(self, y: np.ndarray):
21 |         raise NotImplementedError
22 | 
23 | 
24 | class CustomCompose:
25 |     def __init__(self, transforms: list):
26 |         self.transforms = transforms
27 | 
28 |     def __call__(self, y: np.ndarray):
29 |         for trns in self.transforms:
30 |             y = trns(y)
31 |         return y
32 | 
33 | 
34 | class CustomOneOf:
35 |     def __init__(self, transforms: list, p=1.0):
36 |         self.transforms = transforms
37 |         self.p = p
38 | 
39 |     def __call__(self, y: np.ndarray):
40 |         if np.random.rand() < self.p:
41 |             n_trns = len(self.transforms)
42 |             trns_idx = np.random.choice(n_trns)
43 |             trns = self.transforms[trns_idx]
44 |             y = trns(y)
45 |         return y
46 | 
47 | 
48 | class GaussianNoiseSNR(AudioTransform):
49 |     def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=40.0, **kwargs):
50 |         super().__init__(always_apply, p)
51 | 
52 |         self.min_snr = min_snr
53 |         self.max_snr = max_snr
54 | 
55 |     def apply(self, y: np.ndarray, **params):
56 |         snr = np.random.uniform(self.min_snr, self.max_snr)
57 |         a_signal = np.sqrt(y**2).max()
58 |         a_noise = a_signal / (10 ** (snr / 20))
59 | 
60 |         white_noise = np.random.randn(len(y))
61 |         a_white = np.sqrt(white_noise**2).max()
62 |         augmented = (y + white_noise * 1 / a_white * a_noise).astype(y.dtype)
63 |         return augmented
64 | 
65 | 
66 | class PinkNoiseSNR(AudioTransform):
67 |     def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
68 |         super().__init__(always_apply, p)
69 | 
70 |         self.min_snr = min_snr
71 |         self.max_snr = max_snr
72 | 
73 |     def apply(self, y: np.ndarray, **params):
74 |         snr = np.random.uniform(self.min_snr, self.max_snr)
75 |         a_signal = np.sqrt(y**2).max()
76 |         a_noise = a_signal / (10 ** (snr / 20))
77 | 
78 |         pink_noise = cn.powerlaw_psd_gaussian(1, len(y))
79 |         a_pink = np.sqrt(pink_noise**2).max()
80 |         augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype)
81 |         return augmented
82 | 
83 | 
84 | class VolumeControl(AudioTransform):
85 |     def __init__(self, always_apply=False, p=0.5, db_limit=10, mode="uniform"):
86 |         super().__init__(always_apply, p)
87 | 
88 |         assert mode in [
89 |             "uniform",
90 |             "fade",
92 |             "cosine",
93 |             "sine",
94 |         ], "`mode` must be one of 'uniform', 'fade', 'cosine', 'sine'"
95 | 
96 |         self.db_limit = db_limit
97 |         self.mode = mode
98 | 
99 |     def apply(self, y: np.ndarray, **params):
100 |         db = np.random.uniform(-self.db_limit, self.db_limit)
101 |         if self.mode == "uniform":
102 |             db_translated = 10 ** (db / 20)
103 |         elif self.mode == "fade":
104 |             lin = np.arange(len(y))[::-1] / (len(y) - 1)
105 |             db_translated = 10 ** (db * lin / 20)
106 |         elif self.mode == "cosine":
107 |             cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2)
108 |             db_translated = 10 ** (db * cosine / 20)
109 |         else:
110 |             sine = np.sin(np.arange(len(y)) / len(y) * np.pi * 2)
111 |             db_translated = 10 ** (db * sine / 20)
112 |         augmented = y * db_translated
113 |         return augmented
114 | 
115
| 116 | class NoiseInjection(AudioTransform): 117 | def __init__(self, always_apply=False, p=0.5, max_noise_level=0.5, sr=32000): 118 | super().__init__(always_apply, p) 119 | 120 | self.noise_level = (0.0, max_noise_level) 121 | self.sr = sr 122 | 123 | def apply(self, y: np.ndarray, **params): 124 | noise_level = np.random.uniform(*self.noise_level) 125 | noise = np.random.randn(len(y)) 126 | augmented = (y + noise * noise_level).astype(y.dtype) 127 | return augmented 128 | 129 | 130 | class GaussianNoise(AudioTransform): 131 | def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000): 132 | super().__init__(always_apply, p) 133 | 134 | self.min_snr = min_snr 135 | self.max_snr = max_snr 136 | self.sr = sr 137 | 138 | def apply(self, y: np.ndarray, **params): 139 | snr = np.random.uniform(self.min_snr, self.max_snr) 140 | a_signal = np.sqrt(y**2).max() 141 | a_noise = a_signal / (10 ** (snr / 20)) 142 | 143 | white_noise = np.random.randn(len(y)) 144 | a_white = np.sqrt(white_noise**2).max() 145 | augmented = (y + white_noise * 1 / a_white * a_noise).astype(y.dtype) 146 | return augmented 147 | 148 | 149 | class PinkNoise(AudioTransform): 150 | def __init__(self, always_apply=False, p=0.5, min_snr=5, max_snr=20, sr=32000): 151 | super().__init__(always_apply, p) 152 | 153 | self.min_snr = min_snr 154 | self.max_snr = max_snr 155 | self.sr = sr 156 | 157 | def apply(self, y: np.ndarray, **params): 158 | snr = np.random.uniform(self.min_snr, self.max_snr) 159 | a_signal = np.sqrt(y**2).max() 160 | a_noise = a_signal / (10 ** (snr / 20)) 161 | 162 | pink_noise = cn.powerlaw_psd_gaussian(1, len(y)) 163 | a_pink = np.sqrt(pink_noise**2).max() 164 | augmented = (y + pink_noise * 1 / a_pink * a_noise).astype(y.dtype) 165 | return augmented 166 | 167 | 168 | class TimeStretch(AudioTransform): 169 | def __init__(self, always_apply=False, p=0.5, max_rate=1, sr=32000): 170 | super().__init__(always_apply, p) 171 | self.max_rate = max_rate 172 | self.sr = sr 173 | 174 | def apply(self, y: np.ndarray, **params): 175 | rate = np.random.uniform(0, self.max_rate) 176 | augmented = librosa.effects.time_stretch(y, rate) 177 | return augmented 178 | 179 | 180 | def _db2float(db: float, amplitude=True): 181 | if amplitude: 182 | return 10 ** (db / 20) 183 | else: 184 | return 10 ** (db / 10) 185 | 186 | 187 | def volume_down(y: np.ndarray, db: float): 188 | """ 189 | Low level API for decreasing the volume 190 | Parameters 191 | ---------- 192 | y: numpy.ndarray 193 | stereo / monaural input audio 194 | db: float 195 | how much decibel to decrease 196 | Returns 197 | ------- 198 | applied: numpy.ndarray 199 | audio with decreased volume 200 | """ 201 | applied = y * _db2float(-db) 202 | return applied 203 | 204 | 205 | def volume_up(y: np.ndarray, db: float): 206 | """ 207 | Low level API for increasing the volume 208 | Parameters 209 | ---------- 210 | y: numpy.ndarray 211 | stereo / monaural input audio 212 | db: float 213 | how much decibel to increase 214 | Returns 215 | ------- 216 | applied: numpy.ndarray 217 | audio with increased volume 218 | """ 219 | applied = y * _db2float(db) 220 | return applied 221 | 222 | 223 | class RandomVolume(AudioTransform): 224 | def __init__(self, always_apply=False, p=0.5, limit=10): 225 | super().__init__(always_apply, p) 226 | self.limit = limit 227 | 228 | def apply(self, y: np.ndarray, **params): 229 | db = np.random.uniform(-self.limit, self.limit) 230 | if db >= 0: 231 | return volume_up(y, db) 232 | else: 233 | return volume_down(y, db) 234 
| 235 | 236 | class CosineVolume(AudioTransform): 237 | def __init__(self, always_apply=False, p=0.5, limit=10): 238 | super().__init__(always_apply, p) 239 | self.limit = limit 240 | 241 | def apply(self, y: np.ndarray, **params): 242 | db = np.random.uniform(-self.limit, self.limit) 243 | cosine = np.cos(np.arange(len(y)) / len(y) * np.pi * 2) 244 | dbs = _db2float(cosine * db) 245 | return y * dbs 246 | 247 | 248 | class AddGaussianNoise(AudioTransform): 249 | """Add gaussian noise to the samples""" 250 | 251 | supports_multichannel = True 252 | 253 | def __init__( 254 | self, always_apply=False, min_amplitude=0.001, max_amplitude=0.015, p=0.5 255 | ): 256 | """ 257 | :param min_amplitude: Minimum noise amplification factor 258 | :param max_amplitude: Maximum noise amplification factor 259 | :param p: 260 | """ 261 | super().__init__(always_apply, p) 262 | assert min_amplitude > 0.0 263 | assert max_amplitude > 0.0 264 | assert max_amplitude >= min_amplitude 265 | self.min_amplitude = min_amplitude 266 | self.max_amplitude = max_amplitude 267 | 268 | def apply(self, samples: np.ndarray, sample_rate=32000): 269 | amplitude = np.random.uniform(self.min_amplitude, self.max_amplitude) 270 | noise = np.random.randn(*samples.shape).astype(np.float32) 271 | samples = samples + amplitude * noise 272 | return samples 273 | 274 | 275 | class AddGaussianSNR(AudioTransform): 276 | """ 277 | Add gaussian noise to the input. A random Signal to Noise Ratio (SNR) will be picked 278 | uniformly in the decibel scale. This aligns with human hearing, which is more 279 | logarithmic than linear. 280 | """ 281 | 282 | supports_multichannel = True 283 | 284 | def __init__( 285 | self, 286 | always_apply=False, 287 | min_snr_in_db: float = 5.0, 288 | max_snr_in_db: float = 40.0, 289 | p: float = 0.5, 290 | ): 291 | """ 292 | :param min_snr_in_db: Minimum signal-to-noise ratio in dB. A lower number means more noise. 293 | :param max_snr_in_db: Maximum signal-to-noise ratio in dB. A greater number means less noise. 294 | :param p: The probability of applying this transform 295 | """ 296 | super().__init__(always_apply, p) 297 | self.min_snr_in_db = min_snr_in_db 298 | self.max_snr_in_db = max_snr_in_db 299 | 300 | def apply(self, samples: np.ndarray, sample_rate=32000): 301 | snr = np.random.uniform(self.min_snr_in_db, self.max_snr_in_db) 302 | 303 | clean_rms = np.sqrt(np.mean(np.square(samples))) 304 | 305 | a = float(snr) / 20 306 | noise_rms = clean_rms / (10**a) 307 | 308 | noise = np.random.normal(0.0, noise_rms, size=samples.shape).astype(np.float32) 309 | return samples + noise 310 | 311 | 312 | class Normalize(AudioTransform): 313 | """ 314 | Apply a constant amount of gain, so that highest signal level present in the sound becomes 315 | 0 dBFS, i.e. the loudest level allowed if all samples must be between -1 and 1. Also known 316 | as peak normalization. 
317 | """ 318 | 319 | supports_multichannel = True 320 | 321 | def __init__(self, always_apply=False, apply_to: str = "all", p: float = 0.5): 322 | super().__init__(always_apply, p) 323 | assert apply_to in ("all", "only_too_loud_sounds") 324 | self.apply_to = apply_to 325 | 326 | def apply(self, samples: np.ndarray, sample_rate=32000): 327 | max_amplitude = np.amax(np.abs(samples)) 328 | if self.apply_to == "only_too_loud_sounds" and max_amplitude < 1.0: 329 | return samples 330 | 331 | if max_amplitude > 0: 332 | return samples / max_amplitude 333 | else: 334 | return samples 335 | 336 | class NormalizeMelSpec(nn.Module): 337 | def __init__(self, eps=1e-6): 338 | super().__init__() 339 | self.eps = eps 340 | 341 | def forward(self, X): 342 | mean = X.mean((1, 2), keepdim=True) 343 | std = X.std((1, 2), keepdim=True) 344 | Xstd = (X - mean) / (std + self.eps) 345 | norm_min, norm_max = Xstd.min(-1)[0].min(-1)[0], Xstd.max(-1)[0].max(-1)[0] 346 | fix_ind = (norm_max - norm_min) > self.eps * torch.ones_like( 347 | (norm_max - norm_min) 348 | ) 349 | V = torch.zeros_like(Xstd) 350 | if fix_ind.sum(): 351 | V_fix = Xstd[fix_ind] 352 | norm_max_fix = norm_max[fix_ind, None, None] 353 | norm_min_fix = norm_min[fix_ind, None, None] 354 | V_fix = torch.max( 355 | torch.min(V_fix, norm_max_fix), 356 | norm_min_fix, 357 | ) 358 | # print(V_fix.shape, norm_min_fix.shape, norm_max_fix.shape) 359 | V_fix = (V_fix - norm_min_fix) / (norm_max_fix - norm_min_fix) 360 | V[fix_ind] = V_fix 361 | return V -------------------------------------------------------------------------------- /models/mdl_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | import timm 5 | from torch.distributions import Beta 6 | import torchaudio 7 | from torchaudio.transforms import MelSpectrogram, AmplitudeToDB 8 | from torch.cuda.amp import autocast 9 | 10 | def init_layer(layer): 11 | nn.init.xavier_uniform_(layer.weight) 12 | 13 | if hasattr(layer, "bias"): 14 | if layer.bias is not None: 15 | layer.bias.data.fill_(0.0) 16 | 17 | 18 | def init_bn(bn): 19 | bn.bias.data.fill_(0.0) 20 | bn.weight.data.fill_(1.0) 21 | 22 | def interpolate(x: torch.Tensor, ratio: int): 23 | """Interpolate data in time domain. This is used to compensate the 24 | resolution reduction in downsampling of a CNN. 25 | Args: 26 | x: (batch_size, time_steps, classes_num) 27 | ratio: int, ratio to interpolate 28 | Returns: 29 | upsampled: (batch_size, time_steps * ratio, classes_num) 30 | """ 31 | (batch_size, time_steps, classes_num) = x.shape 32 | upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1) 33 | upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num) 34 | return upsampled 35 | 36 | 37 | def pad_framewise_output(framewise_output: torch.Tensor, frames_num: int): 38 | """Pad framewise_output to the same length as input frames. The pad value 39 | is the same as the value of the last frame. 
40 | Args: 41 | framewise_output: (batch_size, frames_num, classes_num) 42 | frames_num: int, number of frames to pad 43 | Outputs: 44 | output: (batch_size, frames_num, classes_num) 45 | """ 46 | output = F.interpolate( 47 | framewise_output.unsqueeze(1), 48 | size=(frames_num, framewise_output.size(2)), 49 | align_corners=True, 50 | mode="bilinear", 51 | ).squeeze(1) 52 | 53 | return output 54 | 55 | 56 | class AttBlockV2(nn.Module): 57 | def __init__(self, in_features: int, out_features: int, activation="linear"): 58 | super().__init__() 59 | 60 | self.activation = activation 61 | self.att = nn.Conv1d( 62 | in_channels=in_features, 63 | out_channels=out_features, 64 | kernel_size=1, 65 | stride=1, 66 | padding=0, 67 | bias=True, 68 | ) 69 | self.cla = nn.Conv1d( 70 | in_channels=in_features, 71 | out_channels=out_features, 72 | kernel_size=1, 73 | stride=1, 74 | padding=0, 75 | bias=True, 76 | ) 77 | 78 | self.init_weights() 79 | 80 | def init_weights(self): 81 | init_layer(self.att) 82 | init_layer(self.cla) 83 | 84 | def forward(self, x): 85 | # x: (n_samples, n_in, n_time) 86 | norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1) 87 | cla = self.nonlinear_transform(self.cla(x)) 88 | x = (norm_att * cla).sum(2) 89 | return x, norm_att, cla 90 | 91 | def nonlinear_transform(self, x): 92 | if self.activation == "linear": 93 | return x 94 | elif self.activation == "sigmoid": 95 | return torch.sigmoid(x) 96 | 97 | class Mixup(nn.Module): 98 | def __init__(self, mix_beta, mixup_prob, mixup_double): 99 | super(Mixup, self).__init__() 100 | self.beta_distribution = Beta(mix_beta, mix_beta) 101 | self.mixup_prob = mixup_prob 102 | self.mixup_double = mixup_double 103 | 104 | def forward(self, X, Y, weight=None): 105 | p = torch.rand((1,))[0] 106 | if p < self.mixup_prob: 107 | bs = X.shape[0] 108 | n_dims = len(X.shape) 109 | perm = torch.randperm(bs) 110 | 111 | p1 = torch.rand((1,))[0] 112 | if p1 < self.mixup_double: 113 | X = X + X[perm] 114 | Y = Y + Y[perm] 115 | Y = torch.clamp(Y, 0, 1) 116 | 117 | if weight is None: 118 | return X, Y 119 | else: 120 | weight = 0.5 * weight + 0.5 * weight[perm] 121 | return X, Y, weight 122 | else: 123 | perm2 = torch.randperm(bs) 124 | X = X + X[perm] + X[perm2] 125 | Y = Y + Y[perm] + Y[perm2] 126 | Y = torch.clamp(Y, 0, 1) 127 | 128 | if weight is None: 129 | return X, Y 130 | else: 131 | weight = ( 132 | 1 / 3 * weight + 1 / 3 * weight[perm] + 1 / 3 * weight[perm2] 133 | ) 134 | return X, Y, weight 135 | else: 136 | if weight is None: 137 | return X, Y 138 | else: 139 | return X, Y, weight 140 | 141 | 142 | class Mixup2(nn.Module): 143 | def __init__(self, mix_beta, mixup2_prob): 144 | super(Mixup2, self).__init__() 145 | self.beta_distribution = Beta(mix_beta, mix_beta) 146 | self.mixup2_prob = mixup2_prob 147 | 148 | def forward(self, X, Y, weight=None): 149 | p = torch.rand((1,))[0] 150 | if p < self.mixup2_prob: 151 | bs = X.shape[0] 152 | n_dims = len(X.shape) 153 | perm = torch.randperm(bs) 154 | coeffs = self.beta_distribution.rsample(torch.Size((bs,))).to(X.device) 155 | 156 | if n_dims == 2: 157 | X = coeffs.view(-1, 1) * X + (1 - coeffs.view(-1, 1)) * X[perm] 158 | elif n_dims == 3: 159 | X = coeffs.view(-1, 1, 1) * X + (1 - coeffs.view(-1, 1, 1)) * X[perm] 160 | else: 161 | X = ( 162 | coeffs.view(-1, 1, 1, 1) * X 163 | + (1 - coeffs.view(-1, 1, 1, 1)) * X[perm] 164 | ) 165 | Y = coeffs.view(-1, 1) * Y + (1 - coeffs.view(-1, 1)) * Y[perm] 166 | # Y = Y + Y[perm] 167 | # Y = torch.clamp(Y, 0, 1) 168 | 169 | if weight is None: 170 | 
return X, Y 171 | else: 172 | weight = coeffs.view(-1) * weight + (1 - coeffs.view(-1)) * weight[perm] 173 | return X, Y, weight 174 | else: 175 | if weight is None: 176 | return X, Y 177 | else: 178 | return X, Y, weight 179 | 180 | class Net(nn.Module): 181 | def __init__(self, cfg): 182 | super().__init__() 183 | 184 | self.cfg=cfg 185 | self.bn0 = nn.BatchNorm2d(cfg.mel_spec_args['n_mels']) 186 | self.num_classes = cfg.n_classes 187 | # base_model = timm.create_model( 188 | # cfg.backbone, 189 | # pretrained=True, 190 | # in_chans=cfg.in_chans, 191 | # drop_path_rate=0.2, 192 | # drop_rate=0.5, 193 | # ) 194 | # base_model.conv_stem.stride = (1,1) 195 | # layers = list(base_model.children())[:-2] 196 | # self.encoder = nn.Sequential(*layers) 197 | self.encoder = timm.create_model( 198 | cfg.backbone, 199 | pretrained=cfg.pretrained, 200 | num_classes=0, 201 | global_pool="", 202 | in_chans=cfg.in_chans, 203 | drop_path_rate=0.2, 204 | drop_rate=0.5, 205 | ) 206 | 207 | if "efficientnet" in cfg.backbone: 208 | in_features = self.encoder.num_features 209 | else: 210 | in_features = self.encoder.feature_info[-1]["num_chs"] 211 | # if "efficientnet" in self.cfg.backbone: 212 | # in_features = base_model.classifier.in_features 213 | # elif "eca" in self.cfg.backbone: 214 | # in_features = base_model.head.fc.in_features 215 | # elif "res" in self.cfg.backbone: 216 | # in_features = base_model.fc.in_features 217 | self.fc1 = nn.Linear(in_features, in_features, bias=True) 218 | self.att_block = AttBlockV2(in_features, self.num_classes, activation="sigmoid") 219 | 220 | self.init_weight() 221 | 222 | # self.audio_transforms = Compose( 223 | # [ 224 | # # AddColoredNoise(p=0.5), 225 | # PitchShift( 226 | # min_transpose_semitones=-4, 227 | # max_transpose_semitones=4, 228 | # sample_rate=self.cfg.SR, 229 | # p=0.4, 230 | # ), 231 | # Shift(min_shift=-0.5, max_shift=0.5, p=0.4), 232 | # ] 233 | # ) 234 | 235 | self.time_mask_transform = torchaudio.transforms.TimeMasking( 236 | time_mask_param=60, iid_masks=True, p=0.5 237 | ) 238 | self.freq_mask_transform = torchaudio.transforms.FrequencyMasking( 239 | freq_mask_param=24, iid_masks=True 240 | ) 241 | 242 | self.preprocessing = torch.nn.Sequential(MelSpectrogram(**cfg.mel_spec_args),AmplitudeToDB(**cfg.db_args)) 243 | self.norm_by = cfg.norm_by 244 | 245 | self.mixup = Mixup( 246 | mix_beta=self.cfg.mix_beta, 247 | mixup_prob=self.cfg.mixup_prob, 248 | mixup_double=self.cfg.mixup_double, 249 | ) 250 | self.mixup2 = Mixup2( 251 | mix_beta=self.cfg.mix_beta2, mixup2_prob=self.cfg.mixup2_prob 252 | ) 253 | 254 | # if self.loss == "ce": 255 | # self.loss_function = nn.CrossEntropyLoss( 256 | # label_smoothing=self.cfg.label_smoothing, reduction="none" 257 | # ) 258 | # elif self.loss == "bce": 259 | # self.loss_function = nn.BCEWithLogitsLoss(reduction="none") 260 | # else: 261 | # raise NotImplementedError 262 | self.loss_fn0 = nn.BCELoss(reduction="none") 263 | self.loss_fn1 = nn.BCEWithLogitsLoss(reduction="none") 264 | 265 | def transform_to_spec(self, audio): 266 | # if self.training: 267 | # audio = self.audio_transforms(audio, sample_rate=self.cfg.SR) 268 | 269 | 270 | spec = self.preprocessing(audio) 271 | spec = (spec + self.norm_by) / self.norm_by 272 | 273 | if self.training: 274 | spec = self.time_mask_transform(spec) 275 | if torch.rand(size=(1,))[0] < 0.5: 276 | spec = self.freq_mask_transform(spec) 277 | # if torch.rand(size=(1,))[0] < 0.5: 278 | # spec = self.lower_upper_freq(spec) 279 | return spec 280 | 281 | def init_weight(self): 
282 | init_layer(self.fc1) 283 | init_bn(self.bn0) 284 | 285 | 286 | def extract_feature(self,x): 287 | x = x.permute((0, 1, 3, 2)) 288 | frames_num = x.shape[2] 289 | 290 | x = x.transpose(1, 3) 291 | x = self.bn0(x) 292 | x = x.transpose(1, 3) 293 | 294 | # if self.training: 295 | # x = self.spec_augmenter(x) 296 | 297 | x = x.transpose(2, 3) 298 | # (batch_size, channels, freq, frames) 299 | x = self.encoder(x) 300 | 301 | # (batch_size, channels, frames) 302 | x = torch.mean(x, dim=2) 303 | 304 | # channel smoothing 305 | x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1) 306 | x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1) 307 | x = x1 + x2 308 | 309 | x = F.dropout(x, p=0.5, training=self.training) 310 | x = x.transpose(1, 2) 311 | x = F.relu_(self.fc1(x)) 312 | x = x.transpose(1, 2) 313 | x = F.dropout(x, p=0.5, training=self.training) 314 | return x, frames_num 315 | 316 | def forward(self, batch): 317 | x = batch['input'] 318 | y = batch['target'] 319 | weight = batch['weight'] 320 | 321 | #if test then flatten bs and parts 322 | if len(x.shape) == 3: 323 | bs, parts, seq_len = x.shape 324 | y = torch.repeat_interleave(y[:,None],parts,dim=1) 325 | # secondary_mask = torch.repeat_interleave(secondary_mask[:,None],parts,dim=1) 326 | x = x.reshape(bs*parts,seq_len)#.unsqueeze(1) 327 | n_classes = y.shape[-1] 328 | y = y.reshape(bs*parts,n_classes) 329 | x = x[:,None] 330 | # if not self.training: 331 | # bs, channel, parts = x.shape[0], x.shape[1], x.shape[2] 332 | # x = x.reshape((bs * parts, channel, -1)) 333 | 334 | if self.training: 335 | if self.cfg.mixup: 336 | x, y, weight = self.mixup(x, y, weight) 337 | with autocast(enabled=False): 338 | x = self.transform_to_spec(x) 339 | # if self.in_chans == 3: 340 | # x = image_delta(x) 341 | 342 | if self.training: 343 | if self.cfg.mixup2: 344 | x, y, weight = self.mixup2(x, y, weight) 345 | 346 | x, frames_num = self.extract_feature(x) 347 | 348 | (clipwise_output, norm_att, segmentwise_output) = self.att_block(x) 349 | logit = torch.sum(norm_att * self.att_block.cla(x), dim=2) 350 | segmentwise_logit = self.att_block.cla(x).transpose(1, 2) 351 | segmentwise_output = segmentwise_output.transpose(1, 2) 352 | 353 | # interpolate_ratio = frames_num // segmentwise_output.size(1) 354 | 355 | # # Get framewise output 356 | # framewise_output = interpolate(segmentwise_output, interpolate_ratio) 357 | # framewise_output = pad_framewise_output(framewise_output, frames_num) 358 | 359 | # framewise_logit = interpolate(segmentwise_logit, interpolate_ratio) 360 | # framewise_logit = pad_framewise_output(framewise_logit, frames_num) 361 | output_dict = { 362 | # "framewise_output": framewise_output, 363 | "segmentwise_output": segmentwise_output, 364 | "logit": logit, 365 | # "framewise_logit": framewise_logit, 366 | "clipwise_output": clipwise_output, 367 | } 368 | # if not self.training: 369 | # clipwise_output = clipwise_output.reshape((bs, parts, -1)).max(dim=1).values 370 | # seg_num = segmentwise_logit.shape[1] 371 | # # fram_num = framewise_logit.shape[1] 372 | # segmentwise_logit = ( 373 | # segmentwise_logit.reshape((bs, parts, seg_num, -1)).max(dim=1).values 374 | # ) 375 | # # framewise_logit = ( 376 | # # framewise_logit.reshape((bs, parts, fram_num, -1)).max(dim=1).values 377 | # # ) 378 | with autocast(enabled=False): 379 | loss = 0.5 * self.loss_fn0(clipwise_output, y) + 0.5 * self.loss_fn1(segmentwise_logit.max(1)[0], y) 380 | # loss = 0.5*self.loss_function(torch.logit(clipwise_output), y) + 
0.5*self.loss_function(framewise_logit.max(1)[0], y)
381 |             # if self.loss == "ce":
382 |             #     loss = (loss * weight) / weight.sum()
383 |             # elif self.loss == "bce":
384 |             if self.training:
385 |                 loss = loss.sum(dim=1) * weight
386 |             # else:
387 |             #     raise NotImplementedError
388 |             loss = loss.sum()
389 | 
390 | 
391 |         return {'loss': loss, 'logits': clipwise_output, 'target': y}
392 | 
393 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | 
2 | import random
3 | import os
4 | import numpy as np
5 | import pandas as pd
6 | import torch
7 | from torch.utils.data import Sampler, RandomSampler, SequentialSampler, DataLoader, WeightedRandomSampler
8 | from torch import nn, optim
9 | from torch.optim import AdamW
10 | from torch.optim.lr_scheduler import LambdaLR
11 | from torch.optim import lr_scheduler
import torch.distributed as dist  # used by OrderedDistributedSampler below
12 | import importlib
13 | import math
14 | import neptune
15 | from neptune.utils import stringify_unsupported
16 | 
17 | import logging
18 | import pickle
19 | 
20 | 
21 | 
22 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
23 |     """
24 |     from https://github.com/huggingface/transformers/blob/main/src/transformers/optimization.py
25 |     Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
26 |     a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
27 |     Args:
28 |         optimizer ([`~torch.optim.Optimizer`]):
29 |             The optimizer for which to schedule the learning rate.
30 |         num_warmup_steps (`int`):
31 |             The number of steps for the warmup phase.
32 |         num_training_steps (`int`):
33 |             The total number of training steps.
34 |         last_epoch (`int`, *optional*, defaults to -1):
35 |             The index of the last epoch when resuming training.
36 |     Return:
37 |         `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
38 |     """
39 | 
40 |     def lr_lambda(current_step: int):
41 |         if current_step < num_warmup_steps:
42 |             return float(current_step) / float(max(1, num_warmup_steps))
43 |         return max(
44 |             0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
45 |         )
46 | 
47 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
48 | 
49 | 
50 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles= 0.5, last_epoch= -1):
51 |     """
52 |     from https://github.com/huggingface/transformers/blob/main/src/transformers/optimization.py
53 |     Create a schedule with a learning rate that decreases following the values of the cosine function between the
54 |     initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
55 |     initial lr set in the optimizer.
56 |     Args:
57 |         optimizer ([`~torch.optim.Optimizer`]):
58 |             The optimizer for which to schedule the learning rate.
59 |         num_warmup_steps (`int`):
60 |             The number of steps for the warmup phase.
61 |         num_training_steps (`int`):
62 |             The total number of training steps.
63 |         num_cycles (`float`, *optional*, defaults to 0.5):
64 |             The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
65 |             following a half-cosine).
66 |         last_epoch (`int`, *optional*, defaults to -1):
67 |             The index of the last epoch when resuming training.
68 |     Return:
69 |         `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
70 | """ 71 | 72 | def lr_lambda(current_step): 73 | if current_step < num_warmup_steps: 74 | return float(current_step) / float(max(1, num_warmup_steps)) 75 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 76 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 77 | 78 | return LambdaLR(optimizer, lr_lambda, last_epoch) 79 | 80 | 81 | 82 | 83 | def calc_grad_norm(parameters,norm_type=2.): 84 | 85 | if isinstance(parameters, torch.Tensor): 86 | parameters = [parameters] 87 | parameters = [p for p in parameters if p.grad is not None] 88 | norm_type = float(norm_type) 89 | if len(parameters) == 0: 90 | return torch.tensor(0.) 91 | device = parameters[0].grad.device 92 | total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) 93 | if torch.logical_or(total_norm.isnan(), total_norm.isinf()): 94 | total_norm = None 95 | 96 | return total_norm 97 | 98 | class OrderedDistributedSampler(Sampler): 99 | def __init__(self, dataset, num_replicas=None, rank=None): 100 | if num_replicas is None: 101 | if not dist.is_available(): 102 | raise RuntimeError("Requires distributed package to be available") 103 | num_replicas = dist.get_world_size() 104 | if rank is None: 105 | if not dist.is_available(): 106 | raise RuntimeError("Requires distributed package to be available") 107 | rank = dist.get_rank() 108 | self.dataset = dataset 109 | self.num_replicas = num_replicas 110 | self.rank = rank 111 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 112 | self.total_size = self.num_samples * self.num_replicas 113 | 114 | print("TOTAL SIZE", self.total_size) 115 | 116 | def __iter__(self): 117 | indices = list(range(len(self.dataset))) 118 | 119 | # add extra samples to make it evenly divisible 120 | indices += indices[: (self.total_size - len(indices))] 121 | assert len(indices) == self.total_size 122 | 123 | # subsample 124 | indices = indices[ 125 | self.rank * self.num_samples : self.rank * self.num_samples + self.num_samples 126 | ] 127 | print( 128 | "SAMPLES", 129 | self.rank * self.num_samples, 130 | self.rank * self.num_samples + self.num_samples, 131 | ) 132 | assert len(indices) == self.num_samples 133 | 134 | return iter(indices) 135 | 136 | def __len__(self): 137 | return self.num_samples 138 | 139 | 140 | def sync_across_gpus(t, world_size): 141 | torch.distributed.barrier() 142 | gather_t_tensor = [torch.ones_like(t) for _ in range(world_size)] 143 | torch.distributed.all_gather(gather_t_tensor, t) 144 | return torch.cat(gather_t_tensor) 145 | 146 | 147 | def set_seed(seed=1234): 148 | random.seed(seed) 149 | os.environ["PYTHONHASHSEED"] = str(seed) 150 | np.random.seed(seed) 151 | torch.manual_seed(seed) 152 | torch.cuda.manual_seed(seed) 153 | torch.backends.cudnn.deterministic = False 154 | torch.backends.cudnn.benchmark = True 155 | 156 | 157 | def worker_init_fn(worker_id): 158 | np.random.seed(np.random.get_state()[1][0] + worker_id) 159 | 160 | 161 | def get_model(cfg, ds): 162 | Net = importlib.import_module(cfg.model).Net 163 | net = Net(cfg) 164 | if cfg.pretrained_weights is not None: 165 | if type(cfg.pretrained_weights) == list: 166 | cfg.pretrained_weights = cfg.pretrained_weights[cfg.fold] 167 | print(f'{cfg.local_rank}: loading weights from',cfg.pretrained_weights) 168 | state_dict = torch.load(cfg.pretrained_weights, map_location='cpu') 169 | if "model" in state_dict.keys(): 170 | state_dict = 
state_dict['model'] 171 | 172 | for key,val in state_dict.items(): 173 | if key.startswith('module.'): 174 | state_dict[key[7:]] = state_dict.pop(k) 175 | # state_dict = {key.replace('module.',''):val for key,val in state_dict.items()} 176 | if cfg.pop_weights is not None: 177 | print(f'popping {cfg.pop_weights}') 178 | to_pop = [] 179 | for key in state_dict: 180 | for item in cfg.pop_weights: 181 | if item in key: 182 | to_pop += [key] 183 | for key in to_pop: 184 | print(f'popping {key}') 185 | state_dict.pop(key) 186 | if cfg.rename_weights is not None: 187 | for k,v in cfg.rename_weights.items(): 188 | state_dict[v] = state_dict.pop(k) 189 | net.load_state_dict(state_dict, strict=cfg.pretrained_weights_strict) 190 | print(f'{cfg.local_rank}: weights loaded from',cfg.pretrained_weights) 191 | 192 | return net 193 | 194 | 195 | def create_checkpoint(cfg, model, optimizer, epoch, scheduler=None, scaler=None): 196 | 197 | 198 | state_dict = model.state_dict() 199 | if cfg.save_weights_only: 200 | checkpoint = {"model": state_dict} 201 | return checkpoint 202 | 203 | checkpoint = { 204 | "model": state_dict, 205 | "optimizer": optimizer.state_dict(), 206 | "epoch": epoch, 207 | } 208 | 209 | if scheduler is not None: 210 | checkpoint["scheduler"] = scheduler.state_dict() 211 | 212 | if scaler is not None: 213 | checkpoint["scaler"] = scaler.state_dict() 214 | return checkpoint 215 | 216 | def load_checkpoint(cfg, model, optimizer, scheduler=None, scaler=None): 217 | 218 | print(f'loading ckpt {cfg.resume_from}') 219 | checkpoint = torch.load(cfg.resume_from, map_location='cpu') 220 | model.load_state_dict(checkpoint['model']) 221 | optimizer.load_state_dict(checkpoint['optimizer']) 222 | scheduler_dict = checkpoint['scheduler'] 223 | if scaler is not None: 224 | scaler.load_state_dict(checkpoint['scaler']) 225 | 226 | epoch = checkpoint['epoch'] 227 | return model, optimizer, scheduler_dict, scaler, epoch 228 | 229 | 230 | def get_dataset(df, cfg, mode='train'): 231 | 232 | #modes train, val, index 233 | print(f"Loading {mode} dataset") 234 | 235 | if mode == 'train': 236 | dataset = get_train_dataset(df, cfg) 237 | # elif mode == 'train_val': 238 | # dataset = get_val_dataset(df, cfg) 239 | elif mode == 'val': 240 | dataset = get_val_dataset(df, cfg) 241 | elif mode == 'test': 242 | dataset = get_test_dataset(df, cfg) 243 | else: 244 | pass 245 | return dataset 246 | 247 | def get_dataloader(ds, cfg, mode='train'): 248 | 249 | if mode == 'train': 250 | dl = get_train_dataloader(ds, cfg) 251 | elif mode =='val': 252 | dl = get_val_dataloader(ds, cfg) 253 | elif mode =='test': 254 | dl = get_test_dataloader(ds, cfg) 255 | return dl 256 | 257 | 258 | def get_train_dataset(train_df, cfg): 259 | 260 | train_dataset = cfg.CustomDataset(train_df, cfg, aug=cfg.train_aug, mode="train") 261 | if cfg.data_sample > 0: 262 | train_dataset = torch.utils.data.Subset(train_dataset, np.arange(cfg.data_sample)) 263 | return train_dataset 264 | 265 | 266 | def get_train_dataloader(train_ds, cfg): 267 | 268 | if cfg.distributed: 269 | sampler = torch.utils.data.distributed.DistributedSampler( 270 | train_ds, num_replicas=cfg.world_size, rank=cfg.local_rank, shuffle=True, seed=cfg.seed 271 | ) 272 | else: 273 | sampler = None 274 | try: 275 | if hasattr(cfg, 'random_sampler_frac'): 276 | if cfg.cfg.random_sampler_frac > 0: 277 | num_samples = int(len(train_ds) * cfg.random_sampler_frac) 278 | sample_weights = train_ds.sample_weights 279 | sampler = WeightedRandomSampler(sample_weights, num_samples= 
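# Example: the checkpoint-surgery hooks read by get_model() above. The attribute
# names come from the code; the paths and values below are purely illustrative.
#
#   cfg.pretrained_weights = 'output/pretrain/fold-1/checkpoint_last_seed123.pth'
#   cfg.pretrained_weights_strict = False         # tolerate missing/unexpected keys
#   cfg.pop_weights = ['att_block']               # drop every key containing 'att_block'
#   cfg.rename_weights = {'old.head.weight': 'new.head.weight'}  # old_key -> new_key
#
# pop_weights is useful when the class count changes between pretraining and
# finetuning; rename_weights remaps keys after backbone refactors.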
def get_dataset(df, cfg, mode='train'):
    # modes: train, val, test
    print(f"Loading {mode} dataset")

    if mode == 'train':
        dataset = get_train_dataset(df, cfg)
    # elif mode == 'train_val':
    #     dataset = get_val_dataset(df, cfg)
    elif mode == 'val':
        dataset = get_val_dataset(df, cfg)
    elif mode == 'test':
        dataset = get_test_dataset(df, cfg)
    else:
        # the original fell through with `pass` and then returned an undefined name
        raise ValueError(f"Unknown dataset mode: {mode}")
    return dataset


def get_dataloader(ds, cfg, mode='train'):
    if mode == 'train':
        dl = get_train_dataloader(ds, cfg)
    elif mode == 'val':
        dl = get_val_dataloader(ds, cfg)
    elif mode == 'test':
        dl = get_test_dataloader(ds, cfg)
    return dl


def get_train_dataset(train_df, cfg):
    train_dataset = cfg.CustomDataset(train_df, cfg, aug=cfg.train_aug, mode="train")
    if cfg.data_sample > 0:
        train_dataset = torch.utils.data.Subset(train_dataset, np.arange(cfg.data_sample))
    return train_dataset


def get_train_dataloader(train_ds, cfg):
    if cfg.distributed:
        sampler = torch.utils.data.distributed.DistributedSampler(
            train_ds, num_replicas=cfg.world_size, rank=cfg.local_rank, shuffle=True, seed=cfg.seed
        )
    else:
        sampler = None
    try:
        if hasattr(cfg, 'random_sampler_frac'):
            if cfg.random_sampler_frac > 0:  # was `cfg.cfg.random_sampler_frac`
                num_samples = int(len(train_ds) * cfg.random_sampler_frac)
                sample_weights = train_ds.sample_weights
                sampler = WeightedRandomSampler(sample_weights, num_samples=num_samples)
        if hasattr(train_ds, 'sampler'):
            print('using sampler from train ds')
            sampler = train_ds.sampler
    except Exception as e:  # was misspelled `Excepttion`
        print(e)

    train_dataloader = DataLoader(
        train_ds,
        sampler=sampler,
        shuffle=(sampler is None),
        batch_size=cfg.batch_size,
        num_workers=cfg.num_workers,
        pin_memory=cfg.pin_memory,
        collate_fn=cfg.tr_collate_fn,
        drop_last=cfg.drop_last,
        worker_init_fn=worker_init_fn,
    )
    print(f"train: dataset {len(train_ds)}, dataloader {len(train_dataloader)}")
    return train_dataloader


def get_val_dataset(val_df, cfg, allowed_targets=None):
    val_dataset = cfg.CustomDataset(val_df, cfg, aug=cfg.val_aug, mode="val")
    return val_dataset

# def get_val_index_dataset(train_df, train_dataset):
#     print("Loading val dataset")
#     val_dataset = cfg.CustomDataset(val_df, cfg, aug=cfg.val_aug, mode="val")
#     return val_dataset


def get_val_dataloader(val_ds, cfg):
    if cfg.distributed and cfg.eval_ddp:
        sampler = OrderedDistributedSampler(
            val_ds, num_replicas=cfg.world_size, rank=cfg.local_rank
        )
    else:
        sampler = SequentialSampler(val_ds)

    if cfg.batch_size_val is not None:
        batch_size = cfg.batch_size_val
    else:
        batch_size = cfg.batch_size
    val_dataloader = DataLoader(
        val_ds,
        sampler=sampler,
        batch_size=batch_size,
        num_workers=cfg.num_workers,
        pin_memory=cfg.pin_memory,
        collate_fn=cfg.val_collate_fn,
        worker_init_fn=worker_init_fn,
    )
    print(f"valid: dataset {len(val_ds)}, dataloader {len(val_dataloader)}")
    return val_dataloader


def get_test_dataset(test_df, cfg):
    test_dataset = cfg.CustomDataset(test_df, cfg, aug=cfg.val_aug, mode="test")
    return test_dataset


def get_test_dataloader(test_ds, cfg):
    if cfg.distributed and cfg.eval_ddp:
        sampler = OrderedDistributedSampler(
            test_ds, num_replicas=cfg.world_size, rank=cfg.local_rank
        )
    else:
        sampler = SequentialSampler(test_ds)

    if cfg.batch_size_test is not None:
        batch_size = cfg.batch_size_test
    else:
        batch_size = cfg.batch_size
    test_dataloader = DataLoader(
        test_ds,
        sampler=sampler,
        batch_size=batch_size,
        num_workers=cfg.num_workers,
        pin_memory=cfg.pin_memory,
        collate_fn=cfg.val_collate_fn,
        worker_init_fn=worker_init_fn,
    )
    print(f"test: dataset {len(test_ds)}, dataloader {len(test_dataloader)}")
    return test_dataloader


def get_optimizer(model, cfg):
    # params = [{"params": [param for name, param in model.named_parameters()], "lr": cfg.lr, "weight_decay": cfg.weight_decay}]
    params = model.parameters()

    if cfg.optimizer == "Adam":
        optimizer = optim.Adam(params, lr=cfg.lr, weight_decay=cfg.weight_decay)
    elif cfg.optimizer == "AdamW":
        optimizer = AdamW(params, lr=cfg.lr, weight_decay=cfg.weight_decay)

    return optimizer


def get_scheduler(cfg, optimizer, total_steps):
    if cfg.schedule == "cosine":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.warmup * (total_steps // cfg.batch_size) // cfg.world_size,
            num_training_steps=cfg.epochs * (total_steps // cfg.batch_size) // cfg.world_size,
        )
    else:
        scheduler = None

    return scheduler
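def _demo_cosine_schedule():
    # Minimal usage sketch for the schedulers above (illustrative only; not called
    # anywhere in the repo). train.py builds the scheduler once per run and calls
    # scheduler.step() after every optimizer step, so warmup and training horizons
    # are counted in batches, not epochs. The dataset/batch sizes here are made up.
    model = nn.Linear(10, 2)
    optimizer = AdamW(model.parameters(), lr=1e-3)
    steps_per_epoch = 32000 // 32  # e.g. 32k training clips, batch size 32
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1 * steps_per_epoch,     # cfg.warmup = 1 -> one epoch of linear warmup
        num_training_steps=10 * steps_per_epoch,  # cfg.epochs = 10
    )
    for _ in range(10 * steps_per_epoch):
        optimizer.step()
        scheduler.step()  # stepped every batch, matching the loop in train.py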
def setup_neptune(cfg):
    neptune_run = neptune.init_run(
        project=cfg.neptune_project,
        tags=cfg.tags,
        mode=cfg.neptune_connection_mode,
        capture_stdout=False,
        capture_stderr=False,
        source_files=[f'models/{cfg.model}.py', f'data/{cfg.dataset}.py', f'configs/{cfg.name}.py']
    )

    neptune_run["cfg"] = stringify_unsupported(cfg.__dict__)

    return neptune_run


def get_data(cfg):
    # setup dataset
    print(f"reading {cfg.train_df}")
    df = pd.read_csv(cfg.train_df)

    if cfg.test:
        test_df = pd.read_csv(cfg.test_df)
    else:
        test_df = None

    if cfg.fold == -1:
        val_df = df[df["fold"] == 0]
    else:
        val_df = df[df["fold"] == cfg.fold]

    train_df = df[df["fold"] != cfg.fold]

    return train_df, val_df, test_df


def get_level(level_str):
    '''get level'''
    l_names = {logging.getLevelName(lvl).lower(): lvl for lvl in [10, 20, 30, 40, 50]}  # noqa
    return l_names.get(level_str.lower(), logging.INFO)


def get_logger(name, level_str):
    '''get logger'''
    logger = logging.getLogger(name)
    logger.setLevel(get_level(level_str))
    handler = logging.StreamHandler()
    handler.setLevel(level_str)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))  # pylint: disable=C0301 # noqa
    logger.addHandler(handler)

    return logger
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import importlib
import sys
from tqdm import tqdm
import gc
import argparse
import torch
from torch.cuda.amp import GradScaler, autocast
from torch.nn.parallel import DistributedDataParallel as NativeDDP
from collections import defaultdict

from utils import (
    sync_across_gpus,
    set_seed,
    get_model,
    create_checkpoint,
    load_checkpoint,
    get_data,
    get_dataset,
    get_dataloader,
    calc_grad_norm,
)
from utils import (
    get_optimizer,
    get_scheduler,
    setup_neptune,
)


from copy import copy
import os

os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

try:
    import cv2
    cv2.setNumThreads(0)
except ImportError:  # was a bare `except`
    print('no cv2 installed, running without')


sys.path.append("configs")
sys.path.append("models")
sys.path.append("data")
sys.path.append("postprocess")
sys.path.append("metrics")


def run_eval(model, val_dataloader, cfg, pre="val", curr_epoch=0):
    saved_images = False
    model.eval()
    torch.set_grad_enabled(False)

    # store information for evaluation
    val_data = defaultdict(list)
    val_score = 0
    for ind_, data in enumerate(tqdm(val_dataloader, disable=(cfg.local_rank != 0) | cfg.disable_tqdm)):

        batch = cfg.batch_to_device(data, cfg.device)

        if cfg.mixed_precision:
            with autocast():
                output = model(batch)
        else:
            output = model(batch)

        if (cfg.local_rank == 0) and (cfg.calc_metric) and (((curr_epoch + 1) % cfg.calc_metric_epochs) == 0):
            # per batch calculations
            pass

        if (not saved_images) & (cfg.save_first_batch_preds):
            # NOTE: save_first_batch_preds is an optional debug hook; it is not
            # defined in the files included in this dump
            save_first_batch_preds(batch, output, cfg)
            saved_images = True

        for key, val in output.items():
            val_data[key] += [output[key]]

    for key, val in output.items():
        value = val_data[key]
        if isinstance(value[0], list):
            val_data[key] = [item for sublist in value for item in sublist]
        else:
            if len(value[0].shape) == 0:
                val_data[key] = torch.stack(value)
            else:
                val_data[key] = torch.cat(value, dim=0)

    if (cfg.local_rank == 0) and (cfg.calc_metric) and (((curr_epoch + 1) % cfg.calc_metric_epochs) == 0):
        pass

    if cfg.distributed and cfg.eval_ddp:
        for key, val in output.items():
            val_data[key] = sync_across_gpus(val_data[key], cfg.world_size)

    if cfg.local_rank == 0:
        if cfg.save_val_data:
            if cfg.distributed:
                for k, v in val_data.items():
                    val_data[k] = v[: len(val_dataloader.dataset)]
            torch.save(val_data, f"{cfg.output_dir}/fold{cfg.fold}/{pre}_data_seed{cfg.seed}.pth")

    loss_names = [key for key in output if 'loss' in key]
    loss_names += [key for key in output if 'score' in key]
    for k in loss_names:
        if cfg.local_rank == 0 and k in val_data:
            losses = val_data[k].cpu().numpy()
            loss = np.mean(losses)

            print(f"Mean {pre}_{k}", loss)
            if not np.isnan(loss):
                cfg.neptune_run[f"{pre}/{k}"].log(loss, step=cfg.curr_step)

    if (cfg.local_rank == 0) and (cfg.calc_metric) and (((curr_epoch + 1) % cfg.calc_metric_epochs) == 0):
        val_df = val_dataloader.dataset.df
        pp_out = cfg.post_process_pipeline(cfg, val_data, val_df)
        val_score = cfg.calc_metric(cfg, pp_out, val_df, pre)
        if type(val_score) != dict:
            val_score = {'score': val_score}

        for k, v in val_score.items():
            print(f"{pre}_{k}: {v:.3f}")
            if cfg.neptune_run:
                cfg.neptune_run[f"{pre}/{k}"].log(v, step=cfg.curr_step)

    if cfg.distributed:
        torch.distributed.barrier()

    # print("EVAL FINISHED")

    return val_score
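# The evaluation above is pluggable: cfg.post_process_pipeline and cfg.calc_metric
# are resolved by module name in the __main__ block at the bottom of this file.
# A custom metric module (e.g. metrics/my_metric.py -- name illustrative) only
# needs to expose a calc_metric with the signature used above. Sketch:
#
#   import numpy as np
#   from sklearn.metrics import roc_auc_score
#
#   def calc_metric(cfg, pp_out, val_df, pre="val"):
#       # 'logits'/'target' match the keys the model's forward() returns
#       preds = pp_out['logits'].float().cpu().numpy()
#       target = pp_out['target'].cpu().numpy()
#       aucs = [roc_auc_score(target[:, i], preds[:, i])
#               for i in range(target.shape[1]) if target[:, i].sum() > 0]  # skip empty classes
#       return {'score': float(np.mean(aucs))}
#
# It is then selected with cfg.metric = 'my_metric'. Dict keys are logged to
# neptune as {pre}/{key}; a bare float is wrapped into {'score': value}.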
def train(cfg):
    # set seed
    if cfg.seed < 0:
        cfg.seed = np.random.randint(1_000_000)
    print("seed", cfg.seed)

    cfg.distributed = False
    if "WORLD_SIZE" in os.environ:
        cfg.distributed = int(os.environ["WORLD_SIZE"]) > 1

    if cfg.distributed:
        cfg.local_rank = int(os.environ["LOCAL_RANK"])
        device = "cuda:%d" % cfg.local_rank
        cfg.device = device

        torch.cuda.set_device(cfg.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        cfg.world_size = torch.distributed.get_world_size()
        cfg.rank = torch.distributed.get_rank()
        # print("Training in distributed mode with multiple processes, 1 GPU per process.")
        print(f"Process {cfg.rank}, total {cfg.world_size}, local rank {cfg.local_rank}.")
        cfg.group = torch.distributed.new_group(np.arange(cfg.world_size))
        # print("Group", cfg.group)

        # syncing the random seed
        cfg.seed = int(
            sync_across_gpus(torch.Tensor([cfg.seed]).to(device), cfg.world_size)
            .detach()
            .cpu()
            .numpy()[0]
        )

        print(f"LOCAL_RANK {cfg.local_rank}, device {device}, seed {cfg.seed}")

    else:
        cfg.local_rank = 0
        cfg.world_size = 1
        cfg.rank = 0  # global rank

        device = "cuda:%d" % cfg.gpu
        cfg.device = device

    set_seed(cfg.seed)

    if cfg.local_rank == 0:
        cfg.neptune_run = setup_neptune(cfg)

    train_df, val_df, test_df = get_data(cfg)

    train_dataset = get_dataset(train_df, cfg, mode='train')
    train_dataloader = get_dataloader(train_dataset, cfg, mode='train')

    val_dataset = get_dataset(val_df, cfg, mode='val')
    val_dataloader = get_dataloader(val_dataset, cfg, mode='val')

    if cfg.test:
        test_dataset = get_dataset(test_df, cfg, mode='test')
        test_dataloader = get_dataloader(test_dataset, cfg, mode='test')

    if cfg.train_val:
        train_val_dataset = get_dataset(train_df, cfg, mode='val')
        train_val_dataloader = get_dataloader(train_val_dataset, cfg, 'val')

    model = get_model(cfg, train_dataset)
    model.to(device)

    if cfg.distributed:
        if cfg.syncbn:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

        model = NativeDDP(
            model, device_ids=[cfg.local_rank], find_unused_parameters=cfg.find_unused_parameters
        )

    total_steps = len(train_dataset)
    if train_dataloader.sampler is not None:
        if 'WeightedRandomSampler' in str(train_dataloader.sampler.__class__):
            total_steps = train_dataloader.sampler.num_samples

    optimizer = get_optimizer(model, cfg)
    scheduler = get_scheduler(cfg, optimizer, total_steps)

    if cfg.mixed_precision:
        scaler = GradScaler()
    else:
        scaler = None

    cfg.curr_step = 0
    i = 0
    best_val_loss = np.inf
    optimizer.zero_grad()
    total_grad_norm = None
    total_grad_norm_after_clip = None
    val_score = 0
    for epoch in range(cfg.epochs):
        set_seed(cfg.seed + epoch + cfg.local_rank)

        cfg.curr_epoch = epoch
        if cfg.local_rank == 0:
            print("EPOCH:", epoch)

        if cfg.distributed:
            train_dataloader.sampler.set_epoch(epoch)

        progress_bar = tqdm(range(len(train_dataloader)), disable=cfg.disable_tqdm)
        tr_it = iter(train_dataloader)

        losses = []

        gc.collect()

        if cfg.train:
            # ==== TRAIN LOOP
            for itr in progress_bar:
                i += 1

                cfg.curr_step += cfg.batch_size * cfg.world_size

                try:
                    data = next(tr_it)
                except Exception as e:
                    print(e)
                    print("DATA FETCH ERROR")
                    # continue

                if (i == 1) & cfg.save_first_batch:
                    # NOTE: save_first_batch is an optional debug hook; it is not
                    # defined in the files included in this dump
                    save_first_batch(data, cfg)

                model.train()
                torch.set_grad_enabled(True)

                batch = cfg.batch_to_device(data, device)

                if cfg.mixed_precision:
                    with autocast():
                        output_dict = model(batch)
                else:
                    output_dict = model(batch)

                loss = output_dict["loss"]

                losses.append(loss.item())

                if cfg.grad_accumulation > 1:
                    loss /= cfg.grad_accumulation

                # Backward pass
                if cfg.mixed_precision:
                    scaler.scale(loss).backward()

                    if i % cfg.grad_accumulation == 0:
                        if (cfg.track_grad_norm) or (cfg.clip_grad > 0):
                            scaler.unscale_(optimizer)
                        if cfg.track_grad_norm:
                            total_grad_norm = calc_grad_norm(model.parameters(), cfg.grad_norm_type)
                        if cfg.clip_grad > 0:
                            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.clip_grad)
                        if cfg.track_grad_norm:
                            total_grad_norm_after_clip = calc_grad_norm(model.parameters(), cfg.grad_norm_type)
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                else:
                    loss.backward()
                    if i % cfg.grad_accumulation == 0:
                        if cfg.track_grad_norm:
                            total_grad_norm = calc_grad_norm(model.parameters())
                        if cfg.clip_grad > 0:
                            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.clip_grad)
                        if cfg.track_grad_norm:
                            total_grad_norm_after_clip = calc_grad_norm(model.parameters(), cfg.grad_norm_type)
                        optimizer.step()
                        optimizer.zero_grad()

                if cfg.distributed:
                    torch.cuda.synchronize()

                if scheduler is not None:
                    scheduler.step()

                if cfg.local_rank == 0 and cfg.curr_step % cfg.batch_size == 0:
                    loss_names = [key for key in output_dict if 'loss' in key]
                    for l in loss_names:
                        cfg.neptune_run[f"train/{l}"].log(value=output_dict[l].item(), step=cfg.curr_step)

                    cfg.neptune_run["lr"].log(
                        value=optimizer.param_groups[0]["lr"], step=cfg.curr_step
                    )
                    if total_grad_norm is not None:
                        cfg.neptune_run["total_grad_norm"].log(value=total_grad_norm.item(), step=cfg.curr_step)
                        cfg.neptune_run["total_grad_norm_after_clip"].log(value=total_grad_norm_after_clip.item(), step=cfg.curr_step)

                    progress_bar.set_description(f"loss: {np.mean(losses[-10:]):.4f}")

                if cfg.eval_steps != 0:
                    if i % cfg.eval_steps == 0:
                        if cfg.distributed and cfg.eval_ddp:
                            # torch.cuda.synchronize()
                            val_loss = run_eval(model, val_dataloader, cfg, pre="val", curr_epoch=epoch)
                        else:
                            if cfg.local_rank == 0:
                                val_loss = run_eval(model, val_dataloader, cfg, pre="val", curr_epoch=epoch)
                            else:
                                val_score = 0

            print(f"Mean train_loss {np.mean(losses):.4f}")

        if cfg.distributed:
            torch.cuda.synchronize()

        if cfg.val:
            if (epoch + 1) % cfg.eval_epochs == 0 or (epoch + 1) == cfg.epochs:
                if cfg.distributed and cfg.eval_ddp:
                    val_score = run_eval(model, val_dataloader, cfg, pre="val", curr_epoch=epoch)
                else:
                    if cfg.local_rank == 0:
                        val_score = run_eval(model, val_dataloader, cfg, pre="val", curr_epoch=epoch)
                    else:
                        val_score = 0

        if cfg.test and (cfg.test_epochs > 0):
            if (epoch + 1) % cfg.test_epochs == 0:
                if not (epoch + 1) == cfg.epochs:
                    test_score = run_eval(model, test_dataloader, cfg, pre="test", curr_epoch=epoch)

        if cfg.train_val:
            if (epoch + 1) % cfg.eval_train_epochs == 0 or (epoch + 1) == cfg.epochs:
                if cfg.distributed and cfg.eval_ddp:
                    # NOTE: get_preds is not defined in the files included in this dump
                    _ = get_preds(model, train_val_dataloader, cfg, pre=cfg.pre_train_val)
                else:
                    if cfg.local_rank == 0:
                        _ = get_preds(model, train_val_dataloader, cfg, pre=cfg.pre_train_val)

        # if cfg.local_rank == 0:
        #     if val_loss < best_val_loss:
        #         print(f'SAVING CHECKPOINT: val_loss {best_val_loss:.5} -> {val_loss:.5}')
        #         checkpoint = create_checkpoint(model,
        #             optimizer,
        #             epoch,
        #             scheduler=scheduler,
        #             scaler=scaler)
        #         torch.save(checkpoint, f"{cfg.output_dir}/fold{cfg.fold}/checkpoint_best_seed{cfg.seed}.pth")
        #         best_val_loss = val_loss

        if cfg.distributed:
            torch.distributed.barrier()

        if (cfg.local_rank == 0) and (cfg.epochs > 0) and (cfg.save_checkpoint):
            if not cfg.save_only_last_ckpt:
                checkpoint = create_checkpoint(cfg, model, optimizer, epoch, scheduler=scheduler, scaler=scaler)

                torch.save(
                    checkpoint, f"{cfg.output_dir}/fold{cfg.fold}/checkpoint_last_seed{cfg.seed}.pth"
                )

    if (cfg.local_rank == 0) and (cfg.epochs > 0) and (cfg.save_checkpoint):
        checkpoint = create_checkpoint(cfg, model, optimizer, epoch, scheduler=scheduler, scaler=scaler)

        torch.save(
            checkpoint, f"{cfg.output_dir}/fold{cfg.fold}/checkpoint_last_seed{cfg.seed}.pth"
        )

    if cfg.test:
        test_score = run_eval(model, test_dataloader, cfg, pre="test", curr_epoch=epoch)
        # print('test_score: ', test_score)


    return val_score
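# The accumulation/clipping order in the mixed-precision branch above matters:
# gradients produced under GradScaler are scaled, so they must be unscaled before
# clipping or the clip threshold is applied to the wrong magnitude. Distilled
# (names bound as in train(); a sketch of the pattern, not a drop-in replacement):
#
#   scaler.scale(loss / grad_accumulation).backward()
#   if i % grad_accumulation == 0:
#       scaler.unscale_(optimizer)                                   # recover true gradients
#       torch.nn.utils.clip_grad_norm_(model.parameters(), clip)     # clip the true norm
#       scaler.step(optimizer)                                       # skips the step on inf/nan
#       scaler.update()
#       optimizer.zero_grad()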
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="")

    parser.add_argument("-C", "--config", help="config filename")
    parser.add_argument("-G", "--gpu_id", default="", help="GPU ID")
    parser_args, other_args = parser.parse_known_args(sys.argv)
    cfg = copy(importlib.import_module(parser_args.config).cfg)
    if parser_args.gpu_id != "":
        os.environ['CUDA_VISIBLE_DEVICES'] = str(parser_args.gpu_id)

    # overwrite params in config with additional args
    if len(other_args) > 1:
        other_args = {k.replace('-', ''): v for k, v in zip(other_args[1::2], other_args[2::2])}

        for key in other_args:
            if key in cfg.__dict__:
                print(f'overwriting cfg.{key}: {cfg.__dict__[key]} -> {other_args[key]}')
                cfg_type = type(cfg.__dict__[key])
                if cfg_type == bool:
                    cfg.__dict__[key] = other_args[key] == 'True'
                elif cfg_type == type(None):
                    cfg.__dict__[key] = other_args[key]
                else:
                    cfg.__dict__[key] = cfg_type(other_args[key])

    os.makedirs(str(cfg.output_dir + f"/fold{cfg.fold}/"), exist_ok=True)

    cfg.CustomDataset = importlib.import_module(cfg.dataset).CustomDataset
    cfg.tr_collate_fn = importlib.import_module(cfg.dataset).tr_collate_fn
    cfg.val_collate_fn = importlib.import_module(cfg.dataset).val_collate_fn
    cfg.batch_to_device = importlib.import_module(cfg.dataset).batch_to_device

    cfg.post_process_pipeline = importlib.import_module(cfg.post_process_pipeline).post_process_pipeline
    cfg.calc_metric = importlib.import_module(cfg.metric).calc_metric

    result = train(cfg)
    print(result)
--------------------------------------------------------------------------------
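Usage, inferred from the __main__ block of train.py above (cfg_1 is one of the configs listed in this repo; the override values are examples):

    python train.py -C cfg_1 -G 0
    python train.py -C cfg_1 -lr 0.0005 -epochs 10 -fold 0

Any extra -key value pairs are matched against attributes of the loaded cfg and cast to the attribute's existing type; booleans are parsed by string comparison, so pass e.g. -mixup True or -mixup False. Distributed mode is keyed off the WORLD_SIZE and LOCAL_RANK environment variables, so multi-GPU runs can be launched with a launcher that sets them, e.g. torchrun --nproc_per_node=2 train.py -C cfg_1.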