├── .DS_Store ├── .gitignore ├── README.md ├── feats └── .gitkeep ├── step1_train_base_model ├── .gitkeep ├── data │ ├── .DS_Store │ ├── AVVP_dataset_full.csv │ ├── AVVP_eval_audio.csv │ ├── AVVP_eval_visual.csv │ ├── AVVP_test_pd.csv │ ├── AVVP_train.csv │ └── AVVP_val_pd.csv ├── dataloader.py ├── main_avvp.py ├── nets │ ├── __init__.py │ ├── models_utils.py │ └── net_audiovisual.py ├── run.sh ├── scripts │ ├── download_dataset.py │ ├── extract_3D_feat.py │ ├── extract_audio.py │ ├── extract_frames.py │ ├── extract_rgb_feat.py │ ├── transforms.py │ └── utils.py └── utils │ ├── __init__.py │ └── eval_metrics.py ├── step2_find_exchange ├── .gitkeep ├── data │ ├── .DS_Store │ ├── AVVP_dataset_full.csv │ ├── AVVP_eval_audio.csv │ ├── AVVP_eval_visual.csv │ ├── AVVP_test_pd.csv │ ├── AVVP_train.csv │ └── AVVP_val_pd.csv ├── dataloader.py ├── main_avvp.py ├── nets │ ├── __init__.py │ ├── models_utils.py │ └── net_audiovisual.py ├── run.sh ├── scripts │ ├── download_dataset.py │ ├── extract_3D_feat.py │ ├── extract_audio.py │ ├── extract_frames.py │ ├── extract_rgb_feat.py │ ├── transforms.py │ └── utils.py └── utils │ ├── __init__.py │ └── eval_metrics.py ├── step3_retrain ├── .gitkeep ├── data │ ├── .DS_Store │ ├── AVVP_dataset_full.csv │ ├── AVVP_eval_audio.csv │ ├── AVVP_eval_visual.csv │ ├── AVVP_test_pd.csv │ ├── AVVP_train.csv │ └── AVVP_val_pd.csv ├── dataloader.py ├── main_avvp.py ├── need_to_change.pkl ├── nets │ ├── __init__.py │ ├── models_utils.py │ └── net_audiovisual.py ├── run.sh ├── scripts │ ├── download_dataset.py │ ├── extract_3D_feat.py │ ├── extract_audio.py │ ├── extract_frames.py │ ├── extract_rgb_feat.py │ ├── transforms.py │ └── utils.py └── utils │ ├── __init__.py │ └── eval_metrics.py └── task.png /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Exploring Heterogeneous Clues for Weakly Supervised Audio-Visual Video Parsing 2 | Code for CVPR 2021 paper [_Exploring Heterogeneous Clues for Weakly-Supervised Audio-Visual Video Parsing_](https://yu-wu.net/pdf/CVPR21_audio.pdf) 3 | 4 | 5 | ## The Audio-Visual Video Parsing task 6 | We aim at identifying the audible and visible events and their temporal location in videos. Note that the visual and audio events might be asynchronous. 7 |
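Concretely, each 10-second video is split into ten 1-second segments, and the goal is to predict, for every one of the 25 LLP event categories, the segments in which it is audible, visible, or both. The evaluation code (`eval()` in `main_avvp.py` together with `utils/eval_metrics.py`) represents labels and predictions as per-modality binary matrices of shape 25 classes × 10 segments, with audio-visual events taken as their element-wise product. A minimal sketch of that layout (illustrative only; the variable names below are not from this repo):

```python
import numpy as np

# One row per event category (25 classes), one column per 1-second segment (10 segments).
pred_audio  = np.zeros((25, 10), dtype=int)   # audible events
pred_visual = np.zeros((25, 10), dtype=int)   # visible events

# e.g. category 0 ("Speech"): audible during seconds 2-5, visible during seconds 4-7.
pred_audio[0, 2:6]  = 1
pred_visual[0, 4:8] = 1

# Audio-visual events: segments where the category is both audible and visible
# (element-wise product, as done for SO_av / GT_av in the evaluation code).
pred_audiovisual = pred_audio * pred_visual
```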
8 | 9 | 10 | ## Prepare data 11 | Please refer to https://github.com/YapengTian/AVVP-ECCV20 for downloading the LLP Dataset and the preprocessed audio and visual features. 12 | Put the downloaded `r2plus1d_18`, `res152`, `vggish` features into the `feats` folder. 13 | 14 | 15 | ## Training pipeline 16 | The training includes three stages. 17 | 18 | ### Train a base model 19 | We first train a base model using MIL and our proposed contrastive learning. 20 | ```shell 21 | cd step1_train_base_model 22 | python main_avvp.py --mode train --audio_dir ../feats/vggish/ --video_dir ../feats/res152/ --st_dir ../feats/r2plus1d_18 23 | ``` 24 | 25 | 26 | ### Generate modality-aware labels 27 | We then freeze the trained model and evaluate each video by swapping its audio and visual tracks with other unrelated videos. 28 | ```shell 29 | cd step2_find_exchange 30 | python main_avvp.py --mode estimate_labels --audio_dir ../feats/vggish/ --video_dir ../feats/res152/ --st_dir ../feats/r2plus1d_18 --model_save_dir ../step1_train_base_model/models/ 31 | ``` 32 | 33 | ### Re-train using modality-aware labels 34 | We then re-train the model from scratch using modality-aware labels. 35 | ```shell 36 | cd step3_retrain 37 | python main_avvp.py --mode retrain --audio_dir ../feats/vggish/ --video_dir ../feats/res152/ --st_dir ../feats/r2plus1d_18 38 | ``` 39 | 40 | 41 | 42 | 43 | ## Citation 44 | 45 | Please cite the following paper in your publications if it helps your research: 46 | 47 | 48 | @inproceedings{wu2021explore, 49 | title = {Exploring Heterogeneous Clues for Weakly-Supervised Audio-Visual Video Parsing}, 50 | author = {Wu, Yu and Yang, Yi}, 51 | booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 52 | year = {2021} 53 | 54 | } 55 | 56 | -------------------------------------------------------------------------------- /feats/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/feats/.gitkeep -------------------------------------------------------------------------------- /step1_train_base_model/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step1_train_base_model/.gitkeep -------------------------------------------------------------------------------- /step1_train_base_model/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step1_train_base_model/data/.DS_Store -------------------------------------------------------------------------------- /step1_train_base_model/dataloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import os 4 | from torch.utils.data import Dataset, DataLoader 5 | from torchvision import transforms, utils 6 | import pandas as pd 7 | import random 8 | import pickle as pkl 9 | 10 | categories = ['Speech', 'Car', 'Cheering', 'Dog', 'Cat', 'Frying_(food)', 11 | 'Basketball_bounce', 'Fire_alarm', 'Chainsaw', 'Cello', 'Banjo', 12 | 'Singing', 'Chicken_rooster', 'Violin_fiddle', 'Vacuum_cleaner', 13 | 'Baby_laughter', 'Accordion', 'Lawn_mower', 'Motorcycle', 'Helicopter', 14 
| 'Acoustic_guitar', 'Telephone_bell_ringing', 'Baby_cry_infant_cry', 'Blender', 15 | 'Clapping'] 16 | 17 | def ids_to_multinomial(ids): 18 | """ label encoding 19 | 20 | Returns: 21 | 1d array, multimonial representation, e.g. [1,0,1,0,0,...] 22 | """ 23 | id_to_idx = {id: index for index, id in enumerate(categories)} 24 | 25 | y = np.zeros(len(categories)) 26 | for id in ids: 27 | index = id_to_idx[id] 28 | y[index] = 1 29 | return y 30 | 31 | 32 | 33 | class LLP_dataset(Dataset): 34 | 35 | def __init__(self, label, audio_dir, video_dir, st_dir, train=None, transform=None): 36 | self.df = pd.read_csv(label, header=0, sep='\t') 37 | self.filenames = self.df["filename"] 38 | self.audio_dir = audio_dir 39 | self.video_dir = video_dir 40 | self.st_dir = st_dir 41 | self.transform = transform 42 | 43 | self.train = train 44 | 45 | labels_to_idx = {} 46 | for i in range(len(categories)): 47 | labels_to_idx[i] = [] 48 | 49 | for idx in range(len(self.filenames)): 50 | row = self.df.loc[idx, :] 51 | ids = row[-1].split(',') 52 | label = ids_to_multinomial(ids) 53 | 54 | if len(ids)==1: 55 | for c in range(25): 56 | if label[c] == 1: 57 | labels_to_idx[c].append(idx) 58 | 59 | self.labels_to_idx = labels_to_idx 60 | 61 | 62 | def __len__(self): 63 | return len(self.filenames) 64 | 65 | def __getitem__(self, idx): 66 | row = self.df.loc[idx, :] 67 | name = row[0][:11] 68 | audio = np.load(os.path.join(self.audio_dir, name + '.npy')) 69 | video_s = np.load(os.path.join(self.video_dir, name + '.npy')) 70 | video_st = np.load(os.path.join(self.st_dir, name + '.npy')) 71 | ids = row[-1].split(',') 72 | label = ids_to_multinomial(ids) 73 | 74 | # We move the label smooth from the main.py to dataloder.py 75 | # Origin position: https://github.com/YapengTian/AVVP-ECCV20/blob/master/main_avvp.py#L22 76 | v = 0.9 77 | pa = label 78 | pv = v * label + (1 - v) * 0.5 79 | 80 | sample = {'audio': audio, 'video_s': video_s, 'video_st': video_st, 'label': label, 'idx':np.array(idx), 81 | 'pa': pa, 'pv': pv} 82 | 83 | if self.transform: 84 | sample = self.transform(sample) 85 | 86 | return sample 87 | 88 | class ToTensor(object): 89 | 90 | def __call__(self, sample): 91 | if len(sample) == 2: 92 | audio = sample['audio'] 93 | label = sample['label'] 94 | return {'audio': torch.from_numpy(audio), 'label': torch.from_numpy(label)} 95 | else: 96 | audio = sample['audio'] 97 | video_s = sample['video_s'] 98 | video_st = sample['video_st'] 99 | label = sample['label'] 100 | pa = sample['pa'] 101 | pv = sample['pv'] 102 | return {'audio': torch.from_numpy(audio), 'video_s': torch.from_numpy(video_s), 103 | 'video_st': torch.from_numpy(video_st), 104 | 'pa':torch.from_numpy(pa), 'pv':torch.from_numpy(pv), 105 | 'label': torch.from_numpy(label), 'idx':torch.from_numpy(sample['idx'])} 106 | -------------------------------------------------------------------------------- /step1_train_base_model/main_avvp.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from dataloader import * 7 | from nets.net_audiovisual import MMIL_Net 8 | from utils.eval_metrics import segment_level, event_level 9 | import pandas as pd 10 | import pickle as pkl 11 | 12 | class LabelSmoothingLoss(nn.Module): 13 | def __init__(self, classes, smoothing=0.0, dim=-1): 14 | super(LabelSmoothingLoss, self).__init__() 15 | self.confidence = 1.0 - smoothing 16 | self.smoothing = 
smoothing 17 | self.cls = classes 18 | self.dim = dim 19 | 20 | def forward(self, pred, target): 21 | pred = pred.log_softmax(dim=self.dim) 22 | with torch.no_grad(): 23 | # true_dist = pred.data.clone() 24 | true_dist = torch.zeros_like(pred) 25 | true_dist.fill_(self.smoothing / (self.cls - 1)) 26 | true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence) 27 | return torch.mean(torch.sum(-true_dist * pred, dim=self.dim)) 28 | 29 | 30 | def train(args, model, train_loader, optimizer, criterion, epoch): 31 | model.train() 32 | criterion2 = LabelSmoothingLoss(10, smoothing=0.1) 33 | accs = [] 34 | num = 0 35 | 36 | for batch_idx, sample in enumerate(train_loader): 37 | audio, video, video_st, target = sample['audio'].to('cuda'), sample['video_s'].to('cuda'), sample['video_st'].to('cuda'), sample['label'].type(torch.FloatTensor).to('cuda') 38 | data_idx = sample['idx'] 39 | optimizer.zero_grad() 40 | output, a_prob, v_prob, _, sims, mask = model(audio, video, video_st) 41 | output.clamp_(min=1e-7, max=1 - 1e-7) 42 | a_prob.clamp_(min=1e-7, max=1 - 1e-7) 43 | v_prob.clamp_(min=1e-7, max=1 - 1e-7) 44 | 45 | Pa = sample['pa'].type(torch.FloatTensor).to('cuda') 46 | Pv = sample['pv'].type(torch.FloatTensor).to('cuda') 47 | 48 | b=audio.size(0) 49 | loss1 = criterion(a_prob, Pa) 50 | loss2 = criterion(v_prob, Pv) 51 | loss3 = criterion(output, target) 52 | 53 | loss4 = criterion2(sims, mask) 54 | loss = loss1 + loss2 + loss3 + loss4 55 | 56 | 57 | loss.backward() 58 | optimizer.step() 59 | if batch_idx % args.log_interval == 0: 60 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.3f}\tLoss: {:.3f}\tLoss: {:.3f}\tLoss: {:.3f}'.format( 61 | epoch, batch_idx * len(audio), len(train_loader.dataset), 62 | 100. * batch_idx / len(train_loader), loss1.item(), loss2.item(), loss3.item(), loss3.item())) 63 | 64 | 65 | def eval(model, val_loader, set): 66 | categories = ['Speech', 'Car', 'Cheering', 'Dog', 'Cat', 'Frying_(food)', 67 | 'Basketball_bounce', 'Fire_alarm', 'Chainsaw', 'Cello', 'Banjo', 68 | 'Singing', 'Chicken_rooster', 'Violin_fiddle', 'Vacuum_cleaner', 69 | 'Baby_laughter', 'Accordion', 'Lawn_mower', 'Motorcycle', 'Helicopter', 70 | 'Acoustic_guitar', 'Telephone_bell_ringing', 'Baby_cry_infant_cry', 'Blender', 71 | 'Clapping'] 72 | model.eval() 73 | 74 | # load annotations 75 | df = pd.read_csv(set, header=0, sep='\t') 76 | df_a = pd.read_csv("data/AVVP_eval_audio.csv", header=0, sep='\t') 77 | df_v = pd.read_csv("data/AVVP_eval_visual.csv", header=0, sep='\t') 78 | 79 | id_to_idx = {id: index for index, id in enumerate(categories)} 80 | F_seg_a = [] 81 | F_seg_v = [] 82 | F_seg = [] 83 | F_seg_av = [] 84 | F_event_a = [] 85 | F_event_v = [] 86 | F_event = [] 87 | F_event_av = [] 88 | 89 | with torch.no_grad(): 90 | for batch_idx, sample in enumerate(val_loader): 91 | audio, video, video_st, target = sample['audio'].to('cuda'), sample['video_s'].to('cuda'),sample['video_st'].to('cuda'), sample['label'].to('cuda') 92 | output, a_prob, v_prob, frame_prob, a, is_real = model(audio, video, video_st) 93 | o = (output.cpu().detach().numpy() >= 0.5).astype(np.int_) 94 | oa = (a_prob.cpu().detach().numpy() >= 0.5).astype(np.int_) 95 | ov = (v_prob.cpu().detach().numpy() >= 0.5).astype(np.int_) 96 | 97 | Pa = frame_prob[0, :, 0, :].cpu().detach().numpy() 98 | Pv = frame_prob[0, :, 1, :].cpu().detach().numpy() 99 | 100 | Pa = (Pa >= 0.5).astype(np.int_) * np.repeat(oa, repeats=10, axis=0) 101 | Pv = (Pv >= 0.5).astype(np.int_) * np.repeat(ov, repeats=10, axis=0) 102 | 103 | # extract audio 
GT labels 104 | GT_a = np.zeros((25, 10)) 105 | GT_v =np.zeros((25, 10)) 106 | GT_aa = np.zeros(25, dtype=np.int) 107 | GT_vv = np.zeros(25, dtype=np.int) 108 | 109 | df_vid_a = df_a.loc[df_a['filename'] == df.loc[batch_idx, :][0]] 110 | filenames = df_vid_a["filename"] 111 | events = df_vid_a["event_labels"] 112 | onsets = df_vid_a["onset"] 113 | offsets = df_vid_a["offset"] 114 | num = len(filenames) 115 | if num >0: 116 | for i in range(num): 117 | 118 | x1 = int(onsets[df_vid_a.index[i]]) 119 | x2 = int(offsets[df_vid_a.index[i]]) 120 | event = events[df_vid_a.index[i]] 121 | idx = id_to_idx[event] 122 | GT_a[idx, x1:x2] = 1 123 | GT_aa[idx] = 1 124 | 125 | # extract visual GT labels 126 | df_vid_v = df_v.loc[df_v['filename'] == df.loc[batch_idx, :][0]] 127 | filenames = df_vid_v["filename"] 128 | events = df_vid_v["event_labels"] 129 | onsets = df_vid_v["onset"] 130 | offsets = df_vid_v["offset"] 131 | num = len(filenames) 132 | if num > 0: 133 | for i in range(num): 134 | x1 = int(onsets[df_vid_v.index[i]]) 135 | x2 = int(offsets[df_vid_v.index[i]]) 136 | event = events[df_vid_v.index[i]] 137 | idx = id_to_idx[event] 138 | GT_v[idx, x1:x2] = 1 139 | GT_vv[idx]=1 140 | GT_av = GT_a * GT_v 141 | 142 | # obtain prediction matrices 143 | SO_a = np.transpose(Pa) 144 | SO_v = np.transpose(Pv) 145 | SO_av = SO_a * SO_v 146 | 147 | # segment-level F1 scores 148 | f_a, f_v, f, f_av = segment_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av) 149 | F_seg_a.append(f_a) 150 | F_seg_v.append(f_v) 151 | F_seg.append(f) 152 | F_seg_av.append(f_av) 153 | 154 | # event-level F1 scores 155 | f_a, f_v, f, f_av = event_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av) 156 | F_event_a.append(f_a) 157 | F_event_v.append(f_v) 158 | F_event.append(f) 159 | F_event_av.append(f_av) 160 | 161 | print("\n") 162 | print('Audio \t {:.1f} \t {:.1f}'.format( 100 * np.mean(np.array(F_seg_a)), 100 * np.mean(np.array(F_event_a)))) 163 | print('Visual \t {:.1f} \t {:.1f}'.format( 100 * np.mean(np.array(F_seg_v)), 100 * np.mean(np.array(F_event_v)))) 164 | print('AudVis \t {:.1f} \t {:.1f}'.format( 100 * np.mean(np.array(F_seg_av)), 100 * np.mean(np.array(F_event_av)))) 165 | 166 | avg_type = (100 * np.mean(np.array(F_seg_av))+100 * np.mean(np.array(F_seg_a))+100 * np.mean(np.array(F_seg_v)))/3. 167 | avg_event = 100 * np.mean(np.array(F_seg)) 168 | print('Segment-levelType@Avg. F1: {:.1f}'.format(avg_type)) 169 | print('Segment-level Event@Avg. F1: {:.1f}'.format( avg_event)) 170 | 171 | avg_type_event = (100 * np.mean(np.array(F_event_av)) + 100 * np.mean(np.array(F_event_a)) + 100 * np.mean( 172 | np.array(F_event_v))) / 3. 173 | avg_event_level = 100 * np.mean(np.array(F_event)) 174 | print('Event-level Type@Avg. F1: {:.1f}'.format( avg_type_event)) 175 | print('Event-level Event@Avg. 
F1: {:.1f}'.format(avg_event_level)) 176 | print("\n") 177 | return avg_type 178 | 179 | 180 | def main(): 181 | parser = argparse.ArgumentParser(description='PyTorch Implementation of Audio-Visual Video Parsing') 182 | parser.add_argument( 183 | "--audio_dir", type=str, default='data/feats/vggish/', help="audio dir") 184 | parser.add_argument( 185 | "--video_dir", type=str, default='data/feats/res152/', 186 | help="video dir") 187 | parser.add_argument( 188 | "--st_dir", type=str, default='data/feats/r2plus1d_18/', 189 | help="video dir") 190 | parser.add_argument( 191 | "--label_train", type=str, default="data/AVVP_train.csv", help="weak train csv file") 192 | parser.add_argument( 193 | "--label_val", type=str, default="data/AVVP_val_pd.csv", help="weak val csv file") 194 | parser.add_argument( 195 | "--label_test", type=str, default="data/AVVP_test_pd.csv", help="weak test csv file") 196 | parser.add_argument('--batch-size', type=int, default=16, metavar='N', 197 | help='input batch size for training (default: 16)') 198 | parser.add_argument('--epochs', type=int, default=12, metavar='N', 199 | help='number of epochs to train (default: 60)') 200 | parser.add_argument('--lr', type=float, default=3e-4, metavar='LR', 201 | help='learning rate (default: 3e-4)') 202 | parser.add_argument( 203 | "--model", type=str, default='MMIL_Net', help="with model to use") 204 | parser.add_argument( 205 | "--mode", type=str, default='train', help="with mode to use") 206 | parser.add_argument('--seed', type=int, default=1, metavar='S', 207 | help='random seed (default: 1)') 208 | parser.add_argument('--log-interval', type=int, default=50, metavar='N', 209 | help='how many batches to wait before logging training status') 210 | parser.add_argument( 211 | "--model_save_dir", type=str, default='models/', help="model save dir") 212 | parser.add_argument( 213 | "--checkpoint", type=str, default='MMIL_Net', 214 | help="save model name") 215 | parser.add_argument( 216 | '--gpu', type=str, default='0', help='gpu device number') 217 | args = parser.parse_args() 218 | 219 | torch.manual_seed(args.seed) 220 | 221 | model = MMIL_Net().to('cuda') 222 | 223 | if args.mode == 'train': 224 | train_dataset = LLP_dataset(train=True, label=args.label_train, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ToTensor()])) 225 | val_dataset = LLP_dataset(label=args.label_val, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ToTensor()])) 226 | test_dataset = LLP_dataset(train=False, label=args.label_test, audio_dir=args.audio_dir, video_dir=args.video_dir, 227 | st_dir=args.st_dir, transform=transforms.Compose([ToTensor()])) 228 | 229 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=12, pin_memory = True) 230 | val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory = True) 231 | test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True) 232 | 233 | optimizer = optim.Adam(model.parameters(), lr=args.lr) 234 | scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) 235 | criterion = nn.BCELoss() 236 | best_F = 0 237 | 238 | for epoch in range(1, args.epochs + 1): 239 | train(args, model, train_loader, optimizer, criterion, epoch=epoch) 240 | scheduler.step() 241 | F = eval(model, val_loader, args.label_val) 242 | 243 | torch.save(model.state_dict(), args.model_save_dir + 
args.checkpoint + ".pt") 244 | 245 | elif args.mode == 'val': 246 | test_dataset = LLP_dataset(label=args.label_val, audio_dir=args.audio_dir, video_dir=args.video_dir, 247 | st_dir=args.st_dir, transform=transforms.Compose([ 248 | ToTensor()])) 249 | test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True) 250 | model.load_state_dict(torch.load(args.model_save_dir + args.checkpoint + ".pt")) 251 | eval(model, test_loader, args.label_val) 252 | else: 253 | test_dataset = LLP_dataset(label=args.label_test, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ 254 | ToTensor()])) 255 | test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True) 256 | model.load_state_dict(torch.load(args.model_save_dir + args.checkpoint + ".pt")) 257 | eval(model, test_loader, args.label_test) 258 | if __name__ == '__main__': 259 | main() 260 | -------------------------------------------------------------------------------- /step1_train_base_model/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step1_train_base_model/nets/__init__.py -------------------------------------------------------------------------------- /step1_train_base_model/nets/models_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.autograd as autograd 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.autograd import Variable 7 | from torch.nn import init 8 | 9 | import math 10 | import numpy as np 11 | 12 | 13 | 14 | 15 | 16 | class ScaledDotProductAttention(nn.Module): 17 | ''' Scaled Dot-Product Attention ''' 18 | 19 | def __init__(self, temperature, attn_dropout=0.1): 20 | super().__init__() 21 | self.temperature = temperature 22 | self.dropout = nn.Dropout(attn_dropout) 23 | self.softmax = nn.Softmax(dim=2) 24 | 25 | def forward(self, q, k, v): 26 | 27 | attn = torch.bmm(q, k.transpose(1, 2)) 28 | attn = attn / self.temperature 29 | 30 | attn = self.softmax(attn) 31 | attn = self.dropout(attn) 32 | output = torch.bmm(attn, v) 33 | 34 | return output, attn 35 | 36 | 37 | class LayerNorm(nn.Module): 38 | 39 | def __init__(self, features, eps=1e-6): 40 | super().__init__() 41 | self.gamma = nn.Parameter(torch.ones(features)) 42 | self.beta = nn.Parameter(torch.zeros(features)) 43 | self.eps = eps 44 | 45 | def forward(self, x): 46 | mean = x.mean(-1, keepdim=True) 47 | std = x.std(-1, keepdim=True) 48 | return self.gamma * (x - mean) / (std + self.eps) + self.beta 49 | 50 | 51 | class PositionalEncoding2(nn.Module): 52 | "Implement the PE function." 53 | def __init__(self, d_model, dropout, max_len=500): 54 | super(PositionalEncoding, self).__init__() 55 | self.dropout = nn.Dropout(p=dropout) 56 | 57 | # Compute the positional encodings once in log space. 
58 | pe = torch.zeros(max_len, d_model) 59 | position = torch.arange(0., max_len).unsqueeze(1) 60 | div_term = torch.exp(torch.arange(0., d_model, 2) * 61 | -(math.log(10000.0) / d_model)) 62 | pe[:, 0::2] = torch.sin(position * div_term) 63 | pe[:, 1::2] = torch.cos(position * div_term) 64 | pe = pe.unsqueeze(0) 65 | self.register_buffer('pe', pe) 66 | 67 | def forward(self, x): 68 | with torch.no_grad(): 69 | x1 = self.pe[:, :x.size(1)] 70 | x = x + x1 71 | #x = x + Variable(self.pe[:, :x.size(1)], 72 | # requires_grad=False) 73 | return self.dropout(x) 74 | 75 | class MultiHeadAttention2(nn.Module): 76 | ''' Multi-Head Attention module ''' 77 | 78 | def __init__(self, d_model, d_k, d_v, n_head=1, dropout=0.5): 79 | super().__init__() 80 | 81 | self.n_head = n_head 82 | self.d_k = d_k 83 | self.d_v = d_v 84 | 85 | self.w_qs = nn.Linear(d_model, n_head * d_k) 86 | self.w_ks = nn.Linear(d_model, n_head * d_k) 87 | self.w_vs = nn.Linear(d_model, n_head * d_v) 88 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 89 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 90 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) 91 | 92 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 93 | self.layer_norm = LayerNorm(d_model) 94 | 95 | self.fc = nn.Linear(n_head * d_v, d_model) 96 | nn.init.xavier_normal_(self.fc.weight) 97 | 98 | self.dropout = nn.Dropout(dropout) 99 | 100 | 101 | def forward(self, q, k, v): 102 | 103 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 104 | 105 | sz_b, len_q, _ = q.size() 106 | sz_b, len_k, _ = k.size() 107 | sz_b, len_v, _ = v.size() 108 | 109 | residual = q 110 | 111 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 112 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 113 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 114 | 115 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 116 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 117 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 118 | 119 | output, attn = self.attention(q, k, v) 120 | 121 | output = output.view(n_head, sz_b, len_q, d_v) 122 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) 123 | 124 | output = self.dropout(self.fc(output)) 125 | output = self.layer_norm(output + residual) 126 | 127 | return output, attn 128 | 129 | 130 | 131 | 132 | 133 | 134 | class ContrastiveLoss(torch.nn.Module): 135 | """ 136 | Contrastive loss function. 137 | Based on: 138 | """ 139 | def __init__(self, margin=2.0): 140 | super(ContrastiveLoss, self).__init__() 141 | self.margin = margin 142 | 143 | def forward(self, dist, y): 144 | # euclidian distance 145 | dist_sq = torch.pow(dist, 2) 146 | dist = torch.clamp(self.margin - dist, min=0.0) 147 | 148 | 149 | assert len(y.data.shape) == 2, y.data.shape 150 | bs, time = y.data.shape 151 | y = y.view(-1) 152 | 153 | loss = y * dist_sq + (1 - y) * torch.pow(dist, 2) 154 | loss = torch.mean(loss) 155 | return loss 156 | 157 | 158 | 159 | 160 | 161 | class BinaryFocalLoss(nn.Module): 162 | """ 163 | This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 164 | 'Focal Loss for Dense Object Detection. 
(https://arxiv.org/abs/1708.02002)' 165 | Focal_Loss= -1*alpha*(1-pt)*log(pt) 166 | :param num_class: 167 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 168 | :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more 169 | focus on hard misclassified example 170 | :param reduction: `none`|`mean`|`sum` 171 | :param **kwargs 172 | balance_index: (int) balance class index, should be specific when alpha is float 173 | """ 174 | 175 | def __init__(self, alpha=[1.0, 1.0], gamma=2, ignore_index=None, reduction='mean'): 176 | super(BinaryFocalLoss, self).__init__() 177 | if alpha is None: 178 | alpha = [0.25, 0.75] 179 | self.alpha = alpha 180 | self.gamma = gamma 181 | self.smooth = 1e-6 182 | self.ignore_index = ignore_index 183 | self.reduction = reduction 184 | 185 | assert self.reduction in ['none', 'mean', 'sum'] 186 | 187 | if self.alpha is None: 188 | self.alpha = torch.ones(2) 189 | elif isinstance(self.alpha, (list, np.ndarray)): 190 | self.alpha = np.asarray(self.alpha) 191 | self.alpha = np.reshape(self.alpha, (2)) 192 | assert self.alpha.shape[0] == 2, \ 193 | 'the `alpha` shape is not match the number of class' 194 | elif isinstance(self.alpha, (float, int)): 195 | self.alpha = np.asarray([self.alpha, 1.0 - self.alpha], dtype=np.float).view(2) 196 | 197 | else: 198 | raise TypeError('{} not supported'.format(type(self.alpha))) 199 | 200 | def forward(self, output, target): 201 | prob = torch.sigmoid(output) 202 | prob = torch.clamp(prob, self.smooth, 1.0 - self.smooth) 203 | 204 | pos_mask = (target == 1).float() 205 | neg_mask = (target == 0).float() 206 | 207 | pos_loss = -self.alpha[0] * torch.pow(torch.sub(1.0, prob), self.gamma) * torch.log(prob) * pos_mask 208 | neg_loss = -self.alpha[1] * torch.pow(prob, self.gamma) * \ 209 | torch.log(torch.sub(1.0, prob)) * neg_mask 210 | 211 | neg_loss = neg_loss.sum() 212 | pos_loss = pos_loss.sum() 213 | num_pos = pos_mask.view(pos_mask.size(0), -1).sum() 214 | num_neg = neg_mask.view(neg_mask.size(0), -1).sum() 215 | 216 | if num_pos == 0: 217 | loss = neg_loss 218 | else: 219 | loss = pos_loss / num_pos + neg_loss / num_neg 220 | return loss 221 | 222 | 223 | 224 | class FocalLoss_Ori(nn.Module): 225 | """ 226 | This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 227 | 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' 228 | Focal_Loss= -1*alpha*(1-pt)*log(pt) 229 | :param num_class: 230 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 231 | :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more 232 | focus on hard misclassified example 233 | :param smooth: (float,double) smooth value when cross entropy 234 | :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch. 235 | """ 236 | 237 | def __init__(self, num_class, alpha=[0.25,0.75], gamma=2, balance_index=-1, size_average=True): 238 | super(FocalLoss_Ori, self).__init__() 239 | self.num_class = num_class 240 | self.alpha = alpha 241 | self.gamma = gamma 242 | self.size_average = size_average 243 | self.eps = 1e-6 244 | 245 | 246 | def forward(self, logit, target): 247 | 248 | if logit.dim() > 2: 249 | # N,C,d1,d2 -> N,C,m (m=d1*d2*...) 250 | logit = logit.view(logit.size(0), logit.size(1), -1) 251 | logit = logit.transpose(1, 2).contiguous() # [N,C,d1*d2..] 
-> [N,d1*d2..,C] 252 | logit = logit.view(-1, logit.size(-1)) # [N,d1*d2..,C]-> [N*d1*d2..,C] 253 | target = target.view(-1, 1) # [N,d1,d2,...]->[N*d1*d2*...,1] 254 | 255 | # -----------legacy way------------ 256 | # idx = target.cpu().long() 257 | # one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() 258 | # one_hot_key = one_hot_key.scatter_(1, idx, 1) 259 | # if one_hot_key.device != logit.device: 260 | # one_hot_key = one_hot_key.to(logit.device) 261 | # pt = (one_hot_key * logit).sum(1) + epsilon 262 | 263 | # ----------memory saving way-------- 264 | pt = logit.gather(1, target).view(-1) + self.eps # avoid apply 265 | logpt = pt.log() 266 | 267 | loss = -1 * torch.pow(torch.add(0.5, pt), self.gamma) * logpt 268 | 269 | if self.size_average: 270 | loss = loss.mean() 271 | else: 272 | loss = loss.sum() 273 | return loss 274 | 275 | 276 | -------------------------------------------------------------------------------- /step1_train_base_model/nets/net_audiovisual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy 6 | import copy 7 | import math 8 | from .models_utils import MultiHeadAttention2, PositionalEncoding2 9 | 10 | def _get_clones(module, N): 11 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 12 | 13 | class Encoder(nn.Module): 14 | 15 | def __init__(self, encoder_layer, num_layers, norm=None): 16 | super(Encoder, self).__init__() 17 | self.layers = _get_clones(encoder_layer, num_layers) 18 | self.num_layers = num_layers 19 | self.norm1 = nn.LayerNorm(512) 20 | self.norm2 = nn.LayerNorm(512) 21 | self.norm = norm 22 | 23 | def forward(self, src_a, src_v, mask=None, src_key_padding_mask=None): 24 | output_a = src_a 25 | output_v = src_v 26 | 27 | for i in range(self.num_layers): 28 | output_a = self.layers[i](src_a, src_v, src_mask=mask, 29 | src_key_padding_mask=src_key_padding_mask) 30 | output_v = self.layers[i](src_v, src_a, src_mask=mask, 31 | src_key_padding_mask=src_key_padding_mask) 32 | 33 | if self.norm: 34 | output_a = self.norm1(output_a) 35 | output_v = self.norm2(output_v) 36 | 37 | return output_a, output_v 38 | 39 | class HANLayer(nn.Module): 40 | 41 | def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1): 42 | super(HANLayer, self).__init__() 43 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 44 | self.cm_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 45 | 46 | # Implementation of Feedforward model 47 | self.linear1 = nn.Linear(d_model, dim_feedforward) 48 | self.dropout = nn.Dropout(dropout) 49 | self.linear2 = nn.Linear(dim_feedforward, d_model) 50 | 51 | self.norm1 = nn.LayerNorm(d_model) 52 | self.norm2 = nn.LayerNorm(d_model) 53 | self.dropout11 = nn.Dropout(dropout) 54 | self.dropout12 = nn.Dropout(dropout) 55 | self.dropout2 = nn.Dropout(dropout) 56 | 57 | self.activation = nn.ReLU() 58 | 59 | def forward(self, src_q, src_v, src_mask=None, src_key_padding_mask=None): 60 | """Pass the input through the encoder layer. 61 | 62 | Args: 63 | src: the sequnce to the encoder layer (required). 64 | src_mask: the mask for the src sequence (optional). 65 | src_key_padding_mask: the mask for the src keys per batch (optional). 66 | 67 | Shape: 68 | see the docs in Transformer class. 
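        Note: in this layer src_q attends to the other modality src_v through a
        separate cross-modal attention (cm_attn) in addition to its own
        self-attention (self_attn); both attention outputs are added to the
        residual stream before the LayerNorm and feed-forward sublayers.
        Inputs are (batch, time, dim) and are permuted to (time, batch, dim)
        internally for nn.MultiheadAttention, then permuted back on return.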
69 | """ 70 | src_q = src_q.permute(1, 0, 2) 71 | src_v = src_v.permute(1, 0, 2) 72 | 73 | src1 = self.cm_attn(src_q, src_v, src_v, attn_mask=src_mask, 74 | key_padding_mask=src_key_padding_mask)[0] 75 | src2 = self.self_attn(src_q, src_q, src_q, attn_mask=src_mask, 76 | key_padding_mask=src_key_padding_mask)[0] 77 | 78 | src_q = src_q + self.dropout11(src1) + self.dropout12(src2) 79 | src_q = self.norm1(src_q) 80 | 81 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q)))) 82 | src_q = src_q + self.dropout2(src2) 83 | src_q = self.norm2(src_q) 84 | return src_q.permute(1, 0, 2) 85 | 86 | 87 | 88 | class MMIL_Net(nn.Module): 89 | 90 | def __init__(self): 91 | super(MMIL_Net, self).__init__() 92 | 93 | self.fc_prob = nn.Linear(512, 25) 94 | self.fc_frame_att = nn.Linear(512, 25) 95 | self.fc_av_att = nn.Linear(512, 25) 96 | self.fc_a = nn.Linear(128, 512) 97 | self.fc_v = nn.Linear(2048, 512) 98 | self.fc_st = nn.Linear(512, 512) 99 | self.fc_fusion = nn.Linear(1024, 512) 100 | self.audio_encoder = nn.TransformerEncoder \ 101 | (nn.TransformerEncoderLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 102 | self.visual_encoder = nn.TransformerEncoder \ 103 | (nn.TransformerEncoderLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 104 | self.cmt_encoder = Encoder(CMTLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 105 | self.hat_encoder = Encoder(HANLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 106 | 107 | self.t_att = MultiHeadAttention2(512, 512, 512) 108 | self.t_att2 = MultiHeadAttention2(512, 512, 512) 109 | self.fc1= nn.Linear(1024, 256) 110 | self.fc2= nn.Linear(256, 2) 111 | 112 | def forward(self, audio, visual, visual_st): 113 | 114 | b, t, d = visual_st.size() 115 | x1 = self.fc_a(audio) 116 | x_audio = x1 117 | 118 | # 2d and 3d visual feature fusion (b, 80, 2048), (b, 10, 512) 119 | 120 | # merge (b, 80, 2048) -> (b, 10, 512) 121 | vid_s = self.fc_v(visual).permute(0, 2, 1).unsqueeze(-1) 122 | vid_s = F.avg_pool2d(vid_s, (8, 1)).squeeze(-1).permute(0, 2, 1) 123 | 124 | vid_st = self.fc_st(visual_st) 125 | x2 = torch.cat((vid_s, vid_st), dim =-1) 126 | x2 = self.fc_fusion(x2) 127 | x_visual = x2 128 | 129 | # HAN 130 | x1, x2 = self.hat_encoder(x1, x2) 131 | sims = 1 132 | 133 | xx1 = F.normalize(x_visual, p=2, dim=-1) 134 | xx2 = F.normalize(x1, p=2, dim=-1) 135 | 136 | sims = xx2.bmm(xx1.permute(0, 2, 1)).squeeze(1) / 0.2 137 | sims = sims.reshape(-1, 10) 138 | 139 | mask = torch.zeros(b, 10) 140 | mask = mask.long() 141 | for i in range(10): 142 | mask[:, i] = i 143 | mask = mask.cuda() 144 | mask = mask.reshape(-1) 145 | # prediction 146 | x = torch.cat([x1.unsqueeze(-2), x2.unsqueeze(-2)], dim=-2) 147 | frame_prob = torch.sigmoid(self.fc_prob(x)) 148 | 149 | # attentive MMIL pooling 150 | 151 | frame_att = torch.softmax(self.fc_frame_att(x), dim=1) 152 | av_att = torch.softmax(self.fc_av_att(x), dim=2) 153 | temporal_prob = (frame_att * frame_prob) 154 | global_prob = (temporal_prob*av_att).sum(dim=2).sum(dim=1) 155 | 156 | a_prob = temporal_prob[:, :, 0, :].sum(dim=1) 157 | v_prob =temporal_prob[:, :, 1, :].sum(dim=1) 158 | 159 | return global_prob, a_prob, v_prob, frame_prob, sims, mask 160 | 161 | 162 | class CMTLayer(nn.Module): 163 | 164 | def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1): 165 | super(CMTLayer, self).__init__() 166 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 167 | # Implementation of Feedforward model 168 | self.linear1 = nn.Linear(d_model, 
dim_feedforward) 169 | self.dropout = nn.Dropout(dropout) 170 | self.linear2 = nn.Linear(dim_feedforward, d_model) 171 | 172 | self.norm1 = nn.LayerNorm(d_model) 173 | self.norm2 = nn.LayerNorm(d_model) 174 | self.dropout1 = nn.Dropout(dropout) 175 | self.dropout2 = nn.Dropout(dropout) 176 | 177 | self.activation = nn.ReLU() 178 | 179 | def forward(self, src_q, src_v, src_mask=None, src_key_padding_mask=None): 180 | r"""Pass the input through the encoder layer. 181 | 182 | Args: 183 | src: the sequnce to the encoder layer (required). 184 | src_mask: the mask for the src sequence (optional). 185 | src_key_padding_mask: the mask for the src keys per batch (optional). 186 | 187 | Shape: 188 | see the docs in Transformer class. 189 | """ 190 | src2 = self.self_attn(src_q, src_v, src_v, attn_mask=src_mask, 191 | key_padding_mask=src_key_padding_mask)[0] 192 | src_q = src_q + self.dropout1(src2) 193 | src_q = self.norm1(src_q) 194 | 195 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q)))) 196 | src_q = src_q + self.dropout2(src2) 197 | src_q = self.norm2(src_q) 198 | return src_q 199 | -------------------------------------------------------------------------------- /step1_train_base_model/run.sh: -------------------------------------------------------------------------------- 1 | #CUDA_VISIBLE_DEVICES=1 . 2 | python main_avvp.py --mode train --audio_dir ../feats/vggish/ --video_dir ../feats/res152/ --st_dir ../feats/r2plus1d_18 3 | -------------------------------------------------------------------------------- /step1_train_base_model/scripts/download_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | def download(set, name, t_seg): 5 | #label = label.replace(" ", "_") # avoid space in folder name 6 | path_data = os.path.join(set, "video") 7 | print(path_data) 8 | if not os.path.exists(path_data): 9 | os.makedirs(path_data) 10 | link_prefix = "https://www.youtube.com/watch?v=" 11 | 12 | filename_full_video = os.path.join(path_data, name) + "_full_video.mp4" 13 | filename = os.path.join(path_data, name) + ".mp4" 14 | link = link_prefix + name 15 | 16 | if os.path.exists(filename): 17 | print("already exists, skip") 18 | return 19 | 20 | print( "download the whole video for: [%s] - [%s]" % (set, name)) 21 | command1 = 'youtube-dl --ignore-config ' 22 | command1 += link + " " 23 | command1 += "-o " + filename_full_video + " " 24 | command1 += "-f best " 25 | 26 | #command1 += '-q ' # print no log 27 | #print command1 28 | os.system(command1) 29 | 30 | t_start, t_end = t_seg 31 | t_dur = t_end - t_start 32 | print("trim the video to [%.1f-%.1f]" % (t_start, t_end)) 33 | command2 = 'ffmpeg ' 34 | command2 += '-ss ' 35 | command2 += str(t_start) + ' ' 36 | command2 += '-i ' 37 | command2 += filename_full_video + ' ' 38 | command2 += '-t ' 39 | command2 += str(t_dur) + ' ' 40 | command2 += '-vcodec libx264 ' 41 | command2 += '-acodec aac -strict -2 ' 42 | command2 += filename + ' ' 43 | command2 += '-y ' # overwrite without asking 44 | command2 += '-loglevel -8 ' # print no log 45 | #print(command2) 46 | os.system(command2) 47 | try: 48 | os.remove(filename_full_video) 49 | except: 50 | return 51 | 52 | print ("finish the video as: " + filename) 53 | 54 | 55 | ##%% read the label encoding 56 | # filename = "../doc/class_labels_indices.csv" 57 | # lines = [x.strip() for x in open(filename, 'r')][1:] 58 | # label_encode = {} 59 | # for l in lines: 60 | # l = l[l.find(",")+1:] 61 | # encode = 
l.split(",")[0] 62 | # label_encode[ l[len(encode)+2:-1] ] = encode 63 | # 64 | # 65 | # 66 | 67 | # %% read the video trim time indices 68 | filename_source = "data/AVVP_dataset_full.csv" # 69 | set = "data/LLP_dataset" 70 | df = pd.read_csv(filename_source, header=0, sep='\t') 71 | filenames = df["filename"] 72 | length = len(filenames) 73 | print(length) 74 | names = [] 75 | segments = {} 76 | for i in range(length): 77 | row = df.loc[i, :] 78 | name = row[0][:11] 79 | steps = row[0][11:].split("_") 80 | t_start = float(steps[1]) 81 | t_end = t_start + 10 82 | segments[name] = (t_start, t_end) 83 | download(set, name, segments[name]) 84 | names.append(name) 85 | print(len(segments)) 86 | 87 | -------------------------------------------------------------------------------- /step1_train_base_model/scripts/extract_3D_feat.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import subprocess 4 | import glob 5 | from tqdm import tqdm 6 | import numpy as np 7 | import os 8 | import argparse 9 | from PIL import Image 10 | import torch 11 | from torch import nn 12 | import torch.nn.functional as F 13 | import torchvision.models as models 14 | import transforms as TF 15 | import utils 16 | import torchvision 17 | 18 | 19 | C, H, W = 3, 112, 112 20 | 21 | def extract_feats(params, model, load_img): 22 | global C, H, W 23 | model.eval() 24 | dir_fc = os.path.join(os.getcwd(), params['output_dir']) 25 | if not os.path.isdir(dir_fc): 26 | os.mkdir(dir_fc) 27 | 28 | video_list = os.listdir(params['video_path']) 29 | nn = 0 30 | for video in video_list: 31 | 32 | nn = nn + 1 33 | dst = video 34 | 35 | image_list = sorted(glob.glob(os.path.join(params['video_path'], dst, '*.jpg'))) 36 | samples = np.round(np.linspace( 37 | 0, len(image_list) - 1, params['n_frame_steps'])) 38 | 39 | image_list = [image_list[int(sample)] for sample in samples] 40 | images = torch.zeros((len(image_list)//8, C, 8, H, W)) 41 | i = 0 42 | for iImg in range(len(image_list)): 43 | 44 | ii = i//8 45 | img = load_img(image_list[iImg]) 46 | images[ii, :, i%8, :, :] = img 47 | i += 1 48 | 49 | with torch.no_grad(): 50 | fc_feats = model(images.cuda()).squeeze() 51 | img_feats = fc_feats.cpu().numpy() 52 | # Save the inception features 53 | outfile = os.path.join(dir_fc, video + '.npy') 54 | np.save(outfile, img_feats) 55 | # cleanup 56 | #shutil.rmtree(dst) 57 | print(nn) 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument("--gpu", dest='gpu', type=str, default='1', 63 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 64 | parser.add_argument("--output_dir", dest='output_dir', type=str, 65 | default='data/LLP_dataset/feats/r2plus1d_18', help='directory to store features') 66 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 67 | help='how many frames to sampler per video') 68 | 69 | parser.add_argument("--video_path", dest='video_path', type=str, 70 | default='data/LLP_dataset/frame', help='path to video dataset') 71 | parser.add_argument("--model", dest="model", type=str, default='r2plus1d_18', 72 | help='the CNN model you want to use to extract_feats') 73 | 74 | args = parser.parse_args() 75 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 76 | params = vars(args) 77 | if params['model'] == 'r2plus1d_18': 78 | model = models.video.r2plus1d_18(pretrained=True) 79 | model = nn.Sequential(*list(model.children())[:-1]) 80 | for param in model.parameters(): 81 | 
param.requires_grad = False 82 | T, C, H, W = 8, 3, 112, 112 83 | load_img = utils.LoadTransformImage() 84 | 85 | else: 86 | print("doesn't support %s" % (params['model'])) 87 | 88 | model = nn.DataParallel(model) 89 | model = model.cuda() 90 | extract_feats(params, model, load_img) 91 | -------------------------------------------------------------------------------- /step1_train_base_model/scripts/extract_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import moviepy 3 | from moviepy.audio.AudioClip import AudioArrayClip 4 | from moviepy.editor import VideoFileClip 5 | 6 | video_pth = "data/LLP_dataset/video" 7 | sound_list = os.listdir(video_pth) 8 | save_pth = "data/LLP_dataset/audio" 9 | 10 | for audio_id in sound_list: 11 | name = os.path.join(video_pth, audio_id) 12 | audio_name = audio_id[:-4] + '.wav' 13 | exist_lis = os.listdir(save_pth) 14 | if audio_name in exist_lis: 15 | print("already exist!") 16 | continue 17 | try: 18 | video = VideoFileClip(name) 19 | audio = video.audio 20 | audio.write_audiofile(os.path.join(save_pth, audio_name), fps=16000) 21 | print("finish video id: " + audio_name) 22 | except: 23 | print("cannot load ", name) -------------------------------------------------------------------------------- /step1_train_base_model/scripts/extract_frames.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import os 4 | import argparse 5 | import glob 6 | 7 | def extract_frames(video, dst): 8 | command1 = 'ffmpeg ' 9 | command1 += '-i ' + video + " " 10 | command1 += '-y' + " " 11 | command1 += "-r " + "8 " 12 | command1 += '{0}/%06d.jpg'.format(dst) 13 | print(command1) 14 | # print command1 15 | os.system(command1) 16 | 17 | return 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--out_dir', dest='out_dir', type=str, default='data/LLP_dataset/frame') 22 | parser.add_argument('--video_path', dest='video_path', type=str, default='data/LLP_dataset/video') 23 | args = parser.parse_args() 24 | 25 | vid_list = os.listdir(args.video_path) 26 | 27 | for vid_id in vid_list: 28 | name = os.path.join(args.video_path, vid_id) 29 | dst = os.path.join(args.out_dir, vid_id[:-4]) 30 | print(dst) 31 | if not os.path.exists(dst): 32 | os.makedirs(dst) 33 | extract_frames(name, dst) 34 | print("finish video id: " + vid_id) 35 | -------------------------------------------------------------------------------- /step1_train_base_model/scripts/extract_rgb_feat.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | from tqdm import tqdm 5 | import numpy as np 6 | import os 7 | import argparse 8 | from PIL import Image 9 | import torch 10 | from torch import nn 11 | import torch.nn.functional as F 12 | import pretrainedmodels 13 | from pretrainedmodels import utils 14 | 15 | C, H, W = 3, 224, 224 16 | 17 | def extract_feats(params, model, load_image_fn): 18 | global C, H, W 19 | model.eval() 20 | dir_fc = os.path.join(os.getcwd(), params['output_dir']) 21 | if not os.path.isdir(dir_fc): 22 | os.mkdir(dir_fc) 23 | 24 | video_list = os.listdir(params['video_path']) 25 | nn = 0 26 | for video in video_list: 27 | 28 | nn = nn + 1 29 | dst = video 30 | 31 | image_list = sorted(glob.glob(os.path.join(params['video_path'], dst, '*.jpg'))) 32 | samples = np.round(np.linspace( 33 | 0, len(image_list) - 1, 
params['n_frame_steps'])) 34 | 35 | image_list = [image_list[int(sample)] for sample in samples] 36 | images = torch.zeros((len(image_list), C, H, W)) 37 | i = 0 38 | for iImg in range(len(image_list)): 39 | img = load_image_fn(image_list[iImg]) 40 | images[iImg] = img 41 | 42 | 43 | with torch.no_grad(): 44 | fc_feats = model(images.cuda()).squeeze() 45 | img_feats = fc_feats.cpu().numpy() 46 | #print(img_feats.shape) 47 | # Save the inception features 48 | outfile = os.path.join(dir_fc, video + '.npy') 49 | np.save(outfile, img_feats) 50 | # cleanup 51 | #shutil.rmtree(dst) 52 | print(nn) 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--gpu", dest='gpu', type=str, default='0', 58 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 59 | parser.add_argument("--output_dir", dest='output_dir', type=str, 60 | default='data/LLP_dataset/feats/res152', help='directory to store features') 61 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 62 | help='how many frames to sampler per video') 63 | 64 | parser.add_argument("--video_path", dest='video_path', type=str, 65 | default='data/LLP_dataset/frame', help='path to video dataset') 66 | parser.add_argument("--model", dest="model", type=str, default='resnet152', 67 | help='the CNN model you want to use to extract_feats') 68 | 69 | args = parser.parse_args() 70 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 71 | params = vars(args) 72 | if params['model'] == 'inception_v3': 73 | C, H, W = 3, 299, 299 74 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 75 | load_image_fn = utils.LoadTransformImage(model) 76 | 77 | elif params['model'] == 'resnet152': 78 | C, H, W = 3, 224, 224 79 | model = pretrainedmodels.resnet152(pretrained='imagenet') 80 | load_image_fn = utils.LoadTransformImage(model) 81 | elif params['model'] == 'vgg19_bn': 82 | C, H, W = 3, 224, 224 83 | model = pretrainedmodels.vgg19_bn(pretrained='imagenet') 84 | load_image_fn = utils.LoadTransformImage(model) 85 | elif params['model'] == 'inception_v4': 86 | C, H, W = 3, 299, 299 87 | model = pretrainedmodels.inceptionv4( 88 | num_classes=1000, pretrained='imagenet') 89 | load_image_fn = utils.LoadTransformImage(model) 90 | elif params['model'] == 'nasnetalarge': 91 | C, H, W = 3, 299, 299 92 | model = pretrainedmodels.inceptionv4( 93 | num_classes=1000, pretrained='imagenet') 94 | load_image_fn = utils.LoadTransformImage(model) 95 | 96 | else: 97 | print("doesn't support %s" % (params['model'])) 98 | 99 | model.last_linear = utils.Identity() 100 | model = nn.DataParallel(model) 101 | 102 | model = model.cuda() 103 | extract_feats(params, model, load_image_fn) 104 | -------------------------------------------------------------------------------- /step1_train_base_model/scripts/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | def crop(vid, i, j, h, w): 6 | return vid[..., i:(i + h), j:(j + w)] 7 | 8 | 9 | def center_crop(vid, output_size): 10 | h, w = vid.shape[-2:] 11 | th, tw = output_size 12 | 13 | i = int(round((h - th) / 2.)) 14 | j = int(round((w - tw) / 2.)) 15 | return crop(vid, i, j, th, tw) 16 | 17 | 18 | def hflip(vid): 19 | return vid.flip(dims=(-1,)) 20 | 21 | 22 | # NOTE: for those functions, which generally expect mini-batches, we keep them 23 | # as non-minibatch so that they are applied as if they were 4d (thus image). 
24 | # this way, we only apply the transformation in the spatial domain 25 | def resize(vid, size, interpolation='bilinear'): 26 | # NOTE: using bilinear interpolation because we don't work on minibatches 27 | # at this level 28 | scale = None 29 | if isinstance(size, int): 30 | scale = float(size) / min(vid.shape[-2:]) 31 | size = None 32 | return torch.nn.functional.interpolate( 33 | vid, size=size, scale_factor=scale, mode=interpolation, align_corners=False) 34 | 35 | 36 | def pad(vid, padding, fill=0, padding_mode="constant"): 37 | # NOTE: don't want to pad on temporal dimension, so let as non-batch 38 | # (4d) before padding. This works as expected 39 | return torch.nn.functional.pad(vid, padding, value=fill, mode=padding_mode) 40 | 41 | 42 | def to_normalized_float_tensor(vid): 43 | return vid.permute(3, 0, 1, 2).to(torch.float32) / 255 44 | 45 | 46 | def normalize(vid, mean, std): 47 | shape = (-1,) + (1,) * (vid.dim() - 1) 48 | mean = torch.as_tensor(mean).reshape(shape) 49 | std = torch.as_tensor(std).reshape(shape) 50 | return (vid - mean) / std 51 | 52 | 53 | # Class interface 54 | 55 | class RandomCrop(object): 56 | def __init__(self, size): 57 | self.size = size 58 | 59 | @staticmethod 60 | def get_params(vid, output_size): 61 | """Get parameters for ``crop`` for a random crop. 62 | """ 63 | h, w = vid.shape[-2:] 64 | th, tw = output_size 65 | if w == tw and h == th: 66 | return 0, 0, h, w 67 | i = random.randint(0, h - th) 68 | j = random.randint(0, w - tw) 69 | return i, j, th, tw 70 | 71 | def __call__(self, vid): 72 | i, j, h, w = self.get_params(vid, self.size) 73 | return crop(vid, i, j, h, w) 74 | 75 | 76 | class CenterCrop(object): 77 | def __init__(self, size): 78 | self.size = size 79 | 80 | def __call__(self, vid): 81 | return center_crop(vid, self.size) 82 | 83 | 84 | class Resize(object): 85 | def __init__(self, size): 86 | self.size = size 87 | 88 | def __call__(self, vid): 89 | return resize(vid, self.size) 90 | 91 | 92 | class ToFloatTensorInZeroOne(object): 93 | def __call__(self, vid): 94 | return to_normalized_float_tensor(vid) 95 | 96 | 97 | class Normalize(object): 98 | def __init__(self, mean, std): 99 | self.mean = mean 100 | self.std = std 101 | 102 | def __call__(self, vid): 103 | return normalize(vid, self.mean, self.std) 104 | 105 | 106 | class RandomHorizontalFlip(object): 107 | def __init__(self, p=0.5): 108 | self.p = p 109 | 110 | def __call__(self, vid): 111 | if random.random() < self.p: 112 | return hflip(vid) 113 | return vid 114 | 115 | 116 | class Pad(object): 117 | def __init__(self, padding, fill=0): 118 | self.padding = padding 119 | self.fill = fill 120 | 121 | def __call__(self, vid): 122 | return pad(vid, self.padding, self.fill) 123 | -------------------------------------------------------------------------------- /step1_train_base_model/scripts/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms as transforms 6 | from PIL import Image 7 | import numpy as np 8 | from munch import munchify 9 | 10 | class ToSpaceBGR(object): 11 | 12 | def __init__(self, is_bgr): 13 | self.is_bgr = is_bgr 14 | 15 | def __call__(self, tensor): 16 | if self.is_bgr: 17 | new_tensor = tensor.clone() 18 | new_tensor[0] = tensor[2] 19 | new_tensor[2] = tensor[0] 20 | tensor = new_tensor 21 | return tensor 22 | 23 | 24 | class ToRange255(object): 25 | 26 | def 
__init__(self, is_255): 27 | self.is_255 = is_255 28 | 29 | def __call__(self, tensor): 30 | if self.is_255: 31 | tensor.mul_(255) 32 | return tensor 33 | 34 | 35 | class TransformImage(object): 36 | 37 | def __init__(self): 38 | self.mean = [0.43216, 0.394666, 0.37645] 39 | self.std = [0.22803, 0.22145, 0.216989] 40 | tfs = [] 41 | tfs.append(transforms.Resize((112, 112))) 42 | tfs.append(transforms.ToTensor()) 43 | tfs.append(ToSpaceBGR('RGB'=='BGR')) 44 | tfs.append(ToRange255(max([0, 1])==255)) 45 | tfs.append(transforms.Normalize(mean=self.mean, std=self.std)) 46 | 47 | self.tf = transforms.Compose(tfs) 48 | 49 | def __call__(self, img): 50 | tensor = self.tf(img) 51 | return tensor 52 | 53 | 54 | class LoadImage(object): 55 | 56 | def __init__(self, space='RGB'): 57 | self.space = space 58 | 59 | def __call__(self, path_img): 60 | with open(path_img, 'rb') as f: 61 | with Image.open(f) as img: 62 | img = img.convert(self.space) 63 | return img 64 | 65 | 66 | class LoadTransformImage(object): 67 | 68 | def __init__(self): 69 | self.load = LoadImage() 70 | self.tf = TransformImage() 71 | 72 | def __call__(self, path_img): 73 | img = self.load(path_img) 74 | tensor = self.tf(img) 75 | return tensor 76 | 77 | 78 | class Identity(nn.Module): 79 | 80 | def __init__(self): 81 | super(Identity, self).__init__() 82 | 83 | def forward(self, x): 84 | return x -------------------------------------------------------------------------------- /step1_train_base_model/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step1_train_base_model/utils/__init__.py -------------------------------------------------------------------------------- /step1_train_base_model/utils/eval_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def Precision(X_pre, X_gt): 4 | 5 | N = len(X_pre) 6 | p = 0.0 7 | for i in range(N): 8 | x = X_pre[i, :] 9 | y = X_gt[i, :] 10 | p += np.sum(x*y)/np.sum(x) 11 | return p/N 12 | 13 | 14 | def Recall(X_pre, X_gt): 15 | N = len(X_pre) 16 | p = 0.0 17 | for i in range(N): 18 | x = X_pre[i, :] 19 | y = X_gt[i, :] 20 | p += np.sum(x * y) / np.sum(y) 21 | return p/N 22 | 23 | 24 | def F1(X_pre, X_gt): 25 | N = len(X_pre) 26 | p = 0 27 | for i in range(N): 28 | x = X_pre[i, :] 29 | y = X_gt[i, :] 30 | p += 2*np.sum(x * y) / (np.sum(x) + np.sum(y)) 31 | return p/N 32 | 33 | def event_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av): 34 | # extract events 35 | N = 25 36 | event_p_a = [None for n in range(25)] 37 | event_gt_a = [None for n in range(25)] 38 | event_p_v = [None for n in range(25)] 39 | event_gt_v = [None for n in range(25)] 40 | event_p_av = [None for n in range(25)] 41 | event_gt_av = [None for n in range(25)] 42 | 43 | TP_a = np.zeros(25) 44 | TP_v = np.zeros(25) 45 | TP_av = np.zeros(25) 46 | 47 | FP_a = np.zeros(25) 48 | FP_v = np.zeros(25) 49 | FP_av = np.zeros(25) 50 | 51 | FN_a = np.zeros(25) 52 | FN_v = np.zeros(25) 53 | FN_av = np.zeros(25) 54 | 55 | for n in range(N): 56 | seq_pred = SO_a[n, :] 57 | if np.sum(seq_pred)!=0: 58 | x = extract_event(seq_pred, n) 59 | event_p_a[n] = x 60 | seq_gt = GT_a[n, :] 61 | if np.sum(seq_gt)!=0: 62 | x = extract_event(seq_gt, n) 63 | event_gt_a[n] = x 64 | 65 | seq_pred = SO_v[n, :] 66 | if np.sum(seq_pred) != 0: 67 | x = extract_event(seq_pred, n) 68 | event_p_v[n] = x 69 | seq_gt = GT_v[n, :] 70 
| if np.sum(seq_gt) != 0: 71 | x = extract_event(seq_gt, n) 72 | event_gt_v[n] = x 73 | 74 | seq_pred = SO_av[n, :] 75 | if np.sum(seq_pred) != 0: 76 | x = extract_event(seq_pred, n) 77 | event_p_av[n] = x 78 | 79 | seq_gt = GT_av[n, :] 80 | if np.sum(seq_gt) != 0: 81 | x = extract_event(seq_gt, n) 82 | event_gt_av[n] = x 83 | 84 | tp, fp, fn = event_wise_metric(event_p_a[n], event_gt_a[n]) 85 | TP_a[n] += tp 86 | FP_a[n] += fp 87 | FN_a[n] += fn 88 | 89 | tp, fp, fn = event_wise_metric(event_p_v[n], event_gt_v[n]) 90 | TP_v[n] += tp 91 | FP_v[n] += fp 92 | FN_v[n] += fn 93 | 94 | tp, fp, fn = event_wise_metric(event_p_av[n], event_gt_av[n]) 95 | TP_av[n] += tp 96 | FP_av[n] += fp 97 | FN_av[n] += fn 98 | 99 | TP = TP_a + TP_v 100 | FN = FN_a + FN_v 101 | FP = FP_a + FP_v 102 | 103 | n = len(FP_a) 104 | F_a = [] 105 | for ii in range(n): 106 | if (TP_a + FP_a)[ii] != 0 or (TP_a + FN_a)[ii] != 0: 107 | F_a.append(2 * TP_a[ii] / (2 * TP_a[ii] + (FN_a + FP_a)[ii])) 108 | 109 | F_v = [] 110 | for ii in range(n): 111 | if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0: 112 | F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii])) 113 | 114 | F = [] 115 | for ii in range(n): 116 | if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0: 117 | F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii])) 118 | 119 | F_av = [] 120 | for ii in range(n): 121 | if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0: 122 | F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii])) 123 | 124 | if len(F_a) == 0: 125 | f_a = 1.0 # all true negatives 126 | else: 127 | f_a = (sum(F_a)/len(F_a)) 128 | 129 | if len(F_v) == 0: 130 | f_v = 1.0 # all true negatives 131 | else: 132 | f_v = (sum(F_v)/len(F_v)) 133 | 134 | if len(F) == 0: 135 | f = 1.0 # all true negatives 136 | else: 137 | f = (sum(F)/len(F)) 138 | if len(F_av) == 0: 139 | f_av = 1.0 # all true negatives 140 | else: 141 | f_av = (sum(F_av)/len(F_av)) 142 | 143 | return f_a, f_v, f, f_av 144 | 145 | 146 | def segment_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av): 147 | # compute F scores 148 | TP_a = np.sum(SO_a * GT_a, axis=1) 149 | FN_a = np.sum((1-SO_a)*GT_a, axis = 1) 150 | FP_a = np.sum(SO_a*(1-GT_a),axis=1) 151 | 152 | n = len(FP_a) 153 | F_a = [] 154 | for ii in range(n): 155 | if (TP_a+FP_a)[ii]!= 0 or (TP_a+FN_a)[ii]!= 0: 156 | F_a.append(2*TP_a[ii] / (2*TP_a[ii] + (FN_a + FP_a)[ii])) 157 | 158 | TP_v = np.sum(SO_v * GT_v, axis=1) 159 | FN_v = np.sum((1 - SO_v) * GT_v, axis=1) 160 | FP_v = np.sum(SO_v * (1 - GT_v), axis=1) 161 | F_v = [] 162 | for ii in range(n): 163 | if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0: 164 | F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii])) 165 | 166 | TP = TP_a + TP_v 167 | FN = FN_a + FN_v 168 | FP = FP_a + FP_v 169 | 170 | n = len(FP) 171 | 172 | F = [] 173 | for ii in range(n): 174 | if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0: 175 | F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii])) 176 | 177 | TP_av = np.sum(SO_av * GT_av, axis=1) 178 | FN_av = np.sum((1 - SO_av) * GT_av, axis=1) 179 | FP_av = np.sum(SO_av * (1 - GT_av), axis=1) 180 | n = len(FP_av) 181 | F_av = [] 182 | for ii in range(n): 183 | if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0: 184 | F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii])) 185 | 186 | 187 | if len(F_a) == 0: 188 | f_a = 1.0 # all true negatives 189 | else: 190 | f_a = (sum(F_a)/len(F_a)) 191 | 192 | if len(F_v) == 0: 193 | f_v = 1.0 # all true negatives 194 | else: 195 | f_v = (sum(F_v)/len(F_v)) 196 | 197 | if len(F) == 0: 198 | f = 1.0 # 
all true negatives 199 | else: 200 | f = (sum(F)/len(F)) 201 | if len(F_av) == 0: 202 | f_av = 1.0 # all true negatives 203 | else: 204 | f_av = (sum(F_av)/len(F_av)) 205 | 206 | return f_a, f_v, f, f_av 207 | 208 | 209 | def to_vec(start, end): 210 | x = np.zeros(10) 211 | for i in range(start, end): 212 | x[i] = 1 213 | return x 214 | 215 | def extract_event(seq, n): 216 | x = [] 217 | i = 0 218 | while i < 10: 219 | if seq[i] == 1: 220 | start = i 221 | if i + 1 == 10: 222 | i = i + 1 223 | end = i 224 | x.append(to_vec(start, end)) 225 | break 226 | 227 | for j in range(i + 1, 10): 228 | if seq[j] != 1: 229 | i = j + 1 230 | end = j 231 | x.append(to_vec(start, end)) 232 | break 233 | else: 234 | i = j + 1 235 | if i == 10: 236 | end = i 237 | x.append(to_vec(start, end)) 238 | break 239 | else: 240 | i += 1 241 | return x 242 | 243 | def event_wise_metric(event_p, event_gt): 244 | TP = 0 245 | FP = 0 246 | FN = 0 247 | 248 | if event_p is not None: 249 | num_event = len(event_p) 250 | for i in range(num_event): 251 | x1 = event_p[i] 252 | if event_gt is not None: 253 | nn = len(event_gt) 254 | flag = True 255 | for j in range(nn): 256 | x2 = event_gt[j] 257 | if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2): #0.5 258 | TP += 1 259 | flag = False 260 | break 261 | if flag: 262 | FP += 1 263 | else: 264 | FP += 1 265 | 266 | if event_gt is not None: 267 | num_event = len(event_gt) 268 | for i in range(num_event): 269 | x1 = event_gt[i] 270 | if event_p is not None: 271 | nn = len(event_p) 272 | flag = True 273 | for j in range(nn): 274 | x2 = event_p[j] 275 | if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2): #0.5 276 | flag = False 277 | break 278 | if flag: 279 | FN += 1 280 | else: 281 | FN += 1 282 | return TP, FP, FN 283 | -------------------------------------------------------------------------------- /step2_find_exchange/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step2_find_exchange/.gitkeep -------------------------------------------------------------------------------- /step2_find_exchange/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step2_find_exchange/data/.DS_Store -------------------------------------------------------------------------------- /step2_find_exchange/dataloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import os 4 | from torch.utils.data import Dataset, DataLoader 5 | from torchvision import transforms, utils 6 | import pandas as pd 7 | import random 8 | 9 | 10 | 11 | def ids_to_multinomial(ids): 12 | """ label encoding 13 | 14 | Returns: 15 | 1d array, multimonial representation, e.g. [1,0,1,0,0,...] 
16 | """ 17 | categories = ['Speech', 'Car', 'Cheering', 'Dog', 'Cat', 'Frying_(food)', 18 | 'Basketball_bounce', 'Fire_alarm', 'Chainsaw', 'Cello', 'Banjo', 19 | 'Singing', 'Chicken_rooster', 'Violin_fiddle', 'Vacuum_cleaner', 20 | 'Baby_laughter', 'Accordion', 'Lawn_mower', 'Motorcycle', 'Helicopter', 21 | 'Acoustic_guitar', 'Telephone_bell_ringing', 'Baby_cry_infant_cry', 'Blender', 22 | 'Clapping'] 23 | id_to_idx = {id: index for index, id in enumerate(categories)} 24 | 25 | y = np.zeros(len(categories)) 26 | for id in ids: 27 | index = id_to_idx[id] 28 | y[index] = 1 29 | return y 30 | 31 | 32 | 33 | class LLP_dataset(Dataset): 34 | 35 | def __init__(self, label, audio_dir, video_dir, st_dir, train=None, transform=None): 36 | self.df = pd.read_csv(label, header=0, sep='\t') 37 | self.filenames = self.df["filename"] 38 | self.audio_dir = audio_dir 39 | self.video_dir = video_dir 40 | self.st_dir = st_dir 41 | self.transform = transform 42 | 43 | self.train = train 44 | 45 | labels_to_idx = {} 46 | for i in range(25): 47 | labels_to_idx[i] = [] 48 | 49 | for idx in range(len(self.filenames)): 50 | row = self.df.loc[idx, :] 51 | ids = row[-1].split(',') 52 | label = ids_to_multinomial(ids) 53 | 54 | if len(ids)==1: 55 | for c in range(25): 56 | if label[c] == 1: 57 | labels_to_idx[c].append(idx) 58 | 59 | self.labels_to_idx = labels_to_idx 60 | 61 | 62 | 63 | def __len__(self): 64 | return len(self.filenames) 65 | 66 | def __getitem__(self, idx): 67 | row = self.df.loc[idx, :] 68 | name = row[0][:11] 69 | audio = np.load(os.path.join(self.audio_dir, name + '.npy')) 70 | video_s = np.load(os.path.join(self.video_dir, name + '.npy')) 71 | video_st = np.load(os.path.join(self.st_dir, name + '.npy')) 72 | ids = row[-1].split(',') 73 | label = ids_to_multinomial(ids) 74 | 75 | real = 1 76 | audio2 = np.array(1) 77 | if self.train: 78 | while True: 79 | idx2 = random.randint(0, len(self.filenames)-1) 80 | row = self.df.loc[idx2, :] 81 | name = row[0][:11] 82 | ids = row[-1].split(',') 83 | label2 = ids_to_multinomial(ids) 84 | intersection = np.logical_and(label, label2) 85 | intersection = intersection.astype(int).sum() 86 | if intersection == 0: 87 | break 88 | 89 | row = self.df.loc[idx2, :] 90 | name = row[0][:11] 91 | audio2 = np.load(os.path.join(self.audio_dir, name + '.npy')) 92 | video_s2 = np.load(os.path.join(self.video_dir, name + '.npy')) 93 | video_st2 = np.load(os.path.join(self.st_dir, name + '.npy')) 94 | real = 0 95 | ids = row[-1].split(',') 96 | label2 = ids_to_multinomial(ids) 97 | 98 | real = np.array(real) 99 | sample = {'audio': audio, 'video_s': video_s, 'video_st': video_st, 'label': label, 'audio2':audio2, 'data_idx':np.array(idx), 'label2':label2, 'video_s2': video_s2, 'video_st2': video_st2} 100 | 101 | if self.transform: 102 | sample = self.transform(sample) 103 | 104 | return sample 105 | 106 | class ToTensor(object): 107 | 108 | def __call__(self, sample): 109 | if len(sample) == 2: 110 | audio = sample['audio'] 111 | label = sample['label'] 112 | return {'audio': torch.from_numpy(audio), 'label': torch.from_numpy(label)} 113 | else: 114 | audio = sample['audio'] 115 | video_s = sample['video_s'] 116 | video_st = sample['video_st'] 117 | label = sample['label'] 118 | label2 = sample['label2'] 119 | video_s2 = sample['video_s2'] 120 | video_st2 = sample['video_st2'] 121 | return {'audio': torch.from_numpy(audio), 'video_s': torch.from_numpy(video_s), 122 | 'video_st': torch.from_numpy(video_st), 123 | 'label': torch.from_numpy(label), 
'audio2':torch.from_numpy(sample['audio2']), 'data_idx':torch.from_numpy(sample['data_idx']), 'label2':torch.from_numpy(label2), 'video_s2': torch.from_numpy(video_s2), 'video_st2': torch.from_numpy(video_st2),} 124 | -------------------------------------------------------------------------------- /step2_find_exchange/main_avvp.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from dataloader import * 7 | from nets.net_audiovisual import MMIL_Net 8 | from utils.eval_metrics import segment_level, event_level 9 | import pandas as pd 10 | import pickle as pkl 11 | 12 | 13 | def get_modality_aware_label(args, model, train_loader, optimizer, criterion, epoch): 14 | model.eval() 15 | v_accs = [] 16 | a_accs = [] 17 | num = 0 18 | 19 | das = [] 20 | for batch_idx, sample in enumerate(train_loader): 21 | audio, video, video_st, target = sample['audio'].to('cuda'), sample['video_s'].to('cuda'), sample['video_st'].to('cuda'), sample['label'].type(torch.FloatTensor).to('cuda') 22 | audio2 = sample['audio2'].to('cuda') 23 | label2 = sample['label2'] 24 | video2 = sample['video_s2'].to('cuda') 25 | video_st2 = sample['video_st2'].to('cuda') 26 | data_idx = sample['data_idx'] 27 | optimizer.zero_grad() 28 | output, a_prob, v_prob, _, sims, mask = model(audio, video, video_st, audio2) 29 | output2, a_prob2, v_prob2, _, _, _ = model(audio, video2, video_st2, audio2) 30 | output3, a_prob3, v_prob3, _, _, _ = model(audio2, video, video_st, audio2) 31 | 32 | 33 | a_v = a_prob2 34 | v_v = v_prob2 35 | 36 | a_a = a_prob3 37 | v_a = v_prob3 38 | 39 | da = { 40 | 'a': a_prob.cpu().detach(), 41 | 'v': v_prob.cpu().detach(), 42 | 'a_v':a_v.cpu().detach(), 43 | 'v_v': v_v.cpu().detach(), 44 | 'a_a': a_a.cpu().detach(), 45 | 'v_a':v_a.cpu().detach(), 46 | 'label':target.cpu(), 'label2':label2, 'idx': data_idx} 47 | das.append(da) 48 | output.clamp_(min=1e-7, max=1 - 1e-7) 49 | a_prob.clamp_(min=1e-7, max=1 - 1e-7) 50 | v_prob.clamp_(min=1e-7, max=1 - 1e-7) 51 | 52 | data_idx = sample['data_idx'] 53 | b=audio.size(0) 54 | 55 | acc = (torch.argmax(sims[0], dim=-1) == mask).float().mean().item() 56 | 57 | v_acc = (v_prob>0.5) == target 58 | a_acc = (a_prob>0.5) == target 59 | 60 | v_acc = v_acc.cpu().float() 61 | a_acc = a_acc.cpu().float() 62 | v_accs.append(v_acc) 63 | a_accs.append(a_acc) 64 | 65 | if batch_idx % args.log_interval == 0: 66 | print('Estimate Epoch: {} [{}/{} ({:.0f}%)]'.format( 67 | epoch, batch_idx * len(audio), len(train_loader.dataset), 68 | 100. 
* batch_idx / len(train_loader))) 69 | v_accs = torch.cat(v_accs, dim=0).mean(0) 70 | a_accs = torch.cat(a_accs, dim=0).mean(0) 71 | 72 | 73 | estimate_label(das, v_accs, a_accs) 74 | 75 | 76 | 77 | def estimate_label(datas, v_accs, a_accs): 78 | 79 | v_err = 1 - torch.Tensor(v_accs) 80 | a_err = 1 - torch.Tensor(a_accs) 81 | 82 | v_class = v_err / torch.max(v_err) 83 | a_class = a_err / torch.max(a_err) 84 | 85 | need_to_change_v = [[] for _ in range(25) ] 86 | need_to_change_a = [[] for _ in range(25) ] 87 | total_a = 0 88 | total_v = 0 89 | changed_a = 0 90 | changed_v = 0 91 | for data in datas: 92 | a = data['a'] 93 | v = data['v'] 94 | a_v = data['a_v'] 95 | v_v = data['v_v'] 96 | a_a = data['a_a'] 97 | v_a = data['v_a'] 98 | 99 | label = data['label'] 100 | idx = data['idx'] 101 | 102 | 103 | a = a * label 104 | v = v * label 105 | a_v = a_v * label 106 | v_v = v_v * label 107 | a_a = a_a * label 108 | v_a = v_a * label 109 | 110 | for b in range(len(a)): 111 | for c in range(25): 112 | if label[b][c] != 0: 113 | if v_a[b][c]/v_class[c] < 0.5: 114 | if a_a[b][c]/ v_class[c] < 0.5: 115 | # visual is not correct, given original visual data is input. 116 | need_to_change_v[c].append(idx[b]) 117 | 118 | if a_v[b][c]/a_class[c] < 0.5: 119 | if v_v[b][c] /a_class[c] < 0.5: 120 | need_to_change_a[c].append(idx[b]) 121 | 122 | 123 | with open("need_to_change.pkl", 'wb') as f: 124 | pkl.dump([need_to_change_v, need_to_change_a], f) 125 | 126 | 127 | def main(): 128 | 129 | # Training settings 130 | parser = argparse.ArgumentParser(description='PyTorch Implementation of Audio-Visual Video Parsing') 131 | parser.add_argument( 132 | "--audio_dir", type=str, default='data/feats/vggish/', help="audio dir") 133 | parser.add_argument( 134 | "--video_dir", type=str, default='data/feats/res152/', 135 | help="video dir") 136 | parser.add_argument( 137 | "--st_dir", type=str, default='data/feats/r2plus1d_18/', 138 | help="video dir") 139 | parser.add_argument( 140 | "--label_train", type=str, default="data/AVVP_train.csv", help="weak train csv file") 141 | parser.add_argument( 142 | "--label_val", type=str, default="data/AVVP_val_pd.csv", help="weak val csv file") 143 | parser.add_argument( 144 | "--label_test", type=str, default="data/AVVP_test_pd.csv", help="weak test csv file") 145 | parser.add_argument('--batch-size', type=int, default=16, metavar='N', 146 | help='input batch size for training (default: 16)') 147 | parser.add_argument('--epochs', type=int, default=40, metavar='N', 148 | help='number of epochs to train (default: 60)') 149 | parser.add_argument('--lr', type=float, default=3e-4, metavar='LR', 150 | help='learning rate (default: 3e-4)') 151 | parser.add_argument( 152 | "--model", type=str, default='MMIL_Net', help="with model to use") 153 | parser.add_argument( 154 | "--mode", type=str, default='train', help="with mode to use") 155 | parser.add_argument('--seed', type=int, default=2, metavar='S', 156 | help='random seed') 157 | parser.add_argument('--log-interval', type=int, default=50, metavar='N', 158 | help='how many batches to wait before logging training status') 159 | parser.add_argument( 160 | "--model_save_dir", type=str, default='models/', help="model save dir") 161 | parser.add_argument( 162 | "--checkpoint", type=str, default='MMIL_Net', 163 | help="save model name") 164 | parser.add_argument( 165 | '--gpu', type=str, default='0', help='gpu device number') 166 | args = parser.parse_args() 167 | 168 | #os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 169 | 
torch.manual_seed(args.seed) 170 | 171 | if args.model == 'MMIL_Net': 172 | model = MMIL_Net().to('cuda') 173 | else: 174 | raise ('not recognized') 175 | 176 | if args.mode == 'estimate_labels': 177 | train_dataset = LLP_dataset(train=True, label=args.label_train, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ 178 | ToTensor()])) 179 | val_dataset = LLP_dataset(label=args.label_val, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ 180 | ToTensor()])) 181 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=12, pin_memory = True) 182 | val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory = True) 183 | 184 | optimizer = optim.Adam(model.parameters(), lr=args.lr) 185 | scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) 186 | criterion = nn.BCELoss() 187 | best_F = 0 188 | test_dataset = LLP_dataset(train=False, label=args.label_test, audio_dir=args.audio_dir, video_dir=args.video_dir, 189 | st_dir=args.st_dir, transform=transforms.Compose([ 190 | ToTensor()])) 191 | test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True) 192 | 193 | 194 | model.load_state_dict(torch.load(args.model_save_dir + args.checkpoint + ".pt")) 195 | get_modality_aware_label(args, model, train_loader, optimizer, criterion, epoch=1) 196 | 197 | if __name__ == '__main__': 198 | main() 199 | -------------------------------------------------------------------------------- /step2_find_exchange/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step2_find_exchange/nets/__init__.py -------------------------------------------------------------------------------- /step2_find_exchange/nets/models_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.autograd as autograd 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.autograd import Variable 7 | from torch.nn import init 8 | 9 | import math 10 | import numpy as np 11 | 12 | 13 | 14 | 15 | 16 | class ScaledDotProductAttention(nn.Module): 17 | ''' Scaled Dot-Product Attention ''' 18 | 19 | def __init__(self, temperature, attn_dropout=0.1): 20 | super().__init__() 21 | self.temperature = temperature 22 | self.dropout = nn.Dropout(attn_dropout) 23 | self.softmax = nn.Softmax(dim=2) 24 | 25 | def forward(self, q, k, v): 26 | 27 | attn = torch.bmm(q, k.transpose(1, 2)) 28 | attn = attn / self.temperature 29 | 30 | attn = self.softmax(attn) 31 | attn = self.dropout(attn) 32 | output = torch.bmm(attn, v) 33 | 34 | return output, attn 35 | 36 | 37 | class LayerNorm(nn.Module): 38 | 39 | def __init__(self, features, eps=1e-6): 40 | super().__init__() 41 | self.gamma = nn.Parameter(torch.ones(features)) 42 | self.beta = nn.Parameter(torch.zeros(features)) 43 | self.eps = eps 44 | 45 | def forward(self, x): 46 | mean = x.mean(-1, keepdim=True) 47 | std = x.std(-1, keepdim=True) 48 | return self.gamma * (x - mean) / (std + self.eps) + self.beta 49 | 50 | 51 | class PositionalEncoding2(nn.Module): 52 | "Implement the PE function." 
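    # Standard sinusoidal positional encoding (as in "Attention Is All You Need"):
    #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
    # The table is precomputed once for max_len positions, registered as a buffer,
    # and added to the input sequence in forward() before dropout is applied.
    # Note that __init__ below calls super(PositionalEncoding, self).__init__() while
    # the class is named PositionalEncoding2, so instantiating it as written raises a
    # NameError; the class is imported but not used by MMIL_Net in net_audiovisual.py.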
53 | def __init__(self, d_model, dropout, max_len=500): 54 | super(PositionalEncoding, self).__init__() 55 | self.dropout = nn.Dropout(p=dropout) 56 | 57 | # Compute the positional encodings once in log space. 58 | pe = torch.zeros(max_len, d_model) 59 | position = torch.arange(0., max_len).unsqueeze(1) 60 | div_term = torch.exp(torch.arange(0., d_model, 2) * 61 | -(math.log(10000.0) / d_model)) 62 | pe[:, 0::2] = torch.sin(position * div_term) 63 | pe[:, 1::2] = torch.cos(position * div_term) 64 | pe = pe.unsqueeze(0) 65 | self.register_buffer('pe', pe) 66 | 67 | def forward(self, x): 68 | with torch.no_grad(): 69 | x1 = self.pe[:, :x.size(1)] 70 | x = x + x1 71 | #x = x + Variable(self.pe[:, :x.size(1)], 72 | # requires_grad=False) 73 | return self.dropout(x) 74 | 75 | class MultiHeadAttention2(nn.Module): 76 | ''' Multi-Head Attention module ''' 77 | 78 | def __init__(self, d_model, d_k, d_v, n_head=1, dropout=0.5): 79 | super().__init__() 80 | 81 | self.n_head = n_head 82 | self.d_k = d_k 83 | self.d_v = d_v 84 | 85 | self.w_qs = nn.Linear(d_model, n_head * d_k) 86 | self.w_ks = nn.Linear(d_model, n_head * d_k) 87 | self.w_vs = nn.Linear(d_model, n_head * d_v) 88 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 89 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 90 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) 91 | 92 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 93 | self.layer_norm = LayerNorm(d_model) 94 | 95 | self.fc = nn.Linear(n_head * d_v, d_model) 96 | nn.init.xavier_normal_(self.fc.weight) 97 | 98 | self.dropout = nn.Dropout(dropout) 99 | 100 | 101 | def forward(self, q, k, v): 102 | 103 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 104 | 105 | sz_b, len_q, _ = q.size() 106 | sz_b, len_k, _ = k.size() 107 | sz_b, len_v, _ = v.size() 108 | 109 | residual = q 110 | 111 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 112 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 113 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 114 | 115 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 116 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 117 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 118 | 119 | output, attn = self.attention(q, k, v) 120 | 121 | output = output.view(n_head, sz_b, len_q, d_v) 122 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) 123 | 124 | output = self.dropout(self.fc(output)) 125 | output = self.layer_norm(output + residual) 126 | 127 | return output, attn 128 | 129 | 130 | 131 | 132 | 133 | 134 | class ContrastiveLoss(torch.nn.Module): 135 | """ 136 | Contrastive loss function. 
137 | Based on: 138 | """ 139 | def __init__(self, margin=2.0): 140 | super(ContrastiveLoss, self).__init__() 141 | self.margin = margin 142 | 143 | def forward(self, dist, y): 144 | # euclidian distance 145 | dist_sq = torch.pow(dist, 2) 146 | dist = torch.clamp(self.margin - dist, min=0.0) 147 | 148 | 149 | assert len(y.data.shape) == 2, y.data.shape 150 | bs, time = y.data.shape 151 | y = y.view(-1) 152 | 153 | loss = y * dist_sq + (1 - y) * torch.pow(dist, 2) 154 | loss = torch.mean(loss) 155 | return loss 156 | 157 | 158 | 159 | 160 | 161 | class BinaryFocalLoss(nn.Module): 162 | """ 163 | This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 164 | 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' 165 | Focal_Loss= -1*alpha*(1-pt)*log(pt) 166 | :param num_class: 167 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 168 | :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more 169 | focus on hard misclassified example 170 | :param reduction: `none`|`mean`|`sum` 171 | :param **kwargs 172 | balance_index: (int) balance class index, should be specific when alpha is float 173 | """ 174 | 175 | def __init__(self, alpha=[1.0, 1.0], gamma=2, ignore_index=None, reduction='mean'): 176 | super(BinaryFocalLoss, self).__init__() 177 | if alpha is None: 178 | alpha = [0.25, 0.75] 179 | self.alpha = alpha 180 | self.gamma = gamma 181 | self.smooth = 1e-6 182 | self.ignore_index = ignore_index 183 | self.reduction = reduction 184 | 185 | assert self.reduction in ['none', 'mean', 'sum'] 186 | 187 | if self.alpha is None: 188 | self.alpha = torch.ones(2) 189 | elif isinstance(self.alpha, (list, np.ndarray)): 190 | self.alpha = np.asarray(self.alpha) 191 | self.alpha = np.reshape(self.alpha, (2)) 192 | assert self.alpha.shape[0] == 2, \ 193 | 'the `alpha` shape is not match the number of class' 194 | elif isinstance(self.alpha, (float, int)): 195 | self.alpha = np.asarray([self.alpha, 1.0 - self.alpha], dtype=np.float).view(2) 196 | 197 | else: 198 | raise TypeError('{} not supported'.format(type(self.alpha))) 199 | 200 | def forward(self, output, target): 201 | prob = torch.sigmoid(output) 202 | prob = torch.clamp(prob, self.smooth, 1.0 - self.smooth) 203 | 204 | pos_mask = (target == 1).float() 205 | neg_mask = (target == 0).float() 206 | 207 | pos_loss = -self.alpha[0] * torch.pow(torch.sub(1.0, prob), self.gamma) * torch.log(prob) * pos_mask 208 | neg_loss = -self.alpha[1] * torch.pow(prob, self.gamma) * \ 209 | torch.log(torch.sub(1.0, prob)) * neg_mask 210 | 211 | neg_loss = neg_loss.sum() 212 | pos_loss = pos_loss.sum() 213 | num_pos = pos_mask.view(pos_mask.size(0), -1).sum() 214 | num_neg = neg_mask.view(neg_mask.size(0), -1).sum() 215 | 216 | if num_pos == 0: 217 | loss = neg_loss 218 | else: 219 | loss = pos_loss / num_pos + neg_loss / num_neg 220 | return loss 221 | 222 | 223 | 224 | class FocalLoss_Ori(nn.Module): 225 | """ 226 | This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 227 | 'Focal Loss for Dense Object Detection. 
(https://arxiv.org/abs/1708.02002)' 228 | Focal_Loss= -1*alpha*(1-pt)*log(pt) 229 | :param num_class: 230 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 231 | :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more 232 | focus on hard misclassified example 233 | :param smooth: (float,double) smooth value when cross entropy 234 | :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch. 235 | """ 236 | 237 | def __init__(self, num_class, alpha=[0.25,0.75], gamma=2, balance_index=-1, size_average=True): 238 | super(FocalLoss_Ori, self).__init__() 239 | self.num_class = num_class 240 | self.alpha = alpha 241 | self.gamma = gamma 242 | self.size_average = size_average 243 | self.eps = 1e-6 244 | 245 | 246 | def forward(self, logit, target): 247 | 248 | if logit.dim() > 2: 249 | # N,C,d1,d2 -> N,C,m (m=d1*d2*...) 250 | logit = logit.view(logit.size(0), logit.size(1), -1) 251 | logit = logit.transpose(1, 2).contiguous() # [N,C,d1*d2..] -> [N,d1*d2..,C] 252 | logit = logit.view(-1, logit.size(-1)) # [N,d1*d2..,C]-> [N*d1*d2..,C] 253 | target = target.view(-1, 1) # [N,d1,d2,...]->[N*d1*d2*...,1] 254 | 255 | # -----------legacy way------------ 256 | # idx = target.cpu().long() 257 | # one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() 258 | # one_hot_key = one_hot_key.scatter_(1, idx, 1) 259 | # if one_hot_key.device != logit.device: 260 | # one_hot_key = one_hot_key.to(logit.device) 261 | # pt = (one_hot_key * logit).sum(1) + epsilon 262 | 263 | # ----------memory saving way-------- 264 | pt = logit.gather(1, target).view(-1) + self.eps # avoid apply 265 | logpt = pt.log() 266 | 267 | loss = -1 * torch.pow(torch.add(0.5, pt), self.gamma) * logpt 268 | 269 | if self.size_average: 270 | loss = loss.mean() 271 | else: 272 | loss = loss.sum() 273 | return loss 274 | 275 | 276 | -------------------------------------------------------------------------------- /step2_find_exchange/nets/net_audiovisual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy 6 | import copy 7 | import math 8 | from .models_utils import MultiHeadAttention2, PositionalEncoding2 9 | 10 | def _get_clones(module, N): 11 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 12 | 13 | class Encoder(nn.Module): 14 | 15 | def __init__(self, encoder_layer, num_layers, norm=None): 16 | super(Encoder, self).__init__() 17 | self.layers = _get_clones(encoder_layer, num_layers) 18 | self.num_layers = num_layers 19 | self.norm1 = nn.LayerNorm(512) 20 | self.norm2 = nn.LayerNorm(512) 21 | self.norm = norm 22 | 23 | def forward(self, src_a, src_v, mask=None, src_key_padding_mask=None): 24 | output_a = src_a 25 | output_v = src_v 26 | 27 | for i in range(self.num_layers): 28 | output_a = self.layers[i](src_a, src_v, src_mask=mask, 29 | src_key_padding_mask=src_key_padding_mask) 30 | output_v = self.layers[i](src_v, src_a, src_mask=mask, 31 | src_key_padding_mask=src_key_padding_mask) 32 | 33 | if self.norm: 34 | output_a = self.norm1(output_a) 35 | output_v = self.norm2(output_v) 36 | 37 | return output_a, output_v 38 | 39 | class HANLayer(nn.Module): 40 | 41 | def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1): 42 | super(HANLayer, self).__init__() 43 | self.self_attn = 
nn.MultiheadAttention(d_model, nhead, dropout=dropout) 44 | self.cm_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 45 | 46 | # Implementation of Feedforward model 47 | self.linear1 = nn.Linear(d_model, dim_feedforward) 48 | self.dropout = nn.Dropout(dropout) 49 | self.linear2 = nn.Linear(dim_feedforward, d_model) 50 | 51 | self.norm1 = nn.LayerNorm(d_model) 52 | self.norm2 = nn.LayerNorm(d_model) 53 | self.dropout11 = nn.Dropout(dropout) 54 | self.dropout12 = nn.Dropout(dropout) 55 | self.dropout2 = nn.Dropout(dropout) 56 | 57 | self.activation = nn.ReLU() 58 | 59 | def forward(self, src_q, src_v, src_mask=None, src_key_padding_mask=None): 60 | """Pass the input through the encoder layer. 61 | 62 | Args: 63 | src: the sequnce to the encoder layer (required). 64 | src_mask: the mask for the src sequence (optional). 65 | src_key_padding_mask: the mask for the src keys per batch (optional). 66 | 67 | Shape: 68 | see the docs in Transformer class. 69 | """ 70 | src_q = src_q.permute(1, 0, 2) 71 | src_v = src_v.permute(1, 0, 2) 72 | 73 | src1 = self.cm_attn(src_q, src_v, src_v, attn_mask=src_mask, 74 | key_padding_mask=src_key_padding_mask)[0] 75 | src2 = self.self_attn(src_q, src_q, src_q, attn_mask=src_mask, 76 | key_padding_mask=src_key_padding_mask)[0] 77 | 78 | src_q = src_q + self.dropout11(src1) + self.dropout12(src2) 79 | src_q = self.norm1(src_q) 80 | 81 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q)))) 82 | src_q = src_q + self.dropout2(src2) 83 | src_q = self.norm2(src_q) 84 | return src_q.permute(1, 0, 2) 85 | 86 | 87 | 88 | class MMIL_Net(nn.Module): 89 | 90 | def __init__(self): 91 | super(MMIL_Net, self).__init__() 92 | 93 | self.fc_prob = nn.Linear(512, 25) 94 | self.fc_frame_att = nn.Linear(512, 25) 95 | self.fc_av_att = nn.Linear(512, 25) 96 | self.fc_a = nn.Linear(128, 512) 97 | self.fc_v = nn.Linear(2048, 512) 98 | self.fc_st = nn.Linear(512, 512) 99 | self.fc_fusion = nn.Linear(1024, 512) 100 | self.audio_encoder = nn.TransformerEncoder \ 101 | (nn.TransformerEncoderLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 102 | self.visual_encoder = nn.TransformerEncoder \ 103 | (nn.TransformerEncoderLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 104 | self.cmt_encoder = Encoder(CMTLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 105 | self.hat_encoder = Encoder(HANLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 106 | 107 | self.t_att = MultiHeadAttention2(512, 512, 512) 108 | self.t_att2 = MultiHeadAttention2(512, 512, 512) 109 | self.fc1= nn.Linear(1024, 256) 110 | self.fc2= nn.Linear(256, 2) 111 | 112 | def forward(self, audio, visual, visual_st, audio2): 113 | 114 | b, t, d = visual_st.size() 115 | x1 = self.fc_a(audio) 116 | x_fake = self.fc_a(audio2) 117 | x_audio = x1 118 | 119 | # 2d and 3d visual feature fusion (b, 80, 2048), (b, 10, 512) 120 | 121 | # merge (b, 80, 2048) -> (b, 10, 512) 122 | vid_s = self.fc_v(visual).permute(0, 2, 1).unsqueeze(-1) 123 | vid_s = F.avg_pool2d(vid_s, (8, 1)).squeeze(-1).permute(0, 2, 1) 124 | 125 | vid_st = self.fc_st(visual_st) 126 | x2 = torch.cat((vid_s, vid_st), dim =-1) 127 | x2 = self.fc_fusion(x2) 128 | x_visual = x2 129 | 130 | # HAN 131 | x1, x2 = self.hat_encoder(x1, x2) 132 | sims = 1 133 | 134 | xx1 = F.normalize(x_visual, p=2, dim=-1) 135 | xx2 = F.normalize(x1, p=2, dim=-1) 136 | xx3 = F.normalize(x_audio, p=2, dim=-1) 137 | xx4 = F.normalize(x2, p=2, dim=-1) 138 | 139 | sims = xx2.bmm(xx1.permute(0, 2, 1)).squeeze(1) * 5 140 | sims2 = 
xx3.bmm(xx4.permute(0, 2, 1)).squeeze(1) * 5 141 | sims = sims.reshape(-1, 10) 142 | sims2 = sims2.reshape(-1, 10) 143 | 144 | sims = [sims, sims2] 145 | 146 | mask = torch.zeros(b, 10) 147 | mask = mask.long() 148 | for i in range(10): 149 | mask[:, i] = i 150 | mask = mask.cuda() 151 | mask = mask.reshape(-1) 152 | # prediction 153 | x = torch.cat([x1.unsqueeze(-2), x2.unsqueeze(-2)], dim=-2) 154 | frame_prob = torch.sigmoid(self.fc_prob(x)) 155 | 156 | # attentive MMIL pooling 157 | 158 | frame_att = torch.softmax(self.fc_frame_att(x), dim=1) 159 | av_att = torch.softmax(self.fc_av_att(x), dim=2) 160 | temporal_prob = (frame_att * frame_prob) 161 | global_prob = (temporal_prob*av_att).sum(dim=2).sum(dim=1) 162 | 163 | a_prob = temporal_prob[:, :, 0, :].sum(dim=1) 164 | v_prob =temporal_prob[:, :, 1, :].sum(dim=1) 165 | 166 | return global_prob, a_prob, v_prob, frame_prob, sims, mask 167 | 168 | 169 | 170 | class CMTLayer(nn.Module): 171 | 172 | def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1): 173 | super(CMTLayer, self).__init__() 174 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 175 | # Implementation of Feedforward model 176 | self.linear1 = nn.Linear(d_model, dim_feedforward) 177 | self.dropout = nn.Dropout(dropout) 178 | self.linear2 = nn.Linear(dim_feedforward, d_model) 179 | 180 | self.norm1 = nn.LayerNorm(d_model) 181 | self.norm2 = nn.LayerNorm(d_model) 182 | self.dropout1 = nn.Dropout(dropout) 183 | self.dropout2 = nn.Dropout(dropout) 184 | 185 | self.activation = nn.ReLU() 186 | 187 | def forward(self, src_q, src_v, src_mask=None, src_key_padding_mask=None): 188 | r"""Pass the input through the encoder layer. 189 | 190 | Args: 191 | src: the sequnce to the encoder layer (required). 192 | src_mask: the mask for the src sequence (optional). 193 | src_key_padding_mask: the mask for the src keys per batch (optional). 194 | 195 | Shape: 196 | see the docs in Transformer class. 197 | """ 198 | src2 = self.self_attn(src_q, src_v, src_v, attn_mask=src_mask, 199 | key_padding_mask=src_key_padding_mask)[0] 200 | src_q = src_q + self.dropout1(src2) 201 | src_q = self.norm1(src_q) 202 | 203 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q)))) 204 | src_q = src_q + self.dropout2(src2) 205 | src_q = self.norm2(src_q) 206 | return src_q 207 | -------------------------------------------------------------------------------- /step2_find_exchange/run.sh: -------------------------------------------------------------------------------- 1 | #CUDA_VISIBLE_DEVICES=1 . 
2 | python main_avvp.py --mode estimate_labels --audio_dir ../feats/vggish/ --video_dir ../feats/res152/ --st_dir ../feats/r2plus1d_18 --model_save_dir ../step1_train_base_model/models/ 3 | -------------------------------------------------------------------------------- /step2_find_exchange/scripts/download_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | def download(set, name, t_seg): 5 | #label = label.replace(" ", "_") # avoid space in folder name 6 | path_data = os.path.join(set, "video") 7 | print(path_data) 8 | if not os.path.exists(path_data): 9 | os.makedirs(path_data) 10 | link_prefix = "https://www.youtube.com/watch?v=" 11 | 12 | filename_full_video = os.path.join(path_data, name) + "_full_video.mp4" 13 | filename = os.path.join(path_data, name) + ".mp4" 14 | link = link_prefix + name 15 | 16 | if os.path.exists(filename): 17 | print("already exists, skip") 18 | return 19 | 20 | print( "download the whole video for: [%s] - [%s]" % (set, name)) 21 | command1 = 'youtube-dl --ignore-config ' 22 | command1 += link + " " 23 | command1 += "-o " + filename_full_video + " " 24 | command1 += "-f best " 25 | 26 | #command1 += '-q ' # print no log 27 | #print command1 28 | os.system(command1) 29 | 30 | t_start, t_end = t_seg 31 | t_dur = t_end - t_start 32 | print("trim the video to [%.1f-%.1f]" % (t_start, t_end)) 33 | command2 = 'ffmpeg ' 34 | command2 += '-ss ' 35 | command2 += str(t_start) + ' ' 36 | command2 += '-i ' 37 | command2 += filename_full_video + ' ' 38 | command2 += '-t ' 39 | command2 += str(t_dur) + ' ' 40 | command2 += '-vcodec libx264 ' 41 | command2 += '-acodec aac -strict -2 ' 42 | command2 += filename + ' ' 43 | command2 += '-y ' # overwrite without asking 44 | command2 += '-loglevel -8 ' # print no log 45 | #print(command2) 46 | os.system(command2) 47 | try: 48 | os.remove(filename_full_video) 49 | except: 50 | return 51 | 52 | print ("finish the video as: " + filename) 53 | 54 | 55 | ##%% read the label encoding 56 | # filename = "../doc/class_labels_indices.csv" 57 | # lines = [x.strip() for x in open(filename, 'r')][1:] 58 | # label_encode = {} 59 | # for l in lines: 60 | # l = l[l.find(",")+1:] 61 | # encode = l.split(",")[0] 62 | # label_encode[ l[len(encode)+2:-1] ] = encode 63 | # 64 | # 65 | # 66 | 67 | # %% read the video trim time indices 68 | filename_source = "data/AVVP_dataset_full.csv" # 69 | set = "data/LLP_dataset" 70 | df = pd.read_csv(filename_source, header=0, sep='\t') 71 | filenames = df["filename"] 72 | length = len(filenames) 73 | print(length) 74 | names = [] 75 | segments = {} 76 | for i in range(length): 77 | row = df.loc[i, :] 78 | name = row[0][:11] 79 | steps = row[0][11:].split("_") 80 | t_start = float(steps[1]) 81 | t_end = t_start + 10 82 | segments[name] = (t_start, t_end) 83 | download(set, name, segments[name]) 84 | names.append(name) 85 | print(len(segments)) 86 | 87 | -------------------------------------------------------------------------------- /step2_find_exchange/scripts/extract_3D_feat.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import subprocess 4 | import glob 5 | from tqdm import tqdm 6 | import numpy as np 7 | import os 8 | import argparse 9 | from PIL import Image 10 | import torch 11 | from torch import nn 12 | import torch.nn.functional as F 13 | import torchvision.models as models 14 | import transforms as TF 15 | import utils 16 | import torchvision 
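# This script converts the extracted frames into clip-level 3D CNN features:
# for each video it samples n_frame_steps (default 80) frames, groups them into
# clips of 8 consecutive sampled frames at 112x112, runs every clip through
# torchvision's r2plus1d_18 with the classification head removed, and saves the
# resulting (n_frame_steps / 8, 512) array to output_dir as <video_id>.npy.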
17 | 18 | 19 | C, H, W = 3, 112, 112 20 | 21 | def extract_feats(params, model, load_img): 22 | global C, H, W 23 | model.eval() 24 | dir_fc = os.path.join(os.getcwd(), params['output_dir']) 25 | if not os.path.isdir(dir_fc): 26 | os.mkdir(dir_fc) 27 | 28 | video_list = os.listdir(params['video_path']) 29 | nn = 0 30 | for video in video_list: 31 | 32 | nn = nn + 1 33 | dst = video 34 | 35 | image_list = sorted(glob.glob(os.path.join(params['video_path'], dst, '*.jpg'))) 36 | samples = np.round(np.linspace( 37 | 0, len(image_list) - 1, params['n_frame_steps'])) 38 | 39 | image_list = [image_list[int(sample)] for sample in samples] 40 | images = torch.zeros((len(image_list)//8, C, 8, H, W)) 41 | i = 0 42 | for iImg in range(len(image_list)): 43 | 44 | ii = i//8 45 | img = load_img(image_list[iImg]) 46 | images[ii, :, i%8, :, :] = img 47 | i += 1 48 | 49 | with torch.no_grad(): 50 | fc_feats = model(images.cuda()).squeeze() 51 | img_feats = fc_feats.cpu().numpy() 52 | # Save the inception features 53 | outfile = os.path.join(dir_fc, video + '.npy') 54 | np.save(outfile, img_feats) 55 | # cleanup 56 | #shutil.rmtree(dst) 57 | print(nn) 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument("--gpu", dest='gpu', type=str, default='1', 63 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 64 | parser.add_argument("--output_dir", dest='output_dir', type=str, 65 | default='data/LLP_dataset/feats/r2plus1d_18', help='directory to store features') 66 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 67 | help='how many frames to sampler per video') 68 | 69 | parser.add_argument("--video_path", dest='video_path', type=str, 70 | default='data/LLP_dataset/frame', help='path to video dataset') 71 | parser.add_argument("--model", dest="model", type=str, default='r2plus1d_18', 72 | help='the CNN model you want to use to extract_feats') 73 | 74 | args = parser.parse_args() 75 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 76 | params = vars(args) 77 | if params['model'] == 'r2plus1d_18': 78 | model = models.video.r2plus1d_18(pretrained=True) 79 | model = nn.Sequential(*list(model.children())[:-1]) 80 | for param in model.parameters(): 81 | param.requires_grad = False 82 | T, C, H, W = 8, 3, 112, 112 83 | load_img = utils.LoadTransformImage() 84 | 85 | else: 86 | print("doesn't support %s" % (params['model'])) 87 | 88 | model = nn.DataParallel(model) 89 | model = model.cuda() 90 | extract_feats(params, model, load_img) 91 | -------------------------------------------------------------------------------- /step2_find_exchange/scripts/extract_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import moviepy 3 | from moviepy.audio.AudioClip import AudioArrayClip 4 | from moviepy.editor import VideoFileClip 5 | 6 | video_pth = "data/LLP_dataset/video" 7 | sound_list = os.listdir(video_pth) 8 | save_pth = "data/LLP_dataset/audio" 9 | 10 | for audio_id in sound_list: 11 | name = os.path.join(video_pth, audio_id) 12 | audio_name = audio_id[:-4] + '.wav' 13 | exist_lis = os.listdir(save_pth) 14 | if audio_name in exist_lis: 15 | print("already exist!") 16 | continue 17 | try: 18 | video = VideoFileClip(name) 19 | audio = video.audio 20 | audio.write_audiofile(os.path.join(save_pth, audio_name), fps=16000) 21 | print("finish video id: " + audio_name) 22 | except: 23 | print("cannot load ", name) 
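# --- Optional sanity check (illustrative sketch, not part of the original script) ---
# The clips trimmed by download_dataset.py are ~10 s long and the audio is written
# at fps=16000 above, so it can be worth confirming the extracted .wav files before
# running VGGish feature extraction. This sketch only uses the standard-library
# wave module and reuses the os / save_pth names defined above; the duration
# tolerance of +/- 1 s is an arbitrary choice.
import wave

for wav_name in os.listdir(save_pth):
    if not wav_name.endswith(".wav"):
        continue
    w = wave.open(os.path.join(save_pth, wav_name), "rb")
    rate = w.getframerate()
    duration = w.getnframes() / float(rate)
    w.close()
    if rate != 16000 or not (9.0 <= duration <= 11.0):
        print("please check %s: %d Hz, %.2f s" % (wav_name, rate, duration))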
-------------------------------------------------------------------------------- /step2_find_exchange/scripts/extract_frames.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import os 4 | import argparse 5 | import glob 6 | 7 | def extract_frames(video, dst): 8 | command1 = 'ffmpeg ' 9 | command1 += '-i ' + video + " " 10 | command1 += '-y' + " " 11 | command1 += "-r " + "8 " 12 | command1 += '{0}/%06d.jpg'.format(dst) 13 | print(command1) 14 | # print command1 15 | os.system(command1) 16 | 17 | return 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--out_dir', dest='out_dir', type=str, default='data/LLP_dataset/frame') 22 | parser.add_argument('--video_path', dest='video_path', type=str, default='data/LLP_dataset/video') 23 | args = parser.parse_args() 24 | 25 | vid_list = os.listdir(args.video_path) 26 | 27 | for vid_id in vid_list: 28 | name = os.path.join(args.video_path, vid_id) 29 | dst = os.path.join(args.out_dir, vid_id[:-4]) 30 | print(dst) 31 | if not os.path.exists(dst): 32 | os.makedirs(dst) 33 | extract_frames(name, dst) 34 | print("finish video id: " + vid_id) 35 | -------------------------------------------------------------------------------- /step2_find_exchange/scripts/extract_rgb_feat.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | from tqdm import tqdm 5 | import numpy as np 6 | import os 7 | import argparse 8 | from PIL import Image 9 | import torch 10 | from torch import nn 11 | import torch.nn.functional as F 12 | import pretrainedmodels 13 | from pretrainedmodels import utils 14 | 15 | C, H, W = 3, 224, 224 16 | 17 | def extract_feats(params, model, load_image_fn): 18 | global C, H, W 19 | model.eval() 20 | dir_fc = os.path.join(os.getcwd(), params['output_dir']) 21 | if not os.path.isdir(dir_fc): 22 | os.mkdir(dir_fc) 23 | 24 | video_list = os.listdir(params['video_path']) 25 | nn = 0 26 | for video in video_list: 27 | 28 | nn = nn + 1 29 | dst = video 30 | 31 | image_list = sorted(glob.glob(os.path.join(params['video_path'], dst, '*.jpg'))) 32 | samples = np.round(np.linspace( 33 | 0, len(image_list) - 1, params['n_frame_steps'])) 34 | 35 | image_list = [image_list[int(sample)] for sample in samples] 36 | images = torch.zeros((len(image_list), C, H, W)) 37 | i = 0 38 | for iImg in range(len(image_list)): 39 | img = load_image_fn(image_list[iImg]) 40 | images[iImg] = img 41 | 42 | 43 | with torch.no_grad(): 44 | fc_feats = model(images.cuda()).squeeze() 45 | img_feats = fc_feats.cpu().numpy() 46 | #print(img_feats.shape) 47 | # Save the inception features 48 | outfile = os.path.join(dir_fc, video + '.npy') 49 | np.save(outfile, img_feats) 50 | # cleanup 51 | #shutil.rmtree(dst) 52 | print(nn) 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--gpu", dest='gpu', type=str, default='0', 58 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 59 | parser.add_argument("--output_dir", dest='output_dir', type=str, 60 | default='data/LLP_dataset/feats/res152', help='directory to store features') 61 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 62 | help='how many frames to sampler per video') 63 | 64 | parser.add_argument("--video_path", dest='video_path', type=str, 65 | default='data/LLP_dataset/frame', help='path to video dataset') 66 
| parser.add_argument("--model", dest="model", type=str, default='resnet152', 67 | help='the CNN model you want to use to extract_feats') 68 | 69 | args = parser.parse_args() 70 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 71 | params = vars(args) 72 | if params['model'] == 'inception_v3': 73 | C, H, W = 3, 299, 299 74 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 75 | load_image_fn = utils.LoadTransformImage(model) 76 | 77 | elif params['model'] == 'resnet152': 78 | C, H, W = 3, 224, 224 79 | model = pretrainedmodels.resnet152(pretrained='imagenet') 80 | load_image_fn = utils.LoadTransformImage(model) 81 | elif params['model'] == 'vgg19_bn': 82 | C, H, W = 3, 224, 224 83 | model = pretrainedmodels.vgg19_bn(pretrained='imagenet') 84 | load_image_fn = utils.LoadTransformImage(model) 85 | elif params['model'] == 'inception_v4': 86 | C, H, W = 3, 299, 299 87 | model = pretrainedmodels.inceptionv4( 88 | num_classes=1000, pretrained='imagenet') 89 | load_image_fn = utils.LoadTransformImage(model) 90 | elif params['model'] == 'nasnetalarge': 91 | C, H, W = 3, 299, 299 92 | model = pretrainedmodels.inceptionv4( 93 | num_classes=1000, pretrained='imagenet') 94 | load_image_fn = utils.LoadTransformImage(model) 95 | 96 | else: 97 | print("doesn't support %s" % (params['model'])) 98 | 99 | model.last_linear = utils.Identity() 100 | model = nn.DataParallel(model) 101 | 102 | model = model.cuda() 103 | extract_feats(params, model, load_image_fn) 104 | -------------------------------------------------------------------------------- /step2_find_exchange/scripts/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | def crop(vid, i, j, h, w): 6 | return vid[..., i:(i + h), j:(j + w)] 7 | 8 | 9 | def center_crop(vid, output_size): 10 | h, w = vid.shape[-2:] 11 | th, tw = output_size 12 | 13 | i = int(round((h - th) / 2.)) 14 | j = int(round((w - tw) / 2.)) 15 | return crop(vid, i, j, th, tw) 16 | 17 | 18 | def hflip(vid): 19 | return vid.flip(dims=(-1,)) 20 | 21 | 22 | # NOTE: for those functions, which generally expect mini-batches, we keep them 23 | # as non-minibatch so that they are applied as if they were 4d (thus image). 24 | # this way, we only apply the transformation in the spatial domain 25 | def resize(vid, size, interpolation='bilinear'): 26 | # NOTE: using bilinear interpolation because we don't work on minibatches 27 | # at this level 28 | scale = None 29 | if isinstance(size, int): 30 | scale = float(size) / min(vid.shape[-2:]) 31 | size = None 32 | return torch.nn.functional.interpolate( 33 | vid, size=size, scale_factor=scale, mode=interpolation, align_corners=False) 34 | 35 | 36 | def pad(vid, padding, fill=0, padding_mode="constant"): 37 | # NOTE: don't want to pad on temporal dimension, so let as non-batch 38 | # (4d) before padding. 
This works as expected 39 | return torch.nn.functional.pad(vid, padding, value=fill, mode=padding_mode) 40 | 41 | 42 | def to_normalized_float_tensor(vid): 43 | return vid.permute(3, 0, 1, 2).to(torch.float32) / 255 44 | 45 | 46 | def normalize(vid, mean, std): 47 | shape = (-1,) + (1,) * (vid.dim() - 1) 48 | mean = torch.as_tensor(mean).reshape(shape) 49 | std = torch.as_tensor(std).reshape(shape) 50 | return (vid - mean) / std 51 | 52 | 53 | # Class interface 54 | 55 | class RandomCrop(object): 56 | def __init__(self, size): 57 | self.size = size 58 | 59 | @staticmethod 60 | def get_params(vid, output_size): 61 | """Get parameters for ``crop`` for a random crop. 62 | """ 63 | h, w = vid.shape[-2:] 64 | th, tw = output_size 65 | if w == tw and h == th: 66 | return 0, 0, h, w 67 | i = random.randint(0, h - th) 68 | j = random.randint(0, w - tw) 69 | return i, j, th, tw 70 | 71 | def __call__(self, vid): 72 | i, j, h, w = self.get_params(vid, self.size) 73 | return crop(vid, i, j, h, w) 74 | 75 | 76 | class CenterCrop(object): 77 | def __init__(self, size): 78 | self.size = size 79 | 80 | def __call__(self, vid): 81 | return center_crop(vid, self.size) 82 | 83 | 84 | class Resize(object): 85 | def __init__(self, size): 86 | self.size = size 87 | 88 | def __call__(self, vid): 89 | return resize(vid, self.size) 90 | 91 | 92 | class ToFloatTensorInZeroOne(object): 93 | def __call__(self, vid): 94 | return to_normalized_float_tensor(vid) 95 | 96 | 97 | class Normalize(object): 98 | def __init__(self, mean, std): 99 | self.mean = mean 100 | self.std = std 101 | 102 | def __call__(self, vid): 103 | return normalize(vid, self.mean, self.std) 104 | 105 | 106 | class RandomHorizontalFlip(object): 107 | def __init__(self, p=0.5): 108 | self.p = p 109 | 110 | def __call__(self, vid): 111 | if random.random() < self.p: 112 | return hflip(vid) 113 | return vid 114 | 115 | 116 | class Pad(object): 117 | def __init__(self, padding, fill=0): 118 | self.padding = padding 119 | self.fill = fill 120 | 121 | def __call__(self, vid): 122 | return pad(vid, self.padding, self.fill) 123 | -------------------------------------------------------------------------------- /step2_find_exchange/scripts/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms as transforms 6 | from PIL import Image 7 | import numpy as np 8 | from munch import munchify 9 | 10 | class ToSpaceBGR(object): 11 | 12 | def __init__(self, is_bgr): 13 | self.is_bgr = is_bgr 14 | 15 | def __call__(self, tensor): 16 | if self.is_bgr: 17 | new_tensor = tensor.clone() 18 | new_tensor[0] = tensor[2] 19 | new_tensor[2] = tensor[0] 20 | tensor = new_tensor 21 | return tensor 22 | 23 | 24 | class ToRange255(object): 25 | 26 | def __init__(self, is_255): 27 | self.is_255 = is_255 28 | 29 | def __call__(self, tensor): 30 | if self.is_255: 31 | tensor.mul_(255) 32 | return tensor 33 | 34 | 35 | class TransformImage(object): 36 | 37 | def __init__(self): 38 | self.mean = [0.43216, 0.394666, 0.37645] 39 | self.std = [0.22803, 0.22145, 0.216989] 40 | tfs = [] 41 | tfs.append(transforms.Resize((112, 112))) 42 | tfs.append(transforms.ToTensor()) 43 | tfs.append(ToSpaceBGR('RGB'=='BGR')) 44 | tfs.append(ToRange255(max([0, 1])==255)) 45 | tfs.append(transforms.Normalize(mean=self.mean, std=self.std)) 46 | 47 | self.tf = transforms.Compose(tfs) 48 | 49 | def __call__(self, 
img): 50 | tensor = self.tf(img) 51 | return tensor 52 | 53 | 54 | class LoadImage(object): 55 | 56 | def __init__(self, space='RGB'): 57 | self.space = space 58 | 59 | def __call__(self, path_img): 60 | with open(path_img, 'rb') as f: 61 | with Image.open(f) as img: 62 | img = img.convert(self.space) 63 | return img 64 | 65 | 66 | class LoadTransformImage(object): 67 | 68 | def __init__(self): 69 | self.load = LoadImage() 70 | self.tf = TransformImage() 71 | 72 | def __call__(self, path_img): 73 | img = self.load(path_img) 74 | tensor = self.tf(img) 75 | return tensor 76 | 77 | 78 | class Identity(nn.Module): 79 | 80 | def __init__(self): 81 | super(Identity, self).__init__() 82 | 83 | def forward(self, x): 84 | return x -------------------------------------------------------------------------------- /step2_find_exchange/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step2_find_exchange/utils/__init__.py -------------------------------------------------------------------------------- /step2_find_exchange/utils/eval_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def Precision(X_pre, X_gt): 4 | 5 | N = len(X_pre) 6 | p = 0.0 7 | for i in range(N): 8 | x = X_pre[i, :] 9 | y = X_gt[i, :] 10 | p += np.sum(x*y)/np.sum(x) 11 | return p/N 12 | 13 | 14 | def Recall(X_pre, X_gt): 15 | N = len(X_pre) 16 | p = 0.0 17 | for i in range(N): 18 | x = X_pre[i, :] 19 | y = X_gt[i, :] 20 | p += np.sum(x * y) / np.sum(y) 21 | return p/N 22 | 23 | 24 | def F1(X_pre, X_gt): 25 | N = len(X_pre) 26 | p = 0 27 | for i in range(N): 28 | x = X_pre[i, :] 29 | y = X_gt[i, :] 30 | p += 2*np.sum(x * y) / (np.sum(x) + np.sum(y)) 31 | return p/N 32 | 33 | def event_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av): 34 | # extract events 35 | N = 25 36 | event_p_a = [None for n in range(25)] 37 | event_gt_a = [None for n in range(25)] 38 | event_p_v = [None for n in range(25)] 39 | event_gt_v = [None for n in range(25)] 40 | event_p_av = [None for n in range(25)] 41 | event_gt_av = [None for n in range(25)] 42 | 43 | TP_a = np.zeros(25) 44 | TP_v = np.zeros(25) 45 | TP_av = np.zeros(25) 46 | 47 | FP_a = np.zeros(25) 48 | FP_v = np.zeros(25) 49 | FP_av = np.zeros(25) 50 | 51 | FN_a = np.zeros(25) 52 | FN_v = np.zeros(25) 53 | FN_av = np.zeros(25) 54 | 55 | for n in range(N): 56 | seq_pred = SO_a[n, :] 57 | if np.sum(seq_pred)!=0: 58 | x = extract_event(seq_pred, n) 59 | event_p_a[n] = x 60 | seq_gt = GT_a[n, :] 61 | if np.sum(seq_gt)!=0: 62 | x = extract_event(seq_gt, n) 63 | event_gt_a[n] = x 64 | 65 | seq_pred = SO_v[n, :] 66 | if np.sum(seq_pred) != 0: 67 | x = extract_event(seq_pred, n) 68 | event_p_v[n] = x 69 | seq_gt = GT_v[n, :] 70 | if np.sum(seq_gt) != 0: 71 | x = extract_event(seq_gt, n) 72 | event_gt_v[n] = x 73 | 74 | seq_pred = SO_av[n, :] 75 | if np.sum(seq_pred) != 0: 76 | x = extract_event(seq_pred, n) 77 | event_p_av[n] = x 78 | 79 | seq_gt = GT_av[n, :] 80 | if np.sum(seq_gt) != 0: 81 | x = extract_event(seq_gt, n) 82 | event_gt_av[n] = x 83 | 84 | tp, fp, fn = event_wise_metric(event_p_a[n], event_gt_a[n]) 85 | TP_a[n] += tp 86 | FP_a[n] += fp 87 | FN_a[n] += fn 88 | 89 | tp, fp, fn = event_wise_metric(event_p_v[n], event_gt_v[n]) 90 | TP_v[n] += tp 91 | FP_v[n] += fp 92 | FN_v[n] += fn 93 | 94 | tp, fp, fn = event_wise_metric(event_p_av[n], event_gt_av[n]) 95 | 
TP_av[n] += tp 96 | FP_av[n] += fp 97 | FN_av[n] += fn 98 | 99 | TP = TP_a + TP_v 100 | FN = FN_a + FN_v 101 | FP = FP_a + FP_v 102 | 103 | n = len(FP_a) 104 | F_a = [] 105 | for ii in range(n): 106 | if (TP_a + FP_a)[ii] != 0 or (TP_a + FN_a)[ii] != 0: 107 | F_a.append(2 * TP_a[ii] / (2 * TP_a[ii] + (FN_a + FP_a)[ii])) 108 | 109 | F_v = [] 110 | for ii in range(n): 111 | if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0: 112 | F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii])) 113 | 114 | F = [] 115 | for ii in range(n): 116 | if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0: 117 | F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii])) 118 | 119 | F_av = [] 120 | for ii in range(n): 121 | if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0: 122 | F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii])) 123 | 124 | if len(F_a) == 0: 125 | f_a = 1.0 # all true negatives 126 | else: 127 | f_a = (sum(F_a)/len(F_a)) 128 | 129 | if len(F_v) == 0: 130 | f_v = 1.0 # all true negatives 131 | else: 132 | f_v = (sum(F_v)/len(F_v)) 133 | 134 | if len(F) == 0: 135 | f = 1.0 # all true negatives 136 | else: 137 | f = (sum(F)/len(F)) 138 | if len(F_av) == 0: 139 | f_av = 1.0 # all true negatives 140 | else: 141 | f_av = (sum(F_av)/len(F_av)) 142 | 143 | return f_a, f_v, f, f_av 144 | 145 | 146 | def segment_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av): 147 | # compute F scores 148 | TP_a = np.sum(SO_a * GT_a, axis=1) 149 | FN_a = np.sum((1-SO_a)*GT_a, axis = 1) 150 | FP_a = np.sum(SO_a*(1-GT_a),axis=1) 151 | 152 | n = len(FP_a) 153 | F_a = [] 154 | for ii in range(n): 155 | if (TP_a+FP_a)[ii]!= 0 or (TP_a+FN_a)[ii]!= 0: 156 | F_a.append(2*TP_a[ii] / (2*TP_a[ii] + (FN_a + FP_a)[ii])) 157 | 158 | TP_v = np.sum(SO_v * GT_v, axis=1) 159 | FN_v = np.sum((1 - SO_v) * GT_v, axis=1) 160 | FP_v = np.sum(SO_v * (1 - GT_v), axis=1) 161 | F_v = [] 162 | for ii in range(n): 163 | if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0: 164 | F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii])) 165 | 166 | TP = TP_a + TP_v 167 | FN = FN_a + FN_v 168 | FP = FP_a + FP_v 169 | 170 | n = len(FP) 171 | 172 | F = [] 173 | for ii in range(n): 174 | if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0: 175 | F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii])) 176 | 177 | TP_av = np.sum(SO_av * GT_av, axis=1) 178 | FN_av = np.sum((1 - SO_av) * GT_av, axis=1) 179 | FP_av = np.sum(SO_av * (1 - GT_av), axis=1) 180 | n = len(FP_av) 181 | F_av = [] 182 | for ii in range(n): 183 | if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0: 184 | F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii])) 185 | 186 | 187 | if len(F_a) == 0: 188 | f_a = 1.0 # all true negatives 189 | else: 190 | f_a = (sum(F_a)/len(F_a)) 191 | 192 | if len(F_v) == 0: 193 | f_v = 1.0 # all true negatives 194 | else: 195 | f_v = (sum(F_v)/len(F_v)) 196 | 197 | if len(F) == 0: 198 | f = 1.0 # all true negatives 199 | else: 200 | f = (sum(F)/len(F)) 201 | if len(F_av) == 0: 202 | f_av = 1.0 # all true negatives 203 | else: 204 | f_av = (sum(F_av)/len(F_av)) 205 | 206 | return f_a, f_v, f, f_av 207 | 208 | 209 | def to_vec(start, end): 210 | x = np.zeros(10) 211 | for i in range(start, end): 212 | x[i] = 1 213 | return x 214 | 215 | def extract_event(seq, n): 216 | x = [] 217 | i = 0 218 | while i < 10: 219 | if seq[i] == 1: 220 | start = i 221 | if i + 1 == 10: 222 | i = i + 1 223 | end = i 224 | x.append(to_vec(start, end)) 225 | break 226 | 227 | for j in range(i + 1, 10): 228 | if seq[j] != 1: 229 | i = j + 1 230 | end = j 231 | 
x.append(to_vec(start, end)) 232 | break 233 | else: 234 | i = j + 1 235 | if i == 10: 236 | end = i 237 | x.append(to_vec(start, end)) 238 | break 239 | else: 240 | i += 1 241 | return x 242 | 243 | def event_wise_metric(event_p, event_gt): 244 | TP = 0 245 | FP = 0 246 | FN = 0 247 | 248 | if event_p is not None: 249 | num_event = len(event_p) 250 | for i in range(num_event): 251 | x1 = event_p[i] 252 | if event_gt is not None: 253 | nn = len(event_gt) 254 | flag = True 255 | for j in range(nn): 256 | x2 = event_gt[j] 257 | if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2): #0.5 258 | TP += 1 259 | flag = False 260 | break 261 | if flag: 262 | FP += 1 263 | else: 264 | FP += 1 265 | 266 | if event_gt is not None: 267 | num_event = len(event_gt) 268 | for i in range(num_event): 269 | x1 = event_gt[i] 270 | if event_p is not None: 271 | nn = len(event_p) 272 | flag = True 273 | for j in range(nn): 274 | x2 = event_p[j] 275 | if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2): #0.5 276 | flag = False 277 | break 278 | if flag: 279 | FN += 1 280 | else: 281 | FN += 1 282 | return TP, FP, FN 283 | -------------------------------------------------------------------------------- /step3_retrain/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step3_retrain/.gitkeep -------------------------------------------------------------------------------- /step3_retrain/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step3_retrain/data/.DS_Store -------------------------------------------------------------------------------- /step3_retrain/data/AVVP_val_pd.csv: -------------------------------------------------------------------------------- 1 | filename event_labels 2 | 4O9rI-FpqLg_10_20 Cheering,Speech,Clapping 3 | 8sfcMZ8A3OQ_180_190 Frying_(food),Speech 4 | JGdkVx81S-k_20_30 Blender 5 | 9ggKWjpQPeQ_0_10 Speech,Dog 6 | BRtGk2YmdJ0_20_30 Speech,Baby_laughter 7 | 6mWz9fxXO7s_490_500 Chainsaw,Speech 8 | 6TX5z51nhvQ_30_40 Accordion,Speech 9 | LzgeBj-P29Q_30_40 Violin_fiddle,Speech,Cheering,Clapping 10 | FLYjmRy39_g_10_20 Lawn_mower 11 | LYadFGHyPow_350_360 Chainsaw 12 | EwTLLwJtDyo_180_190 Singing,Acoustic_guitar 13 | P8saKKI43tA_20_30 Vacuum_cleaner 14 | 1PW3CB-gkHk_30_40 Car 15 | AsY6Bqnp8I0_590_600 Motorcycle,Speech,Car 16 | GvCEvNg0a9w_30_40 Speech,Dog 17 | FHgIDw8rGtA_150_160 Motorcycle,Speech 18 | 5bS6ZvqET4U_210_220 Singing,Acoustic_guitar 19 | FzpI8ggObiE_280_290 Violin_fiddle 20 | 2uNBP5Bm2to_30_40 Speech,Car 21 | Ep3yRHFevHo_0_10 Basketball_bounce,Speech 22 | Lnpe9u8cF50_90_100 Accordion,Singing,Acoustic_guitar 23 | 7yeWLEus7OI_80_90 Blender 24 | AErPqu3A5SI_250_260 Speech,Dog 25 | Cyi2fuiswx8_27_37 Speech,Baby_cry_infant_cry 26 | -9D8GR2VBKE_40_50 Speech,Frying_(food) 27 | C1Gy-TCPU1k_30_40 Helicopter 28 | IBdAV36uQCg_0_10 Singing,Speech 29 | OWrp89_xwVw_80_90 Acoustic_guitar,Singing 30 | CnXeNxV7se0_20_30 Cello,Violin_fiddle,Acoustic_guitar 31 | DS_kzdgXoV4_30_40 Accordion 32 | 8hkevfLlY6U_70_80 Accordion 33 | 1eOhQvtQHm0_160_170 Blender,Speech 34 | 7bPt8dhIAiU_120_130 Vacuum_cleaner 35 | J6cBf-yBto0_140_150 Blender 36 | FXM8uAh3nS8_70_80 Chicken_rooster,Speech 37 | 6puZFsJvC64_70_80 Banjo,Clapping 38 | GE8GjxkhqJQ_170_180 Singing 39 | GEoZoRyYc1Q_90_100 
Baby_laughter,Speech 40 | 5jKH6UTb7sw_30_40 Banjo,Speech,Clapping,Acoustic_guitar 41 | GJbpzOoDmO4_210_220 Cello,Violin_fiddle,Singing 42 | HxPdX8caIaM_10_20 Cheering,Speech 43 | GkjMWXHp0Ek_150_160 Blender,Speech 44 | 0QXKGC5bXno_30_40 Cello,Violin_fiddle 45 | 1NAoH6dROvo_30_40 Cheering 46 | KXZXHrLBpPo_350_360 Acoustic_guitar,Speech 47 | Jcn4AhTKLuk_120_130 Motorcycle,Speech,Car 48 | 9cwBZS6IoQE_29_39 Chicken_rooster,Dog 49 | 84OQA_JJHPY_390_400 Accordion,Singing 50 | 3xgysBvOy6s_280_290 Speech,Clapping 51 | EbMfK3MVPYU_0_10 Telephone_bell_ringing 52 | AuxLUHyzMoQ_30_40 Speech,Dog 53 | FHvhhlwmJ3A_20_30 Lawn_mower,Speech 54 | FwODSWGmEdY_80_90 Speech,Dog,Car 55 | AK9paigQCcA_70_80 Speech,Baby_laughter 56 | 7hspZOznOIM_0_10 Telephone_bell_ringing,Speech 57 | 0Ft06J1FdAo_420_430 Cheering 58 | FR4em-G3gfo_30_40 Vacuum_cleaner,Cat 59 | -3G-bs5JaW8_100_110 Vacuum_cleaner 60 | 5eAw9Pp6dHk_260_270 Cello,Violin_fiddle 61 | 0lEavQUSRO4_410_420 Speech,Frying_(food) 62 | JRavR1Wdnp0_0_10 Baby_cry_infant_cry 63 | O9d8Yg9H0BQ_30_40 Cello,Violin_fiddle 64 | 55GEs-yyOXc_80_90 Basketball_bounce,Speech 65 | 7ECzsh5EkP8_30_40 Chicken_rooster 66 | CmjceF4lpVg_150_160 Helicopter 67 | LE3ux-hbUEg_160_170 Speech,Dog 68 | EmJch5XooOQ_0_10 Vacuum_cleaner 69 | GilJAJTcmo8_13_23 Speech,Baby_cry_infant_cry 70 | D0kNXqg7Kcw_20_30 Chainsaw 71 | N6JTpk6UdgQ_30_40 Violin_fiddle,Speech 72 | 7Cqk1_yMgkE_30_40 Clapping 73 | B_xS6aJj_Jc_30_40 Cheering,Speech 74 | 78yrwc9y1ik_10_20 Baby_cry_infant_cry 75 | DwzBJ19O0CI_60_70 Speech,Dog 76 | 403Hzy9-EsY_550_560 Cheering,Clapping,Speech 77 | HJZUFlrj2xk_0_10 Motorcycle 78 | 3RIFPfcoHEM_20_30 Basketball_bounce,Speech 79 | 9LloUO5C7do_50_60 Accordion 80 | 9H1fa9-f3q0_420_430 Frying_(food) 81 | DiBzz6M9cUk_30_40 Cello,Violin_fiddle 82 | 4zPHNzZiZlU_280_290 Singing,Acoustic_guitar 83 | OaJuXaA7tp8_30_40 Cello,Violin_fiddle 84 | Eo_FyQjui1k_10_20 Helicopter 85 | M1xbDGcD284_30_40 Baby_cry_infant_cry 86 | OAzPNdL-wQI_480_490 Helicopter 87 | FR0cOQrzaWQ_280_290 Singing,Acoustic_guitar 88 | CBAdaQsnfmQ_60_70 Singing,Banjo,Acoustic_guitar 89 | FtvApi4UlEM_30_40 Acoustic_guitar,Speech,Singing,Clapping 90 | 0dVXlandefw_4_14 Speech,Car 91 | An2wy6lxaVQ_290_300 Singing 92 | Ek8XKK6LE6Y_30_40 Vacuum_cleaner 93 | 6PKKgaQgm1g_90_100 Chicken_rooster 94 | H0j1eqsKVEw_120_130 Singing,Speech 95 | CIx729pLHGk_30_40 Speech,Dog 96 | 7y0E-IeaZBE_30_40 Chicken_rooster 97 | Kw-PLfbmjvQ_30_40 Helicopter 98 | 3MO-mchYrHA_30_40 Car 99 | GpPcLJ0b40o_140_150 Singing,Banjo,Acoustic_guitar 100 | 7YChvLYpyUk_30_40 Cat,Dog 101 | I--0mnbN4JI_10_20 Lawn_mower,Speech 102 | C0_NDX7DVd8_30_40 Chicken_rooster,Speech 103 | 4ANSBlG1pfs_60_70 Speech,Vacuum_cleaner 104 | G2jkaWZrsjE_310_320 Singing,Speech 105 | LxFenjME4NQ_0_10 Helicopter,Speech 106 | DGeCOmJq0Ow_70_80 Acoustic_guitar,Speech 107 | AGdT1gXPouk_200_210 Frying_(food),Speech 108 | 2yanV6gJNV0_0_10 Speech,Car 109 | 9IfN8dUgGaQ_30_40 Chicken_rooster,Cat,Dog 110 | 0qx_L11zBAY_30_40 Cello,Violin_fiddle 111 | 3YypZ-SiVjM_30_40 Helicopter 112 | O0tHJP8wnRc_70_80 Singing,Speech,Clapping 113 | Hky1DDrf23A_30_40 Cat,Speech 114 | Lpc7Ws6PSfg_2_12 Baby_cry_infant_cry 115 | 8h3tlcPyqwc_40_50 Basketball_bounce,Speech,Cheering,Clapping 116 | LO_DK-3poPg_130_140 Singing,Speech 117 | LBbDtEE9l2Q_18_28 Cat,Speech 118 | 2UZmUuUPqr8_90_100 Frying_(food),Speech 119 | GTxET6TlBRM_30_40 Speech,Baby_cry_infant_cry 120 | 8nfl_0VXMZs_30_40 Acoustic_guitar,Violin_fiddle 121 | 0zAFUHzL5kU_23_33 Speech,Car,Cheering 122 | 9rtvuHifBEY_30_40 Chicken_rooster,Speech 123 | LMqpXcUyLTU_0_10 
Speech,Baby_cry_infant_cry 124 | Mj3zLi9jRjI_30_40 Cat,Speech 125 | 7g3idUTE1KM_30_40 Speech,Clapping 126 | DDVbCC9GbGE_50_60 Singing,Cheering 127 | GY-a6Wh7o_g_60_70 Motorcycle,Speech 128 | 8WLIDa9U9Ug_340_350 Cello,Violin_fiddle 129 | AE4ZaQMa2BE_30_40 Chicken_rooster,Speech 130 | OgMQvYOyH4s_30_40 Speech,Vacuum_cleaner 131 | NHs8LJTwrRU_13_23 Speech,Baby_cry_infant_cry 132 | 7USy-jxSsUg_30_40 Baby_cry_infant_cry 133 | 3KwxaHZD2yU_220_230 Cheering,Clapping 134 | BrH-NJpiTOo_90_100 Chicken_rooster,Speech 135 | F7CqvjjkHyA_30_40 Cat,Speech 136 | KMEisCK2xjQ_30_40 Violin_fiddle,Speech 137 | FdbtgaHTI9o_200_210 Helicopter,Speech 138 | NBFMvfX_yWA_180_190 Violin_fiddle,Speech 139 | ITP2x6GZGOw_180_190 Frying_(food),Speech 140 | 1RJsIRC9sVg_30_40 Car 141 | IiOj18HpE8E_30_40 Chainsaw 142 | 9_Pe-wbbehQ_10_20 Fire_alarm 143 | EggoG-tK9C0_0_10 Cat,Speech 144 | 0l4rszJ8M9E_90_100 Banjo,Violin_fiddle 145 | 0iS6S7e_j8Q_20_30 Clapping,Singing 146 | Of_01ci6ErU_20_30 Speech,Dog 147 | 3aO1wVxQhUk_590_600 Speech,Clapping,Cheering 148 | 07QDI50hlec_50_60 Vacuum_cleaner 149 | AMVvfK6Px8Y_280_290 Singing,Violin_fiddle,Speech 150 | 9ui9IiwF6w8_180_190 Frying_(food),Speech 151 | OYnra-ve09E_30_40 Chicken_rooster,Speech 152 | CFc2Ex8rFr8_460_470 Cello 153 | NBfoeB2u52c_30_40 Speech,Baby_cry_infant_cry 154 | EBCrVc6b7xI_120_130 Accordion 155 | JurJCOlwHnI_30_40 Motorcycle 156 | 7oPED09nVAk_100_110 Singing,Acoustic_guitar 157 | Fdp9dfnaLuY_30_40 Violin_fiddle,Speech 158 | 2U6WeaRLEds_140_150 Blender,Speech 159 | A-PigXQWChs_30_40 Helicopter 160 | 2UjAPgJudZU_100_110 Speech,Car 161 | H74oGbW1sw8_10_20 Cello,Speech 162 | 4sXCFq_FrPo_100_110 Banjo,Speech 163 | 7eO9zH2KCC4_30_40 Acoustic_guitar,Violin_fiddle 164 | B8CXFCBwBxs_540_550 Speech,Vacuum_cleaner 165 | 050iiMJKE20_180_190 Chainsaw,Speech,Helicopter 166 | 1ZDVv5aNQfk_30_40 Car 167 | 5PRDmll4w5g_60_70 Speech,Frying_(food) 168 | PAoRcQiNJGI_10_20 Cheering,Speech,Clapping 169 | P6xxuZnRHXM_150_160 Singing,Acoustic_guitar 170 | HuaiNWPbTFE_20_30 Chicken_rooster 171 | 0gmLAVhaTKE_5_15 Car 172 | 6fgxUTmkGmA_90_100 Basketball_bounce,Speech 173 | Dk9Db7pUggU_120_130 Singing,Acoustic_guitar,Speech 174 | 6dvq5W-9D04_30_40 Chicken_rooster 175 | OqoCNCPb1Fg_60_70 Cello,Violin_fiddle 176 | 7Z9KzMO4hbM_180_190 Telephone_bell_ringing,Speech 177 | 6pHtlOXBxTI_30_40 Cheering,Speech 178 | A2k8DsBeCvc_40_50 Chicken_rooster 179 | BHLdOTsouJw_30_40 Banjo,Speech 180 | 9-grzj23Bos_0_10 Violin_fiddle,Singing 181 | F-TRR1Qgo9g_90_100 Basketball_bounce,Speech,Clapping,Cheering 182 | BbDf0ZKc4zA_30_40 Speech,Dog 183 | AaITIVxmm2w_120_130 Accordion 184 | JHZgrp0pMaM_0_10 Vacuum_cleaner 185 | FpnKchSo3VE_160_170 Motorcycle,Speech,Singing 186 | 1ypY7k2Biao_90_100 Acoustic_guitar 187 | DptWWXVlKHw_40_50 Speech,Fire_alarm 188 | 530yiQ7x99E_570_580 Blender,Speech 189 | CYwI2Yo3eYs_60_70 Motorcycle,Speech 190 | NHPWU9bKcc4_210_220 Cello,Violin_fiddle 191 | Ke5YRnqT94o_130_140 Vacuum_cleaner 192 | 9osfCQTKjOY_70_80 Singing 193 | DQ3ZFO2RP04_14_24 Speech,Dog 194 | 2GfYNfKtpK0_30_40 Chainsaw 195 | Lke5mGYGlNE_20_30 Telephone_bell_ringing,Speech 196 | DHW0GEDee0k_20_30 Accordion,Speech 197 | 35Id1r22ypE_30_40 Car 198 | 9fcxPwWzDM0_30_40 Accordion 199 | 0J3OzIVLnEI_50_60 Chicken_rooster 200 | DgElEEFPRaE_570_580 Blender 201 | AtHewToyLTc_21_31 Speech 202 | H8_q1ovRwv8_90_100 Speech,Dog 203 | 7KzioU2hP7M_150_160 Violin_fiddle 204 | ESvhM605xUk_70_80 Singing 205 | 9dFgCv-R7i0_0_10 Speech,Dog 206 | FjtO-xQD_To_30_40 Cat,Speech 207 | 0lpXFLNGM5E_130_140 Car,Speech 208 | 0bNb6oHg1_w_21_31 Speech,Car 
209 | 0T4gZQwzyKY_30_40 Cheering 210 | 22CHirhbaOg_30_40 Baby_cry_infant_cry 211 | EOeWgyrX-q8_30_40 Speech,Vacuum_cleaner 212 | 4tnW9atZKo0_30_40 Cello,Violin_fiddle 213 | 7Vfb0lpJkpU_50_60 Vacuum_cleaner,Speech 214 | FgCtqIgZlkU_30_40 Speech,Baby_cry_infant_cry 215 | B4Q6N08yhp8_80_90 Vacuum_cleaner,Speech 216 | 2VBTz0clX0U_60_70 Singing,Violin_fiddle,Cello 217 | CghIWk1nhts_70_80 Singing 218 | GtiLAXlEZvU_40_50 Singing,Speech,Acoustic_guitar 219 | GxBAyAeGJFk_40_50 Singing 220 | ArVDnCbQ6mU_30_40 Helicopter,Speech 221 | 15qmhCMftso_70_80 Accordion,Violin_fiddle 222 | Njs1QZwQ5jM_21_31 Baby_cry_infant_cry,Dog 223 | MTOS1YA9cjI_30_40 Speech,Baby_cry_infant_cry 224 | 33SlPXN3NPk_0_10 Chainsaw 225 | 5a_l3dBvgFM_30_40 Telephone_bell_ringing,Speech 226 | Mo-Z_28McaI_90_100 Baby_laughter 227 | GB93d-uKICQ_310_320 Speech,Dog 228 | KHeMI0Qmnqg_110_120 Vacuum_cleaner 229 | 8x96WxsrPxE_30_40 Cheering 230 | KWqZGTZQUwA_110_120 Lawn_mower 231 | 2Lg4zowpNxE_30_40 Vacuum_cleaner 232 | A_rpqF8NTh8_40_50 Singing 233 | 6mBfajqi1pg_560_570 Vacuum_cleaner 234 | B4FiW4xiK0Q_30_40 Acoustic_guitar,Speech 235 | FC8HlNb6iFE_0_10 Motorcycle,Speech 236 | 7Y0xgnuuFW4_21_31 Speech,Clapping 237 | HJfi8BlRNYk_30_40 Helicopter,Speech 238 | 4rYS7GAZNVg_18_28 Cheering,Clapping,Speech 239 | 8sW11oczLM4_10_20 Helicopter 240 | FwGq6BRGWns_40_50 Speech,Dog 241 | 43JvH6ly2rc_240_250 Singing,Acoustic_guitar 242 | GjoLR-sdj78_280_290 Acoustic_guitar,Speech 243 | DR8HpVUhdbA_11_21 Motorcycle 244 | Ae6pZjdGfRY_30_40 Chainsaw,Speech 245 | -14BFlDzjS4_6_16 Speech,Car 246 | Nh3YTYgtEJs_180_190 Acoustic_guitar,Speech 247 | O1n-pt3waz8_30_40 Vacuum_cleaner 248 | 0R-2iLl0dXg_100_110 Singing 249 | F9kNX3zLB04_6_16 Speech,Baby_cry_infant_cry 250 | LAVr291jgR8_210_220 Speech 251 | 5sXb7q-ek7I_60_70 Clapping 252 | C1fKe7KDd3g_30_40 Cello,Violin_fiddle 253 | FB5g9CP-M6w_30_40 Dog,Speech 254 | CDPXjPKsRMc_50_60 Speech,Dog 255 | C9dQ03e94ig_30_40 Motorcycle 256 | 1BgsmndYANA_27_37 Speech,Car 257 | GsNgBm53CGw_250_260 Speech,Acoustic_guitar 258 | 0eAGbJEQhJ8_0_10 Speech,Car 259 | 1Qcxwho5ZLQ_30_40 Accordion 260 | A_ehwKZ-sBc_0_10 Speech,Dog 261 | EMDr5DqtfwY_30_40 Helicopter,Speech 262 | 6-2SCYUDBZw_30_40 Vacuum_cleaner 263 | CVti4WjWXcQ_30_40 Speech,Vacuum_cleaner 264 | H-zboGVCulQ_250_260 Singing 265 | 6bmJLRXrH-g_11_21 Telephone_bell_ringing,Speech 266 | 0oUbMWJgokM_170_180 Speech,Frying_(food) 267 | J8sZgIgUv6A_30_40 Violin_fiddle,Speech 268 | EWA0puztuCE_50_60 Helicopter 269 | 2NRDi8g-6Qk_200_210 Fire_alarm 270 | 5XPkYYaXgLc_230_240 Basketball_bounce,Speech,Cheering 271 | OTattrwc_Qc_300_310 Basketball_bounce,Speech 272 | Hn5l7jGoTuc_100_110 Singing,Speech 273 | 54Ool76D12Q_70_80 Speech,Vacuum_cleaner 274 | EuWGmg8MBxw_160_170 Speech,Dog 275 | B47e2vDCzgI_50_60 Cello,Violin_fiddle 276 | 4-b0IdcVTqw_20_30 Cheering 277 | B6qzrmrFOms_100_110 Singing,Speech 278 | 2macpFUgV7g_70_80 Speech,Fire_alarm 279 | CmzO2WG_rK4_10_20 Helicopter,Speech 280 | 18eo1PeY1KM_0_10 Basketball_bounce,Speech 281 | MT0Bxc0nkLc_120_130 Speech,Dog 282 | 6m9RSmLeC6M_40_50 Fire_alarm 283 | IGPYTFL2vd4_20_30 Motorcycle,Speech 284 | 9qeBzzC27PE_300_310 Lawn_mower 285 | HaFqd2SZezE_20_30 Dog 286 | BaUfQOf2U6w_110_120 Acoustic_guitar,Speech 287 | -0bNnlJmBM0_30_40 Speech,Car,Chicken_rooster 288 | IUuUEvOBMIs_140_150 Violin_fiddle,Speech 289 | EaYQInm5Q14_240_250 Cello,Violin_fiddle 290 | 0ioAyzjtiZI_30_40 Car 291 | JIT9v30weXs_20_30 Speech,Dog 292 | 7mrVq35XPrk_330_340 Vacuum_cleaner,Speech 293 | M2UMpS30jRA_30_40 Speech,Baby_cry_infant_cry 294 | DmfGwmAWqK0_210_220 
Basketball_bounce,Speech 295 | 8yZnLF45mgs_100_110 Singing,Speech 296 | I35Xp8ugqj4_30_40 Violin_fiddle,Speech 297 | 7uXeXKxHQ6A_300_310 Lawn_mower 298 | M0viHJ8hTxE_70_80 Basketball_bounce,Speech 299 | LFW5265aG4w_30_40 Car,Speech 300 | OaXr8Ns_o9g_80_90 Vacuum_cleaner 301 | GXdiEb8fkVQ_30_40 Accordion,Speech 302 | Fg765IwFFms_50_60 Vacuum_cleaner 303 | CNhHSTH60FE_30_40 Violin_fiddle,Speech,Singing 304 | NTFPgBw8PlU_18_28 Baby_cry_infant_cry 305 | AfBK1v5Llss_8_18 Violin_fiddle,Speech 306 | BkuBZMpG9XU_120_130 Singing,Speech 307 | E4ZkcNYUHfg_210_220 Singing,Speech,Clapping 308 | 5IDCyA2GDgs_30_40 Blender 309 | E6vpA7p7ebI_190_200 Helicopter 310 | IGaYfRj5dJQ_460_470 Cheering,Speech,Clapping 311 | KoKy3bC91cg_1_11 Baby_cry_infant_cry 312 | GlBya1JAQCE_0_10 Singing,Speech 313 | OUoplkiyF54_30_40 Acoustic_guitar,Violin_fiddle 314 | DEJPptjl7KA_23_33 Baby_cry_infant_cry 315 | FE0C4xy3YRo_30_40 Violin_fiddle,Speech,Banjo,Cello 316 | LrWwi0-wcrg_30_40 Chainsaw 317 | N8o5qLz5974_10_20 Speech,Dog 318 | Bd2nhNf9Tp8_240_250 Vacuum_cleaner 319 | JZ82xaUBcIY_470_480 Speech,Fire_alarm 320 | NAkYzdSLN44_30_40 Basketball_bounce,Speech,Clapping,Cheering 321 | 5Ut2Ji8FdMw_30_40 Cheering,Speech 322 | IAq7bVGH_zg_90_100 Singing,Speech 323 | HLSZ-Me0hgg_510_520 Basketball_bounce,Speech 324 | J_uk2QJ9T_M_190_200 Motorcycle,Speech,Car 325 | 48iqWuaAkSU_140_150 Cello,Violin_fiddle 326 | 6TRJu1kHIJY_10_20 Helicopter 327 | IwpvkdsO12s_230_240 Frying_(food),Speech 328 | E3bKVGHsA7I_30_40 Baby_cry_infant_cry 329 | KW2B-Rv8bjs_30_40 Cat,Speech 330 | Hln9D0soCes_420_430 Banjo,Speech 331 | Ki5FZjVfs7s_190_200 Speech,Dog 332 | JCVDz4BeWTg_30_40 Violin_fiddle,Speech 333 | 8OHjU9PCt7Y_0_10 Basketball_bounce,Speech 334 | L_NiIDd9qQQ_110_120 Speech,Dog 335 | IN_sIxaaRsY_380_390 Cat,Speech 336 | 3kUHtnaD2-Y_0_10 Basketball_bounce 337 | Mte4VJKUkWQ_240_250 Singing,Cello 338 | LbVxnuv3Ebw_130_140 Motorcycle,Speech 339 | 3-Zuj5gO60g_5_15 Dog 340 | 3zWX4iDWwTE_30_40 Basketball_bounce,Speech 341 | H1u4PtqVFMg_370_380 Speech,Baby_laughter 342 | BUq7YBMxHc4_30_40 Speech,Dog 343 | HzzJAWWJ-pQ_30_40 Violin_fiddle 344 | DAF5KEULuGo_230_240 Motorcycle,Speech 345 | DZb5LThmhQw_30_40 Violin_fiddle,Speech 346 | HZGkfYuEh7w_100_110 Helicopter,Speech,Clapping 347 | Im1UwtffSmY_30_40 Speech,Dog 348 | 2QB4g9ZeJmw_60_70 Cheering,Clapping 349 | GKZvKWwrDcE_30_40 Speech,Clapping 350 | 8H9V40ek4q8_30_40 Violin_fiddle 351 | CwUtBoBqSYY_30_40 Helicopter,Speech 352 | H_dVhOw7p64_80_90 Accordion,Speech 353 | C9RkVD1AgtU_60_70 Frying_(food) 354 | Kqjqdx-jfMw_180_190 Blender,Speech 355 | BLi6DZmr_fU_30_40 Speech,Dog 356 | Nox8PbNaydY_60_70 Singing,Speech 357 | AyZOG94lAXc_10_20 Speech,Dog 358 | AUuZYQpKvPE_80_90 Baby_laughter 359 | FKjHrCvqJL0_170_180 Singing 360 | EuQ6qqoAdUE_290_300 Blender 361 | 9D5aLKo92Sw_160_170 Chicken_rooster 362 | 06xprMsre2M_80_90 Blender,Speech 363 | 2vqn1vRl-No_30_40 Chainsaw 364 | GTrzjZECYKI_50_60 Accordion 365 | 6Ix6axuXbnA_470_480 Vacuum_cleaner 366 | 9nwPHX0NlUQ_30_40 Speech,Baby_cry_infant_cry 367 | HEi4GzCQCfc_350_360 Speech,Clapping 368 | IPAPtsik2uU_0_10 Cheering,Speech 369 | Dk4YYDzk6v4_140_150 Singing,Speech 370 | It4a2G-gbPQ_40_50 Singing,Acoustic_guitar 371 | AV-l8lEdUvI_30_40 Chicken_rooster,Speech 372 | GIo3jkn2bt0_30_40 Violin_fiddle,Speech 373 | 6SgzzYG-B1Q_30_40 Chicken_rooster 374 | 41TTeZMRpzs_30_40 Speech,Dog,Vacuum_cleaner 375 | OOw5CoIgV00_30_40 Accordion,Violin_fiddle 376 | ElcySYXYtew_10_20 Singing,Cello,Acoustic_guitar,Accordion 377 | NIlaWJiRnOQ_19_29 Frying_(food),Speech 378 | 
EMsdkma5MU8_30_40 Speech,Vacuum_cleaner 379 | LGeLv_bEBSA_420_430 Cello,Violin_fiddle 380 | 3dKj3OZw7OY_60_70 Basketball_bounce,Speech,Cheering 381 | LNGPfZsCqKY_18_28 Helicopter 382 | 8aDEqlAiLoo_140_150 Singing 383 | 435H0Ai36E4_40_50 Vacuum_cleaner,Dog 384 | BtjZjdUqzvY_4_14 Chainsaw 385 | 0_MLIPfb84s_30_40 Acoustic_guitar,Violin_fiddle 386 | K4SxwnI3EE4_70_80 Speech,Dog 387 | LH_cTOUJPhU_330_340 Blender,Speech 388 | 9w88_e0QLIw_30_40 Violin_fiddle,Speech 389 | BbB_Gks17K0_50_60 Speech,Fire_alarm 390 | CxtwZM-7k1M_80_90 Motorcycle,Speech 391 | AO15xf8M-N0_20_30 Speech,Dog 392 | BXVhuHTAels_100_110 Acoustic_guitar,Speech 393 | 5FmalezMAls_13_23 Cheering,Speech 394 | Kqa-iC4cuMs_370_380 Speech,Vacuum_cleaner 395 | 7rYjAdnb3CA_20_30 Car,Speech 396 | 1TzFK50oG_g_0_10 Car 397 | KL4uiNB7lMA_150_160 Vacuum_cleaner 398 | NDcJ3G22W0w_60_70 Basketball_bounce,Speech 399 | FTA0TnNtGlM_10_20 Fire_alarm 400 | DWV1qTrO7Bs_180_190 Singing,Speech 401 | BYEYNxKvij4_70_80 Singing,Cello,Acoustic_guitar 402 | E509105txOk_130_140 Helicopter,Speech 403 | 7rUcPOjvtXc_0_10 Singing,Acoustic_guitar 404 | EDJ-vOyHFNE_20_30 Speech,Clapping 405 | JKdmCnXxlKA_30_40 Chainsaw 406 | EbB_904ZztM_150_160 Acoustic_guitar,Speech 407 | 1FkR8wkSGU8_30_40 Speech,Car 408 | KFNee9YkgVI_17_27 Baby_cry_infant_cry,Speech 409 | 1DqXuuPiPho_240_250 Singing,Violin_fiddle 410 | Cg6Rryw1zls_140_150 Cello 411 | LlWf9r1ziZk_30_40 Violin_fiddle,Speech 412 | 3v7q1xTR2LQ_30_40 Baby_cry_infant_cry 413 | Gb2iX2TZ5v8_290_300 Speech,Frying_(food) 414 | BAYF00tB5mA_0_10 Speech,Vacuum_cleaner,Baby_laughter 415 | AIN9E2dwVtg_30_40 Banjo,Speech 416 | J1SkRrovroE_170_180 Basketball_bounce,Speech,Clapping 417 | MoZc4IsdhiU_20_30 Helicopter 418 | KXumlH_SF5Y_100_110 Helicopter,Speech 419 | CFODHCyurtc_140_150 Singing,Accordion 420 | EdSlrD3HGk8_110_120 Speech,Dog 421 | 5er9MoTAogc_10_20 Baby_cry_infant_cry 422 | 95Sd0Vy9vRg_40_50 Singing 423 | LDXGQBKnIkU_14_24 Lawn_mower,Speech 424 | DJDxBH4Bb2I_100_110 Basketball_bounce,Speech 425 | H0EAcUmwbMs_30_40 Accordion 426 | 3IVP5Gna8sI_30_40 Speech,Car 427 | ID7NdijmMvE_90_100 Banjo,Speech,Acoustic_guitar,Cello 428 | 7LjcPsua1Vw_170_180 Violin_fiddle 429 | J3YTzD0N_kw_20_30 Singing,Speech 430 | EikptOF21-s_30_40 Cat,Speech 431 | K-_lKMeitkU_200_210 Blender,Speech 432 | 5Q2UcVku9-E_150_160 Speech,Vacuum_cleaner 433 | 14ekd4nkpwc_28_38 Baby_cry_infant_cry 434 | NZVSQRzo48o_30_40 Violin_fiddle,Speech 435 | CSuoHEFbWVs_100_110 Violin_fiddle 436 | 2jzZGsHGIGg_30_40 Speech,Car,Motorcycle 437 | A2VKrhHv3R8_200_210 Speech,Baby_laughter 438 | Msme_HbKIik_30_40 Accordion 439 | DhFVH7Dvz8A_30_40 Helicopter 440 | Gtyp2DtLehs_10_20 Cat,Speech 441 | GgiYd4ZgsQA_50_60 Dog 442 | Dr4GrylmP9M_390_400 Telephone_bell_ringing 443 | L-42QHWGyJU_370_380 Basketball_bounce,Cheering,Speech 444 | 5JFecNRvbog_150_160 Lawn_mower,Speech 445 | 8jSXancacwQ_30_40 Lawn_mower 446 | EE6x3fRHk7M_20_30 Speech,Dog 447 | 0sqE1PT9w4E_0_10 Singing 448 | CKexU9L_lpE_20_30 Helicopter,Speech 449 | GmxS8d0B1QE_20_30 Speech,Dog 450 | 3-BAZ9IRM1s_530_540 Vacuum_cleaner 451 | KbeILnzvciI_30_40 Violin_fiddle,Speech 452 | 2UuUxUmhugo_0_10 Chainsaw 453 | 29-VHfXrz7E_30_40 Cat 454 | KaNCiByIxx0_130_140 Chicken_rooster,Speech 455 | 0BtXezW-VGI_10_20 Cheering 456 | 1FlvObIkYvo_10_20 Speech,Car 457 | 31iv4K9G_4E_30_40 Speech,Car 458 | HlFvWY0asH4_40_50 Basketball_bounce,Speech 459 | 1cpo-8IgL2c_120_130 Cheering,Clapping 460 | Eoc5RJc83Iw_60_70 Motorcycle,Speech,Cheering,Clapping 461 | NorHGeReG2s_10_20 Singing,Speech 462 | BzwjKmH0J-o_130_140 Singing 463 | 
0yleWcyPyx0_210_220 Lawn_mower 464 | 00v4G1YbJAE_30_40 Lawn_mower,Speech 465 | LfJH5_vbZuk_170_180 Cat,Speech 466 | DwUr_EwHswo_40_50 Singing,Speech 467 | B3yC2M_tgHQ_30_40 Lawn_mower,Speech 468 | 6FviyjRjgnc_60_70 Lawn_mower,Speech 469 | K4CrCEHGnwU_10_20 Speech,Dog 470 | BO7DUflFfIc_30_40 Cat,Speech 471 | EybwVFGORSo_370_380 Speech,Fire_alarm 472 | -7yHd8yVL7o_10_20 Car 473 | I3nMKP3u2Wc_30_40 Helicopter 474 | ICNjHa1pF0g_80_90 Basketball_bounce,Speech,Cheering 475 | 6EeyngaELKA_0_10 Blender 476 | KV4CpfKk7FY_30_40 Acoustic_guitar,Speech,Singing 477 | KL7xOvr3Bxw_0_10 Violin_fiddle 478 | CJ81qB9p4UM_30_40 Cheering,Speech 479 | 585sVgKWYHg_130_140 Blender 480 | J7H-6clOB-M_0_10 Singing,Acoustic_guitar 481 | NyxZOJpXWF4_30_40 Violin_fiddle 482 | AAcHfp8Ls9c_30_40 Acoustic_guitar,Speech 483 | 37whM1O5Yy4_130_140 Basketball_bounce,Speech,Clapping 484 | KMhuOpxiU5Y_0_10 Speech,Dog 485 | 4R29rXTkzOU_90_100 Cheering 486 | BP4Z8KmDLHw_25_35 Chicken_rooster,Speech 487 | Gus-voUdc1U_30_40 Accordion 488 | GREbxPMiiFU_30_40 Violin_fiddle,Speech 489 | MqMKjrKRXmI_30_40 Baby_cry_infant_cry 490 | 8nb6g-GBj7I_60_70 Speech,Clapping 491 | -2YWkWhIVtQ_30_40 Car 492 | DD0iSzxHJ_Q_0_10 Helicopter 493 | FbcZx5syfXE_0_10 Helicopter 494 | AuM1rU__KQA_360_370 Blender 495 | P5wB7xTZRR8_130_140 Singing,Speech 496 | 8DF2PxdT5tE_390_400 Basketball_bounce,Cheering,Speech 497 | NokSF2SXqA8_30_40 Violin_fiddle,Speech 498 | 20ysZc0Bp5o_10_20 Chicken_rooster 499 | 8hcEJYEwAME_0_10 Chicken_rooster 500 | OJjDafCVsP4_70_80 Speech,Dog 501 | FRzVTWTQyI8_50_60 Speech,Dog 502 | 2jQwv1d0DyU_30_40 Vacuum_cleaner 503 | FbJMtOawkAo_20_30 Cat,Speech 504 | 7OhK3f4X1VQ_7_17 Car 505 | 08GFUY-gH68_30_40 Cheering,Speech,Clapping 506 | 19jWnmYnlD8_30_40 Violin_fiddle 507 | KZbA2DuNLYM_23_33 Baby_cry_infant_cry 508 | 7LVqHbJ6N1E_90_100 Helicopter,Car 509 | FcRPORGNLdA_200_210 Speech,Vacuum_cleaner 510 | FGnW2s4nrYU_60_70 Clapping,Speech 511 | 2ax04zihcPo_140_150 Accordion 512 | 12-uDhDhWOk_30_40 Car 513 | Bcmnnc3dNGw_30_40 Motorcycle,Speech 514 | Hd4Lrewto7Q_20_30 Accordion 515 | Jhgm6EjninI_20_30 Cheering,Speech 516 | IvxMdjfSvQQ_12_22 Speech,Baby_cry_infant_cry 517 | FImr2xBfqTs_30_40 Cello,Acoustic_guitar 518 | GMbYPPN6SNk_90_100 Cello,Violin_fiddle 519 | 051ZmzJgOiQ_200_210 Speech,Car 520 | AX1jRw5DVO0_190_200 Chicken_rooster,Speech 521 | KZhSh9lwAoI_20_30 Speech,Baby_laughter 522 | KCOHsYmUa20_190_200 Accordion 523 | 2jb3oGyRnWg_30_40 Speech,Car 524 | MZVInVyY90o_20_30 Lawn_mower 525 | 0mKRXJhlMb0_120_130 Chicken_rooster 526 | I6Drk3QmQn8_60_70 Speech,Dog 527 | CN8vnC7zyhw_220_230 Speech,Dog 528 | GBJGRSP2JFg_20_30 Violin_fiddle,Speech 529 | 6KfCKGRBxRc_30_40 Cheering,Singing 530 | -7tDh-UQR7Q_50_60 Cello,Violin_fiddle 531 | M7mcD0npbjI_50_60 Basketball_bounce,Speech 532 | 99WdRkgJC1Y_140_150 Chicken_rooster 533 | IzJtqrr3_4o_30_40 Chainsaw 534 | GG5XSsexHGk_370_380 Speech,Fire_alarm 535 | BbiDC4kHcpY_0_10 Violin_fiddle,Speech,Cello 536 | DG2eXe5Xrp0_30_40 Violin_fiddle,Speech,Clapping,Acoustic_guitar 537 | M_Bk6sPg7O4_10_20 Speech,Dog 538 | 4bSKwakkWIE_0_10 Chicken_rooster 539 | 14xiJ2TAfI0_430_440 Speech,Clapping 540 | Np6XrFtNBY4_160_170 Speech,Dog 541 | NM6U_DyP_0o_30_40 Frying_(food),Speech 542 | CVV6bhSYphc_550_560 Frying_(food) 543 | Ow2ooQlgGV4_420_430 Chainsaw,Speech 544 | 84vFdTtO2Vk_10_20 Speech,Baby_laughter,Dog 545 | MvlIx9Dlf7U_180_190 Blender,Speech 546 | 6pw5sDsjCio_100_110 Vacuum_cleaner 547 | 6BwASiY-tzU_21_31 Baby_cry_infant_cry 548 | 7MXZvu6cjuU_90_100 Basketball_bounce,Speech,Cheering 549 | E5mV1b1ol6Y_0_10 
Singing,Speech 550 | 0AuRaeswC-8_20_30 Chicken_rooster,Speech 551 | 9jS1INXlQ1w_160_170 Singing,Clapping,Acoustic_guitar 552 | 1uy12CfHhDk_170_180 Singing,Acoustic_guitar 553 | OHMoNFP6n9g_20_30 Chicken_rooster,Speech 554 | BQmyfqCv4bc_40_50 Acoustic_guitar,Speech 555 | 6w0A1-5O9bI_70_80 Lawn_mower 556 | 0hE-lduyMng_150_160 Frying_(food) 557 | HUrt-wFxTEY_50_60 Baby_cry_infant_cry 558 | 1TsbyCeoZ_E_30_40 Speech,Car 559 | 9xszRnKqoTU_270_280 Singing,Speech 560 | CizC7WKKPjQ_30_40 Violin_fiddle,Speech,Cheering 561 | 3c70Lv8GH48_90_100 Singing,Acoustic_guitar 562 | Bvp91I4IJok_30_40 Acoustic_guitar,Speech 563 | 9NolcbOJZQo_30_40 Basketball_bounce,Speech,Cheering,Clapping 564 | 6Kj5byQcQ10_40_50 Cheering,Speech 565 | 4ejLeuJcmoQ_25_35 Helicopter 566 | 4nsff2073p4_80_90 Cheering,Singing,Clapping 567 | DsBUJquZ6po_0_10 Singing 568 | EI9OD8CKoYY_20_30 Chicken_rooster,Speech 569 | GJHdLtKI37w_10_20 Chicken_rooster 570 | ATecKAA7Ydc_130_140 Chicken_rooster 571 | Mc79pRPTjXw_10_20 Speech,Dog 572 | 74rZ_mljj18_90_100 Basketball_bounce,Speech 573 | 3oZLoVYIHFQ_50_60 Vacuum_cleaner 574 | KGxvfuRj444_160_170 Cat,Speech 575 | 6Iifhd-a9H8_120_130 Dog,Speech 576 | FoTOD4HkfAM_30_40 Speech,Baby_cry_infant_cry 577 | EUtpm8jPQes_0_10 Blender 578 | NoHGNjEB3Ss_60_70 Cello,Violin_fiddle 579 | GOtmAPjCWRA_30_40 Speech,Fire_alarm 580 | 3ERn2kwnfA4_2_12 Speech,Car 581 | 0kWsQAObBjo_30_40 Cheering,Speech,Clapping 582 | 2OJw3xGALI4_60_70 Vacuum_cleaner 583 | CoU--V817p4_40_50 Speech,Baby_laughter 584 | NjIY8G7o-cI_120_130 Helicopter,Speech 585 | 0A-Y2CvbVMs_30_40 Accordion 586 | 3zZ5Tsskz5I_200_210 Singing 587 | AEmpI-ntHpc_320_330 Singing,Speech 588 | OoPVYyMhuHQ_80_90 Singing,Speech 589 | DSwEi6cwPjM_170_180 Cello 590 | L2zidDxyBjw_220_230 Accordion 591 | 2i82jVZy1pk_100_110 Baby_laughter 592 | KxJCFboTYeU_60_70 Chicken_rooster 593 | 8a4Zec3h3LM_30_40 Vacuum_cleaner 594 | B4emRH8vbWQ_10_20 Helicopter,Speech 595 | BTrSKQ9EI4o_0_10 Acoustic_guitar 596 | 1BPMaFuSk5Y_12_22 Speech,Car 597 | 39KgkK23Zmk_0_10 Banjo,Speech 598 | 72xG8v65vOU_30_40 Cello,Violin_fiddle 599 | 3zt3Z3qEJvw_110_120 Basketball_bounce,Speech,Clapping 600 | 7SChBv97cmg_10_20 Chainsaw 601 | 9WzF78LNQYQ_180_190 Telephone_bell_ringing,Speech 602 | 0F2XPtG9pJM_30_40 Violin_fiddle 603 | 7Iwu2gczI1g_540_550 Helicopter 604 | HHO8LjlRIf4_30_40 Cat,Speech,Dog 605 | 5XyD3q9WOJQ_30_40 Accordion 606 | JgZCSNt7s2I_17_27 Violin_fiddle,Speech 607 | NkXfDQE9P70_50_60 Speech,Dog 608 | GzXz1u955mo_170_180 Acoustic_guitar,Speech 609 | Dx4O21IPz1c_80_90 Singing 610 | KHv3w-UiXeQ_20_30 Speech,Dog 611 | 8eR5kuvbjKA_30_40 Singing 612 | 2dih3AAlObo_10_20 Basketball_bounce,Speech 613 | 1XaDmkM-AAw_150_160 Acoustic_guitar,Violin_fiddle 614 | 7zAZVwxH9PM_550_560 Basketball_bounce,Speech 615 | NmpQX6IohMY_140_150 Acoustic_guitar,Banjo,Singing 616 | 3r9NflVxUQw_320_330 Blender 617 | CPkbvoDSkqs_40_50 Violin_fiddle,Speech,Acoustic_guitar 618 | J3MimBwsn_g_5_15 Violin_fiddle,Speech 619 | 5TpfTyEWr3k_70_80 Accordion 620 | BxU73eU3dEg_0_10 Motorcycle,Speech 621 | Js8vb6_jjR4_30_40 Chicken_rooster 622 | G3Szo_Im5rs_30_40 Accordion 623 | MTkN3EBB1uY_70_80 Banjo,Violin_fiddle,Singing,Acoustic_guitar 624 | GGr5RlDlG7U_90_100 Chainsaw,Speech 625 | JfjB3Tz3U_U_30_40 Motorcycle 626 | 0dhEeSX-59I_25_35 Car 627 | Cj8EEprq2pU_30_40 Speech,Baby_cry_infant_cry 628 | ABBJDDsYT68_30_40 Frying_(food),Speech 629 | F3NPz3dUJsI_30_40 Speech,Dog 630 | ARZFi1PaUko_30_40 Lawn_mower,Speech 631 | LGUdvF0PqC0_10_20 Cat,Speech 632 | IsKSYL5B-Us_10_20 Basketball_bounce,Speech 633 | HsFjrit3z4M_310_320 
Singing,Clapping,Cheering 634 | Nszf2gBWruU_30_40 Chainsaw 635 | 2vxm0s-RdOY_260_270 Basketball_bounce,Cheering,Speech,Clapping 636 | MpWZU83HzL8_30_40 Helicopter,Speech,Car 637 | AD-m1uu7UhI_0_10 Singing,Clapping 638 | DvgyFP7jxh4_70_80 Blender,Speech 639 | 0gfxwS7Wla8_30_40 Speech,Car 640 | H_rWLZ4DNQU_30_40 Singing,Speech,Clapping 641 | 1MMFztEi-Hk_240_250 Blender 642 | DUfww-VAuFI_60_70 Speech,Fire_alarm 643 | Ju4o24LHKl0_30_40 Cat,Speech 644 | 3WETCdylMfY_300_310 Singing 645 | OFtExNYmSfM_30_40 Speech,Dog 646 | H49jElTxnzQ_50_60 Basketball_bounce 647 | CEtiON3_5yA_530_540 Violin_fiddle,Speech 648 | Kw7r1WV8y30_30_40 Violin_fiddle,Speech 649 | GWg7TQYUHFo_30_40 Speech,Baby_cry_infant_cry 650 | CFzJQ6eje0Y_30_40 Motorcycle,Speech,Dog 651 | -------------------------------------------------------------------------------- /step3_retrain/dataloader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import os 4 | from torch.utils.data import Dataset, DataLoader 5 | from torchvision import transforms, utils 6 | import pandas as pd 7 | import random 8 | import pickle as pkl 9 | 10 | categories = ['Speech', 'Car', 'Cheering', 'Dog', 'Cat', 'Frying_(food)', 11 | 'Basketball_bounce', 'Fire_alarm', 'Chainsaw', 'Cello', 'Banjo', 12 | 'Singing', 'Chicken_rooster', 'Violin_fiddle', 'Vacuum_cleaner', 13 | 'Baby_laughter', 'Accordion', 'Lawn_mower', 'Motorcycle', 'Helicopter', 14 | 'Acoustic_guitar', 'Telephone_bell_ringing', 'Baby_cry_infant_cry', 'Blender', 15 | 'Clapping'] 16 | 17 | def ids_to_multinomial(ids): 18 | """ label encoding 19 | 20 | Returns: 21 | 1d array, multimonial representation, e.g. [1,0,1,0,0,...] 22 | """ 23 | id_to_idx = {id: index for index, id in enumerate(categories)} 24 | 25 | y = np.zeros(len(categories)) 26 | for id in ids: 27 | index = id_to_idx[id] 28 | y[index] = 1 29 | return y 30 | 31 | 32 | 33 | class LLP_dataset(Dataset): 34 | 35 | def __init__(self, label, audio_dir, video_dir, st_dir, train=None, transform=None): 36 | self.df = pd.read_csv(label, header=0, sep='\t') 37 | self.filenames = self.df["filename"] 38 | self.audio_dir = audio_dir 39 | self.video_dir = video_dir 40 | self.st_dir = st_dir 41 | self.transform = transform 42 | 43 | self.need_to_change_v, self.need_to_change_a = pkl.load(open("need_to_change.pkl", 'rb')) 44 | 45 | self.train = train 46 | 47 | labels_to_idx = {} 48 | for i in range(len(categories)): 49 | labels_to_idx[i] = [] 50 | 51 | for idx in range(len(self.filenames)): 52 | row = self.df.loc[idx, :] 53 | ids = row[-1].split(',') 54 | label = ids_to_multinomial(ids) 55 | 56 | if len(ids)==1: 57 | for c in range(len(categories)): 58 | if label[c] == 1: 59 | labels_to_idx[c].append(idx) 60 | 61 | self.labels_to_idx = labels_to_idx 62 | 63 | 64 | def __len__(self): 65 | return len(self.filenames) 66 | 67 | def __getitem__(self, idx): 68 | row = self.df.loc[idx, :] 69 | name = row[0][:11] 70 | audio = np.load(os.path.join(self.audio_dir, name + '.npy')) 71 | video_s = np.load(os.path.join(self.video_dir, name + '.npy')) 72 | video_st = np.load(os.path.join(self.st_dir, name + '.npy')) 73 | ids = row[-1].split(',') 74 | label = ids_to_multinomial(ids) 75 | 76 | # We move the label smooth from the main.py to dataloder.py 77 | # Origin position: https://github.com/YapengTian/AVVP-ECCV20/blob/master/main_avvp.py#L22 78 | v = 0.9 79 | pa = label 80 | pv = v * label + (1 - v) * 0.5 81 | 82 | # We change modality-aware label here 83 | for c in range(25): 84 | if label[c] != 
0: 85 | if idx in self.need_to_change_v[c]: 86 | pv[c] = 0 87 | if idx in self.need_to_change_a[c]: 88 | pa[c] = 0 89 | 90 | if self.train: 91 | # find the label idx 92 | if len(ids) == 1: 93 | for c in range(25): 94 | if label[c] == 1: 95 | break 96 | idx2 = random.choice(self.labels_to_idx[c]) 97 | else: 98 | idx2 = random.randint(0, len(self.filenames)-1) 99 | 100 | row = self.df.loc[idx2, :] 101 | name = row[0][:11] 102 | audio2 = np.load(os.path.join(self.audio_dir, name + '.npy')) 103 | else: 104 | audio2 = np.array(1) 105 | 106 | sample = {'audio': audio, 'video_s': video_s, 'video_st': video_st, 'label': label, 'audio2':audio2, 'idx':np.array(idx), 107 | 'pa': pa, 'pv': pv} 108 | 109 | if self.transform: 110 | sample = self.transform(sample) 111 | 112 | return sample 113 | 114 | class ToTensor(object): 115 | 116 | def __call__(self, sample): 117 | if len(sample) == 2: 118 | audio = sample['audio'] 119 | label = sample['label'] 120 | return {'audio': torch.from_numpy(audio), 'label': torch.from_numpy(label)} 121 | else: 122 | audio = sample['audio'] 123 | video_s = sample['video_s'] 124 | video_st = sample['video_st'] 125 | label = sample['label'] 126 | pa = sample['pa'] 127 | pv = sample['pv'] 128 | return {'audio': torch.from_numpy(audio), 'video_s': torch.from_numpy(video_s), 129 | 'video_st': torch.from_numpy(video_st), 130 | 'pa':torch.from_numpy(pa), 'pv':torch.from_numpy(pv), 131 | 'label': torch.from_numpy(label), 'audio2':torch.from_numpy(sample['audio2']), 'idx':torch.from_numpy(sample['idx'])} 132 | -------------------------------------------------------------------------------- /step3_retrain/main_avvp.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from dataloader import * 7 | from nets.net_audiovisual import MMIL_Net 8 | from utils.eval_metrics import segment_level, event_level 9 | import pandas as pd 10 | import pickle as pkl 11 | 12 | class LabelSmoothingLoss(nn.Module): 13 | def __init__(self, classes, smoothing=0.0, dim=-1): 14 | super(LabelSmoothingLoss, self).__init__() 15 | self.confidence = 1.0 - smoothing 16 | self.smoothing = smoothing 17 | self.cls = classes 18 | self.dim = dim 19 | 20 | def forward(self, pred, target): 21 | pred = pred.log_softmax(dim=self.dim) 22 | with torch.no_grad(): 23 | # true_dist = pred.data.clone() 24 | true_dist = torch.zeros_like(pred) 25 | true_dist.fill_(self.smoothing / (self.cls - 1)) 26 | true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence) 27 | return torch.mean(torch.sum(-true_dist * pred, dim=self.dim)) 28 | 29 | 30 | def train(args, model, train_loader, optimizer, criterion, epoch): 31 | model.train() 32 | criterion2 = LabelSmoothingLoss(10, smoothing=0.1) 33 | accs = [] 34 | num = 0 35 | 36 | for batch_idx, sample in enumerate(train_loader): 37 | audio, video, video_st, target = sample['audio'].to('cuda'), sample['video_s'].to('cuda'), sample['video_st'].to('cuda'), sample['label'].type(torch.FloatTensor).to('cuda') 38 | audio2 = sample['audio2'].to('cuda') 39 | data_idx = sample['idx'] 40 | optimizer.zero_grad() 41 | output, a_prob, v_prob, _, sims, mask = model(audio, video, video_st, audio2) 42 | output.clamp_(min=1e-7, max=1 - 1e-7) 43 | a_prob.clamp_(min=1e-7, max=1 - 1e-7) 44 | v_prob.clamp_(min=1e-7, max=1 - 1e-7) 45 | 46 | Pa = sample['pa'].type(torch.FloatTensor).to('cuda') 47 | Pv = 
sample['pv'].type(torch.FloatTensor).to('cuda') 48 | 49 |         b=audio.size(0) 50 |         loss1 = criterion(a_prob, Pa) 51 |         loss2 = criterion(v_prob, Pv) 52 |         loss3 = criterion(output, target) 53 | 54 |         loss4 = criterion2(sims, mask) 55 |         loss = loss1 + loss2 + loss3 + loss4 56 | 57 | 58 |         loss.backward() 59 |         optimizer.step() 60 |         if batch_idx % args.log_interval == 0: 61 |             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss1: {:.3f}\tLoss2: {:.3f}\tLoss3: {:.3f}\tLoss4: {:.3f}'.format( 62 |                 epoch, batch_idx * len(audio), len(train_loader.dataset), 63 |                 100. * batch_idx / len(train_loader), loss1.item(), loss2.item(), loss3.item(), loss4.item())) 64 | 65 | 66 | def eval(model, val_loader, set): 67 |     categories = ['Speech', 'Car', 'Cheering', 'Dog', 'Cat', 'Frying_(food)', 68 |                   'Basketball_bounce', 'Fire_alarm', 'Chainsaw', 'Cello', 'Banjo', 69 |                   'Singing', 'Chicken_rooster', 'Violin_fiddle', 'Vacuum_cleaner', 70 |                   'Baby_laughter', 'Accordion', 'Lawn_mower', 'Motorcycle', 'Helicopter', 71 |                   'Acoustic_guitar', 'Telephone_bell_ringing', 'Baby_cry_infant_cry', 'Blender', 72 |                   'Clapping'] 73 |     model.eval() 74 | 75 |     # load annotations 76 |     df = pd.read_csv(set, header=0, sep='\t') 77 |     df_a = pd.read_csv("data/AVVP_eval_audio.csv", header=0, sep='\t') 78 |     df_v = pd.read_csv("data/AVVP_eval_visual.csv", header=0, sep='\t') 79 | 80 |     id_to_idx = {id: index for index, id in enumerate(categories)} 81 |     F_seg_a = [] 82 |     F_seg_v = [] 83 |     F_seg = [] 84 |     F_seg_av = [] 85 |     F_event_a = [] 86 |     F_event_v = [] 87 |     F_event = [] 88 |     F_event_av = [] 89 | 90 |     with torch.no_grad(): 91 |         for batch_idx, sample in enumerate(val_loader): 92 |             audio, video, video_st, target = sample['audio'].to('cuda'), sample['video_s'].to('cuda'), sample['video_st'].to('cuda'), sample['label'].to('cuda') 93 |             output, a_prob, v_prob, frame_prob, a, is_real = model(audio, video, video_st, audio) 94 |             o = (output.cpu().detach().numpy() >= 0.5).astype(np.int_) 95 |             oa = (a_prob.cpu().detach().numpy() >= 0.5).astype(np.int_) 96 |             ov = (v_prob.cpu().detach().numpy() >= 0.5).astype(np.int_) 97 | 98 |             Pa = frame_prob[0, :, 0, :].cpu().detach().numpy() 99 |             Pv = frame_prob[0, :, 1, :].cpu().detach().numpy() 100 | 101 |             Pa = (Pa >= 0.5).astype(np.int_) * np.repeat(oa, repeats=10, axis=0) 102 |             Pv = (Pv >= 0.5).astype(np.int_) * np.repeat(ov, repeats=10, axis=0) 103 | 104 |             # extract audio GT labels 105 |             GT_a = np.zeros((25, 10)) 106 |             GT_v = np.zeros((25, 10)) 107 |             GT_aa = np.zeros(25, dtype=np.int_) 108 |             GT_vv = np.zeros(25, dtype=np.int_) 109 | 110 |             df_vid_a = df_a.loc[df_a['filename'] == df.loc[batch_idx, :][0]] 111 |             filenames = df_vid_a["filename"] 112 |             events = df_vid_a["event_labels"] 113 |             onsets = df_vid_a["onset"] 114 |             offsets = df_vid_a["offset"] 115 |             num = len(filenames) 116 |             if num > 0: 117 |                 for i in range(num): 118 | 119 |                     x1 = int(onsets[df_vid_a.index[i]]) 120 |                     x2 = int(offsets[df_vid_a.index[i]]) 121 |                     event = events[df_vid_a.index[i]] 122 |                     idx = id_to_idx[event] 123 |                     GT_a[idx, x1:x2] = 1 124 |                     GT_aa[idx] = 1 125 | 126 |             # extract visual GT labels 127 |             df_vid_v = df_v.loc[df_v['filename'] == df.loc[batch_idx, :][0]] 128 |             filenames = df_vid_v["filename"] 129 |             events = df_vid_v["event_labels"] 130 |             onsets = df_vid_v["onset"] 131 |             offsets = df_vid_v["offset"] 132 |             num = len(filenames) 133 |             if num > 0: 134 |                 for i in range(num): 135 |                     x1 = int(onsets[df_vid_v.index[i]]) 136 |                     x2 = int(offsets[df_vid_v.index[i]]) 137 |                     event = events[df_vid_v.index[i]] 138 |                     idx = id_to_idx[event] 139 |                     GT_v[idx, x1:x2] = 1 140 |
GT_vv[idx]=1 141 |             GT_av = GT_a * GT_v 142 | 143 |             # obtain prediction matrices 144 |             SO_a = np.transpose(Pa) 145 |             SO_v = np.transpose(Pv) 146 |             SO_av = SO_a * SO_v 147 | 148 |             # segment-level F1 scores 149 |             f_a, f_v, f, f_av = segment_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av) 150 |             F_seg_a.append(f_a) 151 |             F_seg_v.append(f_v) 152 |             F_seg.append(f) 153 |             F_seg_av.append(f_av) 154 | 155 |             # event-level F1 scores 156 |             f_a, f_v, f, f_av = event_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av) 157 |             F_event_a.append(f_a) 158 |             F_event_v.append(f_v) 159 |             F_event.append(f) 160 |             F_event_av.append(f_av) 161 | 162 |     print("\n") 163 |     print('Audio \t {:.1f} \t {:.1f}'.format( 100 * np.mean(np.array(F_seg_a)), 100 * np.mean(np.array(F_event_a)))) 164 |     print('Visual \t {:.1f} \t {:.1f}'.format( 100 * np.mean(np.array(F_seg_v)), 100 * np.mean(np.array(F_event_v)))) 165 |     print('AudVis \t {:.1f} \t {:.1f}'.format( 100 * np.mean(np.array(F_seg_av)), 100 * np.mean(np.array(F_event_av)))) 166 | 167 |     avg_type = (100 * np.mean(np.array(F_seg_av))+100 * np.mean(np.array(F_seg_a))+100 * np.mean(np.array(F_seg_v)))/3. 168 |     avg_event = 100 * np.mean(np.array(F_seg)) 169 |     print('Segment-level Type@Avg. F1: {:.1f}'.format(avg_type)) 170 |     print('Segment-level Event@Avg. F1: {:.1f}'.format( avg_event)) 171 | 172 |     avg_type_event = (100 * np.mean(np.array(F_event_av)) + 100 * np.mean(np.array(F_event_a)) + 100 * np.mean( 173 |         np.array(F_event_v))) / 3. 174 |     avg_event_level = 100 * np.mean(np.array(F_event)) 175 |     print('Event-level Type@Avg. F1: {:.1f}'.format( avg_type_event)) 176 |     print('Event-level Event@Avg. F1: {:.1f}'.format(avg_event_level)) 177 |     print("\n") 178 |     return avg_type 179 | 180 | 181 | def main(): 182 |     parser = argparse.ArgumentParser(description='PyTorch Implementation of Audio-Visual Video Parsing') 183 |     parser.add_argument( 184 |         "--audio_dir", type=str, default='data/feats/vggish/', help="audio dir") 185 |     parser.add_argument( 186 |         "--video_dir", type=str, default='data/feats/res152/', 187 |         help="video dir") 188 |     parser.add_argument( 189 |         "--st_dir", type=str, default='data/feats/r2plus1d_18/', 190 |         help="spatio-temporal feature dir") 191 |     parser.add_argument( 192 |         "--label_train", type=str, default="data/AVVP_train.csv", help="weak train csv file") 193 |     parser.add_argument( 194 |         "--label_val", type=str, default="data/AVVP_val_pd.csv", help="weak val csv file") 195 |     parser.add_argument( 196 |         "--label_test", type=str, default="data/AVVP_test_pd.csv", help="weak test csv file") 197 |     parser.add_argument('--batch-size', type=int, default=16, metavar='N', 198 |                         help='input batch size for training (default: 16)') 199 |     parser.add_argument('--epochs', type=int, default=12, metavar='N', 200 |                         help='number of epochs to train (default: 12)') 201 |     parser.add_argument('--lr', type=float, default=3e-4, metavar='LR', 202 |                         help='learning rate (default: 3e-4)') 203 |     parser.add_argument( 204 |         "--model", type=str, default='MMIL_Net', help="which model to use") 205 |     parser.add_argument( 206 |         "--mode", type=str, default='train', help="which mode to use") 207 |     parser.add_argument('--seed', type=int, default=1, metavar='S', 208 |                         help='random seed (default: 1)') 209 |     parser.add_argument('--log-interval', type=int, default=50, metavar='N', 210 |                         help='how many batches to wait before logging training status') 211 |     parser.add_argument( 212 |         "--model_save_dir", type=str, default='models/', help="model save dir") 213 |     parser.add_argument( 214 |         "--checkpoint", type=str, default='MMIL_Net', 215 |
help="save model name") 216 | parser.add_argument( 217 | '--gpu', type=str, default='0', help='gpu device number') 218 | args = parser.parse_args() 219 | 220 | torch.manual_seed(args.seed) 221 | 222 | model = MMIL_Net().to('cuda') 223 | 224 | if args.mode == 'retrain': 225 | train_dataset = LLP_dataset(train=True, label=args.label_train, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ToTensor()])) 226 | val_dataset = LLP_dataset(label=args.label_val, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ToTensor()])) 227 | test_dataset = LLP_dataset(train=False, label=args.label_test, audio_dir=args.audio_dir, video_dir=args.video_dir, 228 | st_dir=args.st_dir, transform=transforms.Compose([ToTensor()])) 229 | 230 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=12, pin_memory = True) 231 | val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory = True) 232 | test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True) 233 | 234 | optimizer = optim.Adam(model.parameters(), lr=args.lr) 235 | scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) 236 | criterion = nn.BCELoss() 237 | best_F = 0 238 | 239 | for epoch in range(1, args.epochs + 1): 240 | train(args, model, train_loader, optimizer, criterion, epoch=epoch) 241 | scheduler.step() 242 | print("Validation Performance of Epoch {}:".format(epoch)) 243 | F = eval(model, val_loader, args.label_val) 244 | 245 | print("Test the latest model:") 246 | F=eval(model, test_loader, args.label_test) 247 | 248 | elif args.mode == 'val': 249 | test_dataset = LLP_dataset(label=args.label_val, audio_dir=args.audio_dir, video_dir=args.video_dir, 250 | st_dir=args.st_dir, transform=transforms.Compose([ 251 | ToTensor()])) 252 | test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True) 253 | model.load_state_dict(torch.load(args.model_save_dir + args.checkpoint + ".pt")) 254 | eval(model, test_loader, args.label_val) 255 | else: 256 | test_dataset = LLP_dataset(label=args.label_test, audio_dir=args.audio_dir, video_dir=args.video_dir, st_dir=args.st_dir, transform = transforms.Compose([ 257 | ToTensor()])) 258 | test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, pin_memory=True) 259 | model.load_state_dict(torch.load(args.model_save_dir + args.checkpoint + ".pt")) 260 | eval(model, test_loader, args.label_test) 261 | if __name__ == '__main__': 262 | main() 263 | -------------------------------------------------------------------------------- /step3_retrain/need_to_change.pkl: -------------------------------------------------------------------------------- 1 | ../step2_find_exchange/need_to_change.pkl -------------------------------------------------------------------------------- /step3_retrain/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step3_retrain/nets/__init__.py -------------------------------------------------------------------------------- /step3_retrain/nets/models_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.autograd as autograd 3 | import torch.nn as nn 4 | import 
torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.autograd import Variable 7 | from torch.nn import init 8 | 9 | import math 10 | import numpy as np 11 | 12 | 13 | 14 | 15 | 16 | class ScaledDotProductAttention(nn.Module): 17 | ''' Scaled Dot-Product Attention ''' 18 | 19 | def __init__(self, temperature, attn_dropout=0.1): 20 | super().__init__() 21 | self.temperature = temperature 22 | self.dropout = nn.Dropout(attn_dropout) 23 | self.softmax = nn.Softmax(dim=2) 24 | 25 | def forward(self, q, k, v): 26 | 27 | attn = torch.bmm(q, k.transpose(1, 2)) 28 | attn = attn / self.temperature 29 | 30 | attn = self.softmax(attn) 31 | attn = self.dropout(attn) 32 | output = torch.bmm(attn, v) 33 | 34 | return output, attn 35 | 36 | 37 | class LayerNorm(nn.Module): 38 | 39 | def __init__(self, features, eps=1e-6): 40 | super().__init__() 41 | self.gamma = nn.Parameter(torch.ones(features)) 42 | self.beta = nn.Parameter(torch.zeros(features)) 43 | self.eps = eps 44 | 45 | def forward(self, x): 46 | mean = x.mean(-1, keepdim=True) 47 | std = x.std(-1, keepdim=True) 48 | return self.gamma * (x - mean) / (std + self.eps) + self.beta 49 | 50 | 51 | class PositionalEncoding2(nn.Module): 52 | "Implement the PE function." 53 | def __init__(self, d_model, dropout, max_len=500): 54 | super(PositionalEncoding, self).__init__() 55 | self.dropout = nn.Dropout(p=dropout) 56 | 57 | # Compute the positional encodings once in log space. 58 | pe = torch.zeros(max_len, d_model) 59 | position = torch.arange(0., max_len).unsqueeze(1) 60 | div_term = torch.exp(torch.arange(0., d_model, 2) * 61 | -(math.log(10000.0) / d_model)) 62 | pe[:, 0::2] = torch.sin(position * div_term) 63 | pe[:, 1::2] = torch.cos(position * div_term) 64 | pe = pe.unsqueeze(0) 65 | self.register_buffer('pe', pe) 66 | 67 | def forward(self, x): 68 | with torch.no_grad(): 69 | x1 = self.pe[:, :x.size(1)] 70 | x = x + x1 71 | #x = x + Variable(self.pe[:, :x.size(1)], 72 | # requires_grad=False) 73 | return self.dropout(x) 74 | 75 | class MultiHeadAttention2(nn.Module): 76 | ''' Multi-Head Attention module ''' 77 | 78 | def __init__(self, d_model, d_k, d_v, n_head=1, dropout=0.5): 79 | super().__init__() 80 | 81 | self.n_head = n_head 82 | self.d_k = d_k 83 | self.d_v = d_v 84 | 85 | self.w_qs = nn.Linear(d_model, n_head * d_k) 86 | self.w_ks = nn.Linear(d_model, n_head * d_k) 87 | self.w_vs = nn.Linear(d_model, n_head * d_v) 88 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 89 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 90 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) 91 | 92 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 93 | self.layer_norm = LayerNorm(d_model) 94 | 95 | self.fc = nn.Linear(n_head * d_v, d_model) 96 | nn.init.xavier_normal_(self.fc.weight) 97 | 98 | self.dropout = nn.Dropout(dropout) 99 | 100 | 101 | def forward(self, q, k, v): 102 | 103 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 104 | 105 | sz_b, len_q, _ = q.size() 106 | sz_b, len_k, _ = k.size() 107 | sz_b, len_v, _ = v.size() 108 | 109 | residual = q 110 | 111 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 112 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 113 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 114 | 115 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 116 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 117 | v = v.permute(2, 0, 1, 
3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 118 | 119 | output, attn = self.attention(q, k, v) 120 | 121 | output = output.view(n_head, sz_b, len_q, d_v) 122 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) 123 | 124 | output = self.dropout(self.fc(output)) 125 | output = self.layer_norm(output + residual) 126 | 127 | return output, attn 128 | 129 | 130 | 131 | 132 | 133 | 134 | class ContrastiveLoss(torch.nn.Module): 135 | """ 136 | Contrastive loss function. 137 | Based on: 138 | """ 139 | def __init__(self, margin=2.0): 140 | super(ContrastiveLoss, self).__init__() 141 | self.margin = margin 142 | 143 | def forward(self, dist, y): 144 | # euclidian distance 145 | dist_sq = torch.pow(dist, 2) 146 | dist = torch.clamp(self.margin - dist, min=0.0) 147 | 148 | 149 | assert len(y.data.shape) == 2, y.data.shape 150 | bs, time = y.data.shape 151 | y = y.view(-1) 152 | 153 | loss = y * dist_sq + (1 - y) * torch.pow(dist, 2) 154 | loss = torch.mean(loss) 155 | return loss 156 | 157 | 158 | 159 | 160 | 161 | class BinaryFocalLoss(nn.Module): 162 | """ 163 | This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 164 | 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' 165 | Focal_Loss= -1*alpha*(1-pt)*log(pt) 166 | :param num_class: 167 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 168 | :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more 169 | focus on hard misclassified example 170 | :param reduction: `none`|`mean`|`sum` 171 | :param **kwargs 172 | balance_index: (int) balance class index, should be specific when alpha is float 173 | """ 174 | 175 | def __init__(self, alpha=[1.0, 1.0], gamma=2, ignore_index=None, reduction='mean'): 176 | super(BinaryFocalLoss, self).__init__() 177 | if alpha is None: 178 | alpha = [0.25, 0.75] 179 | self.alpha = alpha 180 | self.gamma = gamma 181 | self.smooth = 1e-6 182 | self.ignore_index = ignore_index 183 | self.reduction = reduction 184 | 185 | assert self.reduction in ['none', 'mean', 'sum'] 186 | 187 | if self.alpha is None: 188 | self.alpha = torch.ones(2) 189 | elif isinstance(self.alpha, (list, np.ndarray)): 190 | self.alpha = np.asarray(self.alpha) 191 | self.alpha = np.reshape(self.alpha, (2)) 192 | assert self.alpha.shape[0] == 2, \ 193 | 'the `alpha` shape is not match the number of class' 194 | elif isinstance(self.alpha, (float, int)): 195 | self.alpha = np.asarray([self.alpha, 1.0 - self.alpha], dtype=np.float).view(2) 196 | 197 | else: 198 | raise TypeError('{} not supported'.format(type(self.alpha))) 199 | 200 | def forward(self, output, target): 201 | prob = torch.sigmoid(output) 202 | prob = torch.clamp(prob, self.smooth, 1.0 - self.smooth) 203 | 204 | pos_mask = (target == 1).float() 205 | neg_mask = (target == 0).float() 206 | 207 | pos_loss = -self.alpha[0] * torch.pow(torch.sub(1.0, prob), self.gamma) * torch.log(prob) * pos_mask 208 | neg_loss = -self.alpha[1] * torch.pow(prob, self.gamma) * \ 209 | torch.log(torch.sub(1.0, prob)) * neg_mask 210 | 211 | neg_loss = neg_loss.sum() 212 | pos_loss = pos_loss.sum() 213 | num_pos = pos_mask.view(pos_mask.size(0), -1).sum() 214 | num_neg = neg_mask.view(neg_mask.size(0), -1).sum() 215 | 216 | if num_pos == 0: 217 | loss = neg_loss 218 | else: 219 | loss = pos_loss / num_pos + neg_loss / num_neg 220 | return loss 221 | 222 | 223 | 224 | class FocalLoss_Ori(nn.Module): 225 | """ 226 | 
This is a implementation of Focal Loss with smooth label cross entropy supported which is proposed in 227 | 'Focal Loss for Dense Object Detection. (https://arxiv.org/abs/1708.02002)' 228 | Focal_Loss= -1*alpha*(1-pt)*log(pt) 229 | :param num_class: 230 | :param alpha: (tensor) 3D or 4D the scalar factor for this criterion 231 | :param gamma: (float,double) gamma > 0 reduces the relative loss for well-classified examples (p>0.5) putting more 232 | focus on hard misclassified example 233 | :param smooth: (float,double) smooth value when cross entropy 234 | :param size_average: (bool, optional) By default, the losses are averaged over each loss element in the batch. 235 | """ 236 | 237 | def __init__(self, num_class, alpha=[0.25,0.75], gamma=2, balance_index=-1, size_average=True): 238 | super(FocalLoss_Ori, self).__init__() 239 | self.num_class = num_class 240 | self.alpha = alpha 241 | self.gamma = gamma 242 | self.size_average = size_average 243 | self.eps = 1e-6 244 | 245 | 246 | def forward(self, logit, target): 247 | 248 | if logit.dim() > 2: 249 | # N,C,d1,d2 -> N,C,m (m=d1*d2*...) 250 | logit = logit.view(logit.size(0), logit.size(1), -1) 251 | logit = logit.transpose(1, 2).contiguous() # [N,C,d1*d2..] -> [N,d1*d2..,C] 252 | logit = logit.view(-1, logit.size(-1)) # [N,d1*d2..,C]-> [N*d1*d2..,C] 253 | target = target.view(-1, 1) # [N,d1,d2,...]->[N*d1*d2*...,1] 254 | 255 | # -----------legacy way------------ 256 | # idx = target.cpu().long() 257 | # one_hot_key = torch.FloatTensor(target.size(0), self.num_class).zero_() 258 | # one_hot_key = one_hot_key.scatter_(1, idx, 1) 259 | # if one_hot_key.device != logit.device: 260 | # one_hot_key = one_hot_key.to(logit.device) 261 | # pt = (one_hot_key * logit).sum(1) + epsilon 262 | 263 | # ----------memory saving way-------- 264 | pt = logit.gather(1, target).view(-1) + self.eps # avoid apply 265 | logpt = pt.log() 266 | 267 | loss = -1 * torch.pow(torch.add(0.5, pt), self.gamma) * logpt 268 | 269 | if self.size_average: 270 | loss = loss.mean() 271 | else: 272 | loss = loss.sum() 273 | return loss 274 | 275 | 276 | -------------------------------------------------------------------------------- /step3_retrain/nets/net_audiovisual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy 6 | import copy 7 | import math 8 | from .models_utils import MultiHeadAttention2, PositionalEncoding2 9 | 10 | def _get_clones(module, N): 11 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 12 | 13 | class Encoder(nn.Module): 14 | 15 | def __init__(self, encoder_layer, num_layers, norm=None): 16 | super(Encoder, self).__init__() 17 | self.layers = _get_clones(encoder_layer, num_layers) 18 | self.num_layers = num_layers 19 | self.norm1 = nn.LayerNorm(512) 20 | self.norm2 = nn.LayerNorm(512) 21 | self.norm = norm 22 | 23 | def forward(self, src_a, src_v, mask=None, src_key_padding_mask=None): 24 | output_a = src_a 25 | output_v = src_v 26 | 27 | for i in range(self.num_layers): 28 | output_a = self.layers[i](src_a, src_v, src_mask=mask, 29 | src_key_padding_mask=src_key_padding_mask) 30 | output_v = self.layers[i](src_v, src_a, src_mask=mask, 31 | src_key_padding_mask=src_key_padding_mask) 32 | 33 | if self.norm: 34 | output_a = self.norm1(output_a) 35 | output_v = self.norm2(output_v) 36 | 37 | return output_a, output_v 38 | 39 | class HANLayer(nn.Module): 40 | 41 | def 
__init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1): 42 | super(HANLayer, self).__init__() 43 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 44 | self.cm_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 45 | 46 | # Implementation of Feedforward model 47 | self.linear1 = nn.Linear(d_model, dim_feedforward) 48 | self.dropout = nn.Dropout(dropout) 49 | self.linear2 = nn.Linear(dim_feedforward, d_model) 50 | 51 | self.norm1 = nn.LayerNorm(d_model) 52 | self.norm2 = nn.LayerNorm(d_model) 53 | self.dropout11 = nn.Dropout(dropout) 54 | self.dropout12 = nn.Dropout(dropout) 55 | self.dropout2 = nn.Dropout(dropout) 56 | 57 | self.activation = nn.ReLU() 58 | 59 | def forward(self, src_q, src_v, src_mask=None, src_key_padding_mask=None): 60 | """Pass the input through the encoder layer. 61 | 62 | Args: 63 | src: the sequnce to the encoder layer (required). 64 | src_mask: the mask for the src sequence (optional). 65 | src_key_padding_mask: the mask for the src keys per batch (optional). 66 | 67 | Shape: 68 | see the docs in Transformer class. 69 | """ 70 | src_q = src_q.permute(1, 0, 2) 71 | src_v = src_v.permute(1, 0, 2) 72 | 73 | src1 = self.cm_attn(src_q, src_v, src_v, attn_mask=src_mask, 74 | key_padding_mask=src_key_padding_mask)[0] 75 | src2 = self.self_attn(src_q, src_q, src_q, attn_mask=src_mask, 76 | key_padding_mask=src_key_padding_mask)[0] 77 | 78 | src_q = src_q + self.dropout11(src1) + self.dropout12(src2) 79 | src_q = self.norm1(src_q) 80 | 81 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q)))) 82 | src_q = src_q + self.dropout2(src2) 83 | src_q = self.norm2(src_q) 84 | return src_q.permute(1, 0, 2) 85 | 86 | 87 | 88 | class MMIL_Net(nn.Module): 89 | 90 | def __init__(self): 91 | super(MMIL_Net, self).__init__() 92 | 93 | self.fc_prob = nn.Linear(512, 25) 94 | self.fc_frame_att = nn.Linear(512, 25) 95 | self.fc_av_att = nn.Linear(512, 25) 96 | self.fc_a = nn.Linear(128, 512) 97 | self.fc_v = nn.Linear(2048, 512) 98 | self.fc_st = nn.Linear(512, 512) 99 | self.fc_fusion = nn.Linear(1024, 512) 100 | self.audio_encoder = nn.TransformerEncoder \ 101 | (nn.TransformerEncoderLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 102 | self.visual_encoder = nn.TransformerEncoder \ 103 | (nn.TransformerEncoderLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 104 | self.cmt_encoder = Encoder(CMTLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 105 | self.hat_encoder = Encoder(HANLayer(d_model=512, nhead=1, dim_feedforward=512), num_layers=1) 106 | 107 | self.t_att = MultiHeadAttention2(512, 512, 512) 108 | self.t_att2 = MultiHeadAttention2(512, 512, 512) 109 | self.fc1= nn.Linear(1024, 256) 110 | self.fc2= nn.Linear(256, 2) 111 | 112 | def forward(self, audio, visual, visual_st, audio2): 113 | 114 | b, t, d = visual_st.size() 115 | x1 = self.fc_a(audio) 116 | x_fake = self.fc_a(audio2) 117 | x_audio = x1 118 | 119 | # 2d and 3d visual feature fusion (b, 80, 2048), (b, 10, 512) 120 | 121 | # merge (b, 80, 2048) -> (b, 10, 512) 122 | vid_s = self.fc_v(visual).permute(0, 2, 1).unsqueeze(-1) 123 | vid_s = F.avg_pool2d(vid_s, (8, 1)).squeeze(-1).permute(0, 2, 1) 124 | 125 | vid_st = self.fc_st(visual_st) 126 | x2 = torch.cat((vid_s, vid_st), dim =-1) 127 | x2 = self.fc_fusion(x2) 128 | x_visual = x2 129 | 130 | # HAN 131 | x1, x2 = self.hat_encoder(x1, x2) 132 | sims = 1 133 | 134 | xx1 = F.normalize(x_visual, p=2, dim=-1) 135 | xx2 = F.normalize(x1, p=2, dim=-1) 136 | 137 | sims = 
xx2.bmm(xx1.permute(0, 2, 1)).squeeze(1) / 0.2 138 | sims = sims.reshape(-1, 10) 139 | 140 | mask = torch.zeros(b, 10) 141 | mask = mask.long() 142 | for i in range(10): 143 | mask[:, i] = i 144 | mask = mask.cuda() 145 | mask = mask.reshape(-1) 146 | # prediction 147 | x = torch.cat([x1.unsqueeze(-2), x2.unsqueeze(-2)], dim=-2) 148 | frame_prob = torch.sigmoid(self.fc_prob(x)) 149 | 150 | # attentive MMIL pooling 151 | 152 | frame_att = torch.softmax(self.fc_frame_att(x), dim=1) 153 | av_att = torch.softmax(self.fc_av_att(x), dim=2) 154 | temporal_prob = (frame_att * frame_prob) 155 | global_prob = (temporal_prob*av_att).sum(dim=2).sum(dim=1) 156 | 157 | a_prob = temporal_prob[:, :, 0, :].sum(dim=1) 158 | v_prob =temporal_prob[:, :, 1, :].sum(dim=1) 159 | 160 | return global_prob, a_prob, v_prob, frame_prob, sims, mask 161 | 162 | 163 | class CMTLayer(nn.Module): 164 | 165 | def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1): 166 | super(CMTLayer, self).__init__() 167 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 168 | # Implementation of Feedforward model 169 | self.linear1 = nn.Linear(d_model, dim_feedforward) 170 | self.dropout = nn.Dropout(dropout) 171 | self.linear2 = nn.Linear(dim_feedforward, d_model) 172 | 173 | self.norm1 = nn.LayerNorm(d_model) 174 | self.norm2 = nn.LayerNorm(d_model) 175 | self.dropout1 = nn.Dropout(dropout) 176 | self.dropout2 = nn.Dropout(dropout) 177 | 178 | self.activation = nn.ReLU() 179 | 180 | def forward(self, src_q, src_v, src_mask=None, src_key_padding_mask=None): 181 | r"""Pass the input through the encoder layer. 182 | 183 | Args: 184 | src: the sequnce to the encoder layer (required). 185 | src_mask: the mask for the src sequence (optional). 186 | src_key_padding_mask: the mask for the src keys per batch (optional). 187 | 188 | Shape: 189 | see the docs in Transformer class. 190 | """ 191 | src2 = self.self_attn(src_q, src_v, src_v, attn_mask=src_mask, 192 | key_padding_mask=src_key_padding_mask)[0] 193 | src_q = src_q + self.dropout1(src2) 194 | src_q = self.norm1(src_q) 195 | 196 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q)))) 197 | src_q = src_q + self.dropout2(src2) 198 | src_q = self.norm2(src_q) 199 | return src_q 200 | -------------------------------------------------------------------------------- /step3_retrain/run.sh: -------------------------------------------------------------------------------- 1 | #CUDA_VISIBLE_DEVICES=1 . 
2 | python main_avvp.py --mode retrain --audio_dir ../feats/vggish/ --video_dir ../feats/res152/ --st_dir ../feats/r2plus1d_18 3 | -------------------------------------------------------------------------------- /step3_retrain/scripts/download_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | def download(set, name, t_seg): 5 | #label = label.replace(" ", "_") # avoid space in folder name 6 | path_data = os.path.join(set, "video") 7 | print(path_data) 8 | if not os.path.exists(path_data): 9 | os.makedirs(path_data) 10 | link_prefix = "https://www.youtube.com/watch?v=" 11 | 12 | filename_full_video = os.path.join(path_data, name) + "_full_video.mp4" 13 | filename = os.path.join(path_data, name) + ".mp4" 14 | link = link_prefix + name 15 | 16 | if os.path.exists(filename): 17 | print("already exists, skip") 18 | return 19 | 20 | print( "download the whole video for: [%s] - [%s]" % (set, name)) 21 | command1 = 'youtube-dl --ignore-config ' 22 | command1 += link + " " 23 | command1 += "-o " + filename_full_video + " " 24 | command1 += "-f best " 25 | 26 | #command1 += '-q ' # print no log 27 | #print command1 28 | os.system(command1) 29 | 30 | t_start, t_end = t_seg 31 | t_dur = t_end - t_start 32 | print("trim the video to [%.1f-%.1f]" % (t_start, t_end)) 33 | command2 = 'ffmpeg ' 34 | command2 += '-ss ' 35 | command2 += str(t_start) + ' ' 36 | command2 += '-i ' 37 | command2 += filename_full_video + ' ' 38 | command2 += '-t ' 39 | command2 += str(t_dur) + ' ' 40 | command2 += '-vcodec libx264 ' 41 | command2 += '-acodec aac -strict -2 ' 42 | command2 += filename + ' ' 43 | command2 += '-y ' # overwrite without asking 44 | command2 += '-loglevel -8 ' # print no log 45 | #print(command2) 46 | os.system(command2) 47 | try: 48 | os.remove(filename_full_video) 49 | except: 50 | return 51 | 52 | print ("finish the video as: " + filename) 53 | 54 | 55 | ##%% read the label encoding 56 | # filename = "../doc/class_labels_indices.csv" 57 | # lines = [x.strip() for x in open(filename, 'r')][1:] 58 | # label_encode = {} 59 | # for l in lines: 60 | # l = l[l.find(",")+1:] 61 | # encode = l.split(",")[0] 62 | # label_encode[ l[len(encode)+2:-1] ] = encode 63 | # 64 | # 65 | # 66 | 67 | # %% read the video trim time indices 68 | filename_source = "data/AVVP_dataset_full.csv" # 69 | set = "data/LLP_dataset" 70 | df = pd.read_csv(filename_source, header=0, sep='\t') 71 | filenames = df["filename"] 72 | length = len(filenames) 73 | print(length) 74 | names = [] 75 | segments = {} 76 | for i in range(length): 77 | row = df.loc[i, :] 78 | name = row[0][:11] 79 | steps = row[0][11:].split("_") 80 | t_start = float(steps[1]) 81 | t_end = t_start + 10 82 | segments[name] = (t_start, t_end) 83 | download(set, name, segments[name]) 84 | names.append(name) 85 | print(len(segments)) 86 | 87 | -------------------------------------------------------------------------------- /step3_retrain/scripts/extract_3D_feat.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import sys 3 | import subprocess 4 | import glob 5 | from tqdm import tqdm 6 | import numpy as np 7 | import os 8 | import argparse 9 | from PIL import Image 10 | import torch 11 | from torch import nn 12 | import torch.nn.functional as F 13 | import torchvision.models as models 14 | import transforms as TF 15 | import utils 16 | import torchvision 17 | 18 | 19 | C, H, W = 3, 112, 112 20 | 21 | def 
extract_feats(params, model, load_img): 22 | global C, H, W 23 | model.eval() 24 | dir_fc = os.path.join(os.getcwd(), params['output_dir']) 25 | if not os.path.isdir(dir_fc): 26 | os.mkdir(dir_fc) 27 | 28 | video_list = os.listdir(params['video_path']) 29 | nn = 0 30 | for video in video_list: 31 | 32 | nn = nn + 1 33 | dst = video 34 | 35 | image_list = sorted(glob.glob(os.path.join(params['video_path'], dst, '*.jpg'))) 36 | samples = np.round(np.linspace( 37 | 0, len(image_list) - 1, params['n_frame_steps'])) 38 | 39 | image_list = [image_list[int(sample)] for sample in samples] 40 | images = torch.zeros((len(image_list)//8, C, 8, H, W)) 41 | i = 0 42 | for iImg in range(len(image_list)): 43 | 44 | ii = i//8 45 | img = load_img(image_list[iImg]) 46 | images[ii, :, i%8, :, :] = img 47 | i += 1 48 | 49 | with torch.no_grad(): 50 | fc_feats = model(images.cuda()).squeeze() 51 | img_feats = fc_feats.cpu().numpy() 52 | # Save the inception features 53 | outfile = os.path.join(dir_fc, video + '.npy') 54 | np.save(outfile, img_feats) 55 | # cleanup 56 | #shutil.rmtree(dst) 57 | print(nn) 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument("--gpu", dest='gpu', type=str, default='1', 63 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 64 | parser.add_argument("--output_dir", dest='output_dir', type=str, 65 | default='data/LLP_dataset/feats/r2plus1d_18', help='directory to store features') 66 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 67 | help='how many frames to sampler per video') 68 | 69 | parser.add_argument("--video_path", dest='video_path', type=str, 70 | default='data/LLP_dataset/frame', help='path to video dataset') 71 | parser.add_argument("--model", dest="model", type=str, default='r2plus1d_18', 72 | help='the CNN model you want to use to extract_feats') 73 | 74 | args = parser.parse_args() 75 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 76 | params = vars(args) 77 | if params['model'] == 'r2plus1d_18': 78 | model = models.video.r2plus1d_18(pretrained=True) 79 | model = nn.Sequential(*list(model.children())[:-1]) 80 | for param in model.parameters(): 81 | param.requires_grad = False 82 | T, C, H, W = 8, 3, 112, 112 83 | load_img = utils.LoadTransformImage() 84 | 85 | else: 86 | print("doesn't support %s" % (params['model'])) 87 | 88 | model = nn.DataParallel(model) 89 | model = model.cuda() 90 | extract_feats(params, model, load_img) 91 | -------------------------------------------------------------------------------- /step3_retrain/scripts/extract_audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import moviepy 3 | from moviepy.audio.AudioClip import AudioArrayClip 4 | from moviepy.editor import VideoFileClip 5 | 6 | video_pth = "data/LLP_dataset/video" 7 | sound_list = os.listdir(video_pth) 8 | save_pth = "data/LLP_dataset/audio" 9 | 10 | for audio_id in sound_list: 11 | name = os.path.join(video_pth, audio_id) 12 | audio_name = audio_id[:-4] + '.wav' 13 | exist_lis = os.listdir(save_pth) 14 | if audio_name in exist_lis: 15 | print("already exist!") 16 | continue 17 | try: 18 | video = VideoFileClip(name) 19 | audio = video.audio 20 | audio.write_audiofile(os.path.join(save_pth, audio_name), fps=16000) 21 | print("finish video id: " + audio_name) 22 | except: 23 | print("cannot load ", name) -------------------------------------------------------------------------------- /step3_retrain/scripts/extract_frames.py: 
-------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import os 4 | import argparse 5 | import glob 6 | 7 | def extract_frames(video, dst): 8 | command1 = 'ffmpeg ' 9 | command1 += '-i ' + video + " " 10 | command1 += '-y' + " " 11 | command1 += "-r " + "8 " 12 | command1 += '{0}/%06d.jpg'.format(dst) 13 | print(command1) 14 | # print command1 15 | os.system(command1) 16 | 17 | return 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--out_dir', dest='out_dir', type=str, default='data/LLP_dataset/frame') 22 | parser.add_argument('--video_path', dest='video_path', type=str, default='data/LLP_dataset/video') 23 | args = parser.parse_args() 24 | 25 | vid_list = os.listdir(args.video_path) 26 | 27 | for vid_id in vid_list: 28 | name = os.path.join(args.video_path, vid_id) 29 | dst = os.path.join(args.out_dir, vid_id[:-4]) 30 | print(dst) 31 | if not os.path.exists(dst): 32 | os.makedirs(dst) 33 | extract_frames(name, dst) 34 | print("finish video id: " + vid_id) 35 | -------------------------------------------------------------------------------- /step3_retrain/scripts/extract_rgb_feat.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | from tqdm import tqdm 5 | import numpy as np 6 | import os 7 | import argparse 8 | from PIL import Image 9 | import torch 10 | from torch import nn 11 | import torch.nn.functional as F 12 | import pretrainedmodels 13 | from pretrainedmodels import utils 14 | 15 | C, H, W = 3, 224, 224 16 | 17 | def extract_feats(params, model, load_image_fn): 18 | global C, H, W 19 | model.eval() 20 | dir_fc = os.path.join(os.getcwd(), params['output_dir']) 21 | if not os.path.isdir(dir_fc): 22 | os.mkdir(dir_fc) 23 | 24 | video_list = os.listdir(params['video_path']) 25 | nn = 0 26 | for video in video_list: 27 | 28 | nn = nn + 1 29 | dst = video 30 | 31 | image_list = sorted(glob.glob(os.path.join(params['video_path'], dst, '*.jpg'))) 32 | samples = np.round(np.linspace( 33 | 0, len(image_list) - 1, params['n_frame_steps'])) 34 | 35 | image_list = [image_list[int(sample)] for sample in samples] 36 | images = torch.zeros((len(image_list), C, H, W)) 37 | i = 0 38 | for iImg in range(len(image_list)): 39 | img = load_image_fn(image_list[iImg]) 40 | images[iImg] = img 41 | 42 | 43 | with torch.no_grad(): 44 | fc_feats = model(images.cuda()).squeeze() 45 | img_feats = fc_feats.cpu().numpy() 46 | #print(img_feats.shape) 47 | # Save the inception features 48 | outfile = os.path.join(dir_fc, video + '.npy') 49 | np.save(outfile, img_feats) 50 | # cleanup 51 | #shutil.rmtree(dst) 52 | print(nn) 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument("--gpu", dest='gpu', type=str, default='0', 58 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 59 | parser.add_argument("--output_dir", dest='output_dir', type=str, 60 | default='data/LLP_dataset/feats/res152', help='directory to store features') 61 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=80, 62 | help='how many frames to sampler per video') 63 | 64 | parser.add_argument("--video_path", dest='video_path', type=str, 65 | default='data/LLP_dataset/frame', help='path to video dataset') 66 | parser.add_argument("--model", dest="model", type=str, default='resnet152', 67 | help='the CNN model you want to use to 
extract_feats') 68 | 69 | args = parser.parse_args() 70 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 71 | params = vars(args) 72 | if params['model'] == 'inception_v3': 73 | C, H, W = 3, 299, 299 74 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 75 | load_image_fn = utils.LoadTransformImage(model) 76 | 77 | elif params['model'] == 'resnet152': 78 | C, H, W = 3, 224, 224 79 | model = pretrainedmodels.resnet152(pretrained='imagenet') 80 | load_image_fn = utils.LoadTransformImage(model) 81 | elif params['model'] == 'vgg19_bn': 82 | C, H, W = 3, 224, 224 83 | model = pretrainedmodels.vgg19_bn(pretrained='imagenet') 84 | load_image_fn = utils.LoadTransformImage(model) 85 | elif params['model'] == 'inception_v4': 86 | C, H, W = 3, 299, 299 87 | model = pretrainedmodels.inceptionv4( 88 | num_classes=1000, pretrained='imagenet') 89 | load_image_fn = utils.LoadTransformImage(model) 90 | elif params['model'] == 'nasnetalarge': 91 | C, H, W = 3, 331, 331 92 | model = pretrainedmodels.nasnetalarge( 93 | num_classes=1000, pretrained='imagenet') 94 | load_image_fn = utils.LoadTransformImage(model) 95 | 96 | else: 97 | print("doesn't support %s" % (params['model'])) 98 | 99 | model.last_linear = utils.Identity() 100 | model = nn.DataParallel(model) 101 | 102 | model = model.cuda() 103 | extract_feats(params, model, load_image_fn) 104 | -------------------------------------------------------------------------------- /step3_retrain/scripts/transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | def crop(vid, i, j, h, w): 6 | return vid[..., i:(i + h), j:(j + w)] 7 | 8 | 9 | def center_crop(vid, output_size): 10 | h, w = vid.shape[-2:] 11 | th, tw = output_size 12 | 13 | i = int(round((h - th) / 2.)) 14 | j = int(round((w - tw) / 2.)) 15 | return crop(vid, i, j, th, tw) 16 | 17 | 18 | def hflip(vid): 19 | return vid.flip(dims=(-1,)) 20 | 21 | 22 | # NOTE: for those functions, which generally expect mini-batches, we keep them 23 | # as non-minibatch so that they are applied as if they were 4d (thus image). 24 | # this way, we only apply the transformation in the spatial domain 25 | def resize(vid, size, interpolation='bilinear'): 26 | # NOTE: using bilinear interpolation because we don't work on minibatches 27 | # at this level 28 | scale = None 29 | if isinstance(size, int): 30 | scale = float(size) / min(vid.shape[-2:]) 31 | size = None 32 | return torch.nn.functional.interpolate( 33 | vid, size=size, scale_factor=scale, mode=interpolation, align_corners=False) 34 | 35 | 36 | def pad(vid, padding, fill=0, padding_mode="constant"): 37 | # NOTE: don't want to pad on temporal dimension, so let as non-batch 38 | # (4d) before padding. This works as expected 39 | return torch.nn.functional.pad(vid, padding, value=fill, mode=padding_mode) 40 | 41 | 42 | def to_normalized_float_tensor(vid): 43 | return vid.permute(3, 0, 1, 2).to(torch.float32) / 255 44 | 45 | 46 | def normalize(vid, mean, std): 47 | shape = (-1,) + (1,) * (vid.dim() - 1) 48 | mean = torch.as_tensor(mean).reshape(shape) 49 | std = torch.as_tensor(std).reshape(shape) 50 | return (vid - mean) / std 51 | 52 | 53 | # Class interface 54 | 55 | class RandomCrop(object): 56 | def __init__(self, size): 57 | self.size = size 58 | 59 | @staticmethod 60 | def get_params(vid, output_size): 61 | """Get parameters for ``crop`` for a random crop. 
62 | """ 63 | h, w = vid.shape[-2:] 64 | th, tw = output_size 65 | if w == tw and h == th: 66 | return 0, 0, h, w 67 | i = random.randint(0, h - th) 68 | j = random.randint(0, w - tw) 69 | return i, j, th, tw 70 | 71 | def __call__(self, vid): 72 | i, j, h, w = self.get_params(vid, self.size) 73 | return crop(vid, i, j, h, w) 74 | 75 | 76 | class CenterCrop(object): 77 | def __init__(self, size): 78 | self.size = size 79 | 80 | def __call__(self, vid): 81 | return center_crop(vid, self.size) 82 | 83 | 84 | class Resize(object): 85 | def __init__(self, size): 86 | self.size = size 87 | 88 | def __call__(self, vid): 89 | return resize(vid, self.size) 90 | 91 | 92 | class ToFloatTensorInZeroOne(object): 93 | def __call__(self, vid): 94 | return to_normalized_float_tensor(vid) 95 | 96 | 97 | class Normalize(object): 98 | def __init__(self, mean, std): 99 | self.mean = mean 100 | self.std = std 101 | 102 | def __call__(self, vid): 103 | return normalize(vid, self.mean, self.std) 104 | 105 | 106 | class RandomHorizontalFlip(object): 107 | def __init__(self, p=0.5): 108 | self.p = p 109 | 110 | def __call__(self, vid): 111 | if random.random() < self.p: 112 | return hflip(vid) 113 | return vid 114 | 115 | 116 | class Pad(object): 117 | def __init__(self, padding, fill=0): 118 | self.padding = padding 119 | self.fill = fill 120 | 121 | def __call__(self, vid): 122 | return pad(vid, self.padding, self.fill) 123 | -------------------------------------------------------------------------------- /step3_retrain/scripts/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division, absolute_import 2 | import math 3 | import torch 4 | import torch.nn as nn 5 | import torchvision.transforms as transforms 6 | from PIL import Image 7 | import numpy as np 8 | from munch import munchify 9 | 10 | class ToSpaceBGR(object): 11 | 12 | def __init__(self, is_bgr): 13 | self.is_bgr = is_bgr 14 | 15 | def __call__(self, tensor): 16 | if self.is_bgr: 17 | new_tensor = tensor.clone() 18 | new_tensor[0] = tensor[2] 19 | new_tensor[2] = tensor[0] 20 | tensor = new_tensor 21 | return tensor 22 | 23 | 24 | class ToRange255(object): 25 | 26 | def __init__(self, is_255): 27 | self.is_255 = is_255 28 | 29 | def __call__(self, tensor): 30 | if self.is_255: 31 | tensor.mul_(255) 32 | return tensor 33 | 34 | 35 | class TransformImage(object): 36 | 37 | def __init__(self): 38 | self.mean = [0.43216, 0.394666, 0.37645] 39 | self.std = [0.22803, 0.22145, 0.216989] 40 | tfs = [] 41 | tfs.append(transforms.Resize((112, 112))) 42 | tfs.append(transforms.ToTensor()) 43 | tfs.append(ToSpaceBGR('RGB'=='BGR')) 44 | tfs.append(ToRange255(max([0, 1])==255)) 45 | tfs.append(transforms.Normalize(mean=self.mean, std=self.std)) 46 | 47 | self.tf = transforms.Compose(tfs) 48 | 49 | def __call__(self, img): 50 | tensor = self.tf(img) 51 | return tensor 52 | 53 | 54 | class LoadImage(object): 55 | 56 | def __init__(self, space='RGB'): 57 | self.space = space 58 | 59 | def __call__(self, path_img): 60 | with open(path_img, 'rb') as f: 61 | with Image.open(f) as img: 62 | img = img.convert(self.space) 63 | return img 64 | 65 | 66 | class LoadTransformImage(object): 67 | 68 | def __init__(self): 69 | self.load = LoadImage() 70 | self.tf = TransformImage() 71 | 72 | def __call__(self, path_img): 73 | img = self.load(path_img) 74 | tensor = self.tf(img) 75 | return tensor 76 | 77 | 78 | class Identity(nn.Module): 79 | 80 | def __init__(self): 81 | super(Identity, 
self).__init__() 82 | 83 | def forward(self, x): 84 | return x -------------------------------------------------------------------------------- /step3_retrain/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/step3_retrain/utils/__init__.py -------------------------------------------------------------------------------- /step3_retrain/utils/eval_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def Precision(X_pre, X_gt): 4 | 5 | N = len(X_pre) 6 | p = 0.0 7 | for i in range(N): 8 | x = X_pre[i, :] 9 | y = X_gt[i, :] 10 | p += np.sum(x*y)/np.sum(x) 11 | return p/N 12 | 13 | 14 | def Recall(X_pre, X_gt): 15 | N = len(X_pre) 16 | p = 0.0 17 | for i in range(N): 18 | x = X_pre[i, :] 19 | y = X_gt[i, :] 20 | p += np.sum(x * y) / np.sum(y) 21 | return p/N 22 | 23 | 24 | def F1(X_pre, X_gt): 25 | N = len(X_pre) 26 | p = 0 27 | for i in range(N): 28 | x = X_pre[i, :] 29 | y = X_gt[i, :] 30 | p += 2*np.sum(x * y) / (np.sum(x) + np.sum(y)) 31 | return p/N 32 | 33 | def event_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av): 34 | # extract events 35 | N = 25 36 | event_p_a = [None for n in range(25)] 37 | event_gt_a = [None for n in range(25)] 38 | event_p_v = [None for n in range(25)] 39 | event_gt_v = [None for n in range(25)] 40 | event_p_av = [None for n in range(25)] 41 | event_gt_av = [None for n in range(25)] 42 | 43 | TP_a = np.zeros(25) 44 | TP_v = np.zeros(25) 45 | TP_av = np.zeros(25) 46 | 47 | FP_a = np.zeros(25) 48 | FP_v = np.zeros(25) 49 | FP_av = np.zeros(25) 50 | 51 | FN_a = np.zeros(25) 52 | FN_v = np.zeros(25) 53 | FN_av = np.zeros(25) 54 | 55 | for n in range(N): 56 | seq_pred = SO_a[n, :] 57 | if np.sum(seq_pred)!=0: 58 | x = extract_event(seq_pred, n) 59 | event_p_a[n] = x 60 | seq_gt = GT_a[n, :] 61 | if np.sum(seq_gt)!=0: 62 | x = extract_event(seq_gt, n) 63 | event_gt_a[n] = x 64 | 65 | seq_pred = SO_v[n, :] 66 | if np.sum(seq_pred) != 0: 67 | x = extract_event(seq_pred, n) 68 | event_p_v[n] = x 69 | seq_gt = GT_v[n, :] 70 | if np.sum(seq_gt) != 0: 71 | x = extract_event(seq_gt, n) 72 | event_gt_v[n] = x 73 | 74 | seq_pred = SO_av[n, :] 75 | if np.sum(seq_pred) != 0: 76 | x = extract_event(seq_pred, n) 77 | event_p_av[n] = x 78 | 79 | seq_gt = GT_av[n, :] 80 | if np.sum(seq_gt) != 0: 81 | x = extract_event(seq_gt, n) 82 | event_gt_av[n] = x 83 | 84 | tp, fp, fn = event_wise_metric(event_p_a[n], event_gt_a[n]) 85 | TP_a[n] += tp 86 | FP_a[n] += fp 87 | FN_a[n] += fn 88 | 89 | tp, fp, fn = event_wise_metric(event_p_v[n], event_gt_v[n]) 90 | TP_v[n] += tp 91 | FP_v[n] += fp 92 | FN_v[n] += fn 93 | 94 | tp, fp, fn = event_wise_metric(event_p_av[n], event_gt_av[n]) 95 | TP_av[n] += tp 96 | FP_av[n] += fp 97 | FN_av[n] += fn 98 | 99 | TP = TP_a + TP_v 100 | FN = FN_a + FN_v 101 | FP = FP_a + FP_v 102 | 103 | n = len(FP_a) 104 | F_a = [] 105 | for ii in range(n): 106 | if (TP_a + FP_a)[ii] != 0 or (TP_a + FN_a)[ii] != 0: 107 | F_a.append(2 * TP_a[ii] / (2 * TP_a[ii] + (FN_a + FP_a)[ii])) 108 | 109 | F_v = [] 110 | for ii in range(n): 111 | if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0: 112 | F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii])) 113 | 114 | F = [] 115 | for ii in range(n): 116 | if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0: 117 | F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii])) 118 | 119 | F_av = [] 120 | for ii 
in range(n): 121 | if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0: 122 | F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii])) 123 | 124 | if len(F_a) == 0: 125 | f_a = 1.0 # all true negatives 126 | else: 127 | f_a = (sum(F_a)/len(F_a)) 128 | 129 | if len(F_v) == 0: 130 | f_v = 1.0 # all true negatives 131 | else: 132 | f_v = (sum(F_v)/len(F_v)) 133 | 134 | if len(F) == 0: 135 | f = 1.0 # all true negatives 136 | else: 137 | f = (sum(F)/len(F)) 138 | if len(F_av) == 0: 139 | f_av = 1.0 # all true negatives 140 | else: 141 | f_av = (sum(F_av)/len(F_av)) 142 | 143 | return f_a, f_v, f, f_av 144 | 145 | 146 | def segment_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av): 147 | # compute F scores 148 | TP_a = np.sum(SO_a * GT_a, axis=1) 149 | FN_a = np.sum((1-SO_a)*GT_a, axis = 1) 150 | FP_a = np.sum(SO_a*(1-GT_a),axis=1) 151 | 152 | n = len(FP_a) 153 | F_a = [] 154 | for ii in range(n): 155 | if (TP_a+FP_a)[ii]!= 0 or (TP_a+FN_a)[ii]!= 0: 156 | F_a.append(2*TP_a[ii] / (2*TP_a[ii] + (FN_a + FP_a)[ii])) 157 | 158 | TP_v = np.sum(SO_v * GT_v, axis=1) 159 | FN_v = np.sum((1 - SO_v) * GT_v, axis=1) 160 | FP_v = np.sum(SO_v * (1 - GT_v), axis=1) 161 | F_v = [] 162 | for ii in range(n): 163 | if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0: 164 | F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii])) 165 | 166 | TP = TP_a + TP_v 167 | FN = FN_a + FN_v 168 | FP = FP_a + FP_v 169 | 170 | n = len(FP) 171 | 172 | F = [] 173 | for ii in range(n): 174 | if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0: 175 | F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii])) 176 | 177 | TP_av = np.sum(SO_av * GT_av, axis=1) 178 | FN_av = np.sum((1 - SO_av) * GT_av, axis=1) 179 | FP_av = np.sum(SO_av * (1 - GT_av), axis=1) 180 | n = len(FP_av) 181 | F_av = [] 182 | for ii in range(n): 183 | if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0: 184 | F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii])) 185 | 186 | 187 | if len(F_a) == 0: 188 | f_a = 1.0 # all true negatives 189 | else: 190 | f_a = (sum(F_a)/len(F_a)) 191 | 192 | if len(F_v) == 0: 193 | f_v = 1.0 # all true negatives 194 | else: 195 | f_v = (sum(F_v)/len(F_v)) 196 | 197 | if len(F) == 0: 198 | f = 1.0 # all true negatives 199 | else: 200 | f = (sum(F)/len(F)) 201 | if len(F_av) == 0: 202 | f_av = 1.0 # all true negatives 203 | else: 204 | f_av = (sum(F_av)/len(F_av)) 205 | 206 | return f_a, f_v, f, f_av 207 | 208 | 209 | def to_vec(start, end): 210 | x = np.zeros(10) 211 | for i in range(start, end): 212 | x[i] = 1 213 | return x 214 | 215 | def extract_event(seq, n): 216 | x = [] 217 | i = 0 218 | while i < 10: 219 | if seq[i] == 1: 220 | start = i 221 | if i + 1 == 10: 222 | i = i + 1 223 | end = i 224 | x.append(to_vec(start, end)) 225 | break 226 | 227 | for j in range(i + 1, 10): 228 | if seq[j] != 1: 229 | i = j + 1 230 | end = j 231 | x.append(to_vec(start, end)) 232 | break 233 | else: 234 | i = j + 1 235 | if i == 10: 236 | end = i 237 | x.append(to_vec(start, end)) 238 | break 239 | else: 240 | i += 1 241 | return x 242 | 243 | def event_wise_metric(event_p, event_gt): 244 | TP = 0 245 | FP = 0 246 | FN = 0 247 | 248 | if event_p is not None: 249 | num_event = len(event_p) 250 | for i in range(num_event): 251 | x1 = event_p[i] 252 | if event_gt is not None: 253 | nn = len(event_gt) 254 | flag = True 255 | for j in range(nn): 256 | x2 = event_gt[j] 257 | if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2): #0.5 258 | TP += 1 259 | flag = False 260 | break 261 | if flag: 262 | FP += 1 263 | else: 264 | FP += 
1 265 | 266 | if event_gt is not None: 267 | num_event = len(event_gt) 268 | for i in range(num_event): 269 | x1 = event_gt[i] 270 | if event_p is not None: 271 | nn = len(event_p) 272 | flag = True 273 | for j in range(nn): 274 | x2 = event_p[j] 275 | if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2): #0.5 276 | flag = False 277 | break 278 | if flag: 279 | FN += 1 280 | else: 281 | FN += 1 282 | return TP, FP, FN 283 | -------------------------------------------------------------------------------- /task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yu-Wu/Modaily-Aware-Audio-Visual-Video-Parsing/46c6fe2e197bf4e09a18b92931d87a6a710d909f/task.png --------------------------------------------------------------------------------