├── used_idxs.pkl ├── __pycache__ ├── utils.cpython-37.pyc ├── datasets.cpython-37.pyc ├── image_cam.cpython-37.pyc ├── base_attacks.cpython-37.pyc ├── image_attacks.cpython-37.pyc ├── video_attacks.cpython-37.pyc ├── dataset_ucf101.cpython-37.pyc ├── image_cam_utils.cpython-37.pyc ├── reference_ucf101.cpython-37.pyc └── transforms_ucf101.cpython-37.pyc ├── utils.py ├── image_fine_tune_attack.py ├── image_main_ucf101.py ├── image_main.py ├── dataset_ucf101.py ├── reference.py ├── attack.py ├── I2V_attack-env.yml ├── attack_ucf101.py ├── README.md ├── reference_ucf101.py ├── image_cam.py ├── run_image_guided.py ├── image_cam_utils.py ├── video_attacks.py ├── datasets.py ├── TPAMI_attack.py ├── transforms_ucf101.py ├── kinetics400_attack_samples.csv ├── image_attacks.py └── base_attacks.py /used_idxs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/used_idxs.pkl -------------------------------------------------------------------------------- /__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/datasets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/datasets.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/image_cam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/image_cam.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/base_attacks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/base_attacks.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/image_attacks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/image_attacks.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/video_attacks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/video_attacks.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/dataset_ucf101.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/dataset_ucf101.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/image_cam_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/image_cam_utils.cpython-37.pyc 
-------------------------------------------------------------------------------- /__pycache__/reference_ucf101.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/reference_ucf101.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/transforms_ucf101.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/transforms_ucf101.cpython-37.pyc -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gluoncv.torch.engine.config import get_cfg_defaults 3 | import torch 4 | 5 | # config info of video models 6 | # refer to https://cv.gluon.ai/model_zoo/action_recognition.html 7 | CONFIG_ROOT = '' # 8 | CONFIG_PATHS = { 9 | 'i3d_resnet50': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet50_v1_kinetics400.yaml'), 10 | 'i3d_resnet101': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet101_v1_kinetics400.yaml'), 11 | 'slowfast_resnet50': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet50_kinetics400.yaml'), 12 | 'slowfast_resnet101': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet101_kinetics400.yaml'), 13 | 'tpn_resnet50': os.path.join(CONFIG_ROOT, 'tpn_resnet50_f32s2_kinetics400.yaml'), 14 | 'tpn_resnet101': os.path.join(CONFIG_ROOT, 'tpn_resnet101_f32s2_kinetics400.yaml') 15 | } 16 | 17 | # data info 18 | UCF_IMAGE_ROOT = '' # 19 | 20 | # save info 21 | OPT_PATH = '' # 22 | 23 | # checkpoints path for ucf101 24 | UCF_CKPT_PATH = '' # 25 | 26 | def change_cfg(cfg, batch_size, random): 27 | # modify video paths and pretrain setting. 28 | cfg.CONFIG.DATA.VAL_DATA_PATH = '' # 29 | cfg.CONFIG.DATA.VAL_ANNO_PATH = './kinetics400_attack_samples.csv' # selected 400 classified correct. 
30 | cfg.CONFIG.MODEL.PRETRAINED = True 31 | cfg.CONFIG.VAL.BATCH_SIZE = batch_size 32 | return cfg 33 | 34 | def get_cfg_custom(cfg_path, batch_size=16, random=False): 35 | cfg = get_cfg_defaults() 36 | cfg.merge_from_file(cfg_path) 37 | cfg = change_cfg(cfg, batch_size, random) 38 | return cfg 39 | 40 | class AverageMeter(object): 41 | """Computes and stores the average and current value""" 42 | 43 | def __init__(self): 44 | self.reset() 45 | 46 | def reset(self): 47 | self.val = 0 48 | self.avg = 0 49 | self.sum = 0 50 | self.count = 0 51 | 52 | def update(self, val, n=1): 53 | self.val = val 54 | self.sum += val * n 55 | self.count += n 56 | self.avg = self.sum / self.count 57 | 58 | def norm_grads(grads, frame_level=True): 59 | # frame level norm 60 | # clip level norm 61 | assert len(grads.shape) == 5 and grads.shape[2] == 32 62 | if frame_level: 63 | norm = torch.mean(torch.abs(grads), [1,3,4], keepdim=True) 64 | else: 65 | norm = torch.mean(torch.abs(grads), [1,2,3,4], keepdim=True) 66 | # norm = torch.norm(grads, dim=[1,2,3,4], p=1) 67 | return grads / norm -------------------------------------------------------------------------------- /image_fine_tune_attack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | import math 6 | import json 7 | from torch.utils.data import Dataset, DataLoader 8 | 9 | import image_attacks 10 | from datasets import get_dataset 11 | from gluoncv.torch.model_zoo import get_model 12 | from utils import CONFIG_PATHS, get_cfg_custom 13 | import pickle as pkl 14 | from reference_ucf101 import MODEL_TO_CKPTS 15 | 16 | class AdvDataset(Dataset): 17 | def __init__(self, used_adv_path, used_ori_path): 18 | self.used_adv_path = used_adv_path 19 | files = os.listdir(self.used_adv_path) 20 | self.files = [i for i in files if 'adv' in i] 21 | self.used_ori_path = used_ori_path 22 | 23 | def __len__(self): 24 | return len(self.files) 25 | 26 | def __getitem__(self, idx): 27 | file = self.files[idx] 28 | vid_id = file.split('-')[0] 29 | ori_file = os.path.join(self.used_ori_path, '{}-ori.npy'.format(vid_id)) 30 | vid = torch.from_numpy(np.load(os.path.join(self.used_adv_path, file))) 31 | vid = vid[None] 32 | ori_vid = torch.from_numpy(np.load(ori_file)) 33 | ori_vid = ori_vid[None] 34 | label = [int(file.split('-')[0])] 35 | label = np.array(label).astype(np.int32) 36 | label = torch.from_numpy(label).long() 37 | return vid, ori_vid, label 38 | 39 | 40 | def arg_parse(): 41 | parser = argparse.ArgumentParser(description='') 42 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 43 | parser.add_argument('--batch_size', type=int, default=4, metavar='N', 44 | help='input batch size for reference (default: 16)') 45 | parser.add_argument('--attack_method', type=str, default='ILAF', help='') 46 | parser.add_argument('--opt_path', type=str, default='') 47 | # adv path 48 | parser.add_argument('--used_adv', type=str, default='', help='') 49 | parser.add_argument('--used_ori', type=str, default='', help='') 50 | # white-box model 51 | parser.add_argument('--white_model', type=str, default='i3d_resnet101', help='i3d_resnet101 | slowfast_resnet101 | tpn_resnet101') 52 | parser.add_argument('--dataset', type=str, default='Kinetics-400', help='Kinetics-400 | UCF-101') 53 | args = parser.parse_args() 54 | return args 55 | 56 | if __name__ == '__main__': 57 | args = arg_parse() 58 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 59 | print (args) 60 | 
# loading cfg. 61 | cfg_path = CONFIG_PATHS[args.white_model] 62 | cfg = get_cfg_custom(cfg_path, args.batch_size) 63 | model = get_model(cfg) 64 | if args.dataset == 'UCF-101': 65 | ckpt_path = MODEL_TO_CKPTS[args.white_model] 66 | model.load_state_dict(torch.load(ckpt_path)['state_dict']) 67 | model.cuda() 68 | 69 | # loading dataset. 70 | dataset = AdvDataset(used_adv_path = args.used_adv, used_ori_path=args.used_ori) 71 | 72 | attack_method = getattr(image_attacks, args.attack_method)(model, args.white_model) 73 | for step in range(len(dataset)): 74 | if step %1 == 0: 75 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset))) 76 | # val_batch, val_label = generate_batch(files_batch[step]) 77 | val_batch, ori_batch, val_label = dataset[step] 78 | video_names = ['...'] 79 | adv_batches = attack_method(val_batch, ori_batch, val_label, video_names) 80 | for ind,label in enumerate(val_label): 81 | adv = adv_batches[ind].detach().cpu().numpy() 82 | np.save(os.path.join(args.opt_path, '{}-adv'.format(label.item())), adv) -------------------------------------------------------------------------------- /image_main_ucf101.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | 6 | import math 7 | import json 8 | 9 | import image_attacks 10 | from dataset_ucf101 import attack_genearte_dataeset 11 | from gluoncv.torch.model_zoo import get_model 12 | from utils import CONFIG_PATHS, get_cfg_custom, OPT_PATH 13 | import pickle as pkl 14 | 15 | def arg_parse(): 16 | parser = argparse.ArgumentParser(description='') 17 | # parallel run 18 | parser.add_argument('--batch_nums', type=int, default=1) 19 | parser.add_argument('--batch_index', type=int, default=1) 20 | 21 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 22 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 23 | parser.add_argument('--batch_size', type=int, default=1, metavar='N', 24 | help='input batch size for reference (default: 16)') 25 | parser.add_argument('--attack_method', type=str, default='ImageGuidedAttentionMap', help='') 26 | parser.add_argument('--step', type=int, default=10, metavar='N', 27 | help='Multi-step or One-step in TI and SGM.') 28 | parser.add_argument('--file_prefix', type=str, default='') 29 | 30 | # for std 31 | parser.add_argument('--depth', type=int, default=1, help='1,2,3,4') 32 | parser.add_argument('--lamb', type=float, default=0.1, help='') 33 | 34 | parser.add_argument('--mode', type=str, default='direction', help='diff_norm\direction') 35 | parser.add_argument('--step_size', type=float, default=0.004, help='') 36 | 37 | # for dropout 38 | parser.add_argument('--dropout', type=float, default=0.1, help='') 39 | 40 | # for direction with changing image model 41 | parser.add_argument('--direction_image_model', type=str, default='resnet', help='resnet, densenet, squeezenet, vgg, alexnet') 42 | args = parser.parse_args() 43 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format('Image', args.attack_method, args.step, args.file_prefix)) 44 | if not os.path.exists(args.adv_path): 45 | os.makedirs(args.adv_path) 46 | return args 47 | 48 | if __name__ == '__main__': 49 | args = arg_parse() 50 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 51 | print (args) 52 | 53 | # loading dataset and model. 
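# NOTE (inferred from dataset_ucf101.py later in this listing, not stated explicitly here):
# attack_genearte_dataeset(batch_size) returns a DataLoader over the pre-selected UCF-101 clips
# recorded in used_idxs.pkl. Each batch is a tuple (clip, label), where clip is a float tensor of
# shape [batch_size, 3, 32, 224, 224] (32 frames via LoopPadding(32), 224x224 crops, ImageNet
# mean/std normalization from test_transform()) and label is the integer class index.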
54 | dataset_loader = attack_genearte_dataeset(args.batch_size) 55 | 56 | nums_contained = int(400 / args.batch_nums) 57 | left = (args.batch_index-1) * nums_contained 58 | right = args.batch_index * nums_contained 59 | 60 | # attack 61 | if args.attack_method == 'ImageGuidedStd_Adam': 62 | model_name_lists = [args.direction_image_model] 63 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 64 | elif args.attack_method == 'ImageGuidedFMDirection_Adam': 65 | model_name_lists = [args.direction_image_model] 66 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 67 | elif args.attack_method == 'ImageGuidedFML2_Adam_MultiModels': 68 | model_name_lists = ['resnet', 'vgg', 'squeezenet', 'alexnet'] 69 | depths = { 70 | 'resnet':2, 71 | 'vgg':3, 72 | 'squeezenet':2, 73 | 'alexnet':3 74 | } 75 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depths=depths, steps=args.step) 76 | 77 | for step, data in enumerate(dataset_loader): 78 | if step >= left and step < right: 79 | if step %1 == 0: 80 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 81 | val_batch = data[0] 82 | val_label = data[1] 83 | video_names = str(val_label) 84 | # np.save(os.path.join(args.adv_path, '{}-ori'.format(val_label[0].item())), val_batch[0].detach().cpu().numpy()) 85 | adv_batches = attack_method(val_batch, val_label, video_names) 86 | for ind,label in enumerate(val_label): 87 | adv = adv_batches[ind].detach().cpu().numpy() 88 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 89 | 90 | with open(os.path.join(args.adv_path, 'loss_info_{}.json'.format(args.batch_index)), 'w') as opt: 91 | json.dump(attack_method.loss_info, opt) 92 | -------------------------------------------------------------------------------- /image_main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | 6 | import math 7 | import json 8 | 9 | import image_attacks 10 | from datasets import get_dataset 11 | from gluoncv.torch.model_zoo import get_model 12 | from utils import CONFIG_PATHS, OPT_PATH, get_cfg_custom 13 | import pickle as pkl 14 | 15 | def arg_parse(): 16 | parser = argparse.ArgumentParser(description='') 17 | # parallel run 18 | parser.add_argument('--batch_nums', type=int, default=1) 19 | parser.add_argument('--batch_index', type=int, default=1) 20 | 21 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 22 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 23 | parser.add_argument('--batch_size', type=int, default=1, metavar='N', 24 | help='input batch size for reference (default: 16)') 25 | parser.add_argument('--attack_method', type=str, default='ImageGuidedAttentionMap', help='') 26 | # parser.add_argument('--step', type=int, default=60, metavar='N', 27 | # help='Multi-step or One-step in TI and SGM.') 28 | parser.add_argument('--step', type=int, default=60, metavar='N', 29 | help='Multi-step or One-step in TI and SGM.') 30 | parser.add_argument('--file_prefix', type=str, default='') 31 | 32 | # for std 33 | parser.add_argument('--depth', type=int, default=1, help='1,2,3,4') 34 | parser.add_argument('--lamb', type=float, default=0.1, help='') 35 | 36 | parser.add_argument('--mode', type=str, 
default='direction', help='diff_norm\direction') 37 | parser.add_argument('--step_size', type=float, default=0.004, help='') 38 | 39 | # for dropout 40 | parser.add_argument('--dropout', type=float, default=0.1, help='') 41 | 42 | # for direction with changing image model 43 | parser.add_argument('--direction_image_model', type=str, default='resnet', help='resnet, densenet, squeezenet, vgg, alexnet') 44 | args = parser.parse_args() 45 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format('Image', args.attack_method, args.step, args.file_prefix)) 46 | if not os.path.exists(args.adv_path): 47 | os.makedirs(args.adv_path) 48 | return args 49 | 50 | if __name__ == '__main__': 51 | args = arg_parse() 52 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 53 | print (args) 54 | # loading cfg. 55 | cfg_path = CONFIG_PATHS['i3d_resnet101'] 56 | cfg = get_cfg_custom(cfg_path, args.batch_size) 57 | 58 | # loading dataset and model. 59 | dataset_loader = get_dataset(cfg) 60 | 61 | nums_contained = int(400 / args.batch_nums) 62 | left = (args.batch_index-1) * nums_contained 63 | right = args.batch_index * nums_contained 64 | 65 | # attack ImageGuidedStd_Adam ImageGuidedFMDirection_Adam ImageGuidedFML2_Adam_MultiModels ENS_FT_I2V ILAF 66 | if args.attack_method == 'ImageGuidedStd_Adam': 67 | model_name_lists = [args.direction_image_model] 68 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 69 | elif args.attack_method == 'ImageGuidedFMDirection_Adam': 70 | model_name_lists = [args.direction_image_model] 71 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 72 | elif args.attack_method == 'ImageGuidedFML2_Adam_MultiModels': 73 | model_name_lists = ['resnet', 'vgg', 'squeezenet', 'alexnet'] 74 | depths = { 75 | 'resnet':2, 76 | 'vgg':3, 77 | 'squeezenet':2, 78 | 'alexnet':3 79 | } 80 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depths=depths) 81 | 82 | for step, data in enumerate(dataset_loader): 83 | if step >= left and step < right: 84 | if step %1 == 0: 85 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 86 | val_batch = data[0] 87 | val_label = data[1] 88 | video_names = data[2] 89 | adv_batches = attack_method(val_batch, val_label, video_names) 90 | for ind,label in enumerate(val_label): 91 | adv = adv_batches[ind].detach().cpu().numpy() 92 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 93 | 94 | with open(os.path.join(args.adv_path, 'loss_info_{}.json'.format(args.batch_index)), 'w') as opt: 95 | json.dump(attack_method.loss_info, opt) 96 | -------------------------------------------------------------------------------- /dataset_ucf101.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | from PIL import Image 3 | import os 4 | import math 5 | import functools 6 | import json 7 | import copy 8 | import numpy as np 9 | from transforms_ucf101 import * 10 | import pickle as pkl 11 | from utils import UCF_IMAGE_ROOT 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | 
except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | def get_default_video_loader(): 49 | image_loader = get_default_image_loader() 50 | return functools.partial(video_loader, image_loader=image_loader) 51 | 52 | class attack_ucf101(data.Dataset): 53 | def __init__(self, spatial_transform=None, temporal_transform=None,get_loader=get_default_video_loader): 54 | setting='./test01_setting.txt' 55 | self.clips = self._make_dataset(setting) 56 | self.spatial_transform = spatial_transform 57 | self.temporal_transform = temporal_transform 58 | self.loader = get_loader() 59 | with open('./used_idxs.pkl', 'rb') as ipt: 60 | used_idxs = pkl.load(ipt) 61 | self.new_clips = [] 62 | for i in used_idxs: 63 | self.new_clips.append(self.clips[i]) 64 | self.clips = self.new_clips 65 | print ('length', len(self.clips)) 66 | def __getitem__(self, index): 67 | directory, duration, target = self.clips[index] 68 | frame_indices = list(range(1, duration + 1)) 69 | 70 | if self.temporal_transform is not None: 71 | frame_indices = self.temporal_transform(frame_indices) 72 | 73 | clip = self.loader(directory, frame_indices) 74 | 75 | if self.spatial_transform is not None: 76 | self.spatial_transform.randomize_parameters() 77 | clip = [self.spatial_transform(img) for img in clip] 78 | 79 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 80 | 81 | return clip, target 82 | 83 | def _make_dataset(self, setting): 84 | if not os.path.exists(setting): 85 | raise(RuntimeError("Setting file %s doesn't exist. Check opt.train-list and opt.val-list. " % (setting))) 86 | clips = [] 87 | with open(setting) as split_f: 88 | data = split_f.readlines() 89 | for line in data: 90 | line_info = line.split() 91 | # line format: video_path, video_duration, video_label 92 | if len(line_info) < 3: 93 | raise(RuntimeError('Video input format is not correct, missing one or more element. 
%s' % line)) 94 | clip_path = os.path.join(UCF_IMAGE_ROOT, line_info[0]) 95 | duration = int(line_info[1]) 96 | target = int(line_info[2]) 97 | item = (clip_path, duration, target) 98 | clips.append(item) 99 | return clips 100 | 101 | def __len__(self): 102 | return len(self.clips) 103 | 104 | def attack_genearte_dataeset(test_batch_size): 105 | test_spa_trans, test_temp_trans = test_transform() 106 | test_dataset = attack_ucf101(spatial_transform=test_spa_trans, temporal_transform=test_temp_trans) 107 | val_loader = torch.utils.data.DataLoader( 108 | test_dataset, batch_size=test_batch_size, shuffle=False, 109 | num_workers=9, pin_memory=True) 110 | 111 | return val_loader 112 | 113 | def test_transform(): 114 | input_size = 224 115 | scale_ratios = '1.0, 0.8' 116 | scale_ratios = [float(i) for i in scale_ratios.split(',')] 117 | default_mean = [0.485, 0.456, 0.406] 118 | default_std = [0.229, 0.224, 0.225] 119 | norm_method = Normalize(default_mean, default_std) 120 | spatial_transform = spatial_Compose([ 121 | Scale(int(input_size / 1.0)), 122 | CornerCrop(input_size, 'c'), 123 | ToTensor(), norm_method 124 | ]) 125 | temporal_transform = LoopPadding(32) 126 | return spatial_transform, temporal_transform -------------------------------------------------------------------------------- /reference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | import json 6 | 7 | import torch 8 | from gluoncv.torch.model_zoo import get_model 9 | from utils import CONFIG_PATHS, get_cfg_custom, AverageMeter, OPT_PATH 10 | from datasets import get_dataset 11 | import argparse 12 | import math 13 | 14 | def arg_parse(): 15 | parser = argparse.ArgumentParser(description='') 16 | parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 17 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 18 | parser.add_argument('--batch_size', type=int, default=16, metavar='N', 19 | help='input batch size for reference (default: 16)') 20 | args = parser.parse_args() 21 | if 'DATACENTER' in args.adv_path: 22 | pass 23 | else: 24 | args.adv_path = os.path.join(OPT_PATH, args.adv_path) 25 | args.adv_path = os.path.join(OPT_PATH, args.adv_path) 26 | return args 27 | 28 | def accuracy(output, target): 29 | batch_size = target.size(0) 30 | 31 | _, pred = output.topk(1, 1, True, True) 32 | pred = pred.t() # batch_size, 1 33 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 34 | 35 | correct_k = correct[:1].view(-1).float().sum(0) 36 | return correct_k.mul_(100.0 / batch_size), torch.squeeze(pred) 37 | 38 | def generate_batch(batch_files): 39 | batches = [] 40 | labels = [] 41 | for file in batch_files: 42 | batches.append(torch.from_numpy(np.load(os.path.join(args.adv_path, file))).cuda()) 43 | labels.append(int(file.split('-')[0])) 44 | labels = np.array(labels).astype(np.int32) 45 | labels = torch.from_numpy(labels) 46 | return torch.stack(batches), labels 47 | 48 | def reference(model, files_batch): 49 | data_time = AverageMeter() 50 | top1 = AverageMeter() 51 | batch_time = AverageMeter() 52 | 53 | predictions = [] 54 | labels = [] 55 | 56 | end = time.time() 57 | with torch.no_grad(): 58 | for step, batch in enumerate(files_batch): 59 | data_time.update(time.time() - end) 60 | val_batch, val_label = generate_batch(batch) 61 | 62 | val_batch = val_batch.cuda() 63 | val_label = val_label.cuda() 64 | 65 | batch_size = val_label.size(0) 66 | 
outputs = model(val_batch) 67 | 68 | prec1a, preds = accuracy(outputs.data, val_label) 69 | 70 | predictions += list(preds.cpu().numpy()) 71 | labels += list(val_label.cpu().numpy()) 72 | 73 | top1.update(prec1a.item(), val_batch.size(0)) 74 | batch_time.update(time.time() - end) 75 | end = time.time() 76 | 77 | if step % 5 == 0: 78 | print('----validation----') 79 | print_string = 'Process: [{0}/{1}]'.format(step + 1, len(files_batch)) 80 | print(print_string) 81 | print_string = 'data_time: {data_time:.3f}, batch time: {batch_time:.3f}'.format( 82 | data_time=data_time.val, 83 | batch_time=batch_time.val) 84 | print(print_string) 85 | print_string = 'top-1 accuracy: {top1_acc:.2f}%'.format(top1_acc = top1.avg) 86 | print (print_string) 87 | return predictions, labels, top1.avg 88 | 89 | if __name__ == '__main__': 90 | global args 91 | args = arg_parse() 92 | 93 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 94 | 95 | # loading adversarial examples. 96 | files = os.listdir(args.adv_path) 97 | files = [i for i in files if 'adv' in i] 98 | 99 | batch_times = math.ceil(len(files) / args.batch_size) 100 | files_batch = [] 101 | for i in range(batch_times): 102 | batch = files[i*args.batch_size: min((i+1)*args.batch_size, len(files))] 103 | files_batch.append(batch) 104 | 105 | model_val_acc = {} 106 | info_df = pd.DataFrame() 107 | info_df['gt_label'] = [i for i in range(400)] 108 | for model_name in CONFIG_PATHS.keys(): 109 | print ('Model-{}:'.format(model_name)) 110 | cfg_path = CONFIG_PATHS[model_name] 111 | cfg = get_cfg_custom(cfg_path) 112 | model = get_model(cfg).cuda() 113 | model.eval() 114 | preds, labels, top1_avg = reference(model, files_batch) 115 | 116 | predd = np.zeros_like(preds) 117 | inds = np.argsort(labels) 118 | for i,ind in enumerate(inds): 119 | predd[ind] = preds[i] 120 | 121 | print (args.adv_path) 122 | info_df['{}-pre'.format(model_name)] = predd 123 | model_val_acc[model_name] = top1_avg 124 | del model 125 | torch.cuda.empty_cache() 126 | 127 | info_df.to_csv(os.path.join(args.adv_path, 'results_all_models_prediction.csv'), index=False) 128 | with open(os.path.join(args.adv_path, 'top1_acc_all_models.json'), 'w') as opt: 129 | json.dump(model_val_acc, opt) 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /attack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | import math 6 | 7 | import base_attacks 8 | import video_attacks 9 | from datasets import get_dataset 10 | from gluoncv.torch.model_zoo import get_model 11 | from utils import CONFIG_PATHS, OPT_PATH, get_cfg_custom 12 | 13 | def arg_parse(): 14 | parser = argparse.ArgumentParser(description='') 15 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 16 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 17 | parser.add_argument('--batch_size', type=int, default=4, metavar='N', 18 | help='input batch size for reference (default: 16)') 19 | parser.add_argument('--model', type=str, default='i3d_resnet101', help='i3d_resnet101 | i3d_slow_resnet101 | slowfast_resnet101 | tpn_resnet101.') 20 | parser.add_argument('--attack_method', type=str, default='TemporalAugmentationMomentum', help='FGSM | BIM | MIFGSM | DIFGSM | TIFGSM | SGM') 21 | parser.add_argument('--attack_type', type=str, default='image', help='image | video') 22 | parser.add_argument('--step', type=int, 
default=10, metavar='N', 23 | help='Multi-step or One-step in TI and SGM.') 24 | parser.add_argument('--sf_frame', type=int, default=32, metavar='N', 25 | help='SFFGSM frame.') 26 | parser.add_argument('--cf_frame', type=str, default='small', metavar='N', 27 | help='CFFGSM frame.') 28 | parser.add_argument('--kernlen', type=int, default=15, metavar='N', 29 | help='SFFGSM frame.') 30 | parser.add_argument('--nsig', type=int, default=3, metavar='N', 31 | help='SFFGSM frame.') 32 | parser.add_argument('--file_prefix', type=str, default='') 33 | parser.add_argument('--kernel_mode', type=str, default='gaussian') 34 | parser.add_argument('--iterative_momentum', action='store_true', default=False, help='Use iterative momentum in MFFGSM.') 35 | parser.add_argument('--frame_conv', action='store_true', default=False, help='Use frame_conv in MFFGSM.') 36 | # for TemporalAugmentationMomentum 37 | parser.add_argument('--augmentation_weight', type=float, default=1.0, help='') 38 | parser.add_argument('--frame_momentum', action='store_true', default=False, help='') 39 | parser.add_argument('--gamma', type=float, default=1.0, help='') 40 | # for combine momentum 41 | parser.add_argument('--no_iterative_momentum', action='store_true', default=False, help='') 42 | parser.add_argument('--weight_add', action='store_true', default=False, help='') 43 | parser.add_argument('--momentum_weight', type=float, default=0.5, help='') 44 | parser.add_argument('--iterative_first', action='store_true', default=False, help='') 45 | # for TemporalAugmentation 46 | parser.add_argument('--translation_invariant', action='store_true', default=False, help='') 47 | parser.add_argument('--temporal_augmentation', action='store_true', default=False, help='') 48 | parser.add_argument('--TI_First', action='store_true', default=False, help='') 49 | # for noise and shuffle 50 | parser.add_argument('--noise', action='store_true', default=False, help='') 51 | parser.add_argument('--shuffle_grads', action='store_true', default=False, help='') 52 | # for cycle move 53 | parser.add_argument('--move_type', type=str, default='adj',help='adj | large | random') 54 | args = parser.parse_args() 55 | if args.attack_type == 'video': 56 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, args.file_prefix)) 57 | elif args.attack_type == 'image': 58 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, args.file_prefix)) 59 | if not os.path.exists(args.adv_path): 60 | os.makedirs(args.adv_path) 61 | return args 62 | 63 | if __name__ == '__main__': 64 | args = arg_parse() 65 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 66 | print (args) 67 | # loading cfg. 68 | cfg_path = CONFIG_PATHS[args.model] 69 | cfg = get_cfg_custom(cfg_path, args.batch_size) 70 | 71 | # loading dataset and model. 
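# NOTE (datasets.py is not included in this listing; the following is inferred from utils.py and from
# how the loader is used here and in image_main.py): get_dataset(cfg) iterates over the 400 correctly
# classified Kinetics-400 samples listed in kinetics400_attack_samples.csv. Each batch provides the
# video tensor (shape [batch, 3, 32, H, W]) as data[0] and the ground-truth labels as data[1];
# image_main.py additionally reads the video names from data[2].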
72 | dataset_loader = get_dataset(cfg) 73 | model = get_model(cfg).cuda() 74 | 75 | # attack 76 | if args.attack_type == 'image': 77 | attack_method = getattr(base_attacks, args.attack_method)(model, steps=args.step) 78 | elif args.attack_type == 'video': 79 | if args.attack_method == 'TemporalTranslation': 80 | spe_params = {'kernlen':args.kernlen, 'momentum':args.iterative_momentum, 'weight':args.augmentation_weight, 'move_type':args.move_type, 'kernel_mode':args.kernel_mode} 81 | print ('Used Params') 82 | print (spe_params) 83 | attack_method = getattr(video_attacks, args.attack_method)(model, params=spe_params, steps=args.step) 84 | 85 | for step, data in enumerate(dataset_loader): 86 | if step %1 == 0: 87 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 88 | val_batch = data[0].cuda() 89 | val_label = data[1].cuda() 90 | adv_batches = attack_method(val_batch, val_label) 91 | val_batch = val_batch.detach() 92 | for ind,label in enumerate(val_label): 93 | ori = val_batch[ind].cpu().numpy() 94 | adv = adv_batches[ind].cpu().numpy() 95 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 96 | np.save(os.path.join(args.adv_path, '{}-ori'.format(label.item())), ori) 97 | -------------------------------------------------------------------------------- /I2V_attack-env.yml: -------------------------------------------------------------------------------- 1 | name: transfer 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=4.5=1_gnu 8 | - argcomplete=1.12.3=pyhd3eb1b0_0 9 | - argon2-cffi=20.1.0=py37h27cfd23_1 10 | - async_generator=1.10=py37h28b3542_0 11 | - attrs=21.2.0=pyhd3eb1b0_0 12 | - backcall=0.2.0=pyhd3eb1b0_0 13 | - blas=1.0=mkl 14 | - bleach=4.0.0=pyhd3eb1b0_0 15 | - bzip2=1.0.8=h7b6447c_0 16 | - ca-certificates=2021.7.5=h06a4308_1 17 | - certifi=2021.5.30=py37h06a4308_0 18 | - cffi=1.14.6=py37h400218f_0 19 | - cudatoolkit=10.2.89=hfd86e86_1 20 | - dbus=1.13.18=hb2f20db_0 21 | - debugpy=1.4.1=py37h295c915_0 22 | - decorator=5.0.9=pyhd3eb1b0_0 23 | - defusedxml=0.7.1=pyhd3eb1b0_0 24 | - entrypoints=0.3=py37_0 25 | - expat=2.4.1=h2531618_2 26 | - ffmpeg=4.3=hf484d3e_0 27 | - fontconfig=2.13.1=h6c09931_0 28 | - freetype=2.10.4=h5ab3b9f_0 29 | - glib=2.69.1=h5202010_0 30 | - gmp=6.2.1=h2531618_2 31 | - gnutls=3.6.15=he1e5248_0 32 | - gst-plugins-base=1.14.0=h8213a91_2 33 | - gstreamer=1.14.0=h28cd5cc_2 34 | - icu=58.2=he6710b0_3 35 | - importlib-metadata=4.8.1=py37h06a4308_0 36 | - importlib_metadata=4.8.1=hd3eb1b0_0 37 | - intel-openmp=2021.3.0=h06a4308_3350 38 | - ipykernel=6.2.0=py37h06a4308_1 39 | - ipython=7.27.0=py37hb070fc8_0 40 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 41 | - ipywidgets=7.6.3=pyhd3eb1b0_1 42 | - jedi=0.18.0=py37h06a4308_1 43 | - jinja2=3.0.1=pyhd3eb1b0_0 44 | - jpeg=9b=h024ee3a_2 45 | - jsonschema=3.2.0=pyhd3eb1b0_2 46 | - jupyter=1.0.0=py37_7 47 | - jupyter_client=7.0.1=pyhd3eb1b0_0 48 | - jupyter_console=6.4.0=pyhd3eb1b0_0 49 | - jupyter_core=4.7.1=py37h06a4308_0 50 | - jupyterlab_pygments=0.1.2=py_0 51 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 52 | - lame=3.100=h7b6447c_0 53 | - lcms2=2.12=h3be6417_0 54 | - ld_impl_linux-64=2.35.1=h7274673_9 55 | - libffi=3.3=he6710b0_2 56 | - libgcc-ng=9.3.0=h5101ec6_17 57 | - libgomp=9.3.0=h5101ec6_17 58 | - libiconv=1.15=h63c8f33_5 59 | - libidn2=2.3.2=h7f8727e_0 60 | - libpng=1.6.37=hbc83047_0 61 | - libsodium=1.0.18=h7b6447c_0 62 | - libstdcxx-ng=9.3.0=hd4cf53a_17 63 | - libtasn1=4.16.0=h27cfd23_0 64 | - 
libtiff=4.2.0=h85742a9_0 65 | - libunistring=0.9.10=h27cfd23_0 66 | - libuuid=1.0.3=h1bed415_2 67 | - libuv=1.40.0=h7b6447c_0 68 | - libwebp-base=1.2.0=h27cfd23_0 69 | - libxcb=1.14=h7b6447c_0 70 | - libxml2=2.9.12=h03d6c58_0 71 | - lz4-c=1.9.3=h295c915_1 72 | - markupsafe=2.0.1=py37h27cfd23_0 73 | - matplotlib-inline=0.1.2=pyhd3eb1b0_2 74 | - mistune=0.8.4=py37h14c3975_1001 75 | - mkl=2021.3.0=h06a4308_520 76 | - mkl-service=2.4.0=py37h7f8727e_0 77 | - mkl_fft=1.3.0=py37h42c9631_2 78 | - mkl_random=1.2.2=py37h51133e4_0 79 | - nbclient=0.5.3=pyhd3eb1b0_0 80 | - nbconvert=6.1.0=py37h06a4308_0 81 | - nbformat=5.1.3=pyhd3eb1b0_0 82 | - ncurses=6.2=he6710b0_1 83 | - nest-asyncio=1.5.1=pyhd3eb1b0_0 84 | - nettle=3.7.3=hbbd107a_1 85 | - ninja=1.10.2=hff7bd54_1 86 | - notebook=6.4.3=py37h06a4308_0 87 | - numpy=1.20.3=py37hf144106_0 88 | - numpy-base=1.20.3=py37h74d4b33_0 89 | - olefile=0.46=py37_0 90 | - openh264=2.1.0=hd408876_0 91 | - openjpeg=2.4.0=h3ad879b_0 92 | - openssl=1.1.1l=h7f8727e_0 93 | - packaging=21.0=pyhd3eb1b0_0 94 | - pandocfilters=1.4.3=py37h06a4308_1 95 | - parso=0.8.2=pyhd3eb1b0_0 96 | - pcre=8.45=h295c915_0 97 | - pexpect=4.8.0=pyhd3eb1b0_3 98 | - pickleshare=0.7.5=pyhd3eb1b0_1003 99 | - pillow=8.3.1=py37h2c7a002_0 100 | - pip=21.0.1=py37h06a4308_0 101 | - prometheus_client=0.11.0=pyhd3eb1b0_0 102 | - prompt-toolkit=3.0.17=pyhca03da5_0 103 | - prompt_toolkit=3.0.17=hd3eb1b0_0 104 | - ptyprocess=0.7.0=pyhd3eb1b0_2 105 | - pycparser=2.20=py_2 106 | - pygments=2.10.0=pyhd3eb1b0_0 107 | - pyparsing=2.4.7=pyhd3eb1b0_0 108 | - pyqt=5.9.2=py37h05f1152_2 109 | - pyrsistent=0.17.3=py37h7b6447c_0 110 | - python=3.7.11=h12debd9_0 111 | - python-dateutil=2.8.2=pyhd3eb1b0_0 112 | - pytorch=1.9.1=py3.7_cuda10.2_cudnn7.6.5_0 113 | - pyzmq=22.2.1=py37h295c915_1 114 | - qt=5.9.7=h5867ecd_1 115 | - qtconsole=5.1.0=pyhd3eb1b0_0 116 | - qtpy=1.10.0=pyhd3eb1b0_0 117 | - readline=8.1=h27cfd23_0 118 | - send2trash=1.5.0=pyhd3eb1b0_1 119 | - setuptools=58.0.4=py37h06a4308_0 120 | - sip=4.19.8=py37hf484d3e_0 121 | - six=1.16.0=pyhd3eb1b0_0 122 | - sqlite=3.36.0=hc218d9a_0 123 | - terminado=0.9.4=py37h06a4308_0 124 | - testpath=0.5.0=pyhd3eb1b0_0 125 | - tk=8.6.10=hbc83047_0 126 | - torchaudio=0.9.1=py37 127 | - torchvision=0.10.1=py37_cu102 128 | - tornado=6.1=py37h27cfd23_0 129 | - traitlets=5.0.5=pyhd3eb1b0_0 130 | - typing_extensions=3.10.0.2=pyh06a4308_0 131 | - wcwidth=0.2.5=pyhd3eb1b0_0 132 | - webencodings=0.5.1=py37_1 133 | - wheel=0.37.0=pyhd3eb1b0_1 134 | - widgetsnbextension=3.5.1=py37_0 135 | - xz=5.2.5=h7b6447c_0 136 | - zeromq=4.3.4=h2531618_0 137 | - zipp=3.5.0=pyhd3eb1b0_0 138 | - zlib=1.2.11=h7b6447c_3 139 | - zstd=1.4.9=haebb681_0 140 | - pip: 141 | - autocfg==0.0.8 142 | - charset-normalizer==2.0.6 143 | - cycler==0.10.0 144 | - decord==0.6.0 145 | - gluoncv==0.10.4.post4 146 | - idna==3.2 147 | - kiwisolver==1.3.2 148 | - matplotlib==3.4.3 149 | - opencv-contrib-python==4.5.3.56 150 | - pandas==1.3.3 151 | - portalocker==2.3.2 152 | - pytz==2021.1 153 | - pyyaml==5.4.1 154 | - requests==2.26.0 155 | - scipy==1.7.1 156 | - seaborn==0.11.2 157 | - timm==0.5.0 158 | - tqdm==4.62.3 159 | - urllib3==1.26.6 160 | - yacs==0.1.8 161 | prefix: None 162 | -------------------------------------------------------------------------------- /attack_ucf101.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import numpy as np 5 | import math 6 | 7 | import base_attacks 8 | import video_attacks 9 | from 
dataset_ucf101 import attack_genearte_dataeset 10 | from gluoncv.torch.model_zoo import get_model 11 | from utils import CONFIG_PATHS, get_cfg_custom, OPT_PATH 12 | from reference_ucf101 import MODEL_TO_CKPTS 13 | 14 | def arg_parse(): 15 | parser = argparse.ArgumentParser(description='') 16 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 17 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 18 | parser.add_argument('--batch_size', type=int, default=4, metavar='N', 19 | help='input batch size for reference (default: 16)') 20 | parser.add_argument('--model', type=str, default='i3d_resnet101', help='i3d_resnet101 | i3d_slow_resnet101 | slowfast_resnet101 | tpn_resnet101.') 21 | parser.add_argument('--attack_method', type=str, default='TemporalAugmentationMomentum', help='FGSM | BIM | MIFGSM | DIFGSM | TIFGSM | SGM') 22 | parser.add_argument('--attack_type', type=str, default='image', help='image | video') 23 | parser.add_argument('--step', type=int, default=10, metavar='N', 24 | help='Multi-step or One-step in TI and SGM.') 25 | parser.add_argument('--sf_frame', type=int, default=32, metavar='N', 26 | help='SFFGSM frame.') 27 | parser.add_argument('--cf_frame', type=str, default='small', metavar='N', 28 | help='CFFGSM frame.') 29 | parser.add_argument('--kernlen', type=int, default=15, metavar='N', 30 | help='SFFGSM frame.') 31 | parser.add_argument('--nsig', type=int, default=3, metavar='N', 32 | help='SFFGSM frame.') 33 | parser.add_argument('--file_prefix', type=str, default='') 34 | parser.add_argument('--kernel_mode', type=str, default='gaussian') 35 | parser.add_argument('--iterative_momentum', action='store_true', default=False, help='Use iterative momentum in MFFGSM.') 36 | parser.add_argument('--frame_conv', action='store_true', default=False, help='Use frame_conv in MFFGSM.') 37 | # for TemporalAugmentationMomentum 38 | parser.add_argument('--augmentation_weight', type=float, default=1.0, help='') 39 | parser.add_argument('--frame_momentum', action='store_true', default=False, help='') 40 | parser.add_argument('--gamma', type=float, default=1.0, help='') 41 | # for combine momentum 42 | parser.add_argument('--no_iterative_momentum', action='store_true', default=False, help='') 43 | parser.add_argument('--weight_add', action='store_true', default=False, help='') 44 | parser.add_argument('--momentum_weight', type=float, default=0.5, help='') 45 | parser.add_argument('--iterative_first', action='store_true', default=False, help='') 46 | # for TemporalAugmentation 47 | parser.add_argument('--translation_invariant', action='store_true', default=False, help='') 48 | parser.add_argument('--temporal_augmentation', action='store_true', default=False, help='') 49 | parser.add_argument('--TI_First', action='store_true', default=False, help='') 50 | # for noise and shuffle 51 | parser.add_argument('--noise', action='store_true', default=False, help='') 52 | parser.add_argument('--shuffle_grads', action='store_true', default=False, help='') 53 | # for cycle move 54 | parser.add_argument('--move_type', type=str, default='adj',help='adj | large | random') 55 | args = parser.parse_args() 56 | if args.attack_type == 'video': 57 | args.adv_path = os.path.join(OPT_PATH, 'UCF101_Video-{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, args.file_prefix)) 58 | elif args.attack_type == 'image': 59 | args.adv_path = os.path.join(OPT_PATH, 'UCF101_Image-{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, 
args.file_prefix)) 60 | if not os.path.exists(args.adv_path): 61 | os.makedirs(args.adv_path) 62 | return args 63 | 64 | if __name__ == '__main__': 65 | args = arg_parse() 66 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 67 | print (args) 68 | 69 | # loading cfg. 70 | cfg_path = CONFIG_PATHS[args.model] 71 | cfg = get_cfg_custom(cfg_path, args.batch_size) 72 | 73 | # loading dataset and model. 74 | dataset_loader = attack_genearte_dataeset(args.batch_size) 75 | ckpt_path = MODEL_TO_CKPTS[args.model] 76 | model = get_model(cfg) 77 | model.load_state_dict(torch.load(ckpt_path)['state_dict']) 78 | model.cuda() 79 | model.eval() 80 | 81 | # attack 82 | if args.attack_type == 'image': 83 | # FGSM, BIM, MIFGSM, DIFGSM, TIFGSM, SGM, SIM 84 | attack_method = getattr(base_attacks, args.attack_method)(model, steps=args.step) 85 | elif args.attack_type == 'video': 86 | if args.attack_method == 'TemporalTranslation': 87 | spe_params = {'kernlen':15, 'momentum':False, 'weight':1.0, 'move_type':'adj', 'kernel_mode':'gaussian'} 88 | print ('Used Params') 89 | print (spe_params) 90 | attack_method = getattr(video_attacks, args.attack_method)(model, params=spe_params, steps=args.step) 91 | 92 | for step, data in enumerate(dataset_loader): 93 | if step %1 == 0: 94 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 95 | val_batch = data[0].cuda() 96 | val_label = data[1].cuda() 97 | adv_batches = attack_method(val_batch, val_label) 98 | val_batch = val_batch.detach() 99 | for ind,label in enumerate(val_label): 100 | ori = val_batch[ind].cpu().numpy() 101 | adv = adv_batches[ind].cpu().numpy() 102 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 103 | np.save(os.path.join(args.adv_path, '{}-ori'.format(label.item())), ori) 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 8 | 9 |
10 | 11 | # Adaptive Cross-Modal Transferable Adversarial Attacks From Images to Videos
12 | 13 | **IEEE Transactions on Pattern Analysis and Machine Intelligence ( Volume: 46, Issue: 5, May 2024)** 14 | 15 | **[Zhipeng Wei](https://zhipeng-wei.github.io/), [Jingjing Chen](https://fvl.fudan.edu.cn/people/jingjingchen), [Zuxuan Wu](https://zxwu.azurewebsites.net/), [Yu-Gang Jiang](https://fvl.fudan.edu.cn/people/yugangjiang/)** 16 |
17 | 18 | If you use our method for attacks in your research, please consider citing 19 | ``` 20 | @inproceedings{wei2022cross, 21 | title={Cross-Modal Transferable Adversarial Attacks from Images to Videos}, 22 | author={Wei, Zhipeng and Chen, Jingjing and Wu, Zuxuan and Jiang, Yu-Gang}, 23 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 24 | pages={15064--15073}, 25 | year={2022} 26 | } 27 | @ARTICLE{10375740, 28 | author={Wei, Zhipeng and Chen, Jingjing and Wu, Zuxuan and Jiang, Yu-Gang}, 29 | journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 30 | title={Adaptive Cross-Modal Transferable Adversarial Attacks From Images to Videos}, 31 | year={2024}, 32 | volume={46}, 33 | number={5}, 34 | pages={3772-3783}, 35 | keywords={Videos;Adaptation models;Perturbation methods;Feature extraction;Computational modeling;Glass box;Closed box;Cross-modal attack;transferable attack}, 36 | doi={10.1109/TPAMI.2023.3347835}} 37 | ``` 38 | 39 | # Introduction 40 | We propose the Image To Video (I2V) attack (CVPR paper), which generates adversarial video examples by optimizing against pretrained image models to deceive video models. Specifically, I2V reduces the cosine similarity between adversarial and benign features in the intermediate layers of image models for each video frame. 41 | Moreover, I2V can easily be extended to simultaneously perturb multi-layer features extracted from an ensemble of image models (TPAMI paper). To integrate the various features efficiently, we introduce an adaptive approach that re-weights the contribution of each layer based on its cosine similarity values from the previous attack step. 42 | 43 | # Python Environment 44 | We provide I2V_attack-env.yml to recreate the environment we used. 45 | ``` 46 | conda env create -f I2V_attack-env.yml 47 | ``` 48 | The GPU environment: 49 | ``` 50 | NVIDIA GeForce RTX 2080TI 51 | NVIDIA-SMI 430.14 Driver Version: 430.14 CUDA Version: 10.2 52 | ``` 53 | 54 | # Prepare Model and Dataset 55 | ### Video model 56 | For Kinetics-400, download the config files from [gluon](https://cv.gluon.ai/model_zoo/action_recognition.html). The models include i3d_nl5_resnet50_v1_kinetics400, i3d_nl5_resnet101_v1_kinetics400, slowfast_8x8_resnet50_kinetics400, slowfast_8x8_resnet101_kinetics400, tpn_resnet50_f32s2_kinetics400, tpn_resnet101_f32s2_kinetics400. 57 | After that, set CONFIG_ROOT in utils.py to the directory that holds these config files. We use the Kinetics-400 pretrained models from gluon in our experiments. 58 | 59 | For UCF-101, we fine-tune these models on UCF-101. Download the checkpoint files from [here](https://drive.google.com/open?id=10KOlWdi5bsV9001uL4Bn1T48m9hkgsZ2&authuser=weizhipeng1226%40gmail.com&usp=drive_fs) and set UCF_CKPT_PATH in utils.py accordingly. 60 | 61 | 62 | ### Dataset 63 | Download the Kinetics-400 and UCF-101 datasets, and set OPT_PATH in utils.py to specify the output path. 64 | 65 | For Kinetics-400, set cfg.CONFIG.DATA.VAL_DATA_PATH in utils.py to your validation-data path. 66 | 67 | For UCF-101, split the videos into frames and set UCF_IMAGE_ROOT in utils.py to the path of the extracted UCF-101 frames. 68 | 69 | # Run the code 70 | ## Ablation Study and Performance Comparison 71 | Use this command to obtain the results of Table 3, Table 4, Figure 4, and Figure 5. 72 | ```python 73 | python run_image_guided.py --gpu {gpu} 74 | ``` 75 | ## Generation of adversarial examples 76 | Before comparing the proposed ENS-I2V with ILAF, we need to generate adversarial examples with white-box video models.
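The image-type baselines selected through `--attack_method` (FGSM, BIM, MIFGSM, DIFGSM, TIFGSM, SGM) are implemented in base_attacks.py, which is not reproduced in this listing. As a rough, self-contained illustration of what such a baseline does to a video clip, a minimal MI-FGSM-style update could look like the sketch below; the epsilon, step size, and input layout are illustrative assumptions rather than the repository's exact settings.

```python
import torch

def mifgsm_video(model, clip, label, eps=16/255, alpha=2/255, steps=10, mu=1.0):
    """Minimal MI-FGSM sketch; clip is a normalized video tensor of shape [B, C, T, H, W]."""
    loss_fn = torch.nn.CrossEntropyLoss()
    adv = clip.clone().detach()
    momentum = torch.zeros_like(clip)
    for _ in range(steps):
        adv.requires_grad_(True)
        loss = loss_fn(model(adv), label)
        grad = torch.autograd.grad(loss, adv)[0]
        # accumulate the normalized gradient as momentum, then take a signed step
        momentum = mu * momentum + grad / grad.abs().mean(dim=[1, 2, 3, 4], keepdim=True)
        adv = adv.detach() + alpha * momentum.sign()
        # project the perturbation back into the L_inf ball around the original clip
        adv = clip + torch.clamp(adv - clip, min=-eps, max=eps)
    return adv.detach()
```
The actual classes in base_attacks.py may differ in details such as the loss definition, normalization, and clipping to the valid input range.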
77 | 78 | For Kinetics-400, 79 | ```python 80 | python attack.py --gpu {gpu} --model {model} --attack_type image --attack_method {image_method} --step {step} --batch_size {batch_size} 81 | python attack.py --gpu {gpu} --model {model} --attack_type video --attack_method TemporalTranslation --step {step} --batch_size 1 82 | ``` 83 | * model: the white-box model. 84 | * attack_method: an image-based attack such as FGSM, BIM, or MIFGSM. See base_attacks.py for more attacks. 85 | * step: the number of attack iterations. 86 | 87 | For UCF-101, 88 | ```python 89 | python attack_ucf101.py --gpu {gpu} --model {model} --attack_type image --attack_method {image_method} --step {step} --batch_size {batch_size} 90 | python attack_ucf101.py --gpu {gpu} --model {model} --attack_type video --attack_method TemporalTranslation --step {step} --batch_size 1 91 | ``` 92 | 93 | The generated adversarial examples are stored under the OPT_PATH of utils.py and can be passed directly to the "--used_ori" and "--used_adv" parameters of the subsequent commands. 94 | 95 | ## Comparing against Stronger Baselines 96 | Fine-tune existing adversarial examples by: 97 | ```python 98 | python image_fine_tune_attack.py --gpu {gpu} --attack_method ILAF --used_ori {path} --used_adv {path} --opt_path {path} --white_model {model} --dataset {dataset} 99 | ``` 100 | * used_ori: the path of the original examples. 101 | * used_adv: the path of the existing adversarial examples. 102 | * opt_path: the output path. 103 | * white_model: the white-box model. 104 | * dataset: Kinetics-400 or UCF-101. 105 | 106 | Run inference on the generated adversarial examples with: 107 | ```python 108 | # ucf101 reference 109 | python reference_ucf101.py --gpu {gpu} --adv_path {adv_path} 110 | # kinetics reference 111 | python reference.py --gpu {gpu} --adv_path {adv_path} 112 | ``` 113 | * adv_path: the output path of the generated adversarial examples.
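To connect these commands back to the method described in the Introduction: the core of I2V pushes the intermediate image-model features of each adversarial frame away from the benign features by minimizing their cosine similarity. The sketch below is a simplified, single-model illustration rather than the exact code in image_attacks.py (which is not shown in this listing); the torchvision backbone, hooked layer, perturbation budget, and Adam step size are assumptions.

```python
import torch
import torch.nn.functional as F
import torchvision.models as models

def i2v_sketch(frames, eps=16/255, steps=60, lr=0.005):
    """frames: normalized video frames of shape [N, 3, 224, 224], treated as a batch of images."""
    backbone = models.resnet101(pretrained=True).to(frames.device).eval()
    for p in backbone.parameters():
        p.requires_grad_(False)

    feats = {}
    # hook one intermediate layer; layer3 is an assumption (the paper studies different depths)
    backbone.layer3.register_forward_hook(lambda m, i, o: feats.update(value=o))

    with torch.no_grad():
        backbone(frames)
        benign = feats['value'].flatten(1)

    delta = torch.zeros_like(frames, requires_grad=True)
    optimizer = torch.optim.Adam([delta], lr=lr)
    for _ in range(steps):
        backbone(frames + delta)
        adv = feats['value'].flatten(1)
        # push adversarial features away from benign ones by minimizing cosine similarity
        loss = F.cosine_similarity(adv, benign, dim=1).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # keep the perturbation within the L_inf budget
        delta.data.clamp_(-eps, eps)
    return (frames + delta).detach()
```
The ENS and adaptive variants apply the same cosine-similarity loss to multiple layers of multiple image models and re-weight each term using the cosine similarity values from the previous attack step, as described in the Introduction.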
114 | -------------------------------------------------------------------------------- /reference_ucf101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | import json 6 | 7 | import torch 8 | from gluoncv.torch.engine.config import get_cfg_defaults 9 | from gluoncv.torch.model_zoo import get_model 10 | from utils import AverageMeter, OPT_PATH, CONFIG_ROOT, UCF_CKPT_PATH 11 | from datasets import get_dataset 12 | import argparse 13 | import math 14 | 15 | CONFIG_PATHS = { 16 | 'i3d_resnet50': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet50_v1_kinetics400.yaml'), 17 | 'i3d_resnet101': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet101_v1_kinetics400.yaml'), 18 | 'slowfast_resnet50': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet50_kinetics400.yaml'), 19 | 'slowfast_resnet101': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet101_kinetics400.yaml'), 20 | 'tpn_resnet50': os.path.join(CONFIG_ROOT, 'tpn_resnet50_f32s2_kinetics400.yaml'), 21 | 'tpn_resnet101': os.path.join(CONFIG_ROOT, 'tpn_resnet101_f32s2_kinetics400.yaml') 22 | } 23 | 24 | MODEL_TO_CKPTS = { 25 | 'i3d_resnet50': os.path.join(UCF_CKPT_PATH, 'i3d_resnet50.pth'), 26 | 'i3d_resnet101': os.path.join(UCF_CKPT_PATH, 'i3d_resnet101.pth'), 27 | 'slowfast_resnet50': os.path.join(UCF_CKPT_PATH, 'slowfast_resnet50.pth'), 28 | 'slowfast_resnet101': os.path.join(UCF_CKPT_PATH, 'slowfast_resnet101.pth'), 29 | 'tpn_resnet50': os.path.join(UCF_CKPT_PATH, 'tpn_resnet50.pth'), 30 | 'tpn_resnet101': os.path.join(UCF_CKPT_PATH, 'tpn_resnet101.pth') 31 | } 32 | 33 | def arg_parse(): 34 | parser = argparse.ArgumentParser(description='') 35 | parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 36 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 37 | parser.add_argument('--batch_size', type=int, default=16, metavar='N', 38 | help='input batch size for reference (default: 16)') 39 | args = parser.parse_args() 40 | if 'DATACENTER' in args.adv_path: 41 | pass 42 | else: 43 | args.adv_path = os.path.join(OPT_PATH, args.adv_path) 44 | return args 45 | 46 | def accuracy(output, target): 47 | batch_size = target.size(0) 48 | 49 | _, pred = output.topk(1, 1, True, True) 50 | pred = pred.t() # batch_size, 1 51 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 52 | 53 | correct_k = correct[:1].view(-1).float().sum(0) 54 | return correct_k.mul_(100.0 / batch_size), torch.squeeze(pred) 55 | 56 | def generate_batch(batch_files): 57 | batches = [] 58 | labels = [] 59 | for file in batch_files: 60 | batches.append(torch.from_numpy(np.load(os.path.join(args.adv_path, file))).cuda()) 61 | labels.append(int(file.split('-')[0])) 62 | labels = np.array(labels).astype(np.int32) 63 | labels = torch.from_numpy(labels) 64 | return torch.stack(batches), labels 65 | 66 | def reference(model, files_batch): 67 | data_time = AverageMeter() 68 | top1 = AverageMeter() 69 | batch_time = AverageMeter() 70 | 71 | predictions = [] 72 | labels = [] 73 | 74 | end = time.time() 75 | with torch.no_grad(): 76 | for step, batch in enumerate(files_batch): 77 | data_time.update(time.time() - end) 78 | val_batch, val_label = generate_batch(batch) 79 | 80 | val_batch = val_batch.cuda() 81 | val_label = val_label.cuda() 82 | 83 | batch_size = val_label.size(0) 84 | outputs = model(val_batch) 85 | 86 | prec1a, preds = accuracy(outputs.data, val_label) 87 | 88 | predictions += list(preds.cpu().numpy()) 89 | labels += 
list(val_label.cpu().numpy()) 90 | 91 | top1.update(prec1a.item(), val_batch.size(0)) 92 | batch_time.update(time.time() - end) 93 | end = time.time() 94 | 95 | if step % 5 == 0: 96 | print('----validation----') 97 | print_string = 'Process: [{0}/{1}]'.format(step + 1, len(files_batch)) 98 | print(print_string) 99 | print_string = 'data_time: {data_time:.3f}, batch time: {batch_time:.3f}'.format( 100 | data_time=data_time.val, 101 | batch_time=batch_time.val) 102 | print(print_string) 103 | print_string = 'top-1 accuracy: {top1_acc:.2f}%'.format(top1_acc = top1.avg) 104 | print (print_string) 105 | return predictions, labels, top1.avg 106 | 107 | def load_model(model_name): 108 | cfg = get_cfg_defaults() 109 | cfg_path = CONFIG_PATHS[model_name] 110 | cfg.merge_from_file(cfg_path) 111 | cfg.CONFIG.MODEL.PRETRAINED = False 112 | ckpt_path = MODEL_TO_CKPTS[model_name] 113 | model = get_model(cfg) 114 | model.load_state_dict(torch.load(ckpt_path)['state_dict']) 115 | model.cuda() 116 | model.eval() 117 | return model 118 | 119 | if __name__ == '__main__': 120 | global args 121 | args = arg_parse() 122 | 123 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 124 | 125 | # loading adversarial examples. 126 | files = os.listdir(args.adv_path) 127 | files = [i for i in files if 'adv' in i] 128 | 129 | batch_times = math.ceil(len(files) / args.batch_size) 130 | files_batch = [] 131 | for i in range(batch_times): 132 | batch = files[i*args.batch_size: min((i+1)*args.batch_size, len(files))] 133 | files_batch.append(batch) 134 | 135 | model_val_acc = {} 136 | info_df = pd.DataFrame() 137 | info_df['gt_label'] = [i for i in range(101)] 138 | for model_name in MODEL_TO_CKPTS.keys(): 139 | print ('Model-{}:'.format(model_name)) 140 | model = load_model(model_name) 141 | preds, labels, top1_avg = reference(model, files_batch) 142 | 143 | predd = np.zeros_like(preds) 144 | inds = np.argsort(labels) 145 | for i,ind in enumerate(inds): 146 | predd[ind] = preds[i] 147 | 148 | print (args.adv_path) 149 | info_df['{}-pre'.format(model_name)] = predd 150 | model_val_acc[model_name] = top1_avg 151 | del model 152 | torch.cuda.empty_cache() 153 | 154 | info_df.to_csv(os.path.join(args.adv_path, 'results_all_models_prediction.csv'), index=False) 155 | with open(os.path.join(args.adv_path, 'top1_acc_all_models.json'), 'w') as opt: 156 | json.dump(model_val_acc, opt) 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /image_cam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torchvision.models as models 4 | 5 | from image_cam_utils import find_alexnet_layer, find_vgg_layer, find_resnet_layer, find_densenet_layer, find_squeezenet_layer 6 | 7 | # refer to https://github.com/1Konny/gradcam_plus_plus-pytorch/blob/master/gradcam.py 8 | 9 | def average_grad_cam_from_images(inps): 10 | ''' 11 | inps: [b,c,f,h,w] 12 | ''' 13 | b,c,f,h,w = inps.shape 14 | image_inps = inps.permute([0,2,1,3,4]) 15 | image_inps = image_inps.reshape(b*f, c, h, w) 16 | model_lists = ['alexnet', 'vgg', 'resnet', 'densenet', 'squeezenet'] 17 | masks = [] 18 | for model_name in model_lists: 19 | if model_name == 'alexnet': 20 | model = models.alexnet(pretrained=True) 21 | elif model_name == 'vgg': 22 | model = models.vgg16(pretrained=True) 23 | elif model_name == 'resnet': 24 | model = models.resnet101(pretrained=True) 25 | elif model_name == 'densenet': 26 | model = 
models.densenet161(pretrained=True) 27 | elif model_name == 'squeezenet': 28 | model = models.squeezenet1_1(pretrained=True) 29 | model.eval() 30 | model.cuda() 31 | model_dict = dict(type=model_name, arch=model, input_size=(224, 224)) 32 | 33 | gradcam = GradCAM(model_dict, False) 34 | mask, _ = gradcam(image_inps) 35 | masks.append(mask) 36 | average_mask = torch.stack(masks).mean(0, keepdim=False) 37 | return average_mask 38 | 39 | class GradCAM(object): 40 | """Calculate GradCAM salinecy map. 41 | A simple example: 42 | # initialize a model, model_dict and gradcam 43 | resnet = torchvision.models.resnet101(pretrained=True) 44 | resnet.eval() 45 | model_dict = dict(model_type='resnet', arch=resnet, layer_name='layer4', input_size=(224, 224)) 46 | gradcam = GradCAM(model_dict) 47 | # get an image and normalize with mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225) 48 | img = load_img() 49 | normed_img = normalizer(img) 50 | # get a GradCAM saliency map on the class index 10. 51 | mask, logit = gradcam(normed_img, class_idx=10) 52 | # make heatmap from mask and synthesize saliency map using heatmap and img 53 | heatmap, cam_result = visualize_cam(mask, img) 54 | Args: 55 | model_dict (dict): a dictionary that contains 'model_type', 'arch', layer_name', 'input_size'(optional) as keys. 56 | verbose (bool): whether to print output size of the saliency map givien 'layer_name' and 'input_size' in model_dict. 57 | """ 58 | def __init__(self, model_dict, verbose=False): 59 | model_type = model_dict['type'] 60 | self.model_arch = model_dict['arch'] 61 | 62 | self.gradients = dict() 63 | self.activations = dict() 64 | def backward_hook(module, grad_input, grad_output): 65 | self.gradients['value'] = grad_output[0] 66 | return None 67 | def forward_hook(module, input, output): 68 | self.activations['value'] = output 69 | return None 70 | 71 | if 'vgg' in model_type.lower(): 72 | target_layer = find_vgg_layer(self.model_arch, 'features_29') 73 | elif 'resnet' in model_type.lower(): 74 | target_layer = find_resnet_layer(self.model_arch, 'layer4') 75 | elif 'densenet' in model_type.lower(): 76 | target_layer = find_densenet_layer(self.model_arch, 'features_norm5') 77 | elif 'alexnet' in model_type.lower(): 78 | target_layer = find_alexnet_layer(self.model_arch, 'features_11') 79 | elif 'squeezenet' in model_type.lower(): 80 | target_layer = find_squeezenet_layer(self.model_arch, 'features_12_expand3x3_activation') 81 | 82 | target_layer.register_forward_hook(forward_hook) 83 | target_layer.register_backward_hook(backward_hook) 84 | 85 | if verbose: 86 | try: 87 | input_size = model_dict['input_size'] 88 | except KeyError: 89 | print("please specify size of input image in model_dict. e.g. {'input_size':(224, 224)}") 90 | pass 91 | else: 92 | device = 'cuda' if next(self.model_arch.parameters()).is_cuda else 'cpu' 93 | self.model_arch(torch.zeros(1, 3, *(input_size), device=device)) 94 | print('saliency_map size :', self.activations['value'].shape[2:]) 95 | 96 | 97 | def forward(self, input, ori_feature_mas, class_idx=None, retain_graph=False): 98 | """ 99 | Args: 100 | input: input image with shape of (1, 3, H, W) 101 | class_idx (int): class index for calculating GradCAM. 102 | If not specified, the class index that makes the highest model prediction score will be used. 
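If class_idx is left unspecified, the top-1 predicted class is used. For reference, the saliency computation implemented in forward() below can be sketched in isolation as follows (a minimal standalone sketch; gradcam_map and the random tensors are illustrative, and the bilinear upsample is optional and commented out in the original):

import torch
import torch.nn.functional as F

def gradcam_map(activations, gradients, out_size=(224, 224)):
    # activations/gradients: (B, K, U, V) tensors captured by the hooks registered above
    b, k, u, v = gradients.size()
    alpha = gradients.view(b, k, -1).mean(2)                 # channel-wise importance weights
    saliency = F.relu((alpha.view(b, k, 1, 1) * activations).sum(1, keepdim=True))
    saliency = F.interpolate(saliency, size=out_size, mode='bilinear', align_corners=False)
    s_min, s_max = saliency.min(), saliency.max()
    return (saliency - s_min) / (s_max - s_min + 1e-12)      # min-max normalize to [0, 1]

mask = gradcam_map(torch.rand(2, 512, 7, 7), torch.randn(2, 512, 7, 7))
print(mask.shape)  # torch.Size([2, 1, 224, 224])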
103 | Return: 104 | mask: saliency map of the same spatial dimension with input 105 | logit: model output 106 | """ 107 | b, c, h, w = input.size() 108 | 109 | logit = self.model_arch(input) 110 | if class_idx is None: 111 | score = logit[:, logit.max(1)[-1]].squeeze() 112 | else: 113 | score = logit[:, class_idx].squeeze() 114 | 115 | self.model_arch.zero_grad() 116 | score.backward() 117 | gradients = self.gradients['value'] 118 | activations = self.activations['value'] 119 | b, k, u, v = gradients.size() 120 | 121 | alpha = gradients.view(b, k, -1).mean(2) 122 | #alpha = F.relu(gradients.view(b, k, -1)).mean(2) 123 | weights = alpha.view(b, k, 1, 1) 124 | 125 | saliency_map = (weights*activations).sum(1, keepdim=True) 126 | saliency_map = F.relu(saliency_map) 127 | # saliency_map = F.upsample(saliency_map, size=(h, w), mode='bilinear', align_corners=False) 128 | saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max() 129 | saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data 130 | 131 | 132 | if self.update: 133 | print (saliency_map.shape) 134 | print (ori_feature_mas.shape) 135 | cost = torch.norm(saliency_map.reshape(b, -1) - ori_feature_mas.reshape(b, -1), p=2, dim=1) 136 | grad = torch.autograd.grad(cost, input, grad_outputs=torch.ones_like(cost), 137 | retain_graph=False, create_graph=False)[0] 138 | return grad 139 | else: 140 | return saliency_map 141 | 142 | def __call__(self, input, ori_feature_mas, update=False, class_idx=None, retain_graph=False): 143 | self.update = update 144 | return self.forward(input, ori_feature_mas, class_idx, retain_graph) -------------------------------------------------------------------------------- /run_image_guided.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | 5 | ImageGuidedFMDirection_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step {step} --step_size {step_size} --direction_image_model resnet --batch_size {batch_size} --batch_nums {batch_nums} --batch_index {batch_index} --file_prefix resnet_step_size_{step_size}_paper_study' 6 | ImageGuidedFMDirection_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-{step}-resnet_step_size_{step_size}_paper_study' 7 | 8 | Aba_layer_ImageGuidedFMDirection_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-step_size-0.005-depth-{depth}_paper_study' 9 | Aba_layer_ImageGuidedFMDirection_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-60-{image_model}-step_size-0.005-depth-{depth}_paper_study' 10 | 11 | # Kinetic400 12 | Per_Com_ImageGuidedFMDirection_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 13 | Per_Com_ImageGuidedFMDirection_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-60-{image_model}-depth-{depth}_paper_per_com' 14 | 15 | Per_Com_ImageGuidedFML2_Adam_MultiModels_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFML2_Adam_MultiModels --step 60 --step_size 0.005 --file_prefix 
paper_per_com' 16 | Per_Com_ImageGuidedFML2_Adam_MultiModels_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFML2_Adam_MultiModels-60-paper_per_com' 17 | 18 | Per_Com_ImageGuidedStd_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedStd_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 19 | Per_Com_ImageGuidedStd_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedStd_Adam-60-{image_model}-depth-{depth}_paper_per_com' 20 | 21 | # UCF101 22 | Per_Com_UCF_ImageGuidedFMDirection_Adam_attack_run = 'python image_main_ucf101.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 23 | Per_Com_UCF_ImageGuidedFMDirection_Adam_reference_run = 'python reference_ucf101.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-60-{image_model}-depth-{depth}_paper_per_com' 24 | 25 | Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_attack_run = 'python image_main_ucf101.py --gpu {gpu} --attack_method ImageGuidedFML2_Adam_MultiModels --step 60 --step_size 0.005 --file_prefix paper_per_com' 26 | Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_reference_run = 'python reference_ucf101.py --gpu {gpu} --adv_path Image-ImageGuidedFML2_Adam_MultiModels-60-paper_per_com' 27 | 28 | Per_Com_UCF_ImageGuidedStd_Adam_attack_run = 'python image_main_ucf101.py --gpu {gpu} --attack_method ImageGuidedStd_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 29 | Per_Com_UCF_ImageGuidedStd_Adam_reference_run = 'python reference_ucf101.py --gpu {gpu} --adv_path Image-ImageGuidedStd_Adam-60-{image_model}-depth-{depth}_paper_per_com' 30 | 31 | 32 | 33 | def arg_parse(): 34 | parser = argparse.ArgumentParser(description='') 35 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 36 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 37 | parser.add_argument('--batch_size', type=int, default=1, help='') 38 | # parser.add_argument('--kernlens', nargs='+', help=' Set flag', required=True) 39 | args = parser.parse_args() 40 | return args 41 | 42 | if __name__ == '__main__': 43 | args = arg_parse() 44 | 45 | # ablation study for step size and iteration number (Figure 4) 46 | steps = [20, 40, 60, 80, 100] 47 | step_sizes = [0.001, 0.0025, 0.0050, 0.0075, 0.010] 48 | for step in steps: 49 | # step_size = 0.004 50 | for step_size in step_sizes: 51 | os.system(ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, step=step, step_size=step_size, batch_nums=1, batch_index=1, batch_size=1)) 52 | os.system(ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, step=step, step_size=step_size)) 53 | 54 | # ablation study for attacked layer (Table 2 and Figure 5) 55 | depths = [1,2,3,4] 56 | image_models = ['resnet', 'squeezenet', 'vgg', 'alexnet'] 57 | for image_model in image_models: 58 | for depth in depths: 59 | os.system(Aba_layer_ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, depth=depth, image_model=image_model)) 60 | os.system(Aba_layer_ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 61 | 62 | # performance comparison for kinetics-400 (Table 3) 63 | step = 60 64 | step_size = 0.005 65 
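For clarity, the per-model dispatch used in the comparison loops below can be dry-run as in the following sketch (the template string is illustrative and mirrors the Per_Com_* commands defined at the top of this script; replace print with os.system to actually launch the runs):

attack_tmpl = ('python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam '
               '--step 60 --step_size 0.005 --direction_image_model {image_model} '
               '--depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com')

for image_model in ['squeezenet', 'vgg', 'alexnet', 'resnet']:
    # layer choice used below: depth 2 for resnet/squeezenet, depth 3 for vgg/alexnet
    depth = 2 if image_model in ('resnet', 'squeezenet') else 3
    print(attack_tmpl.format(gpu='0', image_model=image_model, depth=depth))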
| image_models = ['squeezenet', 'vgg', 'alexnet', 'resnet'] 66 | for image_model in image_models: 67 | if image_model == 'resnet' or image_model == 'squeezenet': 68 | depth = 2 69 | else: 70 | depth = 3 71 | # I2V attack 72 | os.system(Per_Com_ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 73 | os.system(Per_Com_ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 74 | # STD attack 75 | os.system(Per_Com_ImageGuidedStd_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 76 | os.system(Per_Com_ImageGuidedStd_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 77 | 78 | # ENS-I2V attack 79 | os.system(Per_Com_ImageGuidedFML2_Adam_MultiModels_attack_run.format(gpu=args.gpu)) 80 | os.system(Per_Com_ImageGuidedFML2_Adam_MultiModels_reference_run.format(gpu=args.gpu)) 81 | 82 | # performance compasison for ucf101 (Table 4) 83 | step = 60 84 | step_size = 0.005 85 | image_models = ['resnet', 'squeezenet', 'vgg', 'alexnet'] # '', 86 | for image_model in image_models: 87 | if image_model == 'resnet' or image_model == 'squeezenet': 88 | depth = 2 89 | else: 90 | depth = 3 91 | # I2V attack 92 | os.system(Per_Com_UCF_ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 93 | os.system(Per_Com_UCF_ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 94 | # STD attack 95 | os.system(Per_Com_UCF_ImageGuidedStd_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 96 | os.system(Per_Com_UCF_ImageGuidedStd_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 97 | 98 | # ENS-I2V attack 99 | os.system(Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_attack_run.format(gpu=args.gpu)) 100 | os.system(Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_reference_run.format(gpu=args.gpu)) 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /image_cam_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | 5 | def visualize_cam(mask, img): 6 | """Make heatmap from mask and synthesize GradCAM result image using heatmap and img. 7 | Args: 8 | mask (torch.tensor): mask shape of (1, 1, H, W) and each element has value in range [0, 1] 9 | img (torch.tensor): img shape of (1, 3, H, W) and each pixel value is in range [0, 1] 10 | 11 | Return: 12 | heatmap (torch.tensor): heatmap img shape of (3, H, W) 13 | result (torch.tensor): synthesized GradCAM result of same shape with heatmap. 14 | """ 15 | heatmap = cv2.applyColorMap(np.uint8(255 * mask.squeeze()), cv2.COLORMAP_JET) 16 | heatmap = torch.from_numpy(heatmap).permute(2, 0, 1).float().div(255) 17 | b, g, r = heatmap.split(1) 18 | heatmap = torch.cat([r, g, b]) 19 | 20 | result = heatmap+img.cpu() 21 | result = result.div(result.max()).squeeze() 22 | 23 | return heatmap, result 24 | 25 | 26 | def find_resnet_layer(arch, target_layer_name): 27 | """Find resnet layer to calculate GradCAM and GradCAM++ 28 | 29 | Args: 30 | arch: default torchvision densenet models 31 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 
32 | target_layer_name = 'conv1' 33 | target_layer_name = 'layer1' 34 | target_layer_name = 'layer1_basicblock0' 35 | target_layer_name = 'layer1_basicblock0_relu' 36 | target_layer_name = 'layer1_bottleneck0' 37 | target_layer_name = 'layer1_bottleneck0_conv1' 38 | target_layer_name = 'layer1_bottleneck0_downsample' 39 | target_layer_name = 'layer1_bottleneck0_downsample_0' 40 | target_layer_name = 'avgpool' 41 | target_layer_name = 'fc' 42 | 43 | Return: 44 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 45 | """ 46 | if 'layer' in target_layer_name: 47 | hierarchy = target_layer_name.split('_') 48 | layer_num = int(hierarchy[0].lstrip('layer')) 49 | if layer_num == 1: 50 | target_layer = arch.layer1 51 | elif layer_num == 2: 52 | target_layer = arch.layer2 53 | elif layer_num == 3: 54 | target_layer = arch.layer3 55 | elif layer_num == 4: 56 | target_layer = arch.layer4 57 | else: 58 | raise ValueError('unknown layer : {}'.format(target_layer_name)) 59 | 60 | if len(hierarchy) >= 2: 61 | bottleneck_num = int(hierarchy[1].lower().lstrip('bottleneck').lstrip('basicblock')) 62 | target_layer = target_layer[bottleneck_num] 63 | 64 | if len(hierarchy) >= 3: 65 | target_layer = target_layer._modules[hierarchy[2]] 66 | 67 | if len(hierarchy) == 4: 68 | target_layer = target_layer._modules[hierarchy[3]] 69 | 70 | else: 71 | target_layer = arch._modules[target_layer_name] 72 | 73 | return target_layer 74 | 75 | 76 | def find_densenet_layer(arch, target_layer_name): 77 | """Find densenet layer to calculate GradCAM and GradCAM++ 78 | 79 | Args: 80 | arch: default torchvision densenet models 81 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 82 | target_layer_name = 'features' 83 | target_layer_name = 'features_transition1' 84 | target_layer_name = 'features_transition1_norm' 85 | target_layer_name = 'features_denseblock2_denselayer12' 86 | target_layer_name = 'features_denseblock2_denselayer12_norm1' 87 | target_layer_name = 'features_denseblock2_denselayer12_norm1' 88 | target_layer_name = 'classifier' 89 | 90 | Return: 91 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 92 | """ 93 | 94 | hierarchy = target_layer_name.split('_') 95 | target_layer = arch._modules[hierarchy[0]] 96 | 97 | if len(hierarchy) >= 2: 98 | target_layer = target_layer._modules[hierarchy[1]] 99 | 100 | if len(hierarchy) >= 3: 101 | target_layer = target_layer._modules[hierarchy[2]] 102 | 103 | if len(hierarchy) == 4: 104 | target_layer = target_layer._modules[hierarchy[3]] 105 | 106 | return target_layer 107 | 108 | 109 | def find_vgg_layer(arch, target_layer_name): 110 | """Find vgg layer to calculate GradCAM and GradCAM++ 111 | 112 | Args: 113 | arch: default torchvision densenet models 114 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 115 | target_layer_name = 'features' 116 | target_layer_name = 'features_42' 117 | target_layer_name = 'classifier' 118 | target_layer_name = 'classifier_0' 119 | 120 | Return: 121 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 
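As a quick illustration of how these helpers resolve hierarchical layer names from a separate script (a sketch; pretrained weights are not needed just to look up modules):

import torchvision.models as models
from image_cam_utils import find_resnet_layer, find_vgg_layer

resnet = models.resnet101(pretrained=False)
vgg = models.vgg16(pretrained=False)

print(find_resnet_layer(resnet, 'layer4'))                    # -> resnet.layer4
print(find_resnet_layer(resnet, 'layer1_bottleneck0_conv1'))  # -> resnet.layer1[0].conv1
print(find_vgg_layer(vgg, 'features_29'))                     # -> vgg.features[29], used by GradCAM above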
122 | """ 123 | hierarchy = target_layer_name.split('_') 124 | 125 | if len(hierarchy) >= 1: 126 | target_layer = arch.features 127 | 128 | if len(hierarchy) == 2: 129 | target_layer = target_layer[int(hierarchy[1])] 130 | 131 | return target_layer 132 | 133 | 134 | def find_alexnet_layer(arch, target_layer_name): 135 | """Find alexnet layer to calculate GradCAM and GradCAM++ 136 | 137 | Args: 138 | arch: default torchvision densenet models 139 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 140 | target_layer_name = 'features' 141 | target_layer_name = 'features_0' 142 | target_layer_name = 'classifier' 143 | target_layer_name = 'classifier_0' 144 | 145 | Return: 146 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 147 | """ 148 | hierarchy = target_layer_name.split('_') 149 | 150 | if len(hierarchy) >= 1: 151 | target_layer = arch.features 152 | 153 | if len(hierarchy) == 2: 154 | target_layer = target_layer[int(hierarchy[1])] 155 | 156 | return target_layer 157 | 158 | 159 | def find_squeezenet_layer(arch, target_layer_name): 160 | """Find squeezenet layer to calculate GradCAM and GradCAM++ 161 | 162 | Args: 163 | arch: default torchvision densenet models 164 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 165 | target_layer_name = 'features_12' 166 | target_layer_name = 'features_12_expand3x3' 167 | target_layer_name = 'features_12_expand3x3_activation' 168 | 169 | Return: 170 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 171 | """ 172 | hierarchy = target_layer_name.split('_') 173 | target_layer = arch._modules[hierarchy[0]] 174 | 175 | if len(hierarchy) >= 2: 176 | target_layer = target_layer._modules[hierarchy[1]] 177 | 178 | if len(hierarchy) == 3: 179 | target_layer = target_layer._modules[hierarchy[2]] 180 | 181 | elif len(hierarchy) == 4: 182 | target_layer = target_layer._modules[hierarchy[2]+'_'+hierarchy[3]] 183 | 184 | return target_layer 185 | 186 | 187 | def denormalize(tensor, mean, std): 188 | if not tensor.ndimension() == 4: 189 | raise TypeError('tensor should be 4D') 190 | 191 | mean = torch.FloatTensor(mean).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 192 | std = torch.FloatTensor(std).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 193 | 194 | return tensor.mul(std).add(mean) 195 | 196 | 197 | def normalize(tensor, mean, std): 198 | if not tensor.ndimension() == 4: 199 | raise TypeError('tensor should be 4D') 200 | 201 | mean = torch.FloatTensor(mean).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 202 | std = torch.FloatTensor(std).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 203 | 204 | return tensor.sub(mean).div(std) 205 | 206 | 207 | class Normalize(object): 208 | def __init__(self, mean, std): 209 | self.mean = mean 210 | self.std = std 211 | 212 | def __call__(self, tensor): 213 | return self.do(tensor) 214 | 215 | def do(self, tensor): 216 | return normalize(tensor, self.mean, self.std) 217 | 218 | def undo(self, tensor): 219 | return denormalize(tensor, self.mean, self.std) 220 | 221 | def __repr__(self): 222 | return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std) -------------------------------------------------------------------------------- /video_attacks.py: -------------------------------------------------------------------------------- 1 | from base_attacks import Attack 2 | 
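A small round-trip check for the normalize/denormalize helpers and the Normalize wrapper defined above (a sketch using the ImageNet statistics adopted throughout this repo):

import torch
from image_cam_utils import Normalize

norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
x = torch.rand(4, 3, 224, 224)               # a batch of images in [0, 1]
x_norm = norm.do(x)                          # (x - mean) / std
x_back = norm.undo(x_norm)                   # undo recovers the original range
print(torch.allclose(x, x_back, atol=1e-6))  # True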
import torch 3 | import torch.nn as nn 4 | import scipy.stats as st 5 | import numpy as np 6 | import torchvision 7 | from PIL import Image 8 | import random 9 | import math 10 | import time 11 | import torch.nn.functional as F 12 | from utils import norm_grads 13 | 14 | class TemporalTranslation(Attack): 15 | ''' 16 | paper: Boosting the transferability of video adversarial examples via temporal translation 17 | Replace conv with multiple queries. 18 | There are two ways: Cycle and Exchange. 19 | Contain momentum or no momentum. 20 | params = {'kernlen':args.kernlen, # conv1 params 21 | 'momentum':args.momentum 22 | 'weight':args.augmentation_weight, 23 | 'move_type': 'adj', 24 | 'kernel_mode': 'gaussian'} 25 | ''' 26 | def __init__(self, model, params, epsilon=16/255, steps=10, delay=1.0): 27 | super(TemporalTranslation, self).__init__("TemporalTranslation", model) 28 | self.epsilon = epsilon 29 | self.steps = steps 30 | self.step_size = self.epsilon / self.steps 31 | self.delay = delay 32 | 33 | for name, value in params.items(): 34 | setattr(self, name, value) 35 | 36 | self.frames = 32 37 | self.cycle_move_list = self._move_info_generation() 38 | if self.kernel_mode == 'gaussian': 39 | kernel = self._initial_kernel_gaussian(self.kernlen).astype(np.float32) # (self.kernlen) 40 | elif self.kernel_mode == 'linear': 41 | kernel = self._initial_kernel_linear(self.kernlen).astype(np.float32) # (self.kernlen) 42 | elif self.kernel_mode == 'random': 43 | kernel = self._initial_kernel_uniform(self.kernlen).astype(np.float32) # (self.kernlen) 44 | 45 | self.kernel = torch.from_numpy(np.expand_dims(kernel, 0)).to(self.device) # 1,self.kernlen 46 | 47 | def _move_info_generation(self): 48 | max_move = int((self.kernlen - 1) / 2) 49 | lists = [i for i in range(-max_move, max_move+1)] 50 | return lists 51 | 52 | def _initial_kernel_linear(self, kernlen): 53 | k = int((kernlen - 1) / 2) 54 | kern1d = [] 55 | for i in range(k+1): 56 | kern1d.append(1 - i / (k+1)) 57 | kern1d = np.array(kern1d[::-1][:-1] + kern1d) 58 | kernel = kern1d / kern1d.sum() 59 | return kernel 60 | 61 | def _initial_kernel_uniform(self, kernlen): 62 | kern1d = np.ones(kernlen) 63 | kernel = kern1d / kern1d.sum() 64 | return kernel 65 | 66 | def _initial_kernel_gaussian(self, kernlen): 67 | assert kernlen%2 == 1 68 | k = (kernlen - 1) /2 69 | sigma = k/3 70 | k = int(k) 71 | def calculte_guassian(x, sigma): 72 | return (1/(sigma*np.sqrt(2*np.pi)) * np.math.exp(-(x**2)/(2* (sigma**2)))) 73 | kern1d = [] 74 | for i in range(-k, k+1): 75 | kern1d.append(calculte_guassian(i, sigma)) 76 | assert len(kern1d) == kernlen 77 | kern1d = np.array(kern1d) 78 | kernel = kern1d / kern1d.sum() 79 | return kernel 80 | 81 | def _conv1d_frame(self, grads): 82 | ''' 83 | grads: D, N, C, T, H, W 84 | ''' 85 | # cycle padding for grads 86 | D,N,C,T,H,W = grads.shape 87 | grads = grads.reshape(D, -1) 88 | 89 | grad = torch.matmul(self.kernel, grads) 90 | grad = grad.reshape(N,C,T,H,W) 91 | return grad 92 | 93 | def _cycle_move(self, adv_videos, cycle_move): 94 | if cycle_move < 0: 95 | direction = -1 96 | else: 97 | direction = 1 98 | cycle_move = abs(cycle_move) 99 | cycle_move = cycle_move % self.frames 100 | new_videos = torch.zeros_like(adv_videos) 101 | for i in range(self.frames): 102 | ori_ind = i 103 | new_ind = (ori_ind + direction * cycle_move) % self.frames 104 | new_videos[:,:,new_ind] = adv_videos[:,:,ori_ind] 105 | return new_videos 106 | 107 | def _cycle_move_large(self, adv_videos, cycle_move): 108 | if cycle_move < 0: 109 | 
direction = -1 110 | else: 111 | direction = 1 112 | cycle_move = abs(cycle_move) 113 | if cycle_move == 0: 114 | cycle_move = cycle_move % self.frames 115 | else: 116 | cycle_move = (cycle_move + (int(self.frames/2)-1)) % self.frames 117 | new_videos = torch.zeros_like(adv_videos) 118 | for i in range(self.frames): 119 | ori_ind = i 120 | new_ind = (ori_ind + direction * cycle_move) % self.frames 121 | new_videos[:,:,new_ind] = adv_videos[:,:,ori_ind] 122 | return new_videos 123 | 124 | def _cycle_move_random(self, adv_videos, cycle_move): 125 | if cycle_move < 0: 126 | direction = -1 127 | else: 128 | direction = 1 129 | # cycle_move = abs(cycle_move) 130 | if cycle_move == 0: 131 | cycle_move = cycle_move % self.frames 132 | else: 133 | cycle_move = random.randint(0, 100) % self.frames 134 | # cycle_move = (cycle_move + int(self.frames/2)) % self.frames 135 | new_videos = torch.zeros_like(adv_videos) 136 | for i in range(self.frames): 137 | ori_ind = i 138 | new_ind = (ori_ind + direction * cycle_move) % self.frames 139 | new_videos[:,:,new_ind] = adv_videos[:,:,ori_ind] 140 | return new_videos 141 | 142 | def _exchange_move(self, adv_videos, exchange_lists): 143 | new_videos = adv_videos.clone() 144 | for exchange in exchange_lists: 145 | one_frame, ano_frame = exchange 146 | new_videos[:,:,one_frame] = adv_videos[:,:,ano_frame] 147 | new_videos[:,:,ano_frame] = adv_videos[:,:,one_frame] 148 | return new_videos 149 | 150 | def _get_grad(self, adv_videos, labels, loss): 151 | batch_size = adv_videos.shape[0] 152 | used_labels = torch.cat([labels]*batch_size, dim=0) 153 | adv_videos.requires_grad = True 154 | outputs = self.model(adv_videos) 155 | cost = self._targeted*loss(outputs, used_labels).to(self.device) 156 | grad = torch.autograd.grad(cost, adv_videos, 157 | retain_graph=False, create_graph=False)[0] 158 | return grad 159 | 160 | def _grad_augmentation(self, grads): 161 | ''' 162 | Input: 163 | grads: kernlen, grad.shape 164 | Return 165 | grad 166 | ''' 167 | same_position_diff_frame = grads.clone() 168 | diff_position_same_frame = torch.zeros_like(grads) 169 | for ind, cycle_move in enumerate(self.cycle_move_list): 170 | diff_position_same_frame[ind] = self._cycle_move(grads[ind], -cycle_move) 171 | s_conv_grad = self._conv1d_frame(same_position_diff_frame) 172 | d_conv_grad = self._conv1d_frame(diff_position_same_frame) 173 | grad = (1-self.weight)*s_conv_grad + self.weight*d_conv_grad 174 | return grad 175 | 176 | def forward(self, videos, labels): 177 | r""" 178 | Overridden. 
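Each step gathers one gradient per temporally shifted copy of the adversarial video and averages them with the 1-D kernel built in __init__. A self-contained sketch of that kernel and of the averaging performed by _conv1d_frame (shapes and the helper name are illustrative):

import numpy as np
import torch

def gaussian_kernel(kernlen=7):
    # same construction as _initial_kernel_gaussian above (sigma = (kernlen - 1) / 6)
    assert kernlen % 2 == 1
    k = (kernlen - 1) / 2
    sigma = k / 3
    xs = np.arange(-int(k), int(k) + 1)
    kern1d = np.exp(-(xs ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))
    return kern1d / kern1d.sum()

kernlen, N, C, T, H, W = 7, 1, 3, 32, 112, 112
kernel = torch.from_numpy(gaussian_kernel(kernlen).astype(np.float32)).view(1, kernlen)
grads = torch.randn(kernlen, N, C, T, H, W)         # one gradient per temporal shift
avg = torch.matmul(kernel, grads.reshape(kernlen, -1)).reshape(N, C, T, H, W)
print(avg.shape)                                    # torch.Size([1, 3, 32, 112, 112])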
179 | """ 180 | videos = videos.to(self.device) 181 | momentum = torch.zeros_like(videos).to(self.device) 182 | labels = labels.to(self.device) 183 | loss = nn.CrossEntropyLoss() 184 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 185 | adv_videos = videos.clone().detach() 186 | del videos 187 | 188 | start_time = time.time() 189 | for i in range(self.steps): 190 | # obtain grads of these variants 191 | batch_new_videos = [] 192 | for cycle_move in self.cycle_move_list: 193 | if self.move_type == 'adj': 194 | new_videos = self._cycle_move(adv_videos, cycle_move) 195 | elif self.move_type == 'large': 196 | new_videos = self._cycle_move_large(adv_videos, cycle_move) 197 | elif self.move_type == 'random': 198 | new_videos = self._cycle_move_random(adv_videos, cycle_move) 199 | batch_new_videos.append(new_videos) 200 | batch_inps = torch.cat(batch_new_videos, dim=0) 201 | grads = [] 202 | batch_times = 5 203 | length = len(self.cycle_move_list) 204 | if self.model_name == 'TPNet': 205 | batch_times = length 206 | print (self.model_name, batch_times) 207 | batch_size = math.ceil(length / batch_times) 208 | for i in range(batch_times): 209 | grad = self._get_grad(batch_inps[i*batch_size:min((i+1)*batch_size, length)], labels, loss) 210 | grads.append(grad) 211 | # grad augmentation 212 | grads = torch.cat(grads, dim=0) 213 | grads = torch.unsqueeze(grads, dim=1) 214 | grad = self._grad_augmentation(grads) 215 | 216 | # momentum 217 | if self.momentum: 218 | grad = norm_grads(grad) 219 | grad += momentum * self.delay 220 | momentum = grad 221 | else: 222 | pass 223 | 224 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 225 | adv_videos = adv_videos + self.step_size*grad.sign() 226 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 227 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 228 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 229 | print ('now_time', time.time()-start_time) 230 | return adv_videos -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # from gluoncv.torch.data import VideoClsDataset 2 | import torch 3 | 4 | import os 5 | import warnings 6 | import numpy as np 7 | try: 8 | from decord import VideoReader, cpu 9 | except ImportError: 10 | VideoReader = None 11 | cpu = None 12 | 13 | import torch 14 | from torch.utils.data import Dataset 15 | 16 | from gluoncv.torch.data import video_transforms, volume_transforms, multiGridHelper, MultiGridBatchSampler 17 | 18 | class VideoClsDataset(Dataset): 19 | """Load your own video classification dataset.""" 20 | 21 | def __init__(self, anno_path, data_path, mode='train', clip_len=8, 22 | frame_sample_rate=2, crop_size=224, short_side_size=256, 23 | new_height=256, new_width=340, keep_aspect_ratio=False, 24 | num_segment=1, num_crop=1, test_num_segment=10, test_num_crop=3, 25 | use_multigrid=False): 26 | self.anno_path = anno_path 27 | self.data_path = data_path 28 | self.mode = mode 29 | self.clip_len = clip_len 30 | self.frame_sample_rate = frame_sample_rate 31 | self.crop_size = crop_size 32 | self.short_side_size = short_side_size 33 | self.new_height = new_height 34 | self.new_width = new_width 35 | self.keep_aspect_ratio = keep_aspect_ratio 36 | self.num_segment = num_segment 37 | self.test_num_segment = test_num_segment 38 | self.num_crop = num_crop 39 | 
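For reference, the perturbation update at the end of TemporalTranslation.forward above reduces to a sign-gradient step followed by a projection onto the epsilon ball and the valid pixel range. A minimal sketch (project_and_clip is an illustrative name; both tensors are assumed to already be in [0, 1]):

import torch

def project_and_clip(adv, orig, grad, step_size=(16/255)/10, epsilon=16/255):
    adv = adv + step_size * grad.sign()                           # sign-gradient step
    delta = torch.clamp(adv - orig, min=-epsilon, max=epsilon)    # project onto the epsilon ball
    return torch.clamp(orig + delta, min=0, max=1)                # keep a valid video

orig = torch.rand(1, 3, 32, 112, 112)
adv = project_and_clip(orig.clone(), orig, torch.randn_like(orig))
print((adv - orig).abs().max().item() <= 16/255 + 1e-6)           # True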
self.test_num_crop = test_num_crop 40 | self.use_multigrid = use_multigrid and (mode == 'train') 41 | if VideoReader is None: 42 | raise ImportError("Unable to import `decord` which is required to read videos.") 43 | 44 | import pandas as pd 45 | # cleaned = pd.read_csv(self.anno_path, header=None, delimiter=' ') 46 | # self.dataset_samples = list(cleaned.values[:, 0]) 47 | # self.label_array = list(cleaned.values[:, 2]) 48 | cleaned = pd.read_csv(self.anno_path) 49 | self.dataset_samples = cleaned['path'].values.tolist() 50 | self.label_array = cleaned['gt_label'].values.tolist() 51 | self.clip_inds = cleaned['clip_index'].values.tolist() 52 | 53 | if (mode == 'train'): 54 | if self.use_multigrid: 55 | self.mg_helper = multiGridHelper() 56 | self.data_transform = [] 57 | for alpha in range(self.mg_helper.mod_long): 58 | tmp = [] 59 | for beta in range(self.mg_helper.mod_short): 60 | info = self.mg_helper.get_resize(alpha, beta) 61 | scale_s = info[1] 62 | tmp.append(video_transforms.Compose([ 63 | video_transforms.Resize(int(self.short_side_size / scale_s), 64 | interpolation='bilinear'), 65 | # TODO: multiscale corner cropping 66 | video_transforms.RandomResize(ratio=(1, 1.25), 67 | interpolation='bilinear'), 68 | video_transforms.RandomCrop(size=(int(self.crop_size / scale_s), 69 | int(self.crop_size / scale_s)))])) 70 | self.data_transform.append(tmp) 71 | else: 72 | self.data_transform = video_transforms.Compose([ 73 | video_transforms.Resize(int(self.short_side_size), 74 | interpolation='bilinear'), 75 | video_transforms.RandomResize(ratio=(1, 1.25), 76 | interpolation='bilinear'), 77 | video_transforms.RandomCrop(size=(int(self.crop_size), 78 | int(self.crop_size)))]) 79 | 80 | self.data_transform_after = video_transforms.Compose([ 81 | video_transforms.RandomHorizontalFlip(), 82 | volume_transforms.ClipToTensor(), 83 | video_transforms.Normalize(mean=[0.485, 0.456, 0.406], 84 | std=[0.229, 0.224, 0.225]) 85 | ]) 86 | elif (mode == 'validation'): 87 | self.data_transform = video_transforms.Compose([ 88 | video_transforms.Resize(self.short_side_size, interpolation='bilinear'), 89 | video_transforms.CenterCrop(size=(self.crop_size, self.crop_size)), 90 | volume_transforms.ClipToTensor(), 91 | video_transforms.Normalize(mean=[0.485, 0.456, 0.406], 92 | std=[0.229, 0.224, 0.225]) 93 | ]) 94 | elif mode == 'test': 95 | self.data_resize = video_transforms.Compose([ 96 | video_transforms.Resize(size=(short_side_size), interpolation='bilinear') 97 | ]) 98 | self.data_transform = video_transforms.Compose([ 99 | volume_transforms.ClipToTensor(), 100 | video_transforms.Normalize(mean=[0.485, 0.456, 0.406], 101 | std=[0.229, 0.224, 0.225]) 102 | ]) 103 | self.test_seg = [] 104 | self.test_dataset = [] 105 | self.test_label_array = [] 106 | for ck in range(self.test_num_segment): 107 | for cp in range(self.test_num_crop): 108 | for idx in range(len(self.label_array)): 109 | sample_label = self.label_array[idx] 110 | self.test_label_array.append(sample_label) 111 | self.test_dataset.append(self.dataset_samples[idx]) 112 | self.test_seg.append((ck, cp)) 113 | 114 | def __getitem__(self, index): 115 | if self.mode == 'train': 116 | if self.use_multigrid is True: 117 | index, alpha, beta = index 118 | info = self.mg_helper.get_resize(alpha, beta) 119 | scale_t = info[0] 120 | data_transform_func = self.data_transform[alpha][beta] 121 | else: 122 | scale_t = 1 123 | data_transform_func = self.data_transform 124 | 125 | sample = self.dataset_samples[index] 126 | buffer = self.loadvideo_decord(sample, 
sample_rate_scale=scale_t) 127 | if len(buffer) == 0: 128 | while len(buffer) == 0: 129 | warnings.warn("video {} not correctly loaded during training".format(sample)) 130 | index = np.random.randint(self.__len__()) 131 | sample = self.dataset_samples[index] 132 | buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t) 133 | 134 | buffer = data_transform_func(buffer) 135 | buffer = self.data_transform_after(buffer) 136 | return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0] 137 | 138 | elif self.mode == 'validation': 139 | sample = self.dataset_samples[index] 140 | clip_ind = self.clip_inds[index] 141 | buffer = self.loadvideo_decord(sample, clip_ind) 142 | if len(buffer) == 0: 143 | while len(buffer) == 0: 144 | warnings.warn("video {} not correctly loaded during validation".format(sample)) 145 | index = np.random.randint(self.__len__()) 146 | sample = self.dataset_samples[index] 147 | buffer = self.loadvideo_decord(sample) 148 | buffer = self.data_transform(buffer) 149 | # return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0] 150 | return buffer, self.label_array[index], sample.split(".")[0], clip_ind 151 | 152 | elif self.mode == 'test': 153 | sample = self.test_dataset[index] 154 | chunk_nb, split_nb = self.test_seg[index] 155 | buffer = self.loadvideo_decord(sample) 156 | 157 | while len(buffer) == 0: 158 | warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\ 159 | str(self.test_dataset[index]), chunk_nb, split_nb)) 160 | index = np.random.randint(self.__len__()) 161 | sample = self.test_dataset[index] 162 | chunk_nb, split_nb = self.test_seg[index] 163 | buffer = self.loadvideo_decord(sample) 164 | 165 | buffer = self.data_resize(buffer) 166 | if isinstance(buffer, list): 167 | buffer = np.stack(buffer, 0) 168 | 169 | spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) \ 170 | / (self.test_num_crop - 1) 171 | temporal_step = max(1.0 * (buffer.shape[0] - self.clip_len) \ 172 | / (self.test_num_segment - 1), 0) 173 | temporal_start = int(chunk_nb * temporal_step) 174 | spatial_start = int(split_nb * spatial_step) 175 | if buffer.shape[1] >= buffer.shape[2]: 176 | buffer = buffer[temporal_start:temporal_start + self.clip_len, \ 177 | spatial_start:spatial_start + self.short_side_size, :, :] 178 | else: 179 | buffer = buffer[temporal_start:temporal_start + self.clip_len, \ 180 | :, spatial_start:spatial_start + self.short_side_size, :] 181 | 182 | buffer = self.data_transform(buffer) 183 | return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \ 184 | chunk_nb, split_nb 185 | else: 186 | raise NameError('mode {} unkown'.format(self.mode)) 187 | 188 | def loadvideo_decord(self, sample, clip_ind, sample_rate_scale=1): 189 | """Load video content using Decord""" 190 | # pylint: disable=line-too-long, bare-except, unnecessary-comprehension 191 | fname = os.path.join(self.data_path, sample) 192 | 193 | if not (os.path.exists(fname)): 194 | return [] 195 | 196 | # avoid hanging issue 197 | if os.path.getsize(fname) < 1 * 1024: 198 | print('SKIP: ', fname, " - ", os.path.getsize(fname)) 199 | return [] 200 | try: 201 | if self.keep_aspect_ratio: 202 | vr = VideoReader(fname, num_threads=1, ctx=cpu(0)) 203 | else: 204 | vr = VideoReader(fname, width=self.new_width, height=self.new_height, 205 | num_threads=1, ctx=cpu(0)) 206 | except: 207 | print("video cannot be loaded by decord: ", fname) 208 | return [] 209 | 210 | if self.mode == 'test': 211 | 
all_index = [x for x in range(0, len(vr), self.frame_sample_rate)] 212 | while len(all_index) < self.clip_len: 213 | all_index.append(all_index[-1]) 214 | vr.seek(0) 215 | buffer = vr.get_batch(all_index).asnumpy() 216 | return buffer 217 | 218 | # handle temporal segments 219 | converted_len = int(self.clip_len * self.frame_sample_rate) 220 | seg_len = len(vr) // self.num_segment 221 | 222 | all_index = [] 223 | for i in range(self.num_segment): 224 | if seg_len <= converted_len: 225 | index = np.linspace(0, seg_len, num=seg_len // self.frame_sample_rate) 226 | index = np.concatenate((index, np.ones(self.clip_len - seg_len // self.frame_sample_rate) * seg_len)) 227 | index = np.clip(index, 0, seg_len - 1).astype(np.int64) 228 | else: 229 | # end_idx = np.random.randint(converted_len, seg_len) 230 | if clip_ind == -1: 231 | end_idx = seg_len - 1 232 | else: 233 | np.random.seed(clip_ind) 234 | end_idx = np.random.randint(converted_len, seg_len) 235 | str_idx = end_idx - converted_len 236 | index = np.linspace(str_idx, end_idx, num=self.clip_len) 237 | index = np.clip(index, str_idx, end_idx - 1).astype(np.int64) 238 | index = index + i*seg_len 239 | all_index.extend(list(index)) 240 | 241 | all_index = all_index[::int(sample_rate_scale)] 242 | vr.seek(0) 243 | buffer = vr.get_batch(all_index).asnumpy() 244 | return buffer 245 | 246 | def __len__(self): 247 | if self.mode != 'test': 248 | return len(self.dataset_samples) 249 | else: 250 | return len(self.test_dataset) 251 | 252 | def get_dataset(cfg, loader=True): 253 | val_dataset = VideoClsDataset(anno_path=cfg.CONFIG.DATA.VAL_ANNO_PATH, 254 | data_path=cfg.CONFIG.DATA.VAL_DATA_PATH, 255 | mode='validation', 256 | use_multigrid=cfg.CONFIG.DATA.MULTIGRID, 257 | clip_len=cfg.CONFIG.DATA.CLIP_LEN, 258 | frame_sample_rate=cfg.CONFIG.DATA.FRAME_RATE, 259 | num_segment=cfg.CONFIG.DATA.NUM_SEGMENT, 260 | num_crop=cfg.CONFIG.DATA.NUM_CROP, 261 | keep_aspect_ratio=cfg.CONFIG.DATA.KEEP_ASPECT_RATIO, 262 | crop_size=cfg.CONFIG.DATA.CROP_SIZE, 263 | short_side_size=cfg.CONFIG.DATA.SHORT_SIDE_SIZE, 264 | new_height=cfg.CONFIG.DATA.NEW_HEIGHT, 265 | new_width=cfg.CONFIG.DATA.NEW_WIDTH) 266 | # length = len(val_dataset) 267 | # batch_contain = int(length/batch_nums) 268 | # this_batch_dataset = val_dataset[batch_contain*(batch_index-1):batch_contain*batch_index] 269 | 270 | print ('The length of Dataset is {}.'.format(len(val_dataset))) 271 | if loader: 272 | val_loader = torch.utils.data.DataLoader( 273 | val_dataset, batch_size=cfg.CONFIG.VAL.BATCH_SIZE, shuffle=False, 274 | num_workers=9, sampler=None, pin_memory=True) 275 | return val_loader 276 | else: 277 | return val_dataset -------------------------------------------------------------------------------- /TPAMI_attack.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | import torchvision.models as models 5 | import random 6 | 7 | from image_cam import GradCAM 8 | from torch.autograd import Variable 9 | from image_cam_utils import find_alexnet_layer, find_vgg_layer, find_resnet_layer, find_densenet_layer, find_squeezenet_layer 10 | import pickle as pkl 11 | 12 | import time 13 | from timm.models import create_model 14 | import numpy as np 15 | 16 | class Attack(object): 17 | """ 18 | Base class for all attacks. 19 | .. note:: 20 | It automatically set device to the device where given model is. 
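Note that the validation branch of loadvideo_decord above seeds numpy with the clip index, so the sampled clip is reproducible across runs. A standalone sketch of that index selection (clip_indices is an illustrative name, assuming a single temporal segment):

import numpy as np

def clip_indices(num_frames, clip_len=32, frame_sample_rate=2, clip_ind=0):
    converted_len = clip_len * frame_sample_rate
    if clip_ind == -1:
        end_idx = num_frames - 1                       # deterministic: take the last possible clip
    else:
        np.random.seed(clip_ind)                       # fixed seed -> same clip every run
        end_idx = np.random.randint(converted_len, num_frames)
    str_idx = end_idx - converted_len
    index = np.linspace(str_idx, end_idx, num=clip_len)
    return np.clip(index, str_idx, end_idx - 1).astype(np.int64)

print(clip_indices(300, clip_ind=5))                   # identical output on every call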
21 | It temporarily changes the model's training mode to `test` 22 | by `.eval()` only during an attack process. 23 | """ 24 | def __init__(self, name, model=None): 25 | r""" 26 | Initializes internal attack state. 27 | Arguments: 28 | name (str) : name of an attack. 29 | model (torch.nn.Module): model to attack. 30 | """ 31 | self.attack = name 32 | self.model = model 33 | self.model_name = str(model).split("(")[0] 34 | 35 | # mean and std values are used in pytorch pretrained models 36 | # they are also used in Kinetics-400. 37 | self.mean = [0.485, 0.456, 0.406] 38 | self.std = [0.229, 0.224, 0.225] 39 | 40 | def forward(self, *input): 41 | r""" 42 | It defines the computation performed at every call (attack forward). 43 | Should be overridden by all subclasses. 44 | """ 45 | raise NotImplementedError 46 | 47 | def _transform_perts(self, perts): 48 | dtype = perts.dtype 49 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 50 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 51 | perts.div_(std[:, None, None]) 52 | return perts 53 | 54 | def _transform_video(self, video, mode='forward'): 55 | r''' 56 | Transform the video into [0, 1] 57 | ''' 58 | dtype = video.dtype 59 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 60 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 61 | if mode == 'forward': 62 | # [-mean/std, mean/std] 63 | video.sub_(mean[:, None, None]).div_(std[:, None, None]) 64 | elif mode == 'back': 65 | # [0, 1] 66 | video.mul_(std[:, None, None]).add_(mean[:, None, None]) 67 | return video 68 | 69 | def _transform_video_ILAF(self, video, mode='forward'): 70 | r''' 71 | Transform the video into [0, 1] 72 | ''' 73 | dtype = video.dtype 74 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 75 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 76 | if mode == 'forward': 77 | # [-mean/std, mean/std] 78 | video.sub_(mean[None, :, None, None, None]).div_(std[None, :, None, None, None]) 79 | elif mode == 'back': 80 | # [0, 1] 81 | video.mul_(std[None, :, None, None, None]).add_(mean[None, :, None, None, None]) 82 | return video 83 | 84 | def __call__(self, *input, **kwargs): 85 | images = self.forward(*input, **kwargs) 86 | return images 87 | 88 | def get_vits(): 89 | model = create_model( 90 | 'vit_base_patch16_224', 91 | pretrained=True, 92 | num_classes=1000, 93 | in_chans=3, 94 | global_pool=None, 95 | scriptable=False) 96 | model.cuda() 97 | model.eval() 98 | return model 99 | 100 | def get_model(model_name): 101 | ''' 102 | ['alexnet', 'vgg', 'resnet', 'densenet', 'squeezenet'] 103 | ''' 104 | if model_name == 'alexnet': 105 | model = models.alexnet(pretrained=True) 106 | # model.features[11/7/4/1] 107 | elif model_name == 'vgg': 108 | model = models.vgg16(pretrained=True) 109 | # model.features[29/20/11/1] 110 | elif model_name == 'resnet': 111 | model = models.resnet101(pretrained=True) 112 | elif model_name == 'densenet': 113 | model = models.densenet161(pretrained=True) 114 | # model.features.denseblock1/2/3/4 115 | # model.features.transition1/2/3,norm5 116 | elif model_name == 'squeezenet': 117 | model = models.squeezenet1_1(pretrained=True) 118 | # model.features[12/9/6/3].expand3x3_activation 119 | model.cuda() 120 | model.eval() 121 | # for m in model.modules(): 122 | # if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 123 | # m.eval() 124 | return model 125 | 126 | def get_models(model_name_lists): 127 | models = [] 128 | for model_name in model_name_lists: 129 | model = get_model(model_name) 130 | models.append(model) 
131 | return models 132 | 133 | def get_GradCam(model_name_lists): 134 | gradcams = [] 135 | for model_name in model_name_lists: 136 | model_dict = dict(type=model_name, arch=get_model(model_name), input_size=(224, 224)) 137 | this_gradcam = GradCAM(model_dict, False) 138 | gradcams.append(this_gradcam) 139 | return gradcams 140 | 141 | class AENS_I2V_MF(Attack): 142 | ''' 143 | The proposed adaptive I2V with multiple models and layers. 144 | Parameters: 145 | model_name_lists: the surrogate image model names. For example, model_name_lists = ['resnet', 'vgg', 'squeezenet', 'alexnet'] 146 | depths: the layers used in each model. For example, depths = {'resnet':[2,3], 'vgg':[2,3], 'squeezenet':[2,3], 'alexnet':[2,3]} 147 | step_size: the learning rate. 148 | Return: 149 | image_inps: video adversarial example. 150 | used_time: the time during attacking. 151 | cost_saved: the cost values of all steps 152 | ''' 153 | def __init__(self, model_name_lists, depths, step_size, momentum=0, coef_CE=False, epsilon=16/255, steps=60): 154 | super(AENS_I2V_MF, self).__init__("AENS_I2V_MF") 155 | self.epsilon = epsilon 156 | self.steps = steps 157 | self.step_size = step_size 158 | self.loss_info = {} 159 | self.depths = depths 160 | self.momentum = momentum 161 | self.coef_CE = coef_CE 162 | self.models = get_models(model_name_lists) 163 | self.model_names = model_name_lists 164 | 165 | self.coeffs = torch.ones(len(model_name_lists)*2).cuda() 166 | # print ('using image models:', model_name_lists) 167 | 168 | for i in range(len(self.models)): 169 | self.models[i].train() 170 | for m in self.models[i].modules(): 171 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 172 | m.eval() 173 | model_name = self.model_names[i] 174 | self._attention_hook(self.models[i], model_name) 175 | 176 | def _find_target_layer(self, model, model_name): 177 | used_depth = self.depths[model_name] 178 | if model_name == 'resnet': 179 | if isinstance(used_depth, list): 180 | return [getattr(model, 'layer{}'.format(this_depth))[-1] for this_depth in used_depth] 181 | else: 182 | return getattr(model, 'layer{}'.format(used_depth))[-1] 183 | elif model_name == 'alexnet': 184 | depth_to_layer = {1:1,2:4,3:7,4:11} 185 | if isinstance(used_depth, list): 186 | return [getattr(model, 'features')[depth_to_layer[this_depth]] for this_depth in used_depth] 187 | else: 188 | return getattr(model, 'features')[depth_to_layer[used_depth]] 189 | elif model_name == 'vgg': 190 | depth_to_layer = {1:1,2:11,3:20,4:29} 191 | if isinstance(used_depth, list): 192 | return [getattr(model, 'features')[depth_to_layer[this_depth]] for this_depth in used_depth] 193 | else: 194 | return getattr(model, 'features')[depth_to_layer[used_depth]] 195 | elif model_name == 'squeezenet': 196 | depth_to_layer = {1:3,2:6,3:9,4:12} 197 | if isinstance(used_depth, list): 198 | return [getattr(model, 'features')[depth_to_layer[this_depth]] for this_depth in used_depth] 199 | else: 200 | return getattr(model, 'features')[depth_to_layer[used_depth]].expand3x3_activation 201 | 202 | def _attention_hook(self, model, model_name): 203 | self.gradients = dict() 204 | self.gradients['value'] = [] 205 | self.activations = dict() 206 | self.activations['value'] = [] 207 | def backward_hook(module, grad_input, grad_output): 208 | self.gradients['value'] += [grad_output[0]] 209 | return None 210 | def forward_hook(module, input, output): 211 | self.activations['value'] += [output] 212 | return None 213 | target_layer = self._find_target_layer(model, model_name) 214 
| # print (target_layer) 215 | if isinstance(target_layer, list): 216 | for i in target_layer: 217 | i.register_forward_hook(forward_hook) 218 | i.register_backward_hook(backward_hook) 219 | else: 220 | target_layer.register_forward_hook(forward_hook) 221 | target_layer.register_backward_hook(backward_hook) 222 | 223 | def forward(self, videos, labels, video_names): 224 | batch_size = videos.shape[0] 225 | b,c,f,h,w = videos.shape 226 | videos = videos.cuda() 227 | labels = labels.cuda() 228 | self.weights = [] 229 | image_inps = videos.permute([0,2,1,3,4]) 230 | image_inps = image_inps.reshape(b*f, c, h, w) 231 | 232 | # define modifer that updated by optimizer. 233 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 234 | modifier = torch.nn.Parameter(modif, requires_grad=True) 235 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 236 | 237 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 238 | 239 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 240 | 241 | init_feature_maps = [] 242 | for n in range(len(self.models)): 243 | this_feature_maps = [] 244 | self.gradients = dict() 245 | self.gradients['value'] = [] 246 | self.activations = dict() 247 | self.activations['value'] = [] 248 | _ = self.models[n](image_inps) 249 | for mm in range(len(self.activations['value'])): 250 | activations = self.activations['value'][mm] 251 | activations = Variable(activations, requires_grad=False) 252 | this_feature_maps.append(activations) 253 | init_feature_maps.append(this_feature_maps) 254 | 255 | begin = time.time() 256 | cost_saved = np.zeros(self.steps) 257 | previous_cs_loss = torch.ones_like(self.coeffs) 258 | for i in range(self.steps): 259 | # self.gradients = dict() 260 | # self.gradients['value'] = [] 261 | # self.activations = dict() 262 | # self.activations['value'] = [] 263 | 264 | # update coeff 265 | self.coeffs = torch.softmax(torch.softmax(previous_cs_loss, dim=0) + self.momentum * self.coeffs, dim=0) 266 | self.weights.append(self.coeffs.clone().cpu().numpy()) 267 | # print (self.coeffs.clone().cpu().numpy()) 268 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 269 | true_image = self._transform_video(true_image, mode='forward') # norm 270 | 271 | losses = [] 272 | for n in range(len(self.models)): 273 | self.gradients = dict() 274 | self.gradients['value'] = [] 275 | self.activations = dict() 276 | self.activations['value'] = [] 277 | _ = self.models[n](true_image) 278 | this_losses = [] 279 | for mm in range(len(init_feature_maps[n])): 280 | activations = self.activations['value'][mm] 281 | init_activations = init_feature_maps[n][mm] 282 | this_dir = activations.view(b*f, -1) 283 | init_dir = init_activations.view(b*f, -1) 284 | this_loss = F.cosine_similarity(this_dir, init_dir) 285 | this_losses.append(this_loss) 286 | losses.append(torch.stack(this_losses)) # 2,32 287 | 288 | 289 | used_coeffs = torch.unsqueeze(self.coeffs, dim=1) # (lens_model*2) * 1 290 | each_features_loss = torch.sum(used_coeffs * torch.cat(losses, dim=0), dim=1) # 4*32 291 | cost = torch.mean(each_features_loss) 292 | 293 | if self.coef_CE: 294 | previous_cs_loss = each_features_loss.clone().detach() 295 | else: 296 | updated_features_loss = torch.sum(torch.cat(losses, dim=0).clone().detach(), dim=1) 297 | previous_cs_loss = updated_features_loss.clone().detach() 298 | 299 | # update previous_cs_loss 300 | 301 | # print (previous_cs_loss.clone().cpu().numpy()) 302 | 
# print (cost) 303 | optimizer.zero_grad() 304 | cost.backward() 305 | optimizer.step() 306 | 307 | cost_saved[i] = cost.detach().item() 308 | 309 | for ind,vid_name in enumerate(video_names): 310 | if vid_name not in self.loss_info.keys(): 311 | self.loss_info[vid_name] = {} 312 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 313 | 314 | used_time = time.time()-begin 315 | 316 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 317 | image_inps = self._transform_video(true_image, mode='forward') 318 | image_inps = image_inps.reshape(b,f,c,h,w) 319 | image_inps = image_inps.permute([0,2,1,3,4]) 320 | return image_inps, used_time, cost_saved 321 | -------------------------------------------------------------------------------- /transforms_ucf101.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | import numbers 5 | import collections 6 | import numpy as np 7 | import torch 8 | from PIL import Image, ImageOps 9 | try: 10 | import accimage 11 | except ImportError: 12 | accimage = None 13 | 14 | # the code from 3D-ResNets-PyTorch-master/temporal_transforms.py, spatial_transforms.py and target_transforms.py 15 | 16 | #******************************************** 17 | # temporal_transforms 18 | #******************************************** 19 | # LoopPadding: frame_indices < size, loop 20 | # TemporalBeginCrop: frame_indices[:size] < size, loop, [1:] 21 | # TemporalCenterCrop: frame_indice[center-size/2:center+size/2] < size, loop, [1:] 22 | # TemporalRandomCrop: frame_indice[random_begin,:random_begin+size] < size, loop, [1:] 23 | class LoopPadding(object): 24 | """ 25 | Variable size means that the length of temporal images we wanted. 26 | The length of variable out should be equal with variable size. 27 | If not, LoopPadding the temporal images. 28 | """ 29 | def __init__(self, size): 30 | self.size = size 31 | 32 | def __call__(self, frame_indices): 33 | out = frame_indices[1:self.size+1] 34 | 35 | for index in out: 36 | if len(out) >= self.size: 37 | break 38 | out.append(index) 39 | 40 | return out 41 | 42 | class TemporalBeginCrop(object): 43 | """ 44 | Temporally crop the given frame indices at a beginning. 45 | If the number of frames is less than the size, 46 | loop the indices as many times as necessary to satisfy the size. 47 | Args: 48 | size (int): Desired output size of the crop. 49 | """ 50 | 51 | def __init__(self, size): 52 | self.size = size 53 | 54 | def __call__(self, frame_indices): 55 | out = frame_indices[1:self.size+1] 56 | 57 | for index in out: 58 | if len(out) >= self.size: 59 | break 60 | out.append(index) 61 | return out 62 | 63 | class TemporalCenterCrop(object): 64 | """ 65 | Temporally crop the given frame indices at a center. 66 | If the number of frames is less than the size, 67 | loop the indices as many times as necessary to satisfy the size. 68 | Args: 69 | size (int): Desired output size of the crop. 70 | """ 71 | 72 | def __init__(self, size): 73 | self.size = size 74 | 75 | def __call__(self, frame_indices): 76 | """ 77 | Args: 78 | frame_indices (list): frame indices to be cropped. 79 | Returns: 80 | list: Cropped frame indices. 
81 | """ 82 | 83 | center_index = len(frame_indices) // 2 84 | begin_index = max(1, center_index - (self.size // 2)) 85 | end_index = min(begin_index + self.size, len(frame_indices)) 86 | 87 | out = frame_indices[begin_index:end_index] 88 | 89 | for index in out: 90 | if len(out) >= self.size: 91 | break 92 | out.append(index) 93 | 94 | return out 95 | 96 | class TemporalRandomCrop(object): 97 | """ 98 | Temporally crop the given frame indices at a random location. 99 | If the number of frames is less than the size, 100 | loop the indices as many times as necessary to satisfy the size. 101 | Args: 102 | size (int): Desired output size of the crop. 103 | """ 104 | 105 | def __init__(self, size): 106 | self.size = size 107 | 108 | def __call__(self, frame_indices): 109 | """ 110 | Args: 111 | frame_indices (list): frame indices to be cropped. 112 | Returns: 113 | list: Cropped frame indices. 114 | """ 115 | 116 | rand_end = max(1, len(frame_indices) - self.size - 1) 117 | random.seed(1024) 118 | begin_index = random.randint(0, rand_end) 119 | end_index = min(begin_index + self.size, len(frame_indices)) 120 | 121 | out = frame_indices[begin_index:end_index] 122 | 123 | for index in out: 124 | if len(out) >= self.size: 125 | break 126 | out.append(index) 127 | 128 | return out 129 | 130 | #******************************************** 131 | # spatial_transforms 132 | #******************************************** 133 | # spatial_Compose: combine multiple spatial transforms function. 134 | # ToTensor: np.array, Image.image to tensor, H*W*C(0-255) to C*H*W (0.0-1.0), pixel/norm_value, for tensor. 135 | # Normalize: (pixel-mean)/std, for tensor. 136 | # Scale: Keeping aspect ratio unchanged, scaled the smaller side, for Image. 137 | # CenterCrop: for Image. 138 | # RandomHorizontalFlip: Horizontally flip a image by a probability 0.5. 139 | # MultiScaleCornerCrop: Multiple Scales from 4 corners and 1 center. 140 | # MultiScaleRandomCrop: Multiple Scales from random position. 141 | 142 | class spatial_Compose(object): 143 | """Composes several transforms together. 144 | Args: 145 | transforms (list of ``Transform`` objects): list of transforms to compose. 146 | Example: 147 | >>> transforms.Compose([ 148 | >>> transforms.CenterCrop(10), 149 | >>> transforms.ToTensor(), 150 | >>> ]) 151 | """ 152 | 153 | def __init__(self, transforms): 154 | self.transforms = transforms 155 | 156 | def __call__(self, img): 157 | for t in self.transforms: 158 | img = t(img) 159 | return img 160 | 161 | def randomize_parameters(self): 162 | for t in self.transforms: 163 | t.randomize_parameters() 164 | 165 | class ToTensor(object): 166 | """ 167 | Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 168 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 169 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 170 | """ 171 | 172 | def __init__(self, norm_value=255): 173 | self.norm_value = norm_value 174 | 175 | def __call__(self, pic): 176 | """ 177 | Args: 178 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 179 | Returns: 180 | Tensor: Converted image. 
181 | """ 182 | if isinstance(pic, np.ndarray): 183 | # handle numpy array 184 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 185 | # backward compatibility 186 | return img.float().div(self.norm_value) 187 | 188 | if accimage is not None and isinstance(pic, accimage.Image): 189 | nppic = np.zeros( 190 | [pic.channels, pic.height, pic.width], dtype=np.float32) 191 | pic.copyto(nppic) 192 | return torch.from_numpy(nppic) 193 | 194 | # handle PIL Image 195 | if pic.mode == 'I': 196 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 197 | elif pic.mode == 'I;16': 198 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 199 | else: 200 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 201 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 202 | if pic.mode == 'YCbCr': 203 | nchannel = 3 204 | elif pic.mode == 'I;16': 205 | nchannel = 1 206 | else: 207 | nchannel = len(pic.mode) 208 | img = img.view(pic.size[1], pic.size[0], nchannel) 209 | # put it from HWC to CHW format 210 | # yikes, this transpose takes 80% of the loading time/CPU 211 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 212 | if isinstance(img, torch.ByteTensor): 213 | return img.float().div(self.norm_value) 214 | else: 215 | return img 216 | 217 | def randomize_parameters(self): 218 | pass 219 | 220 | class Normalize(object): 221 | """ 222 | Normalize an tensor image with mean and standard deviation. 223 | Given mean: (R, G, B) and std: (R, G, B), 224 | will normalize each channel of the torch.*Tensor, i.e. 225 | channel = (channel - mean) / std 226 | Args: 227 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 228 | std (sequence): Sequence of standard deviations for R, G, B channels 229 | respecitvely. 230 | """ 231 | 232 | def __init__(self, mean, std): 233 | self.mean = mean 234 | self.std = std 235 | 236 | def __call__(self, tensor): 237 | """ 238 | Args: 239 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 240 | Returns: 241 | Tensor: Normalized image. 242 | """ 243 | # TODO: make efficient 244 | for t, m, s in zip(tensor, self.mean, self.std): 245 | t.sub_(m).div_(s) 246 | return tensor 247 | 248 | def randomize_parameters(self): 249 | pass 250 | 251 | 252 | class Scale(object): 253 | """Rescale the input PIL.Image to the given size. 254 | Args: 255 | size (sequence or int): Desired output size. If size is a sequence like 256 | (w, h), output size will be matched to this. If size is an int, 257 | smaller edge of the image will be matched to this number. 258 | i.e, if height > width, then image will be rescaled to 259 | (size * height / width, size) 260 | interpolation (int, optional): Desired interpolation. Default is 261 | ``PIL.Image.BILINEAR`` 262 | """ 263 | 264 | def __init__(self, size, interpolation=Image.BILINEAR): 265 | assert isinstance(size, 266 | int) or (isinstance(size, collections.Iterable) and 267 | len(size) == 2) 268 | self.size = size 269 | self.interpolation = interpolation 270 | 271 | def __call__(self, img): 272 | """ 273 | Args: 274 | img (PIL.Image): Image to be scaled. 275 | Returns: 276 | PIL.Image: Rescaled image. 
277 | """ 278 | if isinstance(self.size, int): 279 | w, h = img.size 280 | if (w <= h and w == self.size) or (h <= w and h == self.size): 281 | return img 282 | if w < h: 283 | ow = self.size 284 | oh = int(self.size * h / w) 285 | return img.resize((ow, oh), self.interpolation) 286 | else: 287 | oh = self.size 288 | ow = int(self.size * w / h) 289 | return img.resize((ow, oh), self.interpolation) 290 | else: 291 | return img.resize(self.size, self.interpolation) 292 | 293 | def randomize_parameters(self): 294 | pass 295 | 296 | 297 | class CenterCrop(object): 298 | """Crops the given PIL.Image at the center. 299 | Args: 300 | size (sequence or int): Desired output size of the crop. If size is an 301 | int instead of sequence like (h, w), a square crop (size, size) is 302 | made. 303 | """ 304 | 305 | def __init__(self, size): 306 | if isinstance(size, numbers.Number): 307 | self.size = (int(size), int(size)) 308 | else: 309 | self.size = size 310 | 311 | def __call__(self, img): 312 | """ 313 | Args: 314 | img (PIL.Image): Image to be cropped. 315 | Returns: 316 | PIL.Image: Cropped image. 317 | """ 318 | w, h = img.size 319 | th, tw = self.size 320 | x1 = int(round((w - tw) / 2.)) 321 | y1 = int(round((h - th) / 2.)) 322 | return img.crop((x1, y1, x1 + tw, y1 + th)) 323 | 324 | def randomize_parameters(self): 325 | pass 326 | 327 | 328 | class CornerCrop(object): 329 | 330 | def __init__(self, size, crop_position=None): 331 | self.size = size 332 | if crop_position is None: 333 | self.randomize = True 334 | else: 335 | self.randomize = False 336 | self.crop_position = crop_position 337 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 338 | 339 | def __call__(self, img): 340 | image_width = img.size[0] 341 | image_height = img.size[1] 342 | 343 | if self.crop_position == 'c': 344 | th, tw = (self.size, self.size) 345 | x1 = int(round((image_width - tw) / 2.)) 346 | y1 = int(round((image_height - th) / 2.)) 347 | x2 = x1 + tw 348 | y2 = y1 + th 349 | elif self.crop_position == 'tl': 350 | x1 = 0 351 | y1 = 0 352 | x2 = self.size 353 | y2 = self.size 354 | elif self.crop_position == 'tr': 355 | x1 = image_width - self.size 356 | y1 = 0 357 | x2 = image_width 358 | y2 = self.size 359 | elif self.crop_position == 'bl': 360 | x1 = 0 361 | y1 = image_height - self.size 362 | x2 = self.size 363 | y2 = image_height 364 | elif self.crop_position == 'br': 365 | x1 = image_width - self.size 366 | y1 = image_height - self.size 367 | x2 = image_width 368 | y2 = image_height 369 | 370 | img = img.crop((x1, y1, x2, y2)) 371 | 372 | return img 373 | 374 | def randomize_parameters(self): 375 | if self.randomize: 376 | random.seed(1024) 377 | self.crop_position = self.crop_positions[random.randint( 378 | 0, 379 | len(self.crop_positions) - 1)] 380 | 381 | 382 | class RandomHorizontalFlip(object): 383 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 384 | 385 | def __call__(self, img): 386 | """ 387 | Args: 388 | img (PIL.Image): Image to be flipped. 389 | Returns: 390 | PIL.Image: Randomly flipped image. 391 | """ 392 | if self.p < 0.5: 393 | return img.transpose(Image.FLIP_LEFT_RIGHT) 394 | return img 395 | 396 | def randomize_parameters(self): 397 | random.seed(1024) 398 | self.p = random.random() 399 | 400 | 401 | class MultiScaleCornerCrop(object): 402 | """Crop the given PIL.Image to randomly selected size. 403 | A crop of size is selected from scales of the original size. 404 | A position of cropping is randomly selected from 4 corners and 1 center. 
405 | This crop is finally resized to given size. 406 | Args: 407 | scales: cropping scales of the original size 408 | size: size of the smaller edge 409 | interpolation: Default: PIL.Image.BILINEAR 410 | """ 411 | 412 | def __init__(self, 413 | scales, 414 | size, 415 | interpolation=Image.BILINEAR, 416 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 417 | self.scales = scales 418 | self.size = size 419 | self.interpolation = interpolation 420 | 421 | self.crop_positions = crop_positions 422 | 423 | def __call__(self, img): 424 | min_length = min(img.size[0], img.size[1]) 425 | crop_size = int(min_length * self.scale) 426 | 427 | image_width = img.size[0] 428 | image_height = img.size[1] 429 | 430 | if self.crop_position == 'c': 431 | center_x = image_width // 2 432 | center_y = image_height // 2 433 | box_half = crop_size // 2 434 | x1 = center_x - box_half 435 | y1 = center_y - box_half 436 | x2 = center_x + box_half 437 | y2 = center_y + box_half 438 | elif self.crop_position == 'tl': 439 | x1 = 0 440 | y1 = 0 441 | x2 = crop_size 442 | y2 = crop_size 443 | elif self.crop_position == 'tr': 444 | x1 = image_width - crop_size 445 | y1 = 0 446 | x2 = image_width 447 | y2 = crop_size 448 | elif self.crop_position == 'bl': 449 | x1 = 0 450 | y1 = image_height - crop_size 451 | x2 = crop_size 452 | y2 = image_height 453 | elif self.crop_position == 'br': 454 | x1 = image_width - crop_size 455 | y1 = image_height - crop_size 456 | x2 = image_width 457 | y2 = image_height 458 | 459 | img = img.crop((x1, y1, x2, y2)) 460 | 461 | return img.resize((self.size, self.size), self.interpolation) 462 | 463 | def randomize_parameters(self): 464 | random.seed(1024) 465 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 466 | random.seed(1024) 467 | self.crop_position = self.crop_positions[random.randint( 468 | 0, 469 | len(self.crop_positions) - 1)] 470 | 471 | 472 | class MultiScaleRandomCrop(object): 473 | """ 474 | Crop the given PIL.Image to randomly selected size. 475 | A position of cropping is randomly selected. 
476 | """ 477 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 478 | self.scales = scales 479 | self.size = size 480 | self.interpolation = interpolation 481 | 482 | def __call__(self, img): 483 | min_length = min(img.size[0], img.size[1]) 484 | crop_size = int(min_length * self.scale) 485 | 486 | image_width = img.size[0] 487 | image_height = img.size[1] 488 | 489 | x1 = self.tl_x * (image_width - crop_size) 490 | y1 = self.tl_y * (image_height - crop_size) 491 | x2 = x1 + crop_size 492 | y2 = y1 + crop_size 493 | 494 | img = img.crop((x1, y1, x2, y2)) 495 | 496 | return img.resize((self.size, self.size), self.interpolation) 497 | 498 | def randomize_parameters(self): 499 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 500 | random.seed(1024) 501 | self.tl_x = random.random() 502 | random.seed(1024) 503 | self.tl_y = random.random() 504 | -------------------------------------------------------------------------------- /kinetics400_attack_samples.csv: -------------------------------------------------------------------------------- 1 | path,gt_label,clip_index 2 | abseiling/YqTT34PsD5c_000003_000013.mp4,0,-1 3 | air drumming/--nQbRBEz2s_000104_000114.mp4,1,-1 4 | answering questions/AqPeHqTDfGE_000068_000078.mp4,2,-1 5 | applauding/ieIq7ym_UXQ_000007_000017.mp4,3,-1 6 | applying cream/rFee2NCkWQE_000013_000023.mp4,4,-1 7 | archery/FzfqEd36YbY_001101_001111.mp4,5,-1 8 | arm wrestling/_Lo3hFbum_o_000005_000015.mp4,6,-1 9 | arranging flowers/cuOsRai-HCE_000126_000136.mp4,7,-1 10 | assembling computer/CB1iIWtDpSI_000431_000441.mp4,8,-1 11 | auctioning/6jcDGC4LF5s_001022_001032.mp4,9,-1 12 | baby waking up/X66nHKtYtt0_000008_000018.mp4,10,-1 13 | baking cookies/LG2hEf9ueAM_000178_000188.mp4,11,-1 14 | balloon blowing/qjyGo-e_d6I_000076_000086.mp4,12,-1 15 | bandaging/TwSUMZOrLyE_000139_000149.mp4,13,-1 16 | barbequing/_7mBVhDgiO8_000019_000029.mp4,14,-1 17 | bartending/TuCc2RwG2fM_000354_000364.mp4,15,-1 18 | beatboxing/rYnVViYbae0_000000_000010.mp4,16,-1 19 | bee keeping/EpbYRgIsQRg_000018_000028.mp4,17,-1 20 | belly dancing/cMVkWCb3fE8_000103_000113.mp4,18,-1 21 | bench pressing/ehxWC3nDZC8_000003_000013.mp4,19,-1 22 | bending back/DA8K3c4HgVo_000003_000013.mp4,20,-1 23 | bending metal/U-iQuIgd5ps_000478_000488.mp4,21,-1 24 | biking through snow/g3GJvDqtfys_000031_000041.mp4,22,-1 25 | blasting sand/NAqrwQ54ptY_000028_000038.mp4,23,-1 26 | blowing glass/p2NEs8gon0k_000381_000391.mp4,24,-1 27 | blowing leaves/wKgo6AS5C80_000044_000054.mp4,25,-1 28 | blowing nose/gu0QuD4zpzg_000030_000040.mp4,26,-1 29 | blowing out candles/9IzWImcF3hM_000032_000042.mp4,27,-1 30 | bobsledding/uftReOMM9-A_000063_000073.mp4,28,-1 31 | bookbinding/hmoPcSFBYPY_000222_000232.mp4,29,-1 32 | bouncing on trampoline/bekao5nG02M_000024_000034.mp4,30,-1 33 | bowling/8pBjZcOc8MY_000096_000106.mp4,31,-1 34 | braiding hair/-dLVSg5JvxY_000022_000032.mp4,32,-1 35 | breading or breadcrumbing/-_3E3GBXAUc_000010_000020.mp4,33,-1 36 | breakdancing/4T2F4PQ97GE_000008_000018.mp4,34,-1 37 | brush painting/YHUeGa8Eu70_000225_000235.mp4,35,-1 38 | brushing hair/9LnZrptwj6Q_000314_000324.mp4,36,-1 39 | brushing teeth/8NStNQyjIXI_000054_000064.mp4,37,-1 40 | building cabinet/CG9DKR4lPC0_001821_001831.mp4,38,-1 41 | building shed/6q-XsQgZ8_w_000044_000054.mp4,39,-1 42 | bungee jumping/7Goki93f5mo_000018_000028.mp4,40,-1 43 | busking/CU6MFCvEct0_000016_000026.mp4,41,-1 44 | canoeing or kayaking/cdDu63UKbu0_000164_000174.mp4,42,-1 45 | capoeira/lm6ibanrGK8_000000_000010.mp4,43,-1 46 | carrying 
baby/ztAfXKZ0ovM_000141_000151.mp4,44,-1 47 | cartwheeling/EiZvgwrHCMk_000000_000010.mp4,45,-1 48 | carving pumpkin/oPoLYdOTOt0_000000_000010.mp4,46,-1 49 | catching fish/dpfJTo3nywA_000028_000038.mp4,47,-1 50 | catching or throwing baseball/AsPjORZU-cU_000055_000065.mp4,48,-1 51 | catching or throwing frisbee/RxgW7Hdn4YM_000006_000016.mp4,49,-1 52 | catching or throwing softball/kU3qQGVRT-g_000011_000021.mp4,50,-1 53 | celebrating/2lBUaUBD9JE_000018_000028.mp4,51,-1 54 | changing oil/-aJHPlJTesM_000734_000744.mp4,52,-1 55 | changing wheel/EQNTFw62uh8_000251_000261.mp4,53,-1 56 | checking tires/wUQduZ3i-VM_000275_000285.mp4,54,-1 57 | cheerleading/6LOV6-dkNZE_000251_000261.mp4,55,-1 58 | chopping wood/nyFulYDEKFs_000017_000027.mp4,56,-1 59 | clapping/M9NORCUCrtE_000003_000013.mp4,57,-1 60 | clay pottery making/PP7MtP6BMkY_000193_000203.mp4,58,-1 61 | clean and jerk/R6pk7NDa7Mw_000015_000025.mp4,59,-1 62 | cleaning floor/vVlrGgL9dxk_000004_000014.mp4,60,-1 63 | cleaning gutters/22xdXMMq6XE_000040_000050.mp4,61,-1 64 | cleaning pool/AFvEYQkSmfk_000123_000133.mp4,62,-1 65 | cleaning shoes/WJEGNo9YETM_000203_000213.mp4,63,-1 66 | cleaning toilet/BjS2g1oZj_s_000065_000075.mp4,64,-1 67 | cleaning windows/OiN3AgBVB80_000003_000013.mp4,65,-1 68 | climbing a rope/NfH4FZhrtvE_000002_000012.mp4,66,-1 69 | climbing ladder/70Er7J3srS0_000001_000011.mp4,67,-1 70 | climbing tree/aM1AgHyvm4E_000017_000027.mp4,68,-1 71 | contact juggling/yymr4YWVFe4_000046_000056.mp4,69,-1 72 | cooking chicken/pj8TWS7KEeY_000024_000034.mp4,70,-1 73 | cooking egg/Ao3M2TPI3sQ_000294_000304.mp4,71,-1 74 | cooking on campfire/BQfDmW1Nodk_000002_000012.mp4,72,-1 75 | cooking sausages/52AOa09jJWs_000195_000205.mp4,73,-1 76 | counting money/kPCbWDyAcFE_000000_000010.mp4,74,-1 77 | country line dancing/suHCOVoGPMU_000475_000485.mp4,75,-1 78 | cracking neck/j_EiZph3YKE_000001_000011.mp4,76,-1 79 | crawling baby/GpPvqvsqGy0_000006_000016.mp4,77,-1 80 | crossing river/luTkBLIT6lU_000036_000046.mp4,78,-1 81 | crying/zCEEKnSB_RU_000000_000010.mp4,79,-1 82 | curling hair/gwNMVUlBUtY_000068_000078.mp4,80,-1 83 | cutting nails/es35biYvLRA_000020_000030.mp4,81,-1 84 | cutting pineapple/T5jQWQg2eNc_000000_000010.mp4,82,-1 85 | cutting watermelon/LBgRTCVwyik_000042_000052.mp4,83,-1 86 | dancing ballet/s_gGtYIrtsc_000118_000128.mp4,84,-1 87 | dancing charleston/FQpLIyAfbqI_000023_000033.mp4,85,-1 88 | dancing gangnam style/o_TIgx4gb_M_000023_000033.mp4,86,-1 89 | dancing macarena/dXIyWMidYa0_000008_000018.mp4,87,-1 90 | deadlifting/zvamd5T7yj8_000001_000011.mp4,88,-1 91 | decorating the christmas tree/kQDSa-xhsLY_000035_000045.mp4,89,-1 92 | digging/42Vx9FGzmkM_000075_000085.mp4,90,-1 93 | dining/-vOrVT1CiPQ_000080_000090.mp4,91,-1 94 | disc golfing/_owWHGvn_b0_000112_000122.mp4,92,-1 95 | diving cliff/1MmjE51PeIE_000015_000025.mp4,93,-1 96 | dodgeball/wFIuMu2w9pA_000010_000020.mp4,94,-1 97 | doing aerobics/-53DvfE42gE_001767_001777.mp4,95,-1 98 | doing laundry/qkd7laDeom0_000098_000108.mp4,96,-1 99 | doing nails/UixL7lHSHR8_000040_000050.mp4,97,-1 100 | drawing/IPmic5VRb7I_000066_000076.mp4,98,-1 101 | dribbling basketball/qoODmONT1a0_000019_000029.mp4,99,-1 102 | drinking/15FiZ48tTUU_000045_000055.mp4,100,-1 103 | drinking beer/382B3Q3xttk_000000_000010.mp4,101,-1 104 | drinking shots/o1hqepKau4A_000004_000014.mp4,102,-1 105 | driving car/NUG7kwJ-614_000400_000410.mp4,103,-1 106 | driving tractor/WtnQKvOuukE_000081_000091.mp4,104,-1 107 | drop kicking/pvuiN-G8-yc_000000_000010.mp4,105,-1 108 | drumming 
fingers/eap32WOJcAU_000108_000118.mp4,106,-1 109 | dunking basketball/WC2FOUSNyvE_000006_000016.mp4,107,-1 110 | dying hair/-7E9WiX7QfA_000053_000063.mp4,108,-1 111 | eating burger/w9G7CpkBBM0_000000_000010.mp4,109,-1 112 | eating cake/8QhblWHnNAY_000019_000029.mp4,110,-1 113 | eating carrots/V4IaThkaK6Y_000025_000035.mp4,111,-1 114 | eating chips/I5Y53-Q9KRo_000444_000454.mp4,112,-1 115 | eating doughnuts/HyUF0Uo0f2A_000077_000087.mp4,113,-1 116 | eating hotdog/FTOgHjhqlhU_000054_000064.mp4,114,-1 117 | eating ice cream/0fCDlKYkRxc_000081_000091.mp4,115,-1 118 | eating spaghetti/DiSP2oDGQ1Q_000014_000024.mp4,116,-1 119 | eating watermelon/pLA62YSoEoM_000002_000012.mp4,117,-1 120 | egg hunting/U9vSW3-zJ9s_000007_000017.mp4,118,-1 121 | exercising arm/0wZpjStZtUY_000001_000011.mp4,119,-1 122 | exercising with an exercise ball/oj7Qgyz5KK8_000143_000153.mp4,120,-1 123 | extinguishing fire/BVXG_JOh9jQ_000002_000012.mp4,121,-1 124 | faceplanting/petld-72OXM_000001_000011.mp4,122,-1 125 | feeding birds/QJSwBNxKYqg_000120_000130.mp4,123,-1 126 | feeding fish/ZtkTAHzih9Q_000084_000094.mp4,124,-1 127 | feeding goats/v5Bl68y5ra0_000006_000016.mp4,125,-1 128 | filling eyebrows/XycmcISYPA8_000045_000055.mp4,126,-1 129 | finger snapping/j6qYhS2W1fM_000001_000011.mp4,127,-1 130 | fixing hair/-65aI53dvdE_000022_000032.mp4,128,-1 131 | flipping pancake/HIBxq2P0BL0_000004_000014.mp4,129,-1 132 | flying kite/hAQJ9GHklS4_000004_000014.mp4,130,-1 133 | folding clothes/HvbmGxDuNxs_000035_000045.mp4,131,-1 134 | folding napkins/iCtT6ZadoOM_000052_000062.mp4,132,-1 135 | folding paper/soHl6SrXlEI_000105_000115.mp4,133,-1 136 | front raises/ObO_Gnw1nOQ_000005_000015.mp4,134,-1 137 | frying vegetables/1IDdvXnTI60_000123_000133.mp4,135,-1 138 | garbage collecting/KxTIEKllIzg_000114_000124.mp4,136,-1 139 | gargling/HAPBKE3Qo5A_000217_000227.mp4,137,-1 140 | getting a haircut/lVwFn9m8Q_Q_000053_000063.mp4,138,-1 141 | getting a tattoo/g8dOsqPBe7A_000657_000667.mp4,139,-1 142 | giving or receiving award/LmuS2GreXkc_000033_000043.mp4,140,-1 143 | golf chipping/NIf0bxodA9E_000120_000130.mp4,141,-1 144 | golf driving/1Q-E6UW1XE8_000011_000021.mp4,142,-1 145 | golf putting/VS9uEOvJhzg_000000_000010.mp4,143,-1 146 | grinding meat/SErnxQf4ONQ_000230_000240.mp4,144,-1 147 | grooming dog/Q9mt0lJjQUA_000105_000115.mp4,145,-1 148 | grooming horse/kaVWY-GyXcs_000063_000073.mp4,146,-1 149 | gymnastics tumbling/mlzx2bi9nwQ_000059_000069.mp4,147,-1 150 | hammer throw/WUrwglFhY64_000002_000012.mp4,148,-1 151 | headbanging/ZhDdQmHIM78_000044_000054.mp4,149,-1 152 | high jump/M2j1BTibIzs_000000_000010.mp4,151,-1 153 | high kick/NdjLKFhn9j0_000004_000014.mp4,152,-1 154 | hitting baseball/e8uB0GZsVOQ_000034_000044.mp4,153,-1 155 | hockey stop/Nrscg8fLYqY_000049_000059.mp4,154,-1 156 | holding snake/6cbXqLP0FHE_000002_000012.mp4,155,-1 157 | hopscotch/vxp0SOd2W1E_000002_000012.mp4,156,-1 158 | hoverboarding/E1Smsuf6cpE_000147_000157.mp4,157,-1 159 | hugging/xWyOTDxm9yQ_000009_000019.mp4,158,-1 160 | hula hooping/UjfYNVaZ39Y_000087_000097.mp4,159,-1 161 | hurdling/Xa6gI4yGLQo_000000_000010.mp4,160,-1 162 | hurling (sport)/ml2eBC_nXrw_000055_000065.mp4,161,-1 163 | ice climbing/UM1fUqvFnME_000048_000058.mp4,162,-1 164 | ice fishing/GO6YI36E_Do_000140_000150.mp4,163,-1 165 | ice skating/vMZLTP9MfZ4_000008_000018.mp4,164,-1 166 | ironing/ZgHZ0KgFOSc_000215_000225.mp4,165,-1 167 | javelin throw/E5xdkQvnhkc_000002_000012.mp4,166,-1 168 | jetskiing/Be59Cot2yGI_000233_000243.mp4,167,-1 169 | jogging/kBUt5duOHFU_000005_000015.mp4,168,-1 170 | 
juggling balls/YH801xSLkZM_000000_000010.mp4,169,-1 171 | juggling fire/TA2mmXre8HQ_000000_000010.mp4,170,-1 172 | juggling soccer ball/WAPctsQ-SwM_000000_000010.mp4,171,-1 173 | jumping into pool/kjzgLLaYO8w_000010_000020.mp4,172,-1 174 | jumpstyle dancing/QeG2HREr6m0_000003_000013.mp4,173,-1 175 | kicking field goal/sR0oOq-qOqs_000015_000025.mp4,174,-1 176 | kicking soccer ball/5PML0iLnBD8_000003_000013.mp4,175,-1 177 | kissing/LmPjkroyPcY_000739_000749.mp4,176,-1 178 | kitesurfing/KOOfe61BIyE_000023_000033.mp4,177,-1 179 | knitting/bCa_5xZa4Ug_002346_002356.mp4,178,-1 180 | krumping/3JxrK2Jt52Y_000754_000764.mp4,179,-1 181 | laughing/UpVXo5Q9JKk_000079_000089.mp4,180,-1 182 | laying bricks/N4HdEYIci0I_000037_000047.mp4,181,-1 183 | long jump/MrlWkj87rfU_000002_000012.mp4,182,-1 184 | lunge/g-XXUD65DyI_000003_000013.mp4,183,-1 185 | making a cake/bX6I6jVAQMI_000028_000038.mp4,184,-1 186 | making a sandwich/jofgWiVBwqo_000086_000096.mp4,185,-1 187 | making bed/yD42KW6cm-A_000820_000830.mp4,186,-1 188 | making jewelry/wMWkwQ7HXik_000616_000626.mp4,187,-1 189 | making pizza/wxgqu30nSLE_000000_000010.mp4,188,-1 190 | making snowman/8kN7EyPBmrI_000082_000092.mp4,189,-1 191 | making sushi/Ah2YqA7bmHY_000055_000065.mp4,190,-1 192 | making tea/hs2MVCM2LdY_000043_000053.mp4,191,-1 193 | marching/_h60EbUbh3I_000026_000036.mp4,192,-1 194 | massaging back/zsJ2PmhGM98_000215_000225.mp4,193,-1 195 | massaging feet/BwMKdpNAmy4_000090_000100.mp4,194,-1 196 | massaging legs/0EJXIQ1ltjo_000013_000023.mp4,195,-1 197 | massaging person's head/z-6l_dkR3vE_000299_000309.mp4,196,-1 198 | milking cow/DdUTLqyZ5b8_000044_000054.mp4,197,-1 199 | mopping floor/-F-aEPmjERo_000043_000053.mp4,198,-1 200 | motorcycling/kthzjAS1XS8_000009_000019.mp4,199,-1 201 | moving furniture/b9vF-F1LC5g_000003_000013.mp4,200,-1 202 | mowing lawn/t5SHfHDj0uw_000006_000016.mp4,201,-1 203 | news anchoring/xJMgxnXI0GY_000000_000010.mp4,202,-1 204 | opening bottle/gWd5AU5wP0k_000041_000051.mp4,203,-1 205 | opening present/vd4uGb1162o_000002_000012.mp4,204,-1 206 | paragliding/GF4WEdN_H0s_000191_000201.mp4,205,-1 207 | parasailing/GuClMEvE3gM_000055_000065.mp4,206,-1 208 | parkour/ptgKO940ISM_000042_000052.mp4,207,-1 209 | passing American football (in game)/ixMPVi3Zr9s_000001_000011.mp4,208,-1 210 | passing American football (not in game)/RxO7IEU7_I8_000391_000401.mp4,209,-1 211 | peeling apples/8qEAQXckcVw_000003_000013.mp4,210,-1 212 | peeling potatoes/_3CsQJ6XpHo_000015_000025.mp4,211,-1 213 | petting animal (not cat)/tlWjTLpoWLw_000000_000010.mp4,212,-1 214 | petting cat/q1GijBRBqjE_000203_000213.mp4,213,-1 215 | picking fruit/NTfCraM0XyM_000257_000267.mp4,214,-1 216 | planting trees/_WzkPBxP-5g_000096_000106.mp4,215,-1 217 | plastering/mdN9BDP0cVY_000032_000042.mp4,216,-1 218 | playing accordion/syp1O0cjens_000038_000048.mp4,217,-1 219 | playing badminton/tJz980bJ3UI_000065_000075.mp4,218,-1 220 | playing bagpipes/fMeaggq0_rA_000032_000042.mp4,219,-1 221 | playing basketball/3mIvIgAlniY_000001_000011.mp4,220,-1 222 | playing bass guitar/HqsAvuo5XhA_000059_000069.mp4,221,-1 223 | playing cards/IVP8pO4Q8Hs_000084_000094.mp4,222,-1 224 | playing cello/rsN982-8cvg_000042_000052.mp4,223,-1 225 | playing chess/xFq-OJ8HDJs_000185_000195.mp4,224,-1 226 | playing clarinet/7g4aL1EX8EI_001210_001220.mp4,225,-1 227 | playing controller/gxbUZcsy4EA_000097_000107.mp4,226,-1 228 | playing cricket/lgPslaxBQt0_000000_000010.mp4,227,-1 229 | playing cymbals/--Y25nDn2Wk_000060_000070.mp4,228,-1 230 | playing 
didgeridoo/2ezT7E6g8Ew_000044_000054.mp4,229,-1 231 | playing drums/kXhnTK9TVsU_000076_000086.mp4,230,-1 232 | playing flute/wqYzrDwV_o4_000047_000057.mp4,231,-1 233 | playing guitar/ysjCIR7SkJU_000141_000151.mp4,232,-1 234 | playing harmonica/DpJQShJs2kI_000036_000046.mp4,233,-1 235 | playing harp/Ud-INZAw5Ik_000163_000173.mp4,234,-1 236 | playing ice hockey/kRWk_-5d5bs_000010_000020.mp4,235,-1 237 | playing keyboard/vxVoptVwZp4_000027_000037.mp4,236,-1 238 | playing kickball/d5TMlt6P-ug_000317_000327.mp4,237,-1 239 | playing monopoly/SsAtR4oD7WY_000000_000010.mp4,238,-1 240 | playing organ/b9TfeDnfemw_000047_000057.mp4,239,-1 241 | playing paintball/DOL1_JLWeoo_000321_000331.mp4,240,-1 242 | playing piano/l4zZtMgNPvU_000009_000019.mp4,241,-1 243 | playing poker/-0NQHRndkPI_000004_000014.mp4,242,-1 244 | playing recorder/Zl_ey-UqwpY_000001_000011.mp4,243,-1 245 | playing saxophone/K06EmNd6t_I_000006_000016.mp4,244,-1 246 | playing squash or racquetball/-yUM3WwKQHM_000032_000042.mp4,245,-1 247 | playing tennis/VoAJFfutNlg_000060_000070.mp4,246,-1 248 | playing trombone/vqNbapex1kU_000015_000025.mp4,247,-1 249 | playing trumpet/-BtzVCzSnLk_000073_000083.mp4,248,-1 250 | playing ukulele/vE6Cnt7XJrg_000026_000036.mp4,249,-1 251 | playing violin/t2XntpSO4Yo_000105_000115.mp4,250,-1 252 | playing volleyball/5Wle9ClW4q0_000170_000180.mp4,251,-1 253 | playing xylophone/N586DnjSCxo_000000_000010.mp4,252,-1 254 | pole vault/9g4Sf8aWIx8_000003_000013.mp4,253,-1 255 | presenting weather forecast/lVSiCfeBP8I_000152_000162.mp4,254,-1 256 | pull ups/yLVMDD7b0xM_000020_000030.mp4,255,-1 257 | pumping fist/V-IqR1THKr4_000015_000025.mp4,256,-1 258 | pumping gas/eanhmmKIolc_000044_000054.mp4,257,-1 259 | punching bag/3baFNAxC2YI_000012_000022.mp4,258,-1 260 | punching person (boxing)/D5iLGttoHr4_000022_000032.mp4,259,-1 261 | push up/-B2oGkg1qSI_000012_000022.mp4,260,-1 262 | pushing car/-46DNkpyApI_000045_000055.mp4,261,-1 263 | pushing cart/p9CIcEEaSEk_000001_000011.mp4,262,-1 264 | pushing wheelchair/5gQlgNS5qfY_000023_000033.mp4,263,-1 265 | reading book/XoO1uEVNgjM_000058_000068.mp4,264,-1 266 | reading newspaper/gKqKWn6Nl0A_000035_000045.mp4,265,-1 267 | recording music/864rV9vdAK4_000577_000587.mp4,266,-1 268 | riding a bike/Ig-eRsgi6CU_000339_000349.mp4,267,-1 269 | riding camel/bGzjObGU_qM_000014_000024.mp4,268,-1 270 | riding elephant/j06vowPye30_000009_000019.mp4,269,-1 271 | riding mechanical bull/eJpkgBaykQ8_000029_000039.mp4,270,-1 272 | riding mountain bike/O95dOpT9T-c_000039_000049.mp4,271,-1 273 | riding mule/azD58bwAe7E_000003_000013.mp4,272,-1 274 | riding or walking with horse/C9pFs8sDARw_000218_000228.mp4,273,-1 275 | riding scooter/FGCNMNjanO4_000013_000023.mp4,274,-1 276 | riding unicycle/9RN16I79P9U_000000_000010.mp4,275,-1 277 | ripping paper/-Ovwq0kVUx4_000002_000012.mp4,276,-1 278 | robot dancing/5hQW4BHjWvM_000061_000071.mp4,277,-1 279 | rock climbing/2jXlO2nzHGE_000026_000036.mp4,278,-1 280 | rock scissors paper/Kxbdg32t6bU_000001_000011.mp4,279,-1 281 | roller skating/_vjX5nPwTBs_000072_000082.mp4,280,-1 282 | running on treadmill/BrKuhHIHccg_000049_000059.mp4,281,-1 283 | sailing/h1SM1ArgB0E_000034_000044.mp4,282,-1 284 | salsa dancing/WW4N7GToB5I_000313_000323.mp4,283,-1 285 | sanding floor/EbXC4bGpZ4M_000034_000044.mp4,284,-1 286 | scrambling eggs/ojJpJZpACdE_000245_000255.mp4,285,-1 287 | scuba diving/64BhyrIZkz0_000002_000012.mp4,286,-1 288 | setting table/tSQqcJqGplA_000011_000021.mp4,287,-1 289 | shaking hands/lCQ17mGZeVE_000029_000039.mp4,288,-1 290 | shaking 
head/WUOxNQKdRMM_000065_000075.mp4,289,-1 291 | sharpening knives/iaQPoVg8Xtw_000468_000478.mp4,290,-1 292 | sharpening pencil/ZMMCn1JE0Vc_000001_000011.mp4,291,-1 293 | shaving head/K1C_jI8z1F8_000261_000271.mp4,292,-1 294 | shaving legs/zvjNnDhUTxE_000034_000044.mp4,293,-1 295 | shearing sheep/Vaff3l43A40_000018_000028.mp4,294,-1 296 | shining shoes/HhW13wPky1U_000556_000566.mp4,295,-1 297 | shooting basketball/Y3oHAIylSrg_000031_000041.mp4,296,-1 298 | shooting goal (soccer)/ezJdtQzJ7qI_000021_000031.mp4,297,-1 299 | shot put/SQddPtgoQGE_000007_000017.mp4,298,-1 300 | shoveling snow/SZRDWgGOpXY_000062_000072.mp4,299,-1 301 | shredding paper/KXyOXrWiJGY_000022_000032.mp4,300,-1 302 | shuffling cards/_k0w_3JFfmE_000026_000036.mp4,301,-1 303 | side kick/sZ8JiPfAoWc_000005_000015.mp4,302,-1 304 | sign language interpreting/fKvqQEGGf6E_000031_000041.mp4,303,-1 305 | singing/FZrg29zsAe8_000023_000033.mp4,304,-1 306 | situp/jTMZX30XTXA_000072_000082.mp4,305,-1 307 | skateboarding/kIzdzzMLCJI_000199_000209.mp4,306,-1 308 | ski jumping/XQUsRpJ1A_Y_000001_000011.mp4,307,-1 309 | skiing (not slalom or crosscountry)/fRiYQEVMcEc_000000_000010.mp4,308,-1 310 | skiing crosscountry/pfvt6iYSXXw_000764_000774.mp4,309,-1 311 | skiing slalom/Ch_wt_nV2k4_000702_000712.mp4,310,-1 312 | skipping rope/xzXJJIni2hQ_000238_000248.mp4,311,-1 313 | skydiving/5uw3m1tIvJ0_000057_000067.mp4,312,-1 314 | slacklining/iLj4i4fTzn0_000038_000048.mp4,313,-1 315 | sled dog racing/yMQMfdV-Fzs_000025_000035.mp4,315,-1 316 | smoking/xo_9xPRu7_4_000114_000124.mp4,316,-1 317 | smoking hookah/C_QP4vOVTrE_000164_000174.mp4,317,-1 318 | snatch weight lifting/827ciUyYK5k_000000_000010.mp4,318,-1 319 | sneezing/ce6aUvCKpbU_000000_000010.mp4,319,-1 320 | sniffing/u2s0kiGG7AU_000011_000021.mp4,320,-1 321 | snorkeling/rsDfe_ikY1I_000010_000020.mp4,321,-1 322 | snowboarding/4cjhTsZjNP8_000202_000212.mp4,322,-1 323 | snowkiting/ToDS3RIVybY_000025_000035.mp4,323,-1 324 | snowmobiling/EgkRnTkj8gc_000003_000013.mp4,324,-1 325 | somersaulting/hFYg1xqG5yk_000154_000164.mp4,325,-1 326 | spinning poi/by9gw0ipuUg_000002_000012.mp4,326,-1 327 | spray painting/zyR32Dm9yek_000019_000029.mp4,327,-1 328 | spraying/LJQX3Atdn4k_000043_000053.mp4,328,-1 329 | springboard diving/T3b1nxhG9Lo_000026_000036.mp4,329,-1 330 | squat/ENkU87uTdfU_000025_000035.mp4,330,-1 331 | sticking tongue out/E6ZgFC1L178_000041_000051.mp4,331,-1 332 | stomping grapes/gzAmaRypLyI_000062_000072.mp4,332,-1 333 | stretching arm/qKiTc6GGT4c_000036_000046.mp4,333,-1 334 | stretching leg/-hkrPB2YU50_000612_000622.mp4,334,-1 335 | strumming guitar/-2GJPqAglxU_000862_000872.mp4,335,-1 336 | surfing crowd/tpruGil1UCs_000038_000048.mp4,336,-1 337 | surfing water/-G_tgkmqChg_000072_000082.mp4,337,-1 338 | sweeping floor/bHy05OAiL1g_000027_000037.mp4,338,-1 339 | swimming backstroke/7aGVsi5ZgMI_000023_000033.mp4,339,-1 340 | swimming breast stroke/BX-dyfoGFsE_000168_000178.mp4,340,-1 341 | swimming butterfly stroke/aAYaI35qR5Q_000005_000015.mp4,341,-1 342 | swing dancing/LVyo14Q5PmY_000006_000016.mp4,342,-1 343 | swinging legs/W52Cl1ed1LU_000000_000010.mp4,343,-1 344 | swinging on something/ibRyH1Q1bbo_000066_000076.mp4,344,-1 345 | sword fighting/_kcVbo4E2JQ_000101_000111.mp4,345,-1 346 | tai chi/BbFbo987QEo_000057_000067.mp4,346,-1 347 | taking a shower/zW5Gt8bfZbc_000011_000021.mp4,347,-1 348 | tango dancing/dFDdr9zxfzc_000101_000111.mp4,348,-1 349 | tap dancing/dZ1EkA3BuQ4_000000_000010.mp4,349,-1 350 | tapping guitar/7Nqupt1WIn4_000031_000041.mp4,350,-1 351 | tapping 
pen/xcsH3jFtdSg_000026_000036.mp4,351,-1 352 | tasting beer/UcAAItCUJrk_000518_000528.mp4,352,-1 353 | tasting food/SQCsXDtiARU_000332_000342.mp4,353,-1 354 | testifying/O4ystlpCCxM_000010_000020.mp4,354,-1 355 | texting/V_LQXRQVrok_000116_000126.mp4,355,-1 356 | throwing axe/1YmkhTmmyRc_000002_000012.mp4,356,-1 357 | throwing ball/A6JeTfQqm0I_000000_000010.mp4,357,-1 358 | throwing discus/gjNdAGf_16Y_000084_000094.mp4,358,-1 359 | tickling/W9Ydqjoda9c_000000_000010.mp4,359,-1 360 | tobogganing/SWmBChx-7fI_000003_000013.mp4,360,-1 361 | tossing coin/Iwg-Had3-wE_000002_000012.mp4,361,-1 362 | tossing salad/PcbZKLvO6gc_000181_000191.mp4,362,-1 363 | training dog/am3gcomIUa4_000048_000058.mp4,363,-1 364 | trapezing/p-E3XWgf3Wk_000012_000022.mp4,364,-1 365 | trimming or shaving beard/X9mmMztC1Vo_000366_000376.mp4,365,-1 366 | trimming trees/Q_F85_VgKwM_000045_000055.mp4,366,-1 367 | triple jump/UergZFP-AdM_000002_000012.mp4,367,-1 368 | tying bow tie/c-t7EA00jj8_000016_000026.mp4,368,-1 369 | tying knot (not on a tie)/Vdx6g26ZOE0_000002_000012.mp4,369,-1 370 | tying tie/DIrYfnfogiA_000148_000158.mp4,370,-1 371 | unboxing/5lmmjOhih3U_000046_000056.mp4,371,-1 372 | unloading truck/-aKzhHxNXDo_000066_000076.mp4,372,-1 373 | using computer/5R1KJn3Pqa8_000066_000076.mp4,373,-1 374 | using remote controller (not gaming)/Bj7_KWKEXp8_000046_000056.mp4,374,-1 375 | using segway/QQTUQu4emh8_000145_000155.mp4,375,-1 376 | vault/A4U2LxAwIm4_000031_000041.mp4,376,-1 377 | waiting in line/5V_Ed93k2bI_000059_000069.mp4,377,-1 378 | walking the dog/3NlgmP6MDmY_000021_000031.mp4,378,-1 379 | washing dishes/oEkXkrSbFU8_000052_000062.mp4,379,-1 380 | washing feet/n3vpap_pQ-U_000076_000086.mp4,380,-1 381 | washing hair/BhU4HGJ2q4s_000004_000014.mp4,381,-1 382 | washing hands/-jtKtX9gGdY_000005_000015.mp4,382,-1 383 | water skiing/F1KYDfTyuEI_000040_000050.mp4,383,-1 384 | water sliding/N6lBqLeKs8I_000001_000011.mp4,384,-1 385 | watering plants/jZfXAIU4rZ4_000073_000083.mp4,385,-1 386 | waxing back/P5qR6CoGbk8_000035_000045.mp4,386,-1 387 | waxing chest/oRKbez1LpWU_000080_000090.mp4,387,-1 388 | waxing eyebrows/hjzI8c63hVo_000011_000021.mp4,388,-1 389 | waxing legs/dzeivZlP6tU_000024_000034.mp4,389,-1 390 | weaving basket/oD0wHopSNLU_000000_000010.mp4,390,-1 391 | welding/5hSYP2XxBGY_000204_000214.mp4,391,-1 392 | whistling/KFOWZBfLHrA_000084_000094.mp4,392,-1 393 | windsurfing/nDlR90yHqPY_000112_000122.mp4,393,-1 394 | wrapping present/zYjHJNadEj4_000246_000256.mp4,394,-1 395 | wrestling/UP_iRJv5mPU_000150_000160.mp4,395,-1 396 | writing/OrWjyz2bFJQ_000064_000074.mp4,396,-1 397 | yawning/SaJWnqViSLo_000023_000033.mp4,397,-1 398 | yoga/5NysTi21_D0_000003_000013.mp4,398,-1 399 | zumba/BvO4NNTw7Ks_000094_000104.mp4,399,-1 400 | headbutting/PzD2BkZye2U_000013_000023.mp4,150,6 401 | slapping/WGxSNBg_tl0_000075_000085.mp4,314,9 402 | -------------------------------------------------------------------------------- /image_attacks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | import torchvision.models as models 5 | import random 6 | 7 | from image_cam import GradCAM 8 | from torch.autograd import Variable 9 | from image_cam_utils import find_alexnet_layer, find_vgg_layer, find_resnet_layer, find_densenet_layer, find_squeezenet_layer 10 | import pickle as pkl 11 | 12 | class Attack(object): 13 | """ 14 | Base class for all attacks. 15 | .. 
note:: 16 | It automatically set device to the device where given model is. 17 | It temporarily changes the model's training mode to `test` 18 | by `.eval()` only during an attack process. 19 | """ 20 | def __init__(self, name, model=None): 21 | r""" 22 | Initializes internal attack state. 23 | Arguments: 24 | name (str) : name of an attack. 25 | model (torch.nn.Module): model to attack. 26 | """ 27 | self.attack = name 28 | self.model = model 29 | self.model_name = str(model).split("(")[0] 30 | 31 | # mean and std values are used in pytorch pretrained models 32 | # they are also used in Kinetics-400. 33 | self.mean = [0.485, 0.456, 0.406] 34 | self.std = [0.229, 0.224, 0.225] 35 | 36 | def forward(self, *input): 37 | r""" 38 | It defines the computation performed at every call (attack forward). 39 | Should be overridden by all subclasses. 40 | """ 41 | raise NotImplementedError 42 | 43 | def _transform_perts(self, perts): 44 | dtype = perts.dtype 45 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 46 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 47 | perts.div_(std[:, None, None]) 48 | return perts 49 | 50 | def _transform_video(self, video, mode='forward'): 51 | r''' 52 | Transform the video into [0, 1] 53 | ''' 54 | dtype = video.dtype 55 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 56 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 57 | if mode == 'forward': 58 | # [-mean/std, mean/std] 59 | video.sub_(mean[:, None, None]).div_(std[:, None, None]) 60 | elif mode == 'back': 61 | # [0, 1] 62 | video.mul_(std[:, None, None]).add_(mean[:, None, None]) 63 | return video 64 | 65 | def _transform_video_ILAF(self, video, mode='forward'): 66 | r''' 67 | Transform the video into [0, 1] 68 | ''' 69 | dtype = video.dtype 70 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 71 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 72 | if mode == 'forward': 73 | # [-mean/std, mean/std] 74 | video.sub_(mean[None, :, None, None, None]).div_(std[None, :, None, None, None]) 75 | elif mode == 'back': 76 | # [0, 1] 77 | video.mul_(std[None, :, None, None, None]).add_(mean[None, :, None, None, None]) 78 | return video 79 | 80 | def __call__(self, *input, **kwargs): 81 | images = self.forward(*input, **kwargs) 82 | return images 83 | 84 | def get_model(model_name): 85 | ''' 86 | ['alexnet', 'vgg', 'resnet', 'densenet', 'squeezenet'] 87 | ''' 88 | if model_name == 'alexnet': 89 | model = models.alexnet(pretrained=True) 90 | # model.features[11/7/4/1] 91 | elif model_name == 'vgg': 92 | model = models.vgg16(pretrained=True) 93 | # model.features[29/20/11/1] 94 | elif model_name == 'resnet': 95 | model = models.resnet101(pretrained=True) 96 | elif model_name == 'densenet': 97 | model = models.densenet161(pretrained=True) 98 | # model.features.denseblock1/2/3/4 99 | # model.features.transition1/2/3,norm5 100 | elif model_name == 'squeezenet': 101 | model = models.squeezenet1_1(pretrained=True) 102 | # model.features[12/9/6/3].expand3x3_activation 103 | model.cuda() 104 | model.eval() 105 | # for m in model.modules(): 106 | # if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 107 | # m.eval() 108 | return model 109 | 110 | def get_models(model_name_lists): 111 | models = [] 112 | for model_name in model_name_lists: 113 | model = get_model(model_name) 114 | models.append(model) 115 | return models 116 | 117 | def get_GradCam(model_name_lists): 118 | gradcams = [] 119 | for model_name in model_name_lists: 120 | model_dict = dict(type=model_name, 
arch=get_model(model_name), input_size=(224, 224)) 121 | this_gradcam = GradCAM(model_dict, False) 122 | gradcams.append(this_gradcam) 123 | return gradcams 124 | 125 | # ***************************************************************** 126 | # paper: Enhancing Cross-Task Black-Box Transferability of 127 | # Adversarial Examples with Dispersion Reduction 128 | # ***************************************************************** 129 | class ImageGuidedStd_Adam(Attack): 130 | ''' 131 | Dispersion Reduction (DR) attack. 132 | paper: Enhancing crosstask black-box transferability of adversarial examples with dispersion reduction 133 | parameters: 134 | depth: {1,2,3,4} 135 | ''' 136 | def __init__(self, model_name_lists, depth, step_size, epsilon=16/255, steps=10): 137 | super(ImageGuidedStd_Adam, self).__init__("ImageGuidedStd_Adam") 138 | self.epsilon = epsilon 139 | self.steps = steps 140 | self.step_size = step_size 141 | self.loss_info = {} 142 | self.depth = depth 143 | self.model = get_models(model_name_lists)[0] 144 | self.model_name = model_name_lists[0] 145 | 146 | self.model.train() 147 | for m in self.model.modules(): 148 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 149 | m.eval() 150 | 151 | self._attention_hook() 152 | 153 | def _find_target_layer(self): 154 | if self.model_name == 'resnet': 155 | return getattr(self.model, 'layer{}'.format(self.depth))[-1] 156 | elif self.model_name == 'alexnet': 157 | depth_to_layer = {1:1,2:4,3:7,4:11} 158 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 159 | elif self.model_name == 'vgg': 160 | depth_to_layer = {1:1,2:11,3:20,4:29} 161 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 162 | elif self.model_name == 'squeezenet': 163 | depth_to_layer = {1:3,2:6,3:9,4:12} 164 | return getattr(self.model, 'features')[depth_to_layer[self.depth]].expand3x3_activation 165 | 166 | def _attention_hook(self): 167 | self.gradients = dict() 168 | self.gradients['value'] = [] 169 | self.activations = dict() 170 | self.activations['value'] = [] 171 | def backward_hook(module, grad_input, grad_output): 172 | self.gradients['value'] += [grad_output[0]] 173 | return None 174 | def forward_hook(module, input, output): 175 | self.activations['value'] += [output] 176 | return None 177 | target_layer = self._find_target_layer() 178 | print (target_layer) 179 | if isinstance(target_layer, list): 180 | for i in target_layer: 181 | i.register_forward_hook(forward_hook) 182 | i.register_backward_hook(backward_hook) 183 | else: 184 | target_layer.register_forward_hook(forward_hook) 185 | target_layer.register_backward_hook(backward_hook) 186 | 187 | def forward(self, videos, labels, video_names): 188 | batch_size = videos.shape[0] 189 | b,c,f,h,w = videos.shape 190 | videos = videos.cuda() 191 | labels = labels.cuda() 192 | 193 | image_inps = videos.permute([0,2,1,3,4]) 194 | image_inps = image_inps.reshape(b*f, c, h, w) 195 | 196 | # define modifer that updated by optimizer. 
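# In the step loop below, `modifier` is a per-frame perturbation of shape
# (b*f, c, h, w), initialized to the small constant 0.01/255 and updated with
# Adam so that the standard deviation (dispersion) of every hooked
# intermediate activation is minimized:
#     cost = sum_l std( f_l( clip(x + clamp(modifier, -eps, eps), 0, 1) ) )
# The clamps to [-epsilon, epsilon] and to the valid [0, 1] pixel range are
# applied inside the loop, and the perturbed frames are re-normalized with
# _transform_video(mode='forward') before each forward pass.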
197 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 198 | modifier = torch.nn.Parameter(modif, requires_grad=True) 199 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 200 | 201 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 202 | 203 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 204 | 205 | for i in range(self.steps): 206 | self.gradients = dict() 207 | self.gradients['value'] = [] 208 | self.activations = dict() 209 | self.activations['value'] = [] 210 | 211 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 212 | true_image = self._transform_video(true_image, mode='forward') # norm 213 | 214 | _ = self.model(true_image) 215 | 216 | std_losses = [] 217 | for mm in range(len(self.activations['value'])): 218 | activations = self.activations['value'][mm].std() 219 | std_losses.append(activations) 220 | cost = torch.sum(torch.stack(std_losses)) 221 | optimizer.zero_grad() 222 | cost.backward() 223 | optimizer.step() 224 | 225 | for ind,vid_name in enumerate(video_names): 226 | if vid_name not in self.loss_info.keys(): 227 | self.loss_info[vid_name] = {} 228 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 229 | 230 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 231 | image_inps = self._transform_video(true_image, mode='forward') 232 | image_inps = image_inps.reshape(b,f,c,h,w) 233 | image_inps = image_inps.permute([0,2,1,3,4]) 234 | return image_inps 235 | 236 | class ImageGuidedFMDirection_Adam(Attack): 237 | ''' 238 | The proposed Image to Video (I2V) attack. 239 | parameters: 240 | depth: {1,2,3,4} 241 | model_name_lists: [a model name] 242 | ''' 243 | def __init__(self, model_name_lists, depth, step_size, epsilon=16/255, steps=10): 244 | super(ImageGuidedFMDirection_Adam, self).__init__("ImageGuidedFMDirection_Adam") 245 | self.epsilon = epsilon 246 | self.steps = steps 247 | self.step_size = step_size 248 | self.loss_info = {} 249 | self.depth = depth 250 | self.model = get_models(model_name_lists)[0] 251 | self.model_name = model_name_lists[0] 252 | 253 | self.model.train() 254 | for m in self.model.modules(): 255 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 256 | m.eval() 257 | 258 | self._attention_hook() 259 | 260 | def _find_target_layer(self): 261 | if self.model_name == 'resnet': 262 | return getattr(self.model, 'layer{}'.format(self.depth))[-1] 263 | elif self.model_name == 'alexnet': 264 | depth_to_layer = {1:1,2:4,3:7,4:11} 265 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 266 | elif self.model_name == 'vgg': 267 | depth_to_layer = {1:1,2:11,3:20,4:29} 268 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 269 | elif self.model_name == 'squeezenet': 270 | depth_to_layer = {1:3,2:6,3:9,4:12} 271 | return getattr(self.model, 'features')[depth_to_layer[self.depth]].expand3x3_activation 272 | 273 | def _attention_hook(self): 274 | self.gradients = dict() 275 | self.gradients['value'] = [] 276 | self.activations = dict() 277 | self.activations['value'] = [] 278 | def backward_hook(module, grad_input, grad_output): 279 | self.gradients['value'] += [grad_output[0]] 280 | return None 281 | def forward_hook(module, input, output): 282 | self.activations['value'] += [output] 283 | return None 284 | target_layer = self._find_target_layer() 285 | print (target_layer) 286 | if 
isinstance(target_layer, list): 287 | for i in target_layer: 288 | i.register_forward_hook(forward_hook) 289 | i.register_backward_hook(backward_hook) 290 | else: 291 | target_layer.register_forward_hook(forward_hook) 292 | target_layer.register_backward_hook(backward_hook) 293 | 294 | def forward(self, videos, labels, video_names): 295 | batch_size = videos.shape[0] 296 | b,c,f,h,w = videos.shape 297 | videos = videos.cuda() 298 | labels = labels.cuda() 299 | 300 | image_inps = videos.permute([0,2,1,3,4]) 301 | image_inps = image_inps.reshape(b*f, c, h, w) 302 | 303 | # define modifer that updated by optimizer. 304 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 305 | modifier = torch.nn.Parameter(modif, requires_grad=True) 306 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 307 | 308 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 309 | 310 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 311 | 312 | # initial feature map 313 | self.gradients = dict() 314 | self.gradients['value'] = [] 315 | self.activations = dict() 316 | self.activations['value'] = [] 317 | 318 | _ = self.model(image_inps) 319 | init_feature_maps = [] 320 | for mm in range(len(self.activations['value'])): 321 | activations = self.activations['value'][mm] 322 | activations = Variable(activations, requires_grad=False) 323 | init_feature_maps.append(activations) 324 | 325 | for i in range(self.steps): 326 | self.gradients = dict() 327 | self.gradients['value'] = [] 328 | self.activations = dict() 329 | self.activations['value'] = [] 330 | 331 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 332 | true_image = self._transform_video(true_image, mode='forward') # norm 333 | 334 | _ = self.model(true_image) 335 | 336 | losses = [] 337 | for mm in range(len(init_feature_maps)): 338 | activations = self.activations['value'][mm] 339 | init_activations = init_feature_maps[mm] 340 | 341 | this_dir = activations.view(b*f, -1) 342 | init_dir = init_activations.view(b*f, -1) 343 | this_loss = F.cosine_similarity(this_dir, init_dir) 344 | flag = 1 # decrease this_loss 345 | 346 | losses.append(this_loss) 347 | cost = flag * torch.sum(torch.stack(losses)) 348 | 349 | print (cost) 350 | 351 | optimizer.zero_grad() 352 | cost.backward() 353 | optimizer.step() 354 | 355 | for ind,vid_name in enumerate(video_names): 356 | if vid_name not in self.loss_info.keys(): 357 | self.loss_info[vid_name] = {} 358 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 359 | 360 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 361 | image_inps = self._transform_video(true_image, mode='forward') 362 | image_inps = image_inps.reshape(b,f,c,h,w) 363 | image_inps = image_inps.permute([0,2,1,3,4]) 364 | return image_inps 365 | 366 | class ImageGuidedFML2_Adam_MultiModels(Attack): 367 | ''' 368 | The proposed ensemble Image to Video (ENS-I2V) attack. 
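Clean-frame features from every hooked layer of each image model are cached once in forward(); the per-frame perturbation is then optimized with Adam to minimize the cosine similarity between the perturbed and clean features, summed over all models in the ensemble.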
369 | parameters: 370 | depth: {1,2,3,4} 371 | ''' 372 | def __init__(self, model_name_lists, depths, epsilon=16/255, steps=60): 373 | super(ImageGuidedFML2_Adam_MultiModels, self).__init__("ImageGuidedFML2_Adam_MultiModels") 374 | self.epsilon = epsilon 375 | self.steps = steps 376 | self.step_size = 0.005 377 | self.loss_info = {} 378 | self.depths = depths 379 | self.models = get_models(model_name_lists) 380 | self.model_names = model_name_lists 381 | print (model_name_lists) 382 | for i in range(len(self.models)): 383 | self.models[i].train() 384 | for m in self.models[i].modules(): 385 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 386 | m.eval() 387 | model_name = self.model_names[i] 388 | 389 | self._attention_hook(self.models[i], model_name) 390 | 391 | def _find_target_layer(self, model, model_name): 392 | used_depth = self.depths[model_name] 393 | if model_name == 'resnet': 394 | return getattr(model, 'layer{}'.format(used_depth))[-1] 395 | elif model_name == 'alexnet': 396 | depth_to_layer = {1:1,2:4,3:7,4:11} 397 | return getattr(model, 'features')[depth_to_layer[used_depth]] 398 | elif model_name == 'vgg': 399 | depth_to_layer = {1:1,2:11,3:20,4:29} 400 | return getattr(model, 'features')[depth_to_layer[used_depth]] 401 | elif model_name == 'squeezenet': 402 | depth_to_layer = {1:3,2:6,3:9,4:12} 403 | return getattr(model, 'features')[depth_to_layer[used_depth]].expand3x3_activation 404 | 405 | def _attention_hook(self, model, model_name): 406 | self.gradients = dict() 407 | self.gradients['value'] = [] 408 | self.activations = dict() 409 | self.activations['value'] = [] 410 | def backward_hook(module, grad_input, grad_output): 411 | self.gradients['value'] += [grad_output[0]] 412 | return None 413 | def forward_hook(module, input, output): 414 | self.activations['value'] += [output] 415 | return None 416 | target_layer = self._find_target_layer(model, model_name) 417 | print (target_layer) 418 | if isinstance(target_layer, list): 419 | for i in target_layer: 420 | i.register_forward_hook(forward_hook) 421 | i.register_backward_hook(backward_hook) 422 | else: 423 | target_layer.register_forward_hook(forward_hook) 424 | target_layer.register_backward_hook(backward_hook) 425 | 426 | def forward(self, videos, labels, video_names): 427 | batch_size = videos.shape[0] 428 | b,c,f,h,w = videos.shape 429 | videos = videos.cuda() 430 | labels = labels.cuda() 431 | 432 | image_inps = videos.permute([0,2,1,3,4]) 433 | image_inps = image_inps.reshape(b*f, c, h, w) 434 | 435 | # define modifer that updated by optimizer. 
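# The modifier below again holds one perturbation slice per frame
# (shape b*f, c, h, w), starts from 0.01/255 and is optimized with Adam at
# lr=self.step_size (0.005 for this ensemble attack). Inside the step loop,
# every model in self.models runs a forward pass on the perturbed frames, so
# the hooked activations (and therefore the cosine-similarity cost) cover the
# whole ensemble before each optimizer.step().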
436 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 437 | modifier = torch.nn.Parameter(modif, requires_grad=True) 438 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 439 | 440 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 441 | 442 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 443 | 444 | # initial feature map 445 | self.gradients = dict() 446 | self.gradients['value'] = [] 447 | self.activations = dict() 448 | self.activations['value'] = [] 449 | 450 | for n in range(len(self.models)): 451 | _ = self.models[n](image_inps) 452 | # _ = self.model(image_inps) 453 | init_feature_maps = [] 454 | for mm in range(len(self.activations['value'])): 455 | activations = self.activations['value'][mm] 456 | activations = Variable(activations, requires_grad=False) 457 | init_feature_maps.append(activations) 458 | 459 | for i in range(self.steps): 460 | self.gradients = dict() 461 | self.gradients['value'] = [] 462 | self.activations = dict() 463 | self.activations['value'] = [] 464 | 465 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 466 | true_image = self._transform_video(true_image, mode='forward') # norm 467 | 468 | # _ = self.model(true_image) 469 | for n in range(len(self.models)): 470 | _ = self.models[n](true_image) 471 | losses = [] 472 | for mm in range(len(init_feature_maps)): 473 | activations = self.activations['value'][mm] 474 | init_activations = init_feature_maps[mm] 475 | this_dir = activations.view(b*f, -1) 476 | init_dir = init_activations.view(b*f, -1) 477 | this_loss = F.cosine_similarity(this_dir, init_dir) 478 | flag = 1 # decrease this_loss 479 | losses.append(this_loss) 480 | cost = flag * torch.sum(torch.stack(losses)) 481 | 482 | print (cost) 483 | optimizer.zero_grad() 484 | cost.backward() 485 | optimizer.step() 486 | 487 | for ind,vid_name in enumerate(video_names): 488 | if vid_name not in self.loss_info.keys(): 489 | self.loss_info[vid_name] = {} 490 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 491 | 492 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 493 | image_inps = self._transform_video(true_image, mode='forward') 494 | image_inps = image_inps.reshape(b,f,c,h,w) 495 | image_inps = image_inps.permute([0,2,1,3,4]) 496 | return image_inps 497 | 498 | class ILAF(Attack): 499 | ''' 500 | ILAF. Paper: Enhancing adversarial example transferability with an intermediate level attack. 
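Starting from an existing adversarial video, ILAF treats the intermediate-feature difference between that video and the clean one as a guide direction, then takes signed-gradient steps on the perturbation that enlarge the norm of the current feature difference while keeping it aligned with the guide; the maximized objective is 0.5 * magnitude gain + angular alignment, summed over the hooked layers.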
501 | ''' 502 | def __init__(self, model, model_type, step_size=0.005, epsilon=16/255, steps=60): 503 | super(ILAF, self).__init__("ILAF") 504 | self.epsilon = epsilon 505 | self.steps = steps 506 | self.step_size = step_size 507 | self.loss_info = {} 508 | self.model_type = model_type 509 | self.model = model 510 | 511 | self._activation_hook() 512 | 513 | def _find_target_layer(self): 514 | if 'i3d' in self.model_type: 515 | return self.model.res_layers._modules['1'] 516 | elif 'slowfast' in self.model_type: 517 | return [self.model._modules['slow_res2'], self.model._modules['fast_res2']] #[b,2048, 8, 7, 7], [b, 256, 32, 7, 7] 518 | elif 'tpn' in self.model_type: 519 | return self.model.layer2 520 | 521 | def _activation_hook(self): 522 | self.activations = dict() 523 | self.activations['value'] = [] 524 | def forward_hook(module, input, output): 525 | self.activations['value'] += [output] 526 | return None 527 | target_layer = self._find_target_layer() 528 | if isinstance(target_layer, list): 529 | for i in target_layer: 530 | i.register_forward_hook(forward_hook) 531 | else: 532 | target_layer.register_forward_hook(forward_hook) 533 | 534 | def forward(self, videos, ori_videos, labels, video_names): 535 | batch_size = videos.shape[0] 536 | b,c,f,h,w = videos.shape 537 | videos = videos.cuda() 538 | labels = labels.cuda() 539 | ori_videos = ori_videos.cuda() 540 | 541 | # ori feature map 542 | ori_feature_maps = [] 543 | self.activations = dict() 544 | self.activations['value'] = [] 545 | with torch.no_grad(): 546 | _ = self.model(ori_videos) 547 | for mm in range(len(self.activations['value'])): 548 | activations = self.activations['value'][mm] 549 | ori_feature_maps.append(activations) 550 | 551 | # existed adv feature map 552 | adv_feature_maps = [] 553 | self.activations = dict() 554 | self.activations['value'] = [] 555 | with torch.no_grad(): 556 | _ = self.model(videos) 557 | for mm in range(len(self.activations['value'])): 558 | activations = self.activations['value'][mm] 559 | adv_feature_maps.append(activations) 560 | 561 | init_directions = [] # normalized direction 562 | init_norms = [] # norm values 563 | for ori_di, adv_di in zip(ori_feature_maps, adv_feature_maps): 564 | init_direction = adv_di - ori_di 565 | norm = torch.norm(init_direction, p=2) 566 | init_norms.append(norm) 567 | init_directions.append(init_direction/torch.norm(init_direction,p=2,keepdim=True)) 568 | 569 | 570 | adv_unnorm_videos = self._transform_video_ILAF(videos.clone().detach(), mode='back') # [0, 1] 571 | ori_unnorm_videos = self._transform_video_ILAF(ori_videos.clone().detach(), mode='back') # [0, 1] 572 | 573 | existed_perturbations = adv_unnorm_videos - ori_unnorm_videos 574 | modifier = torch.Tensor(existed_perturbations.cpu()).cuda() 575 | ori_unnorm_videos = Variable(ori_unnorm_videos, requires_grad=False) 576 | 577 | del adv_feature_maps, adv_unnorm_videos, videos, ori_videos 578 | torch.cuda.empty_cache() 579 | for i in range(self.steps): 580 | modifier.requires_grad = True 581 | self.activations = dict() 582 | self.activations['value'] = [] 583 | 584 | true_image = torch.clamp(ori_unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 585 | 586 | 587 | true_image = self._transform_video_ILAF(true_image, mode='forward') # norm 588 | 589 | step_feature_maps = [] 590 | opt = self.model(true_image) 591 | 592 | for mm in range(len(self.activations['value'])): 593 | activations = self.activations['value'][mm] 594 | # activations = Variable(activations, 
requires_grad=False) 595 | step_feature_maps.append(activations) 596 | 597 | step_directions = [] # normalized direction 598 | step_norms = [] # norm values 599 | for ori_di, adv_di in zip(ori_feature_maps, step_feature_maps): 600 | step_direction = adv_di - ori_di 601 | step_norm = torch.norm(step_direction, p=2) 602 | step_norms.append(step_norm) 603 | step_directions.append(step_direction/torch.norm(step_direction,p=2,keepdim=True)) 604 | 605 | losses = [] 606 | for lens_fm in range(len(step_directions)): 607 | # magnitude 608 | magnitude_gain = step_norms[lens_fm] / init_norms[lens_fm] 609 | # angle 610 | angle_loss = torch.mm(init_directions[lens_fm].view(1,-1), step_directions[lens_fm].view(1,-1).transpose(1,0)) 611 | this_loss = -(0.5 * magnitude_gain + angle_loss) 612 | losses.append(this_loss) 613 | cost = torch.sum(torch.stack(losses)) 614 | 615 | grad = torch.autograd.grad(cost, modifier, 616 | retain_graph=False, create_graph=False)[0] 617 | modifier.data -= self.step_size * grad.sign() 618 | 619 | 620 | for ind,vid_name in enumerate(video_names): 621 | if vid_name not in self.loss_info.keys(): 622 | self.loss_info[vid_name] = {} 623 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 624 | 625 | true_image = torch.clamp(ori_unnorm_videos + torch.clamp(modifier.data, min=-self.epsilon, max=self.epsilon), min=0, max=1) 626 | image_inps = self._transform_video_ILAF(true_image, mode='forward') 627 | image_inps = image_inps.reshape(b,f,c,h,w) 628 | image_inps = image_inps.permute([0,2,1,3,4]) 629 | return image_inps 630 | -------------------------------------------------------------------------------- /base_attacks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import scipy.stats as st 4 | import numpy as np 5 | import torchvision 6 | from PIL import Image 7 | import random 8 | 9 | from utils import norm_grads 10 | # refer to https://github.com/Harry24k/adversarial-attacks-pytorch 11 | 12 | class Attack(object): 13 | """ 14 | Base class for all attacks. 15 | .. note:: 16 | It automatically set device to the device where given model is. 17 | It temporarily changes the model's training mode to `test` 18 | by `.eval()` only during an attack process. 19 | """ 20 | def __init__(self, name, model): 21 | r""" 22 | Initializes internal attack state. 23 | Arguments: 24 | name (str) : name of an attack. 25 | model (torch.nn.Module): model to attack. 26 | """ 27 | self.attack = name 28 | self.model = model 29 | self.model_name = str(model).split("(")[0] 30 | 31 | self.training = model.training 32 | self.device = next(model.parameters()).device 33 | 34 | self._targeted = 1 35 | self._attack_mode = 'default' 36 | self._return_type = 'float' 37 | self._target_map_function = lambda images, labels:labels 38 | 39 | self.mean = [0.485, 0.456, 0.406] 40 | self.std = [0.229, 0.224, 0.225] 41 | 42 | def forward(self, *input): 43 | r""" 44 | It defines the computation performed at every call (attack forward). 45 | Should be overridden by all subclasses. 46 | """ 47 | raise NotImplementedError 48 | 49 | def set_attack_mode(self, mode, target_map_function=None): 50 | r""" 51 | Set the attack mode. 52 | 53 | Arguments: 54 | mode (str) : 'default' (DEFAULT) 55 | 'targeted' - Use input labels as targeted labels. 56 | 'least_likely' - Use least likely labels as targeted labels. 
57 | 58 | target_map_function (function) : 59 | """ 60 | if self._attack_mode == 'only_default': 61 | raise ValueError("Changing attack mode is not supported in this attack method.") 62 | 63 | if (mode == 'targeted') and (target_map_function is None): 64 | raise ValueError("Please give a target_map_function, e.g., lambda images, labels:(labels+1)%10.") 65 | 66 | if mode=="default": 67 | self._attack_mode = "default" 68 | self._targeted = 1 69 | self._transform_label = self._get_label 70 | elif mode=="targeted": 71 | self._attack_mode = "targeted" 72 | self._targeted = -1 73 | self._target_map_function = target_map_function 74 | self._transform_label = self._get_target_label 75 | elif mode=="least_likely": 76 | self._attack_mode = "least_likely" 77 | self._targeted = -1 78 | self._transform_label = self._get_least_likely_label 79 | else: 80 | raise ValueError(mode + " is not a valid mode. [Options : default, targeted, least_likely]") 81 | 82 | def set_return_type(self, type): 83 | r""" 84 | Set the return type of adversarial images: `int` or `float`. 85 | Arguments: 86 | type (str) : 'float' or 'int'. (DEFAULT : 'float') 87 | """ 88 | if type == 'float': 89 | self._return_type = 'float' 90 | elif type == 'int': 91 | self._return_type = 'int' 92 | else: 93 | raise ValueError(type + " is not a valid type. [Options : float, int]") 94 | 95 | def save(self, save_path, data_loader, verbose=True): 96 | r""" 97 | Save adversarial images as torch.tensor from given torch.utils.data.DataLoader. 98 | Arguments: 99 | save_path (str) : path to save the adversarial examples. 100 | data_loader (torch.utils.data.DataLoader) : data loader. 101 | verbose (bool) : True for displaying detailed information. (DEFAULT : True) 102 | """ 103 | self.model.eval() 104 | 105 | image_list = [] 106 | label_list = [] 107 | 108 | correct = 0 109 | total = 0 110 | 111 | total_batch = len(data_loader) 112 | 113 | for step, (images, labels) in enumerate(data_loader): 114 | adv_images = self.__call__(images, labels) 115 | 116 | image_list.append(adv_images.cpu()) 117 | label_list.append(labels.cpu()) 118 | 119 | if self._return_type == 'int': 120 | adv_images = adv_images.float()/255 121 | 122 | if verbose: 123 | outputs = self.model(adv_images) 124 | _, predicted = torch.max(outputs.data, 1) 125 | total += labels.size(0) 126 | correct += (predicted == labels.to(self.device)).sum() 127 | 128 | acc = 100 * float(correct) / total 129 | print('- Save Progress : %2.2f %% / Accuracy : %2.2f %%' % ((step+1)/total_batch*100, acc), end='\r') 130 | 131 | x = torch.cat(image_list, 0) 132 | y = torch.cat(label_list, 0) 133 | torch.save((x, y), save_path) 134 | print('\n- Save Complete!') 135 | 136 | self._switch_model() 137 | 138 | def _transform_perts(self, perts): 139 | dtype = perts.dtype 140 | mean = torch.as_tensor(self.mean, dtype=dtype, device=self.device) 141 | std = torch.as_tensor(self.std, dtype=dtype, device=self.device) 142 | perts.div_(std[:, None, None, None]) 143 | return perts 144 | 145 | def _transform_video(self, video, mode='forward'): 146 | r''' 147 | Normalize the video with ImageNet mean/std (mode='forward') or map it back to [0, 1] (mode='back') 148 | ''' 149 | dtype = video.dtype 150 | mean = torch.as_tensor(self.mean, dtype=dtype, device=self.device) 151 | std = torch.as_tensor(self.std, dtype=dtype, device=self.device) 152 | if mode == 'forward': 153 | # [-mean/std, mean/std] 154 | video.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) 155 | elif mode == 'back': 156 | # [0, 1] 157 | video.mul_(std[:, None, None, None]).add_(mean[:, None, None, None]) 158 | return video 159 | 160 | def
_transform_label(self, images, labels): 161 | r""" 162 | Function for changing the attack mode. 163 | """ 164 | return labels 165 | 166 | def _get_label(self, images, labels): 167 | r""" 168 | Function for changing the attack mode. 169 | Return input labels. 170 | """ 171 | return labels 172 | 173 | def _get_target_label(self, images, labels): 174 | r""" 175 | Function for changing the attack mode. 176 | Return input labels. 177 | """ 178 | return self._target_map_function(images, labels) 179 | 180 | def _get_least_likely_label(self, images, labels): 181 | r""" 182 | Function for changing the attack mode. 183 | Return least likely labels. 184 | """ 185 | outputs = self.model(images) 186 | _, labels = torch.min(outputs.data, 1) 187 | labels = labels.detach_() 188 | return labels 189 | 190 | def _to_uint(self, images): 191 | r""" 192 | Function for changing the return type. 193 | Return images as int. 194 | """ 195 | return (images*255).type(torch.uint8) 196 | 197 | def _switch_model(self): 198 | r""" 199 | Function for changing the training mode of the model. 200 | """ 201 | if self.training: 202 | self.model.train() 203 | else: 204 | self.model.eval() 205 | 206 | def __str__(self): 207 | info = self.__dict__.copy() 208 | 209 | del_keys = ['model', 'attack'] 210 | 211 | for key in info.keys(): 212 | if key[0] == "_" : 213 | del_keys.append(key) 214 | 215 | for key in del_keys: 216 | del info[key] 217 | 218 | info['attack_mode'] = self._attack_mode 219 | if info['attack_mode'] == 'only_default' : 220 | info['attack_mode'] = 'default' 221 | 222 | info['return_type'] = self._return_type 223 | 224 | return self.attack + "(" + ', '.join('{}={}'.format(key, val) for key, val in info.items()) + ")" 225 | 226 | def __call__(self, *input, **kwargs): 227 | self.model.eval() 228 | images = self.forward(*input, **kwargs) 229 | self._switch_model() 230 | 231 | if self._return_type == 'int': 232 | images = self._to_uint(images) 233 | 234 | return images 235 | 236 | class FGSM(Attack): 237 | '''Fast Gradient Sign Method''' 238 | def __init__(self, model, steps=None, epsilon=16/255): 239 | super(FGSM, self).__init__("FGSM", model) 240 | self.epsilon = epsilon 241 | 242 | def forward(self, videos, labels): 243 | videos = videos.to(self.device) 244 | labels = labels.to(self.device) 245 | loss = nn.CrossEntropyLoss() 246 | 247 | videos.requires_grad = True 248 | outputs = self.model(videos) 249 | cost = self._targeted*loss(outputs, labels).to(self.device) 250 | 251 | grad = torch.autograd.grad(cost, videos, 252 | retain_graph=False, create_graph=False)[0] 253 | 254 | adv_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 255 | adv_videos = adv_videos + self.epsilon*grad.sign() 256 | adv_videos = torch.clamp(adv_videos, min=0, max=1).detach() 257 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 258 | 259 | return adv_videos 260 | 261 | class BIM(Attack): 262 | ''' 263 | Basic Iterative Method 264 | Only iterative version. 265 | ''' 266 | def __init__(self, model, epsilon=16/255, steps=10): 267 | super(BIM, self).__init__("FGSM", model) 268 | self.epsilon = epsilon 269 | self.steps = steps 270 | self.step_size = self.epsilon / self.steps 271 | 272 | def forward(self, videos, labels): 273 | r""" 274 | Overridden. 
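        A minimal usage sketch (hypothetical tensors; assumes `model` is a video classifier
        that takes normalized [N, C, T, H, W] clips, as elsewhere in this repo):

            attack = BIM(model, epsilon=16/255, steps=10)   # step_size = epsilon / steps
            adv_videos = attack(videos, labels)             # __call__ runs forward() with the model in eval()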
275 | """ 276 | videos = videos.to(self.device) 277 | labels = labels.to(self.device) 278 | loss = nn.CrossEntropyLoss() 279 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 280 | adv_videos = videos.clone().detach() 281 | 282 | for i in range(self.steps): 283 | adv_videos.requires_grad = True 284 | outputs = self.model(adv_videos) 285 | cost = self._targeted*loss(outputs, labels).to(self.device) 286 | grad = torch.autograd.grad(cost, adv_videos, 287 | retain_graph=False, create_graph=False)[0] 288 | 289 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 290 | adv_videos = adv_videos + self.step_size*grad.sign() 291 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 292 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 293 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 294 | 295 | return adv_videos 296 | 297 | class MIFGSM(Attack): 298 | ''' 299 | Momentum Iterative Fast Gradient Sign Method 300 | Only iterative version. 301 | ''' 302 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0): 303 | super(MIFGSM, self).__init__("MIFGSM", model) 304 | self.epsilon = epsilon 305 | self.steps = steps 306 | self.step_size = self.epsilon / self.steps 307 | self.decay = decay 308 | 309 | def forward(self, videos, labels): 310 | r""" 311 | Overridden. 312 | """ 313 | videos = videos.to(self.device) 314 | labels = labels.to(self.device) 315 | loss = nn.CrossEntropyLoss() 316 | momentum = torch.zeros_like(videos).to(self.device) 317 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 318 | adv_videos = videos.clone().detach() 319 | 320 | for i in range(self.steps): 321 | adv_videos.requires_grad = True 322 | outputs = self.model(adv_videos) 323 | 324 | cost = self._targeted*loss(outputs, labels).to(self.device) 325 | grad = torch.autograd.grad(cost, adv_videos, 326 | retain_graph=False, create_graph=False)[0] 327 | # frame-level or clip-level 328 | grad = norm_grads(grad, True) 329 | # grad_norm = torch.norm(grad, p=1) 330 | # grad /= grad_norm 331 | grad += momentum*self.decay 332 | momentum = grad 333 | 334 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 335 | adv_videos = adv_videos + self.step_size*grad.sign() 336 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 337 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 338 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 339 | 340 | return adv_videos 341 | 342 | class DIFGSM(Attack): 343 | ''' 344 | Diverse Inputs Method. 345 | Only iterative version. 346 | Contain momentum or no momentum. 
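        With probability 0.5, _input_diversity (below) resizes every frame to a random
        size rnd drawn from [224, 250), zero-pads it back to 250x250 at a random offset,
        and resizes the result to 224x224 before the forward pass; otherwise the clip is
        used unchanged. A worked example of the padding arithmetic (illustrative numbers):

            rnd = 240                   # drawn from torch.randint(224, 250, ...)
            h_rem = w_rem = 250 - rnd   # 10 pixels of padding to distribute per axis
            pad_top, pad_left = 3, 7    # random splits; pad_bottom = 7, pad_right = 3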
347 | ''' 348 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, momentum=False): 349 | super(DIFGSM, self).__init__("DIFGSM", model) 350 | self.epsilon = epsilon 351 | self.steps = steps 352 | self.step_size = self.epsilon / self.steps 353 | self.decay = decay 354 | self.momentum = momentum 355 | 356 | def _input_diversity(self, videos): 357 | # r = torch.randint(1,10, size=(1,1)).item() 358 | # if r <= 5: 359 | if random.random() < 0.5: 360 | return videos 361 | else: 362 | rnd = torch.randint(224,250, size=(1,1)).item() 363 | rescaled = videos.view((-1, ) + videos.shape[2:]) 364 | rescaled = torch.nn.functional.interpolate(rescaled, size=[rnd, rnd], mode='nearest') 365 | # rescaled = torchvision.transforms.functional.resize(videos,[rnd, rnd], Image.NEAREST) 366 | h_rem = 250 - rnd 367 | w_rem = 250 - rnd 368 | pad_top = torch.randint(0, h_rem, size=(1,1)).item() 369 | pad_bottom = h_rem - pad_top 370 | pad_left = torch.randint(0, w_rem, size=(1,1)).item() 371 | pad_right = w_rem - pad_left 372 | padded = nn.functional.pad(rescaled, [pad_left, pad_right, pad_top, pad_bottom]) 373 | # return torchvision.transforms.functional.resize(padded,[224, 224], Image.NEAREST) 374 | padded = torch.nn.functional.interpolate(padded, size=[224, 224], mode='nearest') 375 | padded = padded.view(videos.shape) 376 | return padded 377 | 378 | def forward(self, videos, labels): 379 | r""" 380 | Overridden. 381 | """ 382 | videos = videos.to(self.device) 383 | labels = labels.to(self.device) 384 | loss = nn.CrossEntropyLoss() 385 | momentum = torch.zeros_like(videos).to(self.device) 386 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 387 | adv_videos = videos.clone().detach() 388 | 389 | for i in range(self.steps): 390 | adv_videos.requires_grad = True 391 | outputs = self.model(self._input_diversity(adv_videos)) 392 | 393 | cost = self._targeted*loss(outputs, labels).to(self.device) 394 | grad = torch.autograd.grad(cost, adv_videos, 395 | retain_graph=False, create_graph=False)[0] 396 | 397 | if self.momentum: 398 | grad_norm = torch.norm(grad, p=1) 399 | grad /= grad_norm 400 | grad += momentum*self.decay 401 | momentum = grad 402 | else: 403 | pass 404 | 405 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 406 | adv_videos = adv_videos + self.step_size*grad.sign() 407 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 408 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 409 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 410 | 411 | return adv_videos 412 | 413 | class TIFGSM(Attack): 414 | '''Translation-Invariant Attack''' 415 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, momentum=False): 416 | super(TIFGSM, self).__init__("MIFGSM", model) 417 | self.epsilon = epsilon 418 | self.steps = steps 419 | self.step_size = self.epsilon / self.steps 420 | self.decay = decay 421 | self.momentum = momentum 422 | # generate start_kernel 423 | kernel = self._initial_kernel(15, 3).astype(np.float32) # (15,15) 424 | stack_kernel = np.stack([kernel, kernel, kernel]) # (3,15,15) 425 | self.stack_kernel = torch.from_numpy(np.expand_dims(stack_kernel, 1)).to(self.device) # 3,1,15,15 426 | 427 | def _initial_kernel(self, kernlen, nsig): 428 | x = np.linspace(-nsig, nsig, kernlen) 429 | kern1d = st.norm.pdf(x) 430 | kernel_raw = np.outer(kern1d, kern1d) 431 | kernel = kernel_raw / kernel_raw.sum() 432 | return kernel 433 | 434 | def 
_conv2d_frame(self, grads): 435 | ''' 436 | grads: N, C, T, H, W 437 | ''' 438 | frames = grads.shape[2] 439 | out_grads = torch.zeros_like(grads) 440 | for i in range(frames): 441 | this_grads = grads[:,:,i] 442 | out_grad = nn.functional.conv2d(this_grads, self.stack_kernel, groups=3, stride=1, padding=7) 443 | out_grads[:,:,i] = out_grad 444 | out_grads = out_grads / torch.mean(torch.abs(out_grads), [1,2,3], True) 445 | return out_grads 446 | 447 | def forward(self, videos, labels): 448 | r""" 449 | Overridden. 450 | """ 451 | videos = videos.to(self.device) 452 | labels = labels.to(self.device) 453 | loss = nn.CrossEntropyLoss() 454 | momentum = torch.zeros_like(videos).to(self.device) 455 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 456 | adv_videos = videos.clone().detach() 457 | 458 | for i in range(self.steps): 459 | adv_videos.requires_grad = True 460 | outputs = self.model(adv_videos) 461 | 462 | cost = self._targeted*loss(outputs, labels).to(self.device) 463 | grad = torch.autograd.grad(cost, adv_videos, 464 | retain_graph=False, create_graph=False)[0] 465 | 466 | grad = self._conv2d_frame(grad) 467 | if self.momentum: 468 | grad += momentum*self.decay 469 | momentum = grad 470 | else: 471 | pass 472 | 473 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 474 | adv_videos = adv_videos + self.step_size*grad.sign() 475 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 476 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 477 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 478 | 479 | return adv_videos 480 | 481 | class SGM(Attack): 482 | '''Skip Gradient Method''' 483 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, gamma=0.5, momentum=False): 484 | super(SGM, self).__init__("SGM", model) 485 | self.epsilon = epsilon 486 | self.steps = steps 487 | self.step_size = self.epsilon / self.steps 488 | self.decay = decay 489 | self.momentum = momentum 490 | self.gamma = gamma 491 | 492 | # register model 493 | self._register_hook_for_model(self.model) 494 | 495 | def _register_hook_for_model(self, model): 496 | def backward_hook(gamma): 497 | # implement SGM through grad through ReLU 498 | def _backward_hook(module, grad_in, grad_out): 499 | if isinstance(module, nn.ReLU): 500 | return (gamma * grad_in[0],) 501 | return _backward_hook 502 | 503 | def backward_hook_norm(module, grad_in, grad_out): 504 | # normalize the gradient to avoid gradient explosion or vanish 505 | std = torch.std(grad_in[0]) 506 | return (grad_in[0] / std,) 507 | 508 | backward_hook_sgm = backward_hook(np.power(self.gamma, 0.5)) 509 | for name, module in model.named_modules(): 510 | if 'relu' in name and not '0.relu' in name: 511 | module.register_backward_hook(backward_hook_sgm) 512 | 513 | # e.g., 1.layer1.1, 1.layer4.2, ... 514 | # if len(name.split('.')) == 3: 515 | # refer to https://github.com/csdongxian/skip-connections-matter/issues/3 516 | # if len(name.split('.')) >= 2 and 'layer' in name.split('.')[-2]: 517 | # module.register_backward_hook(backward_hook_norm) 518 | 519 | def forward(self, videos, labels): 520 | r""" 521 | Overridden. 
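        The gradient used here is shaped by _register_hook_for_model: every module whose
        name contains 'relu' (except '0.relu') gets a backward hook that rescales its
        incoming gradient by gamma ** 0.5, so with the default gamma=0.5 the gradient
        flowing through residual branches is damped and the skip connections dominate.
        A hedged sketch of the same hook on a generic nn.ReLU (illustrative only):

            def scale_relu_grad(module, grad_in, grad_out, scale=0.5 ** 0.5):
                if isinstance(module, nn.ReLU):
                    return (scale * grad_in[0],)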
522 | """ 523 | videos = videos.to(self.device) 524 | labels = labels.to(self.device) 525 | loss = nn.CrossEntropyLoss() 526 | momentum = torch.zeros_like(videos).to(self.device) 527 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 528 | adv_videos = videos.clone().detach() 529 | 530 | for i in range(self.steps): 531 | adv_videos.requires_grad = True 532 | outputs = self.model(adv_videos) 533 | 534 | cost = self._targeted*loss(outputs, labels).to(self.device) 535 | grad = torch.autograd.grad(cost, adv_videos, 536 | retain_graph=False, create_graph=False)[0] 537 | 538 | if self.momentum: 539 | grad_norm = torch.norm(grad, p=1) 540 | grad /= grad_norm 541 | grad += momentum*self.decay 542 | momentum = grad 543 | else: 544 | pass 545 | 546 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 547 | adv_videos = adv_videos + self.step_size*grad.sign() 548 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 549 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 550 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 551 | return adv_videos 552 | 553 | class SIM(Attack): 554 | '''Scale-Invariant Attack Method''' 555 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, sclae_step=5, momentum=False): 556 | super(SIM, self).__init__("SIM", model) 557 | self.epsilon = epsilon 558 | self.steps = steps 559 | self.step_size = self.epsilon / self.steps 560 | self.decay = decay 561 | self.momentum = momentum 562 | self.sclae_step = sclae_step 563 | 564 | def _multi_scale(self, adv_videos, labels, loss): 565 | def obtain_grad(vid, labels): 566 | vid.requires_grad = True 567 | outputs = self.model(vid) 568 | cost = self._targeted*loss(outputs, labels).to(self.device) 569 | grad = torch.autograd.grad(cost, vid, 570 | retain_graph=False, create_graph=False)[0] 571 | return grad 572 | 573 | mean_grad = None 574 | for i in range(self.sclae_step): 575 | tmp_videos = 1 / 2**i * adv_videos 576 | grad = obtain_grad(tmp_videos, labels) 577 | if mean_grad is None: 578 | mean_grad = grad 579 | else: 580 | mean_grad += grad 581 | return mean_grad / self.sclae_step 582 | 583 | def forward(self, videos, labels): 584 | r""" 585 | Overridden. 
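        The gradient at each step comes from _multi_scale, which averages the gradients of
        scaled copies of the current input, adv_videos / 2**i for i = 0, ..., 4 with the
        default scale count of 5, before the usual signed step. Equivalent sketch
        (grad_of_loss is a hypothetical helper, not part of this repo):

            grads = [grad_of_loss(adv_videos / 2**i, labels) for i in range(5)]
            grad = sum(grads) / 5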
586 | """ 587 | videos = videos.to(self.device) 588 | labels = labels.to(self.device) 589 | loss = nn.CrossEntropyLoss() 590 | momentum = torch.zeros_like(videos).to(self.device) 591 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 592 | adv_videos = videos.clone().detach() 593 | 594 | for i in range(self.steps): 595 | grad = self._multi_scale(adv_videos, labels, loss) 596 | 597 | if self.momentum: 598 | grad_norm = torch.norm(grad, p=1) 599 | grad /= grad_norm 600 | grad += momentum*self.decay 601 | momentum = grad 602 | else: 603 | pass 604 | 605 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 606 | adv_videos = adv_videos + self.step_size*grad.sign() 607 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 608 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 609 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 610 | return adv_videos 611 | 612 | class TIFGSM3D(Attack): 613 | '''Translation-Invariant Attack''' 614 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, momentum=False): 615 | super(TIFGSM3D, self).__init__("TIFGSM3D", model) 616 | self.epsilon = epsilon 617 | self.steps = steps 618 | self.step_size = self.epsilon / self.steps 619 | self.decay = decay 620 | self.momentum = momentum 621 | # generate start_kernel 622 | kernel = self._initial_kernel(15, 3).astype(np.float32) # (15,15,15) 623 | stack_kernel = np.stack([kernel, kernel, kernel]) # (3,15,15,15) 624 | self.stack_kernel = torch.from_numpy(np.expand_dims(stack_kernel, 1)).to(self.device) # 3,1,15,15,15 625 | 626 | def _initial_kernel(self, kernlen, nsig): 627 | x = np.linspace(-nsig, nsig, kernlen) 628 | kern1d = st.norm.pdf(x) 629 | kernel_raw = np.outer(kern1d, kern1d) 630 | used_kernel = np.zeros((kernlen, kernlen, kernlen)) 631 | for i in range(kern1d.shape[0]): 632 | used_kernel[i] = kern1d[i] * kernel_raw 633 | used_kernel = used_kernel / used_kernel.sum() 634 | return used_kernel 635 | 636 | def _conv3d_frame(self, grads): 637 | ''' 638 | grads: N, C, T, H, W 639 | ''' 640 | out_grads = nn.functional.conv3d(grads, self.stack_kernel, groups=3, stride=1, padding=7) 641 | # frames = grads.shape[2] 642 | # out_grads = torch.zeros_like(grads) 643 | 644 | # for i in range(frames): 645 | # this_grads = grads[:,:,i] 646 | # out_grad = nn.functional.conv2d(this_grads, self.stack_kernel, groups=3, stride=1, padding=7) 647 | # out_grads[:,:,i] = out_grad 648 | out_grads = norm_grads(out_grads, True) 649 | return out_grads 650 | 651 | def forward(self, videos, labels): 652 | r""" 653 | Overridden. 
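        Before the sign step, the raw gradient is smoothed over (T, H, W) by _conv3d_frame:
        a depthwise conv3d (groups=3, padding=7) with the normalized 15x15x15 Gaussian-like
        kernel built in _initial_kernel, followed by norm_grads. Shape sketch (illustrative
        sizes only):

            grad [N, 3, 32, 224, 224] --conv3d, kernel [3, 1, 15, 15, 15]--> [N, 3, 32, 224, 224]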
654 | """ 655 | videos = videos.to(self.device) 656 | labels = labels.to(self.device) 657 | loss = nn.CrossEntropyLoss() 658 | momentum = torch.zeros_like(videos).to(self.device) 659 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 660 | adv_videos = videos.clone().detach() 661 | 662 | for i in range(self.steps): 663 | adv_videos.requires_grad = True 664 | outputs = self.model(adv_videos) 665 | 666 | cost = self._targeted*loss(outputs, labels).to(self.device) 667 | grad = torch.autograd.grad(cost, adv_videos, 668 | retain_graph=False, create_graph=False)[0] 669 | 670 | grad = self._conv3d_frame(grad) 671 | if self.momentum: 672 | grad += momentum*self.decay 673 | momentum = grad 674 | else: 675 | pass 676 | 677 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 678 | adv_videos = adv_videos + self.step_size*grad.sign() 679 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 680 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 681 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 682 | 683 | return adv_videos 684 | 685 | class TAP(Attack): 686 | '''Transferable Adversarial Perturbations 687 | params = { 688 | 'kernlen': 3, 689 | 'temporal_kernlen':3, 690 | 'eta': 1e3, 691 | 'conv3d': True 692 | } 693 | ''' 694 | def __init__(self, model, params, epsilon=16/255, steps=10): 695 | super(TAP, self).__init__("TAP", model) 696 | self.epsilon = epsilon 697 | self.steps = steps 698 | self.step_size = self.epsilon / self.steps 699 | 700 | for name, value in params.items(): 701 | setattr(self, name, value) 702 | 703 | kernel = self._initial_kernel_uniform(self.kernlen).astype(np.float32) # (3,3) 704 | stack_kernel = np.stack([kernel, kernel, kernel]) # (3,3,3) 705 | self.stack_2d_kernel = torch.from_numpy(np.expand_dims(stack_kernel, 1)).to(self.device) # 3,1,3,3 706 | 707 | kernel_3d = self._initial_kernel_uniform_3d(self.kernlen, self.temporal_kernlen) # [t,h,h] 708 | stack_kernel_3d = np.stack([kernel_3d, kernel_3d, kernel_3d]) # (3,t,h,h) 709 | self.stack_3d_kernel = torch.from_numpy(np.expand_dims(stack_kernel_3d, 1)).to(self.device) # 3,1,t,h,h 710 | 711 | self._activation_hook() 712 | 713 | def _initial_kernel_uniform(self, kernlen): 714 | kern1d = np.ones(kernlen) 715 | kernel_raw = np.outer(kern1d, kern1d) 716 | kernel = kernel_raw / kernel_raw.sum() 717 | return kernel 718 | 719 | def _initial_kernel_uniform_3d(self, kernlen, temporal_kernel): 720 | kern3d = np.ones((temporal_kernel, kernlen, kernlen)) 721 | kern3d = kern3d / kern3d.sum() 722 | return kern3d 723 | 724 | def _conv2d_frames(self, perts): 725 | frames = perts.shape[2] 726 | out_perts = torch.zeros_like(perts) 727 | for i in range(frames): 728 | this_perts = perts[:,:,i] 729 | out_pert = nn.functional.conv2d(this_perts, self.stack_2d_kernel, groups=3, stride=1, padding=[int((self.kernlen-1)/2), int((self.kernlen-1)/2)]) 730 | out_perts[:,:,i] = out_pert 731 | return torch.sum(torch.abs(out_perts)) 732 | 733 | def _conv3d_frames(self, perts): 734 | out_perts = nn.functional.conv3d(perts, self.stack_3d_kernel, groups=3, stride=1, padding=[int((self.temporal_kernlen-1)/2), int((self.kernlen-1)/2), int((self.kernlen-1)/2)]) 735 | return torch.sum(torch.abs(out_perts)) 736 | 737 | def _find_target_layer(self): 738 | if 'i3d' in self.model_type: 739 | return [self.model.res_layers._modules['0'], self.model.res_layers._modules['1']] 740 | elif 'slowfast' in self.model_type: 741 | return 
[self.model._modules['slow_res2'], self.model._modules['slow_res3'], self.model._modules['fast_res2'], self.model._modules['fast_res3']] #[b,2048, 8, 7, 7], [b, 256, 32, 7, 7] 742 | elif 'tpn' in self.model_type: 743 | return [self.model.layer1, self.model.layer2] 744 | 745 | def _activation_hook(self): 746 | self.activations = dict() 747 | self.activations['value'] = [] 748 | def forward_hook(module, input, output): 749 | self.activations['value'] += [output] 750 | return None 751 | target_layer = self._find_target_layer() 752 | if isinstance(target_layer, list): 753 | for i in target_layer: 754 | i.register_forward_hook(forward_hook) 755 | else: 756 | target_layer.register_forward_hook(forward_hook) 757 | 758 | def forward(self, videos, labels): 759 | r""" 760 | Overridden. 761 | """ 762 | batch_size = videos.shape[0] 763 | self.loss_info = {} 764 | self.stack_3d_kernel = self.stack_3d_kernel.type(videos.dtype) 765 | videos = videos.to(self.device) 766 | labels = labels.to(self.device) 767 | 768 | self.activations = dict() 769 | self.activations['value'] = [] 770 | outputs = self.model(videos) 771 | ori_feature_map = self.activations['value'] 772 | 773 | loss = nn.CrossEntropyLoss() 774 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 775 | adv_videos = videos.clone().detach() 776 | 777 | for i in range(self.steps): 778 | self.activations = dict() 779 | self.activations['value'] = [] 780 | adv_videos.requires_grad = True 781 | outputs = self.model(adv_videos) 782 | 783 | # CE loss 784 | cost1 = self._targeted*loss(outputs, labels).to(self.device) 785 | 786 | # l2 distance in the sign-sqrt feature space 787 | # this_feature_map = self._feature_map(adv_videos, True, False, labels) 788 | feat_distance = [] 789 | for adv_fm, ori_fm in zip(self.activations['value'], ori_feature_map): # named to avoid shadowing the step index i 790 | this_distance = torch.norm((torch.sign(adv_fm) * torch.sqrt(torch.abs(adv_fm))).reshape(batch_size, -1) - (torch.sign(ori_fm) * torch.sqrt(torch.abs(ori_fm))).reshape(batch_size, -1), p=2, dim=1) 791 | feat_distance.append(this_distance) 792 | cost2 = torch.sum(torch.stack(feat_distance), 0) 793 | 794 | # smoothness regularization on the perturbation 795 | perts = self._transform_perts(adv_videos - videos).to(self.device) 796 | if self.conv3d: 797 | reg_cost = self._conv3d_frames(perts) 798 | else: 799 | reg_cost = self._conv2d_frames(perts) 800 | 801 | cost = cost1 + 1e3 * reg_cost + 0.05 * cost2.sum() # sum over the batch so the cost stays a scalar for autograd.grad 802 | 803 | grad = torch.autograd.grad(cost, adv_videos, 804 | retain_graph=False, create_graph=False)[0] 805 | 806 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 807 | adv_videos = adv_videos + self.step_size*grad.sign() 808 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 809 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 810 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 811 | self.loss_info[i] = {'ce loss': cost1.detach().cpu().numpy(), 812 | 'reg_cost': reg_cost.detach().cpu().numpy(), 813 | 'distance': cost2.detach().cpu().numpy()} 814 | return adv_videos 815 | 816 | --------------------------------------------------------------------------------
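A minimal end-to-end sketch of how the attacks in base_attacks.py can be driven. It is not part of the repository: the torchvision R3D-18 model and the random clip below merely stand in for the gluoncv Kinetics-400 models and the dataloaders that utils.py and the attack scripts normally provide, and running it requires the repo's utils.py (and hence gluoncv) to be importable.

import torch
import torchvision

from base_attacks import MIFGSM

# Any video classifier mapping [N, C, T, H, W] clips to logits works here.
model = torchvision.models.video.r3d_18().cuda().eval()

# Random clip in [0, 1], then normalized with the ImageNet statistics the Attack base class assumes.
videos = torch.rand(1, 3, 16, 112, 112, device='cuda')
mean = torch.tensor([0.485, 0.456, 0.406], device='cuda')[None, :, None, None, None]
std = torch.tensor([0.229, 0.224, 0.225], device='cuda')[None, :, None, None, None]
videos = (videos - mean) / std
labels = torch.tensor([0], device='cuda')

attack = MIFGSM(model, epsilon=16/255, steps=10, decay=1.0)
adv_videos = attack(videos, labels)   # __call__ puts the model in eval() and runs forward()
print(adv_videos.shape)               # same shape as the input, still normalized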
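For reference, the feature-space objective optimized by ILAF in video_attacks.py above can be restated as a standalone function: for every hooked feature map it rewards enlarging the feature shift relative to the initial adversarial example (magnitude gain) while keeping that shift aligned with the initial perturbation direction (cosine term). The function below is a hedged restatement for illustration; its name and signature are not from the repo.

import torch

def ilaf_loss(ori_feats, init_adv_feats, step_feats, magnitude_weight=0.5):
    """All arguments are lists of feature tensors taken from the same hooked layers."""
    losses = []
    for f_ori, f_init, f_step in zip(ori_feats, init_adv_feats, step_feats):
        init_dir = (f_init - f_ori).flatten()
        step_dir = (f_step - f_ori).flatten()
        init_norm, step_norm = init_dir.norm(p=2), step_dir.norm(p=2)
        magnitude_gain = step_norm / init_norm                          # grow the feature-space shift
        angle = torch.dot(init_dir / init_norm, step_dir / step_norm)   # stay on the initial direction
        losses.append(-(magnitude_weight * magnitude_gain + angle))     # minimized by descent on the modifier
    return torch.stack(losses).sum()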