├── used_idxs.pkl ├── __pycache__ ├── utils.cpython-37.pyc ├── datasets.cpython-37.pyc ├── image_cam.cpython-37.pyc ├── base_attacks.cpython-37.pyc ├── image_attacks.cpython-37.pyc ├── video_attacks.cpython-37.pyc ├── dataset_ucf101.cpython-37.pyc ├── image_cam_utils.cpython-37.pyc ├── reference_ucf101.cpython-37.pyc └── transforms_ucf101.cpython-37.pyc ├── utils.py ├── image_fine_tune_attack.py ├── image_main_ucf101.py ├── image_main.py ├── dataset_ucf101.py ├── reference.py ├── attack.py ├── I2V_attack-env.yml ├── attack_ucf101.py ├── README.md ├── reference_ucf101.py ├── image_cam.py ├── run_image_guided.py ├── image_cam_utils.py ├── video_attacks.py ├── datasets.py ├── TPAMI_attack.py ├── transforms_ucf101.py ├── kinetics400_attack_samples.csv ├── image_attacks.py └── base_attacks.py /used_idxs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/used_idxs.pkl -------------------------------------------------------------------------------- /__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/datasets.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/datasets.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/image_cam.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/image_cam.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/base_attacks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/base_attacks.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/image_attacks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/image_attacks.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/video_attacks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/video_attacks.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/dataset_ucf101.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/dataset_ucf101.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/image_cam_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/image_cam_utils.cpython-37.pyc 
-------------------------------------------------------------------------------- /__pycache__/reference_ucf101.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/reference_ucf101.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/transforms_ucf101.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhipeng-wei/Image-to-Video-I2V-attack/HEAD/__pycache__/transforms_ucf101.cpython-37.pyc -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gluoncv.torch.engine.config import get_cfg_defaults 3 | import torch 4 | 5 | # config info of video models 6 | # refer to https://cv.gluon.ai/model_zoo/action_recognition.html 7 | CONFIG_ROOT = '' # 8 | CONFIG_PATHS = { 9 | 'i3d_resnet50': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet50_v1_kinetics400.yaml'), 10 | 'i3d_resnet101': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet101_v1_kinetics400.yaml'), 11 | 'slowfast_resnet50': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet50_kinetics400.yaml'), 12 | 'slowfast_resnet101': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet101_kinetics400.yaml'), 13 | 'tpn_resnet50': os.path.join(CONFIG_ROOT, 'tpn_resnet50_f32s2_kinetics400.yaml'), 14 | 'tpn_resnet101': os.path.join(CONFIG_ROOT, 'tpn_resnet101_f32s2_kinetics400.yaml') 15 | } 16 | 17 | # data info 18 | UCF_IMAGE_ROOT = '' # 19 | 20 | # save info 21 | OPT_PATH = '' # 22 | 23 | # checkpoints path for ucf101 24 | UCF_CKPT_PATH = '' # 25 | 26 | def change_cfg(cfg, batch_size, random): 27 | # modify video paths and pretrain setting. 28 | cfg.CONFIG.DATA.VAL_DATA_PATH = '' # 29 | cfg.CONFIG.DATA.VAL_ANNO_PATH = './kinetics400_attack_samples.csv' # selected 400 classified correct. 
30 | cfg.CONFIG.MODEL.PRETRAINED = True 31 | cfg.CONFIG.VAL.BATCH_SIZE = batch_size 32 | return cfg 33 | 34 | def get_cfg_custom(cfg_path, batch_size=16, random=False): 35 | cfg = get_cfg_defaults() 36 | cfg.merge_from_file(cfg_path) 37 | cfg = change_cfg(cfg, batch_size, random) 38 | return cfg 39 | 40 | class AverageMeter(object): 41 | """Computes and stores the average and current value""" 42 | 43 | def __init__(self): 44 | self.reset() 45 | 46 | def reset(self): 47 | self.val = 0 48 | self.avg = 0 49 | self.sum = 0 50 | self.count = 0 51 | 52 | def update(self, val, n=1): 53 | self.val = val 54 | self.sum += val * n 55 | self.count += n 56 | self.avg = self.sum / self.count 57 | 58 | def norm_grads(grads, frame_level=True): 59 | # frame level norm 60 | # clip level norm 61 | assert len(grads.shape) == 5 and grads.shape[2] == 32 62 | if frame_level: 63 | norm = torch.mean(torch.abs(grads), [1,3,4], keepdim=True) 64 | else: 65 | norm = torch.mean(torch.abs(grads), [1,2,3,4], keepdim=True) 66 | # norm = torch.norm(grads, dim=[1,2,3,4], p=1) 67 | return grads / norm -------------------------------------------------------------------------------- /image_fine_tune_attack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | import math 6 | import json 7 | from torch.utils.data import Dataset, DataLoader 8 | 9 | import image_attacks 10 | from datasets import get_dataset 11 | from gluoncv.torch.model_zoo import get_model 12 | from utils import CONFIG_PATHS, get_cfg_custom 13 | import pickle as pkl 14 | from reference_ucf101 import MODEL_TO_CKPTS 15 | 16 | class AdvDataset(Dataset): 17 | def __init__(self, used_adv_path, used_ori_path): 18 | self.used_adv_path = used_adv_path 19 | files = os.listdir(self.used_adv_path) 20 | self.files = [i for i in files if 'adv' in i] 21 | self.used_ori_path = used_ori_path 22 | 23 | def __len__(self): 24 | return len(self.files) 25 | 26 | def __getitem__(self, idx): 27 | file = self.files[idx] 28 | vid_id = file.split('-')[0] 29 | ori_file = os.path.join(self.used_ori_path, '{}-ori.npy'.format(vid_id)) 30 | vid = torch.from_numpy(np.load(os.path.join(self.used_adv_path, file))) 31 | vid = vid[None] 32 | ori_vid = torch.from_numpy(np.load(ori_file)) 33 | ori_vid = ori_vid[None] 34 | label = [int(file.split('-')[0])] 35 | label = np.array(label).astype(np.int32) 36 | label = torch.from_numpy(label).long() 37 | return vid, ori_vid, label 38 | 39 | 40 | def arg_parse(): 41 | parser = argparse.ArgumentParser(description='') 42 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 43 | parser.add_argument('--batch_size', type=int, default=4, metavar='N', 44 | help='input batch size for reference (default: 16)') 45 | parser.add_argument('--attack_method', type=str, default='ILAF', help='') 46 | parser.add_argument('--opt_path', type=str, default='') 47 | # adv path 48 | parser.add_argument('--used_adv', type=str, default='', help='') 49 | parser.add_argument('--used_ori', type=str, default='', help='') 50 | # white-box model 51 | parser.add_argument('--white_model', type=str, default='i3d_resnet101', help='i3d_resnet101 | slowfast_resnet101 | tpn_resnet101') 52 | parser.add_argument('--dataset', type=str, default='Kinetics-400', help='Kinetics-400 | UCF-101') 53 | args = parser.parse_args() 54 | return args 55 | 56 | if __name__ == '__main__': 57 | args = arg_parse() 58 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 59 | print (args) 60 | 
# loading cfg. 61 | cfg_path = CONFIG_PATHS[args.white_model] 62 | cfg = get_cfg_custom(cfg_path, args.batch_size) 63 | model = get_model(cfg) 64 | if args.dataset == 'UCF-101': 65 | ckpt_path = MODEL_TO_CKPTS[args.white_model] 66 | model.load_state_dict(torch.load(ckpt_path)['state_dict']) 67 | model.cuda() 68 | 69 | # loading dataset. 70 | dataset = AdvDataset(used_adv_path = args.used_adv, used_ori_path=args.used_ori) 71 | 72 | attack_method = getattr(image_attacks, args.attack_method)(model, args.white_model) 73 | for step in range(len(dataset)): 74 | if step %1 == 0: 75 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset))) 76 | # val_batch, val_label = generate_batch(files_batch[step]) 77 | val_batch, ori_batch, val_label = dataset[step] 78 | video_names = ['...'] 79 | adv_batches = attack_method(val_batch, ori_batch, val_label, video_names) 80 | for ind,label in enumerate(val_label): 81 | adv = adv_batches[ind].detach().cpu().numpy() 82 | np.save(os.path.join(args.opt_path, '{}-adv'.format(label.item())), adv) -------------------------------------------------------------------------------- /image_main_ucf101.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | 6 | import math 7 | import json 8 | 9 | import image_attacks 10 | from dataset_ucf101 import attack_genearte_dataeset 11 | from gluoncv.torch.model_zoo import get_model 12 | from utils import CONFIG_PATHS, get_cfg_custom, OPT_PATH 13 | import pickle as pkl 14 | 15 | def arg_parse(): 16 | parser = argparse.ArgumentParser(description='') 17 | # parallel run 18 | parser.add_argument('--batch_nums', type=int, default=1) 19 | parser.add_argument('--batch_index', type=int, default=1) 20 | 21 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 22 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 23 | parser.add_argument('--batch_size', type=int, default=1, metavar='N', 24 | help='input batch size for reference (default: 16)') 25 | parser.add_argument('--attack_method', type=str, default='ImageGuidedAttentionMap', help='') 26 | parser.add_argument('--step', type=int, default=10, metavar='N', 27 | help='Multi-step or One-step in TI and SGM.') 28 | parser.add_argument('--file_prefix', type=str, default='') 29 | 30 | # for std 31 | parser.add_argument('--depth', type=int, default=1, help='1,2,3,4') 32 | parser.add_argument('--lamb', type=float, default=0.1, help='') 33 | 34 | parser.add_argument('--mode', type=str, default='direction', help='diff_norm\direction') 35 | parser.add_argument('--step_size', type=float, default=0.004, help='') 36 | 37 | # for dropout 38 | parser.add_argument('--dropout', type=float, default=0.1, help='') 39 | 40 | # for direction with changing image model 41 | parser.add_argument('--direction_image_model', type=str, default='resnet', help='resnet, densenet, squeezenet, vgg, alexnet') 42 | args = parser.parse_args() 43 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format('Image', args.attack_method, args.step, args.file_prefix)) 44 | if not os.path.exists(args.adv_path): 45 | os.makedirs(args.adv_path) 46 | return args 47 | 48 | if __name__ == '__main__': 49 | args = arg_parse() 50 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 51 | print (args) 52 | 53 | # loading dataset and model. 
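# NOTE (inferred from dataset_ucf101.py later in this listing, not stated explicitly here):
# attack_genearte_dataeset(batch_size) returns a DataLoader over the pre-selected UCF-101 clips
# recorded in used_idxs.pkl. Each batch is a tuple (clip, label), where clip is a float tensor of
# shape [batch_size, 3, 32, 224, 224] (32 frames via LoopPadding(32), 224x224 crops, ImageNet
# mean/std normalization from test_transform()) and label is the integer class index.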
54 | dataset_loader = attack_genearte_dataeset(args.batch_size) 55 | 56 | nums_contained = int(400 / args.batch_nums) 57 | left = (args.batch_index-1) * nums_contained 58 | right = args.batch_index * nums_contained 59 | 60 | # attack 61 | if args.attack_method == 'ImageGuidedStd_Adam': 62 | model_name_lists = [args.direction_image_model] 63 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 64 | elif args.attack_method == 'ImageGuidedFMDirection_Adam': 65 | model_name_lists = [args.direction_image_model] 66 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 67 | elif args.attack_method == 'ImageGuidedFML2_Adam_MultiModels': 68 | model_name_lists = ['resnet', 'vgg', 'squeezenet', 'alexnet'] 69 | depths = { 70 | 'resnet':2, 71 | 'vgg':3, 72 | 'squeezenet':2, 73 | 'alexnet':3 74 | } 75 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depths=depths, steps=args.step) 76 | 77 | for step, data in enumerate(dataset_loader): 78 | if step >= left and step < right: 79 | if step %1 == 0: 80 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 81 | val_batch = data[0] 82 | val_label = data[1] 83 | video_names = str(val_label) 84 | # np.save(os.path.join(args.adv_path, '{}-ori'.format(val_label[0].item())), val_batch[0].detach().cpu().numpy()) 85 | adv_batches = attack_method(val_batch, val_label, video_names) 86 | for ind,label in enumerate(val_label): 87 | adv = adv_batches[ind].detach().cpu().numpy() 88 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 89 | 90 | with open(os.path.join(args.adv_path, 'loss_info_{}.json'.format(args.batch_index)), 'w') as opt: 91 | json.dump(attack_method.loss_info, opt) 92 | -------------------------------------------------------------------------------- /image_main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | 6 | import math 7 | import json 8 | 9 | import image_attacks 10 | from datasets import get_dataset 11 | from gluoncv.torch.model_zoo import get_model 12 | from utils import CONFIG_PATHS, OPT_PATH, get_cfg_custom 13 | import pickle as pkl 14 | 15 | def arg_parse(): 16 | parser = argparse.ArgumentParser(description='') 17 | # parallel run 18 | parser.add_argument('--batch_nums', type=int, default=1) 19 | parser.add_argument('--batch_index', type=int, default=1) 20 | 21 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 22 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 23 | parser.add_argument('--batch_size', type=int, default=1, metavar='N', 24 | help='input batch size for reference (default: 16)') 25 | parser.add_argument('--attack_method', type=str, default='ImageGuidedAttentionMap', help='') 26 | # parser.add_argument('--step', type=int, default=60, metavar='N', 27 | # help='Multi-step or One-step in TI and SGM.') 28 | parser.add_argument('--step', type=int, default=60, metavar='N', 29 | help='Multi-step or One-step in TI and SGM.') 30 | parser.add_argument('--file_prefix', type=str, default='') 31 | 32 | # for std 33 | parser.add_argument('--depth', type=int, default=1, help='1,2,3,4') 34 | parser.add_argument('--lamb', type=float, default=0.1, help='') 35 | 36 | parser.add_argument('--mode', type=str, 
default='direction', help='diff_norm\direction') 37 | parser.add_argument('--step_size', type=float, default=0.004, help='') 38 | 39 | # for dropout 40 | parser.add_argument('--dropout', type=float, default=0.1, help='') 41 | 42 | # for direction with changing image model 43 | parser.add_argument('--direction_image_model', type=str, default='resnet', help='resnet, densenet, squeezenet, vgg, alexnet') 44 | args = parser.parse_args() 45 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format('Image', args.attack_method, args.step, args.file_prefix)) 46 | if not os.path.exists(args.adv_path): 47 | os.makedirs(args.adv_path) 48 | return args 49 | 50 | if __name__ == '__main__': 51 | args = arg_parse() 52 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 53 | print (args) 54 | # loading cfg. 55 | cfg_path = CONFIG_PATHS['i3d_resnet101'] 56 | cfg = get_cfg_custom(cfg_path, args.batch_size) 57 | 58 | # loading dataset and model. 59 | dataset_loader = get_dataset(cfg) 60 | 61 | nums_contained = int(400 / args.batch_nums) 62 | left = (args.batch_index-1) * nums_contained 63 | right = args.batch_index * nums_contained 64 | 65 | # attack ImageGuidedStd_Adam ImageGuidedFMDirection_Adam ImageGuidedFML2_Adam_MultiModels ENS_FT_I2V ILAF 66 | if args.attack_method == 'ImageGuidedStd_Adam': 67 | model_name_lists = [args.direction_image_model] 68 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 69 | elif args.attack_method == 'ImageGuidedFMDirection_Adam': 70 | model_name_lists = [args.direction_image_model] 71 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depth=args.depth, step_size=args.step_size, steps=args.step) 72 | elif args.attack_method == 'ImageGuidedFML2_Adam_MultiModels': 73 | model_name_lists = ['resnet', 'vgg', 'squeezenet', 'alexnet'] 74 | depths = { 75 | 'resnet':2, 76 | 'vgg':3, 77 | 'squeezenet':2, 78 | 'alexnet':3 79 | } 80 | attack_method = getattr(image_attacks, args.attack_method)(model_name_lists, depths=depths) 81 | 82 | for step, data in enumerate(dataset_loader): 83 | if step >= left and step < right: 84 | if step %1 == 0: 85 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 86 | val_batch = data[0] 87 | val_label = data[1] 88 | video_names = data[2] 89 | adv_batches = attack_method(val_batch, val_label, video_names) 90 | for ind,label in enumerate(val_label): 91 | adv = adv_batches[ind].detach().cpu().numpy() 92 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 93 | 94 | with open(os.path.join(args.adv_path, 'loss_info_{}.json'.format(args.batch_index)), 'w') as opt: 95 | json.dump(attack_method.loss_info, opt) 96 | -------------------------------------------------------------------------------- /dataset_ucf101.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | from PIL import Image 3 | import os 4 | import math 5 | import functools 6 | import json 7 | import copy 8 | import numpy as np 9 | from transforms_ucf101 import * 10 | import pickle as pkl 11 | from utils import UCF_IMAGE_ROOT 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | 
except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | def get_default_video_loader(): 49 | image_loader = get_default_image_loader() 50 | return functools.partial(video_loader, image_loader=image_loader) 51 | 52 | class attack_ucf101(data.Dataset): 53 | def __init__(self, spatial_transform=None, temporal_transform=None,get_loader=get_default_video_loader): 54 | setting='./test01_setting.txt' 55 | self.clips = self._make_dataset(setting) 56 | self.spatial_transform = spatial_transform 57 | self.temporal_transform = temporal_transform 58 | self.loader = get_loader() 59 | with open('./used_idxs.pkl', 'rb') as ipt: 60 | used_idxs = pkl.load(ipt) 61 | self.new_clips = [] 62 | for i in used_idxs: 63 | self.new_clips.append(self.clips[i]) 64 | self.clips = self.new_clips 65 | print ('length', len(self.clips)) 66 | def __getitem__(self, index): 67 | directory, duration, target = self.clips[index] 68 | frame_indices = list(range(1, duration + 1)) 69 | 70 | if self.temporal_transform is not None: 71 | frame_indices = self.temporal_transform(frame_indices) 72 | 73 | clip = self.loader(directory, frame_indices) 74 | 75 | if self.spatial_transform is not None: 76 | self.spatial_transform.randomize_parameters() 77 | clip = [self.spatial_transform(img) for img in clip] 78 | 79 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 80 | 81 | return clip, target 82 | 83 | def _make_dataset(self, setting): 84 | if not os.path.exists(setting): 85 | raise(RuntimeError("Setting file %s doesn't exist. Check opt.train-list and opt.val-list. " % (setting))) 86 | clips = [] 87 | with open(setting) as split_f: 88 | data = split_f.readlines() 89 | for line in data: 90 | line_info = line.split() 91 | # line format: video_path, video_duration, video_label 92 | if len(line_info) < 3: 93 | raise(RuntimeError('Video input format is not correct, missing one or more element. 
%s' % line)) 94 | clip_path = os.path.join(UCF_IMAGE_ROOT, line_info[0]) 95 | duration = int(line_info[1]) 96 | target = int(line_info[2]) 97 | item = (clip_path, duration, target) 98 | clips.append(item) 99 | return clips 100 | 101 | def __len__(self): 102 | return len(self.clips) 103 | 104 | def attack_genearte_dataeset(test_batch_size): 105 | test_spa_trans, test_temp_trans = test_transform() 106 | test_dataset = attack_ucf101(spatial_transform=test_spa_trans, temporal_transform=test_temp_trans) 107 | val_loader = torch.utils.data.DataLoader( 108 | test_dataset, batch_size=test_batch_size, shuffle=False, 109 | num_workers=9, pin_memory=True) 110 | 111 | return val_loader 112 | 113 | def test_transform(): 114 | input_size = 224 115 | scale_ratios = '1.0, 0.8' 116 | scale_ratios = [float(i) for i in scale_ratios.split(',')] 117 | default_mean = [0.485, 0.456, 0.406] 118 | default_std = [0.229, 0.224, 0.225] 119 | norm_method = Normalize(default_mean, default_std) 120 | spatial_transform = spatial_Compose([ 121 | Scale(int(input_size / 1.0)), 122 | CornerCrop(input_size, 'c'), 123 | ToTensor(), norm_method 124 | ]) 125 | temporal_transform = LoopPadding(32) 126 | return spatial_transform, temporal_transform -------------------------------------------------------------------------------- /reference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | import json 6 | 7 | import torch 8 | from gluoncv.torch.model_zoo import get_model 9 | from utils import CONFIG_PATHS, get_cfg_custom, AverageMeter, OPT_PATH 10 | from datasets import get_dataset 11 | import argparse 12 | import math 13 | 14 | def arg_parse(): 15 | parser = argparse.ArgumentParser(description='') 16 | parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 17 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 18 | parser.add_argument('--batch_size', type=int, default=16, metavar='N', 19 | help='input batch size for reference (default: 16)') 20 | args = parser.parse_args() 21 | if 'DATACENTER' in args.adv_path: 22 | pass 23 | else: 24 | args.adv_path = os.path.join(OPT_PATH, args.adv_path) 25 | args.adv_path = os.path.join(OPT_PATH, args.adv_path) 26 | return args 27 | 28 | def accuracy(output, target): 29 | batch_size = target.size(0) 30 | 31 | _, pred = output.topk(1, 1, True, True) 32 | pred = pred.t() # batch_size, 1 33 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 34 | 35 | correct_k = correct[:1].view(-1).float().sum(0) 36 | return correct_k.mul_(100.0 / batch_size), torch.squeeze(pred) 37 | 38 | def generate_batch(batch_files): 39 | batches = [] 40 | labels = [] 41 | for file in batch_files: 42 | batches.append(torch.from_numpy(np.load(os.path.join(args.adv_path, file))).cuda()) 43 | labels.append(int(file.split('-')[0])) 44 | labels = np.array(labels).astype(np.int32) 45 | labels = torch.from_numpy(labels) 46 | return torch.stack(batches), labels 47 | 48 | def reference(model, files_batch): 49 | data_time = AverageMeter() 50 | top1 = AverageMeter() 51 | batch_time = AverageMeter() 52 | 53 | predictions = [] 54 | labels = [] 55 | 56 | end = time.time() 57 | with torch.no_grad(): 58 | for step, batch in enumerate(files_batch): 59 | data_time.update(time.time() - end) 60 | val_batch, val_label = generate_batch(batch) 61 | 62 | val_batch = val_batch.cuda() 63 | val_label = val_label.cuda() 64 | 65 | batch_size = val_label.size(0) 66 | 
outputs = model(val_batch) 67 | 68 | prec1a, preds = accuracy(outputs.data, val_label) 69 | 70 | predictions += list(preds.cpu().numpy()) 71 | labels += list(val_label.cpu().numpy()) 72 | 73 | top1.update(prec1a.item(), val_batch.size(0)) 74 | batch_time.update(time.time() - end) 75 | end = time.time() 76 | 77 | if step % 5 == 0: 78 | print('----validation----') 79 | print_string = 'Process: [{0}/{1}]'.format(step + 1, len(files_batch)) 80 | print(print_string) 81 | print_string = 'data_time: {data_time:.3f}, batch time: {batch_time:.3f}'.format( 82 | data_time=data_time.val, 83 | batch_time=batch_time.val) 84 | print(print_string) 85 | print_string = 'top-1 accuracy: {top1_acc:.2f}%'.format(top1_acc = top1.avg) 86 | print (print_string) 87 | return predictions, labels, top1.avg 88 | 89 | if __name__ == '__main__': 90 | global args 91 | args = arg_parse() 92 | 93 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 94 | 95 | # loading adversarial examples. 96 | files = os.listdir(args.adv_path) 97 | files = [i for i in files if 'adv' in i] 98 | 99 | batch_times = math.ceil(len(files) / args.batch_size) 100 | files_batch = [] 101 | for i in range(batch_times): 102 | batch = files[i*args.batch_size: min((i+1)*args.batch_size, len(files))] 103 | files_batch.append(batch) 104 | 105 | model_val_acc = {} 106 | info_df = pd.DataFrame() 107 | info_df['gt_label'] = [i for i in range(400)] 108 | for model_name in CONFIG_PATHS.keys(): 109 | print ('Model-{}:'.format(model_name)) 110 | cfg_path = CONFIG_PATHS[model_name] 111 | cfg = get_cfg_custom(cfg_path) 112 | model = get_model(cfg).cuda() 113 | model.eval() 114 | preds, labels, top1_avg = reference(model, files_batch) 115 | 116 | predd = np.zeros_like(preds) 117 | inds = np.argsort(labels) 118 | for i,ind in enumerate(inds): 119 | predd[ind] = preds[i] 120 | 121 | print (args.adv_path) 122 | info_df['{}-pre'.format(model_name)] = predd 123 | model_val_acc[model_name] = top1_avg 124 | del model 125 | torch.cuda.empty_cache() 126 | 127 | info_df.to_csv(os.path.join(args.adv_path, 'results_all_models_prediction.csv'), index=False) 128 | with open(os.path.join(args.adv_path, 'top1_acc_all_models.json'), 'w') as opt: 129 | json.dump(model_val_acc, opt) 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /attack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import os 4 | import torch 5 | import math 6 | 7 | import base_attacks 8 | import video_attacks 9 | from datasets import get_dataset 10 | from gluoncv.torch.model_zoo import get_model 11 | from utils import CONFIG_PATHS, OPT_PATH, get_cfg_custom 12 | 13 | def arg_parse(): 14 | parser = argparse.ArgumentParser(description='') 15 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 16 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 17 | parser.add_argument('--batch_size', type=int, default=4, metavar='N', 18 | help='input batch size for reference (default: 16)') 19 | parser.add_argument('--model', type=str, default='i3d_resnet101', help='i3d_resnet101 | i3d_slow_resnet101 | slowfast_resnet101 | tpn_resnet101.') 20 | parser.add_argument('--attack_method', type=str, default='TemporalAugmentationMomentum', help='FGSM | BIM | MIFGSM | DIFGSM | TIFGSM | SGM') 21 | parser.add_argument('--attack_type', type=str, default='image', help='image | video') 22 | parser.add_argument('--step', type=int, 
default=10, metavar='N', 23 | help='Multi-step or One-step in TI and SGM.') 24 | parser.add_argument('--sf_frame', type=int, default=32, metavar='N', 25 | help='SFFGSM frame.') 26 | parser.add_argument('--cf_frame', type=str, default='small', metavar='N', 27 | help='CFFGSM frame.') 28 | parser.add_argument('--kernlen', type=int, default=15, metavar='N', 29 | help='SFFGSM frame.') 30 | parser.add_argument('--nsig', type=int, default=3, metavar='N', 31 | help='SFFGSM frame.') 32 | parser.add_argument('--file_prefix', type=str, default='') 33 | parser.add_argument('--kernel_mode', type=str, default='gaussian') 34 | parser.add_argument('--iterative_momentum', action='store_true', default=False, help='Use iterative momentum in MFFGSM.') 35 | parser.add_argument('--frame_conv', action='store_true', default=False, help='Use frame_conv in MFFGSM.') 36 | # for TemporalAugmentationMomentum 37 | parser.add_argument('--augmentation_weight', type=float, default=1.0, help='') 38 | parser.add_argument('--frame_momentum', action='store_true', default=False, help='') 39 | parser.add_argument('--gamma', type=float, default=1.0, help='') 40 | # for combine momentum 41 | parser.add_argument('--no_iterative_momentum', action='store_true', default=False, help='') 42 | parser.add_argument('--weight_add', action='store_true', default=False, help='') 43 | parser.add_argument('--momentum_weight', type=float, default=0.5, help='') 44 | parser.add_argument('--iterative_first', action='store_true', default=False, help='') 45 | # for TemporalAugmentation 46 | parser.add_argument('--translation_invariant', action='store_true', default=False, help='') 47 | parser.add_argument('--temporal_augmentation', action='store_true', default=False, help='') 48 | parser.add_argument('--TI_First', action='store_true', default=False, help='') 49 | # for noise and shuffle 50 | parser.add_argument('--noise', action='store_true', default=False, help='') 51 | parser.add_argument('--shuffle_grads', action='store_true', default=False, help='') 52 | # for cycle move 53 | parser.add_argument('--move_type', type=str, default='adj',help='adj | large | random') 54 | args = parser.parse_args() 55 | if args.attack_type == 'video': 56 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, args.file_prefix)) 57 | elif args.attack_type == 'image': 58 | args.adv_path = os.path.join(OPT_PATH, '{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, args.file_prefix)) 59 | if not os.path.exists(args.adv_path): 60 | os.makedirs(args.adv_path) 61 | return args 62 | 63 | if __name__ == '__main__': 64 | args = arg_parse() 65 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 66 | print (args) 67 | # loading cfg. 68 | cfg_path = CONFIG_PATHS[args.model] 69 | cfg = get_cfg_custom(cfg_path, args.batch_size) 70 | 71 | # loading dataset and model. 
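# NOTE (datasets.py is not included in this listing; the following is inferred from utils.py and from
# how the loader is used here and in image_main.py): get_dataset(cfg) iterates over the 400 correctly
# classified Kinetics-400 samples listed in kinetics400_attack_samples.csv. Each batch provides the
# video tensor (shape [batch, 3, 32, H, W]) as data[0] and the ground-truth labels as data[1];
# image_main.py additionally reads the video names from data[2].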
72 | dataset_loader = get_dataset(cfg) 73 | model = get_model(cfg).cuda() 74 | 75 | # attack 76 | if args.attack_type == 'image': 77 | attack_method = getattr(base_attacks, args.attack_method)(model, steps=args.step) 78 | elif args.attack_type == 'video': 79 | if args.attack_method == 'TemporalTranslation': 80 | spe_params = {'kernlen':args.kernlen, 'momentum':args.iterative_momentum, 'weight':args.augmentation_weight, 'move_type':args.move_type, 'kernel_mode':args.kernel_mode} 81 | print ('Used Params') 82 | print (spe_params) 83 | attack_method = getattr(video_attacks, args.attack_method)(model, params=spe_params, steps=args.step) 84 | 85 | for step, data in enumerate(dataset_loader): 86 | if step %1 == 0: 87 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 88 | val_batch = data[0].cuda() 89 | val_label = data[1].cuda() 90 | adv_batches = attack_method(val_batch, val_label) 91 | val_batch = val_batch.detach() 92 | for ind,label in enumerate(val_label): 93 | ori = val_batch[ind].cpu().numpy() 94 | adv = adv_batches[ind].cpu().numpy() 95 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 96 | np.save(os.path.join(args.adv_path, '{}-ori'.format(label.item())), ori) 97 | -------------------------------------------------------------------------------- /I2V_attack-env.yml: -------------------------------------------------------------------------------- 1 | name: transfer 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - _openmp_mutex=4.5=1_gnu 8 | - argcomplete=1.12.3=pyhd3eb1b0_0 9 | - argon2-cffi=20.1.0=py37h27cfd23_1 10 | - async_generator=1.10=py37h28b3542_0 11 | - attrs=21.2.0=pyhd3eb1b0_0 12 | - backcall=0.2.0=pyhd3eb1b0_0 13 | - blas=1.0=mkl 14 | - bleach=4.0.0=pyhd3eb1b0_0 15 | - bzip2=1.0.8=h7b6447c_0 16 | - ca-certificates=2021.7.5=h06a4308_1 17 | - certifi=2021.5.30=py37h06a4308_0 18 | - cffi=1.14.6=py37h400218f_0 19 | - cudatoolkit=10.2.89=hfd86e86_1 20 | - dbus=1.13.18=hb2f20db_0 21 | - debugpy=1.4.1=py37h295c915_0 22 | - decorator=5.0.9=pyhd3eb1b0_0 23 | - defusedxml=0.7.1=pyhd3eb1b0_0 24 | - entrypoints=0.3=py37_0 25 | - expat=2.4.1=h2531618_2 26 | - ffmpeg=4.3=hf484d3e_0 27 | - fontconfig=2.13.1=h6c09931_0 28 | - freetype=2.10.4=h5ab3b9f_0 29 | - glib=2.69.1=h5202010_0 30 | - gmp=6.2.1=h2531618_2 31 | - gnutls=3.6.15=he1e5248_0 32 | - gst-plugins-base=1.14.0=h8213a91_2 33 | - gstreamer=1.14.0=h28cd5cc_2 34 | - icu=58.2=he6710b0_3 35 | - importlib-metadata=4.8.1=py37h06a4308_0 36 | - importlib_metadata=4.8.1=hd3eb1b0_0 37 | - intel-openmp=2021.3.0=h06a4308_3350 38 | - ipykernel=6.2.0=py37h06a4308_1 39 | - ipython=7.27.0=py37hb070fc8_0 40 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 41 | - ipywidgets=7.6.3=pyhd3eb1b0_1 42 | - jedi=0.18.0=py37h06a4308_1 43 | - jinja2=3.0.1=pyhd3eb1b0_0 44 | - jpeg=9b=h024ee3a_2 45 | - jsonschema=3.2.0=pyhd3eb1b0_2 46 | - jupyter=1.0.0=py37_7 47 | - jupyter_client=7.0.1=pyhd3eb1b0_0 48 | - jupyter_console=6.4.0=pyhd3eb1b0_0 49 | - jupyter_core=4.7.1=py37h06a4308_0 50 | - jupyterlab_pygments=0.1.2=py_0 51 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 52 | - lame=3.100=h7b6447c_0 53 | - lcms2=2.12=h3be6417_0 54 | - ld_impl_linux-64=2.35.1=h7274673_9 55 | - libffi=3.3=he6710b0_2 56 | - libgcc-ng=9.3.0=h5101ec6_17 57 | - libgomp=9.3.0=h5101ec6_17 58 | - libiconv=1.15=h63c8f33_5 59 | - libidn2=2.3.2=h7f8727e_0 60 | - libpng=1.6.37=hbc83047_0 61 | - libsodium=1.0.18=h7b6447c_0 62 | - libstdcxx-ng=9.3.0=hd4cf53a_17 63 | - libtasn1=4.16.0=h27cfd23_0 64 | - 
libtiff=4.2.0=h85742a9_0 65 | - libunistring=0.9.10=h27cfd23_0 66 | - libuuid=1.0.3=h1bed415_2 67 | - libuv=1.40.0=h7b6447c_0 68 | - libwebp-base=1.2.0=h27cfd23_0 69 | - libxcb=1.14=h7b6447c_0 70 | - libxml2=2.9.12=h03d6c58_0 71 | - lz4-c=1.9.3=h295c915_1 72 | - markupsafe=2.0.1=py37h27cfd23_0 73 | - matplotlib-inline=0.1.2=pyhd3eb1b0_2 74 | - mistune=0.8.4=py37h14c3975_1001 75 | - mkl=2021.3.0=h06a4308_520 76 | - mkl-service=2.4.0=py37h7f8727e_0 77 | - mkl_fft=1.3.0=py37h42c9631_2 78 | - mkl_random=1.2.2=py37h51133e4_0 79 | - nbclient=0.5.3=pyhd3eb1b0_0 80 | - nbconvert=6.1.0=py37h06a4308_0 81 | - nbformat=5.1.3=pyhd3eb1b0_0 82 | - ncurses=6.2=he6710b0_1 83 | - nest-asyncio=1.5.1=pyhd3eb1b0_0 84 | - nettle=3.7.3=hbbd107a_1 85 | - ninja=1.10.2=hff7bd54_1 86 | - notebook=6.4.3=py37h06a4308_0 87 | - numpy=1.20.3=py37hf144106_0 88 | - numpy-base=1.20.3=py37h74d4b33_0 89 | - olefile=0.46=py37_0 90 | - openh264=2.1.0=hd408876_0 91 | - openjpeg=2.4.0=h3ad879b_0 92 | - openssl=1.1.1l=h7f8727e_0 93 | - packaging=21.0=pyhd3eb1b0_0 94 | - pandocfilters=1.4.3=py37h06a4308_1 95 | - parso=0.8.2=pyhd3eb1b0_0 96 | - pcre=8.45=h295c915_0 97 | - pexpect=4.8.0=pyhd3eb1b0_3 98 | - pickleshare=0.7.5=pyhd3eb1b0_1003 99 | - pillow=8.3.1=py37h2c7a002_0 100 | - pip=21.0.1=py37h06a4308_0 101 | - prometheus_client=0.11.0=pyhd3eb1b0_0 102 | - prompt-toolkit=3.0.17=pyhca03da5_0 103 | - prompt_toolkit=3.0.17=hd3eb1b0_0 104 | - ptyprocess=0.7.0=pyhd3eb1b0_2 105 | - pycparser=2.20=py_2 106 | - pygments=2.10.0=pyhd3eb1b0_0 107 | - pyparsing=2.4.7=pyhd3eb1b0_0 108 | - pyqt=5.9.2=py37h05f1152_2 109 | - pyrsistent=0.17.3=py37h7b6447c_0 110 | - python=3.7.11=h12debd9_0 111 | - python-dateutil=2.8.2=pyhd3eb1b0_0 112 | - pytorch=1.9.1=py3.7_cuda10.2_cudnn7.6.5_0 113 | - pyzmq=22.2.1=py37h295c915_1 114 | - qt=5.9.7=h5867ecd_1 115 | - qtconsole=5.1.0=pyhd3eb1b0_0 116 | - qtpy=1.10.0=pyhd3eb1b0_0 117 | - readline=8.1=h27cfd23_0 118 | - send2trash=1.5.0=pyhd3eb1b0_1 119 | - setuptools=58.0.4=py37h06a4308_0 120 | - sip=4.19.8=py37hf484d3e_0 121 | - six=1.16.0=pyhd3eb1b0_0 122 | - sqlite=3.36.0=hc218d9a_0 123 | - terminado=0.9.4=py37h06a4308_0 124 | - testpath=0.5.0=pyhd3eb1b0_0 125 | - tk=8.6.10=hbc83047_0 126 | - torchaudio=0.9.1=py37 127 | - torchvision=0.10.1=py37_cu102 128 | - tornado=6.1=py37h27cfd23_0 129 | - traitlets=5.0.5=pyhd3eb1b0_0 130 | - typing_extensions=3.10.0.2=pyh06a4308_0 131 | - wcwidth=0.2.5=pyhd3eb1b0_0 132 | - webencodings=0.5.1=py37_1 133 | - wheel=0.37.0=pyhd3eb1b0_1 134 | - widgetsnbextension=3.5.1=py37_0 135 | - xz=5.2.5=h7b6447c_0 136 | - zeromq=4.3.4=h2531618_0 137 | - zipp=3.5.0=pyhd3eb1b0_0 138 | - zlib=1.2.11=h7b6447c_3 139 | - zstd=1.4.9=haebb681_0 140 | - pip: 141 | - autocfg==0.0.8 142 | - charset-normalizer==2.0.6 143 | - cycler==0.10.0 144 | - decord==0.6.0 145 | - gluoncv==0.10.4.post4 146 | - idna==3.2 147 | - kiwisolver==1.3.2 148 | - matplotlib==3.4.3 149 | - opencv-contrib-python==4.5.3.56 150 | - pandas==1.3.3 151 | - portalocker==2.3.2 152 | - pytz==2021.1 153 | - pyyaml==5.4.1 154 | - requests==2.26.0 155 | - scipy==1.7.1 156 | - seaborn==0.11.2 157 | - timm==0.5.0 158 | - tqdm==4.62.3 159 | - urllib3==1.26.6 160 | - yacs==0.1.8 161 | prefix: None 162 | -------------------------------------------------------------------------------- /attack_ucf101.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import numpy as np 5 | import math 6 | 7 | import base_attacks 8 | import video_attacks 9 | from 
dataset_ucf101 import attack_genearte_dataeset 10 | from gluoncv.torch.model_zoo import get_model 11 | from utils import CONFIG_PATHS, get_cfg_custom, OPT_PATH 12 | from reference_ucf101 import MODEL_TO_CKPTS 13 | 14 | def arg_parse(): 15 | parser = argparse.ArgumentParser(description='') 16 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 17 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 18 | parser.add_argument('--batch_size', type=int, default=4, metavar='N', 19 | help='input batch size for reference (default: 16)') 20 | parser.add_argument('--model', type=str, default='i3d_resnet101', help='i3d_resnet101 | i3d_slow_resnet101 | slowfast_resnet101 | tpn_resnet101.') 21 | parser.add_argument('--attack_method', type=str, default='TemporalAugmentationMomentum', help='FGSM | BIM | MIFGSM | DIFGSM | TIFGSM | SGM') 22 | parser.add_argument('--attack_type', type=str, default='image', help='image | video') 23 | parser.add_argument('--step', type=int, default=10, metavar='N', 24 | help='Multi-step or One-step in TI and SGM.') 25 | parser.add_argument('--sf_frame', type=int, default=32, metavar='N', 26 | help='SFFGSM frame.') 27 | parser.add_argument('--cf_frame', type=str, default='small', metavar='N', 28 | help='CFFGSM frame.') 29 | parser.add_argument('--kernlen', type=int, default=15, metavar='N', 30 | help='SFFGSM frame.') 31 | parser.add_argument('--nsig', type=int, default=3, metavar='N', 32 | help='SFFGSM frame.') 33 | parser.add_argument('--file_prefix', type=str, default='') 34 | parser.add_argument('--kernel_mode', type=str, default='gaussian') 35 | parser.add_argument('--iterative_momentum', action='store_true', default=False, help='Use iterative momentum in MFFGSM.') 36 | parser.add_argument('--frame_conv', action='store_true', default=False, help='Use frame_conv in MFFGSM.') 37 | # for TemporalAugmentationMomentum 38 | parser.add_argument('--augmentation_weight', type=float, default=1.0, help='') 39 | parser.add_argument('--frame_momentum', action='store_true', default=False, help='') 40 | parser.add_argument('--gamma', type=float, default=1.0, help='') 41 | # for combine momentum 42 | parser.add_argument('--no_iterative_momentum', action='store_true', default=False, help='') 43 | parser.add_argument('--weight_add', action='store_true', default=False, help='') 44 | parser.add_argument('--momentum_weight', type=float, default=0.5, help='') 45 | parser.add_argument('--iterative_first', action='store_true', default=False, help='') 46 | # for TemporalAugmentation 47 | parser.add_argument('--translation_invariant', action='store_true', default=False, help='') 48 | parser.add_argument('--temporal_augmentation', action='store_true', default=False, help='') 49 | parser.add_argument('--TI_First', action='store_true', default=False, help='') 50 | # for noise and shuffle 51 | parser.add_argument('--noise', action='store_true', default=False, help='') 52 | parser.add_argument('--shuffle_grads', action='store_true', default=False, help='') 53 | # for cycle move 54 | parser.add_argument('--move_type', type=str, default='adj',help='adj | large | random') 55 | args = parser.parse_args() 56 | if args.attack_type == 'video': 57 | args.adv_path = os.path.join(OPT_PATH, 'UCF101_Video-{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, args.file_prefix)) 58 | elif args.attack_type == 'image': 59 | args.adv_path = os.path.join(OPT_PATH, 'UCF101_Image-{}-{}-{}-{}'.format(args.model, args.attack_method, args.step, 
args.file_prefix)) 60 | if not os.path.exists(args.adv_path): 61 | os.makedirs(args.adv_path) 62 | return args 63 | 64 | if __name__ == '__main__': 65 | args = arg_parse() 66 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 67 | print (args) 68 | 69 | # loading cfg. 70 | cfg_path = CONFIG_PATHS[args.model] 71 | cfg = get_cfg_custom(cfg_path, args.batch_size) 72 | 73 | # loading dataset and model. 74 | dataset_loader = attack_genearte_dataeset(args.batch_size) 75 | ckpt_path = MODEL_TO_CKPTS[args.model] 76 | model = get_model(cfg) 77 | model.load_state_dict(torch.load(ckpt_path)['state_dict']) 78 | model.cuda() 79 | model.eval() 80 | 81 | # attack 82 | if args.attack_type == 'image': 83 | # FGSM, BIM, MIFGSM, DIFGSM, TIFGSM, SGM, SIM 84 | attack_method = getattr(base_attacks, args.attack_method)(model, steps=args.step) 85 | elif args.attack_type == 'video': 86 | if args.attack_method == 'TemporalTranslation': 87 | spe_params = {'kernlen':15, 'momentum':False, 'weight':1.0, 'move_type':'adj', 'kernel_mode':'gaussian'} 88 | print ('Used Params') 89 | print (spe_params) 90 | attack_method = getattr(video_attacks, args.attack_method)(model, params=spe_params, steps=args.step) 91 | 92 | for step, data in enumerate(dataset_loader): 93 | if step %1 == 0: 94 | print ('Running {}, {}/{}'.format(args.attack_method, step+1, len(dataset_loader))) 95 | val_batch = data[0].cuda() 96 | val_label = data[1].cuda() 97 | adv_batches = attack_method(val_batch, val_label) 98 | val_batch = val_batch.detach() 99 | for ind,label in enumerate(val_label): 100 | ori = val_batch[ind].cpu().numpy() 101 | adv = adv_batches[ind].cpu().numpy() 102 | np.save(os.path.join(args.adv_path, '{}-adv'.format(label.item())), adv) 103 | np.save(os.path.join(args.adv_path, '{}-ori'.format(label.item())), ori) 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 8 | 9 |
10 | 11 | # Adaptive Cross-Modal Transferable Adversarial Attacks From Images to Videos
12 | 13 | **IEEE Transactions on Pattern Analysis and Machine Intelligence ( Volume: 46, Issue: 5, May 2024)** 14 | 15 | **[Zhipeng Wei](https://zhipeng-wei.github.io/), [Jingjing Chen](https://fvl.fudan.edu.cn/people/jingjingchen), [Zuxuan Wu](https://zxwu.azurewebsites.net/), [Yu-Gang Jiang](https://fvl.fudan.edu.cn/people/yugangjiang/)** 16 |
17 | 18 | If you use our method for attacks in your research, please consider citing 19 | ``` 20 | @inproceedings{wei2022cross, 21 | title={Cross-Modal Transferable Adversarial Attacks from Images to Videos}, 22 | author={Wei, Zhipeng and Chen, Jingjing and Wu, Zuxuan and Jiang, Yu-Gang}, 23 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 24 | pages={15064--15073}, 25 | year={2022} 26 | } 27 | @ARTICLE{10375740, 28 | author={Wei, Zhipeng and Chen, Jingjing and Wu, Zuxuan and Jiang, Yu-Gang}, 29 | journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 30 | title={Adaptive Cross-Modal Transferable Adversarial Attacks From Images to Videos}, 31 | year={2024}, 32 | volume={46}, 33 | number={5}, 34 | pages={3772-3783}, 35 | keywords={Videos;Adaptation models;Perturbation methods;Feature extraction;Computational modeling;Glass box;Closed box;Cross-modal attack;transferable attack}, 36 | doi={10.1109/TPAMI.2023.3347835}} 37 | ``` 38 | 39 | # Introduction 40 | We propose the Image To Video (I2V) attack (CVPR paper), which generates adversarial video examples by optimizing against pretrained image models to deceive video models. Specifically, I2V reduces the cosine similarity between adversarial and benign features in the intermediate layers of image models for each video frame. 41 | Moreover, I2V can easily be extended to simultaneously perturb multi-layer features extracted from an ensemble of image models (TPAMI paper). To integrate the various features efficiently, we introduce an adaptive approach that re-weights the contribution of each layer based on its cosine similarity values from the previous attack step. 42 | 43 | # Python Environment 44 | We provide I2V_attack-env.yml to recreate the environment we used. 45 | ``` 46 | conda env create -f I2V_attack-env.yml 47 | ``` 48 | The GPU environment: 49 | ``` 50 | NVIDIA GeForce RTX 2080TI 51 | NVIDIA-SMI 430.14 Driver Version: 430.14 CUDA Version: 10.2 52 | ``` 53 | 54 | # Prepare Model and Dataset 55 | ### Video model 56 | For Kinetics-400, download the config files from [gluon](https://cv.gluon.ai/model_zoo/action_recognition.html). The models include i3d_nl5_resnet50_v1_kinetics400, i3d_nl5_resnet101_v1_kinetics400, slowfast_8x8_resnet50_kinetics400, slowfast_8x8_resnet101_kinetics400, tpn_resnet50_f32s2_kinetics400, tpn_resnet101_f32s2_kinetics400. 57 | After that, set CONFIG_ROOT in utils.py to the directory that holds these config files. We use the Kinetics-400 pretrained models from gluon in our experiments. 58 | 59 | For UCF-101, we fine-tune these models on UCF-101. Download the checkpoint files from [here](https://drive.google.com/open?id=10KOlWdi5bsV9001uL4Bn1T48m9hkgsZ2&authuser=weizhipeng1226%40gmail.com&usp=drive_fs) and set UCF_CKPT_PATH in utils.py accordingly. 60 | 61 | 62 | ### Dataset 63 | Download the Kinetics-400 and UCF-101 datasets, and set OPT_PATH in utils.py to specify the output path. 64 | 65 | For Kinetics-400, set cfg.CONFIG.DATA.VAL_DATA_PATH in utils.py to your validation-data path. 66 | 67 | For UCF-101, split the videos into frames and set UCF_IMAGE_ROOT in utils.py to the path of the extracted UCF-101 frames. 68 | 69 | # Run the code 70 | ## Ablation Study and Performance Comparison 71 | Use this command to obtain the results of Table 3, Table 4, Figure 4, and Figure 5. 72 | ```python 73 | python run_image_guided.py --gpu {gpu} 74 | ``` 75 | ## Generation of adversarial examples 76 | Before comparing the proposed ENS-I2V with ILAF, we need to generate adversarial examples with white-box video models.
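The image-type baselines selected through `--attack_method` (FGSM, BIM, MIFGSM, DIFGSM, TIFGSM, SGM) are implemented in base_attacks.py, which is not reproduced in this listing. As a rough, self-contained illustration of what such a baseline does to a video clip, a minimal MI-FGSM-style update could look like the sketch below; the epsilon, step size, and input layout are illustrative assumptions rather than the repository's exact settings.

```python
import torch

def mifgsm_video(model, clip, label, eps=16/255, alpha=2/255, steps=10, mu=1.0):
    """Minimal MI-FGSM sketch; clip is a normalized video tensor of shape [B, C, T, H, W]."""
    loss_fn = torch.nn.CrossEntropyLoss()
    adv = clip.clone().detach()
    momentum = torch.zeros_like(clip)
    for _ in range(steps):
        adv.requires_grad_(True)
        loss = loss_fn(model(adv), label)
        grad = torch.autograd.grad(loss, adv)[0]
        # accumulate the normalized gradient as momentum, then take a signed step
        momentum = mu * momentum + grad / grad.abs().mean(dim=[1, 2, 3, 4], keepdim=True)
        adv = adv.detach() + alpha * momentum.sign()
        # project the perturbation back into the L_inf ball around the original clip
        adv = clip + torch.clamp(adv - clip, min=-eps, max=eps)
    return adv.detach()
```
The actual classes in base_attacks.py may differ in details such as the loss definition, normalization, and clipping to the valid input range.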
77 | 78 | For Kinetics-400, 79 | ```python 80 | python attack.py --gpu {gpu} --model {model} --attack_type image --attack_method {image_method} --step {step} --batch_size {batch_size} 81 | python attack.py --gpu {gpu} --model {model} --attack_type video --attack_method TemporalTranslation --step {step} --batch_size 1 82 | ``` 83 | * model: the white-box model. 84 | * attack_method: an image-based attack such as FGSM, BIM, or MIFGSM. See base_attacks.py for more attacks. 85 | * step: the number of attack iterations. 86 | 87 | For UCF-101, 88 | ```python 89 | python attack_ucf101.py --gpu {gpu} --model {model} --attack_type image --attack_method {image_method} --step {step} --batch_size {batch_size} 90 | python attack_ucf101.py --gpu {gpu} --model {model} --attack_type video --attack_method TemporalTranslation --step {step} --batch_size 1 91 | ``` 92 | 93 | The generated adversarial examples are stored under the OPT_PATH of utils.py and can be passed directly to the "--used_ori" and "--used_adv" parameters of the subsequent commands. 94 | 95 | ## Comparing against Stronger Baselines 96 | Fine-tune existing adversarial examples by: 97 | ```python 98 | python image_fine_tune_attack.py --gpu {gpu} --attack_method ILAF --used_ori {path} --used_adv {path} --opt_path {path} --white_model {model} --dataset {dataset} 99 | ``` 100 | * used_ori: the path of the original examples. 101 | * used_adv: the path of the existing adversarial examples. 102 | * opt_path: the output path. 103 | * white_model: the white-box model. 104 | * dataset: Kinetics-400 or UCF-101. 105 | 106 | Run inference on the generated adversarial examples with: 107 | ```python 108 | # ucf101 reference 109 | python reference_ucf101.py --gpu {gpu} --adv_path {adv_path} 110 | # kinetics reference 111 | python reference.py --gpu {gpu} --adv_path {adv_path} 112 | ``` 113 | * adv_path: the output path of the generated adversarial examples.
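To connect these commands back to the method described in the Introduction: the core of I2V pushes the intermediate image-model features of each adversarial frame away from the benign features by minimizing their cosine similarity. The sketch below is a simplified, single-model illustration rather than the exact code in image_attacks.py (which is not shown in this listing); the torchvision backbone, hooked layer, perturbation budget, and Adam step size are assumptions.

```python
import torch
import torch.nn.functional as F
import torchvision.models as models

def i2v_sketch(frames, eps=16/255, steps=60, lr=0.005):
    """frames: normalized video frames of shape [N, 3, 224, 224], treated as a batch of images."""
    backbone = models.resnet101(pretrained=True).to(frames.device).eval()
    for p in backbone.parameters():
        p.requires_grad_(False)

    feats = {}
    # hook one intermediate layer; layer3 is an assumption (the paper studies different depths)
    backbone.layer3.register_forward_hook(lambda m, i, o: feats.update(value=o))

    with torch.no_grad():
        backbone(frames)
        benign = feats['value'].flatten(1)

    delta = torch.zeros_like(frames, requires_grad=True)
    optimizer = torch.optim.Adam([delta], lr=lr)
    for _ in range(steps):
        backbone(frames + delta)
        adv = feats['value'].flatten(1)
        # push adversarial features away from benign ones by minimizing cosine similarity
        loss = F.cosine_similarity(adv, benign, dim=1).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # keep the perturbation within the L_inf budget
        delta.data.clamp_(-eps, eps)
    return (frames + delta).detach()
```
The ENS and adaptive variants apply the same cosine-similarity loss to multiple layers of multiple image models and re-weight each term using the cosine similarity values from the previous attack step, as described in the Introduction.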
114 | -------------------------------------------------------------------------------- /reference_ucf101.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import pandas as pd 5 | import json 6 | 7 | import torch 8 | from gluoncv.torch.engine.config import get_cfg_defaults 9 | from gluoncv.torch.model_zoo import get_model 10 | from utils import AverageMeter, OPT_PATH, CONFIG_ROOT, UCF_CKPT_PATH 11 | from datasets import get_dataset 12 | import argparse 13 | import math 14 | 15 | CONFIG_PATHS = { 16 | 'i3d_resnet50': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet50_v1_kinetics400.yaml'), 17 | 'i3d_resnet101': os.path.join(CONFIG_ROOT, 'i3d_nl5_resnet101_v1_kinetics400.yaml'), 18 | 'slowfast_resnet50': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet50_kinetics400.yaml'), 19 | 'slowfast_resnet101': os.path.join(CONFIG_ROOT, 'slowfast_8x8_resnet101_kinetics400.yaml'), 20 | 'tpn_resnet50': os.path.join(CONFIG_ROOT, 'tpn_resnet50_f32s2_kinetics400.yaml'), 21 | 'tpn_resnet101': os.path.join(CONFIG_ROOT, 'tpn_resnet101_f32s2_kinetics400.yaml') 22 | } 23 | 24 | MODEL_TO_CKPTS = { 25 | 'i3d_resnet50': os.path.join(UCF_CKPT_PATH, 'i3d_resnet50.pth'), 26 | 'i3d_resnet101': os.path.join(UCF_CKPT_PATH, 'i3d_resnet101.pth'), 27 | 'slowfast_resnet50': os.path.join(UCF_CKPT_PATH, 'slowfast_resnet50.pth'), 28 | 'slowfast_resnet101': os.path.join(UCF_CKPT_PATH, 'slowfast_resnet101.pth'), 29 | 'tpn_resnet50': os.path.join(UCF_CKPT_PATH, 'tpn_resnet50.pth'), 30 | 'tpn_resnet101': os.path.join(UCF_CKPT_PATH, 'tpn_resnet101.pth') 31 | } 32 | 33 | def arg_parse(): 34 | parser = argparse.ArgumentParser(description='') 35 | parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 36 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 37 | parser.add_argument('--batch_size', type=int, default=16, metavar='N', 38 | help='input batch size for reference (default: 16)') 39 | args = parser.parse_args() 40 | if 'DATACENTER' in args.adv_path: 41 | pass 42 | else: 43 | args.adv_path = os.path.join(OPT_PATH, args.adv_path) 44 | return args 45 | 46 | def accuracy(output, target): 47 | batch_size = target.size(0) 48 | 49 | _, pred = output.topk(1, 1, True, True) 50 | pred = pred.t() # batch_size, 1 51 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 52 | 53 | correct_k = correct[:1].view(-1).float().sum(0) 54 | return correct_k.mul_(100.0 / batch_size), torch.squeeze(pred) 55 | 56 | def generate_batch(batch_files): 57 | batches = [] 58 | labels = [] 59 | for file in batch_files: 60 | batches.append(torch.from_numpy(np.load(os.path.join(args.adv_path, file))).cuda()) 61 | labels.append(int(file.split('-')[0])) 62 | labels = np.array(labels).astype(np.int32) 63 | labels = torch.from_numpy(labels) 64 | return torch.stack(batches), labels 65 | 66 | def reference(model, files_batch): 67 | data_time = AverageMeter() 68 | top1 = AverageMeter() 69 | batch_time = AverageMeter() 70 | 71 | predictions = [] 72 | labels = [] 73 | 74 | end = time.time() 75 | with torch.no_grad(): 76 | for step, batch in enumerate(files_batch): 77 | data_time.update(time.time() - end) 78 | val_batch, val_label = generate_batch(batch) 79 | 80 | val_batch = val_batch.cuda() 81 | val_label = val_label.cuda() 82 | 83 | batch_size = val_label.size(0) 84 | outputs = model(val_batch) 85 | 86 | prec1a, preds = accuracy(outputs.data, val_label) 87 | 88 | predictions += list(preds.cpu().numpy()) 89 | labels += 
list(val_label.cpu().numpy()) 90 | 91 | top1.update(prec1a.item(), val_batch.size(0)) 92 | batch_time.update(time.time() - end) 93 | end = time.time() 94 | 95 | if step % 5 == 0: 96 | print('----validation----') 97 | print_string = 'Process: [{0}/{1}]'.format(step + 1, len(files_batch)) 98 | print(print_string) 99 | print_string = 'data_time: {data_time:.3f}, batch time: {batch_time:.3f}'.format( 100 | data_time=data_time.val, 101 | batch_time=batch_time.val) 102 | print(print_string) 103 | print_string = 'top-1 accuracy: {top1_acc:.2f}%'.format(top1_acc = top1.avg) 104 | print (print_string) 105 | return predictions, labels, top1.avg 106 | 107 | def load_model(model_name): 108 | cfg = get_cfg_defaults() 109 | cfg_path = CONFIG_PATHS[model_name] 110 | cfg.merge_from_file(cfg_path) 111 | cfg.CONFIG.MODEL.PRETRAINED = False 112 | ckpt_path = MODEL_TO_CKPTS[model_name] 113 | model = get_model(cfg) 114 | model.load_state_dict(torch.load(ckpt_path)['state_dict']) 115 | model.cuda() 116 | model.eval() 117 | return model 118 | 119 | if __name__ == '__main__': 120 | global args 121 | args = arg_parse() 122 | 123 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu 124 | 125 | # loading adversarial examples. 126 | files = os.listdir(args.adv_path) 127 | files = [i for i in files if 'adv' in i] 128 | 129 | batch_times = math.ceil(len(files) / args.batch_size) 130 | files_batch = [] 131 | for i in range(batch_times): 132 | batch = files[i*args.batch_size: min((i+1)*args.batch_size, len(files))] 133 | files_batch.append(batch) 134 | 135 | model_val_acc = {} 136 | info_df = pd.DataFrame() 137 | info_df['gt_label'] = [i for i in range(101)] 138 | for model_name in MODEL_TO_CKPTS.keys(): 139 | print ('Model-{}:'.format(model_name)) 140 | model = load_model(model_name) 141 | preds, labels, top1_avg = reference(model, files_batch) 142 | 143 | predd = np.zeros_like(preds) 144 | inds = np.argsort(labels) 145 | for i,ind in enumerate(inds): 146 | predd[ind] = preds[i] 147 | 148 | print (args.adv_path) 149 | info_df['{}-pre'.format(model_name)] = predd 150 | model_val_acc[model_name] = top1_avg 151 | del model 152 | torch.cuda.empty_cache() 153 | 154 | info_df.to_csv(os.path.join(args.adv_path, 'results_all_models_prediction.csv'), index=False) 155 | with open(os.path.join(args.adv_path, 'top1_acc_all_models.json'), 'w') as opt: 156 | json.dump(model_val_acc, opt) 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /image_cam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torchvision.models as models 4 | 5 | from image_cam_utils import find_alexnet_layer, find_vgg_layer, find_resnet_layer, find_densenet_layer, find_squeezenet_layer 6 | 7 | # refer to https://github.com/1Konny/gradcam_plus_plus-pytorch/blob/master/gradcam.py 8 | 9 | def average_grad_cam_from_images(inps): 10 | ''' 11 | inps: [b,c,f,h,w] 12 | ''' 13 | b,c,f,h,w = inps.shape 14 | image_inps = inps.permute([0,2,1,3,4]) 15 | image_inps = image_inps.reshape(b*f, c, h, w) 16 | model_lists = ['alexnet', 'vgg', 'resnet', 'densenet', 'squeezenet'] 17 | masks = [] 18 | for model_name in model_lists: 19 | if model_name == 'alexnet': 20 | model = models.alexnet(pretrained=True) 21 | elif model_name == 'vgg': 22 | model = models.vgg16(pretrained=True) 23 | elif model_name == 'resnet': 24 | model = models.resnet101(pretrained=True) 25 | elif model_name == 'densenet': 26 | model = 
models.densenet161(pretrained=True) 27 | elif model_name == 'squeezenet': 28 | model = models.squeezenet1_1(pretrained=True) 29 | model.eval() 30 | model.cuda() 31 | model_dict = dict(type=model_name, arch=model, input_size=(224, 224)) 32 | 33 | gradcam = GradCAM(model_dict, False) 34 | mask, _ = gradcam(image_inps) 35 | masks.append(mask) 36 | average_mask = torch.stack(masks).mean(0, keepdim=False) 37 | return average_mask 38 | 39 | class GradCAM(object): 40 | """Calculate GradCAM salinecy map. 41 | A simple example: 42 | # initialize a model, model_dict and gradcam 43 | resnet = torchvision.models.resnet101(pretrained=True) 44 | resnet.eval() 45 | model_dict = dict(model_type='resnet', arch=resnet, layer_name='layer4', input_size=(224, 224)) 46 | gradcam = GradCAM(model_dict) 47 | # get an image and normalize with mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225) 48 | img = load_img() 49 | normed_img = normalizer(img) 50 | # get a GradCAM saliency map on the class index 10. 51 | mask, logit = gradcam(normed_img, class_idx=10) 52 | # make heatmap from mask and synthesize saliency map using heatmap and img 53 | heatmap, cam_result = visualize_cam(mask, img) 54 | Args: 55 | model_dict (dict): a dictionary that contains 'model_type', 'arch', layer_name', 'input_size'(optional) as keys. 56 | verbose (bool): whether to print output size of the saliency map givien 'layer_name' and 'input_size' in model_dict. 57 | """ 58 | def __init__(self, model_dict, verbose=False): 59 | model_type = model_dict['type'] 60 | self.model_arch = model_dict['arch'] 61 | 62 | self.gradients = dict() 63 | self.activations = dict() 64 | def backward_hook(module, grad_input, grad_output): 65 | self.gradients['value'] = grad_output[0] 66 | return None 67 | def forward_hook(module, input, output): 68 | self.activations['value'] = output 69 | return None 70 | 71 | if 'vgg' in model_type.lower(): 72 | target_layer = find_vgg_layer(self.model_arch, 'features_29') 73 | elif 'resnet' in model_type.lower(): 74 | target_layer = find_resnet_layer(self.model_arch, 'layer4') 75 | elif 'densenet' in model_type.lower(): 76 | target_layer = find_densenet_layer(self.model_arch, 'features_norm5') 77 | elif 'alexnet' in model_type.lower(): 78 | target_layer = find_alexnet_layer(self.model_arch, 'features_11') 79 | elif 'squeezenet' in model_type.lower(): 80 | target_layer = find_squeezenet_layer(self.model_arch, 'features_12_expand3x3_activation') 81 | 82 | target_layer.register_forward_hook(forward_hook) 83 | target_layer.register_backward_hook(backward_hook) 84 | 85 | if verbose: 86 | try: 87 | input_size = model_dict['input_size'] 88 | except KeyError: 89 | print("please specify size of input image in model_dict. e.g. {'input_size':(224, 224)}") 90 | pass 91 | else: 92 | device = 'cuda' if next(self.model_arch.parameters()).is_cuda else 'cpu' 93 | self.model_arch(torch.zeros(1, 3, *(input_size), device=device)) 94 | print('saliency_map size :', self.activations['value'].shape[2:]) 95 | 96 | 97 | def forward(self, input, ori_feature_mas, class_idx=None, retain_graph=False): 98 | """ 99 | Args: 100 | input: input image with shape of (1, 3, H, W) 101 | class_idx (int): class index for calculating GradCAM. 102 | If not specified, the class index that makes the highest model prediction score will be used. 
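If class_idx is left unspecified, the top-1 predicted class is used. For reference, the saliency computation implemented in forward() below can be sketched in isolation as follows (a minimal standalone sketch; gradcam_map and the random tensors are illustrative, and the bilinear upsample is optional and commented out in the original):

import torch
import torch.nn.functional as F

def gradcam_map(activations, gradients, out_size=(224, 224)):
    # activations/gradients: (B, K, U, V) tensors captured by the hooks registered above
    b, k, u, v = gradients.size()
    alpha = gradients.view(b, k, -1).mean(2)                 # channel-wise importance weights
    saliency = F.relu((alpha.view(b, k, 1, 1) * activations).sum(1, keepdim=True))
    saliency = F.interpolate(saliency, size=out_size, mode='bilinear', align_corners=False)
    s_min, s_max = saliency.min(), saliency.max()
    return (saliency - s_min) / (s_max - s_min + 1e-12)      # min-max normalize to [0, 1]

mask = gradcam_map(torch.rand(2, 512, 7, 7), torch.randn(2, 512, 7, 7))
print(mask.shape)  # torch.Size([2, 1, 224, 224])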
103 | Return: 104 | mask: saliency map of the same spatial dimension with input 105 | logit: model output 106 | """ 107 | b, c, h, w = input.size() 108 | 109 | logit = self.model_arch(input) 110 | if class_idx is None: 111 | score = logit[:, logit.max(1)[-1]].squeeze() 112 | else: 113 | score = logit[:, class_idx].squeeze() 114 | 115 | self.model_arch.zero_grad() 116 | score.backward() 117 | gradients = self.gradients['value'] 118 | activations = self.activations['value'] 119 | b, k, u, v = gradients.size() 120 | 121 | alpha = gradients.view(b, k, -1).mean(2) 122 | #alpha = F.relu(gradients.view(b, k, -1)).mean(2) 123 | weights = alpha.view(b, k, 1, 1) 124 | 125 | saliency_map = (weights*activations).sum(1, keepdim=True) 126 | saliency_map = F.relu(saliency_map) 127 | # saliency_map = F.upsample(saliency_map, size=(h, w), mode='bilinear', align_corners=False) 128 | saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max() 129 | saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data 130 | 131 | 132 | if self.update: 133 | print (saliency_map.shape) 134 | print (ori_feature_mas.shape) 135 | cost = torch.norm(saliency_map.reshape(b, -1) - ori_feature_mas.reshape(b, -1), p=2, dim=1) 136 | grad = torch.autograd.grad(cost, input, grad_outputs=torch.ones_like(cost), 137 | retain_graph=False, create_graph=False)[0] 138 | return grad 139 | else: 140 | return saliency_map 141 | 142 | def __call__(self, input, ori_feature_mas, update=False, class_idx=None, retain_graph=False): 143 | self.update = update 144 | return self.forward(input, ori_feature_mas, class_idx, retain_graph) -------------------------------------------------------------------------------- /run_image_guided.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | 5 | ImageGuidedFMDirection_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step {step} --step_size {step_size} --direction_image_model resnet --batch_size {batch_size} --batch_nums {batch_nums} --batch_index {batch_index} --file_prefix resnet_step_size_{step_size}_paper_study' 6 | ImageGuidedFMDirection_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-{step}-resnet_step_size_{step_size}_paper_study' 7 | 8 | Aba_layer_ImageGuidedFMDirection_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-step_size-0.005-depth-{depth}_paper_study' 9 | Aba_layer_ImageGuidedFMDirection_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-60-{image_model}-step_size-0.005-depth-{depth}_paper_study' 10 | 11 | # Kinetic400 12 | Per_Com_ImageGuidedFMDirection_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 13 | Per_Com_ImageGuidedFMDirection_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-60-{image_model}-depth-{depth}_paper_per_com' 14 | 15 | Per_Com_ImageGuidedFML2_Adam_MultiModels_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedFML2_Adam_MultiModels --step 60 --step_size 0.005 --file_prefix 
paper_per_com' 16 | Per_Com_ImageGuidedFML2_Adam_MultiModels_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedFML2_Adam_MultiModels-60-paper_per_com' 17 | 18 | Per_Com_ImageGuidedStd_Adam_attack_run = 'python image_main.py --gpu {gpu} --attack_method ImageGuidedStd_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 19 | Per_Com_ImageGuidedStd_Adam_reference_run = 'python reference.py --gpu {gpu} --adv_path Image-ImageGuidedStd_Adam-60-{image_model}-depth-{depth}_paper_per_com' 20 | 21 | # UCF101 22 | Per_Com_UCF_ImageGuidedFMDirection_Adam_attack_run = 'python image_main_ucf101.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 23 | Per_Com_UCF_ImageGuidedFMDirection_Adam_reference_run = 'python reference_ucf101.py --gpu {gpu} --adv_path Image-ImageGuidedFMDirection_Adam-60-{image_model}-depth-{depth}_paper_per_com' 24 | 25 | Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_attack_run = 'python image_main_ucf101.py --gpu {gpu} --attack_method ImageGuidedFML2_Adam_MultiModels --step 60 --step_size 0.005 --file_prefix paper_per_com' 26 | Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_reference_run = 'python reference_ucf101.py --gpu {gpu} --adv_path Image-ImageGuidedFML2_Adam_MultiModels-60-paper_per_com' 27 | 28 | Per_Com_UCF_ImageGuidedStd_Adam_attack_run = 'python image_main_ucf101.py --gpu {gpu} --attack_method ImageGuidedStd_Adam --step 60 --step_size 0.005 --direction_image_model {image_model} --depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com' 29 | Per_Com_UCF_ImageGuidedStd_Adam_reference_run = 'python reference_ucf101.py --gpu {gpu} --adv_path Image-ImageGuidedStd_Adam-60-{image_model}-depth-{depth}_paper_per_com' 30 | 31 | 32 | 33 | def arg_parse(): 34 | parser = argparse.ArgumentParser(description='') 35 | # parser.add_argument('--adv_path', type=str, default='', help='the path of adversarial examples.') 36 | parser.add_argument('--gpu', type=str, default='0', help='gpu device.') 37 | parser.add_argument('--batch_size', type=int, default=1, help='') 38 | # parser.add_argument('--kernlens', nargs='+', help=' Set flag', required=True) 39 | args = parser.parse_args() 40 | return args 41 | 42 | if __name__ == '__main__': 43 | args = arg_parse() 44 | 45 | # ablation study for step size and iteration number (Figure 4) 46 | steps = [20, 40, 60, 80, 100] 47 | step_sizes = [0.001, 0.0025, 0.0050, 0.0075, 0.010] 48 | for step in steps: 49 | # step_size = 0.004 50 | for step_size in step_sizes: 51 | os.system(ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, step=step, step_size=step_size, batch_nums=1, batch_index=1, batch_size=1)) 52 | os.system(ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, step=step, step_size=step_size)) 53 | 54 | # ablation study for attacked layer (Table 2 and Figure 5) 55 | depths = [1,2,3,4] 56 | image_models = ['resnet', 'squeezenet', 'vgg', 'alexnet'] 57 | for image_model in image_models: 58 | for depth in depths: 59 | os.system(Aba_layer_ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, depth=depth, image_model=image_model)) 60 | os.system(Aba_layer_ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 61 | 62 | # performance comparison for kinetics-400 (Table 3) 63 | step = 60 64 | step_size = 0.005 65 
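For clarity, the per-model dispatch used in the comparison loops below can be dry-run as in the following sketch (the template string is illustrative and mirrors the Per_Com_* commands defined at the top of this script; replace print with os.system to actually launch the runs):

attack_tmpl = ('python image_main.py --gpu {gpu} --attack_method ImageGuidedFMDirection_Adam '
               '--step 60 --step_size 0.005 --direction_image_model {image_model} '
               '--depth {depth} --file_prefix {image_model}-depth-{depth}_paper_per_com')

for image_model in ['squeezenet', 'vgg', 'alexnet', 'resnet']:
    # layer choice used below: depth 2 for resnet/squeezenet, depth 3 for vgg/alexnet
    depth = 2 if image_model in ('resnet', 'squeezenet') else 3
    print(attack_tmpl.format(gpu='0', image_model=image_model, depth=depth))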
| image_models = ['squeezenet', 'vgg', 'alexnet', 'resnet'] 66 | for image_model in image_models: 67 | if image_model == 'resnet' or image_model == 'squeezenet': 68 | depth = 2 69 | else: 70 | depth = 3 71 | # I2V attack 72 | os.system(Per_Com_ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 73 | os.system(Per_Com_ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 74 | # STD attack 75 | os.system(Per_Com_ImageGuidedStd_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 76 | os.system(Per_Com_ImageGuidedStd_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 77 | 78 | # ENS-I2V attack 79 | os.system(Per_Com_ImageGuidedFML2_Adam_MultiModels_attack_run.format(gpu=args.gpu)) 80 | os.system(Per_Com_ImageGuidedFML2_Adam_MultiModels_reference_run.format(gpu=args.gpu)) 81 | 82 | # performance compasison for ucf101 (Table 4) 83 | step = 60 84 | step_size = 0.005 85 | image_models = ['resnet', 'squeezenet', 'vgg', 'alexnet'] # '', 86 | for image_model in image_models: 87 | if image_model == 'resnet' or image_model == 'squeezenet': 88 | depth = 2 89 | else: 90 | depth = 3 91 | # I2V attack 92 | os.system(Per_Com_UCF_ImageGuidedFMDirection_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 93 | os.system(Per_Com_UCF_ImageGuidedFMDirection_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 94 | # STD attack 95 | os.system(Per_Com_UCF_ImageGuidedStd_Adam_attack_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 96 | os.system(Per_Com_UCF_ImageGuidedStd_Adam_reference_run.format(gpu=args.gpu, image_model=image_model, depth=depth)) 97 | 98 | # ENS-I2V attack 99 | os.system(Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_attack_run.format(gpu=args.gpu)) 100 | os.system(Per_Com_UCF_ImageGuidedFML2_Adam_MultiModels_reference_run.format(gpu=args.gpu)) 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /image_cam_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | 5 | def visualize_cam(mask, img): 6 | """Make heatmap from mask and synthesize GradCAM result image using heatmap and img. 7 | Args: 8 | mask (torch.tensor): mask shape of (1, 1, H, W) and each element has value in range [0, 1] 9 | img (torch.tensor): img shape of (1, 3, H, W) and each pixel value is in range [0, 1] 10 | 11 | Return: 12 | heatmap (torch.tensor): heatmap img shape of (3, H, W) 13 | result (torch.tensor): synthesized GradCAM result of same shape with heatmap. 14 | """ 15 | heatmap = cv2.applyColorMap(np.uint8(255 * mask.squeeze()), cv2.COLORMAP_JET) 16 | heatmap = torch.from_numpy(heatmap).permute(2, 0, 1).float().div(255) 17 | b, g, r = heatmap.split(1) 18 | heatmap = torch.cat([r, g, b]) 19 | 20 | result = heatmap+img.cpu() 21 | result = result.div(result.max()).squeeze() 22 | 23 | return heatmap, result 24 | 25 | 26 | def find_resnet_layer(arch, target_layer_name): 27 | """Find resnet layer to calculate GradCAM and GradCAM++ 28 | 29 | Args: 30 | arch: default torchvision densenet models 31 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 
32 | target_layer_name = 'conv1' 33 | target_layer_name = 'layer1' 34 | target_layer_name = 'layer1_basicblock0' 35 | target_layer_name = 'layer1_basicblock0_relu' 36 | target_layer_name = 'layer1_bottleneck0' 37 | target_layer_name = 'layer1_bottleneck0_conv1' 38 | target_layer_name = 'layer1_bottleneck0_downsample' 39 | target_layer_name = 'layer1_bottleneck0_downsample_0' 40 | target_layer_name = 'avgpool' 41 | target_layer_name = 'fc' 42 | 43 | Return: 44 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 45 | """ 46 | if 'layer' in target_layer_name: 47 | hierarchy = target_layer_name.split('_') 48 | layer_num = int(hierarchy[0].lstrip('layer')) 49 | if layer_num == 1: 50 | target_layer = arch.layer1 51 | elif layer_num == 2: 52 | target_layer = arch.layer2 53 | elif layer_num == 3: 54 | target_layer = arch.layer3 55 | elif layer_num == 4: 56 | target_layer = arch.layer4 57 | else: 58 | raise ValueError('unknown layer : {}'.format(target_layer_name)) 59 | 60 | if len(hierarchy) >= 2: 61 | bottleneck_num = int(hierarchy[1].lower().lstrip('bottleneck').lstrip('basicblock')) 62 | target_layer = target_layer[bottleneck_num] 63 | 64 | if len(hierarchy) >= 3: 65 | target_layer = target_layer._modules[hierarchy[2]] 66 | 67 | if len(hierarchy) == 4: 68 | target_layer = target_layer._modules[hierarchy[3]] 69 | 70 | else: 71 | target_layer = arch._modules[target_layer_name] 72 | 73 | return target_layer 74 | 75 | 76 | def find_densenet_layer(arch, target_layer_name): 77 | """Find densenet layer to calculate GradCAM and GradCAM++ 78 | 79 | Args: 80 | arch: default torchvision densenet models 81 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 82 | target_layer_name = 'features' 83 | target_layer_name = 'features_transition1' 84 | target_layer_name = 'features_transition1_norm' 85 | target_layer_name = 'features_denseblock2_denselayer12' 86 | target_layer_name = 'features_denseblock2_denselayer12_norm1' 87 | target_layer_name = 'features_denseblock2_denselayer12_norm1' 88 | target_layer_name = 'classifier' 89 | 90 | Return: 91 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 92 | """ 93 | 94 | hierarchy = target_layer_name.split('_') 95 | target_layer = arch._modules[hierarchy[0]] 96 | 97 | if len(hierarchy) >= 2: 98 | target_layer = target_layer._modules[hierarchy[1]] 99 | 100 | if len(hierarchy) >= 3: 101 | target_layer = target_layer._modules[hierarchy[2]] 102 | 103 | if len(hierarchy) == 4: 104 | target_layer = target_layer._modules[hierarchy[3]] 105 | 106 | return target_layer 107 | 108 | 109 | def find_vgg_layer(arch, target_layer_name): 110 | """Find vgg layer to calculate GradCAM and GradCAM++ 111 | 112 | Args: 113 | arch: default torchvision densenet models 114 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 115 | target_layer_name = 'features' 116 | target_layer_name = 'features_42' 117 | target_layer_name = 'classifier' 118 | target_layer_name = 'classifier_0' 119 | 120 | Return: 121 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 
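As a quick illustration of how these helpers resolve hierarchical layer names from a separate script (a sketch; pretrained weights are not needed just to look up modules):

import torchvision.models as models
from image_cam_utils import find_resnet_layer, find_vgg_layer

resnet = models.resnet101(pretrained=False)
vgg = models.vgg16(pretrained=False)

print(find_resnet_layer(resnet, 'layer4'))                    # -> resnet.layer4
print(find_resnet_layer(resnet, 'layer1_bottleneck0_conv1'))  # -> resnet.layer1[0].conv1
print(find_vgg_layer(vgg, 'features_29'))                     # -> vgg.features[29], used by GradCAM above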
122 | """ 123 | hierarchy = target_layer_name.split('_') 124 | 125 | if len(hierarchy) >= 1: 126 | target_layer = arch.features 127 | 128 | if len(hierarchy) == 2: 129 | target_layer = target_layer[int(hierarchy[1])] 130 | 131 | return target_layer 132 | 133 | 134 | def find_alexnet_layer(arch, target_layer_name): 135 | """Find alexnet layer to calculate GradCAM and GradCAM++ 136 | 137 | Args: 138 | arch: default torchvision densenet models 139 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 140 | target_layer_name = 'features' 141 | target_layer_name = 'features_0' 142 | target_layer_name = 'classifier' 143 | target_layer_name = 'classifier_0' 144 | 145 | Return: 146 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 147 | """ 148 | hierarchy = target_layer_name.split('_') 149 | 150 | if len(hierarchy) >= 1: 151 | target_layer = arch.features 152 | 153 | if len(hierarchy) == 2: 154 | target_layer = target_layer[int(hierarchy[1])] 155 | 156 | return target_layer 157 | 158 | 159 | def find_squeezenet_layer(arch, target_layer_name): 160 | """Find squeezenet layer to calculate GradCAM and GradCAM++ 161 | 162 | Args: 163 | arch: default torchvision densenet models 164 | target_layer_name (str): the name of layer with its hierarchical information. please refer to usages below. 165 | target_layer_name = 'features_12' 166 | target_layer_name = 'features_12_expand3x3' 167 | target_layer_name = 'features_12_expand3x3_activation' 168 | 169 | Return: 170 | target_layer: found layer. this layer will be hooked to get forward/backward pass information. 171 | """ 172 | hierarchy = target_layer_name.split('_') 173 | target_layer = arch._modules[hierarchy[0]] 174 | 175 | if len(hierarchy) >= 2: 176 | target_layer = target_layer._modules[hierarchy[1]] 177 | 178 | if len(hierarchy) == 3: 179 | target_layer = target_layer._modules[hierarchy[2]] 180 | 181 | elif len(hierarchy) == 4: 182 | target_layer = target_layer._modules[hierarchy[2]+'_'+hierarchy[3]] 183 | 184 | return target_layer 185 | 186 | 187 | def denormalize(tensor, mean, std): 188 | if not tensor.ndimension() == 4: 189 | raise TypeError('tensor should be 4D') 190 | 191 | mean = torch.FloatTensor(mean).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 192 | std = torch.FloatTensor(std).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 193 | 194 | return tensor.mul(std).add(mean) 195 | 196 | 197 | def normalize(tensor, mean, std): 198 | if not tensor.ndimension() == 4: 199 | raise TypeError('tensor should be 4D') 200 | 201 | mean = torch.FloatTensor(mean).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 202 | std = torch.FloatTensor(std).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device) 203 | 204 | return tensor.sub(mean).div(std) 205 | 206 | 207 | class Normalize(object): 208 | def __init__(self, mean, std): 209 | self.mean = mean 210 | self.std = std 211 | 212 | def __call__(self, tensor): 213 | return self.do(tensor) 214 | 215 | def do(self, tensor): 216 | return normalize(tensor, self.mean, self.std) 217 | 218 | def undo(self, tensor): 219 | return denormalize(tensor, self.mean, self.std) 220 | 221 | def __repr__(self): 222 | return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std) -------------------------------------------------------------------------------- /video_attacks.py: -------------------------------------------------------------------------------- 1 | from base_attacks import Attack 2 | 
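A small round-trip check for the normalize/denormalize helpers and the Normalize wrapper defined above (a sketch using the ImageNet statistics adopted throughout this repo):

import torch
from image_cam_utils import Normalize

norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
x = torch.rand(4, 3, 224, 224)               # a batch of images in [0, 1]
x_norm = norm.do(x)                          # (x - mean) / std
x_back = norm.undo(x_norm)                   # undo recovers the original range
print(torch.allclose(x, x_back, atol=1e-6))  # True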
import torch 3 | import torch.nn as nn 4 | import scipy.stats as st 5 | import numpy as np 6 | import torchvision 7 | from PIL import Image 8 | import random 9 | import math 10 | import time 11 | import torch.nn.functional as F 12 | from utils import norm_grads 13 | 14 | class TemporalTranslation(Attack): 15 | ''' 16 | paper: Boosting the transferability of video adversarial examples via temporal translation 17 | Replace conv with multiple queries. 18 | There are two ways: Cycle and Exchange. 19 | Contain momentum or no momentum. 20 | params = {'kernlen':args.kernlen, # conv1 params 21 | 'momentum':args.momentum 22 | 'weight':args.augmentation_weight, 23 | 'move_type': 'adj', 24 | 'kernel_mode': 'gaussian'} 25 | ''' 26 | def __init__(self, model, params, epsilon=16/255, steps=10, delay=1.0): 27 | super(TemporalTranslation, self).__init__("TemporalTranslation", model) 28 | self.epsilon = epsilon 29 | self.steps = steps 30 | self.step_size = self.epsilon / self.steps 31 | self.delay = delay 32 | 33 | for name, value in params.items(): 34 | setattr(self, name, value) 35 | 36 | self.frames = 32 37 | self.cycle_move_list = self._move_info_generation() 38 | if self.kernel_mode == 'gaussian': 39 | kernel = self._initial_kernel_gaussian(self.kernlen).astype(np.float32) # (self.kernlen) 40 | elif self.kernel_mode == 'linear': 41 | kernel = self._initial_kernel_linear(self.kernlen).astype(np.float32) # (self.kernlen) 42 | elif self.kernel_mode == 'random': 43 | kernel = self._initial_kernel_uniform(self.kernlen).astype(np.float32) # (self.kernlen) 44 | 45 | self.kernel = torch.from_numpy(np.expand_dims(kernel, 0)).to(self.device) # 1,self.kernlen 46 | 47 | def _move_info_generation(self): 48 | max_move = int((self.kernlen - 1) / 2) 49 | lists = [i for i in range(-max_move, max_move+1)] 50 | return lists 51 | 52 | def _initial_kernel_linear(self, kernlen): 53 | k = int((kernlen - 1) / 2) 54 | kern1d = [] 55 | for i in range(k+1): 56 | kern1d.append(1 - i / (k+1)) 57 | kern1d = np.array(kern1d[::-1][:-1] + kern1d) 58 | kernel = kern1d / kern1d.sum() 59 | return kernel 60 | 61 | def _initial_kernel_uniform(self, kernlen): 62 | kern1d = np.ones(kernlen) 63 | kernel = kern1d / kern1d.sum() 64 | return kernel 65 | 66 | def _initial_kernel_gaussian(self, kernlen): 67 | assert kernlen%2 == 1 68 | k = (kernlen - 1) /2 69 | sigma = k/3 70 | k = int(k) 71 | def calculte_guassian(x, sigma): 72 | return (1/(sigma*np.sqrt(2*np.pi)) * np.math.exp(-(x**2)/(2* (sigma**2)))) 73 | kern1d = [] 74 | for i in range(-k, k+1): 75 | kern1d.append(calculte_guassian(i, sigma)) 76 | assert len(kern1d) == kernlen 77 | kern1d = np.array(kern1d) 78 | kernel = kern1d / kern1d.sum() 79 | return kernel 80 | 81 | def _conv1d_frame(self, grads): 82 | ''' 83 | grads: D, N, C, T, H, W 84 | ''' 85 | # cycle padding for grads 86 | D,N,C,T,H,W = grads.shape 87 | grads = grads.reshape(D, -1) 88 | 89 | grad = torch.matmul(self.kernel, grads) 90 | grad = grad.reshape(N,C,T,H,W) 91 | return grad 92 | 93 | def _cycle_move(self, adv_videos, cycle_move): 94 | if cycle_move < 0: 95 | direction = -1 96 | else: 97 | direction = 1 98 | cycle_move = abs(cycle_move) 99 | cycle_move = cycle_move % self.frames 100 | new_videos = torch.zeros_like(adv_videos) 101 | for i in range(self.frames): 102 | ori_ind = i 103 | new_ind = (ori_ind + direction * cycle_move) % self.frames 104 | new_videos[:,:,new_ind] = adv_videos[:,:,ori_ind] 105 | return new_videos 106 | 107 | def _cycle_move_large(self, adv_videos, cycle_move): 108 | if cycle_move < 0: 109 | 
direction = -1 110 | else: 111 | direction = 1 112 | cycle_move = abs(cycle_move) 113 | if cycle_move == 0: 114 | cycle_move = cycle_move % self.frames 115 | else: 116 | cycle_move = (cycle_move + (int(self.frames/2)-1)) % self.frames 117 | new_videos = torch.zeros_like(adv_videos) 118 | for i in range(self.frames): 119 | ori_ind = i 120 | new_ind = (ori_ind + direction * cycle_move) % self.frames 121 | new_videos[:,:,new_ind] = adv_videos[:,:,ori_ind] 122 | return new_videos 123 | 124 | def _cycle_move_random(self, adv_videos, cycle_move): 125 | if cycle_move < 0: 126 | direction = -1 127 | else: 128 | direction = 1 129 | # cycle_move = abs(cycle_move) 130 | if cycle_move == 0: 131 | cycle_move = cycle_move % self.frames 132 | else: 133 | cycle_move = random.randint(0, 100) % self.frames 134 | # cycle_move = (cycle_move + int(self.frames/2)) % self.frames 135 | new_videos = torch.zeros_like(adv_videos) 136 | for i in range(self.frames): 137 | ori_ind = i 138 | new_ind = (ori_ind + direction * cycle_move) % self.frames 139 | new_videos[:,:,new_ind] = adv_videos[:,:,ori_ind] 140 | return new_videos 141 | 142 | def _exchange_move(self, adv_videos, exchange_lists): 143 | new_videos = adv_videos.clone() 144 | for exchange in exchange_lists: 145 | one_frame, ano_frame = exchange 146 | new_videos[:,:,one_frame] = adv_videos[:,:,ano_frame] 147 | new_videos[:,:,ano_frame] = adv_videos[:,:,one_frame] 148 | return new_videos 149 | 150 | def _get_grad(self, adv_videos, labels, loss): 151 | batch_size = adv_videos.shape[0] 152 | used_labels = torch.cat([labels]*batch_size, dim=0) 153 | adv_videos.requires_grad = True 154 | outputs = self.model(adv_videos) 155 | cost = self._targeted*loss(outputs, used_labels).to(self.device) 156 | grad = torch.autograd.grad(cost, adv_videos, 157 | retain_graph=False, create_graph=False)[0] 158 | return grad 159 | 160 | def _grad_augmentation(self, grads): 161 | ''' 162 | Input: 163 | grads: kernlen, grad.shape 164 | Return 165 | grad 166 | ''' 167 | same_position_diff_frame = grads.clone() 168 | diff_position_same_frame = torch.zeros_like(grads) 169 | for ind, cycle_move in enumerate(self.cycle_move_list): 170 | diff_position_same_frame[ind] = self._cycle_move(grads[ind], -cycle_move) 171 | s_conv_grad = self._conv1d_frame(same_position_diff_frame) 172 | d_conv_grad = self._conv1d_frame(diff_position_same_frame) 173 | grad = (1-self.weight)*s_conv_grad + self.weight*d_conv_grad 174 | return grad 175 | 176 | def forward(self, videos, labels): 177 | r""" 178 | Overridden. 
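Each step gathers one gradient per temporally shifted copy of the adversarial video and averages them with the 1-D kernel built in __init__. A self-contained sketch of that kernel and of the averaging performed by _conv1d_frame (shapes and the helper name are illustrative):

import numpy as np
import torch

def gaussian_kernel(kernlen=7):
    # same construction as _initial_kernel_gaussian above (sigma = (kernlen - 1) / 6)
    assert kernlen % 2 == 1
    k = (kernlen - 1) / 2
    sigma = k / 3
    xs = np.arange(-int(k), int(k) + 1)
    kern1d = np.exp(-(xs ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))
    return kern1d / kern1d.sum()

kernlen, N, C, T, H, W = 7, 1, 3, 32, 112, 112
kernel = torch.from_numpy(gaussian_kernel(kernlen).astype(np.float32)).view(1, kernlen)
grads = torch.randn(kernlen, N, C, T, H, W)         # one gradient per temporal shift
avg = torch.matmul(kernel, grads.reshape(kernlen, -1)).reshape(N, C, T, H, W)
print(avg.shape)                                    # torch.Size([1, 3, 32, 112, 112])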
179 | """ 180 | videos = videos.to(self.device) 181 | momentum = torch.zeros_like(videos).to(self.device) 182 | labels = labels.to(self.device) 183 | loss = nn.CrossEntropyLoss() 184 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 185 | adv_videos = videos.clone().detach() 186 | del videos 187 | 188 | start_time = time.time() 189 | for i in range(self.steps): 190 | # obtain grads of these variants 191 | batch_new_videos = [] 192 | for cycle_move in self.cycle_move_list: 193 | if self.move_type == 'adj': 194 | new_videos = self._cycle_move(adv_videos, cycle_move) 195 | elif self.move_type == 'large': 196 | new_videos = self._cycle_move_large(adv_videos, cycle_move) 197 | elif self.move_type == 'random': 198 | new_videos = self._cycle_move_random(adv_videos, cycle_move) 199 | batch_new_videos.append(new_videos) 200 | batch_inps = torch.cat(batch_new_videos, dim=0) 201 | grads = [] 202 | batch_times = 5 203 | length = len(self.cycle_move_list) 204 | if self.model_name == 'TPNet': 205 | batch_times = length 206 | print (self.model_name, batch_times) 207 | batch_size = math.ceil(length / batch_times) 208 | for i in range(batch_times): 209 | grad = self._get_grad(batch_inps[i*batch_size:min((i+1)*batch_size, length)], labels, loss) 210 | grads.append(grad) 211 | # grad augmentation 212 | grads = torch.cat(grads, dim=0) 213 | grads = torch.unsqueeze(grads, dim=1) 214 | grad = self._grad_augmentation(grads) 215 | 216 | # momentum 217 | if self.momentum: 218 | grad = norm_grads(grad) 219 | grad += momentum * self.delay 220 | momentum = grad 221 | else: 222 | pass 223 | 224 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 225 | adv_videos = adv_videos + self.step_size*grad.sign() 226 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 227 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 228 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 229 | print ('now_time', time.time()-start_time) 230 | return adv_videos -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # from gluoncv.torch.data import VideoClsDataset 2 | import torch 3 | 4 | import os 5 | import warnings 6 | import numpy as np 7 | try: 8 | from decord import VideoReader, cpu 9 | except ImportError: 10 | VideoReader = None 11 | cpu = None 12 | 13 | import torch 14 | from torch.utils.data import Dataset 15 | 16 | from gluoncv.torch.data import video_transforms, volume_transforms, multiGridHelper, MultiGridBatchSampler 17 | 18 | class VideoClsDataset(Dataset): 19 | """Load your own video classification dataset.""" 20 | 21 | def __init__(self, anno_path, data_path, mode='train', clip_len=8, 22 | frame_sample_rate=2, crop_size=224, short_side_size=256, 23 | new_height=256, new_width=340, keep_aspect_ratio=False, 24 | num_segment=1, num_crop=1, test_num_segment=10, test_num_crop=3, 25 | use_multigrid=False): 26 | self.anno_path = anno_path 27 | self.data_path = data_path 28 | self.mode = mode 29 | self.clip_len = clip_len 30 | self.frame_sample_rate = frame_sample_rate 31 | self.crop_size = crop_size 32 | self.short_side_size = short_side_size 33 | self.new_height = new_height 34 | self.new_width = new_width 35 | self.keep_aspect_ratio = keep_aspect_ratio 36 | self.num_segment = num_segment 37 | self.test_num_segment = test_num_segment 38 | self.num_crop = num_crop 39 | 
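For reference, the perturbation update at the end of TemporalTranslation.forward above reduces to a sign-gradient step followed by a projection onto the epsilon ball and the valid pixel range. A minimal sketch (project_and_clip is an illustrative name; both tensors are assumed to already be in [0, 1]):

import torch

def project_and_clip(adv, orig, grad, step_size=(16/255)/10, epsilon=16/255):
    adv = adv + step_size * grad.sign()                           # sign-gradient step
    delta = torch.clamp(adv - orig, min=-epsilon, max=epsilon)    # project onto the epsilon ball
    return torch.clamp(orig + delta, min=0, max=1)                # keep a valid video

orig = torch.rand(1, 3, 32, 112, 112)
adv = project_and_clip(orig.clone(), orig, torch.randn_like(orig))
print((adv - orig).abs().max().item() <= 16/255 + 1e-6)           # True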
self.test_num_crop = test_num_crop 40 | self.use_multigrid = use_multigrid and (mode == 'train') 41 | if VideoReader is None: 42 | raise ImportError("Unable to import `decord` which is required to read videos.") 43 | 44 | import pandas as pd 45 | # cleaned = pd.read_csv(self.anno_path, header=None, delimiter=' ') 46 | # self.dataset_samples = list(cleaned.values[:, 0]) 47 | # self.label_array = list(cleaned.values[:, 2]) 48 | cleaned = pd.read_csv(self.anno_path) 49 | self.dataset_samples = cleaned['path'].values.tolist() 50 | self.label_array = cleaned['gt_label'].values.tolist() 51 | self.clip_inds = cleaned['clip_index'].values.tolist() 52 | 53 | if (mode == 'train'): 54 | if self.use_multigrid: 55 | self.mg_helper = multiGridHelper() 56 | self.data_transform = [] 57 | for alpha in range(self.mg_helper.mod_long): 58 | tmp = [] 59 | for beta in range(self.mg_helper.mod_short): 60 | info = self.mg_helper.get_resize(alpha, beta) 61 | scale_s = info[1] 62 | tmp.append(video_transforms.Compose([ 63 | video_transforms.Resize(int(self.short_side_size / scale_s), 64 | interpolation='bilinear'), 65 | # TODO: multiscale corner cropping 66 | video_transforms.RandomResize(ratio=(1, 1.25), 67 | interpolation='bilinear'), 68 | video_transforms.RandomCrop(size=(int(self.crop_size / scale_s), 69 | int(self.crop_size / scale_s)))])) 70 | self.data_transform.append(tmp) 71 | else: 72 | self.data_transform = video_transforms.Compose([ 73 | video_transforms.Resize(int(self.short_side_size), 74 | interpolation='bilinear'), 75 | video_transforms.RandomResize(ratio=(1, 1.25), 76 | interpolation='bilinear'), 77 | video_transforms.RandomCrop(size=(int(self.crop_size), 78 | int(self.crop_size)))]) 79 | 80 | self.data_transform_after = video_transforms.Compose([ 81 | video_transforms.RandomHorizontalFlip(), 82 | volume_transforms.ClipToTensor(), 83 | video_transforms.Normalize(mean=[0.485, 0.456, 0.406], 84 | std=[0.229, 0.224, 0.225]) 85 | ]) 86 | elif (mode == 'validation'): 87 | self.data_transform = video_transforms.Compose([ 88 | video_transforms.Resize(self.short_side_size, interpolation='bilinear'), 89 | video_transforms.CenterCrop(size=(self.crop_size, self.crop_size)), 90 | volume_transforms.ClipToTensor(), 91 | video_transforms.Normalize(mean=[0.485, 0.456, 0.406], 92 | std=[0.229, 0.224, 0.225]) 93 | ]) 94 | elif mode == 'test': 95 | self.data_resize = video_transforms.Compose([ 96 | video_transforms.Resize(size=(short_side_size), interpolation='bilinear') 97 | ]) 98 | self.data_transform = video_transforms.Compose([ 99 | volume_transforms.ClipToTensor(), 100 | video_transforms.Normalize(mean=[0.485, 0.456, 0.406], 101 | std=[0.229, 0.224, 0.225]) 102 | ]) 103 | self.test_seg = [] 104 | self.test_dataset = [] 105 | self.test_label_array = [] 106 | for ck in range(self.test_num_segment): 107 | for cp in range(self.test_num_crop): 108 | for idx in range(len(self.label_array)): 109 | sample_label = self.label_array[idx] 110 | self.test_label_array.append(sample_label) 111 | self.test_dataset.append(self.dataset_samples[idx]) 112 | self.test_seg.append((ck, cp)) 113 | 114 | def __getitem__(self, index): 115 | if self.mode == 'train': 116 | if self.use_multigrid is True: 117 | index, alpha, beta = index 118 | info = self.mg_helper.get_resize(alpha, beta) 119 | scale_t = info[0] 120 | data_transform_func = self.data_transform[alpha][beta] 121 | else: 122 | scale_t = 1 123 | data_transform_func = self.data_transform 124 | 125 | sample = self.dataset_samples[index] 126 | buffer = self.loadvideo_decord(sample, 
sample_rate_scale=scale_t) 127 | if len(buffer) == 0: 128 | while len(buffer) == 0: 129 | warnings.warn("video {} not correctly loaded during training".format(sample)) 130 | index = np.random.randint(self.__len__()) 131 | sample = self.dataset_samples[index] 132 | buffer = self.loadvideo_decord(sample, sample_rate_scale=scale_t) 133 | 134 | buffer = data_transform_func(buffer) 135 | buffer = self.data_transform_after(buffer) 136 | return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0] 137 | 138 | elif self.mode == 'validation': 139 | sample = self.dataset_samples[index] 140 | clip_ind = self.clip_inds[index] 141 | buffer = self.loadvideo_decord(sample, clip_ind) 142 | if len(buffer) == 0: 143 | while len(buffer) == 0: 144 | warnings.warn("video {} not correctly loaded during validation".format(sample)) 145 | index = np.random.randint(self.__len__()) 146 | sample = self.dataset_samples[index] 147 | buffer = self.loadvideo_decord(sample) 148 | buffer = self.data_transform(buffer) 149 | # return buffer, self.label_array[index], sample.split("/")[-1].split(".")[0] 150 | return buffer, self.label_array[index], sample.split(".")[0], clip_ind 151 | 152 | elif self.mode == 'test': 153 | sample = self.test_dataset[index] 154 | chunk_nb, split_nb = self.test_seg[index] 155 | buffer = self.loadvideo_decord(sample) 156 | 157 | while len(buffer) == 0: 158 | warnings.warn("video {}, temporal {}, spatial {} not found during testing".format(\ 159 | str(self.test_dataset[index]), chunk_nb, split_nb)) 160 | index = np.random.randint(self.__len__()) 161 | sample = self.test_dataset[index] 162 | chunk_nb, split_nb = self.test_seg[index] 163 | buffer = self.loadvideo_decord(sample) 164 | 165 | buffer = self.data_resize(buffer) 166 | if isinstance(buffer, list): 167 | buffer = np.stack(buffer, 0) 168 | 169 | spatial_step = 1.0 * (max(buffer.shape[1], buffer.shape[2]) - self.short_side_size) \ 170 | / (self.test_num_crop - 1) 171 | temporal_step = max(1.0 * (buffer.shape[0] - self.clip_len) \ 172 | / (self.test_num_segment - 1), 0) 173 | temporal_start = int(chunk_nb * temporal_step) 174 | spatial_start = int(split_nb * spatial_step) 175 | if buffer.shape[1] >= buffer.shape[2]: 176 | buffer = buffer[temporal_start:temporal_start + self.clip_len, \ 177 | spatial_start:spatial_start + self.short_side_size, :, :] 178 | else: 179 | buffer = buffer[temporal_start:temporal_start + self.clip_len, \ 180 | :, spatial_start:spatial_start + self.short_side_size, :] 181 | 182 | buffer = self.data_transform(buffer) 183 | return buffer, self.test_label_array[index], sample.split("/")[-1].split(".")[0], \ 184 | chunk_nb, split_nb 185 | else: 186 | raise NameError('mode {} unkown'.format(self.mode)) 187 | 188 | def loadvideo_decord(self, sample, clip_ind, sample_rate_scale=1): 189 | """Load video content using Decord""" 190 | # pylint: disable=line-too-long, bare-except, unnecessary-comprehension 191 | fname = os.path.join(self.data_path, sample) 192 | 193 | if not (os.path.exists(fname)): 194 | return [] 195 | 196 | # avoid hanging issue 197 | if os.path.getsize(fname) < 1 * 1024: 198 | print('SKIP: ', fname, " - ", os.path.getsize(fname)) 199 | return [] 200 | try: 201 | if self.keep_aspect_ratio: 202 | vr = VideoReader(fname, num_threads=1, ctx=cpu(0)) 203 | else: 204 | vr = VideoReader(fname, width=self.new_width, height=self.new_height, 205 | num_threads=1, ctx=cpu(0)) 206 | except: 207 | print("video cannot be loaded by decord: ", fname) 208 | return [] 209 | 210 | if self.mode == 'test': 211 | 
all_index = [x for x in range(0, len(vr), self.frame_sample_rate)] 212 | while len(all_index) < self.clip_len: 213 | all_index.append(all_index[-1]) 214 | vr.seek(0) 215 | buffer = vr.get_batch(all_index).asnumpy() 216 | return buffer 217 | 218 | # handle temporal segments 219 | converted_len = int(self.clip_len * self.frame_sample_rate) 220 | seg_len = len(vr) // self.num_segment 221 | 222 | all_index = [] 223 | for i in range(self.num_segment): 224 | if seg_len <= converted_len: 225 | index = np.linspace(0, seg_len, num=seg_len // self.frame_sample_rate) 226 | index = np.concatenate((index, np.ones(self.clip_len - seg_len // self.frame_sample_rate) * seg_len)) 227 | index = np.clip(index, 0, seg_len - 1).astype(np.int64) 228 | else: 229 | # end_idx = np.random.randint(converted_len, seg_len) 230 | if clip_ind == -1: 231 | end_idx = seg_len - 1 232 | else: 233 | np.random.seed(clip_ind) 234 | end_idx = np.random.randint(converted_len, seg_len) 235 | str_idx = end_idx - converted_len 236 | index = np.linspace(str_idx, end_idx, num=self.clip_len) 237 | index = np.clip(index, str_idx, end_idx - 1).astype(np.int64) 238 | index = index + i*seg_len 239 | all_index.extend(list(index)) 240 | 241 | all_index = all_index[::int(sample_rate_scale)] 242 | vr.seek(0) 243 | buffer = vr.get_batch(all_index).asnumpy() 244 | return buffer 245 | 246 | def __len__(self): 247 | if self.mode != 'test': 248 | return len(self.dataset_samples) 249 | else: 250 | return len(self.test_dataset) 251 | 252 | def get_dataset(cfg, loader=True): 253 | val_dataset = VideoClsDataset(anno_path=cfg.CONFIG.DATA.VAL_ANNO_PATH, 254 | data_path=cfg.CONFIG.DATA.VAL_DATA_PATH, 255 | mode='validation', 256 | use_multigrid=cfg.CONFIG.DATA.MULTIGRID, 257 | clip_len=cfg.CONFIG.DATA.CLIP_LEN, 258 | frame_sample_rate=cfg.CONFIG.DATA.FRAME_RATE, 259 | num_segment=cfg.CONFIG.DATA.NUM_SEGMENT, 260 | num_crop=cfg.CONFIG.DATA.NUM_CROP, 261 | keep_aspect_ratio=cfg.CONFIG.DATA.KEEP_ASPECT_RATIO, 262 | crop_size=cfg.CONFIG.DATA.CROP_SIZE, 263 | short_side_size=cfg.CONFIG.DATA.SHORT_SIDE_SIZE, 264 | new_height=cfg.CONFIG.DATA.NEW_HEIGHT, 265 | new_width=cfg.CONFIG.DATA.NEW_WIDTH) 266 | # length = len(val_dataset) 267 | # batch_contain = int(length/batch_nums) 268 | # this_batch_dataset = val_dataset[batch_contain*(batch_index-1):batch_contain*batch_index] 269 | 270 | print ('The length of Dataset is {}.'.format(len(val_dataset))) 271 | if loader: 272 | val_loader = torch.utils.data.DataLoader( 273 | val_dataset, batch_size=cfg.CONFIG.VAL.BATCH_SIZE, shuffle=False, 274 | num_workers=9, sampler=None, pin_memory=True) 275 | return val_loader 276 | else: 277 | return val_dataset -------------------------------------------------------------------------------- /TPAMI_attack.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | import torchvision.models as models 5 | import random 6 | 7 | from image_cam import GradCAM 8 | from torch.autograd import Variable 9 | from image_cam_utils import find_alexnet_layer, find_vgg_layer, find_resnet_layer, find_densenet_layer, find_squeezenet_layer 10 | import pickle as pkl 11 | 12 | import time 13 | from timm.models import create_model 14 | import numpy as np 15 | 16 | class Attack(object): 17 | """ 18 | Base class for all attacks. 19 | .. note:: 20 | It automatically set device to the device where given model is. 
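Note that the validation branch of loadvideo_decord above seeds numpy with the clip index, so the sampled clip is reproducible across runs. A standalone sketch of that index selection (clip_indices is an illustrative name, assuming a single temporal segment):

import numpy as np

def clip_indices(num_frames, clip_len=32, frame_sample_rate=2, clip_ind=0):
    converted_len = clip_len * frame_sample_rate
    if clip_ind == -1:
        end_idx = num_frames - 1                       # deterministic: take the last possible clip
    else:
        np.random.seed(clip_ind)                       # fixed seed -> same clip every run
        end_idx = np.random.randint(converted_len, num_frames)
    str_idx = end_idx - converted_len
    index = np.linspace(str_idx, end_idx, num=clip_len)
    return np.clip(index, str_idx, end_idx - 1).astype(np.int64)

print(clip_indices(300, clip_ind=5))                   # identical output on every call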
21 | It temporarily changes the model's training mode to `test` 22 | by `.eval()` only during an attack process. 23 | """ 24 | def __init__(self, name, model=None): 25 | r""" 26 | Initializes internal attack state. 27 | Arguments: 28 | name (str) : name of an attack. 29 | model (torch.nn.Module): model to attack. 30 | """ 31 | self.attack = name 32 | self.model = model 33 | self.model_name = str(model).split("(")[0] 34 | 35 | # mean and std values are used in pytorch pretrained models 36 | # they are also used in Kinetics-400. 37 | self.mean = [0.485, 0.456, 0.406] 38 | self.std = [0.229, 0.224, 0.225] 39 | 40 | def forward(self, *input): 41 | r""" 42 | It defines the computation performed at every call (attack forward). 43 | Should be overridden by all subclasses. 44 | """ 45 | raise NotImplementedError 46 | 47 | def _transform_perts(self, perts): 48 | dtype = perts.dtype 49 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 50 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 51 | perts.div_(std[:, None, None]) 52 | return perts 53 | 54 | def _transform_video(self, video, mode='forward'): 55 | r''' 56 | Transform the video into [0, 1] 57 | ''' 58 | dtype = video.dtype 59 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 60 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 61 | if mode == 'forward': 62 | # [-mean/std, mean/std] 63 | video.sub_(mean[:, None, None]).div_(std[:, None, None]) 64 | elif mode == 'back': 65 | # [0, 1] 66 | video.mul_(std[:, None, None]).add_(mean[:, None, None]) 67 | return video 68 | 69 | def _transform_video_ILAF(self, video, mode='forward'): 70 | r''' 71 | Transform the video into [0, 1] 72 | ''' 73 | dtype = video.dtype 74 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 75 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 76 | if mode == 'forward': 77 | # [-mean/std, mean/std] 78 | video.sub_(mean[None, :, None, None, None]).div_(std[None, :, None, None, None]) 79 | elif mode == 'back': 80 | # [0, 1] 81 | video.mul_(std[None, :, None, None, None]).add_(mean[None, :, None, None, None]) 82 | return video 83 | 84 | def __call__(self, *input, **kwargs): 85 | images = self.forward(*input, **kwargs) 86 | return images 87 | 88 | def get_vits(): 89 | model = create_model( 90 | 'vit_base_patch16_224', 91 | pretrained=True, 92 | num_classes=1000, 93 | in_chans=3, 94 | global_pool=None, 95 | scriptable=False) 96 | model.cuda() 97 | model.eval() 98 | return model 99 | 100 | def get_model(model_name): 101 | ''' 102 | ['alexnet', 'vgg', 'resnet', 'densenet', 'squeezenet'] 103 | ''' 104 | if model_name == 'alexnet': 105 | model = models.alexnet(pretrained=True) 106 | # model.features[11/7/4/1] 107 | elif model_name == 'vgg': 108 | model = models.vgg16(pretrained=True) 109 | # model.features[29/20/11/1] 110 | elif model_name == 'resnet': 111 | model = models.resnet101(pretrained=True) 112 | elif model_name == 'densenet': 113 | model = models.densenet161(pretrained=True) 114 | # model.features.denseblock1/2/3/4 115 | # model.features.transition1/2/3,norm5 116 | elif model_name == 'squeezenet': 117 | model = models.squeezenet1_1(pretrained=True) 118 | # model.features[12/9/6/3].expand3x3_activation 119 | model.cuda() 120 | model.eval() 121 | # for m in model.modules(): 122 | # if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 123 | # m.eval() 124 | return model 125 | 126 | def get_models(model_name_lists): 127 | models = [] 128 | for model_name in model_name_lists: 129 | model = get_model(model_name) 130 | models.append(model) 
131 | return models 132 | 133 | def get_GradCam(model_name_lists): 134 | gradcams = [] 135 | for model_name in model_name_lists: 136 | model_dict = dict(type=model_name, arch=get_model(model_name), input_size=(224, 224)) 137 | this_gradcam = GradCAM(model_dict, False) 138 | gradcams.append(this_gradcam) 139 | return gradcams 140 | 141 | class AENS_I2V_MF(Attack): 142 | ''' 143 | The proposed adaptive I2V with multiple models and layers. 144 | Parameters: 145 | model_name_lists: the surrogate image model names. For example, model_name_lists = ['resnet', 'vgg', 'squeezenet', 'alexnet'] 146 | depths: the layers used in each model. For example, depths = {'resnet':[2,3], 'vgg':[2,3], 'squeezenet':[2,3], 'alexnet':[2,3]} 147 | step_size: the learning rate. 148 | Return: 149 | image_inps: video adversarial example. 150 | used_time: the time during attacking. 151 | cost_saved: the cost values of all steps 152 | ''' 153 | def __init__(self, model_name_lists, depths, step_size, momentum=0, coef_CE=False, epsilon=16/255, steps=60): 154 | super(AENS_I2V_MF, self).__init__("AENS_I2V_MF") 155 | self.epsilon = epsilon 156 | self.steps = steps 157 | self.step_size = step_size 158 | self.loss_info = {} 159 | self.depths = depths 160 | self.momentum = momentum 161 | self.coef_CE = coef_CE 162 | self.models = get_models(model_name_lists) 163 | self.model_names = model_name_lists 164 | 165 | self.coeffs = torch.ones(len(model_name_lists)*2).cuda() 166 | # print ('using image models:', model_name_lists) 167 | 168 | for i in range(len(self.models)): 169 | self.models[i].train() 170 | for m in self.models[i].modules(): 171 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 172 | m.eval() 173 | model_name = self.model_names[i] 174 | self._attention_hook(self.models[i], model_name) 175 | 176 | def _find_target_layer(self, model, model_name): 177 | used_depth = self.depths[model_name] 178 | if model_name == 'resnet': 179 | if isinstance(used_depth, list): 180 | return [getattr(model, 'layer{}'.format(this_depth))[-1] for this_depth in used_depth] 181 | else: 182 | return getattr(model, 'layer{}'.format(used_depth))[-1] 183 | elif model_name == 'alexnet': 184 | depth_to_layer = {1:1,2:4,3:7,4:11} 185 | if isinstance(used_depth, list): 186 | return [getattr(model, 'features')[depth_to_layer[this_depth]] for this_depth in used_depth] 187 | else: 188 | return getattr(model, 'features')[depth_to_layer[used_depth]] 189 | elif model_name == 'vgg': 190 | depth_to_layer = {1:1,2:11,3:20,4:29} 191 | if isinstance(used_depth, list): 192 | return [getattr(model, 'features')[depth_to_layer[this_depth]] for this_depth in used_depth] 193 | else: 194 | return getattr(model, 'features')[depth_to_layer[used_depth]] 195 | elif model_name == 'squeezenet': 196 | depth_to_layer = {1:3,2:6,3:9,4:12} 197 | if isinstance(used_depth, list): 198 | return [getattr(model, 'features')[depth_to_layer[this_depth]] for this_depth in used_depth] 199 | else: 200 | return getattr(model, 'features')[depth_to_layer[used_depth]].expand3x3_activation 201 | 202 | def _attention_hook(self, model, model_name): 203 | self.gradients = dict() 204 | self.gradients['value'] = [] 205 | self.activations = dict() 206 | self.activations['value'] = [] 207 | def backward_hook(module, grad_input, grad_output): 208 | self.gradients['value'] += [grad_output[0]] 209 | return None 210 | def forward_hook(module, input, output): 211 | self.activations['value'] += [output] 212 | return None 213 | target_layer = self._find_target_layer(model, model_name) 214 
| # print (target_layer) 215 | if isinstance(target_layer, list): 216 | for i in target_layer: 217 | i.register_forward_hook(forward_hook) 218 | i.register_backward_hook(backward_hook) 219 | else: 220 | target_layer.register_forward_hook(forward_hook) 221 | target_layer.register_backward_hook(backward_hook) 222 | 223 | def forward(self, videos, labels, video_names): 224 | batch_size = videos.shape[0] 225 | b,c,f,h,w = videos.shape 226 | videos = videos.cuda() 227 | labels = labels.cuda() 228 | self.weights = [] 229 | image_inps = videos.permute([0,2,1,3,4]) 230 | image_inps = image_inps.reshape(b*f, c, h, w) 231 | 232 | # define modifer that updated by optimizer. 233 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 234 | modifier = torch.nn.Parameter(modif, requires_grad=True) 235 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 236 | 237 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 238 | 239 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 240 | 241 | init_feature_maps = [] 242 | for n in range(len(self.models)): 243 | this_feature_maps = [] 244 | self.gradients = dict() 245 | self.gradients['value'] = [] 246 | self.activations = dict() 247 | self.activations['value'] = [] 248 | _ = self.models[n](image_inps) 249 | for mm in range(len(self.activations['value'])): 250 | activations = self.activations['value'][mm] 251 | activations = Variable(activations, requires_grad=False) 252 | this_feature_maps.append(activations) 253 | init_feature_maps.append(this_feature_maps) 254 | 255 | begin = time.time() 256 | cost_saved = np.zeros(self.steps) 257 | previous_cs_loss = torch.ones_like(self.coeffs) 258 | for i in range(self.steps): 259 | # self.gradients = dict() 260 | # self.gradients['value'] = [] 261 | # self.activations = dict() 262 | # self.activations['value'] = [] 263 | 264 | # update coeff 265 | self.coeffs = torch.softmax(torch.softmax(previous_cs_loss, dim=0) + self.momentum * self.coeffs, dim=0) 266 | self.weights.append(self.coeffs.clone().cpu().numpy()) 267 | # print (self.coeffs.clone().cpu().numpy()) 268 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 269 | true_image = self._transform_video(true_image, mode='forward') # norm 270 | 271 | losses = [] 272 | for n in range(len(self.models)): 273 | self.gradients = dict() 274 | self.gradients['value'] = [] 275 | self.activations = dict() 276 | self.activations['value'] = [] 277 | _ = self.models[n](true_image) 278 | this_losses = [] 279 | for mm in range(len(init_feature_maps[n])): 280 | activations = self.activations['value'][mm] 281 | init_activations = init_feature_maps[n][mm] 282 | this_dir = activations.view(b*f, -1) 283 | init_dir = init_activations.view(b*f, -1) 284 | this_loss = F.cosine_similarity(this_dir, init_dir) 285 | this_losses.append(this_loss) 286 | losses.append(torch.stack(this_losses)) # 2,32 287 | 288 | 289 | used_coeffs = torch.unsqueeze(self.coeffs, dim=1) # (lens_model*2) * 1 290 | each_features_loss = torch.sum(used_coeffs * torch.cat(losses, dim=0), dim=1) # 4*32 291 | cost = torch.mean(each_features_loss) 292 | 293 | if self.coef_CE: 294 | previous_cs_loss = each_features_loss.clone().detach() 295 | else: 296 | updated_features_loss = torch.sum(torch.cat(losses, dim=0).clone().detach(), dim=1) 297 | previous_cs_loss = updated_features_loss.clone().detach() 298 | 299 | # update previous_cs_loss 300 | 301 | # print (previous_cs_loss.clone().cpu().numpy()) 302 | 
# print (cost) 303 | optimizer.zero_grad() 304 | cost.backward() 305 | optimizer.step() 306 | 307 | cost_saved[i] = cost.detach().item() 308 | 309 | for ind,vid_name in enumerate(video_names): 310 | if vid_name not in self.loss_info.keys(): 311 | self.loss_info[vid_name] = {} 312 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 313 | 314 | used_time = time.time()-begin 315 | 316 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 317 | image_inps = self._transform_video(true_image, mode='forward') 318 | image_inps = image_inps.reshape(b,f,c,h,w) 319 | image_inps = image_inps.permute([0,2,1,3,4]) 320 | return image_inps, used_time, cost_saved 321 | -------------------------------------------------------------------------------- /transforms_ucf101.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | import numbers 5 | import collections 6 | import numpy as np 7 | import torch 8 | from PIL import Image, ImageOps 9 | try: 10 | import accimage 11 | except ImportError: 12 | accimage = None 13 | 14 | # the code from 3D-ResNets-PyTorch-master/temporal_transforms.py, spatial_transforms.py and target_transforms.py 15 | 16 | #******************************************** 17 | # temporal_transforms 18 | #******************************************** 19 | # LoopPadding: frame_indices < size, loop 20 | # TemporalBeginCrop: frame_indices[:size] < size, loop, [1:] 21 | # TemporalCenterCrop: frame_indice[center-size/2:center+size/2] < size, loop, [1:] 22 | # TemporalRandomCrop: frame_indice[random_begin,:random_begin+size] < size, loop, [1:] 23 | class LoopPadding(object): 24 | """ 25 | Variable size means that the length of temporal images we wanted. 26 | The length of variable out should be equal with variable size. 27 | If not, LoopPadding the temporal images. 28 | """ 29 | def __init__(self, size): 30 | self.size = size 31 | 32 | def __call__(self, frame_indices): 33 | out = frame_indices[1:self.size+1] 34 | 35 | for index in out: 36 | if len(out) >= self.size: 37 | break 38 | out.append(index) 39 | 40 | return out 41 | 42 | class TemporalBeginCrop(object): 43 | """ 44 | Temporally crop the given frame indices at a beginning. 45 | If the number of frames is less than the size, 46 | loop the indices as many times as necessary to satisfy the size. 47 | Args: 48 | size (int): Desired output size of the crop. 49 | """ 50 | 51 | def __init__(self, size): 52 | self.size = size 53 | 54 | def __call__(self, frame_indices): 55 | out = frame_indices[1:self.size+1] 56 | 57 | for index in out: 58 | if len(out) >= self.size: 59 | break 60 | out.append(index) 61 | return out 62 | 63 | class TemporalCenterCrop(object): 64 | """ 65 | Temporally crop the given frame indices at a center. 66 | If the number of frames is less than the size, 67 | loop the indices as many times as necessary to satisfy the size. 68 | Args: 69 | size (int): Desired output size of the crop. 70 | """ 71 | 72 | def __init__(self, size): 73 | self.size = size 74 | 75 | def __call__(self, frame_indices): 76 | """ 77 | Args: 78 | frame_indices (list): frame indices to be cropped. 79 | Returns: 80 | list: Cropped frame indices. 
81 | """ 82 | 83 | center_index = len(frame_indices) // 2 84 | begin_index = max(1, center_index - (self.size // 2)) 85 | end_index = min(begin_index + self.size, len(frame_indices)) 86 | 87 | out = frame_indices[begin_index:end_index] 88 | 89 | for index in out: 90 | if len(out) >= self.size: 91 | break 92 | out.append(index) 93 | 94 | return out 95 | 96 | class TemporalRandomCrop(object): 97 | """ 98 | Temporally crop the given frame indices at a random location. 99 | If the number of frames is less than the size, 100 | loop the indices as many times as necessary to satisfy the size. 101 | Args: 102 | size (int): Desired output size of the crop. 103 | """ 104 | 105 | def __init__(self, size): 106 | self.size = size 107 | 108 | def __call__(self, frame_indices): 109 | """ 110 | Args: 111 | frame_indices (list): frame indices to be cropped. 112 | Returns: 113 | list: Cropped frame indices. 114 | """ 115 | 116 | rand_end = max(1, len(frame_indices) - self.size - 1) 117 | random.seed(1024) 118 | begin_index = random.randint(0, rand_end) 119 | end_index = min(begin_index + self.size, len(frame_indices)) 120 | 121 | out = frame_indices[begin_index:end_index] 122 | 123 | for index in out: 124 | if len(out) >= self.size: 125 | break 126 | out.append(index) 127 | 128 | return out 129 | 130 | #******************************************** 131 | # spatial_transforms 132 | #******************************************** 133 | # spatial_Compose: combine multiple spatial transforms function. 134 | # ToTensor: np.array, Image.image to tensor, H*W*C(0-255) to C*H*W (0.0-1.0), pixel/norm_value, for tensor. 135 | # Normalize: (pixel-mean)/std, for tensor. 136 | # Scale: Keeping aspect ratio unchanged, scaled the smaller side, for Image. 137 | # CenterCrop: for Image. 138 | # RandomHorizontalFlip: Horizontally flip a image by a probability 0.5. 139 | # MultiScaleCornerCrop: Multiple Scales from 4 corners and 1 center. 140 | # MultiScaleRandomCrop: Multiple Scales from random position. 141 | 142 | class spatial_Compose(object): 143 | """Composes several transforms together. 144 | Args: 145 | transforms (list of ``Transform`` objects): list of transforms to compose. 146 | Example: 147 | >>> transforms.Compose([ 148 | >>> transforms.CenterCrop(10), 149 | >>> transforms.ToTensor(), 150 | >>> ]) 151 | """ 152 | 153 | def __init__(self, transforms): 154 | self.transforms = transforms 155 | 156 | def __call__(self, img): 157 | for t in self.transforms: 158 | img = t(img) 159 | return img 160 | 161 | def randomize_parameters(self): 162 | for t in self.transforms: 163 | t.randomize_parameters() 164 | 165 | class ToTensor(object): 166 | """ 167 | Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 168 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 169 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 170 | """ 171 | 172 | def __init__(self, norm_value=255): 173 | self.norm_value = norm_value 174 | 175 | def __call__(self, pic): 176 | """ 177 | Args: 178 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 179 | Returns: 180 | Tensor: Converted image. 
181 | """ 182 | if isinstance(pic, np.ndarray): 183 | # handle numpy array 184 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 185 | # backward compatibility 186 | return img.float().div(self.norm_value) 187 | 188 | if accimage is not None and isinstance(pic, accimage.Image): 189 | nppic = np.zeros( 190 | [pic.channels, pic.height, pic.width], dtype=np.float32) 191 | pic.copyto(nppic) 192 | return torch.from_numpy(nppic) 193 | 194 | # handle PIL Image 195 | if pic.mode == 'I': 196 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 197 | elif pic.mode == 'I;16': 198 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 199 | else: 200 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 201 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 202 | if pic.mode == 'YCbCr': 203 | nchannel = 3 204 | elif pic.mode == 'I;16': 205 | nchannel = 1 206 | else: 207 | nchannel = len(pic.mode) 208 | img = img.view(pic.size[1], pic.size[0], nchannel) 209 | # put it from HWC to CHW format 210 | # yikes, this transpose takes 80% of the loading time/CPU 211 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 212 | if isinstance(img, torch.ByteTensor): 213 | return img.float().div(self.norm_value) 214 | else: 215 | return img 216 | 217 | def randomize_parameters(self): 218 | pass 219 | 220 | class Normalize(object): 221 | """ 222 | Normalize an tensor image with mean and standard deviation. 223 | Given mean: (R, G, B) and std: (R, G, B), 224 | will normalize each channel of the torch.*Tensor, i.e. 225 | channel = (channel - mean) / std 226 | Args: 227 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 228 | std (sequence): Sequence of standard deviations for R, G, B channels 229 | respecitvely. 230 | """ 231 | 232 | def __init__(self, mean, std): 233 | self.mean = mean 234 | self.std = std 235 | 236 | def __call__(self, tensor): 237 | """ 238 | Args: 239 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 240 | Returns: 241 | Tensor: Normalized image. 242 | """ 243 | # TODO: make efficient 244 | for t, m, s in zip(tensor, self.mean, self.std): 245 | t.sub_(m).div_(s) 246 | return tensor 247 | 248 | def randomize_parameters(self): 249 | pass 250 | 251 | 252 | class Scale(object): 253 | """Rescale the input PIL.Image to the given size. 254 | Args: 255 | size (sequence or int): Desired output size. If size is a sequence like 256 | (w, h), output size will be matched to this. If size is an int, 257 | smaller edge of the image will be matched to this number. 258 | i.e, if height > width, then image will be rescaled to 259 | (size * height / width, size) 260 | interpolation (int, optional): Desired interpolation. Default is 261 | ``PIL.Image.BILINEAR`` 262 | """ 263 | 264 | def __init__(self, size, interpolation=Image.BILINEAR): 265 | assert isinstance(size, 266 | int) or (isinstance(size, collections.Iterable) and 267 | len(size) == 2) 268 | self.size = size 269 | self.interpolation = interpolation 270 | 271 | def __call__(self, img): 272 | """ 273 | Args: 274 | img (PIL.Image): Image to be scaled. 275 | Returns: 276 | PIL.Image: Rescaled image. 
277 | """ 278 | if isinstance(self.size, int): 279 | w, h = img.size 280 | if (w <= h and w == self.size) or (h <= w and h == self.size): 281 | return img 282 | if w < h: 283 | ow = self.size 284 | oh = int(self.size * h / w) 285 | return img.resize((ow, oh), self.interpolation) 286 | else: 287 | oh = self.size 288 | ow = int(self.size * w / h) 289 | return img.resize((ow, oh), self.interpolation) 290 | else: 291 | return img.resize(self.size, self.interpolation) 292 | 293 | def randomize_parameters(self): 294 | pass 295 | 296 | 297 | class CenterCrop(object): 298 | """Crops the given PIL.Image at the center. 299 | Args: 300 | size (sequence or int): Desired output size of the crop. If size is an 301 | int instead of sequence like (h, w), a square crop (size, size) is 302 | made. 303 | """ 304 | 305 | def __init__(self, size): 306 | if isinstance(size, numbers.Number): 307 | self.size = (int(size), int(size)) 308 | else: 309 | self.size = size 310 | 311 | def __call__(self, img): 312 | """ 313 | Args: 314 | img (PIL.Image): Image to be cropped. 315 | Returns: 316 | PIL.Image: Cropped image. 317 | """ 318 | w, h = img.size 319 | th, tw = self.size 320 | x1 = int(round((w - tw) / 2.)) 321 | y1 = int(round((h - th) / 2.)) 322 | return img.crop((x1, y1, x1 + tw, y1 + th)) 323 | 324 | def randomize_parameters(self): 325 | pass 326 | 327 | 328 | class CornerCrop(object): 329 | 330 | def __init__(self, size, crop_position=None): 331 | self.size = size 332 | if crop_position is None: 333 | self.randomize = True 334 | else: 335 | self.randomize = False 336 | self.crop_position = crop_position 337 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 338 | 339 | def __call__(self, img): 340 | image_width = img.size[0] 341 | image_height = img.size[1] 342 | 343 | if self.crop_position == 'c': 344 | th, tw = (self.size, self.size) 345 | x1 = int(round((image_width - tw) / 2.)) 346 | y1 = int(round((image_height - th) / 2.)) 347 | x2 = x1 + tw 348 | y2 = y1 + th 349 | elif self.crop_position == 'tl': 350 | x1 = 0 351 | y1 = 0 352 | x2 = self.size 353 | y2 = self.size 354 | elif self.crop_position == 'tr': 355 | x1 = image_width - self.size 356 | y1 = 0 357 | x2 = image_width 358 | y2 = self.size 359 | elif self.crop_position == 'bl': 360 | x1 = 0 361 | y1 = image_height - self.size 362 | x2 = self.size 363 | y2 = image_height 364 | elif self.crop_position == 'br': 365 | x1 = image_width - self.size 366 | y1 = image_height - self.size 367 | x2 = image_width 368 | y2 = image_height 369 | 370 | img = img.crop((x1, y1, x2, y2)) 371 | 372 | return img 373 | 374 | def randomize_parameters(self): 375 | if self.randomize: 376 | random.seed(1024) 377 | self.crop_position = self.crop_positions[random.randint( 378 | 0, 379 | len(self.crop_positions) - 1)] 380 | 381 | 382 | class RandomHorizontalFlip(object): 383 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 384 | 385 | def __call__(self, img): 386 | """ 387 | Args: 388 | img (PIL.Image): Image to be flipped. 389 | Returns: 390 | PIL.Image: Randomly flipped image. 391 | """ 392 | if self.p < 0.5: 393 | return img.transpose(Image.FLIP_LEFT_RIGHT) 394 | return img 395 | 396 | def randomize_parameters(self): 397 | random.seed(1024) 398 | self.p = random.random() 399 | 400 | 401 | class MultiScaleCornerCrop(object): 402 | """Crop the given PIL.Image to randomly selected size. 403 | A crop of size is selected from scales of the original size. 404 | A position of cropping is randomly selected from 4 corners and 1 center. 
405 | This crop is finally resized to given size. 406 | Args: 407 | scales: cropping scales of the original size 408 | size: size of the smaller edge 409 | interpolation: Default: PIL.Image.BILINEAR 410 | """ 411 | 412 | def __init__(self, 413 | scales, 414 | size, 415 | interpolation=Image.BILINEAR, 416 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 417 | self.scales = scales 418 | self.size = size 419 | self.interpolation = interpolation 420 | 421 | self.crop_positions = crop_positions 422 | 423 | def __call__(self, img): 424 | min_length = min(img.size[0], img.size[1]) 425 | crop_size = int(min_length * self.scale) 426 | 427 | image_width = img.size[0] 428 | image_height = img.size[1] 429 | 430 | if self.crop_position == 'c': 431 | center_x = image_width // 2 432 | center_y = image_height // 2 433 | box_half = crop_size // 2 434 | x1 = center_x - box_half 435 | y1 = center_y - box_half 436 | x2 = center_x + box_half 437 | y2 = center_y + box_half 438 | elif self.crop_position == 'tl': 439 | x1 = 0 440 | y1 = 0 441 | x2 = crop_size 442 | y2 = crop_size 443 | elif self.crop_position == 'tr': 444 | x1 = image_width - crop_size 445 | y1 = 0 446 | x2 = image_width 447 | y2 = crop_size 448 | elif self.crop_position == 'bl': 449 | x1 = 0 450 | y1 = image_height - crop_size 451 | x2 = crop_size 452 | y2 = image_height 453 | elif self.crop_position == 'br': 454 | x1 = image_width - crop_size 455 | y1 = image_height - crop_size 456 | x2 = image_width 457 | y2 = image_height 458 | 459 | img = img.crop((x1, y1, x2, y2)) 460 | 461 | return img.resize((self.size, self.size), self.interpolation) 462 | 463 | def randomize_parameters(self): 464 | random.seed(1024) 465 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 466 | random.seed(1024) 467 | self.crop_position = self.crop_positions[random.randint( 468 | 0, 469 | len(self.crop_positions) - 1)] 470 | 471 | 472 | class MultiScaleRandomCrop(object): 473 | """ 474 | Crop the given PIL.Image to randomly selected size. 475 | A position of cropping is randomly selected. 
476 | """ 477 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 478 | self.scales = scales 479 | self.size = size 480 | self.interpolation = interpolation 481 | 482 | def __call__(self, img): 483 | min_length = min(img.size[0], img.size[1]) 484 | crop_size = int(min_length * self.scale) 485 | 486 | image_width = img.size[0] 487 | image_height = img.size[1] 488 | 489 | x1 = self.tl_x * (image_width - crop_size) 490 | y1 = self.tl_y * (image_height - crop_size) 491 | x2 = x1 + crop_size 492 | y2 = y1 + crop_size 493 | 494 | img = img.crop((x1, y1, x2, y2)) 495 | 496 | return img.resize((self.size, self.size), self.interpolation) 497 | 498 | def randomize_parameters(self): 499 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 500 | random.seed(1024) 501 | self.tl_x = random.random() 502 | random.seed(1024) 503 | self.tl_y = random.random() 504 | -------------------------------------------------------------------------------- /kinetics400_attack_samples.csv: -------------------------------------------------------------------------------- 1 | path,gt_label,clip_index 2 | abseiling/YqTT34PsD5c_000003_000013.mp4,0,-1 3 | air drumming/--nQbRBEz2s_000104_000114.mp4,1,-1 4 | answering questions/AqPeHqTDfGE_000068_000078.mp4,2,-1 5 | applauding/ieIq7ym_UXQ_000007_000017.mp4,3,-1 6 | applying cream/rFee2NCkWQE_000013_000023.mp4,4,-1 7 | archery/FzfqEd36YbY_001101_001111.mp4,5,-1 8 | arm wrestling/_Lo3hFbum_o_000005_000015.mp4,6,-1 9 | arranging flowers/cuOsRai-HCE_000126_000136.mp4,7,-1 10 | assembling computer/CB1iIWtDpSI_000431_000441.mp4,8,-1 11 | auctioning/6jcDGC4LF5s_001022_001032.mp4,9,-1 12 | baby waking up/X66nHKtYtt0_000008_000018.mp4,10,-1 13 | baking cookies/LG2hEf9ueAM_000178_000188.mp4,11,-1 14 | balloon blowing/qjyGo-e_d6I_000076_000086.mp4,12,-1 15 | bandaging/TwSUMZOrLyE_000139_000149.mp4,13,-1 16 | barbequing/_7mBVhDgiO8_000019_000029.mp4,14,-1 17 | bartending/TuCc2RwG2fM_000354_000364.mp4,15,-1 18 | beatboxing/rYnVViYbae0_000000_000010.mp4,16,-1 19 | bee keeping/EpbYRgIsQRg_000018_000028.mp4,17,-1 20 | belly dancing/cMVkWCb3fE8_000103_000113.mp4,18,-1 21 | bench pressing/ehxWC3nDZC8_000003_000013.mp4,19,-1 22 | bending back/DA8K3c4HgVo_000003_000013.mp4,20,-1 23 | bending metal/U-iQuIgd5ps_000478_000488.mp4,21,-1 24 | biking through snow/g3GJvDqtfys_000031_000041.mp4,22,-1 25 | blasting sand/NAqrwQ54ptY_000028_000038.mp4,23,-1 26 | blowing glass/p2NEs8gon0k_000381_000391.mp4,24,-1 27 | blowing leaves/wKgo6AS5C80_000044_000054.mp4,25,-1 28 | blowing nose/gu0QuD4zpzg_000030_000040.mp4,26,-1 29 | blowing out candles/9IzWImcF3hM_000032_000042.mp4,27,-1 30 | bobsledding/uftReOMM9-A_000063_000073.mp4,28,-1 31 | bookbinding/hmoPcSFBYPY_000222_000232.mp4,29,-1 32 | bouncing on trampoline/bekao5nG02M_000024_000034.mp4,30,-1 33 | bowling/8pBjZcOc8MY_000096_000106.mp4,31,-1 34 | braiding hair/-dLVSg5JvxY_000022_000032.mp4,32,-1 35 | breading or breadcrumbing/-_3E3GBXAUc_000010_000020.mp4,33,-1 36 | breakdancing/4T2F4PQ97GE_000008_000018.mp4,34,-1 37 | brush painting/YHUeGa8Eu70_000225_000235.mp4,35,-1 38 | brushing hair/9LnZrptwj6Q_000314_000324.mp4,36,-1 39 | brushing teeth/8NStNQyjIXI_000054_000064.mp4,37,-1 40 | building cabinet/CG9DKR4lPC0_001821_001831.mp4,38,-1 41 | building shed/6q-XsQgZ8_w_000044_000054.mp4,39,-1 42 | bungee jumping/7Goki93f5mo_000018_000028.mp4,40,-1 43 | busking/CU6MFCvEct0_000016_000026.mp4,41,-1 44 | canoeing or kayaking/cdDu63UKbu0_000164_000174.mp4,42,-1 45 | capoeira/lm6ibanrGK8_000000_000010.mp4,43,-1 46 | carrying 
baby/ztAfXKZ0ovM_000141_000151.mp4,44,-1 47 | cartwheeling/EiZvgwrHCMk_000000_000010.mp4,45,-1 48 | carving pumpkin/oPoLYdOTOt0_000000_000010.mp4,46,-1 49 | catching fish/dpfJTo3nywA_000028_000038.mp4,47,-1 50 | catching or throwing baseball/AsPjORZU-cU_000055_000065.mp4,48,-1 51 | catching or throwing frisbee/RxgW7Hdn4YM_000006_000016.mp4,49,-1 52 | catching or throwing softball/kU3qQGVRT-g_000011_000021.mp4,50,-1 53 | celebrating/2lBUaUBD9JE_000018_000028.mp4,51,-1 54 | changing oil/-aJHPlJTesM_000734_000744.mp4,52,-1 55 | changing wheel/EQNTFw62uh8_000251_000261.mp4,53,-1 56 | checking tires/wUQduZ3i-VM_000275_000285.mp4,54,-1 57 | cheerleading/6LOV6-dkNZE_000251_000261.mp4,55,-1 58 | chopping wood/nyFulYDEKFs_000017_000027.mp4,56,-1 59 | clapping/M9NORCUCrtE_000003_000013.mp4,57,-1 60 | clay pottery making/PP7MtP6BMkY_000193_000203.mp4,58,-1 61 | clean and jerk/R6pk7NDa7Mw_000015_000025.mp4,59,-1 62 | cleaning floor/vVlrGgL9dxk_000004_000014.mp4,60,-1 63 | cleaning gutters/22xdXMMq6XE_000040_000050.mp4,61,-1 64 | cleaning pool/AFvEYQkSmfk_000123_000133.mp4,62,-1 65 | cleaning shoes/WJEGNo9YETM_000203_000213.mp4,63,-1 66 | cleaning toilet/BjS2g1oZj_s_000065_000075.mp4,64,-1 67 | cleaning windows/OiN3AgBVB80_000003_000013.mp4,65,-1 68 | climbing a rope/NfH4FZhrtvE_000002_000012.mp4,66,-1 69 | climbing ladder/70Er7J3srS0_000001_000011.mp4,67,-1 70 | climbing tree/aM1AgHyvm4E_000017_000027.mp4,68,-1 71 | contact juggling/yymr4YWVFe4_000046_000056.mp4,69,-1 72 | cooking chicken/pj8TWS7KEeY_000024_000034.mp4,70,-1 73 | cooking egg/Ao3M2TPI3sQ_000294_000304.mp4,71,-1 74 | cooking on campfire/BQfDmW1Nodk_000002_000012.mp4,72,-1 75 | cooking sausages/52AOa09jJWs_000195_000205.mp4,73,-1 76 | counting money/kPCbWDyAcFE_000000_000010.mp4,74,-1 77 | country line dancing/suHCOVoGPMU_000475_000485.mp4,75,-1 78 | cracking neck/j_EiZph3YKE_000001_000011.mp4,76,-1 79 | crawling baby/GpPvqvsqGy0_000006_000016.mp4,77,-1 80 | crossing river/luTkBLIT6lU_000036_000046.mp4,78,-1 81 | crying/zCEEKnSB_RU_000000_000010.mp4,79,-1 82 | curling hair/gwNMVUlBUtY_000068_000078.mp4,80,-1 83 | cutting nails/es35biYvLRA_000020_000030.mp4,81,-1 84 | cutting pineapple/T5jQWQg2eNc_000000_000010.mp4,82,-1 85 | cutting watermelon/LBgRTCVwyik_000042_000052.mp4,83,-1 86 | dancing ballet/s_gGtYIrtsc_000118_000128.mp4,84,-1 87 | dancing charleston/FQpLIyAfbqI_000023_000033.mp4,85,-1 88 | dancing gangnam style/o_TIgx4gb_M_000023_000033.mp4,86,-1 89 | dancing macarena/dXIyWMidYa0_000008_000018.mp4,87,-1 90 | deadlifting/zvamd5T7yj8_000001_000011.mp4,88,-1 91 | decorating the christmas tree/kQDSa-xhsLY_000035_000045.mp4,89,-1 92 | digging/42Vx9FGzmkM_000075_000085.mp4,90,-1 93 | dining/-vOrVT1CiPQ_000080_000090.mp4,91,-1 94 | disc golfing/_owWHGvn_b0_000112_000122.mp4,92,-1 95 | diving cliff/1MmjE51PeIE_000015_000025.mp4,93,-1 96 | dodgeball/wFIuMu2w9pA_000010_000020.mp4,94,-1 97 | doing aerobics/-53DvfE42gE_001767_001777.mp4,95,-1 98 | doing laundry/qkd7laDeom0_000098_000108.mp4,96,-1 99 | doing nails/UixL7lHSHR8_000040_000050.mp4,97,-1 100 | drawing/IPmic5VRb7I_000066_000076.mp4,98,-1 101 | dribbling basketball/qoODmONT1a0_000019_000029.mp4,99,-1 102 | drinking/15FiZ48tTUU_000045_000055.mp4,100,-1 103 | drinking beer/382B3Q3xttk_000000_000010.mp4,101,-1 104 | drinking shots/o1hqepKau4A_000004_000014.mp4,102,-1 105 | driving car/NUG7kwJ-614_000400_000410.mp4,103,-1 106 | driving tractor/WtnQKvOuukE_000081_000091.mp4,104,-1 107 | drop kicking/pvuiN-G8-yc_000000_000010.mp4,105,-1 108 | drumming 
fingers/eap32WOJcAU_000108_000118.mp4,106,-1 109 | dunking basketball/WC2FOUSNyvE_000006_000016.mp4,107,-1 110 | dying hair/-7E9WiX7QfA_000053_000063.mp4,108,-1 111 | eating burger/w9G7CpkBBM0_000000_000010.mp4,109,-1 112 | eating cake/8QhblWHnNAY_000019_000029.mp4,110,-1 113 | eating carrots/V4IaThkaK6Y_000025_000035.mp4,111,-1 114 | eating chips/I5Y53-Q9KRo_000444_000454.mp4,112,-1 115 | eating doughnuts/HyUF0Uo0f2A_000077_000087.mp4,113,-1 116 | eating hotdog/FTOgHjhqlhU_000054_000064.mp4,114,-1 117 | eating ice cream/0fCDlKYkRxc_000081_000091.mp4,115,-1 118 | eating spaghetti/DiSP2oDGQ1Q_000014_000024.mp4,116,-1 119 | eating watermelon/pLA62YSoEoM_000002_000012.mp4,117,-1 120 | egg hunting/U9vSW3-zJ9s_000007_000017.mp4,118,-1 121 | exercising arm/0wZpjStZtUY_000001_000011.mp4,119,-1 122 | exercising with an exercise ball/oj7Qgyz5KK8_000143_000153.mp4,120,-1 123 | extinguishing fire/BVXG_JOh9jQ_000002_000012.mp4,121,-1 124 | faceplanting/petld-72OXM_000001_000011.mp4,122,-1 125 | feeding birds/QJSwBNxKYqg_000120_000130.mp4,123,-1 126 | feeding fish/ZtkTAHzih9Q_000084_000094.mp4,124,-1 127 | feeding goats/v5Bl68y5ra0_000006_000016.mp4,125,-1 128 | filling eyebrows/XycmcISYPA8_000045_000055.mp4,126,-1 129 | finger snapping/j6qYhS2W1fM_000001_000011.mp4,127,-1 130 | fixing hair/-65aI53dvdE_000022_000032.mp4,128,-1 131 | flipping pancake/HIBxq2P0BL0_000004_000014.mp4,129,-1 132 | flying kite/hAQJ9GHklS4_000004_000014.mp4,130,-1 133 | folding clothes/HvbmGxDuNxs_000035_000045.mp4,131,-1 134 | folding napkins/iCtT6ZadoOM_000052_000062.mp4,132,-1 135 | folding paper/soHl6SrXlEI_000105_000115.mp4,133,-1 136 | front raises/ObO_Gnw1nOQ_000005_000015.mp4,134,-1 137 | frying vegetables/1IDdvXnTI60_000123_000133.mp4,135,-1 138 | garbage collecting/KxTIEKllIzg_000114_000124.mp4,136,-1 139 | gargling/HAPBKE3Qo5A_000217_000227.mp4,137,-1 140 | getting a haircut/lVwFn9m8Q_Q_000053_000063.mp4,138,-1 141 | getting a tattoo/g8dOsqPBe7A_000657_000667.mp4,139,-1 142 | giving or receiving award/LmuS2GreXkc_000033_000043.mp4,140,-1 143 | golf chipping/NIf0bxodA9E_000120_000130.mp4,141,-1 144 | golf driving/1Q-E6UW1XE8_000011_000021.mp4,142,-1 145 | golf putting/VS9uEOvJhzg_000000_000010.mp4,143,-1 146 | grinding meat/SErnxQf4ONQ_000230_000240.mp4,144,-1 147 | grooming dog/Q9mt0lJjQUA_000105_000115.mp4,145,-1 148 | grooming horse/kaVWY-GyXcs_000063_000073.mp4,146,-1 149 | gymnastics tumbling/mlzx2bi9nwQ_000059_000069.mp4,147,-1 150 | hammer throw/WUrwglFhY64_000002_000012.mp4,148,-1 151 | headbanging/ZhDdQmHIM78_000044_000054.mp4,149,-1 152 | high jump/M2j1BTibIzs_000000_000010.mp4,151,-1 153 | high kick/NdjLKFhn9j0_000004_000014.mp4,152,-1 154 | hitting baseball/e8uB0GZsVOQ_000034_000044.mp4,153,-1 155 | hockey stop/Nrscg8fLYqY_000049_000059.mp4,154,-1 156 | holding snake/6cbXqLP0FHE_000002_000012.mp4,155,-1 157 | hopscotch/vxp0SOd2W1E_000002_000012.mp4,156,-1 158 | hoverboarding/E1Smsuf6cpE_000147_000157.mp4,157,-1 159 | hugging/xWyOTDxm9yQ_000009_000019.mp4,158,-1 160 | hula hooping/UjfYNVaZ39Y_000087_000097.mp4,159,-1 161 | hurdling/Xa6gI4yGLQo_000000_000010.mp4,160,-1 162 | hurling (sport)/ml2eBC_nXrw_000055_000065.mp4,161,-1 163 | ice climbing/UM1fUqvFnME_000048_000058.mp4,162,-1 164 | ice fishing/GO6YI36E_Do_000140_000150.mp4,163,-1 165 | ice skating/vMZLTP9MfZ4_000008_000018.mp4,164,-1 166 | ironing/ZgHZ0KgFOSc_000215_000225.mp4,165,-1 167 | javelin throw/E5xdkQvnhkc_000002_000012.mp4,166,-1 168 | jetskiing/Be59Cot2yGI_000233_000243.mp4,167,-1 169 | jogging/kBUt5duOHFU_000005_000015.mp4,168,-1 170 | 
juggling balls/YH801xSLkZM_000000_000010.mp4,169,-1 171 | juggling fire/TA2mmXre8HQ_000000_000010.mp4,170,-1 172 | juggling soccer ball/WAPctsQ-SwM_000000_000010.mp4,171,-1 173 | jumping into pool/kjzgLLaYO8w_000010_000020.mp4,172,-1 174 | jumpstyle dancing/QeG2HREr6m0_000003_000013.mp4,173,-1 175 | kicking field goal/sR0oOq-qOqs_000015_000025.mp4,174,-1 176 | kicking soccer ball/5PML0iLnBD8_000003_000013.mp4,175,-1 177 | kissing/LmPjkroyPcY_000739_000749.mp4,176,-1 178 | kitesurfing/KOOfe61BIyE_000023_000033.mp4,177,-1 179 | knitting/bCa_5xZa4Ug_002346_002356.mp4,178,-1 180 | krumping/3JxrK2Jt52Y_000754_000764.mp4,179,-1 181 | laughing/UpVXo5Q9JKk_000079_000089.mp4,180,-1 182 | laying bricks/N4HdEYIci0I_000037_000047.mp4,181,-1 183 | long jump/MrlWkj87rfU_000002_000012.mp4,182,-1 184 | lunge/g-XXUD65DyI_000003_000013.mp4,183,-1 185 | making a cake/bX6I6jVAQMI_000028_000038.mp4,184,-1 186 | making a sandwich/jofgWiVBwqo_000086_000096.mp4,185,-1 187 | making bed/yD42KW6cm-A_000820_000830.mp4,186,-1 188 | making jewelry/wMWkwQ7HXik_000616_000626.mp4,187,-1 189 | making pizza/wxgqu30nSLE_000000_000010.mp4,188,-1 190 | making snowman/8kN7EyPBmrI_000082_000092.mp4,189,-1 191 | making sushi/Ah2YqA7bmHY_000055_000065.mp4,190,-1 192 | making tea/hs2MVCM2LdY_000043_000053.mp4,191,-1 193 | marching/_h60EbUbh3I_000026_000036.mp4,192,-1 194 | massaging back/zsJ2PmhGM98_000215_000225.mp4,193,-1 195 | massaging feet/BwMKdpNAmy4_000090_000100.mp4,194,-1 196 | massaging legs/0EJXIQ1ltjo_000013_000023.mp4,195,-1 197 | massaging person's head/z-6l_dkR3vE_000299_000309.mp4,196,-1 198 | milking cow/DdUTLqyZ5b8_000044_000054.mp4,197,-1 199 | mopping floor/-F-aEPmjERo_000043_000053.mp4,198,-1 200 | motorcycling/kthzjAS1XS8_000009_000019.mp4,199,-1 201 | moving furniture/b9vF-F1LC5g_000003_000013.mp4,200,-1 202 | mowing lawn/t5SHfHDj0uw_000006_000016.mp4,201,-1 203 | news anchoring/xJMgxnXI0GY_000000_000010.mp4,202,-1 204 | opening bottle/gWd5AU5wP0k_000041_000051.mp4,203,-1 205 | opening present/vd4uGb1162o_000002_000012.mp4,204,-1 206 | paragliding/GF4WEdN_H0s_000191_000201.mp4,205,-1 207 | parasailing/GuClMEvE3gM_000055_000065.mp4,206,-1 208 | parkour/ptgKO940ISM_000042_000052.mp4,207,-1 209 | passing American football (in game)/ixMPVi3Zr9s_000001_000011.mp4,208,-1 210 | passing American football (not in game)/RxO7IEU7_I8_000391_000401.mp4,209,-1 211 | peeling apples/8qEAQXckcVw_000003_000013.mp4,210,-1 212 | peeling potatoes/_3CsQJ6XpHo_000015_000025.mp4,211,-1 213 | petting animal (not cat)/tlWjTLpoWLw_000000_000010.mp4,212,-1 214 | petting cat/q1GijBRBqjE_000203_000213.mp4,213,-1 215 | picking fruit/NTfCraM0XyM_000257_000267.mp4,214,-1 216 | planting trees/_WzkPBxP-5g_000096_000106.mp4,215,-1 217 | plastering/mdN9BDP0cVY_000032_000042.mp4,216,-1 218 | playing accordion/syp1O0cjens_000038_000048.mp4,217,-1 219 | playing badminton/tJz980bJ3UI_000065_000075.mp4,218,-1 220 | playing bagpipes/fMeaggq0_rA_000032_000042.mp4,219,-1 221 | playing basketball/3mIvIgAlniY_000001_000011.mp4,220,-1 222 | playing bass guitar/HqsAvuo5XhA_000059_000069.mp4,221,-1 223 | playing cards/IVP8pO4Q8Hs_000084_000094.mp4,222,-1 224 | playing cello/rsN982-8cvg_000042_000052.mp4,223,-1 225 | playing chess/xFq-OJ8HDJs_000185_000195.mp4,224,-1 226 | playing clarinet/7g4aL1EX8EI_001210_001220.mp4,225,-1 227 | playing controller/gxbUZcsy4EA_000097_000107.mp4,226,-1 228 | playing cricket/lgPslaxBQt0_000000_000010.mp4,227,-1 229 | playing cymbals/--Y25nDn2Wk_000060_000070.mp4,228,-1 230 | playing 
didgeridoo/2ezT7E6g8Ew_000044_000054.mp4,229,-1 231 | playing drums/kXhnTK9TVsU_000076_000086.mp4,230,-1 232 | playing flute/wqYzrDwV_o4_000047_000057.mp4,231,-1 233 | playing guitar/ysjCIR7SkJU_000141_000151.mp4,232,-1 234 | playing harmonica/DpJQShJs2kI_000036_000046.mp4,233,-1 235 | playing harp/Ud-INZAw5Ik_000163_000173.mp4,234,-1 236 | playing ice hockey/kRWk_-5d5bs_000010_000020.mp4,235,-1 237 | playing keyboard/vxVoptVwZp4_000027_000037.mp4,236,-1 238 | playing kickball/d5TMlt6P-ug_000317_000327.mp4,237,-1 239 | playing monopoly/SsAtR4oD7WY_000000_000010.mp4,238,-1 240 | playing organ/b9TfeDnfemw_000047_000057.mp4,239,-1 241 | playing paintball/DOL1_JLWeoo_000321_000331.mp4,240,-1 242 | playing piano/l4zZtMgNPvU_000009_000019.mp4,241,-1 243 | playing poker/-0NQHRndkPI_000004_000014.mp4,242,-1 244 | playing recorder/Zl_ey-UqwpY_000001_000011.mp4,243,-1 245 | playing saxophone/K06EmNd6t_I_000006_000016.mp4,244,-1 246 | playing squash or racquetball/-yUM3WwKQHM_000032_000042.mp4,245,-1 247 | playing tennis/VoAJFfutNlg_000060_000070.mp4,246,-1 248 | playing trombone/vqNbapex1kU_000015_000025.mp4,247,-1 249 | playing trumpet/-BtzVCzSnLk_000073_000083.mp4,248,-1 250 | playing ukulele/vE6Cnt7XJrg_000026_000036.mp4,249,-1 251 | playing violin/t2XntpSO4Yo_000105_000115.mp4,250,-1 252 | playing volleyball/5Wle9ClW4q0_000170_000180.mp4,251,-1 253 | playing xylophone/N586DnjSCxo_000000_000010.mp4,252,-1 254 | pole vault/9g4Sf8aWIx8_000003_000013.mp4,253,-1 255 | presenting weather forecast/lVSiCfeBP8I_000152_000162.mp4,254,-1 256 | pull ups/yLVMDD7b0xM_000020_000030.mp4,255,-1 257 | pumping fist/V-IqR1THKr4_000015_000025.mp4,256,-1 258 | pumping gas/eanhmmKIolc_000044_000054.mp4,257,-1 259 | punching bag/3baFNAxC2YI_000012_000022.mp4,258,-1 260 | punching person (boxing)/D5iLGttoHr4_000022_000032.mp4,259,-1 261 | push up/-B2oGkg1qSI_000012_000022.mp4,260,-1 262 | pushing car/-46DNkpyApI_000045_000055.mp4,261,-1 263 | pushing cart/p9CIcEEaSEk_000001_000011.mp4,262,-1 264 | pushing wheelchair/5gQlgNS5qfY_000023_000033.mp4,263,-1 265 | reading book/XoO1uEVNgjM_000058_000068.mp4,264,-1 266 | reading newspaper/gKqKWn6Nl0A_000035_000045.mp4,265,-1 267 | recording music/864rV9vdAK4_000577_000587.mp4,266,-1 268 | riding a bike/Ig-eRsgi6CU_000339_000349.mp4,267,-1 269 | riding camel/bGzjObGU_qM_000014_000024.mp4,268,-1 270 | riding elephant/j06vowPye30_000009_000019.mp4,269,-1 271 | riding mechanical bull/eJpkgBaykQ8_000029_000039.mp4,270,-1 272 | riding mountain bike/O95dOpT9T-c_000039_000049.mp4,271,-1 273 | riding mule/azD58bwAe7E_000003_000013.mp4,272,-1 274 | riding or walking with horse/C9pFs8sDARw_000218_000228.mp4,273,-1 275 | riding scooter/FGCNMNjanO4_000013_000023.mp4,274,-1 276 | riding unicycle/9RN16I79P9U_000000_000010.mp4,275,-1 277 | ripping paper/-Ovwq0kVUx4_000002_000012.mp4,276,-1 278 | robot dancing/5hQW4BHjWvM_000061_000071.mp4,277,-1 279 | rock climbing/2jXlO2nzHGE_000026_000036.mp4,278,-1 280 | rock scissors paper/Kxbdg32t6bU_000001_000011.mp4,279,-1 281 | roller skating/_vjX5nPwTBs_000072_000082.mp4,280,-1 282 | running on treadmill/BrKuhHIHccg_000049_000059.mp4,281,-1 283 | sailing/h1SM1ArgB0E_000034_000044.mp4,282,-1 284 | salsa dancing/WW4N7GToB5I_000313_000323.mp4,283,-1 285 | sanding floor/EbXC4bGpZ4M_000034_000044.mp4,284,-1 286 | scrambling eggs/ojJpJZpACdE_000245_000255.mp4,285,-1 287 | scuba diving/64BhyrIZkz0_000002_000012.mp4,286,-1 288 | setting table/tSQqcJqGplA_000011_000021.mp4,287,-1 289 | shaking hands/lCQ17mGZeVE_000029_000039.mp4,288,-1 290 | shaking 
head/WUOxNQKdRMM_000065_000075.mp4,289,-1 291 | sharpening knives/iaQPoVg8Xtw_000468_000478.mp4,290,-1 292 | sharpening pencil/ZMMCn1JE0Vc_000001_000011.mp4,291,-1 293 | shaving head/K1C_jI8z1F8_000261_000271.mp4,292,-1 294 | shaving legs/zvjNnDhUTxE_000034_000044.mp4,293,-1 295 | shearing sheep/Vaff3l43A40_000018_000028.mp4,294,-1 296 | shining shoes/HhW13wPky1U_000556_000566.mp4,295,-1 297 | shooting basketball/Y3oHAIylSrg_000031_000041.mp4,296,-1 298 | shooting goal (soccer)/ezJdtQzJ7qI_000021_000031.mp4,297,-1 299 | shot put/SQddPtgoQGE_000007_000017.mp4,298,-1 300 | shoveling snow/SZRDWgGOpXY_000062_000072.mp4,299,-1 301 | shredding paper/KXyOXrWiJGY_000022_000032.mp4,300,-1 302 | shuffling cards/_k0w_3JFfmE_000026_000036.mp4,301,-1 303 | side kick/sZ8JiPfAoWc_000005_000015.mp4,302,-1 304 | sign language interpreting/fKvqQEGGf6E_000031_000041.mp4,303,-1 305 | singing/FZrg29zsAe8_000023_000033.mp4,304,-1 306 | situp/jTMZX30XTXA_000072_000082.mp4,305,-1 307 | skateboarding/kIzdzzMLCJI_000199_000209.mp4,306,-1 308 | ski jumping/XQUsRpJ1A_Y_000001_000011.mp4,307,-1 309 | skiing (not slalom or crosscountry)/fRiYQEVMcEc_000000_000010.mp4,308,-1 310 | skiing crosscountry/pfvt6iYSXXw_000764_000774.mp4,309,-1 311 | skiing slalom/Ch_wt_nV2k4_000702_000712.mp4,310,-1 312 | skipping rope/xzXJJIni2hQ_000238_000248.mp4,311,-1 313 | skydiving/5uw3m1tIvJ0_000057_000067.mp4,312,-1 314 | slacklining/iLj4i4fTzn0_000038_000048.mp4,313,-1 315 | sled dog racing/yMQMfdV-Fzs_000025_000035.mp4,315,-1 316 | smoking/xo_9xPRu7_4_000114_000124.mp4,316,-1 317 | smoking hookah/C_QP4vOVTrE_000164_000174.mp4,317,-1 318 | snatch weight lifting/827ciUyYK5k_000000_000010.mp4,318,-1 319 | sneezing/ce6aUvCKpbU_000000_000010.mp4,319,-1 320 | sniffing/u2s0kiGG7AU_000011_000021.mp4,320,-1 321 | snorkeling/rsDfe_ikY1I_000010_000020.mp4,321,-1 322 | snowboarding/4cjhTsZjNP8_000202_000212.mp4,322,-1 323 | snowkiting/ToDS3RIVybY_000025_000035.mp4,323,-1 324 | snowmobiling/EgkRnTkj8gc_000003_000013.mp4,324,-1 325 | somersaulting/hFYg1xqG5yk_000154_000164.mp4,325,-1 326 | spinning poi/by9gw0ipuUg_000002_000012.mp4,326,-1 327 | spray painting/zyR32Dm9yek_000019_000029.mp4,327,-1 328 | spraying/LJQX3Atdn4k_000043_000053.mp4,328,-1 329 | springboard diving/T3b1nxhG9Lo_000026_000036.mp4,329,-1 330 | squat/ENkU87uTdfU_000025_000035.mp4,330,-1 331 | sticking tongue out/E6ZgFC1L178_000041_000051.mp4,331,-1 332 | stomping grapes/gzAmaRypLyI_000062_000072.mp4,332,-1 333 | stretching arm/qKiTc6GGT4c_000036_000046.mp4,333,-1 334 | stretching leg/-hkrPB2YU50_000612_000622.mp4,334,-1 335 | strumming guitar/-2GJPqAglxU_000862_000872.mp4,335,-1 336 | surfing crowd/tpruGil1UCs_000038_000048.mp4,336,-1 337 | surfing water/-G_tgkmqChg_000072_000082.mp4,337,-1 338 | sweeping floor/bHy05OAiL1g_000027_000037.mp4,338,-1 339 | swimming backstroke/7aGVsi5ZgMI_000023_000033.mp4,339,-1 340 | swimming breast stroke/BX-dyfoGFsE_000168_000178.mp4,340,-1 341 | swimming butterfly stroke/aAYaI35qR5Q_000005_000015.mp4,341,-1 342 | swing dancing/LVyo14Q5PmY_000006_000016.mp4,342,-1 343 | swinging legs/W52Cl1ed1LU_000000_000010.mp4,343,-1 344 | swinging on something/ibRyH1Q1bbo_000066_000076.mp4,344,-1 345 | sword fighting/_kcVbo4E2JQ_000101_000111.mp4,345,-1 346 | tai chi/BbFbo987QEo_000057_000067.mp4,346,-1 347 | taking a shower/zW5Gt8bfZbc_000011_000021.mp4,347,-1 348 | tango dancing/dFDdr9zxfzc_000101_000111.mp4,348,-1 349 | tap dancing/dZ1EkA3BuQ4_000000_000010.mp4,349,-1 350 | tapping guitar/7Nqupt1WIn4_000031_000041.mp4,350,-1 351 | tapping 
pen/xcsH3jFtdSg_000026_000036.mp4,351,-1 352 | tasting beer/UcAAItCUJrk_000518_000528.mp4,352,-1 353 | tasting food/SQCsXDtiARU_000332_000342.mp4,353,-1 354 | testifying/O4ystlpCCxM_000010_000020.mp4,354,-1 355 | texting/V_LQXRQVrok_000116_000126.mp4,355,-1 356 | throwing axe/1YmkhTmmyRc_000002_000012.mp4,356,-1 357 | throwing ball/A6JeTfQqm0I_000000_000010.mp4,357,-1 358 | throwing discus/gjNdAGf_16Y_000084_000094.mp4,358,-1 359 | tickling/W9Ydqjoda9c_000000_000010.mp4,359,-1 360 | tobogganing/SWmBChx-7fI_000003_000013.mp4,360,-1 361 | tossing coin/Iwg-Had3-wE_000002_000012.mp4,361,-1 362 | tossing salad/PcbZKLvO6gc_000181_000191.mp4,362,-1 363 | training dog/am3gcomIUa4_000048_000058.mp4,363,-1 364 | trapezing/p-E3XWgf3Wk_000012_000022.mp4,364,-1 365 | trimming or shaving beard/X9mmMztC1Vo_000366_000376.mp4,365,-1 366 | trimming trees/Q_F85_VgKwM_000045_000055.mp4,366,-1 367 | triple jump/UergZFP-AdM_000002_000012.mp4,367,-1 368 | tying bow tie/c-t7EA00jj8_000016_000026.mp4,368,-1 369 | tying knot (not on a tie)/Vdx6g26ZOE0_000002_000012.mp4,369,-1 370 | tying tie/DIrYfnfogiA_000148_000158.mp4,370,-1 371 | unboxing/5lmmjOhih3U_000046_000056.mp4,371,-1 372 | unloading truck/-aKzhHxNXDo_000066_000076.mp4,372,-1 373 | using computer/5R1KJn3Pqa8_000066_000076.mp4,373,-1 374 | using remote controller (not gaming)/Bj7_KWKEXp8_000046_000056.mp4,374,-1 375 | using segway/QQTUQu4emh8_000145_000155.mp4,375,-1 376 | vault/A4U2LxAwIm4_000031_000041.mp4,376,-1 377 | waiting in line/5V_Ed93k2bI_000059_000069.mp4,377,-1 378 | walking the dog/3NlgmP6MDmY_000021_000031.mp4,378,-1 379 | washing dishes/oEkXkrSbFU8_000052_000062.mp4,379,-1 380 | washing feet/n3vpap_pQ-U_000076_000086.mp4,380,-1 381 | washing hair/BhU4HGJ2q4s_000004_000014.mp4,381,-1 382 | washing hands/-jtKtX9gGdY_000005_000015.mp4,382,-1 383 | water skiing/F1KYDfTyuEI_000040_000050.mp4,383,-1 384 | water sliding/N6lBqLeKs8I_000001_000011.mp4,384,-1 385 | watering plants/jZfXAIU4rZ4_000073_000083.mp4,385,-1 386 | waxing back/P5qR6CoGbk8_000035_000045.mp4,386,-1 387 | waxing chest/oRKbez1LpWU_000080_000090.mp4,387,-1 388 | waxing eyebrows/hjzI8c63hVo_000011_000021.mp4,388,-1 389 | waxing legs/dzeivZlP6tU_000024_000034.mp4,389,-1 390 | weaving basket/oD0wHopSNLU_000000_000010.mp4,390,-1 391 | welding/5hSYP2XxBGY_000204_000214.mp4,391,-1 392 | whistling/KFOWZBfLHrA_000084_000094.mp4,392,-1 393 | windsurfing/nDlR90yHqPY_000112_000122.mp4,393,-1 394 | wrapping present/zYjHJNadEj4_000246_000256.mp4,394,-1 395 | wrestling/UP_iRJv5mPU_000150_000160.mp4,395,-1 396 | writing/OrWjyz2bFJQ_000064_000074.mp4,396,-1 397 | yawning/SaJWnqViSLo_000023_000033.mp4,397,-1 398 | yoga/5NysTi21_D0_000003_000013.mp4,398,-1 399 | zumba/BvO4NNTw7Ks_000094_000104.mp4,399,-1 400 | headbutting/PzD2BkZye2U_000013_000023.mp4,150,6 401 | slapping/WGxSNBg_tl0_000075_000085.mp4,314,9 402 | -------------------------------------------------------------------------------- /image_attacks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | import torchvision.models as models 5 | import random 6 | 7 | from image_cam import GradCAM 8 | from torch.autograd import Variable 9 | from image_cam_utils import find_alexnet_layer, find_vgg_layer, find_resnet_layer, find_densenet_layer, find_squeezenet_layer 10 | import pickle as pkl 11 | 12 | class Attack(object): 13 | """ 14 | Base class for all attacks. 15 | .. 
note:: 16 | It automatically set device to the device where given model is. 17 | It temporarily changes the model's training mode to `test` 18 | by `.eval()` only during an attack process. 19 | """ 20 | def __init__(self, name, model=None): 21 | r""" 22 | Initializes internal attack state. 23 | Arguments: 24 | name (str) : name of an attack. 25 | model (torch.nn.Module): model to attack. 26 | """ 27 | self.attack = name 28 | self.model = model 29 | self.model_name = str(model).split("(")[0] 30 | 31 | # mean and std values are used in pytorch pretrained models 32 | # they are also used in Kinetics-400. 33 | self.mean = [0.485, 0.456, 0.406] 34 | self.std = [0.229, 0.224, 0.225] 35 | 36 | def forward(self, *input): 37 | r""" 38 | It defines the computation performed at every call (attack forward). 39 | Should be overridden by all subclasses. 40 | """ 41 | raise NotImplementedError 42 | 43 | def _transform_perts(self, perts): 44 | dtype = perts.dtype 45 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 46 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 47 | perts.div_(std[:, None, None]) 48 | return perts 49 | 50 | def _transform_video(self, video, mode='forward'): 51 | r''' 52 | Transform the video into [0, 1] 53 | ''' 54 | dtype = video.dtype 55 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 56 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 57 | if mode == 'forward': 58 | # [-mean/std, mean/std] 59 | video.sub_(mean[:, None, None]).div_(std[:, None, None]) 60 | elif mode == 'back': 61 | # [0, 1] 62 | video.mul_(std[:, None, None]).add_(mean[:, None, None]) 63 | return video 64 | 65 | def _transform_video_ILAF(self, video, mode='forward'): 66 | r''' 67 | Transform the video into [0, 1] 68 | ''' 69 | dtype = video.dtype 70 | mean = torch.as_tensor(self.mean, dtype=dtype).cuda() 71 | std = torch.as_tensor(self.std, dtype=dtype).cuda() 72 | if mode == 'forward': 73 | # [-mean/std, mean/std] 74 | video.sub_(mean[None, :, None, None, None]).div_(std[None, :, None, None, None]) 75 | elif mode == 'back': 76 | # [0, 1] 77 | video.mul_(std[None, :, None, None, None]).add_(mean[None, :, None, None, None]) 78 | return video 79 | 80 | def __call__(self, *input, **kwargs): 81 | images = self.forward(*input, **kwargs) 82 | return images 83 | 84 | def get_model(model_name): 85 | ''' 86 | ['alexnet', 'vgg', 'resnet', 'densenet', 'squeezenet'] 87 | ''' 88 | if model_name == 'alexnet': 89 | model = models.alexnet(pretrained=True) 90 | # model.features[11/7/4/1] 91 | elif model_name == 'vgg': 92 | model = models.vgg16(pretrained=True) 93 | # model.features[29/20/11/1] 94 | elif model_name == 'resnet': 95 | model = models.resnet101(pretrained=True) 96 | elif model_name == 'densenet': 97 | model = models.densenet161(pretrained=True) 98 | # model.features.denseblock1/2/3/4 99 | # model.features.transition1/2/3,norm5 100 | elif model_name == 'squeezenet': 101 | model = models.squeezenet1_1(pretrained=True) 102 | # model.features[12/9/6/3].expand3x3_activation 103 | model.cuda() 104 | model.eval() 105 | # for m in model.modules(): 106 | # if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 107 | # m.eval() 108 | return model 109 | 110 | def get_models(model_name_lists): 111 | models = [] 112 | for model_name in model_name_lists: 113 | model = get_model(model_name) 114 | models.append(model) 115 | return models 116 | 117 | def get_GradCam(model_name_lists): 118 | gradcams = [] 119 | for model_name in model_name_lists: 120 | model_dict = dict(type=model_name, 
arch=get_model(model_name), input_size=(224, 224)) 121 | this_gradcam = GradCAM(model_dict, False) 122 | gradcams.append(this_gradcam) 123 | return gradcams 124 | 125 | # ***************************************************************** 126 | # paper: Enhancing Cross-Task Black-Box Transferability of 127 | # Adversarial Examples with Dispersion Reduction 128 | # ***************************************************************** 129 | class ImageGuidedStd_Adam(Attack): 130 | ''' 131 | Dispersion Reduction (DR) attack. 132 | paper: Enhancing crosstask black-box transferability of adversarial examples with dispersion reduction 133 | parameters: 134 | depth: {1,2,3,4} 135 | ''' 136 | def __init__(self, model_name_lists, depth, step_size, epsilon=16/255, steps=10): 137 | super(ImageGuidedStd_Adam, self).__init__("ImageGuidedStd_Adam") 138 | self.epsilon = epsilon 139 | self.steps = steps 140 | self.step_size = step_size 141 | self.loss_info = {} 142 | self.depth = depth 143 | self.model = get_models(model_name_lists)[0] 144 | self.model_name = model_name_lists[0] 145 | 146 | self.model.train() 147 | for m in self.model.modules(): 148 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 149 | m.eval() 150 | 151 | self._attention_hook() 152 | 153 | def _find_target_layer(self): 154 | if self.model_name == 'resnet': 155 | return getattr(self.model, 'layer{}'.format(self.depth))[-1] 156 | elif self.model_name == 'alexnet': 157 | depth_to_layer = {1:1,2:4,3:7,4:11} 158 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 159 | elif self.model_name == 'vgg': 160 | depth_to_layer = {1:1,2:11,3:20,4:29} 161 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 162 | elif self.model_name == 'squeezenet': 163 | depth_to_layer = {1:3,2:6,3:9,4:12} 164 | return getattr(self.model, 'features')[depth_to_layer[self.depth]].expand3x3_activation 165 | 166 | def _attention_hook(self): 167 | self.gradients = dict() 168 | self.gradients['value'] = [] 169 | self.activations = dict() 170 | self.activations['value'] = [] 171 | def backward_hook(module, grad_input, grad_output): 172 | self.gradients['value'] += [grad_output[0]] 173 | return None 174 | def forward_hook(module, input, output): 175 | self.activations['value'] += [output] 176 | return None 177 | target_layer = self._find_target_layer() 178 | print (target_layer) 179 | if isinstance(target_layer, list): 180 | for i in target_layer: 181 | i.register_forward_hook(forward_hook) 182 | i.register_backward_hook(backward_hook) 183 | else: 184 | target_layer.register_forward_hook(forward_hook) 185 | target_layer.register_backward_hook(backward_hook) 186 | 187 | def forward(self, videos, labels, video_names): 188 | batch_size = videos.shape[0] 189 | b,c,f,h,w = videos.shape 190 | videos = videos.cuda() 191 | labels = labels.cuda() 192 | 193 | image_inps = videos.permute([0,2,1,3,4]) 194 | image_inps = image_inps.reshape(b*f, c, h, w) 195 | 196 | # define modifer that updated by optimizer. 
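# In the step loop below, `modifier` is a per-frame perturbation of shape
# (b*f, c, h, w), initialized to the small constant 0.01/255 and updated with
# Adam so that the standard deviation (dispersion) of every hooked
# intermediate activation is minimized:
#     cost = sum_l std( f_l( clip(x + clamp(modifier, -eps, eps), 0, 1) ) )
# The clamps to [-epsilon, epsilon] and to the valid [0, 1] pixel range are
# applied inside the loop, and the perturbed frames are re-normalized with
# _transform_video(mode='forward') before each forward pass.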
197 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 198 | modifier = torch.nn.Parameter(modif, requires_grad=True) 199 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 200 | 201 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 202 | 203 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 204 | 205 | for i in range(self.steps): 206 | self.gradients = dict() 207 | self.gradients['value'] = [] 208 | self.activations = dict() 209 | self.activations['value'] = [] 210 | 211 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 212 | true_image = self._transform_video(true_image, mode='forward') # norm 213 | 214 | _ = self.model(true_image) 215 | 216 | std_losses = [] 217 | for mm in range(len(self.activations['value'])): 218 | activations = self.activations['value'][mm].std() 219 | std_losses.append(activations) 220 | cost = torch.sum(torch.stack(std_losses)) 221 | optimizer.zero_grad() 222 | cost.backward() 223 | optimizer.step() 224 | 225 | for ind,vid_name in enumerate(video_names): 226 | if vid_name not in self.loss_info.keys(): 227 | self.loss_info[vid_name] = {} 228 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 229 | 230 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 231 | image_inps = self._transform_video(true_image, mode='forward') 232 | image_inps = image_inps.reshape(b,f,c,h,w) 233 | image_inps = image_inps.permute([0,2,1,3,4]) 234 | return image_inps 235 | 236 | class ImageGuidedFMDirection_Adam(Attack): 237 | ''' 238 | The proposed Image to Video (I2V) attack. 239 | parameters: 240 | depth: {1,2,3,4} 241 | model_name_lists: [a model name] 242 | ''' 243 | def __init__(self, model_name_lists, depth, step_size, epsilon=16/255, steps=10): 244 | super(ImageGuidedFMDirection_Adam, self).__init__("ImageGuidedFMDirection_Adam") 245 | self.epsilon = epsilon 246 | self.steps = steps 247 | self.step_size = step_size 248 | self.loss_info = {} 249 | self.depth = depth 250 | self.model = get_models(model_name_lists)[0] 251 | self.model_name = model_name_lists[0] 252 | 253 | self.model.train() 254 | for m in self.model.modules(): 255 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 256 | m.eval() 257 | 258 | self._attention_hook() 259 | 260 | def _find_target_layer(self): 261 | if self.model_name == 'resnet': 262 | return getattr(self.model, 'layer{}'.format(self.depth))[-1] 263 | elif self.model_name == 'alexnet': 264 | depth_to_layer = {1:1,2:4,3:7,4:11} 265 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 266 | elif self.model_name == 'vgg': 267 | depth_to_layer = {1:1,2:11,3:20,4:29} 268 | return getattr(self.model, 'features')[depth_to_layer[self.depth]] 269 | elif self.model_name == 'squeezenet': 270 | depth_to_layer = {1:3,2:6,3:9,4:12} 271 | return getattr(self.model, 'features')[depth_to_layer[self.depth]].expand3x3_activation 272 | 273 | def _attention_hook(self): 274 | self.gradients = dict() 275 | self.gradients['value'] = [] 276 | self.activations = dict() 277 | self.activations['value'] = [] 278 | def backward_hook(module, grad_input, grad_output): 279 | self.gradients['value'] += [grad_output[0]] 280 | return None 281 | def forward_hook(module, input, output): 282 | self.activations['value'] += [output] 283 | return None 284 | target_layer = self._find_target_layer() 285 | print (target_layer) 286 | if 
isinstance(target_layer, list): 287 | for i in target_layer: 288 | i.register_forward_hook(forward_hook) 289 | i.register_backward_hook(backward_hook) 290 | else: 291 | target_layer.register_forward_hook(forward_hook) 292 | target_layer.register_backward_hook(backward_hook) 293 | 294 | def forward(self, videos, labels, video_names): 295 | batch_size = videos.shape[0] 296 | b,c,f,h,w = videos.shape 297 | videos = videos.cuda() 298 | labels = labels.cuda() 299 | 300 | image_inps = videos.permute([0,2,1,3,4]) 301 | image_inps = image_inps.reshape(b*f, c, h, w) 302 | 303 | # define modifer that updated by optimizer. 304 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 305 | modifier = torch.nn.Parameter(modif, requires_grad=True) 306 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 307 | 308 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 309 | 310 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 311 | 312 | # initial feature map 313 | self.gradients = dict() 314 | self.gradients['value'] = [] 315 | self.activations = dict() 316 | self.activations['value'] = [] 317 | 318 | _ = self.model(image_inps) 319 | init_feature_maps = [] 320 | for mm in range(len(self.activations['value'])): 321 | activations = self.activations['value'][mm] 322 | activations = Variable(activations, requires_grad=False) 323 | init_feature_maps.append(activations) 324 | 325 | for i in range(self.steps): 326 | self.gradients = dict() 327 | self.gradients['value'] = [] 328 | self.activations = dict() 329 | self.activations['value'] = [] 330 | 331 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 332 | true_image = self._transform_video(true_image, mode='forward') # norm 333 | 334 | _ = self.model(true_image) 335 | 336 | losses = [] 337 | for mm in range(len(init_feature_maps)): 338 | activations = self.activations['value'][mm] 339 | init_activations = init_feature_maps[mm] 340 | 341 | this_dir = activations.view(b*f, -1) 342 | init_dir = init_activations.view(b*f, -1) 343 | this_loss = F.cosine_similarity(this_dir, init_dir) 344 | flag = 1 # decrease this_loss 345 | 346 | losses.append(this_loss) 347 | cost = flag * torch.sum(torch.stack(losses)) 348 | 349 | print (cost) 350 | 351 | optimizer.zero_grad() 352 | cost.backward() 353 | optimizer.step() 354 | 355 | for ind,vid_name in enumerate(video_names): 356 | if vid_name not in self.loss_info.keys(): 357 | self.loss_info[vid_name] = {} 358 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 359 | 360 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 361 | image_inps = self._transform_video(true_image, mode='forward') 362 | image_inps = image_inps.reshape(b,f,c,h,w) 363 | image_inps = image_inps.permute([0,2,1,3,4]) 364 | return image_inps 365 | 366 | class ImageGuidedFML2_Adam_MultiModels(Attack): 367 | ''' 368 | The proposed ensemble Image to Video (ENS-I2V) attack. 
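Clean-frame features from every hooked layer of each image model are cached once in forward(); the per-frame perturbation is then optimized with Adam to minimize the cosine similarity between the perturbed and clean features, summed over all models in the ensemble.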
369 | parameters: 370 | depth: {1,2,3,4} 371 | ''' 372 | def __init__(self, model_name_lists, depths, epsilon=16/255, steps=60): 373 | super(ImageGuidedFML2_Adam_MultiModels, self).__init__("ImageGuidedFML2_Adam_MultiModels") 374 | self.epsilon = epsilon 375 | self.steps = steps 376 | self.step_size = 0.005 377 | self.loss_info = {} 378 | self.depths = depths 379 | self.models = get_models(model_name_lists) 380 | self.model_names = model_name_lists 381 | print (model_name_lists) 382 | for i in range(len(self.models)): 383 | self.models[i].train() 384 | for m in self.models[i].modules(): 385 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 386 | m.eval() 387 | model_name = self.model_names[i] 388 | 389 | self._attention_hook(self.models[i], model_name) 390 | 391 | def _find_target_layer(self, model, model_name): 392 | used_depth = self.depths[model_name] 393 | if model_name == 'resnet': 394 | return getattr(model, 'layer{}'.format(used_depth))[-1] 395 | elif model_name == 'alexnet': 396 | depth_to_layer = {1:1,2:4,3:7,4:11} 397 | return getattr(model, 'features')[depth_to_layer[used_depth]] 398 | elif model_name == 'vgg': 399 | depth_to_layer = {1:1,2:11,3:20,4:29} 400 | return getattr(model, 'features')[depth_to_layer[used_depth]] 401 | elif model_name == 'squeezenet': 402 | depth_to_layer = {1:3,2:6,3:9,4:12} 403 | return getattr(model, 'features')[depth_to_layer[used_depth]].expand3x3_activation 404 | 405 | def _attention_hook(self, model, model_name): 406 | self.gradients = dict() 407 | self.gradients['value'] = [] 408 | self.activations = dict() 409 | self.activations['value'] = [] 410 | def backward_hook(module, grad_input, grad_output): 411 | self.gradients['value'] += [grad_output[0]] 412 | return None 413 | def forward_hook(module, input, output): 414 | self.activations['value'] += [output] 415 | return None 416 | target_layer = self._find_target_layer(model, model_name) 417 | print (target_layer) 418 | if isinstance(target_layer, list): 419 | for i in target_layer: 420 | i.register_forward_hook(forward_hook) 421 | i.register_backward_hook(backward_hook) 422 | else: 423 | target_layer.register_forward_hook(forward_hook) 424 | target_layer.register_backward_hook(backward_hook) 425 | 426 | def forward(self, videos, labels, video_names): 427 | batch_size = videos.shape[0] 428 | b,c,f,h,w = videos.shape 429 | videos = videos.cuda() 430 | labels = labels.cuda() 431 | 432 | image_inps = videos.permute([0,2,1,3,4]) 433 | image_inps = image_inps.reshape(b*f, c, h, w) 434 | 435 | # define modifer that updated by optimizer. 
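# The modifier below again holds one perturbation slice per frame
# (shape b*f, c, h, w), starts from 0.01/255 and is optimized with Adam at
# lr=self.step_size (0.005 for this ensemble attack). Inside the step loop,
# every model in self.models runs a forward pass on the perturbed frames, so
# the hooked activations (and therefore the cosine-similarity cost) cover the
# whole ensemble before each optimizer.step().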
436 | modif = torch.Tensor(b*f, c, h, w).fill_(0.01/255).cuda() 437 | modifier = torch.nn.Parameter(modif, requires_grad=True) 438 | optimizer = torch.optim.Adam([modifier], lr=self.step_size) 439 | 440 | unnorm_videos = self._transform_video(image_inps.clone().detach(), mode='back') # [0, 1] 441 | 442 | unnorm_videos = Variable(unnorm_videos, requires_grad=False) 443 | 444 | # initial feature map 445 | self.gradients = dict() 446 | self.gradients['value'] = [] 447 | self.activations = dict() 448 | self.activations['value'] = [] 449 | 450 | for n in range(len(self.models)): 451 | _ = self.models[n](image_inps) 452 | # _ = self.model(image_inps) 453 | init_feature_maps = [] 454 | for mm in range(len(self.activations['value'])): 455 | activations = self.activations['value'][mm] 456 | activations = Variable(activations, requires_grad=False) 457 | init_feature_maps.append(activations) 458 | 459 | for i in range(self.steps): 460 | self.gradients = dict() 461 | self.gradients['value'] = [] 462 | self.activations = dict() 463 | self.activations['value'] = [] 464 | 465 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 466 | true_image = self._transform_video(true_image, mode='forward') # norm 467 | 468 | # _ = self.model(true_image) 469 | for n in range(len(self.models)): 470 | _ = self.models[n](true_image) 471 | losses = [] 472 | for mm in range(len(init_feature_maps)): 473 | activations = self.activations['value'][mm] 474 | init_activations = init_feature_maps[mm] 475 | this_dir = activations.view(b*f, -1) 476 | init_dir = init_activations.view(b*f, -1) 477 | this_loss = F.cosine_similarity(this_dir, init_dir) 478 | flag = 1 # decrease this_loss 479 | losses.append(this_loss) 480 | cost = flag * torch.sum(torch.stack(losses)) 481 | 482 | print (cost) 483 | optimizer.zero_grad() 484 | cost.backward() 485 | optimizer.step() 486 | 487 | for ind,vid_name in enumerate(video_names): 488 | if vid_name not in self.loss_info.keys(): 489 | self.loss_info[vid_name] = {} 490 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 491 | 492 | true_image = torch.clamp(unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 493 | image_inps = self._transform_video(true_image, mode='forward') 494 | image_inps = image_inps.reshape(b,f,c,h,w) 495 | image_inps = image_inps.permute([0,2,1,3,4]) 496 | return image_inps 497 | 498 | class ILAF(Attack): 499 | ''' 500 | ILAF. Paper: Enhancing adversarial example transferability with an intermediate level attack. 
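Starting from an existing adversarial video, ILAF treats the intermediate-feature difference between that video and the clean one as a guide direction, then takes signed-gradient steps on the perturbation that enlarge the norm of the current feature difference while keeping it aligned with the guide; the maximized objective is 0.5 * magnitude gain + angular alignment, summed over the hooked layers.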
501 | ''' 502 | def __init__(self, model, model_type, step_size=0.005, epsilon=16/255, steps=60): 503 | super(ILAF, self).__init__("ILAF") 504 | self.epsilon = epsilon 505 | self.steps = steps 506 | self.step_size = step_size 507 | self.loss_info = {} 508 | self.model_type = model_type 509 | self.model = model 510 | 511 | self._activation_hook() 512 | 513 | def _find_target_layer(self): 514 | if 'i3d' in self.model_type: 515 | return self.model.res_layers._modules['1'] 516 | elif 'slowfast' in self.model_type: 517 | return [self.model._modules['slow_res2'], self.model._modules['fast_res2']] #[b,2048, 8, 7, 7], [b, 256, 32, 7, 7] 518 | elif 'tpn' in self.model_type: 519 | return self.model.layer2 520 | 521 | def _activation_hook(self): 522 | self.activations = dict() 523 | self.activations['value'] = [] 524 | def forward_hook(module, input, output): 525 | self.activations['value'] += [output] 526 | return None 527 | target_layer = self._find_target_layer() 528 | if isinstance(target_layer, list): 529 | for i in target_layer: 530 | i.register_forward_hook(forward_hook) 531 | else: 532 | target_layer.register_forward_hook(forward_hook) 533 | 534 | def forward(self, videos, ori_videos, labels, video_names): 535 | batch_size = videos.shape[0] 536 | b,c,f,h,w = videos.shape 537 | videos = videos.cuda() 538 | labels = labels.cuda() 539 | ori_videos = ori_videos.cuda() 540 | 541 | # ori feature map 542 | ori_feature_maps = [] 543 | self.activations = dict() 544 | self.activations['value'] = [] 545 | with torch.no_grad(): 546 | _ = self.model(ori_videos) 547 | for mm in range(len(self.activations['value'])): 548 | activations = self.activations['value'][mm] 549 | ori_feature_maps.append(activations) 550 | 551 | # existed adv feature map 552 | adv_feature_maps = [] 553 | self.activations = dict() 554 | self.activations['value'] = [] 555 | with torch.no_grad(): 556 | _ = self.model(videos) 557 | for mm in range(len(self.activations['value'])): 558 | activations = self.activations['value'][mm] 559 | adv_feature_maps.append(activations) 560 | 561 | init_directions = [] # normalized direction 562 | init_norms = [] # norm values 563 | for ori_di, adv_di in zip(ori_feature_maps, adv_feature_maps): 564 | init_direction = adv_di - ori_di 565 | norm = torch.norm(init_direction, p=2) 566 | init_norms.append(norm) 567 | init_directions.append(init_direction/torch.norm(init_direction,p=2,keepdim=True)) 568 | 569 | 570 | adv_unnorm_videos = self._transform_video_ILAF(videos.clone().detach(), mode='back') # [0, 1] 571 | ori_unnorm_videos = self._transform_video_ILAF(ori_videos.clone().detach(), mode='back') # [0, 1] 572 | 573 | existed_perturbations = adv_unnorm_videos - ori_unnorm_videos 574 | modifier = torch.Tensor(existed_perturbations.cpu()).cuda() 575 | ori_unnorm_videos = Variable(ori_unnorm_videos, requires_grad=False) 576 | 577 | del adv_feature_maps, adv_unnorm_videos, videos, ori_videos 578 | torch.cuda.empty_cache() 579 | for i in range(self.steps): 580 | modifier.requires_grad = True 581 | self.activations = dict() 582 | self.activations['value'] = [] 583 | 584 | true_image = torch.clamp(ori_unnorm_videos + torch.clamp(modifier, min=-self.epsilon, max=self.epsilon), min=0, max=1) 585 | 586 | 587 | true_image = self._transform_video_ILAF(true_image, mode='forward') # norm 588 | 589 | step_feature_maps = [] 590 | opt = self.model(true_image) 591 | 592 | for mm in range(len(self.activations['value'])): 593 | activations = self.activations['value'][mm] 594 | # activations = Variable(activations, 
requires_grad=False) 595 | step_feature_maps.append(activations) 596 | 597 | step_directions = [] # normalized direction 598 | step_norms = [] # norm values 599 | for ori_di, adv_di in zip(ori_feature_maps, step_feature_maps): 600 | step_direction = adv_di - ori_di 601 | step_norm = torch.norm(step_direction, p=2) 602 | step_norms.append(step_norm) 603 | step_directions.append(step_direction/torch.norm(step_direction,p=2,keepdim=True)) 604 | 605 | losses = [] 606 | for lens_fm in range(len(step_directions)): 607 | # magnitude 608 | magnitude_gain = step_norms[lens_fm] / init_norms[lens_fm] 609 | # angle 610 | angle_loss = torch.mm(init_directions[lens_fm].view(1,-1), step_directions[lens_fm].view(1,-1).transpose(1,0)) 611 | this_loss = -(0.5 * magnitude_gain + angle_loss) 612 | losses.append(this_loss) 613 | cost = torch.sum(torch.stack(losses)) 614 | 615 | grad = torch.autograd.grad(cost, modifier, 616 | retain_graph=False, create_graph=False)[0] 617 | modifier.data -= self.step_size * grad.sign() 618 | 619 | 620 | for ind,vid_name in enumerate(video_names): 621 | if vid_name not in self.loss_info.keys(): 622 | self.loss_info[vid_name] = {} 623 | self.loss_info[vid_name][i] = {'cost': str(cost.detach().cpu().numpy())} 624 | 625 | true_image = torch.clamp(ori_unnorm_videos + torch.clamp(modifier.data, min=-self.epsilon, max=self.epsilon), min=0, max=1) 626 | image_inps = self._transform_video_ILAF(true_image, mode='forward') 627 | image_inps = image_inps.reshape(b,f,c,h,w) 628 | image_inps = image_inps.permute([0,2,1,3,4]) 629 | return image_inps 630 | -------------------------------------------------------------------------------- /base_attacks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import scipy.stats as st 4 | import numpy as np 5 | import torchvision 6 | from PIL import Image 7 | import random 8 | 9 | from utils import norm_grads 10 | # refer to https://github.com/Harry24k/adversarial-attacks-pytorch 11 | 12 | class Attack(object): 13 | """ 14 | Base class for all attacks. 15 | .. note:: 16 | It automatically set device to the device where given model is. 17 | It temporarily changes the model's training mode to `test` 18 | by `.eval()` only during an attack process. 19 | """ 20 | def __init__(self, name, model): 21 | r""" 22 | Initializes internal attack state. 23 | Arguments: 24 | name (str) : name of an attack. 25 | model (torch.nn.Module): model to attack. 26 | """ 27 | self.attack = name 28 | self.model = model 29 | self.model_name = str(model).split("(")[0] 30 | 31 | self.training = model.training 32 | self.device = next(model.parameters()).device 33 | 34 | self._targeted = 1 35 | self._attack_mode = 'default' 36 | self._return_type = 'float' 37 | self._target_map_function = lambda images, labels:labels 38 | 39 | self.mean = [0.485, 0.456, 0.406] 40 | self.std = [0.229, 0.224, 0.225] 41 | 42 | def forward(self, *input): 43 | r""" 44 | It defines the computation performed at every call (attack forward). 45 | Should be overridden by all subclasses. 46 | """ 47 | raise NotImplementedError 48 | 49 | def set_attack_mode(self, mode, target_map_function=None): 50 | r""" 51 | Set the attack mode. 52 | 53 | Arguments: 54 | mode (str) : 'default' (DEFAULT) 55 | 'targeted' - Use input labels as targeted labels. 56 | 'least_likely' - Use least likely labels as targeted labels. 
57 | 58 | target_map_function (function) : 59 | """ 60 | if self._attack_mode == 'only_default': 61 | raise ValueError("Changing attack mode is not supported in this attack method.") 62 | 63 | if (mode == 'targeted') and (target_map_function is None): 64 | raise ValueError("Please give a target_map_function, e.g., lambda images, labels:(labels+1)%10.") 65 | 66 | if mode=="default": 67 | self._attack_mode = "default" 68 | self._targeted = 1 69 | self._transform_label = self._get_label 70 | elif mode=="targeted": 71 | self._attack_mode = "targeted" 72 | self._targeted = -1 73 | self._target_map_function = target_map_function 74 | self._transform_label = self._get_target_label 75 | elif mode=="least_likely": 76 | self._attack_mode = "least_likely" 77 | self._targeted = -1 78 | self._transform_label = self._get_least_likely_label 79 | else: 80 | raise ValueError(mode + " is not a valid mode. [Options : default, targeted, least_likely]") 81 | 82 | def set_return_type(self, type): 83 | r""" 84 | Set the return type of adversarial images: `int` or `float`. 85 | Arguments: 86 | type (str) : 'float' or 'int'. (DEFAULT : 'float') 87 | """ 88 | if type == 'float': 89 | self._return_type = 'float' 90 | elif type == 'int': 91 | self._return_type = 'int' 92 | else: 93 | raise ValueError(type + " is not a valid type. [Options : float, int]") 94 | 95 | def save(self, save_path, data_loader, verbose=True): 96 | r""" 97 | Save adversarial images as torch.tensor from given torch.utils.data.DataLoader. 98 | Arguments: 99 | save_path (str) : path to save the adversarial examples. 100 | data_loader (torch.utils.data.DataLoader) : data loader. 101 | verbose (bool) : True for displaying detailed information. (DEFAULT : True) 102 | """ 103 | self.model.eval() 104 | 105 | image_list = [] 106 | label_list = [] 107 | 108 | correct = 0 109 | total = 0 110 | 111 | total_batch = len(data_loader) 112 | 113 | for step, (images, labels) in enumerate(data_loader): 114 | adv_images = self.__call__(images, labels) 115 | 116 | image_list.append(adv_images.cpu()) 117 | label_list.append(labels.cpu()) 118 | 119 | if self._return_type == 'int': 120 | adv_images = adv_images.float()/255 121 | 122 | if verbose: 123 | outputs = self.model(adv_images) 124 | _, predicted = torch.max(outputs.data, 1) 125 | total += labels.size(0) 126 | correct += (predicted == labels.to(self.device)).sum() 127 | 128 | acc = 100 * float(correct) / total 129 | print('- Save Progress : %2.2f %% / Accuracy : %2.2f %%' % ((step+1)/total_batch*100, acc), end='\r') 130 | 131 | x = torch.cat(image_list, 0) 132 | y = torch.cat(label_list, 0) 133 | torch.save((x, y), save_path) 134 | print('\n- Save Complete!') 135 | 136 | self._switch_model() 137 | 138 | def _transform_perts(self, perts): 139 | dtype = perts.dtype 140 | mean = torch.as_tensor(self.mean, dtype=dtype, device=self.device) 141 | std = torch.as_tensor(self.std, dtype=dtype, device=self.device) 142 | perts.div_(std[:, None, None, None]) 143 | return perts 144 | 145 | def _transform_video(self, video, mode='forward'): 146 | r''' 147 | Normalize the video with ImageNet mean/std (mode='forward') or map it back to [0, 1] (mode='back') 148 | ''' 149 | dtype = video.dtype 150 | mean = torch.as_tensor(self.mean, dtype=dtype, device=self.device) 151 | std = torch.as_tensor(self.std, dtype=dtype, device=self.device) 152 | if mode == 'forward': 153 | # [-mean/std, mean/std] 154 | video.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) 155 | elif mode == 'back': 156 | # [0, 1] 157 | video.mul_(std[:, None, None, None]).add_(mean[:, None, None, None]) 158 | return video 159 | 160 | def
_transform_label(self, images, labels): 161 | r""" 162 | Function for changing the attack mode. 163 | """ 164 | return labels 165 | 166 | def _get_label(self, images, labels): 167 | r""" 168 | Function for changing the attack mode. 169 | Return input labels. 170 | """ 171 | return labels 172 | 173 | def _get_target_label(self, images, labels): 174 | r""" 175 | Function for changing the attack mode. 176 | Return input labels. 177 | """ 178 | return self._target_map_function(images, labels) 179 | 180 | def _get_least_likely_label(self, images, labels): 181 | r""" 182 | Function for changing the attack mode. 183 | Return least likely labels. 184 | """ 185 | outputs = self.model(images) 186 | _, labels = torch.min(outputs.data, 1) 187 | labels = labels.detach_() 188 | return labels 189 | 190 | def _to_uint(self, images): 191 | r""" 192 | Function for changing the return type. 193 | Return images as int. 194 | """ 195 | return (images*255).type(torch.uint8) 196 | 197 | def _switch_model(self): 198 | r""" 199 | Function for changing the training mode of the model. 200 | """ 201 | if self.training: 202 | self.model.train() 203 | else: 204 | self.model.eval() 205 | 206 | def __str__(self): 207 | info = self.__dict__.copy() 208 | 209 | del_keys = ['model', 'attack'] 210 | 211 | for key in info.keys(): 212 | if key[0] == "_" : 213 | del_keys.append(key) 214 | 215 | for key in del_keys: 216 | del info[key] 217 | 218 | info['attack_mode'] = self._attack_mode 219 | if info['attack_mode'] == 'only_default' : 220 | info['attack_mode'] = 'default' 221 | 222 | info['return_type'] = self._return_type 223 | 224 | return self.attack + "(" + ', '.join('{}={}'.format(key, val) for key, val in info.items()) + ")" 225 | 226 | def __call__(self, *input, **kwargs): 227 | self.model.eval() 228 | images = self.forward(*input, **kwargs) 229 | self._switch_model() 230 | 231 | if self._return_type == 'int': 232 | images = self._to_uint(images) 233 | 234 | return images 235 | 236 | class FGSM(Attack): 237 | '''Fast Gradient Sign Method''' 238 | def __init__(self, model, steps=None, epsilon=16/255): 239 | super(FGSM, self).__init__("FGSM", model) 240 | self.epsilon = epsilon 241 | 242 | def forward(self, videos, labels): 243 | videos = videos.to(self.device) 244 | labels = labels.to(self.device) 245 | loss = nn.CrossEntropyLoss() 246 | 247 | videos.requires_grad = True 248 | outputs = self.model(videos) 249 | cost = self._targeted*loss(outputs, labels).to(self.device) 250 | 251 | grad = torch.autograd.grad(cost, videos, 252 | retain_graph=False, create_graph=False)[0] 253 | 254 | adv_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 255 | adv_videos = adv_videos + self.epsilon*grad.sign() 256 | adv_videos = torch.clamp(adv_videos, min=0, max=1).detach() 257 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 258 | 259 | return adv_videos 260 | 261 | class BIM(Attack): 262 | ''' 263 | Basic Iterative Method 264 | Only iterative version. 265 | ''' 266 | def __init__(self, model, epsilon=16/255, steps=10): 267 | super(BIM, self).__init__("FGSM", model) 268 | self.epsilon = epsilon 269 | self.steps = steps 270 | self.step_size = self.epsilon / self.steps 271 | 272 | def forward(self, videos, labels): 273 | r""" 274 | Overridden. 
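        A minimal usage sketch (hypothetical tensors; assumes `model` is a video classifier
        that takes normalized [N, C, T, H, W] clips, as elsewhere in this repo):

            attack = BIM(model, epsilon=16/255, steps=10)   # step_size = epsilon / steps
            adv_videos = attack(videos, labels)             # __call__ runs forward() with the model in eval()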
275 | """ 276 | videos = videos.to(self.device) 277 | labels = labels.to(self.device) 278 | loss = nn.CrossEntropyLoss() 279 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 280 | adv_videos = videos.clone().detach() 281 | 282 | for i in range(self.steps): 283 | adv_videos.requires_grad = True 284 | outputs = self.model(adv_videos) 285 | cost = self._targeted*loss(outputs, labels).to(self.device) 286 | grad = torch.autograd.grad(cost, adv_videos, 287 | retain_graph=False, create_graph=False)[0] 288 | 289 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 290 | adv_videos = adv_videos + self.step_size*grad.sign() 291 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 292 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 293 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 294 | 295 | return adv_videos 296 | 297 | class MIFGSM(Attack): 298 | ''' 299 | Momentum Iterative Fast Gradient Sign Method 300 | Only iterative version. 301 | ''' 302 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0): 303 | super(MIFGSM, self).__init__("MIFGSM", model) 304 | self.epsilon = epsilon 305 | self.steps = steps 306 | self.step_size = self.epsilon / self.steps 307 | self.decay = decay 308 | 309 | def forward(self, videos, labels): 310 | r""" 311 | Overridden. 312 | """ 313 | videos = videos.to(self.device) 314 | labels = labels.to(self.device) 315 | loss = nn.CrossEntropyLoss() 316 | momentum = torch.zeros_like(videos).to(self.device) 317 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 318 | adv_videos = videos.clone().detach() 319 | 320 | for i in range(self.steps): 321 | adv_videos.requires_grad = True 322 | outputs = self.model(adv_videos) 323 | 324 | cost = self._targeted*loss(outputs, labels).to(self.device) 325 | grad = torch.autograd.grad(cost, adv_videos, 326 | retain_graph=False, create_graph=False)[0] 327 | # frame-level or clip-level 328 | grad = norm_grads(grad, True) 329 | # grad_norm = torch.norm(grad, p=1) 330 | # grad /= grad_norm 331 | grad += momentum*self.decay 332 | momentum = grad 333 | 334 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 335 | adv_videos = adv_videos + self.step_size*grad.sign() 336 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 337 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 338 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 339 | 340 | return adv_videos 341 | 342 | class DIFGSM(Attack): 343 | ''' 344 | Diverse Inputs Method. 345 | Only iterative version. 346 | Contain momentum or no momentum. 
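        With probability 0.5, _input_diversity (below) resizes every frame to a random
        size rnd drawn from [224, 250), zero-pads it back to 250x250 at a random offset,
        and resizes the result to 224x224 before the forward pass; otherwise the clip is
        used unchanged. A worked example of the padding arithmetic (illustrative numbers):

            rnd = 240                   # drawn from torch.randint(224, 250, ...)
            h_rem = w_rem = 250 - rnd   # 10 pixels of padding to distribute per axis
            pad_top, pad_left = 3, 7    # random splits; pad_bottom = 7, pad_right = 3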
347 | ''' 348 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, momentum=False): 349 | super(DIFGSM, self).__init__("DIFGSM", model) 350 | self.epsilon = epsilon 351 | self.steps = steps 352 | self.step_size = self.epsilon / self.steps 353 | self.decay = decay 354 | self.momentum = momentum 355 | 356 | def _input_diversity(self, videos): 357 | # r = torch.randint(1,10, size=(1,1)).item() 358 | # if r <= 5: 359 | if random.random() < 0.5: 360 | return videos 361 | else: 362 | rnd = torch.randint(224,250, size=(1,1)).item() 363 | rescaled = videos.view((-1, ) + videos.shape[2:]) 364 | rescaled = torch.nn.functional.interpolate(rescaled, size=[rnd, rnd], mode='nearest') 365 | # rescaled = torchvision.transforms.functional.resize(videos,[rnd, rnd], Image.NEAREST) 366 | h_rem = 250 - rnd 367 | w_rem = 250 - rnd 368 | pad_top = torch.randint(0, h_rem, size=(1,1)).item() 369 | pad_bottom = h_rem - pad_top 370 | pad_left = torch.randint(0, w_rem, size=(1,1)).item() 371 | pad_right = w_rem - pad_left 372 | padded = nn.functional.pad(rescaled, [pad_left, pad_right, pad_top, pad_bottom]) 373 | # return torchvision.transforms.functional.resize(padded,[224, 224], Image.NEAREST) 374 | padded = torch.nn.functional.interpolate(padded, size=[224, 224], mode='nearest') 375 | padded = padded.view(videos.shape) 376 | return padded 377 | 378 | def forward(self, videos, labels): 379 | r""" 380 | Overridden. 381 | """ 382 | videos = videos.to(self.device) 383 | labels = labels.to(self.device) 384 | loss = nn.CrossEntropyLoss() 385 | momentum = torch.zeros_like(videos).to(self.device) 386 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 387 | adv_videos = videos.clone().detach() 388 | 389 | for i in range(self.steps): 390 | adv_videos.requires_grad = True 391 | outputs = self.model(self._input_diversity(adv_videos)) 392 | 393 | cost = self._targeted*loss(outputs, labels).to(self.device) 394 | grad = torch.autograd.grad(cost, adv_videos, 395 | retain_graph=False, create_graph=False)[0] 396 | 397 | if self.momentum: 398 | grad_norm = torch.norm(grad, p=1) 399 | grad /= grad_norm 400 | grad += momentum*self.decay 401 | momentum = grad 402 | else: 403 | pass 404 | 405 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 406 | adv_videos = adv_videos + self.step_size*grad.sign() 407 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 408 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 409 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 410 | 411 | return adv_videos 412 | 413 | class TIFGSM(Attack): 414 | '''Translation-Invariant Attack''' 415 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, momentum=False): 416 | super(TIFGSM, self).__init__("MIFGSM", model) 417 | self.epsilon = epsilon 418 | self.steps = steps 419 | self.step_size = self.epsilon / self.steps 420 | self.decay = decay 421 | self.momentum = momentum 422 | # generate start_kernel 423 | kernel = self._initial_kernel(15, 3).astype(np.float32) # (15,15) 424 | stack_kernel = np.stack([kernel, kernel, kernel]) # (3,15,15) 425 | self.stack_kernel = torch.from_numpy(np.expand_dims(stack_kernel, 1)).to(self.device) # 3,1,15,15 426 | 427 | def _initial_kernel(self, kernlen, nsig): 428 | x = np.linspace(-nsig, nsig, kernlen) 429 | kern1d = st.norm.pdf(x) 430 | kernel_raw = np.outer(kern1d, kern1d) 431 | kernel = kernel_raw / kernel_raw.sum() 432 | return kernel 433 | 434 | def 
_conv2d_frame(self, grads): 435 | ''' 436 | grads: N, C, T, H, W 437 | ''' 438 | frames = grads.shape[2] 439 | out_grads = torch.zeros_like(grads) 440 | for i in range(frames): 441 | this_grads = grads[:,:,i] 442 | out_grad = nn.functional.conv2d(this_grads, self.stack_kernel, groups=3, stride=1, padding=7) 443 | out_grads[:,:,i] = out_grad 444 | out_grads = out_grads / torch.mean(torch.abs(out_grads), [1,2,3], True) 445 | return out_grads 446 | 447 | def forward(self, videos, labels): 448 | r""" 449 | Overridden. 450 | """ 451 | videos = videos.to(self.device) 452 | labels = labels.to(self.device) 453 | loss = nn.CrossEntropyLoss() 454 | momentum = torch.zeros_like(videos).to(self.device) 455 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 456 | adv_videos = videos.clone().detach() 457 | 458 | for i in range(self.steps): 459 | adv_videos.requires_grad = True 460 | outputs = self.model(adv_videos) 461 | 462 | cost = self._targeted*loss(outputs, labels).to(self.device) 463 | grad = torch.autograd.grad(cost, adv_videos, 464 | retain_graph=False, create_graph=False)[0] 465 | 466 | grad = self._conv2d_frame(grad) 467 | if self.momentum: 468 | grad += momentum*self.decay 469 | momentum = grad 470 | else: 471 | pass 472 | 473 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 474 | adv_videos = adv_videos + self.step_size*grad.sign() 475 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 476 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 477 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 478 | 479 | return adv_videos 480 | 481 | class SGM(Attack): 482 | '''Skip Gradient Method''' 483 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, gamma=0.5, momentum=False): 484 | super(SGM, self).__init__("SGM", model) 485 | self.epsilon = epsilon 486 | self.steps = steps 487 | self.step_size = self.epsilon / self.steps 488 | self.decay = decay 489 | self.momentum = momentum 490 | self.gamma = gamma 491 | 492 | # register model 493 | self._register_hook_for_model(self.model) 494 | 495 | def _register_hook_for_model(self, model): 496 | def backward_hook(gamma): 497 | # implement SGM through grad through ReLU 498 | def _backward_hook(module, grad_in, grad_out): 499 | if isinstance(module, nn.ReLU): 500 | return (gamma * grad_in[0],) 501 | return _backward_hook 502 | 503 | def backward_hook_norm(module, grad_in, grad_out): 504 | # normalize the gradient to avoid gradient explosion or vanish 505 | std = torch.std(grad_in[0]) 506 | return (grad_in[0] / std,) 507 | 508 | backward_hook_sgm = backward_hook(np.power(self.gamma, 0.5)) 509 | for name, module in model.named_modules(): 510 | if 'relu' in name and not '0.relu' in name: 511 | module.register_backward_hook(backward_hook_sgm) 512 | 513 | # e.g., 1.layer1.1, 1.layer4.2, ... 514 | # if len(name.split('.')) == 3: 515 | # refer to https://github.com/csdongxian/skip-connections-matter/issues/3 516 | # if len(name.split('.')) >= 2 and 'layer' in name.split('.')[-2]: 517 | # module.register_backward_hook(backward_hook_norm) 518 | 519 | def forward(self, videos, labels): 520 | r""" 521 | Overridden. 
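        The gradient used here is shaped by _register_hook_for_model: every module whose
        name contains 'relu' (except '0.relu') gets a backward hook that rescales its
        incoming gradient by gamma ** 0.5, so with the default gamma=0.5 the gradient
        flowing through residual branches is damped and the skip connections dominate.
        A hedged sketch of the same hook on a generic nn.ReLU (illustrative only):

            def scale_relu_grad(module, grad_in, grad_out, scale=0.5 ** 0.5):
                if isinstance(module, nn.ReLU):
                    return (scale * grad_in[0],)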
522 | """ 523 | videos = videos.to(self.device) 524 | labels = labels.to(self.device) 525 | loss = nn.CrossEntropyLoss() 526 | momentum = torch.zeros_like(videos).to(self.device) 527 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 528 | adv_videos = videos.clone().detach() 529 | 530 | for i in range(self.steps): 531 | adv_videos.requires_grad = True 532 | outputs = self.model(adv_videos) 533 | 534 | cost = self._targeted*loss(outputs, labels).to(self.device) 535 | grad = torch.autograd.grad(cost, adv_videos, 536 | retain_graph=False, create_graph=False)[0] 537 | 538 | if self.momentum: 539 | grad_norm = torch.norm(grad, p=1) 540 | grad /= grad_norm 541 | grad += momentum*self.decay 542 | momentum = grad 543 | else: 544 | pass 545 | 546 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 547 | adv_videos = adv_videos + self.step_size*grad.sign() 548 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 549 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 550 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 551 | return adv_videos 552 | 553 | class SIM(Attack): 554 | '''Scale-Invariant Attack Method''' 555 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, sclae_step=5, momentum=False): 556 | super(SIM, self).__init__("SIM", model) 557 | self.epsilon = epsilon 558 | self.steps = steps 559 | self.step_size = self.epsilon / self.steps 560 | self.decay = decay 561 | self.momentum = momentum 562 | self.sclae_step = sclae_step 563 | 564 | def _multi_scale(self, adv_videos, labels, loss): 565 | def obtain_grad(vid, labels): 566 | vid.requires_grad = True 567 | outputs = self.model(vid) 568 | cost = self._targeted*loss(outputs, labels).to(self.device) 569 | grad = torch.autograd.grad(cost, vid, 570 | retain_graph=False, create_graph=False)[0] 571 | return grad 572 | 573 | mean_grad = None 574 | for i in range(self.sclae_step): 575 | tmp_videos = 1 / 2**i * adv_videos 576 | grad = obtain_grad(tmp_videos, labels) 577 | if mean_grad is None: 578 | mean_grad = grad 579 | else: 580 | mean_grad += grad 581 | return mean_grad / self.sclae_step 582 | 583 | def forward(self, videos, labels): 584 | r""" 585 | Overridden. 
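        The gradient at each step comes from _multi_scale, which averages the gradients of
        scaled copies of the current input, adv_videos / 2**i for i = 0, ..., 4 with the
        default scale count of 5, before the usual signed step. Equivalent sketch
        (grad_of_loss is a hypothetical helper, not part of this repo):

            grads = [grad_of_loss(adv_videos / 2**i, labels) for i in range(5)]
            grad = sum(grads) / 5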
586 | """ 587 | videos = videos.to(self.device) 588 | labels = labels.to(self.device) 589 | loss = nn.CrossEntropyLoss() 590 | momentum = torch.zeros_like(videos).to(self.device) 591 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 592 | adv_videos = videos.clone().detach() 593 | 594 | for i in range(self.steps): 595 | grad = self._multi_scale(adv_videos, labels, loss) 596 | 597 | if self.momentum: 598 | grad_norm = torch.norm(grad, p=1) 599 | grad /= grad_norm 600 | grad += momentum*self.decay 601 | momentum = grad 602 | else: 603 | pass 604 | 605 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 606 | adv_videos = adv_videos + self.step_size*grad.sign() 607 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 608 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 609 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 610 | return adv_videos 611 | 612 | class TIFGSM3D(Attack): 613 | '''Translation-Invariant Attack''' 614 | def __init__(self, model, epsilon=16/255, steps=10, decay=1.0, momentum=False): 615 | super(TIFGSM3D, self).__init__("TIFGSM3D", model) 616 | self.epsilon = epsilon 617 | self.steps = steps 618 | self.step_size = self.epsilon / self.steps 619 | self.decay = decay 620 | self.momentum = momentum 621 | # generate start_kernel 622 | kernel = self._initial_kernel(15, 3).astype(np.float32) # (15,15,15) 623 | stack_kernel = np.stack([kernel, kernel, kernel]) # (3,15,15,15) 624 | self.stack_kernel = torch.from_numpy(np.expand_dims(stack_kernel, 1)).to(self.device) # 3,1,15,15,15 625 | 626 | def _initial_kernel(self, kernlen, nsig): 627 | x = np.linspace(-nsig, nsig, kernlen) 628 | kern1d = st.norm.pdf(x) 629 | kernel_raw = np.outer(kern1d, kern1d) 630 | used_kernel = np.zeros((kernlen, kernlen, kernlen)) 631 | for i in range(kern1d.shape[0]): 632 | used_kernel[i] = kern1d[i] * kernel_raw 633 | used_kernel = used_kernel / used_kernel.sum() 634 | return used_kernel 635 | 636 | def _conv3d_frame(self, grads): 637 | ''' 638 | grads: N, C, T, H, W 639 | ''' 640 | out_grads = nn.functional.conv3d(grads, self.stack_kernel, groups=3, stride=1, padding=7) 641 | # frames = grads.shape[2] 642 | # out_grads = torch.zeros_like(grads) 643 | 644 | # for i in range(frames): 645 | # this_grads = grads[:,:,i] 646 | # out_grad = nn.functional.conv2d(this_grads, self.stack_kernel, groups=3, stride=1, padding=7) 647 | # out_grads[:,:,i] = out_grad 648 | out_grads = norm_grads(out_grads, True) 649 | return out_grads 650 | 651 | def forward(self, videos, labels): 652 | r""" 653 | Overridden. 
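        Before the sign step, the raw gradient is smoothed over (T, H, W) by _conv3d_frame:
        a depthwise conv3d (groups=3, padding=7) with the normalized 15x15x15 Gaussian-like
        kernel built in _initial_kernel, followed by norm_grads. Shape sketch (illustrative
        sizes only):

            grad [N, 3, 32, 224, 224] --conv3d, kernel [3, 1, 15, 15, 15]--> [N, 3, 32, 224, 224]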
654 | """ 655 | videos = videos.to(self.device) 656 | labels = labels.to(self.device) 657 | loss = nn.CrossEntropyLoss() 658 | momentum = torch.zeros_like(videos).to(self.device) 659 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 660 | adv_videos = videos.clone().detach() 661 | 662 | for i in range(self.steps): 663 | adv_videos.requires_grad = True 664 | outputs = self.model(adv_videos) 665 | 666 | cost = self._targeted*loss(outputs, labels).to(self.device) 667 | grad = torch.autograd.grad(cost, adv_videos, 668 | retain_graph=False, create_graph=False)[0] 669 | 670 | grad = self._conv3d_frame(grad) 671 | if self.momentum: 672 | grad += momentum*self.decay 673 | momentum = grad 674 | else: 675 | pass 676 | 677 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 678 | adv_videos = adv_videos + self.step_size*grad.sign() 679 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 680 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 681 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 682 | 683 | return adv_videos 684 | 685 | class TAP(Attack): 686 | '''Transferable Adversarial Perturbations 687 | params = { 688 | 'kernlen': 3, 689 | 'temporal_kernlen':3, 690 | 'eta': 1e3, 691 | 'conv3d': True 692 | } 693 | ''' 694 | def __init__(self, model, params, epsilon=16/255, steps=10): 695 | super(TAP, self).__init__("TAP", model) 696 | self.epsilon = epsilon 697 | self.steps = steps 698 | self.step_size = self.epsilon / self.steps 699 | 700 | for name, value in params.items(): 701 | setattr(self, name, value) 702 | 703 | kernel = self._initial_kernel_uniform(self.kernlen).astype(np.float32) # (3,3) 704 | stack_kernel = np.stack([kernel, kernel, kernel]) # (3,3,3) 705 | self.stack_2d_kernel = torch.from_numpy(np.expand_dims(stack_kernel, 1)).to(self.device) # 3,1,3,3 706 | 707 | kernel_3d = self._initial_kernel_uniform_3d(self.kernlen, self.temporal_kernlen) # [t,h,h] 708 | stack_kernel_3d = np.stack([kernel_3d, kernel_3d, kernel_3d]) # (3,t,h,h) 709 | self.stack_3d_kernel = torch.from_numpy(np.expand_dims(stack_kernel_3d, 1)).to(self.device) # 3,1,t,h,h 710 | 711 | self._activation_hook() 712 | 713 | def _initial_kernel_uniform(self, kernlen): 714 | kern1d = np.ones(kernlen) 715 | kernel_raw = np.outer(kern1d, kern1d) 716 | kernel = kernel_raw / kernel_raw.sum() 717 | return kernel 718 | 719 | def _initial_kernel_uniform_3d(self, kernlen, temporal_kernel): 720 | kern3d = np.ones((temporal_kernel, kernlen, kernlen)) 721 | kern3d = kern3d / kern3d.sum() 722 | return kern3d 723 | 724 | def _conv2d_frames(self, perts): 725 | frames = perts.shape[2] 726 | out_perts = torch.zeros_like(perts) 727 | for i in range(frames): 728 | this_perts = perts[:,:,i] 729 | out_pert = nn.functional.conv2d(this_perts, self.stack_2d_kernel, groups=3, stride=1, padding=[int((self.kernlen-1)/2), int((self.kernlen-1)/2)]) 730 | out_perts[:,:,i] = out_pert 731 | return torch.sum(torch.abs(out_perts)) 732 | 733 | def _conv3d_frames(self, perts): 734 | out_perts = nn.functional.conv3d(perts, self.stack_3d_kernel, groups=3, stride=1, padding=[int((self.temporal_kernlen-1)/2), int((self.kernlen-1)/2), int((self.kernlen-1)/2)]) 735 | return torch.sum(torch.abs(out_perts)) 736 | 737 | def _find_target_layer(self): 738 | if 'i3d' in self.model_type: 739 | return [self.model.res_layers._modules['0'], self.model.res_layers._modules['1']] 740 | elif 'slowfast' in self.model_type: 741 | return 
[self.model._modules['slow_res2'], self.model._modules['slow_res3'], self.model._modules['fast_res2'], self.model._modules['fast_res3']] #[b,2048, 8, 7, 7], [b, 256, 32, 7, 7] 742 | elif 'tpn' in self.model_type: 743 | return [self.model.layer1, self.model.layer2] 744 | 745 | def _activation_hook(self): 746 | self.activations = dict() 747 | self.activations['value'] = [] 748 | def forward_hook(module, input, output): 749 | self.activations['value'] += [output] 750 | return None 751 | target_layer = self._find_target_layer() 752 | if isinstance(target_layer, list): 753 | for i in target_layer: 754 | i.register_forward_hook(forward_hook) 755 | else: 756 | target_layer.register_forward_hook(forward_hook) 757 | 758 | def forward(self, videos, labels): 759 | r""" 760 | Overridden. 761 | """ 762 | batch_size = videos.shape[0] 763 | self.loss_info = {} 764 | self.stack_3d_kernel = self.stack_3d_kernel.type(videos.dtype) 765 | videos = videos.to(self.device) 766 | labels = labels.to(self.device) 767 | 768 | self.activations = dict() 769 | self.activations['value'] = [] 770 | outputs = self.model(videos) 771 | ori_feature_map = self.activations['value'] 772 | 773 | loss = nn.CrossEntropyLoss() 774 | unnorm_videos = self._transform_video(videos.clone().detach(), mode='back') # [0, 1] 775 | adv_videos = videos.clone().detach() 776 | 777 | for i in range(self.steps): 778 | self.activations = dict() 779 | self.activations['value'] = [] 780 | adv_videos.requires_grad = True 781 | outputs = self.model(adv_videos) 782 | 783 | # CE loss 784 | cost1 = self._targeted*loss(outputs, labels).to(self.device) 785 | 786 | # l2 distance in the sign-sqrt feature space 787 | # this_feature_map = self._feature_map(adv_videos, True, False, labels) 788 | feat_distance = [] 789 | for adv_fm, ori_fm in zip(self.activations['value'], ori_feature_map): # named to avoid shadowing the step index i 790 | this_distance = torch.norm((torch.sign(adv_fm) * torch.sqrt(torch.abs(adv_fm))).reshape(batch_size, -1) - (torch.sign(ori_fm) * torch.sqrt(torch.abs(ori_fm))).reshape(batch_size, -1), p=2, dim=1) 791 | feat_distance.append(this_distance) 792 | cost2 = torch.sum(torch.stack(feat_distance), 0) 793 | 794 | # smoothness regularization on the perturbation 795 | perts = self._transform_perts(adv_videos - videos).to(self.device) 796 | if self.conv3d: 797 | reg_cost = self._conv3d_frames(perts) 798 | else: 799 | reg_cost = self._conv2d_frames(perts) 800 | 801 | cost = cost1 + 1e3 * reg_cost + 0.05 * cost2.sum() # sum over the batch so the cost stays a scalar for autograd.grad 802 | 803 | grad = torch.autograd.grad(cost, adv_videos, 804 | retain_graph=False, create_graph=False)[0] 805 | 806 | adv_videos = self._transform_video(adv_videos.detach(), mode='back') # [0, 1] 807 | adv_videos = adv_videos + self.step_size*grad.sign() 808 | delta = torch.clamp(adv_videos - unnorm_videos, min=-self.epsilon, max=self.epsilon) 809 | adv_videos = torch.clamp(unnorm_videos + delta, min=0, max=1).detach() 810 | adv_videos = self._transform_video(adv_videos, mode='forward') # norm 811 | self.loss_info[i] = {'ce loss': cost1.detach().cpu().numpy(), 812 | 'reg_cost': reg_cost.detach().cpu().numpy(), 813 | 'distance': cost2.detach().cpu().numpy()} 814 | return adv_videos 815 | 816 | --------------------------------------------------------------------------------
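A minimal end-to-end sketch of how the attacks in base_attacks.py can be driven. It is not part of the repository: the torchvision R3D-18 model and the random clip below merely stand in for the gluoncv Kinetics-400 models and the dataloaders that utils.py and the attack scripts normally provide, and running it requires the repo's utils.py (and hence gluoncv) to be importable.

import torch
import torchvision

from base_attacks import MIFGSM

# Any video classifier mapping [N, C, T, H, W] clips to logits works here.
model = torchvision.models.video.r3d_18().cuda().eval()

# Random clip in [0, 1], then normalized with the ImageNet statistics the Attack base class assumes.
videos = torch.rand(1, 3, 16, 112, 112, device='cuda')
mean = torch.tensor([0.485, 0.456, 0.406], device='cuda')[None, :, None, None, None]
std = torch.tensor([0.229, 0.224, 0.225], device='cuda')[None, :, None, None, None]
videos = (videos - mean) / std
labels = torch.tensor([0], device='cuda')

attack = MIFGSM(model, epsilon=16/255, steps=10, decay=1.0)
adv_videos = attack(videos, labels)   # __call__ puts the model in eval() and runs forward()
print(adv_videos.shape)               # same shape as the input, still normalized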
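For reference, the feature-space objective optimized by ILAF in video_attacks.py above can be restated as a standalone function: for every hooked feature map it rewards enlarging the feature shift relative to the initial adversarial example (magnitude gain) while keeping that shift aligned with the initial perturbation direction (cosine term). The function below is a hedged restatement for illustration; its name and signature are not from the repo.

import torch

def ilaf_loss(ori_feats, init_adv_feats, step_feats, magnitude_weight=0.5):
    """All arguments are lists of feature tensors taken from the same hooked layers."""
    losses = []
    for f_ori, f_init, f_step in zip(ori_feats, init_adv_feats, step_feats):
        init_dir = (f_init - f_ori).flatten()
        step_dir = (f_step - f_ori).flatten()
        init_norm, step_norm = init_dir.norm(p=2), step_dir.norm(p=2)
        magnitude_gain = step_norm / init_norm                          # grow the feature-space shift
        angle = torch.dot(init_dir / init_norm, step_dir / step_norm)   # stay on the initial direction
        losses.append(-(magnitude_weight * magnitude_gain + angle))     # minimized by descent on the modifier
    return torch.stack(losses).sum()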