├── .gitignore ├── 3d_net_visualization.py ├── README.md ├── action_feature_visualization.py ├── action_recognition.py ├── main.py ├── net ├── c3d.py ├── i3d.py ├── i3dpt_origin.py ├── mfnet_3d.py ├── model.py ├── mp_i3d.py └── r3d.py ├── output ├── imgs │ └── 79 │ │ ├── focusmap_000.png │ │ ├── focusmap_001.png │ │ ├── focusmap_002.png │ │ ├── focusmap_003.png │ │ ├── focusmap_004.png │ │ ├── focusmap_005.png │ │ ├── focusmap_006.png │ │ ├── heatmap_000.png │ │ ├── heatmap_001.png │ │ ├── heatmap_002.png │ │ ├── heatmap_003.png │ │ ├── heatmap_004.png │ │ ├── heatmap_005.png │ │ └── heatmap_006.png ├── ucf101_test_1 │ └── 0 │ │ ├── focusmap │ │ ├── 000.png │ │ ├── 001.png │ │ ├── 002.png │ │ ├── 003.png │ │ ├── 004.png │ │ ├── 005.png │ │ ├── 006.png │ │ ├── 007.png │ │ ├── 008.png │ │ ├── 009.png │ │ ├── 010.png │ │ ├── 011.png │ │ ├── 012.png │ │ ├── 013.png │ │ ├── 014.png │ │ └── 015.png │ │ ├── heatmap │ │ ├── 000.png │ │ ├── 001.png │ │ ├── 002.png │ │ ├── 003.png │ │ ├── 004.png │ │ ├── 005.png │ │ ├── 006.png │ │ ├── 007.png │ │ ├── 008.png │ │ ├── 009.png │ │ ├── 010.png │ │ ├── 011.png │ │ ├── 012.png │ │ ├── 013.png │ │ ├── 014.png │ │ └── 015.png │ │ └── info.txt └── video │ ├── label_0.mp4 │ ├── label_28.mp4 │ └── label_471.mp4 ├── process_all_hmdb51_videos.py ├── resources ├── HMDB_snapshot1.png ├── HMDB_snapshot2.png ├── classInd.txt ├── focusimg_1.png ├── heatmap_000.png ├── heatmap_000_sc.png ├── heatmap_003.png ├── heatmap_003_sc.png ├── heatmap_007.png ├── heatmap_007_sc.png ├── heatmap_1.png ├── hmdb51_classInd.txt ├── supervised.gif └── unsupervised.gif ├── scripts ├── c3d_unsupervised_demo.sh ├── demo.sh ├── i3d_demo.sh ├── i3d_mixup_demo.sh ├── i3d_rotate_demo.sh ├── i3d_unsupervised_demo.sh ├── mpi3d_demo.sh └── r3d_unsupervised_demo.sh ├── test_videos ├── 50_FIRST_DATES_drink_u_nm_np1_fr_goo_29.mp4 ├── BASE_Jumping_Compilation_-_Brilliant_dive_f_cm_np1_le_bad_3.mp4 ├── BaseballSwingAnalysis_swing_baseball_u_nm_np1_ba_med_0.mp4 ├── Bodenturnen_im_sportunterricht_handstand_f_cm_np1_le_med_1.mp4 ├── Bruno_Walks_up_Stairs_-_Chicago_Dog_Training_-_We_can_teach_ANYTHING_to_a_dog!!!!_climb_stairs_f_cm_np1_fr_med_0.mp4 ├── DefensivePistolShootingTechniques_shoot_gun_f_nm_np1_fr_med_3.mp4 ├── Documentario_Le_Parkour_Londrina_jump_f_cm_np1_ri_bad_6.mp4 ├── v_ApplyEyeMakeup_g01_c01.avi └── v_HeadMassage_g02_c05.avi ├── util.py └── utils ├── gen_new_video.py ├── gen_rotation_data.py └── video_cat.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Example user template template 3 | ### Example user template 4 | 5 | # IntelliJ project files 6 | .idea 7 | *.iml 8 | out 9 | gen 10 | pretrained_model 11 | /output/concat_videos/ 12 | /test_videos/bk/ 13 | /output/hmdb51_all_videos/ 14 | /output/ucf101_test_1/ 15 | /output/video/MultiMedia/ 16 | /output/video/BK/ 17 | /output/imgs/28/ 18 | -------------------------------------------------------------------------------- /3d_net_visualization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-03-17 10:09 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : Action_Video_Visualization 8 | # @File : 3d_net_visualization.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import os 14 | 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the 
warning 16 | os.environ["CUDA_VISIBLE_DEVICES"]='3' 17 | import os 18 | import cv2 19 | import torch 20 | import argparse 21 | import numpy as np 22 | from scipy.ndimage import zoom 23 | from net.mfnet_3d import MFNET_3D 24 | from util import load_images 25 | 26 | 27 | def parse_args(): 28 | parser = argparse.ArgumentParser(description='mfnet-base-parser') 29 | parser.add_argument("--num_classes", type=int, default=101) 30 | parser.add_argument("--model_weights", type=str, default='pretrained_model/MFNet3D_UCF-101_Split-1_96.3.pth') 31 | parser.add_argument("--frame_dir", type=str, default='test_videos/ucf101_test_1') 32 | parser.add_argument("--label", type=int, default=0) 33 | parser.add_argument("--base_output_dir", type=str, default="output") 34 | return parser.parse_args() 35 | args = parse_args() 36 | 37 | 38 | def load_model(): 39 | model_ft = MFNET_3D(args.num_classes) 40 | model_ft = torch.nn.DataParallel(model_ft).cuda() 41 | checkpoint = torch.load(args.model_weights) 42 | model_ft.load_state_dict(checkpoint['state_dict']) 43 | model_ft.cuda() 44 | model_ft.eval() 45 | return model_ft 46 | 47 | 48 | def split_imgs(): 49 | frame_names = os.listdir(args.frame_dir) 50 | frame_indices = list(np.linspace(0, len(frame_names) - 1, num=16, dtype=np.int)) 51 | selected_frames = [frame_names[i] for i in frame_indices] 52 | 53 | RGB_vid, vid = load_images(args.frame_dir, selected_frames) 54 | return RGB_vid, vid 55 | 56 | 57 | def cam_calculate(model_ft, vid): 58 | # get predictions, last convolution output and the weights of the prediction layer 59 | # i3d is two layer fc, need to modify here 60 | predictions, layerout = model_ft(torch.tensor(vid).cuda()) # 1x101 61 | layerout = torch.tensor(layerout[0].numpy().transpose(1, 2, 3, 0)) #8x7x7x768 62 | pred_weights = model_ft.module.classifier.weight.data.detach().cpu().numpy().transpose() # 768 x 101 63 | pred = torch.argmax(predictions).item() 64 | cam = np.zeros(dtype = np.float32, shape = layerout.shape[0:3]) 65 | for i, w in enumerate(pred_weights[:, args.label]): 66 | #i = 0, w:101 67 | # Compute cam for every kernel 68 | cam += w * layerout[:, :, :, i] # 8x7x7 69 | 70 | # Resize CAM to frame level 71 | cam = zoom(cam, (2, 32, 32)) # output map is 8x7x7, so multiply to get to 16x224x224 (original image size) 72 | 73 | # normalize 74 | cam -= np.min(cam) 75 | cam /= np.max(cam) - np.min(cam) 76 | return cam, pred 77 | 78 | 79 | def save_imgs(cam, pred, RGB_vid): 80 | # make dirs and filenames 81 | example_name = os.path.basename(args.frame_dir) 82 | heatmap_dir = os.path.join(args.base_output_dir, example_name, str(args.label), "heatmap") 83 | focusmap_dir = os.path.join(args.base_output_dir, example_name, str(args.label), "focusmap") 84 | for d in [heatmap_dir, focusmap_dir]: 85 | if not os.path.exists(d): 86 | os.makedirs(d) 87 | 88 | file = open(os.path.join(args.base_output_dir, example_name, str(args.label), "info.txt"), "a") 89 | file.write("Visualizing for class {}\n".format(args.label)) 90 | file.write("Predicted class {}\n".format(pred)) 91 | file.close() 92 | 93 | # produce heatmap and focusmap for every frame and activation map 94 | for i in range(0, cam.shape[0]): 95 | # Create colourmap 96 | # COLORMAP_AUTUMN = 0, 97 | # COLORMAP_BONE = 1, 98 | # COLORMAP_JET = 2, 99 | # COLORMAP_WINTER = 3, 100 | # COLORMAP_RAINBOW = 4, 101 | # COLORMAP_OCEAN = 5, 102 | # COLORMAP_SUMMER = 6, 103 | # COLORMAP_SPRING = 7, 104 | # COLORMAP_COOL = 8, 105 | # COLORMAP_HSV = 9, 106 | # COLORMAP_PINK = 10, 107 | # COLORMAP_HOT = 11 108 | 109 | 
heatmap = cv2.applyColorMap(np.uint8(255 * cam[i]), cv2.COLORMAP_WINTER) 110 | # Create focus map 111 | focusmap = np.uint8(255 * cam[i]) 112 | focusmap = cv2.normalize(cam[i], dst=focusmap, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1) 113 | 114 | # Create frame with heatmap 115 | heatframe = heatmap // 2 + RGB_vid[0][i] // 2 116 | cv2.imwrite(os.path.join(heatmap_dir, '{:03d}.png'.format(i)), heatframe) 117 | 118 | # Create frame with focus map in the alpha channel 119 | focusframe = RGB_vid[0][i] 120 | focusframe = cv2.cvtColor(np.uint8(focusframe), cv2.COLOR_BGR2BGRA) 121 | focusframe[:, :, 3] = focusmap 122 | cv2.imwrite(os.path.join(focusmap_dir, '{:03d}.png'.format(i)), focusframe) 123 | 124 | 125 | def main(): 126 | global args 127 | mfnet = load_model() 128 | RGB_vid, vid = split_imgs() 129 | cam, pred = cam_calculate(mfnet, vid) 130 | save_imgs(cam, pred, RGB_vid) 131 | 132 | 133 | if __name__ == '__main__': 134 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 3D Net Visualization Tools (PyTorch) 2 | 3 | ## Demo 4 | 5 | **This project shows which space-time regions a 3D model focuses on, 6 | in either a supervised or an unsupervised (no label available) setting. For an input video, 7 | it renders the attention map on both the video and the individual frames.** 8 | 9 | ### saved video 10 | 11 | Videos cannot be shown here, so here are a few GIFs. 12 | 13 | **supervised with label** 14 | 15 | ![gif](https://github.com/FingerRec/3DNet_Visualization/raw/master/resources/supervised.gif) 16 | 17 | **unsupervised (only the RGB video is available)** 18 | 19 | ![gif_2](https://github.com/FingerRec/3DNet_Visualization/raw/master/resources/unsupervised.gif) 20 | 21 | 22 | ### saved img 23 | 24 | **heatmap** 25 | 26 | ![heatmap_image](https://github.com/FingerRec/3DNet_Visualization/raw/master/resources/heatmap_1.png) 27 | 28 | **focus map** 29 | 30 | ![focus_image](https://github.com/FingerRec/3DNet_Visualization/raw/master/resources/focusimg_1.png) 31 | 32 | ### feature map average (without label) 33 | In some cases the true label of the video/action is not available. We then average all filters 34 | and visualize the resulting heatmap.
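A minimal sketch of this label-free heatmap (assumptions: `layerout` is the last conv feature map of one clip as a NumPy array of shape T x H x W x C, e.g. 8x7x7x1024, and the helper name `unsupervised_cam` is ours); it mirrors the channel-averaging idea used by `generate_unsupervised_cam` in action_recognition.py:

```python
import numpy as np
from scipy.ndimage import zoom


def unsupervised_cam(layerout, frames=16, size=224):
    """Average all channels of a T x H x W x C feature map into a frames x size x size heatmap."""
    cam = layerout.mean(axis=-1)                  # T x H x W; summing or averaging only differ by a scale
    cam = zoom(cam, (frames // cam.shape[0],      # e.g. 8x7x7 -> 16x224x224
                     size // cam.shape[1],
                     size // cam.shape[2]))
    cam -= cam.min()                              # min-max normalize to [0, 1]
    cam /= cam.max() - cam.min() + 1e-8
    return cam


# usage sketch: cam = unsupervised_cam(np.random.rand(8, 7, 7, 1024))
```

The final min-max normalization is what makes the later `cv2.applyColorMap(np.uint8(255 * cam[i]), ...)` call meaningful, since the colormap expects values in [0, 255].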
35 | 36 | ![average feature map (scratch)](https://github.com/FingerRec/3DNet_Visualization/raw/master/resources/heatmap_000_sc.png) 37 | ![average feature map (supervised)](https://github.com/FingerRec/3DNet_Visualization/raw/master/resources/heatmap_000.png) 38 | 39 | 40 | 41 | ## Requirements 42 | - pytorch 0.4 43 | - opencv 44 | - numpy 45 | - skvideo 46 | - ffmpeg 47 | 48 | ## Run 49 | ### 1. Create the pretrained_model dir 50 | ```bash 51 | git clone https://github.com/FingerRec/3DNet_Visualization.git 52 | cd 3DNet_Visualization 53 | mkdir pretrained_model 54 | ``` 55 | 56 | ### 2. Download a pretrained model and put it into the pretrained_model dir 57 | 58 | #### MF-Net 59 | Download MFNet pretrained on UCF101 from [google_drive](https://goo.gl/mML2gv) and put it into the pretrained_model directory; 60 | the weights come from [MFNet](https://github.com/cypw/PyTorch-MFNet). 61 | #### I3D 62 | [google_drive](https://drive.google.com/open?id=1feHEql9XhoV2pwXb5dTs4TFuaqsa1ajX) 63 | 64 | #### R3D 65 | 66 | [r3d](https://drive.google.com/file/d/1H52vT1T0sl7iWA7Up8wu1rSMFzgdwGZG/view?usp=sharing) 67 | 68 | The R3D pretrained model is from [3D-Resnet-Pytorch](https://github.com/kenshohara/3D-ResNets-PyTorch). 69 | 70 | #### C3D 71 | 72 | [C3D](https://drive.google.com/file/d/19NWziHWh1LgCcHU34geoKwYezAogv9fX/view?usp=sharing) 73 | 74 | The C3D pretrained model is from [C3D-Pytorch](https://github.com/jfzhang95/pytorch-video-recognition). 75 | 76 | ### 3. Run the demo 77 | 78 | I3D pretrained on HMDB51: 79 | ```bash 80 | bash scripts/demo.sh 81 | ``` 82 | #### c3d 83 | ```bash 84 | bash scripts/c3d_unsupervised_demo.sh 85 | ``` 86 | 87 | #### r3d 88 | ```bash 89 | bash scripts/r3d_unsupervised_demo.sh 90 | ``` 91 | 92 | The generated videos and images are written to output/video and output/imgs. 93 | 94 | Tip: in main.py, setting clip_steps to 1 generates a video of the same length as the original. 95 | 96 | ### 4. Test your own video 97 | 98 | The relevant part of demo.sh is shown below; change --video and --label according to your video, and refer to resources/classInd.txt for the label indices of UCF101 videos. 99 | 100 | ```bash 101 | python main.py --num_classes 101 \ 102 | --classes_list resources/classInd.txt \ 103 | --model_weights pretrained_model/MFNet3D_UCF-101_Split-1_96.3.pth \ 104 | --video test_videos/[your own video here] \ 105 | --frames_num 16 --label 0 --clip_steps 16 \ 106 | --output_dir output \ 107 | --supervised unsupervised # keep this line only when no label is available; remove it for supervised visualization 108 | 109 | ``` 110 | 111 | **Note: for unsupervised computation, the only change is adding --supervised unsupervised to the script.** 112 | 113 | 114 | Tip: UCF101/HMDB51 are supported out of the box; for Kinetics et al., just download a pretrained model and change --classes_list. 115 | 116 | ## To Do List 117 | - [X] support i3d, mpi3d 118 | - [X] support multiple fc layers and fully convolutional networks 119 | - [X] support feature map average without label 120 | - [X] support r3d and c3d 121 | - [ ] support Slow-Fast Net 122 | - [ ] visualize filters 123 | - [ ] grad-cam 124 | 125 | ## More information 126 | 127 | To support your own network: 128 | 129 | > 1. prepare a pretrained model; 2. update load_model() in main.py; 3. 
modify last linear layer name in generate_supervised_cam in action_recognition.py 130 | 131 | **Notice C3D and R3D are pretrained on Sports/Kinetics, for better visualization, you may need to finetune these networks on UCF/HMDB as in [RHE](https://github.com/FingerRec/RHE)** 132 | 133 | 134 | ## Acknowledgment 135 | This project is highly based on [SaliencyTubes](https://github.com/alexandrosstergiou/Saliency-Tubes-Visual-Explanations-for-Spatio-Temporal-Convolutions) 136 | , [MF-Net](https://github.com/cypw/PyTorch-MFNet) and [st-gcn](https://github.com/yysijie/st-gcn). -------------------------------------------------------------------------------- /action_feature_visualization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-03-16 22:47 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : Action_Video_Visualization 8 | # @File : action_feature_visualization.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | import cv2 16 | from util import * 17 | import numpy as np 18 | 19 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 20 | #i want to do a show video such as openpose, show detection video and label real time, plot the weights of the prediction layer 21 | #plot heat map as well 22 | #this work may be finished in 2-3 days. 23 | 24 | 25 | class Visualization(object): 26 | def __init__(self): 27 | return 28 | 29 | def gen_heatmap(self, cam, frame): 30 | """ 31 | geneate headmap and focus map from images 32 | :return: 33 | """ 34 | # produce heatmap and focusmap for every frame and activation map 35 | # cam:16x224x224x3 frame:1x3x16x224x224 36 | # Create colourmap 37 | # COLORMAP_AUTUMN = 0, 38 | # COLORMAP_BONE = 1, 39 | # COLORMAP_JET = 2, 40 | # COLORMAP_WINTER = 3, 41 | # COLORMAP_RAINBOW = 4, 42 | # COLORMAP_OCEAN = 5, 43 | # COLORMAP_SUMMER = 6, 44 | # COLORMAP_SPRING = 7, 45 | # COLORMAP_COOL = 8, 46 | # COLORMAP_HSV = 9, 47 | # COLORMAP_PINK = 10, 48 | # COLORMAP_HOT = 11 49 | for i in range(cam.shape[0]): 50 | # Create colourmap 51 | heatmap = cv2.applyColorMap(np.uint8(255 * cam[i]), cv2.COLORMAP_JET) # for COLORMAP 5/8 52 | # heatmap = cv2.applyColorMap(np.uint8(255 * cam[i]), cv2.COLORMAP_COOL) 53 | # Create focus map 54 | focusmap = np.uint8(255 * cam[i]) 55 | focusmap = cv2.normalize(cam[i], dst=focusmap, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, 56 | dtype=cv2.CV_8UC1) 57 | # Create frame with heatmap 58 | heatframe = heatmap // 2 + frame[0][i] // 2 59 | # Create frame with focus map in the alpha channel 60 | focusframe = frame[0][i] 61 | # focusframe = cv2.cvtColor(np.uint8(focusframe), cv2.COLOR_BGR2BGRA) 62 | # focusframe[:, :, 3] = focusmap 63 | # focusmap = cv2.blur(focusmap, (30,30)) 64 | alpha = focusmap 65 | focusframe = np.dstack((focusframe, alpha)) 66 | return heatframe, focusframe 67 | 68 | @staticmethod 69 | def gen_mask_img(origin_img, heat_map, pred_top3, prob_top3, label, classes_list, text=True): 70 | """ 71 | a img will be divide into four parts, origin images, activation_map, heatmap, focusmap 72 | and add text into them 73 | may be want to visulization these filters, do it later 74 | :return: 75 | """ 76 | h, w, c = origin_img.shape 77 | assert h >= 224 and w >= 224 78 | x1 = int(round((w - 224) / 2.)) 79 | y1 = int(round((h - 224) / 2.)) 80 | cropped_img = origin_img[y1:(y1 + 224), x1:(x1 + 224), :] 81 | 
#focus_crop_img = np.zeros([224, 224, 3]) 82 | #for i in range(3): 83 | # focus_crop_img = focus_map[:,:,i] * focus_map[:, :, 3] 84 | #focus_crop_img = cv2.cvtColor(focus_map, cv2.COLOR_RGBA2RGB) 85 | #focus_map = np.resize(focus_crop_img, [224,224,3]) 86 | classes = [x.strip() for x in open(classes_list)] 87 | if text: 88 | label_name = 'real label: ' + classes[label - 1] 89 | put_text(cropped_img, label_name, (0.1, 0.5)) 90 | for i in range(3): 91 | label_text = " Top {}: label: {}".format(i+1, classes[pred_top3[i]]) 92 | put_text(heat_map[i], label_text, (0.1, 0.5)) 93 | prob_text = "prob: {}".format(str(prob_top3[i])[:7]) 94 | put_text(heat_map[i], prob_text, (0.2, 0.5)) 95 | img0 = np.concatenate((cropped_img, heat_map[0]), axis=1) 96 | img1 = np.concatenate((heat_map[1], heat_map[2]), axis=1) 97 | maskimg = np.concatenate((img0, img1), axis=0) 98 | return maskimg 99 | 100 | @staticmethod 101 | def gen_mp_mask_img(origin_img, heat_map, pred_top3, prob_top3, label, classes_list): 102 | """ 103 | a img will be divide into four parts, origin images, activation_map, heatmap, focusmap 104 | and add text into them 105 | may be want to visulization these filters, do it later 106 | :return: 107 | """ 108 | h, w, c = origin_img.shape 109 | assert h >= 224 and w >= 224 110 | x1 = int(round((w - 224) / 2.)) 111 | y1 = int(round((h - 224) / 2.)) 112 | cropped_img = origin_img[y1:(y1 + 224), x1:(x1 + 224), :] 113 | classes = [x.strip() for x in open(classes_list)] 114 | label_name = 'real label: ' + classes[label - 1] 115 | put_text(cropped_img, label_name, (0.1, 0.5)) 116 | pred_top3 = np.array(pred_top3) 117 | prob_top3 = np.array(prob_top3) 118 | strs = ['s', 'm', 'l'] 119 | for i in range(3): 120 | label_text = " Path {}: label: {}".format(strs[i], classes[pred_top3[i][0]]) 121 | put_text(heat_map[i], label_text, (0.1, 0.5)) 122 | prob_text = "prob: {}".format(str(prob_top3[i][0])[:7]) 123 | put_text(heat_map[i], prob_text, (0.2, 0.5)) 124 | img0 = np.concatenate((cropped_img, heat_map[0]), axis=1) 125 | img1 = np.concatenate((heat_map[1], heat_map[2]), axis=1) 126 | maskimg = np.concatenate((img0, img1), axis=1) 127 | return maskimg 128 | -------------------------------------------------------------------------------- /action_recognition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-03-17 13:00 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : Action_Video_Visualization 8 | # @File : action_recognition.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | import torch 16 | import numpy as np 17 | import cv2 18 | from util import center_crop 19 | from scipy.ndimage import zoom 20 | 21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 22 | 23 | 24 | class ActionRecognition(object): 25 | def __init__(self, args, model): 26 | self.model = model 27 | self.args = args 28 | 29 | def img_process(self, imgs, frames_num): 30 | images = np.zeros((frames_num, 224, 224, 3)) 31 | orig_imgs = np.zeros_like(images) 32 | for i in range(frames_num): 33 | next_image = imgs[i] 34 | next_image = np.uint8(next_image) 35 | scaled_img = cv2.resize(next_image, (256, 256), interpolation=cv2.INTER_LINEAR) # resize to 256x256 36 | cropped_img = center_crop(scaled_img) # center crop 224x224 37 | final_img = cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB) 38 | images[i] = final_img 
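# Two copies of every frame are kept on purpose: `images` (above) stores the RGB version that gets
# mean/std-normalized and fed to the network, while `orig_imgs` (next line) stores the raw BGR center
# crop so the heatmap/focus-map frames can later be blended and written with OpenCV.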
39 | orig_imgs[i] = cropped_img 40 | torch_imgs = torch.from_numpy(images.transpose(3, 0, 1, 2)) 41 | torch_imgs = torch_imgs.float() / 255.0 42 | mean_3d = [124 / 255, 117 / 255, 104 / 255] 43 | std_3d = [0.229, 0.224, 0.225] 44 | for t, m, s in zip(torch_imgs, mean_3d, std_3d): 45 | t.sub_(m).div_(s) 46 | return np.expand_dims(orig_imgs, 0), torch_imgs.unsqueeze(0) 47 | 48 | def recognition_video(self, imgs): 49 | """ 50 | recognition video's action 51 | :param imgs: preprocess imgs 52 | :return: 53 | """ 54 | prediction, _ = self.model(torch.tensor(imgs).cuda()) # 1x101 55 | pred = torch.argmax(prediction).item() 56 | return pred 57 | 58 | def generate_supervised_cam(self, imgs): 59 | predictions, layerout = self.model(torch.tensor(imgs).cuda()) # 1x101 60 | layerout = torch.tensor(layerout[0].numpy().transpose(1, 2, 3, 0)) # 8x7x7x768 61 | if self.args.arch == "i3d" or "mf_net": 62 | pred_weights = self.model.module.classifier.weight.data.detach().cpu().numpy().transpose() # 768 x 101 63 | elif self.args.arch == "r3d": 64 | pred_weights = self.model.module.fc.weight.data.detach().cpu().numpy().transpose() # 2048 x 101 65 | elif self.args.arch == "c3d": 66 | pred_weights = self.model.module.linear.weight.data.detach().cpu().numpy().transpose() # 512 x 101 67 | predictions = torch.nn.Softmax(dim=1)(predictions) 68 | pred_top3 = predictions.detach().cpu().numpy().argsort()[0][::-1][:3] 69 | probality_top3 = -np.sort(-predictions.detach().cpu().numpy())[0,0:3] 70 | #print(pred_top3) 71 | #pred_top3 = torch.argmax(predictions).item() 72 | cam_list = list() 73 | for k in range(len(pred_top3)): 74 | cam = np.zeros(dtype=np.float32, shape=layerout.shape[0:3]) 75 | for i, w in enumerate(pred_weights[:, pred_top3[k]]): 76 | # Compute cam for every kernel 77 | cam += w * layerout[:, :, :, i] # 8x7x7 78 | # Resize CAM to frame level 79 | cam = zoom(cam, (16//layerout.size(0), 224//layerout.size(1), 224//layerout.size(2))) 80 | # output map is 8x7x7, so multiply to get to 16x224x224 (original image size) 81 | 82 | # normalize 83 | cam -= np.min(cam) 84 | cam /= np.max(cam) - np.min(cam) 85 | cam_list.append(cam) 86 | return cam_list, pred_top3, probality_top3 87 | 88 | def generate_unsupervised_cam(self, imgs): 89 | """ 90 | 91 | :param imgs: 92 | :return: 93 | """ 94 | _, layerout = self.model(torch.tensor(imgs).cuda()) # 1x101 95 | layerout = torch.tensor(layerout[0].detach().cpu().numpy().transpose(1, 2, 3, 0)) # 8x7x7x1024 96 | cam_list = list() 97 | cam = np.zeros(dtype=np.float32, shape=layerout.shape[0:3]) 98 | # print(cam.shape) 99 | for i in range(layerout.size(3)): 100 | cam += layerout[:, :, :, i].cpu().numpy() # 8x7x7 101 | cam = zoom(cam, (16//layerout.size(0), 224//layerout.size(1), 224//layerout.size(2)), mode='wrap') 102 | # output map is 8x7x7, so multiply to get to 16x224x224 (original video size) 103 | 104 | # normalize 105 | cam -= np.min(cam) 106 | cam /= np.max(cam) - np.min(cam) 107 | cam_list.append(cam) 108 | cam_list.append(cam) 109 | cam_list.append(cam) 110 | return cam_list 111 | 112 | ''' 113 | def generate_mp_cam(self, imgs): 114 | """ 115 | mpi3d has three part, for each part we record the grad-cam 116 | :param imgs: 117 | :return: 118 | """ 119 | predictions, layerout_s, layerout_m, layerout_l, predictions_s, predictions_m, predictions_l = self.model(torch.tensor(imgs).cuda()) # 1x101 120 | layerout_s = torch.tensor(layerout_s[0].detach().cpu().numpy().transpose(1, 2, 3, 0)) # 32 x 28 x 28 x 480 121 | layerout_m = 
torch.tensor(layerout_m[0].detach().cpu().numpy().transpose(1, 2, 3, 0)) # 16x14x14x832 122 | layerout_l = torch.tensor(layerout_l[0].detach().cpu().numpy().transpose(1, 2, 3, 0)) # 8x7x7x1024 123 | pred_weights_s = self.model.module.s_depend.local_range_depen.conv3d.weight.data.detach().cpu().numpy().transpose() # 480 x 51 124 | pred_weights_s = np.reshape(pred_weights_s, (51, 51)) # may be need do squeeze rather than reshape 125 | pred_weights_m = self.model.module.m_depend.local_range_depen.conv3d.weight.data.detach().cpu().numpy().transpose() # 832 x 51 126 | pred_weights_m = np.reshape(pred_weights_m, (51, 51)) 127 | pred_weights_l = self.model.module.l_depend.local_range_depen.conv3d.weight.data.detach().cpu().numpy().transpose() # 1024 x 51 128 | pred_weights_l = np.reshape(pred_weights_l, (51, 51)) 129 | predictions = torch.nn.Softmax(dim=1)(predictions) 130 | pred_top3 = predictions.detach().cpu().numpy().argsort()[0][::-1][:3] 131 | probality_top3 = -np.sort(-predictions.detach().cpu().numpy())[0,0:3] 132 | predictions_s = torch.nn.Softmax(dim=1)(predictions_s) 133 | predictions_m = torch.nn.Softmax(dim=1)(predictions_m) 134 | predictions_l = torch.nn.Softmax(dim=1)(predictions_l) 135 | three_pred = [predictions_s.detach().cpu().numpy().argsort()[0][::-1][:1],predictions_m.detach().cpu().numpy().argsort()[0][::-1][:1],predictions_l.detach().cpu().numpy().argsort()[0][::-1][:1]] 136 | three_prob = [-np.sort(-predictions_s.detach().cpu().numpy())[0,0:1],-np.sort(-predictions_m.detach().cpu().numpy())[0,0:1],-np.sort(-predictions_l.detach().cpu().numpy())[0,0:1]] 137 | layerout = [layerout_s, layerout_m, layerout_l] 138 | pred_weights = [pred_weights_s, pred_weights_m, pred_weights_l] 139 | #print(pred_top3) 140 | #pred_top3 = torch.argmax(predictions).item() 141 | cam_list = list() 142 | for k in range(3): 143 | cam = np.zeros(dtype=np.float32, shape=layerout[k].shape[0:3]) 144 | cam = zoom(cam, (pow(2, k + 1), 224, 224)) 145 | for i, w in enumerate(pred_weights[k][:, pred_top3[0]]): 146 | print(i) 147 | # Compute cam for every kernel 148 | cam += zoom(w * layerout[k][:, :, :, i], (pow(2, k + 1), 224, 224)) 149 | #cam += w * layerout[k][:, :, :, i] # 8x7x7 150 | # Resize CAM to frame level 151 | #cam = zoom(cam, (pow(2,k+1), pow(2,3+k), pow(2,3+k))) # output map is 8x7x7, so multiply to get to 64x224x224 (original image size) 152 | #cam = zoom(cam, (pow(2, k + 1), 224, 224)) 153 | # normalize 154 | cam -= np.min(cam) 155 | cam /= np.max(cam) - np.min(cam) 156 | cam_list.append(cam) 157 | #return cam_list, pred_top3, probality_top3 158 | return cam_list, three_pred, three_prob 159 | ''' -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-03-17 12:55 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : Action_Video_Visualization 8 | # @File : main.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import argparse 14 | from net.mfnet_3d import MFNET_3D 15 | from net.mp_i3d import MultiPathI3d 16 | from net.i3dpt_origin import I3D, weights_init 17 | from net.c3d import C3D 18 | from net.r3d import resnet50 19 | from action_recognition import ActionRecognition 20 | from util import * 21 | from action_feature_visualization import Visualization 22 | import math 23 | import datetime 24 | 25 | 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 26 | os.environ["CUDA_VISIBLE_DEVICES"] = '2' 27 | date_time = datetime.datetime.today().strftime('%m-%d-%H%M') 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser(description='mfnet-base-parser') 32 | parser.add_argument("--num_classes", type=int, default=101) 33 | parser.add_argument("--classes_list", type=str, default='resources/classInd.txt') 34 | parser.add_argument("--arch", type=str, default='mf_net', choices=['s3d', 'i3d', 'mf_net', 'c3d', 'mpi3d', 'r3d']) 35 | parser.add_argument("--supervised", type=str, default='fully_supervised', 36 | choices=['fully_supervised', 'unsupervised']) 37 | parser.add_argument("--model_weights", type=str, default='pretrained_model/MFNet3D_UCF-101_Split-1_96.3.pth') 38 | parser.add_argument("--video", type=str, default='test_videos/v_Shotput_g05_c02.avi') 39 | parser.add_argument("--frames_num", type=int, default=16, help = "the frames num for the network input") 40 | parser.add_argument("--label", type=int, default=79) 41 | parser.add_argument("--clip_steps", type=int, default=16) 42 | parser.add_argument("--output_dir", type=str, default="output") 43 | parser.add_argument("--gpus", type=str, default="1") 44 | return parser.parse_args() 45 | 46 | 47 | args = parse_args() 48 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus 49 | 50 | 51 | def weight_transform(model_dict, pretrain_dict, supervised=True): 52 | ''' 53 | 54 | :return: 55 | ''' 56 | for k, _ in pretrain_dict.items(): 57 | print("pretrain: {}".format(k)) 58 | if supervised: 59 | weight_dict = {k:v for k, v in pretrain_dict.items() if k in model_dict} 60 | else: 61 | weight_dict = {k:v for k, v in pretrain_dict.items() if k in model_dict and 'fc' not in k and 'classifier' not in k} 62 | for k, _ in weight_dict.items(): 63 | print("have load: {}".format(k)) 64 | model_dict.update(weight_dict) 65 | return model_dict 66 | 67 | def c3d_weight_transform(model_dict, pretrain_dict, supervised=True): 68 | corresp_name = { 69 | # Conv1 70 | "features.0.weight": "conv1.weight", 71 | "features.0.bias": "conv1.bias", 72 | # Conv2 73 | "features.3.weight": "conv2.weight", 74 | "features.3.bias": "conv2.bias", 75 | # Conv3a 76 | "features.6.weight": "conv3a.weight", 77 | "features.6.bias": "conv3a.bias", 78 | # Conv3b 79 | "features.8.weight": "conv3b.weight", 80 | "features.8.bias": "conv3b.bias", 81 | # Conv4a 82 | "features.11.weight": "conv4a.weight", 83 | "features.11.bias": "conv4a.bias", 84 | # Conv4b 85 | "features.13.weight": "conv4b.weight", 86 | "features.13.bias": "conv4b.bias", 87 | # Conv5a 88 | "features.16.weight": "conv5a.weight", 89 | "features.16.bias": "conv5a.bias", 90 | # Conv5b 91 | "features.18.weight": "conv5b.weight", 92 | "features.18.bias": "conv5b.bias", 93 | # fc6 94 | "classifier.0.weight": "fc6.weight", 95 | "classifier.0.bias": "fc6.bias", 96 | # fc7 97 | "classifier.3.weight": "fc7.weight", 98 | "classifier.3.bias": "fc7.bias", 99 | } 100 | 101 | p_dict = pretrain_dict 102 | s_dict = model_dict 103 | for name in p_dict: 104 | if name not in corresp_name: 105 | continue 106 | if 'classifier' in name: 107 | continue 108 | s_dict[corresp_name[name]] = p_dict[name] 109 | print("have load: {}".format(corresp_name[name])) 110 | return s_dict 111 | 112 | 113 | def load_model(): 114 | if args.arch == 'mf_net': 115 | model_ft = MFNET_3D(args.num_classes) 116 | elif args.arch == 'mpi3d': 117 | model_ft = MultiPathI3d(args.num_classes, in_channels=3, dropout_prob=0) 118 | elif args.arch == 'i3d': 119 | 
model_ft = I3D(args.num_classes, modality='rgb', dropout_prob=0) 120 | elif args.arch == 'r3d': 121 | model_ft = resnet50(num_classes=args.num_classes) 122 | elif args.arch == 'c3d': 123 | model_ft = C3D(with_classifier=True, num_classes=args.num_classes) 124 | else: 125 | Exception("Not support network now!") 126 | if args.model_weights: 127 | checkpoint = torch.load(args.model_weights) 128 | if args.arch in ['mpi3d', 'i3d']: 129 | base_dict = {'.'.join(k.split('.')[1:]):v for k,v in list(checkpoint['state_dict'].items())} 130 | # model_ft.load_state_dict(base_dict) 131 | model_dict = model_ft.state_dict() 132 | model_dict = weight_transform(model_dict, base_dict) 133 | model_ft.load_state_dict(model_dict) 134 | else: 135 | if args.supervised == 'unsupervised': 136 | if args.arch == 'c3d': 137 | model_dict = model_ft.state_dict() 138 | model_dict = c3d_weight_transform(model_dict, checkpoint, supervised=False) 139 | model_ft.load_state_dict(model_dict) 140 | else: 141 | base_dict = {k : v for k, v in list(checkpoint['state_dict'].items())} 142 | model_dict = model_ft.state_dict() 143 | model_dict = weight_transform(model_dict, base_dict, supervised=False) 144 | model_ft.load_state_dict(model_dict) 145 | else: 146 | model_ft.load_state_dict(checkpoint['state_dict']) 147 | else: 148 | # print("????") 149 | weights_init(model_ft) 150 | model_ft.cuda() 151 | model_ft = torch.nn.DataParallel(model_ft).cuda() 152 | model_ft.eval() 153 | return model_ft 154 | 155 | 156 | def decode_on_the_fly(self): 157 | """ 158 | there incule two way to implement decode on the fly 159 | we need to consider the video at begin and at end 160 | :return: 161 | """ 162 | 163 | 164 | def heat_map_api(video, frames_num, clip_steps, output_dir, label, classes_list): 165 | args.arch = 'i3d' 166 | args.num_classes = 51 167 | args.gpus = 1 168 | # args.supervised = 'self_supervised' 169 | # args.model_weights = 'pretrained_model/77.254_mpi3d_rgb_model_best.pth.tar' 170 | # args.model_weights = 'pretrained_model/hmdb51_rgb_gl_randomrotation_3flip_mixup_way2_1loss_stride_1_12_26_checkpoint_37.77.pth.tar' 171 | # args.model_weights = 'pretrained_model/25.294_i3dpt_rgb_model_best.pth.tar' 172 | # args.model_weights = 'pretrained_model/36.209_i3dpt_rgb_model_best.pth.tar' 173 | # args.classes_list = 'resources/hmdb51_classInd.txt' 174 | # args.model_weights = "" 175 | reg_net = ActionRecognition(args, load_model()) 176 | visulaize = Visualization() 177 | 178 | length, width, height = video_frame_count(video) 179 | if length < frames_num: 180 | print( 181 | "the video's frame num is {}, shorter than {}, will loop the video.".format(length, frames_num)) 182 | cap = cv2.VideoCapture(video) 183 | # q = queue.Queue(self.frames_num) 184 | frames = list() 185 | count = 0 186 | while count < length: 187 | ret, frame = cap.read() 188 | if type(frame) == type(None): 189 | break 190 | else: 191 | frames.append(frame) 192 | # if video shorter than frames_num, repeat last frame 193 | index = 0 194 | while len(frames) < frames_num: 195 | frames.append(frames[index]) 196 | index += 1 197 | length += 1 198 | mask_imgs = list() 199 | focus_imgs = list() 200 | count = 0 201 | for i in range(math.ceil((length - frames_num) // clip_steps)+1): 202 | if 0 < length - frames_num - clip_steps*i: 203 | reg_imgs = frames[i * clip_steps:i * clip_steps + frames_num] 204 | else: 205 | if length > frames_num + 1: 206 | reg_imgs = frames[length - 1 - frames_num: -1] 207 | else: 208 | reg_imgs = frames 209 | for j in range(frames_num - length): 210 | 
reg_imgs.append(reg_imgs[j]) 211 | if len(reg_imgs) < frames_num: 212 | print("reg_imgs is too short") 213 | break 214 | RGB_vid, vid = reg_net.img_process(reg_imgs, frames_num) 215 | if args.supervised == 'unsupervised': 216 | cam_list = reg_net.generate_unsupervised_cam(vid) 217 | else: 218 | cam_list, pred_top3, prob_top3 = reg_net.generate_supervised_cam(vid) 219 | heat_maps = list() 220 | for j in range(len(cam_list)): 221 | heat_map, focus_map = visulaize.gen_heatmap(cam_list[j], RGB_vid) 222 | heat_maps.append(heat_map) 223 | focus_imgs.append(focus_map) # BGRA space 224 | if args.supervised == 'unsupervised': 225 | mask_img = visulaize.gen_mask_img(RGB_vid[0][args.frames_num // 2], heat_maps, None, None, 226 | args.label, args.classes_list, text=False) 227 | else: 228 | mask_img = visulaize.gen_mask_img(RGB_vid[0][args.frames_num // 2], heat_maps, pred_top3, prob_top3, 229 | args.label, args.classes_list) 230 | mask_imgs.append(mask_img) 231 | print("precoss video clips: {}/{}, wait a moment".format(i + 1, int(math.ceil(length - frames_num) // clip_steps) + 1)) 232 | count += 1 233 | # saved_video_path = save_as_video(output_dir, mask_imgs, label) 234 | save_as_imgs(output_dir, mask_imgs, count, label, 'heatmap_') 235 | save_as_imgs(output_dir, focus_imgs, count, label, 'focusmap_') 236 | 237 | 238 | def main(): 239 | global args 240 | reg_net = ActionRecognition(args, load_model()) 241 | visulaize = Visualization() 242 | 243 | length, width, height = video_frame_count(args.video) 244 | if length < args.frames_num: 245 | print("the video's frame num is {}, shorter than {}, will repeat the last frame".format(length, args.frames_num)) 246 | cap = cv2.VideoCapture(args.video) 247 | # q = queue.Queue(self.frames_num) 248 | frames = list() 249 | count = 0 250 | while count < length: 251 | ret, frame = cap.read() 252 | if type(frame) == type(None): 253 | break 254 | else: 255 | frames.append(frame) 256 | # if video shorter than frames_num, repeat last frame 257 | while len(frames) < args.frames_num: 258 | frames.append(frames[length - 1]) 259 | mask_imgs = list() 260 | focus_imgs = list() 261 | count = 0 262 | for i in range(int(length/args.clip_steps) -1): 263 | if i < length - args.frames_num: 264 | reg_imgs = frames[i*args.clip_steps:i*args.clip_steps + args.frames_num] 265 | else: 266 | reg_imgs = frames[length - 1 - args.frames_num: -1] 267 | if len(reg_imgs) < args.frames_num: 268 | print("reg_imgs is too short") 269 | break 270 | RGB_vid, vid = reg_net.img_process(reg_imgs, args.frames_num) 271 | if args.supervised == 'unsupervised': 272 | cam_list = reg_net.generate_unsupervised_cam(vid) 273 | else: 274 | cam_list, pred_top3, prob_top3 = reg_net.generate_supervised_cam(vid) 275 | heat_maps = list() 276 | for j in range(len(cam_list)): 277 | heat_map, focus_map = visulaize.gen_heatmap(cam_list[j], RGB_vid) 278 | heat_maps.append(heat_map) 279 | focus_imgs.append(focus_map) # BGRA space 280 | if args.supervised == 'unsupervised': 281 | mask_img = visulaize.gen_mask_img(RGB_vid[0][args.frames_num // 2], heat_maps, None, None, 282 | args.label, args.classes_list, text=False) 283 | else: 284 | mask_img = visulaize.gen_mask_img(RGB_vid[0][args.frames_num//2], heat_maps, pred_top3, prob_top3, 285 | args.label, args.classes_list) 286 | mask_imgs.append(mask_img) 287 | print("precoss video clips: {}/{}, wait a moment".format(i+1, int(length/args.clip_steps)-1)) 288 | count += 1 289 | saved_video_path = save_as_video(args.output_dir, mask_imgs, args.label) 290 | 
save_as_imgs(args.output_dir, mask_imgs, count, args.label, 'heatmap_') 291 | save_as_imgs(args.output_dir, focus_imgs, count, args.label, 'focusmap_') 292 | # visualization(saved_video_path) 293 | 294 | 295 | if __name__ == '__main__': 296 | main() -------------------------------------------------------------------------------- /net/c3d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2020-09-17 15:59 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : amax_Action_Video_Visualization 8 | # @File : c3d.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 17 | 18 | """C3D""" 19 | import math 20 | from collections import OrderedDict 21 | 22 | import torch 23 | import torch.nn as nn 24 | from torch.nn.modules.utils import _triple 25 | 26 | 27 | class C3D(nn.Module): 28 | """C3D with BN and pool5 to be AdaptiveAvgPool3d(1).""" 29 | 30 | def __init__(self, with_classifier=True, num_classes=101): 31 | super(C3D, self).__init__() 32 | self.with_classifier = with_classifier 33 | self.num_classes = num_classes 34 | 35 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 36 | self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) 37 | 38 | self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 39 | self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 40 | 41 | self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 42 | self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 43 | self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 44 | 45 | self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 46 | self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 47 | self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) 48 | 49 | self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 50 | self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) 51 | self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)) 52 | 53 | if self.with_classifier: 54 | self.fc6 = nn.Linear(8192, 4096) 55 | self.fc7 = nn.Linear(4096, 4096) 56 | self.fc8 = nn.Linear(4096, 487) 57 | 58 | self.dropout = nn.Dropout(p=0.5) 59 | 60 | self.relu = nn.ReLU() 61 | self.softmax = nn.Softmax() 62 | 63 | if self.with_classifier: 64 | self.linear = nn.Linear(512, self.num_classes) 65 | 66 | def forward(self, x, return_conv=False): 67 | h = self.relu(self.conv1(x)) 68 | h = self.pool1(h) 69 | 70 | h = self.relu(self.conv2(h)) 71 | h = self.pool2(h) 72 | 73 | h = self.relu(self.conv3a(h)) 74 | h = self.relu(self.conv3b(h)) 75 | h = self.pool3(h) 76 | 77 | h = self.relu(self.conv4a(h)) 78 | h = self.relu(self.conv4b(h)) 79 | h = self.pool4(h) 80 | 81 | h = self.relu(self.conv5a(h)) 82 | h = self.relu(self.conv5b(h)) 83 | h = self.pool5(h) 84 | feature = h 85 | if self.with_classifier: 86 | h = h.view(-1, 8192) 87 | h = self.relu(self.fc6(h)) 88 | h = self.dropout(h) 89 | h = self.relu(self.fc7(h)) 90 | h = self.dropout(h) 91 | logits = self.fc8(h) 92 | probs = self.softmax(logits) 93 | return probs, feature 94 | else: 95 | return feature 96 | 97 | 98 | if __name__ == '__main__': 99 | c3d = C3D() 
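Before the I3D definition below, a quick input/output check for this C3D can save some wiring time. This is only a sketch assuming random weights and a CPU run; the 112x112 crop is what makes the flattened pool5 output match fc6's 8192 inputs (main.py's img_process crops to 224x224, so sizes are worth double-checking), and note that forward() goes through the 487-way fc8 head while the `linear` layer is defined but unused on this path:

```python
import torch
from net.c3d import C3D

model = C3D(with_classifier=True, num_classes=101).eval()
clip = torch.randn(1, 3, 16, 112, 112)   # batch x channels x frames x H x W
with torch.no_grad():
    probs, feature = model(clip)
print(probs.shape)    # torch.Size([1, 487]) - fc8 is hard-coded to 487 outputs (Sports-1M head)
print(feature.shape)  # torch.Size([1, 512, 1, 4, 4]) - pool5 feature map, the tensor used for heatmaps
```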
-------------------------------------------------------------------------------- /net/i3d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-03-19 10:44 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : Action_Video_Visualization 8 | # @File : i3d.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | from torch.autograd import Variable 21 | 22 | import numpy as np 23 | 24 | import os 25 | import sys 26 | from collections import OrderedDict 27 | 28 | 29 | class MaxPool3dSamePadding(nn.MaxPool3d): 30 | 31 | def compute_pad(self, dim, s): 32 | if s % self.stride[dim] == 0: 33 | return max(self.kernel_size[dim] - self.stride[dim], 0) 34 | else: 35 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 36 | 37 | def forward(self, x): 38 | # compute 'same' padding 39 | (batch, channel, t, h, w) = x.size() 40 | # print t,h,w 41 | out_t = np.ceil(float(t) / float(self.stride[0])) 42 | out_h = np.ceil(float(h) / float(self.stride[1])) 43 | out_w = np.ceil(float(w) / float(self.stride[2])) 44 | # print out_t, out_h, out_w 45 | pad_t = self.compute_pad(0, t) 46 | pad_h = self.compute_pad(1, h) 47 | pad_w = self.compute_pad(2, w) 48 | # print pad_t, pad_h, pad_w 49 | 50 | pad_t_f = pad_t // 2 51 | pad_t_b = pad_t - pad_t_f 52 | pad_h_f = pad_h // 2 53 | pad_h_b = pad_h - pad_h_f 54 | pad_w_f = pad_w // 2 55 | pad_w_b = pad_w - pad_w_f 56 | 57 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 58 | # print x.size() 59 | # print pad 60 | x = F.pad(x, pad) 61 | return super(MaxPool3dSamePadding, self).forward(x) 62 | 63 | 64 | class Unit3D(nn.Module): 65 | 66 | def __init__(self, in_channels, 67 | output_channels, 68 | kernel_shape=(1, 1, 1), 69 | stride=(1, 1, 1), 70 | padding=0, 71 | activation_fn=F.relu, 72 | use_batch_norm=True, 73 | use_bias=False, 74 | name='unit_3d'): 75 | 76 | """Initializes Unit3D module.""" 77 | super(Unit3D, self).__init__() 78 | 79 | self._output_channels = output_channels 80 | self._kernel_shape = kernel_shape 81 | self._stride = stride 82 | self._use_batch_norm = use_batch_norm 83 | self._activation_fn = activation_fn 84 | self._use_bias = use_bias 85 | self.name = name 86 | self.padding = padding 87 | 88 | self.conv3d = nn.Conv3d(in_channels=in_channels, 89 | out_channels=self._output_channels, 90 | kernel_size=self._kernel_shape, 91 | stride=self._stride, 92 | padding=0, 93 | # we always want padding to be 0 here. 
We will dynamically pad based on input size in forward function 94 | bias=self._use_bias) 95 | 96 | if self._use_batch_norm: 97 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 98 | 99 | def compute_pad(self, dim, s): 100 | if s % self._stride[dim] == 0: 101 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 102 | else: 103 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 104 | 105 | def forward(self, x): 106 | # compute 'same' padding 107 | (batch, channel, t, h, w) = x.size() 108 | # print t,h,w 109 | out_t = np.ceil(float(t) / float(self._stride[0])) 110 | out_h = np.ceil(float(h) / float(self._stride[1])) 111 | out_w = np.ceil(float(w) / float(self._stride[2])) 112 | # print out_t, out_h, out_w 113 | pad_t = self.compute_pad(0, t) 114 | pad_h = self.compute_pad(1, h) 115 | pad_w = self.compute_pad(2, w) 116 | # print pad_t, pad_h, pad_w 117 | 118 | pad_t_f = pad_t // 2 119 | pad_t_b = pad_t - pad_t_f 120 | pad_h_f = pad_h // 2 121 | pad_h_b = pad_h - pad_h_f 122 | pad_w_f = pad_w // 2 123 | pad_w_b = pad_w - pad_w_f 124 | 125 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 126 | # print x.size() 127 | # print pad 128 | x = F.pad(x, pad) 129 | # print x.size() 130 | 131 | x = self.conv3d(x) 132 | if self._use_batch_norm: 133 | x = self.bn(x) 134 | if self._activation_fn is not None: 135 | x = self._activation_fn(x) 136 | return x 137 | 138 | 139 | class InceptionModule(nn.Module): 140 | def __init__(self, in_channels, out_channels, name): 141 | super(InceptionModule, self).__init__() 142 | 143 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 144 | name=name + '/Branch_0/Conv3d_0a_1x1') 145 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 146 | name=name + '/Branch_1/Conv3d_0a_1x1') 147 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 148 | name=name + '/Branch_1/Conv3d_0b_3x3') 149 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 150 | name=name + '/Branch_2/Conv3d_0a_1x1') 151 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 152 | name=name + '/Branch_2/Conv3d_0b_3x3') 153 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 154 | stride=(1, 1, 1), padding=0) 155 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, 156 | name=name + '/Branch_3/Conv3d_0b_1x1') 157 | self.name = name 158 | 159 | def forward(self, x): 160 | b0 = self.b0(x) 161 | b1 = self.b1b(self.b1a(x)) 162 | b2 = self.b2b(self.b2a(x)) 163 | b3 = self.b3b(self.b3a(x)) 164 | return torch.cat([b0, b1, b2, b3], dim=1) 165 | 166 | 167 | class InceptionI3d(nn.Module): 168 | """Inception-v1 I3D architecture. 169 | The model is introduced in: 170 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 171 | Joao Carreira, Andrew Zisserman 172 | https://arxiv.org/pdf/1705.07750v1.pdf. 173 | See also the Inception architecture, introduced in: 174 | Going deeper with convolutions 175 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 176 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 177 | http://arxiv.org/pdf/1409.4842v1.pdf. 178 | """ 179 | 180 | # Endpoints of the model in order. 
During construction, all the endpoints up 181 | # to a designated `final_endpoint` are returned in a dictionary as the 182 | # second return value. 183 | VALID_ENDPOINTS = ( 184 | 'Conv3d_1a_7x7', 185 | 'MaxPool3d_2a_3x3', 186 | 'Conv3d_2b_1x1', 187 | 'Conv3d_2c_3x3', 188 | 'MaxPool3d_3a_3x3', 189 | 'Mixed_3b', 190 | 'Mixed_3c', 191 | 'MaxPool3d_4a_3x3', 192 | 'Mixed_4b', 193 | 'Mixed_4c', 194 | 'Mixed_4d', 195 | 'Mixed_4e', 196 | 'Mixed_4f', 197 | 'MaxPool3d_5a_2x2', 198 | 'Mixed_5b', 199 | 'Mixed_5c', 200 | 'Logits', 201 | 'Predictions', 202 | ) 203 | 204 | def __init__(self, num_classes=400, spatial_squeeze=True, 205 | final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_prob=0.5): 206 | """Initializes I3D model instance. 207 | Args: 208 | num_classes: The number of outputs in the logit layer (default 400, which 209 | matches the Kinetics dataset). 210 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 211 | before returning (default True). 212 | final_endpoint: The model contains many possible endpoints. 213 | `final_endpoint` specifies the last endpoint for the model to be built 214 | up to. In addition to the output at `final_endpoint`, all the outputs 215 | at endpoints up to `final_endpoint` will also be returned, in a 216 | dictionary. `final_endpoint` must be one of 217 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 218 | name: A string (optional). The name of this module. 219 | Raises: 220 | ValueError: if `final_endpoint` is not recognized. 221 | """ 222 | 223 | if final_endpoint not in self.VALID_ENDPOINTS: 224 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 225 | 226 | super(InceptionI3d, self).__init__() 227 | self._num_classes = num_classes 228 | self._spatial_squeeze = spatial_squeeze 229 | self._final_endpoint = final_endpoint 230 | self.logits = None 231 | 232 | if self._final_endpoint not in self.VALID_ENDPOINTS: 233 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 234 | 235 | self.end_points = {} 236 | end_point = 'Conv3d_1a_7x7' 237 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], 238 | stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point) 239 | if self._final_endpoint == end_point: return 240 | 241 | end_point = 'MaxPool3d_2a_3x3' 242 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 243 | padding=0) 244 | if self._final_endpoint == end_point: return 245 | 246 | end_point = 'Conv3d_2b_1x1' 247 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, 248 | name=name + end_point) 249 | if self._final_endpoint == end_point: return 250 | 251 | end_point = 'Conv3d_2c_3x3' 252 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, 253 | name=name + end_point) 254 | if self._final_endpoint == end_point: return 255 | 256 | end_point = 'MaxPool3d_3a_3x3' 257 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 258 | padding=0) 259 | if self._final_endpoint == end_point: return 260 | 261 | end_point = 'Mixed_3b' 262 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 263 | if self._final_endpoint == end_point: return 264 | 265 | end_point = 'Mixed_3c' 266 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 267 | if self._final_endpoint == end_point: 
return 268 | 269 | # ====================================Add Some Model To I3d 270 | """ 271 | end_point = 'attention_1' 272 | self.end_points[end_point] = Self_Attn(480, 'relu') 273 | if self._final_endpoint == end_point: return 274 | """ 275 | # ======================================= 276 | end_point = 'MaxPool3d_4a_3x3' 277 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), 278 | padding=0) 279 | if self._final_endpoint == end_point: return 280 | 281 | end_point = 'Mixed_4b' 282 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 283 | if self._final_endpoint == end_point: return 284 | 285 | end_point = 'Mixed_4c' 286 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 287 | if self._final_endpoint == end_point: return 288 | 289 | end_point = 'Mixed_4d' 290 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 291 | if self._final_endpoint == end_point: return 292 | 293 | end_point = 'Mixed_4e' 294 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 295 | if self._final_endpoint == end_point: return 296 | 297 | end_point = 'Mixed_4f' 298 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], 299 | name + end_point) 300 | if self._final_endpoint == end_point: return 301 | 302 | end_point = 'MaxPool3d_5a_2x2' 303 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), 304 | padding=0) 305 | if self._final_endpoint == end_point: return 306 | 307 | end_point = 'Mixed_5b' 308 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], 309 | name + end_point) 310 | if self._final_endpoint == end_point: return 311 | 312 | end_point = 'Mixed_5c' 313 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], 314 | name + end_point) 315 | if self._final_endpoint == end_point: return 316 | 317 | end_point = 'Logits' 318 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], 319 | stride=(1, 1, 1)) 320 | self.dropout = nn.Dropout(dropout_prob) 321 | self.dropout_probality = dropout_prob 322 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=400, 323 | kernel_shape=[1, 1, 1], 324 | padding=0, 325 | activation_fn=None, 326 | use_batch_norm=False, 327 | use_bias=True, 328 | name='logits') 329 | self.softmax = torch.nn.Softmax(dim=1) 330 | if self._num_classes != 400: 331 | self.fc_out = nn.Linear(400, self._num_classes, bias=True) 332 | self.build() 333 | if self._final_endpoint == end_point: return 334 | 335 | def replace_logits(self, num_classes): 336 | self._num_classes = num_classes 337 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 338 | kernel_shape=[1, 1, 1], 339 | padding=0, 340 | activation_fn=None, 341 | use_batch_norm=False, 342 | use_bias=True, 343 | name='logits') 344 | 345 | def replace_dropout(self, dropout_radio): 346 | self.dropout = nn.Dropout(dropout_radio) 347 | if self._num_classes != 400: 348 | self.logits_dropout = nn.Dropout(dropout_radio) 349 | 350 | def build(self): 351 | for k in self.end_points.keys(): 352 | self.add_module(k, self.end_points[k]) 353 | 354 | def forward(self, x): 355 | for end_point in self.VALID_ENDPOINTS: 356 | if end_point in 
self.end_points: 357 | x = self._modules[end_point](x) # use _modules to work with dataparallel 358 | 359 | x = self.logits(self.dropout(self.avg_pool(x))) 360 | # print(x.size()) 361 | if self._spatial_squeeze: 362 | logits = x.squeeze(3).squeeze(3) # remove dim whose size is 1 363 | logits = torch.mean(logits, 2) 364 | # print(logits) 365 | if self._num_classes != 400: 366 | logits_out = nn.Dropout(self.dropout_probality)(logits) 367 | fc_out = self.fc_out(logits_out) 368 | # print(fc_out.size()) # 4 x 101 369 | # print(self.softmax(fc_out)[0,:].data.cpu().numpy().sum()) 370 | return fc_out 371 | # return self.softmax(fc_out) 372 | else: 373 | # print(logits.size()) # 5 x101 374 | # logits is batch X time X classes, which is what we want to work with 375 | # return self.softmax(logits) 376 | return logits 377 | 378 | def extract_features(self, x): 379 | for end_point in self.VALID_ENDPOINTS: 380 | if end_point in self.end_points: 381 | x = self._modules[end_point](x) 382 | return self.avg_pool(x) 383 | 384 | 385 | def get_fine_tuning_parameters(model): 386 | ft_module_names = [] 387 | # ft_module_names.append('Mixed_5b') 388 | # ft_module_names.append('Mixed_5c') 389 | ft_module_names.append('fc_out') 390 | ft_module_names.append('logits') 391 | # ft_module_names.append('attention_1') 392 | 393 | parameters = [] 394 | for k, v in model.named_parameters(): 395 | for ft_module in ft_module_names: 396 | if ft_module in k: 397 | parameters.append({'params': v}) 398 | break 399 | else: 400 | parameters.append({'params': v, 'lr': 0.0001}) 401 | 402 | return parameters 403 | -------------------------------------------------------------------------------- /net/i3dpt_origin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-05-12 22:09 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : pytorch_i3d 8 | # @File : i3dpt.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 17 | 18 | import math 19 | import os 20 | 21 | import numpy as np 22 | import torch 23 | from torch.nn import ReplicationPad3d 24 | import torch.nn.functional as F 25 | import torch.nn as nn 26 | from torch.nn.init import xavier_uniform_, constant_, normal_ 27 | 28 | 29 | def get_padding_shape(filter_shape, stride): 30 | def _pad_top_bottom(filter_dim, stride_val): 31 | pad_along = max(filter_dim - stride_val, 0) 32 | pad_top = pad_along // 2 33 | pad_bottom = pad_along - pad_top 34 | return pad_top, pad_bottom 35 | 36 | padding_shape = [] 37 | for filter_dim, stride_val in zip(filter_shape, stride): 38 | pad_top, pad_bottom = _pad_top_bottom(filter_dim, stride_val) 39 | padding_shape.append(pad_top) 40 | padding_shape.append(pad_bottom) 41 | depth_top = padding_shape.pop(0) 42 | depth_bottom = padding_shape.pop(0) 43 | padding_shape.append(depth_top) 44 | padding_shape.append(depth_bottom) 45 | 46 | return tuple(padding_shape) 47 | 48 | 49 | def simplify_padding(padding_shapes): 50 | all_same = True 51 | padding_init = padding_shapes[0] 52 | for pad in padding_shapes[1:]: 53 | if pad != padding_init: 54 | all_same = False 55 | return all_same, padding_init 56 | 57 | 58 | class Unit3Dpy(torch.nn.Module): 59 | def __init__(self, 60 | in_channels, 61 | out_channels, 62 | kernel_size=(1, 1, 1), 63 | stride=(1, 1, 1), 64 | activation='relu', 65 | 
padding='SAME', 66 | use_bias=False, 67 | use_bn=True): 68 | super(Unit3Dpy, self).__init__() 69 | 70 | self.padding = padding 71 | self.activation = activation 72 | self.use_bn = use_bn 73 | if padding == 'SAME': 74 | padding_shape = get_padding_shape(kernel_size, stride) 75 | simplify_pad, pad_size = simplify_padding(padding_shape) 76 | self.simplify_pad = simplify_pad 77 | elif padding == 'VALID': 78 | padding_shape = 0 79 | else: 80 | raise ValueError( 81 | 'padding should be in [VALID|SAME] but got {}'.format(padding)) 82 | 83 | if padding == 'SAME': 84 | if not simplify_pad: 85 | self.pad = torch.nn.ConstantPad3d(padding_shape, 0) 86 | self.conv3d = torch.nn.Conv3d( 87 | in_channels, 88 | out_channels, 89 | kernel_size, 90 | stride=stride, 91 | bias=use_bias) 92 | else: 93 | self.conv3d = torch.nn.Conv3d( 94 | in_channels, 95 | out_channels, 96 | kernel_size, 97 | stride=stride, 98 | padding=pad_size, 99 | bias=use_bias) 100 | elif padding == 'VALID': 101 | self.conv3d = torch.nn.Conv3d( 102 | in_channels, 103 | out_channels, 104 | kernel_size, 105 | padding=padding_shape, 106 | stride=stride, 107 | bias=use_bias) 108 | else: 109 | raise ValueError( 110 | 'padding should be in [VALID|SAME] but got {}'.format(padding)) 111 | 112 | if self.use_bn: 113 | self.batch3d = torch.nn.BatchNorm3d(out_channels) 114 | 115 | if activation == 'relu': 116 | self.activation = torch.nn.functional.relu 117 | 118 | def forward(self, inp): 119 | if self.padding == 'SAME' and self.simplify_pad is False: 120 | inp = self.pad(inp) 121 | out = self.conv3d(inp) 122 | if self.use_bn: 123 | out = self.batch3d(out) 124 | if self.activation is not None: 125 | out = torch.nn.functional.relu(out) 126 | return out 127 | 128 | 129 | class MaxPool3dTFPadding(torch.nn.Module): 130 | def __init__(self, kernel_size, stride=None, padding='SAME'): 131 | super(MaxPool3dTFPadding, self).__init__() 132 | if padding == 'SAME': 133 | padding_shape = get_padding_shape(kernel_size, stride) 134 | self.padding_shape = padding_shape 135 | self.pad = torch.nn.ConstantPad3d(padding_shape, 0) 136 | self.pool = torch.nn.MaxPool3d(kernel_size, stride, ceil_mode=True) 137 | 138 | def forward(self, inp): 139 | inp = self.pad(inp) 140 | out = self.pool(inp) 141 | return out 142 | 143 | 144 | class Mixed(torch.nn.Module): 145 | def __init__(self, in_channels, out_channels): 146 | super(Mixed, self).__init__() 147 | # Branch 0 148 | self.branch_0 = Unit3Dpy( 149 | in_channels, out_channels[0], kernel_size=(1, 1, 1)) 150 | 151 | # Branch 1 152 | branch_1_conv1 = Unit3Dpy( 153 | in_channels, out_channels[1], kernel_size=(1, 1, 1)) 154 | branch_1_conv2 = Unit3Dpy( 155 | out_channels[1], out_channels[2], kernel_size=(3, 3, 3)) 156 | self.branch_1 = torch.nn.Sequential(branch_1_conv1, branch_1_conv2) 157 | 158 | # Branch 2 159 | branch_2_conv1 = Unit3Dpy( 160 | in_channels, out_channels[3], kernel_size=(1, 1, 1)) 161 | branch_2_conv2 = Unit3Dpy( 162 | out_channels[3], out_channels[4], kernel_size=(3, 3, 3)) 163 | self.branch_2 = torch.nn.Sequential(branch_2_conv1, branch_2_conv2) 164 | 165 | # Branch3 166 | branch_3_pool = MaxPool3dTFPadding( 167 | kernel_size=(3, 3, 3), stride=(1, 1, 1), padding='SAME') 168 | branch_3_conv2 = Unit3Dpy( 169 | in_channels, out_channels[5], kernel_size=(1, 1, 1)) 170 | self.branch_3 = torch.nn.Sequential(branch_3_pool, branch_3_conv2) 171 | 172 | def forward(self, inp): 173 | out_0 = self.branch_0(inp) 174 | out_1 = self.branch_1(inp) 175 | out_2 = self.branch_2(inp) 176 | out_3 = self.branch_3(inp) 177 | out = 
torch.cat((out_0, out_1, out_2, out_3), 1) 178 | return out 179 | 180 | 181 | class I3D(torch.nn.Module): 182 | def __init__(self, 183 | num_classes, 184 | modality='rgb', 185 | dropout_prob=0, 186 | name='inception'): 187 | super(I3D, self).__init__() 188 | 189 | self.name = name 190 | self.num_classes = num_classes 191 | if modality == 'rgb': 192 | in_channels = 3 193 | elif modality == 'flow': 194 | in_channels = 2 195 | else: 196 | raise ValueError( 197 | '{} not among known modalities [rgb|flow]'.format(modality)) 198 | self.modality = modality 199 | 200 | conv3d_1a_7x7 = Unit3Dpy( 201 | out_channels=64, 202 | in_channels=in_channels, 203 | kernel_size=(7, 7, 7), 204 | stride=(2, 2, 2), 205 | padding='SAME') 206 | # 1st conv-pool 207 | self.conv3d_1a_7x7 = conv3d_1a_7x7 208 | self.maxPool3d_2a_3x3 = MaxPool3dTFPadding( 209 | kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME') 210 | # conv conv 211 | conv3d_2b_1x1 = Unit3Dpy( 212 | out_channels=64, 213 | in_channels=64, 214 | kernel_size=(1, 1, 1), 215 | padding='SAME') 216 | self.conv3d_2b_1x1 = conv3d_2b_1x1 217 | conv3d_2c_3x3 = Unit3Dpy( 218 | out_channels=192, 219 | in_channels=64, 220 | kernel_size=(3, 3, 3), 221 | padding='SAME') 222 | self.conv3d_2c_3x3 = conv3d_2c_3x3 #here padding = 1 may influence the result 223 | self.maxPool3d_3a_3x3 = MaxPool3dTFPadding( 224 | kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME') 225 | 226 | # Mixed_3b 227 | self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32]) 228 | self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64]) 229 | 230 | self.maxPool3d_4a_3x3 = MaxPool3dTFPadding( 231 | kernel_size=(3, 3, 3), stride=(2, 2, 2), padding='SAME') 232 | 233 | # Mixed 4 234 | self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64]) 235 | self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64]) 236 | self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64]) 237 | self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64]) 238 | self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128]) 239 | 240 | self.maxPool3d_5a_2x2 = MaxPool3dTFPadding( 241 | kernel_size=(2, 2, 2), stride=(2, 2, 2), padding='SAME') 242 | 243 | # Mixed 5 244 | self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128]) 245 | self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128]) 246 | 247 | self.avg_pool = torch.nn.AvgPool3d((2, 7, 7), (1, 1, 1)) 248 | self.dropout = torch.nn.Dropout(dropout_prob) 249 | self.conv3d_0c_1x1_custom = Unit3Dpy( 250 | in_channels=1024, 251 | out_channels=self.num_classes, 252 | kernel_size=(1, 1, 1), 253 | activation=None, 254 | use_bias=False, 255 | use_bn=False) 256 | self.softmax = torch.nn.Softmax(1) 257 | # ==========two dropout for temporal ensembling========= 258 | self.dropout1 = nn.Dropout(0.3) 259 | self.dropout2 = nn.Dropout(0.3) 260 | 261 | def forward(self, inp): 262 | out = self.conv3d_1a_7x7(inp) 263 | out = self.maxPool3d_2a_3x3(out) 264 | out = self.conv3d_2b_1x1(out) 265 | out = self.conv3d_2c_3x3(out) 266 | out = self.maxPool3d_3a_3x3(out) 267 | out = self.mixed_3b(out) 268 | out = self.mixed_3c(out) 269 | # out = self.s_depend(out) 270 | out = self.maxPool3d_4a_3x3(out) 271 | # out = self.dropout1(out) 272 | out = self.mixed_4b(out) 273 | out = self.mixed_4c(out) 274 | out = self.mixed_4d(out) 275 | out = self.mixed_4e(out) 276 | out = self.mixed_4f(out) 277 | # out = self.m_depend(out) 278 | out = self.maxPool3d_5a_2x2(out) 279 | # out = self.dropout2(out) 280 | out = self.mixed_5b(out) 281 | out = self.mixed_5c(out) 282 | features = out 283 | out = self.avg_pool(out) 284 | out = 
self.dropout(out) 285 | out = self.conv3d_0c_1x1_custom(out) 286 | out = out.squeeze(3) 287 | out = out.squeeze(3) 288 | out = out.mean(2) 289 | out_logits = out 290 | return F.log_softmax(out, dim=1), features 291 | 292 | def load_tf_weights(self, sess): 293 | state_dict = {} 294 | if self.modality == 'rgb': 295 | prefix = 'RGB/inception_i3d' 296 | elif self.modality == 'flow': 297 | prefix = 'Flow/inception_i3d' 298 | load_conv3d(state_dict, 'conv3d_1a_7x7', sess, 299 | os.path.join(prefix, 'Conv3d_1a_7x7')) 300 | load_conv3d(state_dict, 'conv3d_2b_1x1', sess, 301 | os.path.join(prefix, 'Conv3d_2b_1x1')) 302 | load_conv3d(state_dict, 'conv3d_2c_3x3', sess, 303 | os.path.join(prefix, 'Conv3d_2c_3x3')) 304 | 305 | load_mixed(state_dict, 'mixed_3b', sess, 306 | os.path.join(prefix, 'Mixed_3b')) 307 | load_mixed(state_dict, 'mixed_3c', sess, 308 | os.path.join(prefix, 'Mixed_3c')) 309 | load_mixed(state_dict, 'mixed_4b', sess, 310 | os.path.join(prefix, 'Mixed_4b')) 311 | load_mixed(state_dict, 'mixed_4c', sess, 312 | os.path.join(prefix, 'Mixed_4c')) 313 | load_mixed(state_dict, 'mixed_4d', sess, 314 | os.path.join(prefix, 'Mixed_4d')) 315 | load_mixed(state_dict, 'mixed_4e', sess, 316 | os.path.join(prefix, 'Mixed_4e')) 317 | # Here goest to 0.1 max error with tf 318 | load_mixed(state_dict, 'mixed_4f', sess, 319 | os.path.join(prefix, 'Mixed_4f')) 320 | 321 | load_mixed( 322 | state_dict, 323 | 'mixed_5b', 324 | sess, 325 | os.path.join(prefix, 'Mixed_5b'), 326 | fix_typo=True) 327 | load_mixed(state_dict, 'mixed_5c', sess, 328 | os.path.join(prefix, 'Mixed_5c')) 329 | load_conv3d( 330 | state_dict, 331 | 'conv3d_0c_1x1', 332 | sess, 333 | os.path.join(prefix, 'Logits', 'Conv3d_0c_1x1'), 334 | bias=True, 335 | bn=False) 336 | self.load_state_dict(state_dict) 337 | 338 | 339 | def get_conv_params(sess, name, bias=False): 340 | # Get conv weights 341 | conv_weights_tensor = sess.graph.get_tensor_by_name( 342 | os.path.join(name, 'w:0')) 343 | if bias: 344 | conv_bias_tensor = sess.graph.get_tensor_by_name( 345 | os.path.join(name, 'b:0')) 346 | conv_bias = sess.run(conv_bias_tensor) 347 | conv_weights = sess.run(conv_weights_tensor) 348 | conv_shape = conv_weights.shape 349 | 350 | kernel_shape = conv_shape[0:3] 351 | in_channels = conv_shape[3] 352 | out_channels = conv_shape[4] 353 | 354 | conv_op = sess.graph.get_operation_by_name( 355 | os.path.join(name, 'convolution')) 356 | padding_name = conv_op.get_attr('padding') 357 | padding = _get_padding(padding_name, kernel_shape) 358 | all_strides = conv_op.get_attr('strides') 359 | strides = all_strides[1:4] 360 | conv_params = [ 361 | conv_weights, kernel_shape, in_channels, out_channels, strides, padding 362 | ] 363 | if bias: 364 | conv_params.append(conv_bias) 365 | return conv_params 366 | 367 | 368 | def get_bn_params(sess, name): 369 | moving_mean_tensor = sess.graph.get_tensor_by_name( 370 | os.path.join(name, 'moving_mean:0')) 371 | moving_var_tensor = sess.graph.get_tensor_by_name( 372 | os.path.join(name, 'moving_variance:0')) 373 | beta_tensor = sess.graph.get_tensor_by_name(os.path.join(name, 'beta:0')) 374 | moving_mean = sess.run(moving_mean_tensor) 375 | moving_var = sess.run(moving_var_tensor) 376 | beta = sess.run(beta_tensor) 377 | return moving_mean, moving_var, beta 378 | 379 | 380 | def _get_padding(padding_name, conv_shape): 381 | padding_name = padding_name.decode("utf-8") 382 | if padding_name == "VALID": 383 | return [0, 0] 384 | elif padding_name == "SAME": 385 | # return [math.ceil(int(conv_shape[0])/2), 
math.ceil(int(conv_shape[1])/2)] 386 | return [ 387 | math.floor(int(conv_shape[0]) / 2), 388 | math.floor(int(conv_shape[1]) / 2), 389 | math.floor(int(conv_shape[2]) / 2) 390 | ] 391 | else: 392 | raise ValueError('Invalid padding name ' + padding_name) 393 | 394 | 395 | def load_conv3d(state_dict, name_pt, sess, name_tf, bias=False, bn=True): 396 | # Transfer convolution params 397 | conv_name_tf = os.path.join(name_tf, 'conv_3d') 398 | conv_params = get_conv_params(sess, conv_name_tf, bias=bias) 399 | if bias: 400 | conv_weights, kernel_shape, in_channels, out_channels, strides, padding, conv_bias = conv_params 401 | else: 402 | conv_weights, kernel_shape, in_channels, out_channels, strides, padding = conv_params 403 | 404 | conv_weights_rs = np.transpose( 405 | conv_weights, (4, 3, 0, 1, 406 | 2)) # to pt format (out_c, in_c, depth, height, width) 407 | state_dict[name_pt + '.conv3d.weight'] = torch.from_numpy(conv_weights_rs) 408 | if bias: 409 | state_dict[name_pt + '.conv3d.bias'] = torch.from_numpy(conv_bias) 410 | 411 | # Transfer batch norm params 412 | if bn: 413 | conv_tf_name = os.path.join(name_tf, 'batch_norm') 414 | moving_mean, moving_var, beta = get_bn_params(sess, conv_tf_name) 415 | 416 | out_planes = conv_weights_rs.shape[0] 417 | state_dict[name_pt + '.batch3d.weight'] = torch.ones(out_planes) 418 | state_dict[name_pt + 419 | '.batch3d.bias'] = torch.from_numpy(beta.squeeze()) 420 | state_dict[name_pt 421 | + '.batch3d.running_mean'] = torch.from_numpy(moving_mean.squeeze()) 422 | state_dict[name_pt 423 | + '.batch3d.running_var'] = torch.from_numpy(moving_var.squeeze()) 424 | 425 | 426 | def load_mixed(state_dict, name_pt, sess, name_tf, fix_typo=False): 427 | # Branch 0 428 | load_conv3d(state_dict, name_pt + '.branch_0', sess, 429 | os.path.join(name_tf, 'Branch_0/Conv3d_0a_1x1')) 430 | 431 | # Branch .1 432 | load_conv3d(state_dict, name_pt + '.branch_1.0', sess, 433 | os.path.join(name_tf, 'Branch_1/Conv3d_0a_1x1')) 434 | load_conv3d(state_dict, name_pt + '.branch_1.1', sess, 435 | os.path.join(name_tf, 'Branch_1/Conv3d_0b_3x3')) 436 | 437 | # Branch 2 438 | load_conv3d(state_dict, name_pt + '.branch_2.0', sess, 439 | os.path.join(name_tf, 'Branch_2/Conv3d_0a_1x1')) 440 | if fix_typo: 441 | load_conv3d(state_dict, name_pt + '.branch_2.1', sess, 442 | os.path.join(name_tf, 'Branch_2/Conv3d_0a_3x3')) 443 | else: 444 | load_conv3d(state_dict, name_pt + '.branch_2.1', sess, 445 | os.path.join(name_tf, 'Branch_2/Conv3d_0b_3x3')) 446 | 447 | # Branch 3 448 | load_conv3d(state_dict, name_pt + '.branch_3.1', sess, 449 | os.path.join(name_tf, 'Branch_3/Conv3d_0b_1x1')) 450 | 451 | 452 | def weights_init(model): 453 | """ Initializes the weights of the CNN model using the Xavier 454 | initialization. 
455 | """ 456 | if isinstance(model, nn.Conv2d) or isinstance(model, nn.Conv3d) or isinstance(model, nn.Conv1d): 457 | xavier_uniform_(model.weight, gain=math.sqrt(2.0)) 458 | constant_(model.bias, 0.1) 459 | elif isinstance(model, nn.BatchNorm2d) or isinstance(model, nn.BatchNorm1d) or isinstance(model, nn.BatchNorm3d): 460 | normal_(model.weight, 1.0, 0.02) 461 | constant_(model.bias, 0) 462 | # zeros_(model.bias) 463 | -------------------------------------------------------------------------------- /net/mfnet_3d.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Original Author: Yunpeng Chen 4 | https://github.com/cypw/PyTorch-MFNet/blob/master/network/mfnet_3d.py 5 | """ 6 | 7 | from collections import OrderedDict 8 | import torch.nn as nn 9 | 10 | class BN_AC_CONV3D(nn.Module): 11 | 12 | def __init__(self, num_in, num_filter, 13 | kernel=(1,1,1), pad=(0,0,0), stride=(1,1,1), g=1, bias=False): 14 | super(BN_AC_CONV3D, self).__init__() 15 | self.bn = nn.BatchNorm3d(num_in) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.conv = nn.Conv3d(num_in, num_filter, kernel_size=kernel, padding=pad, 18 | stride=stride, groups=g, bias=bias) 19 | 20 | def forward(self, x): 21 | h = self.relu(self.bn(x)) 22 | h = self.conv(h) 23 | return h 24 | 25 | 26 | class MF_UNIT(nn.Module): 27 | 28 | def __init__(self, num_in, num_mid, num_out, g=1, stride=(1,1,1), first_block=False, use_3d=True): 29 | super(MF_UNIT, self).__init__() 30 | num_ix = int(num_mid/4) 31 | kt,pt = (3,1) if use_3d else (1,0) 32 | # prepare input 33 | self.conv_i1 = BN_AC_CONV3D(num_in=num_in, num_filter=num_ix, kernel=(1,1,1), pad=(0,0,0)) 34 | self.conv_i2 = BN_AC_CONV3D(num_in=num_ix, num_filter=num_in, kernel=(1,1,1), pad=(0,0,0)) 35 | # main part 36 | self.conv_m1 = BN_AC_CONV3D(num_in=num_in, num_filter=num_mid, kernel=(kt,3,3), pad=(pt,1,1), stride=stride, g=g) 37 | if first_block: 38 | self.conv_m2 = BN_AC_CONV3D(num_in=num_mid, num_filter=num_out, kernel=(1,1,1), pad=(0,0,0)) 39 | else: 40 | self.conv_m2 = BN_AC_CONV3D(num_in=num_mid, num_filter=num_out, kernel=(1,3,3), pad=(0,1,1), g=g) 41 | # adapter 42 | if first_block: 43 | self.conv_w1 = BN_AC_CONV3D(num_in=num_in, num_filter=num_out, kernel=(1,1,1), pad=(0,0,0), stride=stride) 44 | 45 | def forward(self, x): 46 | 47 | h = self.conv_i1(x) 48 | x_in = x + self.conv_i2(h) 49 | 50 | h = self.conv_m1(x_in) 51 | h = self.conv_m2(h) 52 | 53 | if hasattr(self, 'conv_w1'): 54 | x = self.conv_w1(x) 55 | 56 | return h + x 57 | 58 | 59 | class MFNET_3D(nn.Module): 60 | 61 | def __init__(self, num_classes, dropout=None, pretrained=False, pretrained_model="", **kwargs): 62 | super(MFNET_3D, self).__init__() 63 | 64 | groups = 16 65 | k_sec = { 2: 3, \ 66 | 3: 4, \ 67 | 4: 6, \ 68 | 5: 3 } 69 | 70 | # conv1 - x224 (x16) 71 | conv1_num_out = 16 72 | self.conv1 = nn.Sequential(OrderedDict([ 73 | ('conv', nn.Conv3d( 3, conv1_num_out, kernel_size=(3,5,5), padding=(1,2,2), stride=(1,2,2), bias=False)), 74 | ('bn', nn.BatchNorm3d(conv1_num_out)), 75 | ('relu', nn.ReLU(inplace=True)) 76 | ])) 77 | self.maxpool = nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)) 78 | 79 | # conv2 - x56 (x8) 80 | num_mid = 96 81 | conv2_num_out = 96 82 | self.conv2 = nn.Sequential(OrderedDict([ 83 | ("B%02d"%i, MF_UNIT(num_in=conv1_num_out if i==1 else conv2_num_out, 84 | num_mid=num_mid, 85 | num_out=conv2_num_out, 86 | stride=(2,1,1) if i==1 else (1,1,1), 87 | g=groups, 88 | first_block=(i==1))) for i in 
range(1,k_sec[2]+1) 89 | ])) 90 | 91 | # conv3 - x28 (x8) 92 | num_mid *= 2 93 | conv3_num_out = 2 * conv2_num_out 94 | self.conv3 = nn.Sequential(OrderedDict([ 95 | ("B%02d"%i, MF_UNIT(num_in=conv2_num_out if i==1 else conv3_num_out, 96 | num_mid=num_mid, 97 | num_out=conv3_num_out, 98 | stride=(1,2,2) if i==1 else (1,1,1), 99 | g=groups, 100 | first_block=(i==1))) for i in range(1,k_sec[3]+1) 101 | ])) 102 | 103 | # conv4 - x14 (x8) 104 | num_mid *= 2 105 | conv4_num_out = 2 * conv3_num_out 106 | self.conv4 = nn.Sequential(OrderedDict([ 107 | ("B%02d"%i, MF_UNIT(num_in=conv3_num_out if i==1 else conv4_num_out, 108 | num_mid=num_mid, 109 | num_out=conv4_num_out, 110 | stride=(1,2,2) if i==1 else (1,1,1), 111 | g=groups, 112 | first_block=(i==1))) for i in range(1,k_sec[4]+1) 113 | ])) 114 | 115 | # conv5 - x7 (x8) 116 | num_mid *= 2 117 | conv5_num_out = 2 * conv4_num_out 118 | self.conv5 = nn.Sequential(OrderedDict([ 119 | ("B%02d"%i, MF_UNIT(num_in=conv4_num_out if i==1 else conv5_num_out, 120 | num_mid=num_mid, 121 | num_out=conv5_num_out, 122 | stride=(1,2,2) if i==1 else (1,1,1), 123 | g=groups, 124 | first_block=(i==1))) for i in range(1,k_sec[5]+1) 125 | ])) 126 | 127 | # final 128 | self.tail = nn.Sequential(OrderedDict([ 129 | ('bn', nn.BatchNorm3d(conv5_num_out)), 130 | ('relu', nn.ReLU(inplace=True)) 131 | ])) 132 | 133 | if dropout: 134 | self.globalpool = nn.Sequential(OrderedDict([ 135 | ('avg', nn.AvgPool3d(kernel_size=(8,7,7), stride=(1,1,1))), 136 | ('dropout', nn.Dropout(p=dropout)), 137 | ])) 138 | else: 139 | self.globalpool = nn.Sequential(OrderedDict([ 140 | ('avg', nn.AvgPool3d(kernel_size=(8,7,7), stride=(1,1,1))), 141 | # ('dropout', nn.Dropout(p=0.5)), only for fine-tuning 142 | ])) 143 | self.classifier = nn.Linear(conv5_num_out, num_classes) 144 | 145 | def forward(self, x): 146 | assert x.shape[2] == 16 147 | 148 | h = self.conv1(x) # x224 -> x112 149 | h = self.maxpool(h) # x112 -> x56 150 | 151 | h = self.conv2(h) # x56 -> x56 152 | h = self.conv3(h) # x56 -> x28 153 | h = self.conv4(h) # x28 -> x14 154 | h = self.conv5(h) # x14 -> x7 155 | h = self.tail(h) 156 | layerout = h.detach().cpu() 157 | h = self.globalpool(h) 158 | 159 | h = h.view(h.shape[0], -1) 160 | h = self.classifier(h) 161 | 162 | return h, layerout -------------------------------------------------------------------------------- /net/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2020-09-17 16:02 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : amax_Action_Video_Visualization 8 | # @File : model.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 17 | 18 | from torch import nn 19 | import torch.nn.functional as F 20 | import torch 21 | import numpy as np 22 | 23 | class Flatten(nn.Module): 24 | def __init__(self): 25 | super(Flatten, self).__init__() 26 | 27 | def forward(self, input): 28 | return input.view(input.size(0), -1) 29 | 30 | 31 | class Normalize(nn.Module): 32 | def __init__(self, power=2): 33 | super(Normalize, self).__init__() 34 | self.power = power 35 | 36 | def forward(self, x): 37 | norm = x.pow(self.power).sum(1, keepdim=True).pow(1./self.power) 38 | out = x.div(norm) 39 | return out 40 | 41 | 42 | class Sharpen(nn.Module): 43 | def __init__(self, tempeature=0.5): 44 | 
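        # Temperature sharpening (the operator used for pseudo-label sharpening in
        # MixMatch-style semi-supervised training): each probability is raised to 1/T
        # and the vector is re-normalised, so T < 1 concentrates mass on the likeliest
        # class. Worked example (illustrative, not executed by this module): with T = 0.5,
        # [0.6, 0.4] -> [0.36, 0.16] -> [0.36/0.52, 0.16/0.52] ~= [0.69, 0.31].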
super(Sharpen, self).__init__() 45 | self.T = tempeature 46 | 47 | def forward(self, probabilities): 48 | tempered = torch.pow(probabilities, 1 / self.T) 49 | tempered = tempered / tempered.sum(dim=-1, keepdim=True) 50 | return tempered 51 | 52 | class MotionEnhance(nn.Module): 53 | def __init__(self, beta=1, maxium_radio=0.3): 54 | super(MotionEnhance, self).__init__() 55 | self.beta = beta 56 | self.maxium_radio = maxium_radio 57 | 58 | def forward(self, x): 59 | b, c, t, h, w = x.size() 60 | mean = nn.AdaptiveAvgPool3d((1, h, w))(x) 61 | lam = np.random.beta(self.beta, self.beta) * self.maxium_radio 62 | out = (x - mean * lam) * (1 / (1 - lam)) 63 | return out -------------------------------------------------------------------------------- /net/mp_i3d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-04-27 16:18 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : Action_Video_Visualization 8 | # @File : mp_i3d.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 17 | #!/usr/bin/env python 18 | # -*- coding: utf-8 -*- 19 | """ 20 | # @Time : 2019-04-08 18:32 21 | # @Author : Awiny 22 | # @Site : 23 | # @Project : pytorch_i3d 24 | # @File : multi_path_i3d.py 25 | # @Software: PyCharm 26 | # @Github : https://github.com/FingerRec 27 | # @Blog : http://fingerrec.github.io 28 | """ 29 | import scipy.io 30 | import os 31 | 32 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 33 | import scipy.io 34 | import os 35 | import random 36 | 37 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 38 | #======================================================================================== 39 | #This network is designed to capture different range dependencies and cobine them. 40 | #With dilated conv and downsample, i want to down the number of parameters. 41 | #The network are divided into 3 parllel network. and across information between them. 
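#(in the forward pass the per-path class scores of the three paths are summed by default,
# or concatenated and fused by a small fc layer when concat fusion is enabled; the intended
# per-path inputs and resolutions are listed next)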
42 | #1:64frame input, 56 x 56 input, long range temporal dependencies, call s 43 | #2:16frame input, 112x112, middle range temporal dependencies, call m 44 | #3:4frame input, 224x224, shortest temporal dependencies, call l 45 | #after these network, use tpp to combine them and put it into fc layer 46 | #======================================================================================== 47 | 48 | import torch 49 | import torch.nn as nn 50 | import torch.nn.functional as F 51 | from torch.autograd import Variable 52 | 53 | import numpy as np 54 | import math 55 | from math import exp 56 | import os 57 | import sys 58 | from collections import OrderedDict 59 | 60 | class MaxPool3dSamePadding(nn.MaxPool3d): 61 | 62 | def compute_pad(self, dim, s): 63 | if s % self.stride[dim] == 0: 64 | return max(self.kernel_size[dim] - self.stride[dim], 0) 65 | else: 66 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 67 | 68 | def forward(self, x): 69 | # compute 'same' padding 70 | (batch, channel, t, h, w) = x.size() 71 | # print t,h,w 72 | out_t = np.ceil(float(t) / float(self.stride[0])) 73 | out_h = np.ceil(float(h) / float(self.stride[1])) 74 | out_w = np.ceil(float(w) / float(self.stride[2])) 75 | # print out_t, out_h, out_w 76 | pad_t = self.compute_pad(0, t) 77 | pad_h = self.compute_pad(1, h) 78 | pad_w = self.compute_pad(2, w) 79 | # print pad_t, pad_h, pad_w 80 | 81 | pad_t_f = pad_t // 2 82 | pad_t_b = pad_t - pad_t_f 83 | pad_h_f = pad_h // 2 84 | pad_h_b = pad_h - pad_h_f 85 | pad_w_f = pad_w // 2 86 | pad_w_b = pad_w - pad_w_f 87 | 88 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 89 | # print x.size() 90 | # print pad 91 | x = F.pad(x, pad) 92 | return super(MaxPool3dSamePadding, self).forward(x) 93 | 94 | class Unit3D(nn.Module): 95 | 96 | def __init__(self, in_channels, 97 | output_channels, 98 | kernel_shape=(1, 1, 1), 99 | stride=(1, 1, 1), 100 | padding=0, 101 | dilation=1, 102 | activation_fn=F.relu, 103 | use_batch_norm=True, 104 | use_bias=False, 105 | name='unit_3d'): 106 | 107 | """Initializes Unit3D module.""" 108 | super(Unit3D, self).__init__() 109 | 110 | self._output_channels = output_channels 111 | self._kernel_shape = kernel_shape 112 | self._stride = stride 113 | self._use_batch_norm = use_batch_norm 114 | self._activation_fn = activation_fn 115 | self._use_bias = use_bias 116 | self.name = name 117 | self.padding = padding 118 | 119 | self.conv3d = nn.Conv3d(in_channels=in_channels, 120 | out_channels=self._output_channels, 121 | kernel_size=self._kernel_shape, 122 | stride=self._stride, 123 | dilation=dilation, 124 | padding=0, 125 | # we always want padding to be 0 here. 
We will dynamically pad based on input size in forward function 126 | bias=self._use_bias) 127 | 128 | if self._use_batch_norm: 129 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 130 | 131 | def compute_pad(self, dim, s): 132 | if s % self._stride[dim] == 0: 133 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 134 | else: 135 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 136 | 137 | def forward(self, x): 138 | # compute 'same' padding 139 | (batch, channel, t, h, w) = x.size() 140 | # print t,h,w 141 | out_t = np.ceil(float(t) / float(self._stride[0])) 142 | out_h = np.ceil(float(h) / float(self._stride[1])) 143 | out_w = np.ceil(float(w) / float(self._stride[2])) 144 | # print out_t, out_h, out_w 145 | pad_t = self.compute_pad(0, t) 146 | pad_h = self.compute_pad(1, h) 147 | pad_w = self.compute_pad(2, w) 148 | # print pad_t, pad_h, pad_w 149 | 150 | pad_t_f = pad_t // 2 151 | pad_t_b = pad_t - pad_t_f 152 | pad_h_f = pad_h // 2 153 | pad_h_b = pad_h - pad_h_f 154 | pad_w_f = pad_w // 2 155 | pad_w_b = pad_w - pad_w_f 156 | 157 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 158 | # print x.size() 159 | # print pad 160 | x = F.pad(x, pad) 161 | # print x.size() 162 | 163 | x = self.conv3d(x) 164 | if self._use_batch_norm: 165 | x = self.bn(x) 166 | if self._activation_fn is not None: 167 | x = self._activation_fn(x) 168 | return x 169 | 170 | class TemporalPyramidPool3D_2(nn.Module): 171 | """ 172 | Args: 173 | out_side (tuple): Length of side in the pooling results of each pyramid layer. 174 | 175 | Inputs: 176 | - `input`: the input Tensor to invert ([batch, channel, width, height]) 177 | """ 178 | 179 | def __init__(self, out_side): 180 | super(TemporalPyramidPool3D_2, self).__init__() 181 | self.out_side = out_side 182 | self.out_t = out_side[0] + out_side[1] + out_side[2] 183 | 184 | def forward(self, x): 185 | out = None 186 | for n in self.out_side: 187 | t_r, w_r, h_r = map(lambda s: math.ceil(s / n), x.size()[2:]) # Receptive Field Size 188 | s_t, s_w, s_h = map(lambda s: math.floor(s / n), x.size()[2:]) # Stride 189 | max_pool = nn.MaxPool3d(kernel_size=(t_r, w_r, h_r), stride=(s_t, s_w, s_h)) 190 | y = max_pool(x) 191 | avg_pool = nn.AdaptiveAvgPool3d((y.size(2), 1, 1)) 192 | y = avg_pool(y) 193 | # print(y.size()) 194 | if out is None: 195 | out = y.view(y.size()[0], y.size()[1], -1, 1, 1) 196 | else: 197 | out = torch.cat((out, y.view(y.size()[0], y.size()[1], -1, 1, 1)), 2) 198 | return out 199 | 200 | class TemporalPyramidPool3D(nn.Module): 201 | """ 202 | Args: 203 | out_side (tuple): Length of side in the pooling results of each pyramid layer. 204 | 205 | Inputs: 206 | - `input`: the input Tensor to invert ([batch, channel, width, height]) 207 | """ 208 | 209 | def __init__(self, out_side): 210 | super(TemporalPyramidPool3D, self).__init__() 211 | self.out_side = out_side 212 | 213 | def forward(self, x): 214 | out = None 215 | for n in self.out_side: 216 | avg_pool = nn.AdaptiveMaxPool3d((n, 1, 1)) 217 | y = avg_pool(x) 218 | if out is None: 219 | out = y.view(y.size()[0], y.size()[1], -1, 1, 1) 220 | else: 221 | out = torch.cat((out, y.view(y.size()[0], y.size()[1], -1, 1, 1)), 2) 222 | return out 223 | 224 | class SpatialPyramidPool3D(nn.Module): 225 | """ 226 | Args: 227 | out_side (tuple): Length of side in the pooling results of each pyramid layer. 
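        Example (shape sketch only, assuming out_side=(1, 2, 4)): every pyramid level is
        max-pooled to (batch, channel, 1, n, n) and flattened, so an input of shape
        (b, c, t, h, w) yields an output of shape (b, c, 1, 1 + 4 + 16, 1) = (b, c, 1, 21, 1).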
228 | 229 | Inputs: 230 | - `input`: the input Tensor to invert ([batch, channel, width, height]) 231 | """ 232 | 233 | def __init__(self, out_side): 234 | super(SpatialPyramidPool3D, self).__init__() 235 | self.out_side = out_side 236 | 237 | def forward(self, x): 238 | out = None 239 | for n in self.out_side: 240 | max_pool = nn.AdaptiveMaxPool3d((1, n, n)) 241 | y = max_pool(x) 242 | if out is None: 243 | out = y.view(y.size()[0], y.size()[1], 1, n*n, 1) 244 | else: 245 | out = torch.cat((out, y.view(y.size()[0], y.size()[1], 1, n*n, 1)), 3) 246 | return out 247 | 248 | class InplaceShift(torch.autograd.Function): 249 | # Special thanks to @raoyongming for the help to this function 250 | @staticmethod 251 | def forward(ctx, input, fold): 252 | # not support higher order gradient 253 | # input = input.detach_() 254 | ctx.fold_ = fold 255 | n, t, c, h, w = input.size() 256 | buffer = input.data.new(n, t, fold, h, w).zero_() 257 | buffer[:, :-1] = input.data[:, 1:, :fold] 258 | input.data[:, :, :fold] = buffer 259 | buffer.zero_() 260 | buffer[:, 1:] = input.data[:, :-1, fold: 2 * fold] 261 | input.data[:, :, fold: 2 * fold] = buffer 262 | return input 263 | 264 | @staticmethod 265 | def backward(ctx, grad_output): 266 | # grad_output = grad_output.detach_() 267 | fold = ctx.fold_ 268 | n, t, c, h, w = grad_output.size() 269 | buffer = grad_output.data.new(n, t, fold, h, w).zero_() 270 | buffer[:, 1:] = grad_output.data[:, :-1, :fold] 271 | grad_output.data[:, :, :fold] = buffer 272 | buffer.zero_() 273 | buffer[:, :-1] = grad_output.data[:, 1:, fold: 2 * fold] 274 | grad_output.data[:, :, fold: 2 * fold] = buffer 275 | return grad_output, None 276 | 277 | class TemporalShuffle(nn.Module): 278 | def __init__(self, fold_div=8): 279 | super(TemporalShuffle, self).__init__() 280 | self.fold_div = fold_div 281 | 282 | def forward(self, x): 283 | b, t, c, h, w = x.size() 284 | fold = c // self.fold_div 285 | out = InplaceShift.apply(x, fold) 286 | return out.view(b, t, c, h, w) 287 | 288 | class MultiDependBlock(nn.Module): 289 | def __init__(self, in_channel, out_channel, concat=False, fc=False): 290 | super(MultiDependBlock, self).__init__() 291 | self.out_channel = out_channel 292 | self.channel_compress = Unit3D(in_channels=in_channel, output_channels=out_channel, 293 | kernel_shape=[1, 1, 1], 294 | stride=(1, 1, 1), 295 | padding=0, 296 | activation_fn=None, 297 | use_batch_norm=False, 298 | use_bias=True, 299 | name='channel_compress') 300 | self.long_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 301 | kernel_shape=[2, 1, 1], 302 | stride=(1, 1, 1), 303 | padding=0, 304 | activation_fn=None, 305 | use_batch_norm=False, 306 | use_bias=True, 307 | name='long_range_depen') 308 | self.middle_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 309 | kernel_shape=[2, 1, 1], 310 | stride=(1, 1, 1), 311 | padding=0, 312 | activation_fn=None, 313 | use_batch_norm=False, 314 | use_bias=True, 315 | name='middle_range_depen') 316 | self.small_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 317 | kernel_shape=[2, 1, 1], 318 | stride=(1, 1, 1), 319 | padding=0, 320 | activation_fn=None, 321 | use_batch_norm=False, 322 | use_bias=True, 323 | name='small_range_depen') 324 | self.local_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 325 | kernel_shape=[1, 1, 1], 326 | stride=(1, 1, 1), 327 | padding=0, 328 | activation_fn=None, 329 | use_batch_norm=False, 330 | use_bias=True, 331 | 
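                                        # In forward() these temporal convs see progressively
                                        # denser slices of the spatially pooled clip, from only
                                        # the first and last frame (long range) up to a
                                        # near-dense sampling (local range); in the default
                                        # non-concat path their globally max-pooled responses
                                        # are summed.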
name='local_range_depen') 332 | ''' 333 | self.single_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 334 | kernel_shape=[1, 1, 1], 335 | stride=(1, 1, 1), 336 | padding=0, 337 | activation_fn=None, 338 | use_batch_norm=False, 339 | use_bias=True, 340 | name='single_range_depen') 341 | ''' 342 | self.concat = concat 343 | self.fc = fc 344 | if self.fc: 345 | self.fc_fusion = nn.Sequential( 346 | nn.ReLU(), 347 | nn.Linear(3 * out_channel, 128), 348 | nn.ReLU(), 349 | nn.Linear(128, out_channel), 350 | ) 351 | #self.dropout_probality = 0.05 352 | def forward(self, x): 353 | b, c, t, h, w = x.size() 354 | spatial_pool_x = nn.AdaptiveAvgPool3d((t,1,1))(x)/2 + nn.AdaptiveMaxPool3d((t,1,1))(x)/2 355 | #spatial_pool_x = nn.Dropout(self.dropout_probality)(spatial_pool_x) 356 | spatial_pool_x = self.channel_compress(spatial_pool_x) 357 | long_range_depen = self.long_range_depen(spatial_pool_x[:,:,::(t-1),:,:]) 358 | middle_range_depen = self.middle_range_depen(spatial_pool_x[:,:,::(t-1)//2,:,:]) 359 | small_range_depen = self.small_range_depen(spatial_pool_x[:,:,::(t-1)//4,:,:]) 360 | local_range_depen = self.local_range_depen(spatial_pool_x[:,:,::(t-1)//7,:,:]) 361 | #single_range_depen = self.single_range_depen(spatial_pool_x[:, :, ::1, :, :]) 362 | ''' 363 | long_range_depen = self.long_range_depen(spatial_pool_x[:,:,::7,:,:]) 364 | middle_range_depen = self.middle_range_depen(spatial_pool_x[:,:,::4,:,:]) 365 | small_range_depen = self.small_range_depen(spatial_pool_x[:,:,::2,:,:]) 366 | local_range_depen = self.local_range_depen(spatial_pool_x[:,:,::1,:,:]) 367 | ''' 368 | if self.fc: 369 | out = torch.cat((nn.AdaptiveMaxPool3d((1, 1, 1))(long_range_depen).squeeze(2).squeeze(2).squeeze(2), nn.AdaptiveMaxPool3d((1, 1, 1))(middle_range_depen).squeeze(2).squeeze(2).squeeze(2), nn.AdaptiveMaxPool3d((1, 1, 1))(small_range_depen).squeeze(2).squeeze(2).squeeze(2)), dim = 1) 370 | return self.fc_fusion(out) 371 | elif self.concat: 372 | return torch.cat((nn.AdaptiveMaxPool3d((1, 1, 1))(long_range_depen).squeeze(2).squeeze(2).squeeze(2), nn.AdaptiveMaxPool3d((1, 1, 1))(middle_range_depen).squeeze(2).squeeze(2).squeeze(2), nn.AdaptiveMaxPool3d((1, 1, 1))(small_range_depen).squeeze(2).squeeze(2).squeeze(2)), dim = 1) 373 | else: 374 | return nn.AdaptiveMaxPool3d((1, 1, 1))(long_range_depen).squeeze(2).squeeze(2).squeeze(2) + \ 375 | nn.AdaptiveMaxPool3d((1, 1, 1))(middle_range_depen).squeeze(2).squeeze(2).squeeze(2) + \ 376 | nn.AdaptiveMaxPool3d((1, 1, 1))(small_range_depen).squeeze(2).squeeze(2).squeeze(2) + \ 377 | nn.AdaptiveMaxPool3d((1, 1, 1))(local_range_depen).squeeze(2).squeeze(2).squeeze(2) #+ \ 378 | #nn.AdaptiveMaxPool3d((1, 1, 1))(single_range_depen).squeeze(2).squeeze(2).squeeze(2) 379 | 380 | class TemporalDependBlock(nn.Module): 381 | def __init__(self, in_channel, out_channel): 382 | super(TemporalDependBlock, self).__init__() 383 | self.out_channel = out_channel 384 | self.channel_compress = Unit3D(in_channels=in_channel, output_channels=out_channel, 385 | kernel_shape=[1, 1, 1], 386 | stride=(1, 1, 1), 387 | padding=0, 388 | activation_fn=None, 389 | use_batch_norm=False, 390 | use_bias=True, 391 | name='channel_compress') 392 | self.tpp = TemporalPyramidPool3D((1,2,4,8)) 393 | self.temporal_conv = Unit3D(in_channels=out_channel, output_channels=out_channel, 394 | kernel_shape=[15, 1, 1], 395 | stride=(15, 1, 1), 396 | padding=0, 397 | activation_fn=None, 398 | use_batch_norm=False, 399 | use_bias=True, 400 | name='latter_temporal_conv') 401 | def 
forward(self, x): 402 | b, c, t, h, w = x.size() 403 | compress = self.channel_compress(x) 404 | tpp = self.tpp(compress) 405 | out = self.temporal_conv(tpp) 406 | return out.view(b, out.size(1)) 407 | 408 | class HeavyMultiDependBlock(nn.Module): 409 | def __init__(self, in_channel, out_channel): 410 | super(HeavyMultiDependBlock, self).__init__() 411 | self.out_channel = out_channel 412 | self.channel_compress = Unit3D(in_channels=in_channel, output_channels=out_channel, 413 | kernel_shape=[1, 1, 1], 414 | stride=(1, 1, 1), 415 | padding=0, 416 | activation_fn=None, 417 | use_batch_norm=False, 418 | use_bias=True, 419 | name='channel_compress') 420 | self.long_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 421 | kernel_shape=[2, 1, 1], 422 | stride=(1, 1, 1), 423 | padding=0, 424 | activation_fn=None, 425 | use_batch_norm=False, 426 | use_bias=True, 427 | name='long_range_depen') 428 | self.middle_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 429 | kernel_shape=[2, 1, 1], 430 | stride=(1, 1, 1), 431 | padding=0, 432 | activation_fn=None, 433 | use_batch_norm=False, 434 | use_bias=True, 435 | name='middle_range_depen') 436 | self.small_range_depen = Unit3D(in_channels=out_channel, output_channels=out_channel, 437 | kernel_shape=[2, 1, 1], 438 | stride=(1, 1, 1), 439 | padding=0, 440 | activation_fn=None, 441 | use_batch_norm=False, 442 | use_bias=True, 443 | name='small_range_depen') 444 | self.tpp_1 = TemporalPyramidPool3D((1,2,4)) 445 | self.fusion_1 = Unit3D(in_channels=out_channel, output_channels=out_channel, 446 | kernel_shape=[7, 1, 1], 447 | stride=(7, 1, 1), 448 | padding=0, 449 | activation_fn=None, 450 | use_batch_norm=False, 451 | use_bias=True, 452 | name='long_range_depen') 453 | self.tpp_2 = TemporalPyramidPool3D((1,2,4)) 454 | self.fusion_2 = Unit3D(in_channels=out_channel, output_channels=out_channel, 455 | kernel_shape=[7, 1, 1], 456 | stride=(7, 1, 1), 457 | padding=0, 458 | activation_fn=None, 459 | use_batch_norm=False, 460 | use_bias=True, 461 | name='middle_range_depen') 462 | self.tpp_3 = TemporalPyramidPool3D((1,2,4)) 463 | self.fusion_3 = Unit3D(in_channels=out_channel, output_channels=out_channel, 464 | kernel_shape=[7, 1, 1], 465 | stride=(7, 1, 1), 466 | padding=0, 467 | activation_fn=None, 468 | use_batch_norm=False, 469 | use_bias=True, 470 | name='small_range_depen') 471 | def forward(self, x): 472 | b, c, t, h, w = x.size() 473 | spatial_pool_x = nn.AdaptiveAvgPool3d((t,1,1))(x)/2 + nn.AdaptiveMaxPool3d((t,1,1))(x)/2 474 | spatial_pool_x = self.channel_compress(spatial_pool_x) 475 | long_range_depen = self.long_range_depen(spatial_pool_x[:,:,::4,:,:]) 476 | middle_range_depen = self.middle_range_depen(spatial_pool_x[:,:,::2,:,:]) 477 | small_range_depen = self.small_range_depen(spatial_pool_x[:,:,::1,:,:]) 478 | long_range_depen = self.tpp_1(long_range_depen) 479 | middle_range_depen = self.tpp_2(middle_range_depen) 480 | small_range_depen = self.tpp_3(small_range_depen) 481 | return self.fusion_1(long_range_depen).squeeze(2).squeeze(2).squeeze(2) + self.fusion_2(middle_range_depen).squeeze(2).squeeze(2).squeeze(2) + self.fusion_3(small_range_depen).squeeze(2).squeeze(2).squeeze(2) 482 | 483 | class InceptionModule(nn.Module): 484 | def __init__(self, in_channels, out_channels, name): 485 | super(InceptionModule, self).__init__() 486 | 487 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 488 | name=name + '/Branch_0/Conv3d_0a_1x1') 489 | 
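        # Inception-style branch layout: 1x1, 1x1 -> 3x3x3, 1x1 -> 3x3x3, and max-pool -> 1x1;
        # forward() concatenates the four branches on the channel dim, so the output width is
        # out_channels[0] + out_channels[2] + out_channels[4] + out_channels[5]
        # (e.g. [64, 96, 128, 16, 32, 32] -> 64 + 128 + 32 + 32 = 256, which is why
        # Mixed_3c below is built with 256 input channels).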
self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 490 | name=name + '/Branch_1/Conv3d_0a_1x1') 491 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 492 | name=name + '/Branch_1/Conv3d_0b_3x3') 493 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 494 | name=name + '/Branch_2/Conv3d_0a_1x1') 495 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 496 | name=name + '/Branch_2/Conv3d_0b_3x3') 497 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 498 | stride=(1, 1, 1), padding=0) 499 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, 500 | name=name + '/Branch_3/Conv3d_0b_1x1') 501 | #self.temporal_shift = TemporalShuffle(fold_div=16) 502 | self.name = name 503 | 504 | def forward(self, x): 505 | b0 = self.b0(x) 506 | b1 = self.b1b(self.b1a(x)) 507 | b2 = self.b2b(self.b2a(x)) 508 | b3 = self.b3b(self.b3a(x)) 509 | return torch.cat([b0, b1, b2, b3], dim=1) 510 | """ 511 | out = torch.cat([b0, b1, b2, b3], dim=1) 512 | b, c, t, h, w = x.size() 513 | if t > 16: 514 | ts_1 = self.temporal_shift(out) 515 | return out + ts_1 516 | else: 517 | ''' 518 | tb0 = self.tba(x) 519 | tb1 = self.tbb(tb0) 520 | tb2 = self.tbc(tb1) 521 | ''' 522 | return out 523 | """ 524 | class TemporalInceptionModule(nn.Module): 525 | def __init__(self, in_channels, out_channels, name): 526 | super(TemporalInceptionModule, self).__init__() 527 | 528 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 529 | name=name + '/Branch_0/Conv3d_0a_1x1') 530 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 531 | name=name + '/Branch_1/Conv3d_0a_1x1') 532 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[1, 3, 3], 533 | name=name + '/Branch_1/Conv3d_0b_3x3') 534 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 535 | name=name + '/Branch_2/Conv3d_0a_1x1') 536 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[1, 3, 3], 537 | name=name + '/Branch_2/Conv3d_0b_3x3') 538 | self.b3a = MaxPool3dSamePadding(kernel_size=[1, 3, 3], 539 | stride=(1, 1, 1), padding=0) 540 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, 541 | name=name + '/Branch_3/Conv3d_0b_1x1') 542 | self.name = name 543 | 544 | def forward(self, x): 545 | b0 = self.b0(x) 546 | b1 = self.b1b(self.b1a(x)) 547 | b2 = self.b2b(self.b2a(x)) 548 | b3 = self.b3b(self.b3a(x)) 549 | return torch.cat([b0, b1, b2, b3], dim=1) 550 | 551 | class MultiPathI3d(nn.Module): 552 | def __init__(self, num_classes=400, spatial_squeeze=True, in_channels=3, dropout_prob=0.5): 553 | 554 | super(MultiPathI3d, self).__init__() 555 | self._num_classes = num_classes 556 | self._spatial_squeeze = spatial_squeeze 557 | self.logits = None 558 | 559 | self.Conv3d_1a_7x7 = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], 560 | stride=(2, 2, 2), padding=(3, 3, 3), name='conv3d_1a_7_7') 561 | 562 | self.MaxPool3d_2a_3x3 = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 563 | padding=0) 564 | self.Conv3d_2b_1x1 = 
Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0,name='Conv3d_2b_1x1') 565 | self.Conv3d_2c_3x3 = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, name='Conv3d_2c_3x3') 566 | self.maxpool_1 = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) 567 | self.Mixed_3b = InceptionModule(192, [64, 96, 128, 16, 32, 32], 'Mixed_3b') 568 | self.Mixed_3c = InceptionModule(256, [128, 128, 192, 32, 96, 64], 'Mixed_3c') 569 | self.maxpool_2 = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0) 570 | self.Mixed_4b = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], 'Mixed_4b') 571 | self.Mixed_4c = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], 'Mixed_4c') 572 | self.Mixed_4d = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], 'Mixed_4d') 573 | self.Mixed_4e = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], 'Mixed_4e') 574 | self.Mixed_4f = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], 'Mixed_4f') 575 | self.maxpool_3 = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0) 576 | self.Mixed_5b = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], 'Mixed_5b') 577 | self.Mixed_5c = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], 'Mixed_5c') 578 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], 579 | stride=(1, 1, 1)) 580 | self.dropout = nn.Dropout(dropout_prob) 581 | self.dropout_probality = dropout_prob 582 | 583 | 584 | #=================================Multi Stride Multi Path Compress Network====================== 585 | self.s_depend = MultiDependBlock(480, self._num_classes, concat=False, fc=False) 586 | self.m_depend = MultiDependBlock(832, self._num_classes, concat=False, fc=False) 587 | self.l_depend = MultiDependBlock(1024, self._num_classes, concat=False, fc=False) 588 | self.concat = False 589 | self.fc_fusion = False 590 | if self.concat: 591 | self.fc = nn.Linear(self._num_classes*9, self._num_classes) 592 | def _upsample_add(self, x, y): 593 | '''Upsample and add two feature maps. 594 | Args: 595 | x: (Variable) top feature map to be upsampled. 596 | y: (Variable) lateral feature map. 597 | Returns: 598 | (Variable) added feature map. 599 | Note in PyTorch, when input size is odd, the upsampled feature map 600 | with `F.upsample(..., scale_factor=2, mode='nearest')` 601 | maybe not equal to the lateral feature map size. 602 | e.g. 603 | original input size: [N,_,15,15] -> 604 | conv2d feature map size: [N,_,8,8] -> 605 | upsampled feature map size: [N,_,16,16] 606 | So we choose bilinear upsample which supports arbitrary output sizes. 
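        Here the top map x is trilinearly interpolated to y's (T, H, W) size and the two
        maps are averaged (each scaled by 1/2) rather than simply added; e.g. x of shape
        [N, C, 4, 7, 7] and y of shape [N, C, 8, 14, 14] give a result of shape
        [N, C, 8, 14, 14].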
607 | ''' 608 | _, _, T, H, W = y.size() 609 | return F.interpolate(x, size=(T, H, W), mode='trilinear', align_corners=True)/2 + y/2 610 | #return F.upsample(x, size=(T, H, W), mode='trilinear') + y 611 | 612 | def constrain(self, x): 613 | alpha = 0.2 614 | beta = 5 615 | return 1/(beta+exp(-x)) + alpha 616 | 617 | def forward(self, x): 618 | x = self.Conv3d_1a_7x7(x) 619 | x = self.MaxPool3d_2a_3x3(x) 620 | x = self.Conv3d_2b_1x1(x) 621 | x = self.Conv3d_2c_3x3(x) 622 | x = self.maxpool_1(x) 623 | x = self.Mixed_3b(x) 624 | x = self.Mixed_3c(x) 625 | path_s = x 626 | x = self.maxpool_2(x) 627 | x = self.Mixed_4b(x) 628 | x = self.Mixed_4c(x) 629 | x = self.Mixed_4d(x) 630 | x = self.Mixed_4e(x) 631 | x = self.Mixed_4f(x) 632 | path_m = x 633 | x = self.maxpool_3(x) 634 | x = self.Mixed_5b(x) 635 | x = self.Mixed_5c(x) 636 | path_l = x 637 | plot_s = path_s 638 | plot_m = path_m 639 | plot_l = path_l 640 | path_s = self.s_depend(path_s) 641 | path_m = self.m_depend(path_m) 642 | path_l = self.l_depend(path_l) 643 | #main_path = self.main_depend(x) 644 | main_path = path_m + path_l + path_s 645 | if self.concat: 646 | out = torch.cat((self.s_depend(path_s), self.m_depend(path_m), self.l_depend(path_l)), dim=1) 647 | return self.fc(out) #+ temporal_path 648 | else: 649 | return main_path, plot_s, plot_m, plot_l, path_s, path_m, path_l 650 | -------------------------------------------------------------------------------- /net/r3d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2020-09-17 15:59 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : amax_Action_Video_Visualization 8 | # @File : r3d.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 17 | 18 | """C3D""" 19 | import torch 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | from torch.autograd import Variable 23 | import math 24 | from functools import partial 25 | from net.model import Flatten, Normalize 26 | 27 | __all__ = [ 28 | 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 29 | 'resnet152', 'resnet200' 30 | ] 31 | 32 | 33 | def conv3x3x3(in_planes, out_planes, stride=1): 34 | # 3x3x3 convolution with padding 35 | return nn.Conv3d( 36 | in_planes, 37 | out_planes, 38 | kernel_size=3, 39 | stride=stride, 40 | padding=1, 41 | bias=False) 42 | 43 | 44 | def downsample_basic_block(x, planes, stride): 45 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 46 | zero_pads = torch.Tensor( 47 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 48 | out.size(4)).zero_() 49 | if isinstance(out.data, torch.cuda.FloatTensor): 50 | zero_pads = zero_pads.cuda() 51 | 52 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 53 | 54 | return out 55 | 56 | 57 | class BasicBlock(nn.Module): 58 | expansion = 1 59 | 60 | def __init__(self, inplanes, planes, stride=1, downsample=None): 61 | super(BasicBlock, self).__init__() 62 | self.conv1 = conv3x3x3(inplanes, planes, stride) 63 | self.bn1 = nn.BatchNorm3d(planes) 64 | self.relu = nn.ReLU(inplace=True) 65 | self.conv2 = conv3x3x3(planes, planes) 66 | self.bn2 = nn.BatchNorm3d(planes) 67 | self.downsample = downsample 68 | self.stride = stride 69 | 70 | def forward(self, x): 71 | residual = x 72 | 73 | out = self.conv1(x) 74 | out = self.bn1(out) 75 | out 
= self.relu(out) 76 | 77 | out = self.conv2(out) 78 | out = self.bn2(out) 79 | 80 | if self.downsample is not None: 81 | residual = self.downsample(x) 82 | 83 | out += residual 84 | out = self.relu(out) 85 | 86 | return out 87 | 88 | 89 | class Bottleneck(nn.Module): 90 | expansion = 4 91 | 92 | def __init__(self, inplanes, planes, stride=1, downsample=None): 93 | super(Bottleneck, self).__init__() 94 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 95 | self.bn1 = nn.BatchNorm3d(planes) 96 | self.conv2 = nn.Conv3d( 97 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 98 | self.bn2 = nn.BatchNorm3d(planes) 99 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 100 | self.bn3 = nn.BatchNorm3d(planes * 4) 101 | self.relu = nn.ReLU(inplace=True) 102 | self.downsample = downsample 103 | self.stride = stride 104 | 105 | def forward(self, x): 106 | residual = x 107 | 108 | out = self.conv1(x) 109 | out = self.bn1(out) 110 | out = self.relu(out) 111 | 112 | out = self.conv2(out) 113 | out = self.bn2(out) 114 | out = self.relu(out) 115 | 116 | out = self.conv3(out) 117 | out = self.bn3(out) 118 | 119 | if self.downsample is not None: 120 | residual = self.downsample(x) 121 | 122 | out += residual 123 | out = self.relu(out) 124 | 125 | return out 126 | 127 | 128 | class ResNet(nn.Module): 129 | def __init__(self, 130 | block, 131 | layers, 132 | sample_size=224, 133 | sample_duration=16, 134 | shortcut_type='B', 135 | num_classes=400, 136 | with_classifier=True): 137 | self.inplanes = 64 138 | super(ResNet, self).__init__() 139 | self.conv1 = nn.Conv3d( 140 | 3, 141 | 64, 142 | kernel_size=7, 143 | stride=(1, 2, 2), 144 | padding=(3, 3, 3), 145 | bias=False) 146 | self.bn1 = nn.BatchNorm3d(64) 147 | self.relu = nn.ReLU(inplace=True) 148 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 149 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 150 | self.layer2 = self._make_layer( 151 | block, 128, layers[1], shortcut_type, stride=2) 152 | self.layer3 = self._make_layer( 153 | block, 256, layers[2], shortcut_type, stride=2) 154 | self.layer4 = self._make_layer( 155 | block, 512, layers[3], shortcut_type, stride=2) 156 | last_duration = int(math.ceil(sample_duration / 16)) 157 | last_size = int(math.ceil(sample_size / 32)) 158 | self.with_classifier = with_classifier 159 | if with_classifier: 160 | # self.avgpool_custom = nn.AvgPool3d( 161 | # (1, last_size, last_size), stride=1) 162 | # self.cp = nn.Conv3d(in_channels=512 * block.expansion, out_channels=num_classes, 163 | # kernel_size=(last_duration, 1, 1), bias=False) 164 | self.avgpool = nn.AdaptiveAvgPool3d(1) 165 | self.fc = nn.Linear(512 * block.expansion, num_classes) 166 | else: 167 | self.id_head = nn.Sequential( 168 | torch.nn.AdaptiveAvgPool3d((1, 1, 1)), 169 | Flatten(), 170 | torch.nn.Linear(512, 128), 171 | Normalize(2) 172 | ) 173 | self.cls_head = nn.Sequential(torch.nn.AdaptiveAvgPool3d(1), 174 | Flatten(), 175 | torch.nn.Linear(512, 200) 176 | ) 177 | self.feature_head = nn.Sequential(torch.nn.AdaptiveAvgPool3d(1), 178 | Flatten(), 179 | Normalize(2) 180 | ) 181 | for m in self.modules(): 182 | if isinstance(m, nn.Conv3d): 183 | m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') 184 | elif isinstance(m, nn.BatchNorm3d): 185 | m.weight.data.fill_(1) 186 | m.bias.data.zero_() 187 | 188 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 189 | downsample = None 190 | if stride != 1 or self.inplanes != 
planes * block.expansion: 191 | if shortcut_type == 'A': 192 | downsample = partial( 193 | downsample_basic_block, 194 | planes=planes * block.expansion, 195 | stride=stride) 196 | else: 197 | downsample = nn.Sequential( 198 | nn.Conv3d( 199 | self.inplanes, 200 | planes * block.expansion, 201 | kernel_size=1, 202 | stride=stride, 203 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 204 | 205 | layers = [] 206 | layers.append(block(self.inplanes, planes, stride, downsample)) 207 | self.inplanes = planes * block.expansion 208 | for i in range(1, blocks): 209 | layers.append(block(self.inplanes, planes)) 210 | 211 | return nn.Sequential(*layers) 212 | 213 | def forward(self, x, return_conv=False): 214 | x = self.conv1(x) 215 | x = self.bn1(x) 216 | x = self.relu(x) 217 | x = self.maxpool(x) 218 | 219 | x = self.layer1(x) 220 | x = self.layer2(x) 221 | x = self.layer3(x) 222 | x = self.layer4(x) 223 | feature = x 224 | x = self.avgpool(x) 225 | # x = self.cp(self.avgpool_custom(x)) 226 | x = x.squeeze(3).squeeze(3).mean(2) 227 | x = self.fc(x) 228 | return F.log_softmax(x, dim=1), feature 229 | 230 | 231 | def get_fine_tuning_parameters(model, ft_begin_index): 232 | if ft_begin_index == 0: 233 | return model.parameters() 234 | 235 | ft_module_names = [] 236 | for i in range(ft_begin_index, 5): 237 | ft_module_names.append('layer{}'.format(i)) 238 | ft_module_names.append('fc') 239 | 240 | parameters = [] 241 | for k, v in model.named_parameters(): 242 | for ft_module in ft_module_names: 243 | if ft_module in k: 244 | parameters.append({'params': v}) 245 | break 246 | else: 247 | parameters.append({'params': v, 'lr': 0.0}) 248 | 249 | return parameters 250 | 251 | 252 | def resnet10(**kwargs): 253 | """Constructs a ResNet-18 model. 254 | """ 255 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 256 | return model 257 | 258 | 259 | def resnet18(**kwargs): 260 | """Constructs a ResNet-18 model. 261 | """ 262 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 263 | return model 264 | 265 | 266 | def resnet34(**kwargs): 267 | """Constructs a ResNet-34 model. 268 | """ 269 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 270 | return model 271 | 272 | 273 | def resnet50(**kwargs): 274 | """Constructs a ResNet-50 model. 275 | """ 276 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 277 | return model 278 | 279 | 280 | def resnet101(**kwargs): 281 | """Constructs a ResNet-101 model. 282 | """ 283 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 284 | return model 285 | 286 | 287 | def resnet152(**kwargs): 288 | """Constructs a ResNet-101 model. 289 | """ 290 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 291 | return model 292 | 293 | 294 | def resnet200(**kwargs): 295 | """Constructs a ResNet-101 model. 
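    Example (illustrative): model = resnet200(num_classes=101); calling model(clip) on a
    clip tensor such as torch.randn(1, 3, 16, 112, 112) returns a tuple of per-class
    log-probabilities and the feature map of the last residual stage.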
296 | """ 297 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 298 | return model -------------------------------------------------------------------------------- /output/imgs/79/focusmap_000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/focusmap_000.png -------------------------------------------------------------------------------- /output/imgs/79/focusmap_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/focusmap_001.png -------------------------------------------------------------------------------- /output/imgs/79/focusmap_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/focusmap_002.png -------------------------------------------------------------------------------- /output/imgs/79/focusmap_003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/focusmap_003.png -------------------------------------------------------------------------------- /output/imgs/79/focusmap_004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/focusmap_004.png -------------------------------------------------------------------------------- /output/imgs/79/focusmap_005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/focusmap_005.png -------------------------------------------------------------------------------- /output/imgs/79/focusmap_006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/focusmap_006.png -------------------------------------------------------------------------------- /output/imgs/79/heatmap_000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/heatmap_000.png -------------------------------------------------------------------------------- /output/imgs/79/heatmap_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/heatmap_001.png -------------------------------------------------------------------------------- /output/imgs/79/heatmap_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/heatmap_002.png -------------------------------------------------------------------------------- /output/imgs/79/heatmap_003.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/heatmap_003.png -------------------------------------------------------------------------------- /output/imgs/79/heatmap_004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/heatmap_004.png -------------------------------------------------------------------------------- /output/imgs/79/heatmap_005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/heatmap_005.png -------------------------------------------------------------------------------- /output/imgs/79/heatmap_006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/imgs/79/heatmap_006.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/000.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/001.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/002.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/003.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/004.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/005.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/006.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/006.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/007.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/008.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/009.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/010.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/011.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/012.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/013.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/014.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/focusmap/015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/focusmap/015.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/000.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/000.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/001.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/002.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/003.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/004.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/005.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/006.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/007.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/008.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/009.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/010.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/010.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/011.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/012.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/013.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/014.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/heatmap/015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/ucf101_test_1/0/heatmap/015.png -------------------------------------------------------------------------------- /output/ucf101_test_1/0/info.txt: -------------------------------------------------------------------------------- 1 | Visualizing for class 0 2 | Predicted class 0 3 | Visualizing for class 0 4 | Predicted class 0 5 | -------------------------------------------------------------------------------- /output/video/label_0.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/video/label_0.mp4 -------------------------------------------------------------------------------- /output/video/label_28.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/video/label_28.mp4 -------------------------------------------------------------------------------- /output/video/label_471.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/output/video/label_471.mp4 -------------------------------------------------------------------------------- /process_all_hmdb51_videos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-04-27 23:25 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : amax_Action_Video_Visualization 8 | # @File : 
process_all_hmdb51_videos.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | from main import heat_map_api 16 | import time 17 | import datetime 18 | 19 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 20 | videos_dir = "/data1/DataSet/Hmdb51/hmdb51_mpeg/" 21 | output_dirs = "output/self_supervised_fine_tune/" 22 | frames_num = 16 23 | clip_steps = 8 24 | classes_list = "resources/hmdb51_classInd.txt" 25 | 26 | 27 | classes = {} 28 | with open(classes_list) as f: 29 | for line in f.readlines(): 30 | info = line.strip().split(' ') 31 | classes[info[1]] = int(info[0]) 32 | count = 0 33 | videos_num = 7000 34 | begin =time.time() 35 | for dir in os.listdir(videos_dir): 36 | for video in os.listdir(os.path.join(videos_dir,dir)): 37 | count += 1 38 | video_path = os.path.join(videos_dir, dir, video) 39 | label = classes[dir] 40 | output_dir = os.path.join(output_dirs, dir, video.split('.')[0]) 41 | if not os.path.exists(os.path.join(output_dirs, dir)): 42 | os.mkdir(os.path.join(output_dirs, dir)) 43 | if not os.path.exists(output_dir): 44 | os.mkdir(output_dir) 45 | else: 46 | continue 47 | try: 48 | heat_map_api(video_path, frames_num, clip_steps, output_dir, label, classes_list) 49 | except TypeError: 50 | print("video not found ") 51 | continue 52 | end = time.time() 53 | # datetime.datetime.fromtimestamp(1421077403.0) 54 | # print("have processed {}/{} videos, left time: {}".format(count, videos_num, (end-begin)/count*(videos_num-count))) 55 | print("have processed {}/{} videos, will be finished in: {}".format(count, videos_num, 56 | datetime.datetime.fromtimestamp(time.time() + (end - begin) / count * (videos_num - count)))) 57 | -------------------------------------------------------------------------------- /resources/HMDB_snapshot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/HMDB_snapshot1.png -------------------------------------------------------------------------------- /resources/HMDB_snapshot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/HMDB_snapshot2.png -------------------------------------------------------------------------------- /resources/classInd.txt: -------------------------------------------------------------------------------- 1 | 1 ApplyEyeMakeup 2 | 2 ApplyLipstick 3 | 3 Archery 4 | 4 BabyCrawling 5 | 5 BalanceBeam 6 | 6 BandMarching 7 | 7 BaseballPitch 8 | 8 Basketball 9 | 9 BasketballDunk 10 | 10 BenchPress 11 | 11 Biking 12 | 12 Billiards 13 | 13 BlowDryHair 14 | 14 BlowingCandles 15 | 15 BodyWeightSquats 16 | 16 Bowling 17 | 17 BoxingPunchingBag 18 | 18 BoxingSpeedBag 19 | 19 BreastStroke 20 | 20 BrushingTeeth 21 | 21 CleanAndJerk 22 | 22 CliffDiving 23 | 23 CricketBowling 24 | 24 CricketShot 25 | 25 CuttingInKitchen 26 | 26 Diving 27 | 27 Drumming 28 | 28 Fencing 29 | 29 FieldHockeyPenalty 30 | 30 FloorGymnastics 31 | 31 FrisbeeCatch 32 | 32 FrontCrawl 33 | 33 GolfSwing 34 | 34 Haircut 35 | 35 Hammering 36 | 36 HammerThrow 37 | 37 HandstandPushups 38 | 38 HandstandWalking 39 | 39 HeadMassage 40 | 40 HighJump 41 | 41 HorseRace 42 | 42 HorseRiding 43 | 43 HulaHoop 44 | 44 IceDancing 45 | 45 JavelinThrow 46 | 46 JugglingBalls 47 | 
47 JumpingJack 48 | 48 JumpRope 49 | 49 Kayaking 50 | 50 Knitting 51 | 51 LongJump 52 | 52 Lunges 53 | 53 MilitaryParade 54 | 54 Mixing 55 | 55 MoppingFloor 56 | 56 Nunchucks 57 | 57 ParallelBars 58 | 58 PizzaTossing 59 | 59 PlayingCello 60 | 60 PlayingDaf 61 | 61 PlayingDhol 62 | 62 PlayingFlute 63 | 63 PlayingGuitar 64 | 64 PlayingPiano 65 | 65 PlayingSitar 66 | 66 PlayingTabla 67 | 67 PlayingViolin 68 | 68 PoleVault 69 | 69 PommelHorse 70 | 70 PullUps 71 | 71 Punch 72 | 72 PushUps 73 | 73 Rafting 74 | 74 RockClimbingIndoor 75 | 75 RopeClimbing 76 | 76 Rowing 77 | 77 SalsaSpin 78 | 78 ShavingBeard 79 | 79 Shotput 80 | 80 SkateBoarding 81 | 81 Skiing 82 | 82 Skijet 83 | 83 SkyDiving 84 | 84 SoccerJuggling 85 | 85 SoccerPenalty 86 | 86 StillRings 87 | 87 SumoWrestling 88 | 88 Surfing 89 | 89 Swing 90 | 90 TableTennisShot 91 | 91 TaiChi 92 | 92 TennisSwing 93 | 93 ThrowDiscus 94 | 94 TrampolineJumping 95 | 95 Typing 96 | 96 UnevenBars 97 | 97 VolleyballSpiking 98 | 98 WalkingWithDog 99 | 99 WallPushups 100 | 100 WritingOnBoard 101 | 101 YoYo 102 | -------------------------------------------------------------------------------- /resources/focusimg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/focusimg_1.png -------------------------------------------------------------------------------- /resources/heatmap_000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/heatmap_000.png -------------------------------------------------------------------------------- /resources/heatmap_000_sc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/heatmap_000_sc.png -------------------------------------------------------------------------------- /resources/heatmap_003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/heatmap_003.png -------------------------------------------------------------------------------- /resources/heatmap_003_sc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/heatmap_003_sc.png -------------------------------------------------------------------------------- /resources/heatmap_007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/heatmap_007.png -------------------------------------------------------------------------------- /resources/heatmap_007_sc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/heatmap_007_sc.png -------------------------------------------------------------------------------- /resources/heatmap_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/heatmap_1.png -------------------------------------------------------------------------------- /resources/hmdb51_classInd.txt: -------------------------------------------------------------------------------- 1 | 1 brush_hair 2 | 2 cartwheel 3 | 3 catch 4 | 4 chew 5 | 5 clap 6 | 6 climb 7 | 7 climb_stairs 8 | 8 dive 9 | 9 draw_sword 10 | 10 dribble 11 | 11 drink 12 | 12 eat 13 | 13 fall_floor 14 | 14 fencing 15 | 15 flic_flac 16 | 16 golf 17 | 17 handstand 18 | 18 hit 19 | 19 hug 20 | 20 jump 21 | 21 kick 22 | 22 kick_ball 23 | 23 kiss 24 | 24 laugh 25 | 25 pick 26 | 26 pour 27 | 27 pullup 28 | 28 punch 29 | 29 push 30 | 30 pushup 31 | 31 ride_bike 32 | 32 ride_horse 33 | 33 run 34 | 34 shake_hands 35 | 35 shoot_ball 36 | 36 shoot_bow 37 | 37 shoot_gun 38 | 38 sit 39 | 39 situp 40 | 40 smile 41 | 41 smoke 42 | 42 somersault 43 | 43 stand 44 | 44 swing_baseball 45 | 45 sword_exercise 46 | 46 sword 47 | 47 talk 48 | 48 throw 49 | 49 turn 50 | 50 walk 51 | 51 wave 52 | -------------------------------------------------------------------------------- /resources/supervised.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/supervised.gif -------------------------------------------------------------------------------- /resources/unsupervised.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/resources/unsupervised.gif -------------------------------------------------------------------------------- /scripts/c3d_unsupervised_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python main.py --num_classes 51 \ 3 | --arch c3d \ 4 | --classes_list resources/hmdb51_classInd.txt \ 5 | --model_weights pretrained_model/c3d-pretrained.pth \ 6 | --video test_videos/punch_28.mp4 \ 7 | --frames_num 16 --label 28 --clip_steps 8 \ 8 | --output_dir output --gpus 1 --supervised unsupervised -------------------------------------------------------------------------------- /scripts/demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python main.py --num_classes 101 \ 3 | --arch mf_net \ 4 | --classes_list resources/classInd.txt \ 5 | --model_weights pretrained_model/MFNet3D_UCF-101_Split-1_96.3.pth \ 6 | --video test_videos/v_ApplyEyeMakeup_g01_c01.avi \ 7 | --frames_num 16 --label 0 --clip_steps 16 \ 8 | --output_dir output -------------------------------------------------------------------------------- /scripts/i3d_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python main.py --num_classes 51 \ 3 | --arch i3d \ 4 | --classes_list resources/hmdb51_classInd.txt \ 5 | --model_weights pretrained_model/hmdb51_rgb_gl_randomrotation_3flip_mixup_way2_1loss_stride_1_12_26_checkpoint_37.77.pth.tar \ 6 | --video test_videos/punch_28.mp4 \ 7 | --frames_num 16 --label 28 --clip_steps 16 \ 8 | --output_dir output --gpus 1 -------------------------------------------------------------------------------- /scripts/i3d_mixup_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 
2 | for MIXUP_TYPE in {1..9..2} 3 | do 4 | python main.py --num_classes 51 \ 5 | --arch i3d \ 6 | --classes_list resources/hmdb51_classInd.txt \ 7 | --model_weights pretrained_model/31.372_i3dpt_rgb_model_best.pth.tar \ 8 | --video test_videos/drive_0.$MIXUP_TYPE.mp4 \ 9 | --frames_num 16 --label 111$MIXUP_TYPE --clip_steps 4 \ 10 | --output_dir output --gpus 1 --supervised self_supervised 11 | done 12 | for MIXUP_TYPE in {1..9..2} 13 | do 14 | python main.py --num_classes 51 \ 15 | --arch i3d \ 16 | --classes_list resources/hmdb51_classInd.txt \ 17 | --model_weights pretrained_model/36.209_i3dpt_rgb_model_best.pth.tar \ 18 | --video test_videos/drive_0.$MIXUP_TYPE.mp4 \ 19 | --frames_num 16 --label 112$MIXUP_TYPE --clip_steps 4 \ 20 | --output_dir output --gpus 1 --supervised self_supervised 21 | done -------------------------------------------------------------------------------- /scripts/i3d_rotate_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # scratch 3 | for ROTATE_TYPE in {0..15} 4 | do 5 | echo "$ROTATE_TYPE / 16 finished" 6 | python main.py --num_classes 51 \ 7 | --arch i3d \ 8 | --classes_list resources/hmdb51_classInd.txt \ 9 | --model_weights pretrained_model/31.372_i3dpt_rgb_model_best.pth.tar \ 10 | --video test_videos/shoot_gun_r_type$ROTATE_TYPE.mp4 \ 11 | --frames_num 16 --label 52$ROTATE_TYPE --clip_steps 8 \ 12 | --output_dir output --gpus 1 --supervised self_supervised 13 | done 14 | 15 | # self-supervised 16 | for ROTATE_TYPE in {0..15} 17 | do 18 | echo "$ROTATE_TYPE / 16 finished" 19 | python main.py --num_classes 51 \ 20 | --arch i3d \ 21 | --classes_list resources/hmdb51_classInd.txt \ 22 | --model_weights pretrained_model/36.209_i3dpt_rgb_model_best.pth.tar \ 23 | --video test_videos/shoot_gun_r_type$ROTATE_TYPE.mp4 \ 24 | --frames_num 16 --label 53$ROTATE_TYPE --clip_steps 8 \ 25 | --output_dir output --gpus 1 --supervised self_supervised 26 | done -------------------------------------------------------------------------------- /scripts/i3d_unsupervised_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python main.py --num_classes 51 \ 3 | --arch i3d \ 4 | --classes_list resources/hmdb51_classInd.txt \ 5 | --model_weights pretrained_model/77.254_mpi3d_rgb_model_best.pth.tar \ 6 | --video test_videos/punch_28.mp4 \ 7 | --frames_num 16 --label 28 --clip_steps 16 \ 8 | --output_dir output --gpus 1 --supervised unsupervised -------------------------------------------------------------------------------- /scripts/mpi3d_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python main.py --num_classes 51 \ 3 | --arch mpi3d \ 4 | --classes_list resources/hmdb51_classInd.txt \ 5 | --model_weights pretrained_model/77.254_mpi3d_rgb_model_best.pth.tar \ 6 | --video test_videos/punch_28.mp4 \ 7 | --frames_num 64 --label 28 --clip_steps 1 \ 8 | --output_dir output --gpus 2 -------------------------------------------------------------------------------- /scripts/r3d_unsupervised_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python main.py --num_classes 51 \ 3 | --arch r3d \ 4 | --classes_list resources/hmdb51_classInd.txt \ 5 | --model_weights pretrained_model/r3d50_K_200ep.pth \ 6 | --video test_videos/punch_28.mp4 \ 7 | --frames_num 16 --label 28 --clip_steps 8 \ 8 | --output_dir output --gpus 1 
--supervised unsupervised -------------------------------------------------------------------------------- /test_videos/50_FIRST_DATES_drink_u_nm_np1_fr_goo_29.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/50_FIRST_DATES_drink_u_nm_np1_fr_goo_29.mp4 -------------------------------------------------------------------------------- /test_videos/BASE_Jumping_Compilation_-_Brilliant_dive_f_cm_np1_le_bad_3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/BASE_Jumping_Compilation_-_Brilliant_dive_f_cm_np1_le_bad_3.mp4 -------------------------------------------------------------------------------- /test_videos/BaseballSwingAnalysis_swing_baseball_u_nm_np1_ba_med_0.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/BaseballSwingAnalysis_swing_baseball_u_nm_np1_ba_med_0.mp4 -------------------------------------------------------------------------------- /test_videos/Bodenturnen_im_sportunterricht_handstand_f_cm_np1_le_med_1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/Bodenturnen_im_sportunterricht_handstand_f_cm_np1_le_med_1.mp4 -------------------------------------------------------------------------------- /test_videos/Bruno_Walks_up_Stairs_-_Chicago_Dog_Training_-_We_can_teach_ANYTHING_to_a_dog!!!!_climb_stairs_f_cm_np1_fr_med_0.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/Bruno_Walks_up_Stairs_-_Chicago_Dog_Training_-_We_can_teach_ANYTHING_to_a_dog!!!!_climb_stairs_f_cm_np1_fr_med_0.mp4 -------------------------------------------------------------------------------- /test_videos/DefensivePistolShootingTechniques_shoot_gun_f_nm_np1_fr_med_3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/DefensivePistolShootingTechniques_shoot_gun_f_nm_np1_fr_med_3.mp4 -------------------------------------------------------------------------------- /test_videos/Documentario_Le_Parkour_Londrina_jump_f_cm_np1_ri_bad_6.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/Documentario_Le_Parkour_Londrina_jump_f_cm_np1_ri_bad_6.mp4 -------------------------------------------------------------------------------- /test_videos/v_ApplyEyeMakeup_g01_c01.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/v_ApplyEyeMakeup_g01_c01.avi -------------------------------------------------------------------------------- /test_videos/v_HeadMassage_g02_c05.avi: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/3DNet_Visualization/974aef9536cd89b6fc15feba8d73dc54ad193114/test_videos/v_HeadMassage_g02_c05.avi -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2019-03-17 11:55 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : Action_Video_Visualization 8 | # @File : util.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 17 | 18 | import cv2 19 | import numpy as np 20 | import torch 21 | import skvideo.io 22 | 23 | def video_frame_count(video_path): 24 | cap = cv2.VideoCapture(video_path) 25 | if not cap.isOpened(): 26 | print("could not open: ", video_path) 27 | return -1 28 | length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 29 | width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) ) 30 | height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) ) 31 | return length, width, height 32 | 33 | 34 | def visualization(video_path, fps=30): 35 | cap = cv2.VideoCapture(video_path) 36 | while cap.isOpened(): 37 | ret, frame = cap.read() 38 | gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) 39 | 40 | cv2.imshow('frame', gray) 41 | if cv2.waitKey(1000 / fps) & 0xFF == ord('q'): 42 | break 43 | 44 | cap.release() 45 | cv2.destroyAllWindows() 46 | 47 | 48 | def save_as_video(output_dir, frames, label): 49 | # save video 50 | if not os.path.exists(output_dir + '/video'): 51 | os.makedirs(output_dir + '/video') 52 | # print(output_dir + '/video') 53 | # print(skvideo._FFMPEG_SUPPORTED_ENCODERS) 54 | output_path = '{}/video/label_{}.mp4'.format(output_dir, label) 55 | writer = skvideo.io.FFmpegWriter(output_path, 56 | outputdict={'-b': '300000000'}) 57 | # writer.set(cv2.CAP_PROP_FRAME_WIDTH, 224) 58 | # writer.set(cv2.CAP_PROP_FRAME_HEIGHT, 448) 59 | for frame in frames: 60 | new_frame = cv2.cvtColor(np.uint8(frame), cv2.COLOR_BGR2RGB) 61 | writer.writeFrame(new_frame) 62 | writer.close() 63 | print('The video result has been saved in {}.'.format(output_dir+'/video')) 64 | return output_dir + '/video' 65 | 66 | def save_as_imgs(output_dir, frames, frames_num, label, prefix='heatmap_'): 67 | #save imgs 68 | if not os.path.exists(output_dir + '/imgs/' + str(label)): 69 | os.makedirs(output_dir + '/imgs/' + str(label)) 70 | for i in range(frames_num): 71 | cv2.imwrite(os.path.join(output_dir + '/imgs/' + str(label), prefix + '{:03d}.png'.format(i)), frames[i]) 72 | print('These images has been saved in {}.'.format(output_dir + '/imgs')) 73 | return output_dir + '/imgs' 74 | 75 | 76 | def center_crop(data, tw=224, th=224): 77 | h, w, c = data.shape 78 | x1 = int(round((w - tw) / 2.)) 79 | y1 = int(round((h - th) / 2.)) 80 | cropped_data = data[y1:(y1 + th), x1:(x1 + tw), :] 81 | return cropped_data 82 | 83 | 84 | def load_images(frame_dir, selected_frames): 85 | images = np.zeros((16, 224, 224, 3)) 86 | orig_imgs = np.zeros_like(images) 87 | for i, frame_name in enumerate(selected_frames): 88 | im_name = os.path.join(frame_dir, frame_name) 89 | next_image = cv2.imread(im_name, cv2.IMREAD_COLOR) 90 | scaled_img = cv2.resize(next_image, (256, 256), interpolation=cv2.INTER_LINEAR) # resize to 256x256 91 | cropped_img = center_crop(scaled_img) 
# center crop 224x224 92 | final_img = cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB) 93 | images[i] = final_img 94 | orig_imgs[i] = cropped_img 95 | 96 | torch_imgs = torch.from_numpy(images.transpose(3, 0, 1, 2)) 97 | torch_imgs = torch_imgs.float() / 255.0 98 | mean_3d = [124 / 255, 117 / 255, 104 / 255] 99 | std_3d = [0.229, 0.224, 0.225] 100 | for t, m, s in zip(torch_imgs, mean_3d, std_3d): 101 | t.sub_(m).div_(s) 102 | return np.expand_dims(orig_imgs, 0), torch_imgs.unsqueeze(0) 103 | 104 | def put_text(img, text, position, scale_factor=0.4): 105 | t_w, t_h = cv2.getTextSize( 106 | text, cv2.FONT_HERSHEY_TRIPLEX, scale_factor, thickness=1)[0] 107 | H, W, _ = img.shape 108 | position = (int(W * position[1] - t_w * 0.5), int(H * position[0] - t_h * 0.5)) 109 | params = (position, cv2.FONT_HERSHEY_TRIPLEX, scale_factor, 110 | (255,255,255)) 111 | cv2.putText(img, text, *params) -------------------------------------------------------------------------------- /utils/gen_new_video.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import skvideo.io 3 | import numpy as np 4 | 5 | 6 | def mixup_data(x, loss_prob): 7 | num = len(x) 8 | img_index = np.random.randint(num) 9 | mixed_x = x 10 | for i in range(num): 11 | mixed_x[i] = (1-loss_prob) * x[i] + loss_prob * x[img_index] 12 | return mixed_x 13 | 14 | 15 | def read_video(video): 16 | cap = cv2.VideoCapture(video) 17 | frames = list() 18 | while True: 19 | ret, frame = cap.read() 20 | if type(frame) is type(None): 21 | break 22 | else: 23 | frames.append(frame) 24 | return frames 25 | 26 | 27 | def write_video(name, frames): 28 | writer = skvideo.io.FFmpegWriter(name, 29 | outputdict={'-b': '300000000'}) 30 | for frame in frames: 31 | writer.writeFrame(frame) 32 | writer.close() 33 | return 1 34 | 35 | 36 | if __name__ == '__main__': 37 | video = 'test_videos/drive.mp4' 38 | for i in range(1, 11, 2): 39 | prob = i / 10 40 | seqs = read_video(video) 41 | seqs = mixup_data(seqs, prob) 42 | name = 'test_videos/drive_{}.mp4'.format(prob) 43 | write_video(name, seqs) 44 | -------------------------------------------------------------------------------- /utils/gen_rotation_data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import skvideo.io 3 | import numpy as np 4 | 5 | 6 | def rotation_data(x, r_type): 7 | """ 8 | 9 | :param x: 10 | :param r_type: 0: no rotate 1: up-down flip 2: left-right flip 11 | :return: 12 | """ 13 | num = len(x) 14 | x = np.array(x) 15 | mixed_x = list() 16 | f_type = r_type // 4 17 | rota_type = r_type % 4 18 | if f_type == 1: 19 | # print(x[i].shape) 20 | for i in range(num): 21 | mixed_x.append(np.flip(x[i], 0)) 22 | elif f_type == 2: 23 | for i in range(num): 24 | mixed_x.append(x[num-i-1]) 25 | elif f_type == 3: 26 | for i in range(num): 27 | mixed_x.append(np.flip(x[num - i - 1], 0)) 28 | else: 29 | for i in range(num): 30 | mixed_x.append(x[i]) 31 | 32 | if rota_type == 1: 33 | for i in range(num): 34 | mixed_x[i] = np.rot90(mixed_x[i], 1) 35 | elif rota_type == 2: 36 | for i in range(num): 37 | mixed_x[i] = np.rot90(mixed_x[i], 2) 38 | elif rota_type == 3: 39 | for i in range(num): 40 | mixed_x[i] = np.rot90(mixed_x[i], 3) 41 | else: 42 | for i in range(num): 43 | mixed_x[i] = mixed_x[i] 44 | return mixed_x 45 | 46 | 47 | def read_video(video): 48 | cap = cv2.VideoCapture(video) 49 | frames = list() 50 | while True: 51 | ret, frame = cap.read() 52 | if type(frame) is type(None): 53 | break 54 | else: 55 | 
frames.append(frame) 56 | return frames 57 | 58 | 59 | def write_video(name, frames): 60 | writer = skvideo.io.FFmpegWriter(name, 61 | outputdict={'-b': '300000000'}) 62 | for frame in frames: 63 | writer.writeFrame(frame) 64 | writer.close() 65 | return 1 66 | 67 | 68 | if __name__ == '__main__': 69 | video = 'test_videos/shoot_gun.mp4' 70 | for r_type in range(16): 71 | seqs = read_video(video) 72 | seqs = rotation_data(seqs, r_type) 73 | name = 'test_videos/shoot_gun_r_type{}.mp4'.format(r_type) 74 | write_video(name, seqs) 75 | -------------------------------------------------------------------------------- /utils/video_cat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | # @Time : 2020-09-17 10:17 5 | # @Author : Awiny 6 | # @Site : 7 | # @Project : amax_Action_Video_Visualization 8 | # @File : video_cat.py 9 | # @Software: PyCharm 10 | # @Github : https://github.com/FingerRec 11 | # @Blog : http://fingerrec.github.io 12 | """ 13 | import scipy.io 14 | import os 15 | import sys 16 | sys.path.append("../") 17 | import cv2 18 | from util import save_as_video, video_frame_count 19 | 20 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #close the warning 21 | 22 | 23 | videos_path = '../output/concat_videos' 24 | frames = list() 25 | 26 | for video in os.listdir(videos_path): 27 | try: 28 | length, width, height = video_frame_count(os.path.join(videos_path, video)) 29 | except TypeError: 30 | print("video {} not available".format(os.path.join(videos_path, video))) 31 | continue 32 | cap = cv2.VideoCapture(os.path.join(videos_path, video)) 33 | # q = queue.Queue(self.frames_num) 34 | count = 0 35 | while count < length: 36 | ret, frame = cap.read() 37 | if type(frame) == type(None): 38 | break 39 | else: 40 | count += 1 41 | # print(frame.shape[0]//2) 42 | save_frame = cv2.cvtColor(frame[:frame.shape[0]//2, :, :], cv2.COLOR_BGR2RGB) 43 | cv2.putText(save_frame, 'DSM no label pretrain', (224, 20), cv2.FONT_HERSHEY_COMPLEX, 0.5,(0,255,0), 1) 44 | frames.append(save_frame) 45 | 46 | save_as_video('../output', frames, 'concated') --------------------------------------------------------------------------------