├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── alphaction ├── __init__.py ├── cam │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── cam.py │ ├── clip_loader.py │ ├── hilacam.py │ ├── mhsa.py │ └── ritsm.py ├── config │ ├── __init__.py │ └── defaults.py ├── dataset │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ava.py │ │ ├── ava_dataset.py │ │ ├── ava_helper.py │ │ ├── concat_dataset.py │ │ ├── cv2_transform.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── ava │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ └── ava_eval.py │ │ │ ├── evaluate_map.py │ │ │ ├── jhmdb │ │ │ │ ├── __init__.py │ │ │ │ └── jhmdb_eval.py │ │ │ ├── pascal_evaluation │ │ │ │ ├── __init__.py │ │ │ │ ├── label_map_util.py │ │ │ │ ├── metrics.py │ │ │ │ ├── np_box_list.py │ │ │ │ ├── np_box_list_ops.py │ │ │ │ ├── np_box_mask_list.py │ │ │ │ ├── np_box_mask_list_ops.py │ │ │ │ ├── np_box_ops.py │ │ │ │ ├── np_mask_ops.py │ │ │ │ ├── object_detection_evaluation.py │ │ │ │ ├── per_image_evaluation.py │ │ │ │ └── standard_fields.py │ │ │ ├── pascal_wrapper.py │ │ │ └── ucf24 │ │ │ │ ├── __init__.py │ │ │ │ └── ucf24_eval.py │ │ ├── jhmdb_dataset.py │ │ ├── ucf24_dataset.py │ │ └── utils.py │ └── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ ├── grouped_batch_sampler.py │ │ └── iteration_based_batch_sampler.py ├── engine │ ├── __init__.py │ ├── feature_extraction.py │ ├── inference.py │ └── trainer.py ├── layers │ ├── __init__.py │ └── batch_norm.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── i3d.py │ │ ├── sfmodels │ │ │ ├── common.py │ │ │ ├── nonlocal_helper.py │ │ │ ├── resnet_helper.py │ │ │ └── stem_helper.py │ │ ├── slowfast.py │ │ ├── video_model_builder.py │ │ └── vit_utils.py │ ├── common_blocks.py │ ├── detector │ │ ├── __init__.py │ │ ├── action_detector.py │ │ ├── naive_baseline.py │ │ └── stm_detector.py │ ├── dict_model.py │ ├── encoders │ │ ├── clipvip │ │ │ ├── CLIP_ViP.py │ │ │ ├── clipvip_encoder.py │ │ │ ├── custom_layers.py │ │ │ └── loader.py │ │ ├── openai_clip │ │ │ ├── clip_encoder.py │ │ │ └── clip_loader.py │ │ └── viclip │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── demo.py │ │ │ ├── simple_tokenizer.py │ │ │ ├── viclip.py │ │ │ ├── viclip_encoder.py │ │ │ ├── viclip_text.py │ │ │ └── viclip_vision.py │ ├── nonlocal_block.py │ ├── registry.py │ ├── roi_heads │ │ ├── __init__.py │ │ ├── action_head │ │ │ ├── IA_structure.py │ │ │ ├── __init__.py │ │ │ ├── action_head.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── metric.py │ │ │ ├── roi_action_feature_extractor.py │ │ │ └── roi_action_predictors.py │ │ └── roi_heads_3d.py │ ├── stm_decoder │ │ ├── stm_decoder.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── adaptive_mixing_operator.py │ │ │ ├── box_ops.py │ │ │ ├── head_utils.py │ │ │ ├── loss.py │ │ │ ├── misc.py │ │ │ └── msaq.py │ └── utils.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── structures │ ├── __init__.py │ ├── bounding_box.py │ └── memory_pool.py └── utils │ ├── IA_helper.py │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── comm.py │ ├── logger.py │ ├── metric_logger.py │ ├── model_serialization.py │ ├── random_seed.py │ ├── registry.py │ ├── video_decode.py │ └── visualize.py ├── assets └── wacv25_openmixer.png ├── config_files ├── jhmdb │ ├── openmixer_e2e.yaml │ ├── openmixer_zsr_tl.yaml │ └── openmixer_zsr_zsl.yaml └── ucf24 │ ├── 
openmixer_e2e.yaml │ ├── openmixer_zsr_tl.yaml │ └── openmixer_zsr_zsl.yaml ├── demo.py ├── preprocess ├── generate_vdt_jhmdb.py ├── generate_vdt_ucf24.py ├── openworld_split_jhmdb.py └── openworld_split_ucf24.py ├── requirements.txt ├── test_net.py ├── third_party ├── eval_utils.py ├── maskrcnn_utils.py ├── run_maskrcnn.py └── video_io.py ├── train_net.py └── trainval.sh /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | pretrained 3 | output/ 4 | *.pyc 5 | *.vscode 6 | *.log 7 | *.egg-info 8 | *.out 9 | *.err 10 | *_temp.* 11 | *.jpg 12 | *.jpeg 13 | .nfs* 14 | alphaction/cam/demo.py 15 | *.pth 16 | backup/ 17 | *backup* 18 | */figures/* 19 | *.zip 20 | data_zip/ 21 | data-release -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/GroundingDINO"] 2 | path = third_party/GroundingDINO 3 | url = https://github.com/IDEA-Research/GroundingDINO.git 4 | ignore = dirty 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Wentao Bao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenMixer 2 | This repository releases the source code of the WACV 2025 paper [OpenMixer](https://arxiv.org/pdf/2411.10922), which builds heavily on the [STMixer](https://github.com/MCG-NJU/STMixer) codebase. OpenMixer is an open-vocabulary action detector that aims to detect any human action from videos in an open world. The figure below shows the model architecture. 3 | 4 |

5 | ![OpenMixer](assets/wacv25_openmixer.png) 6 |

7 | 8 | ## Installation 9 | - Create a conda environment: 10 | ```bash 11 | conda create -n openmixer python=3.7 12 | ``` 13 | 14 | - Install PyTorch: 15 | ```bash 16 | pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117 17 | ``` 18 | 19 | - Install other libraries (including OpenAI-CLIP): 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ## Data Preparation 25 | - First, please refer to the MMAction2 [JHMDB](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/jhmdb/README.md) and [UCF24](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/ucf101_24/README.md) dataset preparation steps. 26 | 27 | - Next, please download our released [Open-World splits](https://drive.google.com/drive/folders/1Bu5GNsGIfYD-4u_7WMjBOWZj_3zs-HbJ?usp=sharing). Make sure the folders are structured as follows. 28 | ```bash 29 | data 30 | ├──JHMDB 31 | | ├── openworld 32 | | ├── Frames 33 | | ├── JHMDB-MaskRCNN.pkl 34 | | ├── JHMDB-GT.pkl 35 | ├──UCF24 36 | | ├── openworld 37 | | ├── rgb-images 38 | | ├── UCF24-MaskRCNN.pkl 39 | ``` 40 | 41 | ## Models 42 | 43 | - Please download the pretrained `CLIP-ViP-B/16` checkpoint from [XPretrain/CLIP-ViP](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP), a video CLIP model that serves as the backbone of our model. After downloading, make sure the file is located at `./pretrained/pretrain_clipvip_base_16.pt`. 44 | 45 | - [Optional] We released three OpenMixer models and inference results for each of the JHMDB and UCF24 datasets here: [Google Drive](https://drive.google.com/drive/folders/1MDT_jcJolNZjuZ15cdhXyJmewMyVBKUP?usp=sharing). They correspond to the configurations in the folder `./config_files/`. Note that for the ZSR+ZSL setting, no model training is needed. 46 | 47 | 48 | ## Training 49 | 50 | We provide an easy-to-use bash script for training and evaluation across different settings and datasets. For example, to train the OpenMixer model under the end-to-end setting on the JHMDB dataset using 4 specified GPUs: 51 | ```bash 52 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash trainval.sh train jhmdb 53 | ``` 54 | Optionally, you may change the GPU IDs or set the dataset name to `ucf24`. For other settings, change `CFG_FILE` in `trainval.sh` to `openmixer_zsr_tl.yaml` to train the OpenMixer model under the ZSR+TL setting. 55 | 56 | 57 | ## Validation 58 | We use the same bash script for validation (inference + evaluation): 59 | ```bash 60 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash trainval.sh eval jhmdb 61 | ``` 62 | Optionally, you may change the GPU IDs or set the dataset name to `ucf24`. For other settings, change `CFG_FILE` in `trainval.sh` to `openmixer_zsr_tl.yaml` or `openmixer_zsr_zsl.yaml` to evaluate models under the ZSR+TL and ZSR+ZSL settings, respectively. 63 | 64 | 65 | ## Acknowledgements 66 | This project is built upon [STMixer](https://github.com/MCG-NJU/STMixer), [CLIP-ViP](https://github.com/microsoft/XPretrain/CLIP-ViP), and [OpenAI-CLIP](https://github.com/openai/CLIP). We sincerely thank the contributors of all these great open-source repositories!
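Editor's note: the config and data-loading interfaces under `alphaction/` (shown later in this dump) can also be driven directly from Python. The snippet below is a minimal, illustrative sketch rather than an official entry point; it assumes the yacs-style config object behind `alphaction/config/defaults.py` (hence `merge_from_file`/`freeze`) and mirrors the `make_data_loader` signature and return values defined in `alphaction/dataset/build.py`.

```python
# Illustrative sketch only (not part of the released scripts).
from alphaction.config import cfg                 # CfgNode from alphaction/config/defaults.py (yacs-style, assumed)
from alphaction.dataset import make_data_loader   # defined in alphaction/dataset/build.py

# Load one of the released configs (path taken from config_files/ in this repo).
cfg.merge_from_file("config_files/jhmdb/openmixer_e2e.yaml")
cfg.freeze()

# For testing, make_data_loader returns per-dataset loaders, the evaluation
# vocabularies (open or closed, depending on cfg.TEST.EVAL_OPEN), and the
# per-dataset iterations-per-epoch list.
data_loaders, vocabularies, _ = make_data_loader(cfg, is_train=False, is_distributed=False)
```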
67 | 68 | 69 | ## Citation 70 | 71 | If this project helps your research or project, please cite 72 | our paper: 73 | 74 | ``` 75 | @InProceedings{bao2025wacv, 76 | title={Exploiting VLM Localizability and Semantics for Open Vocabulary Action Detection}, 77 | author={Wentao Bao and Kai Li and Yuxiao Chen and Deep Patel and Martin Renqiang Min and Yu Kong}, 78 | booktitle = {IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)}, 79 | year={2025} 80 | } 81 | ``` 82 | 83 | 84 | -------------------------------------------------------------------------------- /alphaction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/__init__.py -------------------------------------------------------------------------------- /alphaction/cam/.gitignore: -------------------------------------------------------------------------------- 1 | demos/ 2 | hila_clip/ 3 | pytorch_grad_cam/ 4 | *.gif 5 | *.mp4 -------------------------------------------------------------------------------- /alphaction/cam/README.md: -------------------------------------------------------------------------------- 1 | ## HilaCAM for CLIP Visual Attention 2 | 3 | Please go to [gScoreCAM](https://github.com/anguyen8/gScoreCAM), download the folders `hila_clip/` and `pytorch_grad_cam/`, and put them in this folder. 4 | 5 | The following commands show the steps: 6 | ```shell 7 | cd alphaction/cam 8 | git clone https://github.com/anguyen8/gScoreCAM 9 | cp -r gScoreCAM/hila_clip gScoreCAM/pytorch_grad_cam . 10 | rm -rf gScoreCAM 11 | 12 | ``` 13 | 14 | After that, please ensure the following Python packages are installed: 15 | ```shell 16 | pip install ttach kornia scikit-learn scikit-image 17 | ``` -------------------------------------------------------------------------------- /alphaction/cam/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/cam/__init__.py -------------------------------------------------------------------------------- /alphaction/cam/clip_loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | #* For CLIP ViT 4 | def reshape_transform(tensor, height=None, width=None): 5 | if height is None or width is None: 6 | grid_square = len(tensor) - 1 7 | if grid_square ** 0.5 % 1 == 0: 8 | height = width = int(grid_square**0.5) 9 | else: 10 | raise ValueError("Heatmap is not square, please set height and width.") 11 | result = tensor[1:, :, :].reshape( 12 | height, width, tensor.size(2)) 13 | 14 | # Bring the channels to the first dimension, 15 | # like in CNNs. 16 | result = result.permute(2, 0, 1) 17 | return result.unsqueeze(0) 18 | 19 | def load_clip(clip_version, attn_prob=True, attn_grad=True, attn_last_only=True, resize='adapt', custom=False, model_weight=None): 20 | device = "cuda" if torch.cuda.is_available() else "cpu" 21 | if 'vit' in clip_version.lower() and not custom: #* This is not necessary; for experimental usage, hila CLIP will hook all attentions.
22 | from hila_clip import clip 23 | clip_model, preprocess = clip.load(clip_version, device=device, jit=False) 24 | 25 | elif 'clip-vip' in clip_version.lower(): 26 | import sys, clip 27 | sys.path.append("../../") 28 | from alphaction.modeling.encoders.clipvip import loader 29 | clip_model, preprocess = loader.load(clip_version, 30 | attn_prob=attn_prob, 31 | attn_grad=attn_grad, 32 | attn_last_only=attn_last_only, 33 | device=device, model_weight=model_weight) 34 | 35 | elif custom: 36 | from hila_clip import clip 37 | clip_model, preprocess = clip.load(clip_version, device=device, jit=False) 38 | 39 | else: 40 | import clip 41 | clip_model, preprocess = clip.load(clip_version, device=device) 42 | 43 | if clip_version.startswith("RN"): 44 | target_layer = clip_model.visual.layer4[-1] 45 | cam_trans = None 46 | elif 'clip-vip' in clip_version.lower(): 47 | target_layer = clip_model.vision_model.encoder.layers[-1] 48 | cam_trans = reshape_transform 49 | else: 50 | target_layer = clip_model.visual.transformer.resblocks[-1] 51 | cam_trans = reshape_transform 52 | 53 | if resize == 'raw': # remove clip resizing 54 | if not custom: 55 | raise Exception("Raw input needs to use custom clip.") 56 | preprocess.transforms.pop(0) 57 | preprocess.transforms.pop(0) 58 | elif resize == 'adapt': # adapt to clip size 59 | from torchvision import transforms 60 | crop_size = preprocess.transforms[1].size # resize to crop size so that no information will be cropped 61 | preprocess.transforms.insert(0, transforms.Resize(crop_size)) 62 | # clip_model = torch.nn.DataParallel(clip_model) 63 | return clip_model, preprocess, target_layer, cam_trans, clip 64 | 65 | def load_clip_from_checkpoint(checkpoint, model): 66 | checkpoint = torch.load(checkpoint, map_location='cpu') 67 | 68 | # # Use these 3 lines if you use default model setting(not training setting) of the clip. 
For example, if you set context_length to 100 since your string is very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"] 69 | # checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224 70 | # checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77 71 | # checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 72 | 73 | model.load_state_dict(checkpoint['model_state_dict']) 74 | return model -------------------------------------------------------------------------------- /alphaction/cam/mhsa.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_multi_head_mask(attentions, threshold=0.6): 5 | nh, np = attentions.size(0), attentions.size(-1) 6 | # we keep only a certain percentage of the mass 7 | val, idx = torch.sort(attentions) 8 | val /= torch.sum(val, dim=-1, keepdim=True) 9 | cumval = torch.cumsum(val, dim=-1) 10 | th_attn = cumval > (1 - threshold) 11 | idx2 = torch.argsort(idx) # dim=-1 by default 12 | th_attn = th_attn.view(nh, -1) 13 | for head in range(nh): 14 | th_attn[head] = th_attn[head][idx2[head].view(-1)] 15 | if len(attentions.size()) == 3: 16 | th_attn = th_attn.view(nh, -1, np) 17 | return th_attn 18 | 19 | 20 | def get_masked_attention_map(attentions, nh, heatmap_size, cam_size, mask=None): 21 | if mask is not None: 22 | # apply mask on attention map 23 | attentions = attentions * mask.float() # (num_heads, N, L) 24 | # normalize within each frame 25 | attentions -= attentions.min(dim=-1, keepdim=True)[0] 26 | attentions /= attentions.max(dim=-1, keepdim=True)[0] 27 | 28 | num_frames = attentions.size(1) if len(attentions.size()) == 3 else 1 29 | # average over multi-heads as the final attention 30 | attentions = attentions.reshape(nh, num_frames, heatmap_size[0], heatmap_size[1]).mean(dim=0, keepdim=True) 31 | 32 | if cam_size is not None: 33 | # interpolate 34 | attentions = torch.nn.functional.interpolate(attentions, size=(cam_size[1], cam_size[0]), mode="bilinear")[0] 35 | return attentions.cpu().numpy() 36 | 37 | 38 | @torch.no_grad() 39 | def mhsa_clip(image, model, cam_size=None, threshold=0.6): 40 | # get patch token features 41 | _, attn_last = model.encode_image(image, last_attn_output=True) # (B, num_heads, L, D) 42 | nh = attn_last.shape[1] # number of head 43 | 44 | # we keep only the output patch attention 45 | # assume batch_size = 1 46 | attentions = attn_last[0, :, 0, 1:].reshape(nh, -1) # (num_heads, 7*7) 47 | heatmap_size = [int(attentions.size(-1)**0.5), int(attentions.size(-1)**0.5)] # 7 48 | 49 | th_attn = get_multi_head_mask(attentions, threshold) 50 | 51 | attn_map = get_masked_attention_map(attentions, nh, heatmap_size, cam_size, mask=th_attn) # (1, H, W) 52 | 53 | return attn_map[0] 54 | 55 | 56 | @torch.no_grad() 57 | def mhsa_clipvip(video, model, cam_size=None, threshold=0.6): 58 | """ video: (B, T, C, H, W) 59 | text: (K, L) 60 | cam_size: (W, H) 61 | """ 62 | num_proxy = model.config.vision_additional_config.add_cls_num + 1 63 | num_heads = model.config.vision_config.num_attention_heads 64 | num_frames = video.size(1) 65 | 66 | # run forward pass to get the last block attentions 67 | _, heatmap_size = model.get_image_features(video, return_ws=True) # (h,w) 68 | last_block = list(dict(model.vision_model.encoder.layers.named_children()).values())[-1] 69 | attn_inter = last_block.attn_probs['inter'] # [B*num_heads, M, M+N*L] where M=4 70 | attn_intra = 
last_block.attn_probs['intra'] # [B*num_heads*N, L, M+L] where L=196 if input_size=224 71 | 72 | num_patches = attn_intra.shape[-2] # L 73 | attentions_inter = attn_inter[:, 0, num_proxy:].reshape(-1, num_heads, num_frames, num_patches)[0] # [B*num_heads, N*L] --> [num_heads, N, L] 74 | # attentions_intra = attn_intra[:, 0, num_proxy:].reshape(-1, num_heads, num_frames, num_patches)[0] # [B*num_heads*N, L] --> [num_heads, N, L] 75 | 76 | th_attn = get_multi_head_mask(attentions_inter, threshold) 77 | attn_map = get_masked_attention_map(attentions_inter, num_heads, heatmap_size, cam_size, mask=th_attn) # (T, H, W) 78 | 79 | # temporal weights 80 | temporal_weights = attn_inter[:, 0, num_proxy:].reshape(-1, num_frames, num_patches).sum(dim=-1) # [B*num_heads, N] 81 | temporal_weights = temporal_weights.reshape(-1, num_heads, num_frames)[0].sum(dim=0) # [N] 82 | temporal_weights -= temporal_weights.min(dim=-1, keepdim=True)[0] 83 | temporal_weights /= temporal_weights.max(dim=-1, keepdim=True)[0] 84 | temporal_weights = temporal_weights.cpu().numpy() 85 | attn_map = temporal_weights[:, None, None] * attn_map 86 | 87 | # # visualize the weights 88 | # import matplotlib.pyplot as plt 89 | # import numpy as np 90 | # plt.bar(np.arange(num_frames) + 1, temporal_weights, 0.4) 91 | # plt.xlabel("video frames") 92 | # plt.ylabel("normalized attentions") 93 | # plt.xticks(np.arange(num_frames) + 1) 94 | # plt.tight_layout() 95 | # plt.savefig("../../_temp./temporal_weights.png") 96 | 97 | return attn_map -------------------------------------------------------------------------------- /alphaction/cam/ritsm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import cv2 4 | 5 | 6 | def clip_forward(model, image, text): 7 | # get patch token features 8 | image_features, encoder_out = model.encode_image(image, transformer_output=True) # (N, L, D) 9 | text_features = model.encode_text(text) 10 | 11 | # cosine similarity as logits 12 | logit_scale = model.logit_scale.exp() 13 | logits_per_image = logit_scale * image_features @ text_features.t() 14 | logits_per_text = logit_scale * text_features @ image_features.t() 15 | 16 | return logits_per_image, encoder_out, text_features 17 | 18 | 19 | @torch.no_grad() 20 | def ritsm_clip(image, text, model, device, index=None, cam_size=None, return_logits=False, attn_grad=False): 21 | # forward pass 22 | logits_per_image, encoder_out, text_features = clip_forward(model, image, text) 23 | probs = logits_per_image.softmax(dim=-1) 24 | if index is None: 25 | # locate the largest score of img-text pair 26 | index = np.argmax(logits_per_image.cpu().data.numpy(), axis=-1) 27 | 28 | input_size = model.visual.input_resolution # 224 29 | patch_features = encoder_out[:, 1:, :] # (B, 7*7, 768) 30 | heatmap_size = int(patch_features.size(1)**0.5) # 7 31 | 32 | # projection 33 | patch_features = model.visual.ln_post(patch_features) 34 | if model.visual.proj is not None: 35 | patch_features = patch_features @ model.visual.proj # (B, 7*7, 512) 36 | 37 | # normalize 38 | patch_features = patch_features / patch_features.norm(dim=-1, keepdim=True) 39 | # text_features = text_features / text_features.norm(dim=-1, keepdim=True) # (K=1, 512) 40 | 41 | # image-text similarity 42 | it_sim = patch_features @ text_features.t() # (B, 7*7, K=1) 43 | 44 | # reshape & resize 45 | image_relevance_all = it_sim[:, :, index].view(-1, 1, heatmap_size, heatmap_size) # (B, 1, 7, 7) 46 | image_relevance_all = 
torch.nn.functional.interpolate(image_relevance_all.float(), size=input_size, mode='bilinear') # (B, 1, H, W) 47 | image_relevance = image_relevance_all[0] # assume batch_size = 1 48 | image_relevance = image_relevance.reshape(input_size, input_size).detach().cpu().numpy() 49 | image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min()) 50 | # reverse 51 | image_relevance = np.fabs(1 - image_relevance) 52 | 53 | out = cv2.resize(image_relevance, cam_size) if cam_size is not None else image_relevance 54 | if return_logits: 55 | return out, logits_per_image 56 | return out 57 | 58 | 59 | @torch.no_grad() 60 | def ritsm_clipvip(video, text, model, device, index=None, cam_size=None, return_logits=False, attn_grad=False, use_mask=False): 61 | """ video: (B, T, C, H, W) 62 | text: (K, L) 63 | """ 64 | num_proxy = model.config.vision_additional_config.add_cls_num + 1 65 | eos_idx = text.argmax(dim=-1) 66 | num_frames = video.size(1) 67 | 68 | input_size = model.config.vision_config.image_size # 224 69 | patch_size = model.config.vision_config.patch_size # 16 70 | num_patches = int(input_size // patch_size) # 14 71 | 72 | # run forward pass 73 | out_dict = model(text, video) 74 | logits_per_image = out_dict['logits_per_image'] 75 | 76 | if index is None: 77 | # locate the largest score of img-text pair 78 | index = np.argmax(logits_per_image.cpu().data.numpy(), axis=-1) 79 | 80 | # get patch features from the last vision encoder block 81 | patch_features = out_dict['vision_model_output']['last_hidden_state'][:, num_proxy:, :] # (B, T*14*14, 768) 82 | assert num_frames * (num_patches ** 2) == patch_features.size(1) 83 | 84 | # layernorm, projection, and normalization 85 | patch_features = model.vision_model.post_layernorm(patch_features) 86 | patch_features = model.visual_projection(patch_features) # 768 --> 512 87 | patch_features = patch_features / patch_features.norm(dim=-1, keepdim=True) # (B, T*14*14, 512) 88 | 89 | # get the text features 90 | text_features = out_dict['text_embeds'] # after layernorm, projection, and normalization 91 | 92 | # image-text similarity 93 | it_sim = patch_features @ text_features.t() # (B, T*14*14, K=1) 94 | 95 | if use_mask: 96 | th_attn = get_attn_mask(it_sim[0, :, index].view(num_frames, -1), threshold=0.6) 97 | it_sim = it_sim * th_attn.view(-1).unsqueeze(0).unsqueeze(-1) 98 | 99 | # reshape & resize 100 | image_relevance_all = it_sim[:, :, index].view(-1, num_frames, num_patches, num_patches) # (B, T, 14, 14) 101 | image_relevance_all = torch.nn.functional.interpolate(image_relevance_all.float(), size=input_size, mode='bilinear') # (B, T, H, W) 102 | 103 | # assume batch_size = 1 104 | image_relevance_all = image_relevance_all[0] 105 | 106 | all_maps = [] 107 | for image_relevance in image_relevance_all: 108 | image_relevance = image_relevance.reshape(input_size, input_size).detach().cpu().numpy() 109 | # normalize and reverse 110 | image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min()) 111 | # reverse 112 | image_relevance = np.fabs(1 - image_relevance) 113 | atten_map = cv2.resize(image_relevance, cam_size) if cam_size is not None else image_relevance 114 | all_maps.append(atten_map) 115 | 116 | out = np.stack(all_maps, axis=0) 117 | 118 | if return_logits: 119 | return out, logits_per_image 120 | return out 121 | 122 | 123 | def get_attn_mask(attentions, threshold=0.6): 124 | """ attentions: (T, L) 125 | """ 126 | nh = attentions.size(0) 127 | # we keep 
only a certain percentage of the mass 128 | val, idx = torch.sort(attentions) 129 | val /= torch.sum(val, dim=-1, keepdim=True) 130 | cumval = torch.cumsum(val, dim=-1) 131 | th_attn = cumval > (1 - threshold) 132 | idx2 = torch.argsort(idx) # dim=-1 by default 133 | th_attn = th_attn.view(nh, -1) 134 | for head in range(nh): 135 | th_attn[head] = th_attn[head][idx2[head].view(-1)] 136 | return th_attn -------------------------------------------------------------------------------- /alphaction/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .defaults import _C as cfg 2 | -------------------------------------------------------------------------------- /alphaction/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_data_loader 2 | -------------------------------------------------------------------------------- /alphaction/dataset/build.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import copy 3 | import torch.utils.data 4 | from alphaction.utils.comm import get_world_size 5 | from . import datasets as D 6 | from . import samplers 7 | from .collate_batch import BatchCollator 8 | 9 | def build_dataset(cfg, split): 10 | if cfg.DATA.DATASETS[0] == 'ucf24': 11 | dataset = D.UCF24(cfg, split) 12 | elif cfg.DATA.DATASETS[0] == 'jhmdb': 13 | dataset = D.Jhmdb(cfg, split) 14 | elif cfg.DATA.DATASETS[0] == 'ava_v2.2': 15 | dataset = D.Ava(cfg, split) 16 | else: 17 | raise NotImplementedError 18 | 19 | return [dataset] 20 | 21 | def make_data_sampler(dataset, shuffle, distributed): 22 | if distributed: 23 | return samplers.DistributedSampler(dataset, shuffle=shuffle) 24 | if shuffle: 25 | sampler = torch.utils.data.sampler.RandomSampler(dataset) 26 | else: 27 | sampler = torch.utils.data.sampler.SequentialSampler(dataset) 28 | return sampler 29 | 30 | 31 | def _quantize(x, bins): 32 | bins = copy.copy(bins) 33 | bins = sorted(bins) 34 | quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) 35 | return quantized 36 | 37 | 38 | def _compute_aspect_ratios(dataset): 39 | aspect_ratios = [] 40 | for i in range(len(dataset)): 41 | video_info = dataset.get_video_info(i) 42 | aspect_ratio = float(video_info["height"]) / float(video_info["width"]) 43 | aspect_ratios.append(aspect_ratio) 44 | return aspect_ratios 45 | 46 | 47 | def make_batch_data_sampler( 48 | dataset, sampler, aspect_grouping, videos_per_batch, num_iters=None, start_iter=0, drop_last=False 49 | ): 50 | if aspect_grouping: 51 | if not isinstance(aspect_grouping, (list, tuple)): 52 | aspect_grouping = [aspect_grouping] 53 | aspect_ratios = _compute_aspect_ratios(dataset) 54 | group_ids = _quantize(aspect_ratios, aspect_grouping) 55 | batch_sampler = samplers.GroupedBatchSampler( 56 | sampler, group_ids, videos_per_batch, drop_uneven=drop_last 57 | ) 58 | else: 59 | batch_sampler = torch.utils.data.sampler.BatchSampler( 60 | sampler, videos_per_batch, drop_last=drop_last 61 | ) 62 | if num_iters is not None: 63 | batch_sampler = samplers.IterationBasedBatchSampler( 64 | batch_sampler, num_iters, start_iter 65 | ) 66 | return batch_sampler 67 | 68 | 69 | def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): 70 | num_gpus = get_world_size() 71 | if is_train: 72 | # for training 73 | videos_per_batch = cfg.SOLVER.VIDEOS_PER_BATCH 74 | assert ( 75 | videos_per_batch % num_gpus == 0 76 | ), "SOLVER.VIDEOS_PER_BATCH ({}) 
must be divisible by the number " 77 | "of GPUs ({}) used.".format(videos_per_batch, num_gpus) 78 | videos_per_gpu = videos_per_batch // num_gpus 79 | shuffle = True 80 | drop_last = True 81 | # num_iters = cfg.SOLVER.MAX_EPOCH*cfg.SOLVER.ITER_PER_EPOCH 82 | split = 'train' 83 | else: 84 | # for testing 85 | videos_per_batch = cfg.TEST.VIDEOS_PER_BATCH 86 | assert ( 87 | videos_per_batch % num_gpus == 0 88 | ), "TEST.VIDEOS_PER_BATCH ({}) must be divisible by the number " 89 | "of GPUs ({}) used.".format(videos_per_batch, num_gpus) 90 | videos_per_gpu = videos_per_batch // num_gpus 91 | shuffle = False if not is_distributed else True 92 | drop_last = False 93 | # num_iters = None 94 | start_iter = 0 95 | split = 'test' 96 | 97 | # group images which have similar aspect ratio. In this case, we only 98 | # group in two cases: those with width / height > 1, and the other way around, 99 | # but the code supports more general grouping strategy 100 | aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] 101 | 102 | # build dataset 103 | datasets = build_dataset(cfg, split=split) 104 | 105 | # build sampler and dataloader 106 | data_loaders, vocabularies, iter_per_epoch_all = [], [], [] 107 | for dataset in datasets: 108 | if is_train: 109 | # number of iterations for all epochs 110 | iter_per_epoch = int(len(dataset) // cfg.SOLVER.VIDEOS_PER_BATCH) if cfg.SOLVER.ITER_PER_EPOCH == -1 else cfg.SOLVER.ITER_PER_EPOCH 111 | iter_per_epoch_all.append(iter_per_epoch) 112 | num_iters = cfg.SOLVER.MAX_EPOCH * iter_per_epoch if is_train else None 113 | # sampler 114 | sampler = make_data_sampler(dataset, shuffle, is_distributed) 115 | batch_sampler = make_batch_data_sampler( 116 | dataset, sampler, aspect_grouping, videos_per_gpu, num_iters, start_iter, drop_last 117 | ) 118 | collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) 119 | num_workers = cfg.DATALOADER.NUM_WORKERS 120 | data_loader = torch.utils.data.DataLoader( 121 | dataset, 122 | num_workers=num_workers, 123 | batch_sampler=batch_sampler, 124 | collate_fn=collator, 125 | ) 126 | data_loaders.append(data_loader) 127 | if cfg.DATA.OPEN_VOCABULARY: 128 | vocabularies.append(dataset.text_input) 129 | else: 130 | vocabularies.append(None) 131 | if is_train: 132 | # during training, a single (possibly concatenated) data_loader is returned 133 | assert len(data_loaders) == 1 134 | return data_loaders[0], vocabularies[0]['closed'], iter_per_epoch_all[0] 135 | 136 | vocabularies_val = [] 137 | if len(vocabularies) > 0: 138 | for vocab in vocabularies: 139 | if cfg.TEST.EVAL_OPEN and vocab is not None: 140 | vocabularies_val.append(vocab['open']) 141 | else: 142 | vocabularies_val.append(vocab['closed']) 143 | 144 | return data_loaders, vocabularies_val, iter_per_epoch_all -------------------------------------------------------------------------------- /alphaction/dataset/collate_batch.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | def batch_different_videos(videos, size_divisible=0): 6 | ''' 7 | :param videos: a list of video tensors 8 | :param size_divisible: output_size(width and height) should be divisble by this param 9 | :return: batched videos as a single tensor 10 | ''' 11 | assert isinstance(videos, (tuple, list)) 12 | max_size = tuple(max(s) for s in zip(*[clip.shape for clip in videos])) 13 | 14 | if size_divisible > 0: 15 | stride = size_divisible 16 | max_size = list(max_size) 17 | max_size[2] = int(math.ceil(max_size[2] / stride) * 
stride) 18 | max_size[3] = int(math.ceil(max_size[3] / stride) * stride) 19 | max_size = tuple(max_size) 20 | 21 | batch_shape = (len(videos),) + max_size 22 | batched_clips = videos[0].new(*batch_shape).zero_() 23 | for clip, pad_clip in zip(videos, batched_clips): 24 | pad_clip[:clip.shape[0], :clip.shape[1], :clip.shape[2], :clip.shape[3]].copy_(clip) 25 | 26 | return batched_clips 27 | 28 | 29 | class BatchCollator(object): 30 | """ 31 | From a list of samples from the dataset, 32 | returns the batched objectimages and targets. 33 | This should be passed to the DataLoader 34 | """ 35 | 36 | def __init__(self, size_divisible=0): 37 | self.divisible = size_divisible 38 | self.size_divisible = self.divisible 39 | 40 | def __call__(self, batch): 41 | transposed_batch = list(zip(*batch)) 42 | slow_clips = batch_different_videos(transposed_batch[0], self.size_divisible) 43 | if transposed_batch[1][0] is not None: 44 | fast_clips = batch_different_videos(transposed_batch[1], self.size_divisible) 45 | else: 46 | fast_clips = None 47 | whwh = torch.stack(transposed_batch[2]) 48 | boxes = transposed_batch[3] 49 | label_arrs = transposed_batch[4] 50 | metadata = transposed_batch[5] 51 | clip_ids = transposed_batch[6] 52 | return slow_clips, fast_clips, whwh, boxes, label_arrs, metadata, clip_ids -------------------------------------------------------------------------------- /alphaction/dataset/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .concat_dataset import ConcatDataset 2 | from .ava_dataset import Ava 3 | from .jhmdb_dataset import Jhmdb 4 | from .ucf24_dataset import UCF24 5 | 6 | __all__ = ["ConcatDataset", "Ava", "Jhmdb", "UCF24"] -------------------------------------------------------------------------------- /alphaction/dataset/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | 3 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 4 | 5 | 6 | class ConcatDataset(_ConcatDataset): 7 | """ 8 | Same as torch.utils.dataset.dataset.ConcatDataset, but exposes an extra 9 | method for querying the sizes of the image 10 | """ 11 | 12 | def get_idxs(self, idx): 13 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 14 | if dataset_idx == 0: 15 | sample_idx = idx 16 | else: 17 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 18 | return dataset_idx, sample_idx 19 | 20 | def get_video_info(self, idx): 21 | dataset_idx, sample_idx = self.get_idxs(idx) 22 | return self.datasets[dataset_idx].get_video_info(sample_idx) 23 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from alphaction.dataset import datasets 2 | 3 | from .ava import ava_evaluation 4 | from .jhmdb import jhmdb_evaluation 5 | from .ucf24 import ucf24_evaluation 6 | 7 | 8 | def evaluate(dataset, predictions, output_folder, **kwargs): 9 | """evaluate dataset using different methods based on dataset type. 10 | Args: 11 | dataset: Dataset object 12 | predictions(list[BoxList]): each item in the list represents the 13 | prediction results for one image. 14 | output_folder: output folder, to save evaluation files or results. 15 | **kwargs: other args. 
16 | Returns: 17 | evaluation result 18 | """ 19 | args = dict( 20 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 21 | ) 22 | if isinstance(dataset, datasets.Ava): 23 | return ava_evaluation(**args) 24 | elif isinstance(dataset, datasets.Jhmdb): 25 | return jhmdb_evaluation(**args) 26 | elif isinstance(dataset, datasets.UCF24): 27 | return ucf24_evaluation(**args) 28 | else: 29 | dataset_name = dataset.__class__.__name__ 30 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 31 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/README.md: -------------------------------------------------------------------------------- 1 | The evaluation code of AVA is modified from [https://github.com/activitynet/ActivityNet](https://github.com/activitynet/ActivityNet). -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .ava_eval import do_ava_evaluation 3 | 4 | 5 | def ava_evaluation(dataset, predictions, output_folder, **kwargs): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing ava evaluation.") 8 | return do_ava_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | metric=kwargs.get('metric', 'frame_ap'), 14 | save_csv=kwargs.get('save_csv', False) 15 | ) 16 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/jhmdb/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .jhmdb_eval import do_jhmdb_evaluation 3 | 4 | 5 | def jhmdb_evaluation(dataset, predictions, output_folder, **kwargs): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing jhmdb evaluation.") 8 | return do_jhmdb_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | metric=kwargs.get('metric', 'frame_ap'), 14 | save_csv=kwargs.get('save_csv', False) 15 | ) -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/dataset/datasets/evaluation/pascal_evaluation/__init__.py -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Functions for computing metrics like precision, recall, CorLoc and etc.""" 17 | from __future__ import division 18 | 19 | import numpy as np 20 | 21 | 22 | def compute_precision_recall(scores, labels, num_gt): 23 | """Compute precision and recall. 24 | 25 | Args: 26 | scores: A float numpy array representing detection score 27 | labels: A boolean numpy array representing true/false positive labels 28 | num_gt: Number of ground truth instances 29 | 30 | Raises: 31 | ValueError: if the input is not of the correct format 32 | 33 | Returns: 34 | precision: Fraction of positive instances over detected ones. This value is 35 | None if no ground truth labels are present. 36 | recall: Fraction of detected positive instance over all positive instances. 37 | This value is None if no ground truth labels are present. 38 | 39 | """ 40 | if not isinstance( 41 | labels, np.ndarray) or labels.dtype != np.bool or len(labels.shape) != 1: 42 | raise ValueError("labels must be single dimension bool numpy array") 43 | 44 | if not isinstance( 45 | scores, np.ndarray) or len(scores.shape) != 1: 46 | raise ValueError("scores must be single dimension numpy array") 47 | 48 | if num_gt < np.sum(labels): 49 | raise ValueError("Number of true positives must be smaller than num_gt.") 50 | 51 | if len(scores) != len(labels): 52 | raise ValueError("scores and labels must be of the same size.") 53 | 54 | if num_gt == 0: 55 | return None, None 56 | 57 | sorted_indices = np.argsort(scores) 58 | sorted_indices = sorted_indices[::-1] 59 | labels = labels.astype(int) 60 | true_positive_labels = labels[sorted_indices] 61 | false_positive_labels = 1 - true_positive_labels 62 | cum_true_positives = np.cumsum(true_positive_labels) 63 | cum_false_positives = np.cumsum(false_positive_labels) 64 | precision = cum_true_positives.astype(float) / ( 65 | cum_true_positives + cum_false_positives) 66 | recall = cum_true_positives.astype(float) / num_gt 67 | return precision, recall 68 | 69 | 70 | def compute_average_precision(precision, recall): 71 | """Compute Average Precision according to the definition in VOCdevkit. 72 | 73 | Precision is modified to ensure that it does not decrease as recall 74 | decrease. 75 | 76 | Args: 77 | precision: A float [N, 1] numpy array of precisions 78 | recall: A float [N, 1] numpy array of recalls 79 | 80 | Raises: 81 | ValueError: if the input is not of the correct format 82 | 83 | Returns: 84 | average_precison: The area under the precision recall curve. NaN if 85 | precision and recall are None. 
86 | 87 | """ 88 | if precision is None: 89 | if recall is not None: 90 | raise ValueError("If precision is None, recall must also be None") 91 | return np.NAN 92 | 93 | if not isinstance(precision, np.ndarray) or not isinstance(recall, 94 | np.ndarray): 95 | raise ValueError("precision and recall must be numpy array") 96 | if precision.dtype != np.float or recall.dtype != np.float: 97 | raise ValueError("input must be float numpy array.") 98 | if len(precision) != len(recall): 99 | raise ValueError("precision and recall must be of the same size.") 100 | if not precision.size: 101 | return 0.0 102 | if np.amin(precision) < 0 or np.amax(precision) > 1: 103 | raise ValueError("Precision must be in the range of [0, 1].") 104 | if np.amin(recall) < 0 or np.amax(recall) > 1: 105 | raise ValueError("recall must be in the range of [0, 1].") 106 | if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): 107 | raise ValueError("recall must be a non-decreasing array") 108 | 109 | recall = np.concatenate([[0], recall, [1]]) 110 | precision = np.concatenate([[0], precision, [0]]) 111 | 112 | # Preprocess precision to be a non-decreasing array 113 | for i in range(len(precision) - 2, -1, -1): 114 | precision[i] = np.maximum(precision[i], precision[i + 1]) 115 | 116 | indices = np.where(recall[1:] != recall[:-1])[0] + 1 117 | average_precision = np.sum( 118 | (recall[indices] - recall[indices - 1]) * precision[indices]) 119 | return average_precision 120 | 121 | 122 | def compute_cor_loc(num_gt_imgs_per_class, 123 | num_images_correctly_detected_per_class): 124 | """Compute CorLoc according to the definition in the following paper. 125 | 126 | https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf 127 | 128 | Returns nans if there are no ground truth images for a class. 129 | 130 | Args: 131 | num_gt_imgs_per_class: 1D array, representing number of images containing 132 | at least one object instance of a particular class 133 | num_images_correctly_detected_per_class: 1D array, representing number of 134 | images that are correctly detected at least one object instance of a 135 | particular class 136 | 137 | Returns: 138 | corloc_per_class: A float numpy array represents the corloc score of each 139 | class 140 | """ 141 | # Divide by zero expected for classes with no gt examples. 142 | with np.errstate(divide="ignore", invalid="ignore"): 143 | return np.where( 144 | num_gt_imgs_per_class == 0, np.nan, 145 | num_images_correctly_detected_per_class / num_gt_imgs_per_class) 146 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | import numpy as np 19 | 20 | 21 | class BoxList(object): 22 | """Box collection. 23 | 24 | BoxList represents a list of bounding boxes as numpy array, where each 25 | bounding box is represented as a row of 4 numbers, 26 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 27 | given list correspond to a single image. 28 | 29 | Optionally, users can add additional related fields (such as 30 | objectness/classification scores). 31 | """ 32 | 33 | def __init__(self, data): 34 | """Constructs box collection. 35 | 36 | Args: 37 | data: a numpy array of shape [N, 4] representing box coordinates 38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | """ 43 | if not isinstance(data, np.ndarray): 44 | raise ValueError('dataset must be a numpy array.') 45 | if len(data.shape) != 2 or data.shape[1] != 4: 46 | raise ValueError('Invalid dimensions for box dataset.') 47 | if data.dtype != np.float32 and data.dtype != np.float64: 48 | raise ValueError('Invalid dataset type for box dataset: float is required.') 49 | if not self._is_valid_boxes(data): 50 | raise ValueError('Invalid box dataset. dataset must be a numpy array of ' 51 | 'N*[y_min, x_min, y_max, x_max]') 52 | self.data = {'boxes': data} 53 | 54 | def num_boxes(self): 55 | """Return number of boxes held in collections.""" 56 | return self.data['boxes'].shape[0] 57 | 58 | def get_extra_fields(self): 59 | """Return all non-box fields.""" 60 | return [k for k in self.data.keys() if k != 'boxes'] 61 | 62 | def has_field(self, field): 63 | return field in self.data 64 | 65 | def add_field(self, field, field_data): 66 | """Add dataset to a specified field. 67 | 68 | Args: 69 | field: a string parameter used to speficy a related field to be accessed. 70 | field_data: a numpy array of [N, ...] representing the dataset associated 71 | with the field. 72 | Raises: 73 | ValueError: if the field is already exist or the dimension of the field 74 | dataset does not matches the number of boxes. 75 | """ 76 | if self.has_field(field): 77 | raise ValueError('Field ' + field + 'already exists') 78 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 79 | raise ValueError('Invalid dimensions for field dataset') 80 | self.data[field] = field_data 81 | 82 | def get(self): 83 | """Convenience function for accesssing box coordinates. 84 | 85 | Returns: 86 | a numpy array of shape [N, 4] representing box corners 87 | """ 88 | return self.get_field('boxes') 89 | 90 | def get_field(self, field): 91 | """Accesses dataset associated with the specified field in the box collection. 92 | 93 | Args: 94 | field: a string parameter used to speficy a related field to be accessed. 95 | 96 | Returns: 97 | a numpy 1-d array representing dataset of an associated field 98 | 99 | Raises: 100 | ValueError: if invalid field 101 | """ 102 | if not self.has_field(field): 103 | raise ValueError('field {} does not exist'.format(field)) 104 | return self.data[field] 105 | 106 | def get_coordinates(self): 107 | """Get corner coordinates of boxes. 
108 | 109 | Returns: 110 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 111 | """ 112 | box_coordinates = self.get() 113 | y_min = box_coordinates[:, 0] 114 | x_min = box_coordinates[:, 1] 115 | y_max = box_coordinates[:, 2] 116 | x_max = box_coordinates[:, 3] 117 | return [y_min, x_min, y_max, x_max] 118 | 119 | def _is_valid_boxes(self, data): 120 | """Check whether dataset fullfills the format of N*[ymin, xmin, ymax, xmin]. 121 | 122 | Args: 123 | data: a numpy array of shape [N, 4] representing box coordinates 124 | 125 | Returns: 126 | a boolean indicating whether all ymax of boxes are equal or greater than 127 | ymin, and all xmax of boxes are equal or greater than xmin. 128 | """ 129 | if data.shape[0] > 0: 130 | for i in range(data.shape[0]): 131 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 132 | return False 133 | return True 134 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | import numpy as np 19 | from . import np_box_list 20 | 21 | 22 | class BoxMaskList(np_box_list.BoxList): 23 | """Convenience wrapper for BoxList with masks. 24 | 25 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 26 | In particular, its constructor receives both boxes and masks. Note that the 27 | masks correspond to the full image. 28 | """ 29 | 30 | def __init__(self, box_data, mask_data): 31 | """Constructs box collection. 32 | 33 | Args: 34 | box_data: a numpy array of shape [N, 4] representing box coordinates 35 | mask_data: a numpy array of shape [N, height, width] representing masks 36 | with values are in {0,1}. The masks correspond to the full 37 | image. The height and the width will be equal to image height and width. 
38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | ValueError: if mask dataset is not a numpy array 43 | ValueError: if invalid dimension for mask dataset 44 | """ 45 | super(BoxMaskList, self).__init__(box_data) 46 | if not isinstance(mask_data, np.ndarray): 47 | raise ValueError('Mask dataset must be a numpy array.') 48 | if len(mask_data.shape) != 3: 49 | raise ValueError('Invalid dimensions for mask dataset.') 50 | if mask_data.dtype != np.uint8: 51 | raise ValueError('Invalid dataset type for mask dataset: uint8 is required.') 52 | if mask_data.shape[0] != box_data.shape[0]: 53 | raise ValueError('There should be the same number of boxes and masks.') 54 | self.data['masks'] = mask_data 55 | 56 | def get_masks(self): 57 | """Convenience function for accessing masks. 58 | 59 | Returns: 60 | a numpy array of shape [N, height, width] representing masks 61 | """ 62 | return self.get_field('masks') 63 | 64 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | 25 | def area(boxes): 26 | """Computes area of boxes. 27 | 28 | Args: 29 | boxes: Numpy array with shape [N, 4] holding N boxes 30 | 31 | Returns: 32 | a numpy array with shape [N*1] representing box areas 33 | """ 34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 35 | 36 | 37 | def intersection(boxes1, boxes2): 38 | """Compute pairwise intersection areas between boxes. 
39 | 40 | Args: 41 | boxes1: a numpy array with shape [N, 4] holding N boxes 42 | boxes2: a numpy array with shape [M, 4] holding M boxes 43 | 44 | Returns: 45 | a numpy array with shape [N*M] representing pairwise intersection area 46 | """ 47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 49 | 50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 52 | intersect_heights = np.maximum( 53 | np.zeros(all_pairs_max_ymin.shape), 54 | all_pairs_min_ymax - all_pairs_max_ymin) 55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 57 | intersect_widths = np.maximum( 58 | np.zeros(all_pairs_max_xmin.shape), 59 | all_pairs_min_xmax - all_pairs_max_xmin) 60 | return intersect_heights * intersect_widths 61 | 62 | 63 | def iou(boxes1, boxes2): 64 | """Computes pairwise intersection-over-union between box collections. 65 | 66 | Args: 67 | boxes1: a numpy array with shape [N, 4] holding N boxes. 68 | boxes2: a numpy array with shape [M, 4] holding N boxes. 69 | 70 | Returns: 71 | a numpy array with shape [N, M] representing pairwise iou scores. 72 | """ 73 | intersect = intersection(boxes1, boxes2) 74 | area1 = area(boxes1) 75 | area2 = area(boxes2) 76 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 77 | area2, axis=0) - intersect 78 | return intersect / union 79 | 80 | 81 | def ioa(boxes1, boxes2): 82 | """Computes pairwise intersection-over-area between box collections. 83 | 84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 85 | their intersection area over box2's area. Note that ioa is not symmetric, 86 | that is, IOA(box1, box2) != IOA(box2, box1). 87 | 88 | Args: 89 | boxes1: a numpy array with shape [N, 4] holding N boxes. 90 | boxes2: a numpy array with shape [M, 4] holding N boxes. 91 | 92 | Returns: 93 | a numpy array with shape [N, M] representing pairwise ioa scores. 94 | """ 95 | intersect = intersection(boxes1, boxes2) 96 | areas = np.expand_dims(area(boxes2), axis=0) 97 | return intersect / areas 98 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | EPSILON = 1e-7 25 | 26 | 27 | def area(masks): 28 | """Computes area of masks. 
29 | 30 | Args: 31 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 32 | values are of type np.uint8 and values are in {0,1}. 33 | 34 | Returns: 35 | a numpy array with shape [N*1] representing mask areas. 36 | 37 | Raises: 38 | ValueError: If masks.dtype is not np.uint8 39 | """ 40 | if masks.dtype != np.uint8: 41 | raise ValueError('Masks type should be np.uint8') 42 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 43 | 44 | 45 | def intersection(masks1, masks2): 46 | """Compute pairwise intersection areas between masks. 47 | 48 | Args: 49 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 50 | values are of type np.uint8 and values are in {0,1}. 51 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 52 | values are of type np.uint8 and values are in {0,1}. 53 | 54 | Returns: 55 | a numpy array with shape [N*M] representing pairwise intersection area. 56 | 57 | Raises: 58 | ValueError: If masks1 and masks2 are not of type np.uint8. 59 | """ 60 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 61 | raise ValueError('masks1 and masks2 should be of type np.uint8') 62 | n = masks1.shape[0] 63 | m = masks2.shape[0] 64 | answer = np.zeros([n, m], dtype=np.float32) 65 | for i in np.arange(n): 66 | for j in np.arange(m): 67 | answer[i, j] = np.sum(np.minimum(masks1[i], masks2[j]), dtype=np.float32) 68 | return answer 69 | 70 | 71 | def iou(masks1, masks2): 72 | """Computes pairwise intersection-over-union between mask collections. 73 | 74 | Args: 75 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 76 | values are of type np.uint8 and values are in {0,1}. 77 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 78 | values are of type np.uint8 and values are in {0,1}. 79 | 80 | Returns: 81 | a numpy array with shape [N, M] representing pairwise iou scores. 82 | 83 | Raises: 84 | ValueError: If masks1 and masks2 are not of type np.uint8. 85 | """ 86 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 87 | raise ValueError('masks1 and masks2 should be of type np.uint8') 88 | intersect = intersection(masks1, masks2) 89 | area1 = area(masks1) 90 | area2 = area(masks2) 91 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 92 | area2, axis=0) - intersect 93 | return intersect / np.maximum(union, EPSILON) 94 | 95 | 96 | def ioa(masks1, masks2): 97 | """Computes pairwise intersection-over-area between box collections. 98 | 99 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 100 | their intersection area over mask2's area. Note that ioa is not symmetric, 101 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 102 | 103 | Args: 104 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 105 | values are of type np.uint8 and values are in {0,1}. 106 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 107 | values are of type np.uint8 and values are in {0,1}. 108 | 109 | Returns: 110 | a numpy array with shape [N, M] representing pairwise ioa scores. 111 | 112 | Raises: 113 | ValueError: If masks1 and masks2 are not of type np.uint8. 
114 | """ 115 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 116 | raise ValueError('masks1 and masks2 should be of type np.uint8') 117 | intersect = intersection(masks1, masks2) 118 | areas = np.expand_dims(area(masks2), axis=0) 119 | return intersect / (areas + EPSILON) 120 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_wrapper.py: -------------------------------------------------------------------------------- 1 | from .pascal_evaluation import object_detection_evaluation, standard_fields 2 | import numpy as np 3 | 4 | 5 | 6 | def parse_id(activity_list=None, class_num=24): 7 | if activity_list is None: # use the class ID instead 8 | activity_list = ['Class{}'.format(i) for i in range(class_num)] 9 | # activity_list = ['Basketball', 'BasketballDunk', 'Biking', 'CliffDiving', 'CricketBowling', 'Diving', 'Fencing', 'FloorGymnastics', 'GolfSwing', 'HorseRiding', 'IceDancing', 'LongJump', 'PoleVault', 'RopeClimbing', 'SalsaSpin', 'SkateBoarding', 'Skiing', 'Skijet', 'SoccerJuggling', 'Surfing', 'TennisSwing', 'TrampolineJumping', 'VolleyballSpiking', 'WalkingWithDog'] 10 | categories = [] 11 | for i, act_name in enumerate(activity_list): 12 | categories.append({'id': i + 1, 'name': act_name}) 13 | return categories 14 | 15 | 16 | class STDetectionEvaluaterUCF(object): 17 | ''' 18 | evaluater class designed for multi-iou thresholds 19 | based on https://github.com/activitynet/ActivityNet/blob/master/Evaluation/get_ava_performance.py 20 | parameters: 21 | dataset that provide GT annos, in the format of AWSCVMotionDataset 22 | tiou_thresholds: a list of iou thresholds 23 | attributes: 24 | clear(): clear detection results, GT is kept 25 | load_detection_from_path(), load anno from a list of path, in the format of [confi x1 y1 x2 y2 scoresx15] 26 | evaluate(): run evaluation code 27 | ''' 28 | 29 | def __init__(self, tiou_thresholds=[0.5], load_from_dataset=False, activity_list=None, class_num=24): 30 | categories = parse_id(activity_list=activity_list, class_num=class_num) 31 | self.class_num = class_num 32 | self.categories = categories 33 | self.tiou_thresholds = tiou_thresholds 34 | self.lst_pascal_evaluator = [] 35 | self.load_from_dataset = load_from_dataset 36 | self.exclude_key = [] 37 | for iou in self.tiou_thresholds: 38 | self.lst_pascal_evaluator.append( 39 | object_detection_evaluation.PascalDetectionEvaluator(categories, matching_iou_threshold=iou)) 40 | 41 | def clear(self): 42 | for evaluator in self.lst_pascal_evaluator: 43 | evaluator.clear() 44 | 45 | def load_ground_truth(self, ground_truth): 46 | # write into evaluator 47 | for image_key, info in ground_truth.items(): 48 | boxes = info['bbox'].copy() # normalized coordinates 49 | resolution = info['resolution'] 50 | boxes_eval = [] 51 | for box in boxes: 52 | area = (box[3] - box[1]) * resolution[0] * (box[2] - box[0]) * resolution[1] 53 | if area < 10: continue # ignore too small boxes 54 | boxes_eval.append(box) 55 | if len(boxes_eval) == 0: # no boxes 56 | self.exclude_key.append(image_key) # mark the excluded frames to filter the detections later 57 | continue 58 | 59 | for evaluator in self.lst_pascal_evaluator: 60 | evaluator.add_single_ground_truth_image_info( 61 | image_key, { 62 | standard_fields.InputDataFields.groundtruth_boxes: 63 | np.vstack(boxes_eval), 64 | standard_fields.InputDataFields.groundtruth_classes: 65 | np.array(info['labels'], dtype=int), 66 | standard_fields.InputDataFields.groundtruth_difficult: 67 | 
np.zeros(len(boxes_eval), dtype=bool) 68 | }) 69 | 70 | 71 | def load_detection(self, detections): 72 | """ Load detection results from dict memory 73 | """ 74 | for image_key, info in detections.items(): 75 | # filtering out results that are in the excluded frames 76 | if image_key in self.exclude_key or len(info['boxes']) == 0: 77 | continue 78 | 79 | # sorted by confidence: 80 | boxes, labels, scores = info['boxes'], info['action_ids'], info['scores'] 81 | index = np.argsort(-scores) 82 | boxes, labels, scores = boxes[index], labels[index], scores[index] 83 | 84 | # add info into evaluator 85 | for evaluator in self.lst_pascal_evaluator: 86 | evaluator.add_single_detected_image_info( 87 | image_key, { 88 | standard_fields.DetectionResultFields.detection_boxes: boxes, 89 | standard_fields.DetectionResultFields.detection_classes: labels, 90 | standard_fields.DetectionResultFields.detection_scores: scores 91 | }) 92 | 93 | def evaluate(self): 94 | result = {} 95 | for x, iou in enumerate(self.tiou_thresholds): 96 | evaluator = self.lst_pascal_evaluator[x] 97 | metrics = evaluator.evaluate() 98 | result.update(metrics) 99 | return result 100 | 101 | 102 | def frame_mAP_pascal(_results, _targets, vocab, logger, iou_list=[0.5]): 103 | evaluater = STDetectionEvaluaterUCF(tiou_thresholds=iou_list, activity_list=vocab, class_num=len(vocab)) 104 | 105 | logger.info("Adding ground truth into evaluator") 106 | evaluater.load_ground_truth(_targets) 107 | 108 | logger.info("Adding predictions into evaluator") 109 | evaluater.load_detection(_results) 110 | 111 | eval_res = evaluater.evaluate() 112 | 113 | return eval_res -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ucf24/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .ucf24_eval import do_ucf24_evaluation 3 | 4 | 5 | def ucf24_evaluation(dataset, predictions, output_folder, **kwargs): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing UCF24 evaluation.") 8 | return do_ucf24_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | metric=kwargs.get('metric', 'frame_ap'), 14 | save_csv=kwargs.get('save_csv', False) 15 | ) -------------------------------------------------------------------------------- /alphaction/dataset/datasets/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import logging 4 | import numpy as np 5 | import time 6 | import cv2 7 | import torch 8 | from iopath.common.file_io import g_pathmgr 9 | import os 10 | import pickle 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def retry_load_images(image_paths, retry=10, backend="pytorch"): 17 | """ 18 | This function is to load images with support of retrying for failed load. 19 | Args: 20 | image_paths (list): paths of images needed to be loaded. 21 | retry (int, optional): maximum time of loading retrying. Defaults to 10. 22 | backend (str): `pytorch` or `cv2`. 23 | Returns: 24 | imgs (list): list of loaded images. 
25 | """ 26 | for i in range(retry): 27 | imgs = [] 28 | for image_path in image_paths: 29 | with g_pathmgr.open(image_path, "rb") as f: 30 | img_str = np.frombuffer(f.read(), np.uint8) 31 | img = cv2.imdecode(img_str, flags=cv2.IMREAD_COLOR) 32 | imgs.append(img) 33 | 34 | if all(img is not None for img in imgs): 35 | if backend == "pytorch": 36 | imgs = torch.as_tensor(np.stack(imgs)) 37 | return imgs 38 | else: 39 | logger.warn("Reading failed. Will retry.") 40 | time.sleep(1.0) 41 | if i == retry - 1: 42 | raise Exception("Failed to load images {}".format(image_paths)) 43 | 44 | 45 | def read_greyscale_image(img_file): 46 | assert os.path.exists(img_file), "File does not exist!\n{}".format(img_file) 47 | im = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) 48 | im = im.astype(np.float32) / 255.0 49 | im = torch.from_numpy(im) 50 | return im 51 | 52 | 53 | def get_sequence(center_idx, half_len, sample_rate, num_frames): 54 | """ 55 | Sample frames among the corresponding clip. 56 | Args: 57 | center_idx (int): center frame idx for current clip 58 | half_len (int): half of the clip length 59 | sample_rate (int): sampling rate for sampling frames inside of the clip 60 | num_frames (int): number of expected sampled frames 61 | Returns: 62 | seq (list): list of indexes of sampled frames in this clip. 63 | """ 64 | seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate)) 65 | 66 | for seq_idx in range(len(seq)): 67 | if seq[seq_idx] < 0: 68 | seq[seq_idx] = 0 69 | elif seq[seq_idx] >= num_frames: 70 | seq[seq_idx] = num_frames - 1 71 | return seq 72 | 73 | def pack_pathway_output(cfg, frames, pathways=2): 74 | """ 75 | Prepare output as a list of tensors. Each tensor corresponding to a 76 | unique pathway. 77 | Args: 78 | frames (tensor): frames of images sampled from the video. The 79 | dimension is `channel` x `num frames` x `height` x `width`. 80 | Returns: 81 | frame_list (list): list of tensors with the dimension of 82 | `channel` x `num frames` x `height` x `width`. 83 | """ 84 | if cfg.DATA.REVERSE_INPUT_CHANNEL: 85 | frames = frames[[2, 1, 0], :, :, :] 86 | if pathways==1: 87 | frame_list = [frames] 88 | elif pathways==2: 89 | fast_pathway = frames 90 | # Perform temporal sampling from the fast pathway. 
91 | slow_pathway = torch.index_select( 92 | frames, 93 | 1, 94 | torch.linspace( 95 | 0, frames.shape[1] - 1, frames.shape[1] // cfg.SLOWFAST.ALPHA 96 | ).long(), 97 | ) 98 | frame_list = [slow_pathway, fast_pathway] 99 | else: 100 | raise NotImplementedError() 101 | return frame_list 102 | 103 | 104 | def load_dets_data(det_file, topk=None): 105 | assert os.path.exists(det_file), "detection file does not exist: {}".format(det_file) 106 | with open(det_file, 'rb') as fid: 107 | data = pickle.load(fid, encoding='iso-8859-1') 108 | # get list of all frames 109 | all_dets = dict() 110 | for vid, dets in data.items(): 111 | for i in list(dets['boxes'].keys()): 112 | boxes, scores = dets['boxes'][i], dets['scores'][i] 113 | key = "%s,%05d" % (vid, i) 114 | if topk is None: 115 | all_dets[key] = np.hstack((boxes, scores[:, None])) # (n, 5) 116 | else: 117 | indices = np.argsort(scores)[::-1][:topk] # topK maximum indices 118 | all_dets[key] = np.hstack((boxes[indices], scores[indices, None])) # (n, 5) 119 | return all_dets -------------------------------------------------------------------------------- /alphaction/dataset/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 4 | 5 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 6 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Code is copy-pasted exactly as in torch.utils.dataset.distributed. 2 | # FIXME remove this once c10d fixes the bug it has 3 | import math 4 | import torch 5 | import torch.distributed as dist 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class DistributedSampler(Sampler): 10 | """Sampler that restricts dataset loading to a subset of the dataset. 11 | It is especially useful in conjunction with 12 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 13 | process can pass a DistributedSampler instance as a DataLoader sampler, 14 | and load a subset of the original dataset that is exclusive to it. 15 | .. note:: 16 | Dataset is assumed to be of constant size. 17 | Arguments: 18 | dataset: Dataset used for sampling. 19 | num_replicas (optional): Number of processes participating in 20 | distributed training. 21 | rank (optional): Rank of the current process within num_replicas. 
22 | """ 23 | 24 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 25 | if num_replicas is None: 26 | if not dist.is_available(): 27 | raise RuntimeError("Requires distributed package to be available") 28 | num_replicas = dist.get_world_size() 29 | if rank is None: 30 | if not dist.is_available(): 31 | raise RuntimeError("Requires distributed package to be available") 32 | rank = dist.get_rank() 33 | self.dataset = dataset 34 | self.num_replicas = num_replicas 35 | self.rank = rank 36 | self.epoch = 0 37 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 38 | self.total_size = self.num_samples * self.num_replicas 39 | self.shuffle = shuffle 40 | 41 | def __iter__(self): 42 | if self.shuffle: 43 | # deterministically shuffle based on epoch 44 | g = torch.Generator() 45 | g.manual_seed(self.epoch) 46 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 47 | else: 48 | indices = torch.arange(len(self.dataset)).tolist() 49 | 50 | # add extra samples to make it evenly divisible 51 | indices += indices[: (self.total_size - len(indices))] 52 | assert len(indices) == self.total_size 53 | 54 | # subsample 55 | offset = self.num_samples * self.rank 56 | indices = indices[offset : offset + self.num_samples] 57 | assert len(indices) == self.num_samples 58 | 59 | return iter(indices) 60 | 61 | def __len__(self): 62 | return self.num_samples 63 | 64 | def set_epoch(self, epoch): 65 | self.epoch = epoch 66 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Modified based on https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py 2 | import itertools 3 | 4 | import torch 5 | from torch.utils.data.sampler import BatchSampler 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class GroupedBatchSampler(BatchSampler): 10 | """ 11 | Wraps another sampler to yield a mini-batch of indices. 12 | It enforces that elements from the same group should appear in groups of batch_size. 13 | It also tries to provide mini-batches which follows an ordering which is 14 | as close as possible to the ordering from the original sampler. 15 | 16 | Arguments: 17 | sampler (Sampler): Base sampler. 18 | batch_size (int): Size of mini-batch. 19 | drop_uneven (bool): If ``True``, the sampler will drop the batches whose 20 | size is less than ``batch_size`` 21 | 22 | """ 23 | 24 | def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): 25 | if not isinstance(sampler, Sampler): 26 | raise ValueError( 27 | "sampler should be an instance of " 28 | "torch.utils.dataset.Sampler, but got sampler={}".format(sampler) 29 | ) 30 | self.sampler = sampler 31 | self.group_ids = torch.as_tensor(group_ids) 32 | assert self.group_ids.dim() == 1 33 | self.batch_size = batch_size 34 | self.drop_uneven = drop_uneven 35 | 36 | self.groups = torch.unique(self.group_ids).sort(0)[0] 37 | 38 | def _prepare_batches(self): 39 | dataset_size = len(self.group_ids) 40 | # get the sampled indices from the sampler 41 | sampled_ids = torch.as_tensor(list(self.sampler)) 42 | # potentially not all elements of the dataset were sampled 43 | # by the sampler (e.g., DistributedSampler). 
44 | # construct a tensor which contains -1 if the element was 45 | # not sampled, and a non-negative number indicating the 46 | # order where the element was sampled. 47 | # for example. if sampled_ids = [3, 1] and dataset_size = 5, 48 | # the order is [-1, 1, -1, 0, -1] 49 | order = torch.full((dataset_size,), -1, dtype=torch.int64) 50 | order[sampled_ids] = torch.arange(len(sampled_ids)) 51 | 52 | # get a mask with the elements that were sampled 53 | mask = order >= 0 54 | 55 | # find the elements that belong to each individual cluster 56 | clusters = [(self.group_ids == i) & mask for i in self.groups] 57 | # get relative order of the elements inside each cluster 58 | # that follows the order from the sampler 59 | relative_order = [order[cluster] for cluster in clusters] 60 | # with the relative order, find the absolute order in the 61 | # sampled space 62 | permutation_ids = [s[s.sort()[1]] for s in relative_order] 63 | # permute each cluster so that they follow the order from 64 | # the sampler 65 | permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] 66 | 67 | # splits each cluster in batch_size, and merge as a list of tensors 68 | splits = [c.split(self.batch_size) for c in permuted_clusters] 69 | merged = tuple(itertools.chain.from_iterable(splits)) 70 | 71 | # now each batch internally has the right order, but 72 | # they are grouped by clusters. Find the permutation between 73 | # different batches that brings them as close as possible to 74 | # the order that we have in the sampler. For that, we will consider the 75 | # ordering as coming from the first element of each batch, and sort 76 | # correspondingly 77 | first_element_of_batch = [t[0].item() for t in merged] 78 | # get and inverse mapping from sampled indices and the position where 79 | # they occur (as returned by the sampler) 80 | inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} 81 | # from the first element in each batch, get a relative ordering 82 | first_index_of_batch = torch.as_tensor( 83 | [inv_sampled_ids_map[s] for s in first_element_of_batch] 84 | ) 85 | 86 | # permute the batches so that they approximately follow the order 87 | # from the sampler 88 | permutation_order = first_index_of_batch.sort(0)[1].tolist() 89 | # finally, permute the batches 90 | batches = [merged[i].tolist() for i in permutation_order] 91 | 92 | if self.drop_uneven: 93 | kept = [] 94 | for batch in batches: 95 | if len(batch) == self.batch_size: 96 | kept.append(batch) 97 | batches = kept 98 | return batches 99 | 100 | def __iter__(self): 101 | batches = self._prepare_batches() 102 | self._batches = batches 103 | return iter(batches) 104 | 105 | def __len__(self): 106 | if not hasattr(self, "_batches"): 107 | self._batches = self._prepare_batches() 108 | return len(self._batches) 109 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 
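        # num_iterations is the total number of batches to yield over the whole run;
        # __iter__ below keeps cycling the wrapped batch_sampler, starting from start_iter,
        # until that count is reached.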
13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /alphaction/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/engine/__init__.py -------------------------------------------------------------------------------- /alphaction/engine/feature_extraction.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from tqdm import tqdm 4 | import time 5 | import datetime 6 | from alphaction.utils.comm import get_rank, synchronize, get_world_size 7 | 8 | 9 | def do_feature_extraction(model_ddp, data_loader, distributed): 10 | 11 | device = torch.device("cuda") 12 | num_devices = get_world_size() 13 | dataset = data_loader.dataset 14 | 15 | if dataset.finished_feat_extraction(): 16 | return 17 | logger = logging.getLogger("alphaction.feature_extraction.{}".format(dataset._split)) 18 | 19 | logger.info("Start feature extraction on {} dataset({} videos).".format(dataset.__class__.__name__, len(dataset))) 20 | start_time = time.time() 21 | model = model_ddp.module if distributed else model_ddp 22 | model.eval() 23 | 24 | extra_args = {} if get_world_size() == 1 else dict(desc="feature extracting", disable=(not get_rank()==0)) 25 | 26 | with torch.no_grad(): 27 | for i, batch in tqdm(enumerate(data_loader), **extra_args): 28 | video, _, whwh, boxes, _, metadata, idx = batch 29 | video = video.to(device) 30 | 31 | # extract patch token features and CLS token feature 32 | features, cls_feat = model.backbone([video]) 33 | # extract text features 34 | text_features = model.backbone.forward_text(device=device) 35 | 36 | # save torch tensors 37 | dataset.save_features(idx, features[0].cpu(), cls_feat.cpu(), text_features.cpu()) 38 | 39 | if dataset.finished_feat_extraction(): 40 | logger.info("Finished feature extraction. 
") 41 | break # check if all samples are processed 42 | 43 | synchronize() 44 | total_time = time.time() - start_time 45 | total_time_str = str(datetime.timedelta(seconds=total_time)) 46 | logger.info("Feature extraction time: {} ({} s / video per device, on {} devices)".format( 47 | total_time_str, total_time * num_devices / len(dataset), num_devices)) 48 | -------------------------------------------------------------------------------- /alphaction/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .batch_norm import FrozenBatchNorm1d, FrozenBatchNorm2d, FrozenBatchNorm3d 2 | 3 | __all__ = [ "FrozenBatchNorm1d", "FrozenBatchNorm2d", "FrozenBatchNorm3d"] 4 | 5 | -------------------------------------------------------------------------------- /alphaction/layers/batch_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class _FrozenBatchNorm(nn.Module): 6 | def __init__(self, num_features, eps=1e-5, affine=True, track_running_stats=True): 7 | super(_FrozenBatchNorm, self).__init__() 8 | self.num_features = num_features 9 | self.eps = eps 10 | self.affine = affine 11 | self.track_running_stats = track_running_stats 12 | if self.affine: 13 | self.register_buffer("weight", torch.Tensor(num_features)) 14 | self.register_buffer("bias", torch.Tensor(num_features)) 15 | else: 16 | self.register_buffer("weight", None) 17 | self.register_buffer("bias", None) 18 | if self.track_running_stats: 19 | self.register_buffer('running_mean', torch.zeros(num_features)) 20 | self.register_buffer('running_var', torch.ones(num_features)) 21 | else: 22 | self.register_parameter('running_mean', None) 23 | self.register_parameter('running_var', None) 24 | self.reset_parameters() 25 | 26 | def reset_running_stats(self): 27 | if self.track_running_stats: 28 | self.running_mean.zero_() 29 | self.running_var.fill_(1) 30 | 31 | def reset_parameters(self): 32 | self.reset_running_stats() 33 | if self.affine: 34 | self.weight.data.uniform_() 35 | self.bias.data.zero_() 36 | 37 | def _check_input_dim(self, input): 38 | raise NotImplementedError 39 | 40 | def forward(self, input): 41 | self._check_input_dim(input) 42 | view_shape = (1, self.num_features) + (1,) * (input.dim() - 2) 43 | 44 | if self.track_running_stats: 45 | scale = self.weight / (self.running_var + self.eps).sqrt() 46 | bias = self.bias - self.running_mean * scale 47 | else: 48 | scale = self.weight 49 | bias = self.bias 50 | 51 | return scale.view(*view_shape) * input + bias.view(*view_shape) 52 | 53 | def extra_repr(self): 54 | return '{num_features}, eps={eps}, affine={affine}, ' \ 55 | 'track_running_stats={track_running_stats}'.format(**self.__dict__) 56 | 57 | def _load_from_state_dict(self, state_dict, prefix, metadata, strict, 58 | missing_keys, unexpected_keys, error_msgs): 59 | num_batches_tracked_key = prefix + 'num_batches_tracked' 60 | if num_batches_tracked_key in state_dict: 61 | del state_dict[num_batches_tracked_key] 62 | super(_FrozenBatchNorm, self)._load_from_state_dict( 63 | state_dict, prefix, metadata, strict, 64 | missing_keys, unexpected_keys, error_msgs) 65 | 66 | 67 | class FrozenBatchNorm1d(_FrozenBatchNorm): 68 | def _check_input_dim(self, input): 69 | if input.dim() != 2 and input.dim() != 3: 70 | raise ValueError('expected 2D or 3D input (got {}D input)' 71 | .format(input.dim())) 72 | 73 | 74 | class FrozenBatchNorm2d(_FrozenBatchNorm): 75 | def _check_input_dim(self, input): 76 
| if input.dim() != 4: 77 | raise ValueError('expected 4D input (got {}D input)' 78 | .format(input.dim())) 79 | 80 | 81 | class FrozenBatchNorm3d(_FrozenBatchNorm): 82 | def _check_input_dim(self, input): 83 | if input.dim() != 5: 84 | raise ValueError('expected 5D input (got {}D input)' 85 | .format(input.dim())) 86 | -------------------------------------------------------------------------------- /alphaction/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone -------------------------------------------------------------------------------- /alphaction/modeling/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from alphaction.modeling import registry 2 | from . import slowfast, i3d, video_model_builder 3 | 4 | @registry.BACKBONES.register("Slowfast-Resnet50") 5 | @registry.BACKBONES.register("Slowfast-Resnet101") 6 | def build_slowfast_resnet_backbone(cfg): 7 | model = slowfast.SlowFast(cfg) 8 | return model 9 | 10 | @registry.BACKBONES.register("PySlowonly") 11 | def build_pyslowonly_resnet_backbone(cfg): 12 | model = video_model_builder.ResNet(cfg) 13 | return model 14 | 15 | @registry.BACKBONES.register("PySlowfast-R50") 16 | @registry.BACKBONES.register("PySlowfast-R101") 17 | def build_pyslowfast_resnet_backbone(cfg): 18 | model = video_model_builder.SlowFast(cfg) 19 | return model 20 | 21 | @registry.BACKBONES.register("MAE-ViT-B") 22 | @registry.BACKBONES.register("MAE-ViT-L") 23 | def build_mae_vit_backbone(cfg): 24 | model = video_model_builder.ViT(cfg) 25 | return model 26 | 27 | @registry.BACKBONES.register("I3D-Resnet50") 28 | @registry.BACKBONES.register("I3D-Resnet101") 29 | @registry.BACKBONES.register("I3D-Resnet50-Sparse") 30 | @registry.BACKBONES.register("I3D-Resnet101-Sparse") 31 | def build_i3d_resnet_backbone(cfg): 32 | model = i3d.I3D(cfg) 33 | return model 34 | 35 | # OpenAI CLIP 36 | @registry.BACKBONES.register("ViT-B/16") 37 | @registry.BACKBONES.register("ViT-B/32") 38 | @registry.BACKBONES.register("ViT-L/14") 39 | def build_clip_vit_backbone(cfg): 40 | from alphaction.modeling.encoders.openai_clip.clip_encoder import build_clip_backbone 41 | model = build_clip_backbone(cfg) 42 | return model 43 | 44 | # CLIP-ViP 45 | @registry.BACKBONES.register("ViP-B/16") 46 | @registry.BACKBONES.register("ViP-B/32") 47 | def build_clipvip_backbone(cfg): 48 | from alphaction.modeling.encoders.clipvip.clipvip_encoder import build_clipvip_backbone 49 | model = build_clipvip_backbone(cfg) 50 | return model 51 | 52 | # ViCLIP from InternVideo 53 | @registry.BACKBONES.register("ViCLIP-L/14") 54 | def build_viclip_backbone(cfg): 55 | from alphaction.modeling.encoders.viclip.viclip_encoder import build_viclip_backbone 56 | model = build_viclip_backbone(cfg) 57 | return model 58 | 59 | 60 | def build_backbone(cfg): 61 | assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ 62 | "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( 63 | cfg.MODEL.BACKBONE.CONV_BODY 64 | ) 65 | return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) 66 | 
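# Hypothetical usage sketch (an editorial example, not part of the original file): how a
# backbone is obtained through the registry above. The config path, the yacs-style
# merge_from_file call, and the chosen CONV_BODY value are illustrative assumptions;
# any key registered above is resolved the same way.
#
#   from alphaction.config import cfg
#   from alphaction.modeling.backbone import build_backbone
#
#   cfg.merge_from_file("config_files/jhmdb/openmixer_zsr_zsl.yaml")
#   cfg.MODEL.BACKBONE.CONV_BODY = "ViCLIP-L/14"   # must match a registered key
#   backbone = build_backbone(cfg)                 # dispatched via registry.BACKBONES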
-------------------------------------------------------------------------------- /alphaction/modeling/backbone/i3d.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | 4 | import torch.nn as nn 5 | from alphaction.layers import FrozenBatchNorm3d 6 | from alphaction.modeling.common_blocks import ResNLBlock 7 | 8 | 9 | def get_model_cfg(cfg): 10 | backbone_strs = cfg.MODEL.BACKBONE.CONV_BODY.split('-')[1:] 11 | error_msg = 'Model backbone {} is not supported.'.format(cfg.MODEL.BACKBONE.CONV_BODY) 12 | 13 | use_temp_convs_1 = [2] 14 | temp_strides_1 = [2] 15 | max_pool_stride_1 = 2 16 | 17 | use_temp_convs_2 = [1, 1, 1] 18 | temp_strides_2 = [1, 1, 1] 19 | max_pool_stride_2 = 2 20 | 21 | use_temp_convs_3 = [1, 0, 1, 0] 22 | temp_strides_3 = [1, 1, 1, 1] 23 | 24 | use_temp_convs_5 = [0, 1, 0] 25 | temp_strides_5 = [1, 1, 1] 26 | 27 | avg_pool_stride = int(cfg.INPUT.FRAME_NUM / 8) 28 | if backbone_strs[0] == 'Resnet50': 29 | block_config = (3, 4, 6, 3) 30 | 31 | use_temp_convs_4 = [1, 0, 1, 0, 1, 0] 32 | temp_strides_4 = [1, 1, 1, 1, 1, 1] 33 | elif backbone_strs[0] == 'Resnet101': 34 | block_config = (3, 4, 23, 3) 35 | 36 | use_temp_convs_4 = [] 37 | for i in range(23): 38 | if i % 2 == 0: 39 | use_temp_convs_4.append(1) 40 | else: 41 | use_temp_convs_4.append(0) 42 | temp_strides_4 = [1, ] * 23 43 | else: 44 | raise KeyError(error_msg) 45 | 46 | if len(backbone_strs) > 1: 47 | if len(backbone_strs) == 2 and backbone_strs[1] == 'Sparse': 48 | temp_strides_1 = [1] 49 | max_pool_stride_1 = 1 50 | avg_pool_stride = int(cfg.INPUT.FRAME_NUM / 2) 51 | else: 52 | raise KeyError(error_msg) 53 | 54 | use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5] 55 | temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5] 56 | pool_strides_set = [max_pool_stride_1, max_pool_stride_2, avg_pool_stride] 57 | return block_config, use_temp_convs_set, temp_strides_set, pool_strides_set 58 | 59 | 60 | class I3D(nn.Module): 61 | def __init__(self, cfg): 62 | super(I3D, self).__init__() 63 | 64 | self.cfg = cfg.clone() 65 | 66 | block_config, use_temp_convs_set, temp_strides_set, pool_strides_set = get_model_cfg(cfg) 67 | conv3_nonlocal = cfg.MODEL.BACKBONE.I3D.CONV3_NONLOCAL 68 | conv4_nonlocal = cfg.MODEL.BACKBONE.I3D.CONV4_NONLOCAL 69 | 70 | dim_inner = 64 71 | conv_dims = [64, 256, 512, 1024, 2048] 72 | self.dim_out = conv_dims[-1] 73 | n1, n2, n3, n4 = block_config 74 | layer_mod = 2 75 | conv3_nl_mod = layer_mod 76 | conv4_nl_mod = layer_mod 77 | if not conv3_nonlocal: 78 | conv3_nl_mod = 1000 79 | if not conv4_nonlocal: 80 | conv4_nl_mod = 1000 81 | self.c2_mapping = None 82 | 83 | data_dim = 3 84 | self.conv1 = nn.Conv3d(data_dim, conv_dims[0], (1 + use_temp_convs_set[0][0] * 2, 7, 7), 85 | stride=(temp_strides_set[0][0], 2, 2), 86 | padding=(use_temp_convs_set[0][0], 3, 3), bias=False) 87 | nn.init.kaiming_normal_(self.conv1.weight) 88 | 89 | if cfg.MODEL.BACKBONE.FROZEN_BN: 90 | self.bn1 = FrozenBatchNorm3d(conv_dims[0], eps=cfg.MODEL.BACKBONE.BN_EPSILON) 91 | nn.init.constant_(self.bn1.weight, 1.0) 92 | nn.init.constant_(self.bn1.bias, 0.0) 93 | else: 94 | self.bn1 = nn.BatchNorm3d(conv_dims[0], eps=cfg.MODEL.BACKBONE.BN_EPSILON, momentum=cfg.MODEL.BACKBONE.BN_MOMENTUM) 95 | 96 | self.relu = nn.ReLU(inplace=True) 97 | self.maxpool1 = nn.MaxPool3d((pool_strides_set[0], 3, 3), 
stride=(pool_strides_set[0], 2, 2)) 98 | 99 | self.res_nl1 = ResNLBlock(cfg, conv_dims[0], conv_dims[1], stride=1, num_blocks=n1, dim_inner=dim_inner, 100 | use_temp_convs=use_temp_convs_set[1], temp_strides=temp_strides_set[1]) 101 | self.maxpool2 = nn.MaxPool3d((pool_strides_set[1], 1, 1), stride=(pool_strides_set[1], 1, 1)) 102 | 103 | self.res_nl2 = ResNLBlock(cfg, conv_dims[1], conv_dims[2], stride=2, num_blocks=n2, 104 | dim_inner=dim_inner * 2, use_temp_convs=use_temp_convs_set[2], 105 | temp_strides=temp_strides_set[2], nonlocal_mod=conv3_nl_mod, 106 | group_nonlocal=cfg.MODEL.BACKBONE.I3D.CONV3_GROUP_NL) 107 | 108 | self.res_nl3 = ResNLBlock(cfg, conv_dims[2], conv_dims[3], stride=2, num_blocks=n3, 109 | dim_inner=dim_inner * 4, use_temp_convs=use_temp_convs_set[3], 110 | temp_strides=temp_strides_set[3], nonlocal_mod=conv4_nl_mod) 111 | 112 | self.res_nl4 = ResNLBlock(cfg, conv_dims[3], conv_dims[4], stride=1, num_blocks=n4, 113 | dim_inner=dim_inner * 8, use_temp_convs=use_temp_convs_set[4], 114 | temp_strides=temp_strides_set[4], 115 | dilation=2) 116 | 117 | def forward(self, _, x): 118 | # We only use fast videos, which is the second input. 119 | out = self.conv1(x) 120 | out = self.bn1(out) 121 | out = self.relu(out) 122 | out = self.maxpool1(out) 123 | 124 | out = self.res_nl1(out) 125 | out = self.maxpool2(out) 126 | 127 | out = self.res_nl2(out) 128 | 129 | out = self.res_nl3(out) 130 | 131 | out = self.res_nl4(out) 132 | return None, out 133 | 134 | def c2_weight_mapping(self): 135 | if self.c2_mapping is None: 136 | weight_map = {'conv1.weight': 'conv1_w', 137 | 'bn1.weight': 'res_conv1_bn_s', 138 | 'bn1.bias': 'res_conv1_bn_b', 139 | 'bn1.running_mean': 'res_conv1_bn_rm', 140 | 'bn1.running_var': 'res_conv1_bn_riv'} 141 | for i in range(1, 5): 142 | name = 'res_nl{}'.format(i) 143 | child_map = getattr(self, name).c2_weight_mapping() 144 | for key, val in child_map.items(): 145 | new_key = name + '.' + key 146 | weight_map[new_key] = val.format(i + 1) 147 | self.c2_mapping = weight_map 148 | return self.c2_mapping 149 | -------------------------------------------------------------------------------- /alphaction/modeling/backbone/sfmodels/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Mlp(nn.Module): 8 | def __init__( 9 | self, 10 | in_features, 11 | hidden_features=None, 12 | out_features=None, 13 | act_layer=nn.GELU, 14 | drop_rate=0.0, 15 | ): 16 | super().__init__() 17 | self.drop_rate = drop_rate 18 | out_features = out_features or in_features 19 | hidden_features = hidden_features or in_features 20 | self.fc1 = nn.Linear(in_features, hidden_features) 21 | self.act = act_layer() 22 | self.fc2 = nn.Linear(hidden_features, out_features) 23 | if self.drop_rate > 0.0: 24 | self.drop = nn.Dropout(drop_rate) 25 | 26 | def forward(self, x): 27 | x = self.fc1(x) 28 | x = self.act(x) 29 | if self.drop_rate > 0.0: 30 | x = self.drop(x) 31 | x = self.fc2(x) 32 | if self.drop_rate > 0.0: 33 | x = self.drop(x) 34 | return x 35 | 36 | 37 | class Permute(nn.Module): 38 | def __init__(self, dims): 39 | super().__init__() 40 | self.dims = dims 41 | 42 | def forward(self, x): 43 | return x.permute(*self.dims) 44 | 45 | 46 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 47 | """ 48 | Stochastic Depth per sample. 
49 | """ 50 | if drop_prob == 0.0 or not training: 51 | return x 52 | keep_prob = 1 - drop_prob 53 | shape = (x.shape[0],) + (1,) * ( 54 | x.ndim - 1 55 | ) # work with diff dim tensors, not just 2D ConvNets 56 | mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 57 | mask.floor_() # binarize 58 | output = x.div(keep_prob) * mask 59 | return output 60 | 61 | 62 | class DropPath(nn.Module): 63 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 64 | 65 | def __init__(self, drop_prob=None): 66 | super(DropPath, self).__init__() 67 | self.drop_prob = drop_prob 68 | 69 | def forward(self, x): 70 | return drop_path(x, self.drop_prob, self.training) 71 | 72 | 73 | class TwoStreamFusion(nn.Module): 74 | def __init__(self, mode, dim=None, kernel=3, padding=1): 75 | """ 76 | A general constructor for neural modules fusing two equal sized tensors 77 | in forward. Following options are supported: 78 | "add" / "max" / "min" / "avg" : respective operations on the two halves. 79 | "concat" : NOOP. 80 | "concat_linear_{dim_mult}_{drop_rate}" : MLP to fuse with hidden dim "dim_mult" 81 | (optional, def 1.) higher than input dim 82 | with optional dropout "drop_rate" (def: 0.) 83 | "ln+concat_linear_{dim_mult}_{drop_rate}" : perform MLP after layernorm on the input. 84 | """ 85 | super().__init__() 86 | self.mode = mode 87 | if mode == "add": 88 | self.fuse_fn = lambda x: torch.stack(torch.chunk(x, 2, dim=2)).sum( 89 | dim=0 90 | ) 91 | elif mode == "max": 92 | self.fuse_fn = ( 93 | lambda x: torch.stack(torch.chunk(x, 2, dim=2)) 94 | .max(dim=0) 95 | .values 96 | ) 97 | elif mode == "min": 98 | self.fuse_fn = ( 99 | lambda x: torch.stack(torch.chunk(x, 2, dim=2)) 100 | .min(dim=0) 101 | .values 102 | ) 103 | elif mode == "avg": 104 | self.fuse_fn = lambda x: torch.stack(torch.chunk(x, 2, dim=2)).mean( 105 | dim=0 106 | ) 107 | elif mode == "concat": 108 | # x itself is the channel concat version 109 | self.fuse_fn = lambda x: x 110 | elif "concat_linear" in mode: 111 | if len(mode.split("_")) == 2: 112 | dim_mult = 1.0 113 | drop_rate = 0.0 114 | elif len(mode.split("_")) == 3: 115 | dim_mult = float(mode.split("_")[-1]) 116 | drop_rate = 0.0 117 | 118 | elif len(mode.split("_")) == 4: 119 | dim_mult = float(mode.split("_")[-2]) 120 | drop_rate = float(mode.split("_")[-1]) 121 | else: 122 | raise NotImplementedError 123 | 124 | if mode.split("+")[0] == "ln": 125 | self.fuse_fn = nn.Sequential( 126 | nn.LayerNorm(dim), 127 | Mlp( 128 | in_features=dim, 129 | hidden_features=int(dim * dim_mult), 130 | act_layer=nn.GELU, 131 | out_features=dim, 132 | drop_rate=drop_rate, 133 | ), 134 | ) 135 | else: 136 | self.fuse_fn = Mlp( 137 | in_features=dim, 138 | hidden_features=int(dim * dim_mult), 139 | act_layer=nn.GELU, 140 | out_features=dim, 141 | drop_rate=drop_rate, 142 | ) 143 | 144 | else: 145 | raise NotImplementedError 146 | 147 | def forward(self, x): 148 | if "concat_linear" in self.mode: 149 | return self.fuse_fn(x) + x 150 | 151 | else: 152 | return self.fuse_fn(x) -------------------------------------------------------------------------------- /alphaction/modeling/backbone/sfmodels/nonlocal_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | 4 | """Non-local helper""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class Nonlocal(nn.Module): 11 | """ 12 | Builds Non-local Neural Networks as a generic family of building 13 | blocks for capturing long-range dependencies. Non-local Network 14 | computes the response at a position as a weighted sum of the 15 | features at all positions. This building block can be plugged into 16 | many computer vision architectures. 17 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf 18 | """ 19 | 20 | def __init__( 21 | self, 22 | dim, 23 | dim_inner, 24 | pool_size=None, 25 | instantiation="softmax", 26 | zero_init_final_conv=False, 27 | zero_init_final_norm=True, 28 | norm_eps=1e-5, 29 | norm_momentum=0.1, 30 | norm_module=nn.BatchNorm3d, 31 | ): 32 | """ 33 | Args: 34 | dim (int): number of dimension for the input. 35 | dim_inner (int): number of dimension inside of the Non-local block. 36 | pool_size (list): the kernel size of spatial temporal pooling, 37 | temporal pool kernel size, spatial pool kernel size, spatial 38 | pool kernel size in order. By default pool_size is None, 39 | then there would be no pooling used. 40 | instantiation (string): supports two different instantiation method: 41 | "dot_product": normalizing correlation matrix with L2. 42 | "softmax": normalizing correlation matrix with Softmax. 43 | zero_init_final_conv (bool): If true, zero initializing the final 44 | convolution of the Non-local block. 45 | zero_init_final_norm (bool): 46 | If true, zero initializing the final batch norm of the Non-local 47 | block. 48 | norm_module (nn.Module): nn.Module for the normalization layer. The 49 | default is nn.BatchNorm3d. 50 | """ 51 | super(Nonlocal, self).__init__() 52 | self.dim = dim 53 | self.dim_inner = dim_inner 54 | self.pool_size = pool_size 55 | self.instantiation = instantiation 56 | self.use_pool = ( 57 | False 58 | if pool_size is None 59 | else any((size > 1 for size in pool_size)) 60 | ) 61 | self.norm_eps = norm_eps 62 | self.norm_momentum = norm_momentum 63 | self._construct_nonlocal( 64 | zero_init_final_conv, zero_init_final_norm, norm_module 65 | ) 66 | 67 | def _construct_nonlocal( 68 | self, zero_init_final_conv, zero_init_final_norm, norm_module 69 | ): 70 | # Three convolution heads: theta, phi, and g. 71 | self.conv_theta = nn.Conv3d( 72 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 73 | ) 74 | self.conv_phi = nn.Conv3d( 75 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 76 | ) 77 | self.conv_g = nn.Conv3d( 78 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 79 | ) 80 | 81 | # Final convolution output. 82 | self.conv_out = nn.Conv3d( 83 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 84 | ) 85 | # Zero initializing the final convolution output. 86 | self.conv_out.zero_init = zero_init_final_conv 87 | 88 | # TODO: change the name to `norm` 89 | self.bn = norm_module( 90 | num_features=self.dim, 91 | eps=self.norm_eps, 92 | momentum=self.norm_momentum, 93 | ) 94 | # Zero initializing the final bn. 95 | self.bn.transform_final_bn = zero_init_final_norm 96 | 97 | # Optional to add the spatial-temporal pooling. 98 | if self.use_pool: 99 | self.pool = nn.MaxPool3d( 100 | kernel_size=self.pool_size, 101 | stride=self.pool_size, 102 | padding=[0, 0, 0], 103 | ) 104 | 105 | def forward(self, x): 106 | x_identity = x 107 | N, C, T, H, W = x.size() 108 | 109 | theta = self.conv_theta(x) 110 | 111 | # Perform temporal-spatial pooling to reduce the computation. 
112 | if self.use_pool: 113 | x = self.pool(x) 114 | 115 | phi = self.conv_phi(x) 116 | g = self.conv_g(x) 117 | 118 | theta = theta.view(N, self.dim_inner, -1) 119 | phi = phi.view(N, self.dim_inner, -1) 120 | g = g.view(N, self.dim_inner, -1) 121 | 122 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). 123 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) 124 | # For original Non-local paper, there are two main ways to normalize 125 | # the affinity tensor: 126 | # 1) Softmax normalization (norm on exp). 127 | # 2) dot_product normalization. 128 | if self.instantiation == "softmax": 129 | # Normalizing the affinity tensor theta_phi before softmax. 130 | theta_phi = theta_phi * (self.dim_inner**-0.5) 131 | theta_phi = nn.functional.softmax(theta_phi, dim=2) 132 | elif self.instantiation == "dot_product": 133 | spatial_temporal_dim = theta_phi.shape[2] 134 | theta_phi = theta_phi / spatial_temporal_dim 135 | else: 136 | raise NotImplementedError( 137 | "Unknown norm type {}".format(self.instantiation) 138 | ) 139 | 140 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). 141 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) 142 | 143 | # (N, C, TxHxW) => (N, C, T, H, W). 144 | theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W) 145 | 146 | p = self.conv_out(theta_phi_g) 147 | p = self.bn(p) 148 | return x_identity + p -------------------------------------------------------------------------------- /alphaction/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | from .stm_detector import build_detection_model 2 | from .naive_baseline import build_naive_baseline -------------------------------------------------------------------------------- /alphaction/modeling/detector/action_detector.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from ..backbone import build_backbone 4 | from ..roi_heads.roi_heads_3d import build_3d_roi_heads 5 | 6 | 7 | class ActionDetector(nn.Module): 8 | def __init__(self, cfg): 9 | super(ActionDetector, self).__init__() 10 | self.backbone = build_backbone(cfg) 11 | self.roi_heads = build_3d_roi_heads(cfg, self.backbone.dim_out) 12 | 13 | def forward(self, slow_video, fast_video, boxes, objects=None, extras={}, part_forward=-1): 14 | # part_forward is used to split this model into two parts. 15 | # if part_forward<0, just use it as a single model 16 | # if part_forward=0, use this model to extract pooled feature(person and object, no memory features). 17 | # if part_forward=1, use the ia structure to aggregate interactions and give final result. 18 | # implemented in roi_heads 19 | 20 | if part_forward==1: 21 | slow_features = fast_features = None 22 | else: 23 | slow_features, fast_features = self.backbone(slow_video, fast_video) 24 | 25 | result, detector_losses, loss_weight, detector_metrics = self.roi_heads(slow_features, fast_features, boxes, objects, extras, part_forward) 26 | 27 | if self.training: 28 | return detector_losses, loss_weight, detector_metrics, result 29 | 30 | return result 31 | 32 | def c2_weight_mapping(self): 33 | if not hasattr(self, "c2_mapping"): 34 | weight_map = {} 35 | for name, m_child in self.named_children(): 36 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 37 | child_map = m_child.c2_weight_mapping() 38 | for key, val in child_map.items(): 39 | new_key = name + '.' 
+ key 40 | weight_map[new_key] = val 41 | self.c2_mapping = weight_map 42 | return self.c2_mapping 43 | 44 | def build_detection_model(cfg): 45 | return ActionDetector(cfg) -------------------------------------------------------------------------------- /alphaction/modeling/detector/naive_baseline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ..backbone import build_backbone 5 | from alphaction.modeling.stm_decoder.util.box_ops import clip_boxes_tensor 6 | from torchvision.ops import roi_align 7 | from einops import rearrange 8 | import numpy as np 9 | 10 | 11 | 12 | class NaiveBaseline(nn.Module): 13 | def __init__(self, cfg): 14 | super(NaiveBaseline, self).__init__() 15 | self.backbone = build_backbone(cfg) 16 | assert self.backbone.visual_encoder.use_cls_feat 17 | assert cfg.DATA.OPEN_VOCABULARY 18 | 19 | self.use_roi_feat = cfg.MODEL.USE_ROI_FEAT 20 | self.multi_label_action = cfg.MODEL.MULTI_LABEL_ACTION 21 | 22 | 23 | def roi_align_pool(self, patch_feats, batch_boxes, raw_sizes, out_size=(7, 7), spatial_scale=1.0/16): 24 | """ patch_feats: (B, D, T, h, w) 25 | boxes: list of boxes, not normalized 26 | raw_sizes: (B, 2) in (width, height) 27 | """ 28 | B, D, T, h, w = patch_feats.size() 29 | device = patch_feats.device 30 | feat_maps = patch_feats.mean(dim=2) # (B, D, h, w) temporally mean pooling 31 | boxes_list = [np.hstack([np.ones((boxes.shape[0], 1)) * i, boxes]) for i, boxes in enumerate(batch_boxes)] 32 | boxes_tensor = torch.from_numpy(np.vstack(boxes_list)).type(patch_feats.dtype).to(device) 33 | roi_feat = roi_align(feat_maps, boxes_tensor, out_size, spatial_scale) # (BN, D, 7, 7) 34 | roi_feat = rearrange(roi_feat, 'm d h w -> m (h w) d') 35 | 36 | # get meanpooled roi features 37 | roi_align_features = [] 38 | batch_indices = boxes_tensor[:, 0].long() 39 | for i in range(B): 40 | rois = roi_feat[batch_indices == i].mean(dim=1) # (n, d) 41 | roi_align_features.append(rois) 42 | 43 | return roi_align_features 44 | 45 | 46 | def forward(self, slow_video, fast_video, whwh, boxes=None, labels=None, extras={}, part_forward=-1): 47 | 48 | assert not self.training, "NaiveBaseline does not need training!" 49 | assert 'prior_boxes' in extras, "NaiveBaseline use loaded boxes for testing!" 
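        # What follows: normalize the externally provided person boxes to [0, 1], encode the
        # clip with the video-text backbone, then score each box (RoIAlign-pooled patch
        # features, or the global CLS feature repeated per box) against the text embeddings
        # of the open-vocabulary class prompts via temperature-scaled cosine similarity.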
50 | device = slow_video.device 51 | 52 | prior_boxes = extras['prior_boxes'] 53 | box_list = [] 54 | for i in range(len(prior_boxes)): 55 | box = torch.tensor(prior_boxes[i], dtype=torch.float32, device=device) 56 | cur_whwh = whwh[i] 57 | box = clip_boxes_tensor(box, cur_whwh[1], cur_whwh[0]) 58 | box[:, 0::2] /= cur_whwh[0] 59 | box[:, 1::2] /= cur_whwh[1] 60 | box_list.append(box) 61 | 62 | if self.backbone.num_pathways == 1: 63 | features = self.backbone([slow_video]) 64 | else: 65 | features = self.backbone([slow_video, fast_video]) 66 | 67 | patch_feats, cls_feat_visual = features # (B, 512) 68 | B = cls_feat_visual.size(0) 69 | 70 | if self.use_roi_feat: 71 | # feature projection & RoIAlign Pooling 72 | patch_feats = self.backbone.visual_encoder.project_patch_features(patch_feats[0]) 73 | roi_features = self.roi_align_pool(patch_feats, prior_boxes, whwh[:, :2]) 74 | 75 | # get the current text feature embeddings 76 | text_features = self.backbone.forward_text(device=slow_video.device) # (K, 512) 77 | tau_inv = self.backbone.tau_inv 78 | 79 | if isinstance(text_features, list): 80 | text_features = torch.stack(text_features).mean(1) 81 | text_features_normed = text_features / text_features.norm(dim=-1, keepdim=True) # (K, D) 82 | 83 | action_score_list = [] 84 | if self.use_roi_feat: 85 | # return self.forward_roi_cls(roi_features, text_features, tau_inv, whwh) 86 | for roi_feat in roi_features: 87 | # action recognition 88 | vis_features_normed = roi_feat / roi_feat.norm(dim=-1, keepdim=True) # (N, D) 89 | action_score = tau_inv * vis_features_normed @ text_features_normed.t() # (N, K) 90 | action_score_list.append(action_score) 91 | else: 92 | vis_features_normed = cls_feat_visual / cls_feat_visual.norm(dim=-1, keepdim=True) # (B, D) 93 | action_score = tau_inv * vis_features_normed @ text_features_normed.t() # (B, K) 94 | for i in range(B): 95 | # with full frame input, we only have one score vector, which need to be repeated. 96 | scores = action_score[[i]].repeat(box_list[i].size(0), 1) # (1, K) 97 | action_score_list.append(scores) 98 | 99 | return action_score_list, box_list 100 | 101 | 102 | def build_naive_baseline(cfg): 103 | return NaiveBaseline(cfg) -------------------------------------------------------------------------------- /alphaction/modeling/dict_model.py: -------------------------------------------------------------------------------- 1 | # Simple pytorch implementation of Dictionary Learning based on stochastic gradient descent 2 | # 3 | # June 2018 4 | # Jeremias Sulam 5 | 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | import torch.nn.functional as F 11 | import numpy as np 12 | 13 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 14 | 15 | 16 | #################################### 17 | ## Dict. 
Learning ## 18 | #################################### 19 | 20 | class DictLearn(nn.Module): 21 | def __init__(self, num_basis, dim_basis, SC='FISTA', sc_iters=None): 22 | super(DictLearn, self).__init__() 23 | 24 | self.W = nn.Parameter(torch.randn(dim_basis, num_basis, requires_grad=False)) 25 | 26 | # normalization 27 | self.W.data = NormDict(self.W.data) 28 | self.SC = SC 29 | self.sc_iters = sc_iters 30 | 31 | if self.sc_iters is None: 32 | self.sc_iters = 20 if SC=='FISTA' else 50 33 | 34 | 35 | 36 | def forward(self, Y, K): 37 | 38 | # normalizing Dict 39 | self.W.requires_grad_(False) 40 | self.W.data = NormDict(self.W.data) 41 | 42 | # Sparse Coding 43 | if self.SC == 'IHT': 44 | Gamma, residual, errIHT = IHT(Y,self.W,K, self.sc_iters) 45 | elif self.SC == 'FISTA': 46 | Gamma, residual, errIHT = FISTA(Y,self.W,K, self.sc_iters) 47 | else: print("Oops!") 48 | 49 | # Reconstructing 50 | self.W.requires_grad_(True) 51 | X = torch.mm(Gamma,self.W.transpose(1,0)) 52 | 53 | # sparsity 54 | # NNZ = np.count_nonzero(Gamma.cpu().data.numpy())/Gamma.shape[0] 55 | 56 | return X, Gamma, errIHT 57 | 58 | 59 | 60 | #-------------------------------------------------------------- 61 | # Auxiliary Functions 62 | #-------------------------------------------------------------- 63 | 64 | def hard_threshold_k(X, k): 65 | Gamma = X.clone() 66 | m = X.data.shape[1] 67 | a,_ = torch.abs(Gamma).data.sort(dim=1,descending=True) 68 | T = torch.mm(a[:,k].unsqueeze(1),torch.Tensor(np.ones((1,m))).to(device)) 69 | mask = Variable(torch.Tensor((np.abs(Gamma.data.cpu().numpy())>T.cpu().numpy()) + 0.)).to(device) 70 | Gamma = Gamma * mask 71 | return Gamma#, mask.data.nonzero() 72 | 73 | #-------------------------------------------------------------- 74 | 75 | 76 | def soft_threshold(X, lamda): 77 | #pdb.set_trace() 78 | Gamma = X.clone() 79 | Gamma = torch.sign(Gamma) * F.relu(torch.abs(Gamma)-lamda) 80 | return Gamma.to(device) 81 | 82 | 83 | #-------------------------------------------------------------- 84 | 85 | 86 | def IHT(Y,W,K, ITER=50): 87 | 88 | c = PowerMethod(W) 89 | eta = 1/c 90 | Gamma = hard_threshold_k(torch.mm(Y,eta*W),K) 91 | residual = torch.mm(Gamma, W.transpose(1,0)) - Y 92 | 93 | norms = np.zeros((ITER,)) 94 | 95 | for i in range(ITER): 96 | Gamma = hard_threshold_k(Gamma - eta * torch.mm(residual, W), K) 97 | residual = torch.mm(Gamma, W.transpose(1,0)) - Y 98 | norms[i] = np.linalg.norm(residual.cpu().numpy(),'fro')/ np.linalg.norm(Y.cpu().numpy(),'fro') 99 | 100 | return Gamma, residual, norms 101 | 102 | 103 | #-------------------------------------------------------------- 104 | 105 | 106 | def FISTA(Y,W,lamda, ITER=20): 107 | 108 | c = PowerMethod(W) 109 | eta = 1/c 110 | norms = np.zeros((ITER,)) 111 | 112 | Gamma = soft_threshold(torch.mm(Y,eta*W),lamda) 113 | Z = Gamma.clone() 114 | Gamma_1 = Gamma.clone() 115 | t = 1 116 | 117 | for i in range(ITER): 118 | Gamma_1 = Gamma.clone() 119 | residual = torch.mm(Z, W.transpose(1,0)) - Y 120 | Gamma = soft_threshold(Z - eta * torch.mm(residual, W), lamda/c) 121 | 122 | t_1 = t 123 | t = (1+np.sqrt(1 + 4*t**2))/2 124 | #pdb.set_trace() 125 | Z = Gamma + ((t_1 - 1)/t * (Gamma - Gamma_1)).to(device) 126 | 127 | norms[i] = np.linalg.norm(residual.cpu().numpy(),'fro')/ np.linalg.norm(Y.cpu().numpy(),'fro') 128 | 129 | return Gamma, residual, norms 130 | 131 | 132 | #-------------------------------------------------------------- 133 | 134 | def NormDict(W): 135 | Wn = torch.norm(W, p=2, dim=0).detach() 136 | W = W.div(Wn.expand_as(W)) 137 | 
return W 138 | 139 | #-------------------------------------------------------------- 140 | 141 | def PowerMethod(W): 142 | ITER = 100 143 | m = W.shape[1] 144 | X = torch.randn(1, m).to(device) 145 | for i in range(ITER): 146 | Dgamma = torch.mm(X,W.transpose(1,0)) 147 | X = torch.mm(Dgamma,W) 148 | nm = torch.norm(X,p=2) 149 | X = X/nm 150 | 151 | return nm 152 | 153 | #-------------------------------------------------------------- 154 | 155 | 156 | def showFilters(W,ncol,nrows): 157 | p = int(np.sqrt(W.shape[0]))+2 158 | Nimages = W.shape[1] 159 | Mosaic = np.zeros((p*ncol,p*nrows)) 160 | indx = 0 161 | for i in range(ncol): 162 | for j in range(nrows): 163 | im = W[:,indx].reshape(p-2,p-2) 164 | im = (im-np.min(im)) 165 | im = im/np.max(im) 166 | Mosaic[ i*p : (i+1)*p , j*p : (j+1)*p ] = np.pad(im,(1,1),mode='constant') 167 | indx += 1 168 | 169 | return Mosaic 170 | -------------------------------------------------------------------------------- /alphaction/modeling/encoders/clipvip/custom_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | 5 | 6 | 7 | class LayerNorm(nn.LayerNorm): 8 | """Subclass torch's LayerNorm to handle fp16.""" 9 | 10 | def forward(self, x: torch.Tensor): 11 | orig_type = x.dtype 12 | ret = super().forward(x.type(torch.float32)) 13 | return ret.type(orig_type) 14 | 15 | 16 | class QuickGELU(nn.Module): 17 | def forward(self, x: torch.Tensor): 18 | return x * torch.sigmoid(1.702 * x) 19 | 20 | 21 | class CrossAttnBlock(nn.Module): 22 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop: float = 0., return_kv=False): 23 | super().__init__() 24 | 25 | self.attn = nn.MultiheadAttention(d_model, n_head, dropout=drop) 26 | self.ln_x = LayerNorm(d_model) 27 | self.ln_y = LayerNorm(d_model) 28 | self.mlp = nn.Sequential(OrderedDict([ 29 | ("c_fc", nn.Linear(d_model, d_model * 4)), 30 | ("fc_drop", nn.Dropout(drop)), 31 | ("gelu", QuickGELU()), 32 | ("c_proj", nn.Linear(d_model * 4, d_model)), 33 | ("proj_drop", nn.Dropout(drop)), 34 | ])) 35 | self.ln_2 = LayerNorm(d_model) 36 | self.attn_mask = attn_mask 37 | self.return_kv = return_kv 38 | 39 | def attention(self, x: torch.Tensor, y: torch.Tensor): 40 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 41 | return self.attn(x, y, y, need_weights=False, attn_mask=self.attn_mask)[0] 42 | 43 | def forward(self, x: torch.Tensor, y: torch.Tensor): 44 | """ x: query (T=1, B, d) 45 | y: key & value (T=64, B, d) 46 | """ 47 | if len(x.size()) == 2: 48 | x = x.unsqueeze(0) 49 | x = x + self.attention(self.ln_x(x), self.ln_y(y)) 50 | x = x + self.mlp(self.ln_2(x)) 51 | if x.size(0) == 1: 52 | x = x.squeeze(0) 53 | if self.return_kv: 54 | return x, y 55 | return x 56 | 57 | 58 | class CrossAttnModules(nn.Sequential): 59 | def forward(self, *input): 60 | for module in self._modules.values(): 61 | input = module(*input) 62 | return input[0] -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: mit 3 | --- 4 | -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_tokenizer 
import SimpleTokenizer as _Tokenizer 2 | from .viclip import ViCLIP 3 | import torch 4 | import numpy as np 5 | import cv2 6 | 7 | clip_candidates = {'viclip':None, 'clip':None} 8 | 9 | def get_clip(name='viclip', weight_file=None): 10 | global clip_candidates 11 | m = clip_candidates[name] 12 | if m is None: 13 | if name == 'viclip': 14 | tokenizer = _Tokenizer() 15 | vclip = ViCLIP(tokenizer, pretrain=weight_file) 16 | # m = vclip 17 | m = (vclip, tokenizer) 18 | else: 19 | raise Exception('the target clip model is not found.') 20 | 21 | return m 22 | 23 | def get_text_feat_dict(texts, clip, tokenizer, text_feat_d={}): 24 | for t in texts: 25 | feat = clip.get_text_features(t, tokenizer, text_feat_d) 26 | text_feat_d[t] = feat 27 | return text_feat_d 28 | 29 | def get_vid_feat(frames, clip): 30 | return clip.get_vid_features(frames) 31 | 32 | def _frame_from_video(video): 33 | while video.isOpened(): 34 | success, frame = video.read() 35 | if success: 36 | yield frame 37 | else: 38 | break 39 | 40 | v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3) 41 | v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3) 42 | def normalize(data): 43 | return (data/255.0-v_mean)/v_std 44 | 45 | def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')): 46 | assert(len(vid_list) >= fnum) 47 | step = len(vid_list) // fnum 48 | vid_list = vid_list[::step][:fnum] 49 | vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list] 50 | vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list] 51 | vid_tube = np.concatenate(vid_tube, axis=1) 52 | vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3)) 53 | vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float() 54 | return vid_tube 55 | 56 | def retrieve_text(frames, texts, name='viclip', weight_file=None, topk=5, device=torch.device('cuda')): 57 | clip, tokenizer = get_clip(name, weight_file) 58 | clip = clip.to(device) 59 | frames_tensor = frames2tensor(frames, device=device) 60 | vid_feat = get_vid_feat(frames_tensor, clip) 61 | 62 | text_feat_d = {} 63 | text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d) 64 | text_feats = [text_feat_d[t] for t in texts] 65 | text_feats_tensor = torch.cat(text_feats, 0) 66 | 67 | probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk) 68 | 69 | ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()] 70 | return ret_texts, probs.numpy()[0] 71 | 72 | -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/encoders/viclip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import cv2 4 | import argparse 5 | import torch 6 | 7 | import sys 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..', '..')) 9 | from alphaction.modeling.encoders.viclip import retrieve_text, _frame_from_video 10 | from alphaction.config import cfg 11 | from alphaction.dataset import make_data_loader 12 | from alphaction.dataset.datasets import utils as utils 13 | from alphaction.utils.random_seed import set_seed 
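# NOTE: the demo below randomly samples keyframe clips from the training split and runs
# zero-shot text-to-video retrieval with ViCLIP against the closed-world class captions;
# it expects the pretrained checkpoint 'pretrained/ViClip-InternVid-10M-FLT.pth'
# referenced in the retrieve_text(...) call in __main__.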
14 | 15 | 16 | 17 | def get_cfg(): 18 | parser = argparse.ArgumentParser(description="PyTorch Action Detection Training") 19 | parser.add_argument( 20 | "--config-file", 21 | default="", 22 | metavar="FILE", 23 | help="path to config file", 24 | type=str, 25 | ) 26 | parser.add_argument("--local_rank", type=int, default=0) 27 | parser.add_argument( 28 | "--skip-final-test", 29 | dest="skip_test", 30 | help="Do not test the final model", 31 | action="store_true", 32 | ) 33 | parser.add_argument( 34 | "--skip-val-in-train", 35 | dest="skip_val", 36 | help="Do not validate during training", 37 | action="store_true", 38 | ) 39 | parser.add_argument( 40 | "--transfer", 41 | dest="transfer_weight", 42 | help="Transfer weight from a pretrained model", 43 | action="store_true" 44 | ) 45 | parser.add_argument( 46 | "--adjust-lr", 47 | dest="adjust_lr", 48 | help="Adjust learning rate scheduler from old checkpoint", 49 | action="store_true" 50 | ) 51 | parser.add_argument( 52 | "--no-head", 53 | dest="no_head", 54 | help="Not load the head layer parameters from weight file", 55 | action="store_true" 56 | ) 57 | parser.add_argument( 58 | "--use-tfboard", 59 | action='store_true', 60 | dest='tfboard', 61 | help='Use tensorboard to log stats' 62 | ) 63 | parser.add_argument( 64 | "--seed", 65 | type=int, 66 | default=2, 67 | help="Manual seed at the begining." 68 | ) 69 | parser.add_argument( 70 | "opts", 71 | help="Modify config options using the command-line", 72 | default=None, 73 | nargs=argparse.REMAINDER, 74 | ) 75 | 76 | args = parser.parse_args() 77 | 78 | num_gpus = 1 79 | args.distributed = False 80 | 81 | torch.backends.cudnn.deterministic = True 82 | torch.backends.cudnn.benchmark = False 83 | 84 | # Merge config. 85 | cfg.merge_from_file(args.config_file) 86 | cfg.merge_from_list(args.opts) 87 | cfg.freeze() 88 | 89 | set_seed(args.seed, 0, num_gpus) 90 | 91 | return cfg 92 | 93 | 94 | def get_one_sample(dataset): 95 | idx = int(np.random.choice(list(range(len(dataset))), 1)) 96 | video_idx, sec_idx, sec, center_idx = dataset._keyframe_indices[idx] 97 | # Get the frame idxs for current clip. 98 | seq = utils.get_sequence( 99 | center_idx, 100 | dataset._seq_len // 2, 101 | dataset._sample_rate, 102 | num_frames=len(dataset._image_paths[video_idx]), 103 | ) 104 | 105 | # Load images of current clip. 
106 | image_paths = [dataset._image_paths[video_idx][frame] for frame in seq] 107 | imgs = utils.retry_load_images( 108 | image_paths, backend='cv2' 109 | ) 110 | 111 | clip_label_list = dataset._keyframe_boxes_and_labels[video_idx][sec_idx] 112 | assert len(clip_label_list) > 0 113 | labels = [] 114 | for box_labels in clip_label_list: 115 | for label in box_labels[1]: 116 | if label == -1: 117 | continue 118 | label = dataset.id_to_indices['closed'][label] 119 | labels.append(label) 120 | 121 | return imgs, labels 122 | 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | cfg = get_cfg() 128 | 129 | data_loader, vocabulary_train, iter_per_epoch = make_data_loader( 130 | cfg, 131 | is_train=True, 132 | is_distributed=False, 133 | start_iter=0, 134 | ) 135 | 136 | for n in range(10): 137 | print("Trial {}...".format(n + 1)) 138 | frames, labels = get_one_sample(data_loader.dataset) 139 | class_texts = [elems['caption'] for clsname, elems in data_loader.dataset.text_input['closed'].items()] 140 | gt_texts = [class_texts[clsid] for clsid in labels] 141 | 142 | texts, probs = retrieve_text(frames, class_texts, name='viclip', topk=5, weight_file='pretrained/ViClip-InternVid-10M-FLT.pth') 143 | 144 | for t, p in zip(texts, probs): 145 | print(f'text: {t} ~ prob: {p:.4f}') 146 | 147 | print("Ground Truth class texts: ", gt_texts) -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | # @lru_cache() 14 | # def default_bpe(): 15 | # return "bpe_simple_vocab_16e6.txt.gz" 16 | 17 | 18 | @lru_cache() 19 | def bytes_to_unicode(): 20 | """ 21 | Returns list of utf-8 byte and a corresponding list of unicode strings. 22 | The reversible bpe codes work on unicode strings. 23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 25 | This is a signficant percentage of your normal, say, 32K bpe vocab. 26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 27 | And avoids mapping to whitespace/control characters the bpe code barfs on. 28 | """ 29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 30 | cs = bs[:] 31 | n = 0 32 | for b in range(2**8): 33 | if b not in bs: 34 | bs.append(b) 35 | cs.append(2**8+n) 36 | n += 1 37 | cs = [chr(n) for n in cs] 38 | return dict(zip(bs, cs)) 39 | 40 | 41 | def get_pairs(word): 42 | """Return set of symbol pairs in a word. 43 | Word is represented as tuple of symbols (symbols being variable-length strings). 
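Example: get_pairs(('h', 'e', 'l', 'l', 'o')) -> {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.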
44 | """ 45 | pairs = set() 46 | prev_char = word[0] 47 | for char in word[1:]: 48 | pairs.add((prev_char, char)) 49 | prev_char = char 50 | return pairs 51 | 52 | 53 | def basic_clean(text): 54 | text = ftfy.fix_text(text) 55 | text = html.unescape(html.unescape(text)) 56 | return text.strip() 57 | 58 | 59 | def whitespace_clean(text): 60 | text = re.sub(r'\s+', ' ', text) 61 | text = text.strip() 62 | return text 63 | 64 | 65 | class SimpleTokenizer(object): 66 | def __init__(self, bpe_path: str = default_bpe()): 67 | self.byte_encoder = bytes_to_unicode() 68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 70 | merges = merges[1:49152-256-2+1] 71 | merges = [tuple(merge.split()) for merge in merges] 72 | vocab = list(bytes_to_unicode().values()) 73 | vocab = vocab + [v+'' for v in vocab] 74 | for merge in merges: 75 | vocab.append(''.join(merge)) 76 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 77 | self.encoder = dict(zip(vocab, range(len(vocab)))) 78 | self.decoder = {v: k for k, v in self.encoder.items()} 79 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + ( token[-1] + '',) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token+'' 91 | 92 | while True: 93 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except: 105 | new_word.extend(word[i:]) 106 | break 107 | 108 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 109 | new_word.append(first+second) 110 | i += 2 111 | else: 112 | new_word.append(word[i]) 113 | i += 1 114 | new_word = tuple(new_word) 115 | word = new_word 116 | if len(word) == 1: 117 | break 118 | else: 119 | pairs = get_pairs(word) 120 | word = ' '.join(word) 121 | self.cache[token] = word 122 | return word 123 | 124 | def encode(self, text): 125 | bpe_tokens = [] 126 | text = whitespace_clean(basic_clean(text)).lower() 127 | for token in re.findall(self.pat, text): 128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 130 | return bpe_tokens 131 | 132 | def decode(self, tokens): 133 | text = ''.join([self.decoder[token] for token in tokens]) 134 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 135 | return text 136 | -------------------------------------------------------------------------------- /alphaction/modeling/nonlocal_block.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alphaction.layers import FrozenBatchNorm3d 7 | 8 | 9 | class NLBlock(nn.Module): 10 | def __init__(self, dim_in, dim_out, dim_inner, nl_cfg, group=False): 11 | super(NLBlock, self).__init__() 12 | 13 | 
self.nl_cfg = nl_cfg.clone() 14 | self.group = group 15 | self.group_size = 4 16 | 17 | init_std = nl_cfg.CONV_INIT_STD 18 | bias = not nl_cfg.NO_BIAS 19 | pool_stride = 2 20 | 21 | self.scale_value = dim_inner ** (-0.5) 22 | self.dim_inner = dim_inner 23 | 24 | self.theta = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 25 | nn.init.normal_(self.theta.weight, std=init_std) 26 | if bias: 27 | nn.init.constant_(self.theta.bias, 0) 28 | 29 | if nl_cfg.USE_MAXPOOL: 30 | self.maxpool = nn.MaxPool3d((1, pool_stride, pool_stride), 31 | stride=(1, pool_stride, pool_stride)) 32 | 33 | self.phi = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 34 | nn.init.normal_(self.phi.weight, std=init_std) 35 | if bias: 36 | nn.init.constant_(self.phi.bias, 0) 37 | 38 | self.g = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 39 | nn.init.normal_(self.g.weight, std=init_std) 40 | if bias: 41 | nn.init.constant_(self.g.bias, 0) 42 | 43 | if nl_cfg.USE_SOFTMAX: 44 | self.softmax = nn.Softmax(dim=2) 45 | 46 | self.out = nn.Conv3d(dim_inner, dim_out, 1, bias=bias) 47 | if nl_cfg.USE_ZERO_INIT_CONV: 48 | nn.init.constant_(self.out.weight, 0) 49 | else: 50 | nn.init.normal_(self.out.weight, std=init_std) 51 | if bias: 52 | nn.init.constant_(self.out.bias, 0) 53 | 54 | if nl_cfg.USE_BN: 55 | if nl_cfg.FROZEN_BN: 56 | self.bn = FrozenBatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON) 57 | else: 58 | self.bn = nn.BatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON, momentum=nl_cfg.BN_MOMENTUM) 59 | nn.init.constant_(self.bn.weight, nl_cfg.BN_INIT_GAMMA) 60 | 61 | def forward(self, x): 62 | if x.dim() != 5: 63 | raise ValueError('expected 4D or 5D input (got {}D input)' 64 | .format(x.dim())) 65 | 66 | if self.group: 67 | x = x.transpose(1, 2) 68 | sz_before_group = list(x.shape) 69 | sz_after_group = sz_before_group.copy() 70 | sz_after_group[0] = -1 71 | sz_after_group[1] = self.group_size 72 | x = x.contiguous().view(*sz_after_group) 73 | x = x.transpose(1, 2) 74 | 75 | batch_size = x.shape[0] 76 | 77 | theta = self.theta(x) 78 | 79 | if self.nl_cfg.USE_MAXPOOL: 80 | max_pool = self.maxpool(x) 81 | else: 82 | max_pool = x 83 | 84 | phi = self.phi(max_pool) 85 | 86 | g = self.g(max_pool) 87 | 88 | org_size = theta.size() 89 | mat_size = [batch_size, self.dim_inner, -1] 90 | theta = theta.view(*mat_size) 91 | phi = phi.view(*mat_size) 92 | g = g.view(*mat_size) 93 | 94 | theta_phi = torch.bmm(theta.transpose(1, 2), phi) 95 | 96 | if self.nl_cfg.USE_SOFTMAX: 97 | if self.nl_cfg.USE_SCALE: 98 | theta_phi_sc = theta_phi * self.scale_value 99 | else: 100 | theta_phi_sc = theta_phi 101 | p = self.softmax(theta_phi_sc) 102 | else: 103 | p = theta_phi / theta_phi.shape[-1] 104 | 105 | t = torch.bmm(g, p.transpose(1, 2)) 106 | 107 | t = t.view(org_size) 108 | 109 | out = self.out(t) 110 | 111 | if self.nl_cfg.USE_BN: 112 | out = self.bn(out) 113 | out = out + x 114 | 115 | if self.group: 116 | out = out.transpose(1, 2) 117 | out = out.contiguous().view(*sz_before_group) 118 | out = out.transpose(1, 2) 119 | 120 | return out 121 | 122 | def c2_weight_mapping(self): 123 | weight_map = {} 124 | for name, m_child in self.named_children(): 125 | if m_child.state_dict(): 126 | if isinstance(m_child, (nn.BatchNorm3d, FrozenBatchNorm3d)): 127 | weight_map[name + '.weight'] = '{}_s'.format(name) 128 | weight_map[name + '.running_mean'] = '{}_rm'.format(name) 129 | weight_map[name + '.running_var'] = '{}_riv'.format(name) 130 | elif isinstance(m_child, nn.GroupNorm): 131 | weight_map[name + '.weight'] = '{}_s'.format(name) 132 | else: 133 | weight_map[name + 
'.weight'] = '{}_w'.format(name) 134 | weight_map[name + '.bias'] = '{}_b'.format(name) 135 | return weight_map 136 | -------------------------------------------------------------------------------- /alphaction/modeling/registry.py: -------------------------------------------------------------------------------- 1 | from alphaction.utils.registry import Registry 2 | 3 | BACKBONES = Registry() 4 | ROI_ACTION_FEATURE_EXTRACTORS = Registry() 5 | ROI_ACTION_PREDICTORS = Registry() 6 | INTERACTION_AGGREGATION_STRUCTURES = Registry() -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/roi_heads/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/roi_heads/action_head/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/action_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_action_feature_extractor import make_roi_action_feature_extractor 4 | from .roi_action_predictors import make_roi_action_predictor 5 | from .inference import make_roi_action_post_processor 6 | from .loss import make_roi_action_loss_evaluator 7 | from .metric import make_roi_action_accuracy_evaluator 8 | from alphaction.modeling.utils import prepare_pooled_feature 9 | from alphaction.utils.comm import all_reduce 10 | 11 | 12 | class ROIActionHead(torch.nn.Module): 13 | """ 14 | Generic Action Head class. 15 | """ 16 | 17 | def __init__(self, cfg, dim_in): 18 | super(ROIActionHead, self).__init__() 19 | self.feature_extractor = make_roi_action_feature_extractor(cfg, dim_in) 20 | self.predictor = make_roi_action_predictor(cfg, self.feature_extractor.dim_out) 21 | self.post_processor = make_roi_action_post_processor(cfg) 22 | self.loss_evaluator = make_roi_action_loss_evaluator(cfg) 23 | self.accuracy_evaluator = make_roi_action_accuracy_evaluator(cfg) 24 | self.test_ext = cfg.TEST.EXTEND_SCALE 25 | 26 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 27 | # In training stage, boxes are from gt. 28 | # In testing stage, boxes are detected by human detector and proposals should be 29 | # enlarged boxes. 
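# part_forward selects the two-pass memory-feature pipeline:
#   0  -> run the feature extractor only and return pooled person/object features,
#   1  -> skip extraction and predict from cached pooled features passed in via `extras`,
#   <0 -> ordinary single-pass forward (the only mode used during training, see the assert below).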
30 | assert not (self.training and part_forward >= 0) 31 | 32 | if part_forward == 1: 33 | boxes = extras["current_feat_p"] 34 | objects = extras["current_feat_o"] 35 | 36 | if self.training: 37 | proposals = self.loss_evaluator.sample_box(boxes) 38 | else: 39 | proposals = [box.extend(self.test_ext) for box in boxes] 40 | 41 | x, x_pooled, x_objects = self.feature_extractor(slow_features, fast_features, proposals, objects, extras, part_forward) 42 | 43 | if part_forward == 0: 44 | pooled_feature = prepare_pooled_feature(x_pooled, boxes) 45 | if x_objects is None: 46 | object_pooled_feature = None 47 | else: 48 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 49 | return [pooled_feature, object_pooled_feature], {}, {}, {} 50 | 51 | action_logits = self.predictor(x) 52 | 53 | if not self.training: 54 | result = self.post_processor((action_logits,), boxes) 55 | return result, {}, {}, {} 56 | 57 | box_num = action_logits.size(0) 58 | box_num = torch.as_tensor([box_num], dtype=torch.float32, device=action_logits.device) 59 | all_reduce(box_num, average=True) 60 | 61 | loss_dict, loss_weight = self.loss_evaluator( 62 | [action_logits], box_num.item(), 63 | ) 64 | 65 | metric_dict = self.accuracy_evaluator( 66 | [action_logits], proposals, box_num.item(), 67 | ) 68 | 69 | pooled_feature = prepare_pooled_feature(x_pooled, proposals) 70 | if x_objects is None: 71 | object_pooled_feature = [] 72 | else: 73 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 74 | 75 | return ( 76 | [pooled_feature, object_pooled_feature], 77 | loss_dict, 78 | loss_weight, 79 | metric_dict, 80 | ) 81 | 82 | def c2_weight_mapping(self): 83 | weight_map = {} 84 | for name, m_child in self.named_children(): 85 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 86 | child_map = m_child.c2_weight_mapping() 87 | for key, val in child_map.items(): 88 | new_key = name + '.' + key 89 | weight_map[new_key] = val 90 | return weight_map 91 | 92 | 93 | def build_roi_action_head(cfg, dim_in): 94 | return ROIActionHead(cfg, dim_in) 95 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from alphaction.structures.bounding_box import BoxList 6 | 7 | 8 | class PostProcessor(nn.Module): 9 | def __init__(self, pose_action_num): 10 | super(PostProcessor, self).__init__() 11 | self.pose_action_num = pose_action_num 12 | 13 | def forward(self, x, boxes): 14 | # boxes should be (#detections,4) 15 | # prob should be calculated in different way. 
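# The first `pose_action_num` logits are mutually exclusive pose/movement classes
# (softmax); the remaining interaction logits are multi-label (per-class sigmoid).
# Both probability blocks are concatenated back into one score tensor per box.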
16 | class_logits, = x 17 | pose_action_prob = F.softmax(class_logits[:,:self.pose_action_num],-1) 18 | interaction_action_prob = torch.sigmoid(class_logits[:,self.pose_action_num:]) 19 | 20 | action_prob = torch.cat((pose_action_prob,interaction_action_prob),1) 21 | 22 | image_shapes = [box.size for box in boxes] 23 | boxes_per_image = [len(box) for box in boxes] 24 | box_tensors = [a.bbox for a in boxes] 25 | 26 | action_prob = action_prob.split(boxes_per_image, dim=0) 27 | 28 | results = [] 29 | for prob, boxes_per_image, image_shape in zip( 30 | action_prob, box_tensors, image_shapes 31 | ): 32 | boxlist = self.prepare_boxlist(boxes_per_image, prob, image_shape) 33 | results.append(boxlist) 34 | return results 35 | 36 | def prepare_boxlist(self, boxes, scores, image_shape): 37 | boxlist = BoxList(boxes, image_shape, mode="xyxy") 38 | boxlist.add_field("scores", scores) 39 | return boxlist 40 | 41 | 42 | def make_roi_action_post_processor(cfg): 43 | softmax_num = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 44 | postprocessor = PostProcessor(softmax_num) 45 | return postprocessor 46 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.layers import SigmoidFocalLoss, SoftmaxFocalLoss 3 | from alphaction.modeling.utils import cat 4 | 5 | 6 | class ActionLossComputation(object): 7 | def __init__(self, cfg): 8 | self.proposal_per_clip = cfg.MODEL.ROI_ACTION_HEAD.PROPOSAL_PER_CLIP 9 | self.num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 10 | self.num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 11 | self.num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 12 | 13 | self.weight_dict = dict( 14 | loss_pose_action = cfg.MODEL.ROI_ACTION_HEAD.POSE_LOSS_WEIGHT, 15 | loss_object_interaction = cfg.MODEL.ROI_ACTION_HEAD.OBJECT_LOSS_WEIGHT, 16 | loss_person_interaction = cfg.MODEL.ROI_ACTION_HEAD.PERSON_LOSS_WEIGHT, 17 | ) 18 | 19 | gamma = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.GAMMA 20 | alpha = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.ALPHA 21 | self.sigmoid_focal_loss = SigmoidFocalLoss(gamma, alpha, reduction="none") 22 | self.softmax_focal_loss = SoftmaxFocalLoss(gamma, alpha, reduction="sum") 23 | 24 | def sample_box(self, boxes): 25 | proposals = [] 26 | num_proposals = self.proposal_per_clip 27 | for boxes_per_image in boxes: 28 | num_boxes = len(boxes_per_image) 29 | 30 | if num_boxes > num_proposals: 31 | choice_inds = torch.randperm(num_boxes)[:num_proposals] 32 | proposals_per_image = boxes_per_image[choice_inds] 33 | else: 34 | proposals_per_image = boxes_per_image 35 | proposals_per_image = proposals_per_image.random_aug(0.2, 0.1, 0.1, 0.05) 36 | proposals.append(proposals_per_image) 37 | self._proposals = proposals 38 | return proposals 39 | 40 | def __call__(self, class_logits, avg_box_num): 41 | class_logits = cat(class_logits, dim=0) 42 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 43 | "The shape of tensor class logits doesn't match total number of action classes." 
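# The loss layout mirrors the logit layout: softmax focal loss over the single-label
# pose/movement block, sigmoid focal loss over the multi-label object/person
# interaction blocks, each normalized by the (all-reduced) average box count.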
44 | 45 | if not hasattr(self, "_proposals"): 46 | raise RuntimeError("sample_box needs to be called before") 47 | 48 | proposals = self._proposals 49 | 50 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 51 | assert class_logits.shape[1] == labels.shape[1], \ 52 | "The shape of tensor class logits doesn't match the label tensor." 53 | 54 | loss_dict = {} 55 | 56 | if self.num_pose > 0: 57 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 58 | pose_logits = class_logits[:, :self.num_pose] 59 | pose_loss = self.softmax_focal_loss(pose_logits, pose_label) / avg_box_num 60 | loss_dict["loss_pose_action"] = pose_loss 61 | 62 | interaction_label = labels[:, self.num_pose:].to(dtype=torch.float32) 63 | object_label = interaction_label[:, :self.num_object] 64 | person_label = interaction_label[:, self.num_object:] 65 | 66 | interaction_logits = class_logits[:, self.num_pose:] 67 | object_logits = interaction_logits[:, :self.num_object] 68 | person_logits = interaction_logits[:, self.num_object:] 69 | 70 | if self.num_object > 0: 71 | object_loss = self.sigmoid_focal_loss(object_logits, object_label).mean(dim=1).sum() / avg_box_num 72 | loss_dict["loss_object_interaction"] = object_loss 73 | if self.num_person > 0: 74 | person_loss = self.sigmoid_focal_loss(person_logits, person_label).mean(dim=1).sum() / avg_box_num 75 | loss_dict["loss_person_interaction"] = person_loss 76 | 77 | return loss_dict, self.weight_dict 78 | 79 | 80 | def make_roi_action_loss_evaluator(cfg): 81 | loss_evaluator = ActionLossComputation(cfg) 82 | 83 | return loss_evaluator -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.modeling.utils import cat 3 | 4 | 5 | class ActionAccuracyComputation(object): 6 | def __init__(self, num_pose, num_object, num_person): 7 | self.num_pose = num_pose 8 | self.num_object = num_object 9 | self.num_person = num_person 10 | 11 | def logic_iou(self, pred, label): 12 | device = pred.device 13 | 14 | version = torch.__version__ 15 | if eval('.'.join(version.split('.')[:2]))>=1.3: 16 | pred = pred.bool() 17 | label = label.bool() 18 | 19 | label_union = (pred | label).float().sum(dim=1) 20 | label_inter = (pred & label).float().sum(dim=1) 21 | replacer = torch.ones_like(label_union, device=device) 22 | zero_mask = label_union == 0 23 | label_inter = torch.where(zero_mask, replacer, label_inter) 24 | label_union = torch.where(zero_mask, replacer, label_union) 25 | return label_inter / label_union 26 | 27 | def __call__(self, class_logits, proposals, avg_box_num): 28 | class_logits = [logits.detach() for logits in class_logits] 29 | class_logits = cat(class_logits, dim=0) 30 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 31 | "The shape of tensor class logits doesn't match total number of action classes." 
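# Metrics follow the same split as the loss: top-1 accuracy for the pose/movement
# block, and a per-box "logic IoU" (intersection over union of the sigmoid > 0.5
# predictions against the ground-truth label set) for the interaction blocks.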
32 | 33 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 34 | 35 | metric_dict = {} 36 | if self.num_pose>0: 37 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 38 | pose_pred = class_logits[:, :self.num_pose].argmax(dim=1) 39 | accuracy_pose_action = pose_label.eq(pose_pred).float().sum() 40 | metric_dict["accuracy_pose_action"] = accuracy_pose_action / avg_box_num 41 | 42 | interaction_label = labels[:, self.num_pose:] 43 | interaction_logits = class_logits[:, self.num_pose:] 44 | interaction_pred = interaction_logits.sigmoid() > 0.5 45 | 46 | if self.num_object>0: 47 | object_label = interaction_label[:, :self.num_object] 48 | object_pred = interaction_pred[:, :self.num_object] 49 | accuracy_object_interaction = self.logic_iou(object_pred, object_label) 50 | metric_dict["accuracy_object_interaction"] = accuracy_object_interaction.sum() / avg_box_num 51 | 52 | if self.num_person>0: 53 | person_label = interaction_label[:, self.num_object:] 54 | person_pred = interaction_pred[:, self.num_object:] 55 | accuracy_person_interaction = self.logic_iou(person_pred, person_label) 56 | metric_dict["accuracy_person_interaction"] = accuracy_person_interaction.sum() / avg_box_num 57 | 58 | return metric_dict 59 | 60 | 61 | def make_roi_action_accuracy_evaluator(cfg): 62 | num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 63 | num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 64 | num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 65 | return ActionAccuracyComputation(num_pose, num_object, num_person) -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/roi_action_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from alphaction.modeling import registry 3 | 4 | 5 | @registry.ROI_ACTION_PREDICTORS.register("FCPredictor") 6 | class FCPredictor(nn.Module): 7 | def __init__(self, config, dim_in): 8 | super(FCPredictor, self).__init__() 9 | 10 | num_classes = config.MODEL.ROI_ACTION_HEAD.NUM_CLASSES 11 | 12 | dropout_rate = config.MODEL.ROI_ACTION_HEAD.DROPOUT_RATE 13 | if dropout_rate > 0: 14 | self.dropout = nn.Dropout(p=dropout_rate, inplace=True) 15 | 16 | self.cls_score = nn.Linear(dim_in, num_classes) 17 | 18 | nn.init.normal_(self.cls_score.weight, std=0.01) 19 | nn.init.constant_(self.cls_score.bias, 0) 20 | 21 | def forward(self, x): 22 | x = x.view(x.size(0), -1) 23 | if hasattr(self, "dropout"): 24 | x = self.dropout(x) 25 | scores = self.cls_score(x) 26 | 27 | return scores 28 | 29 | def c2_weight_mapping(self): 30 | return {"cls_score.weight": "pred_w", 31 | "cls_score.bias": "pred_b"} 32 | 33 | 34 | def make_roi_action_predictor(cfg, dim_in): 35 | func = registry.ROI_ACTION_PREDICTORS[cfg.MODEL.ROI_ACTION_HEAD.PREDICTOR] 36 | return func(cfg, dim_in) 37 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/roi_heads_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .action_head.action_head import build_roi_action_head 4 | 5 | 6 | class Combined3dROIHeads(torch.nn.ModuleDict): 7 | def __init__(self, cfg, heads): 8 | super(Combined3dROIHeads, self).__init__(heads) 9 | self.cfg = cfg.clone() 10 | 11 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 12 | result, loss_action, loss_weight, 
accuracy_action = self.action(slow_features, fast_features, boxes, objects, extras, part_forward) 13 | 14 | return result, loss_action, loss_weight, accuracy_action 15 | 16 | def c2_weight_mapping(self): 17 | weight_map = {} 18 | for name, m_child in self.named_children(): 19 | if m_child.state_dict() and hasattr(m_child,"c2_weight_mapping"): 20 | child_map = m_child.c2_weight_mapping() 21 | for key, val in child_map.items(): 22 | new_key = name + '.' + key 23 | weight_map[new_key] = val 24 | return weight_map 25 | 26 | 27 | def build_3d_roi_heads(cfg, dim_in): 28 | roi_heads = [] 29 | roi_heads.append(("action", build_roi_action_head(cfg, dim_in))) 30 | 31 | if roi_heads: 32 | roi_heads = Combined3dROIHeads(cfg, roi_heads) 33 | 34 | return roi_heads 35 | -------------------------------------------------------------------------------- /alphaction/modeling/stm_decoder/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /alphaction/modeling/stm_decoder/util/adaptive_mixing_operator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AdaptiveMixing(nn.Module): 7 | def __init__(self, in_dim, in_points, n_groups, query_dim=None, 8 | out_dim=None, out_points=None, sampling_rate=None): 9 | super(AdaptiveMixing, self).__init__() 10 | out_dim = out_dim if out_dim is not None else in_dim 11 | out_points = out_points if out_points is not None else in_points 12 | query_dim = query_dim if query_dim is not None else in_dim 13 | sampling_rate = sampling_rate if sampling_rate is not None else 1 14 | 15 | self.query_dim = query_dim 16 | self.in_dim = in_dim 17 | self.in_points = in_points//sampling_rate 18 | self.n_groups = n_groups 19 | self.out_dim = out_dim 20 | self.out_points = out_points 21 | 22 | self.eff_in_dim = in_dim//n_groups 23 | self.eff_out_dim = out_dim//n_groups 24 | 25 | self.pad_bias_dim = 0 26 | self.pad_bias_points = 0 27 | 28 | self.eff_in_dim = self.eff_in_dim + self.pad_bias_dim 29 | self.in_points = self.in_points + self.pad_bias_points 30 | 31 | self.REDUCTION = 1 32 | 33 | self.m_parameters = self.eff_in_dim * self.eff_out_dim 34 | self.s_parameters = self.in_points * self.out_points 35 | 36 | self.total_parameters = self.m_parameters + self.s_parameters 37 | 38 | self.parameter_generator = nn.Sequential( 39 | nn.Linear(self.query_dim, self.n_groups*self.total_parameters), 40 | ) 41 | 42 | self.out_proj = nn.Linear( 43 | self.eff_out_dim*self.out_points*self.n_groups, self.query_dim, bias=True 44 | ) 45 | 46 | self.act = nn.ReLU(inplace=True) 47 | 48 | self._init_weights() 49 | 50 | @torch.no_grad() 51 | def _init_weights(self): 52 | nn.init.zeros_(self.parameter_generator[-1].weight) 53 | 54 | def forward(self, x, query): 55 | 56 | B, N, g, P, C = x.size() 57 | G = self.n_groups 58 | assert g == G 59 | 60 | 61 | '''generate mixing parameters''' 62 | params = self.parameter_generator(query) 63 | params = params.reshape(B*N, G, -1) 64 | out = x.reshape(B*N, G, P, C) 65 | 66 | M, S = params.split( 67 | [self.m_parameters, self.s_parameters], 2) 68 | 69 | M = M.reshape(B*N, G, self.eff_in_dim, self.eff_in_dim) 70 | S = S.reshape(B*N, G, self.out_points, self.in_points) 71 | 72 | 73 | '''adaptive channel mixing 74 | the process also can be done with 
torch.bmm 75 | but for clarity, we use torch.matmul 76 | ''' 77 | out = torch.matmul(out, M) 78 | out = F.layer_norm(out, [out.size(-2), out.size(-1)]) 79 | out = self.act(out) 80 | 81 | '''adaptive spatial mixing''' 82 | out = torch.matmul(S, out) # implicitly transpose and matmul 83 | out = F.layer_norm(out, [out.size(-2), out.size(-1)]) 84 | out = self.act(out) 85 | 86 | '''linear transfomation to query dim''' 87 | out = out.reshape(B, N, -1) 88 | out = self.out_proj(out) 89 | 90 | out = query + out 91 | 92 | return out 93 | -------------------------------------------------------------------------------- /alphaction/modeling/stm_decoder/util/msaq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def translate_to_linear_weight(ref: torch.Tensor, num_total, tau=2.0): 6 | # ref: [n, n_query, 1, in_points * n_heads] 7 | # num_total: feature levels (typically 4) 8 | grid = torch.arange(num_total, device=ref.device, dtype=ref.dtype).view( 9 | *[len(ref.shape)*[1, ]+[-1, ]]) 10 | # [1, 1, 1, 1, num_total] 11 | 12 | ref = ref.unsqueeze(-1).clone() 13 | # [n, n_query, 1, in_points * n_heads, 1] 14 | l2 = (ref-grid).pow(2.0).div(tau).abs().neg() 15 | # [n, n_query, 1, in_points * n_heads, num_total] 16 | weight = torch.softmax(l2, dim=-1) 17 | 18 | return weight 19 | 20 | 21 | def MHAQ3D(sample_points: torch.Tensor, value: torch.Tensor, weight=None, n_points=1): 22 | ''' 23 | Args: 24 | sample_points: [n, n_query, 1, in_points * n_heads, 2] 25 | value: [n, c, t, h, w] 26 | weight: [n, n_query, 1, in_points * n_heads] 27 | n_points: in_points 28 | 29 | Returns: 30 | [B,c//n_heads,n_heads,t,in_points,n_query,1] 31 | ''' 32 | B, Hq, Wq, n_heads_points, _ = sample_points.shape 33 | # print(value.shape) 34 | B, Ck, Tk, Hk, Wk = value.shape 35 | 36 | n_heads = n_heads_points//n_points 37 | 38 | sample_points = sample_points.view(B, Hq, Wq, n_heads, n_points, 2) \ 39 | .permute(0, 3, 1, 2, 4, 5).contiguous().flatten(0, 1) 40 | # n*n_heads, n_query, 1, in_points, 2 41 | sample_points = sample_points.repeat(Tk, 1, 1, 1, 1) 42 | # n*n_heads*Tk, n_query, 1, in_points, 2 43 | sample_points = sample_points.flatten(2, 3) 44 | # n*n_heads*Tk, n_query, in_points, 2 45 | sample_points = sample_points*2.0-1.0 46 | value = value.view(B*n_heads, Ck//n_heads, Tk, Hk, Wk).permute(2, 0, 1, 3, 4).flatten(0, 1) 47 | out = F.grid_sample( 48 | value, sample_points, 49 | mode='bilinear', padding_mode='zeros', align_corners=False, 50 | ) 51 | # n*n_heads*Tk, c//n_heads, n_query, in_points 52 | 53 | if weight is not None: 54 | weight = weight.view(B, Hq, Wq, n_heads, n_points) \ 55 | .permute(0, 3, 1, 2, 4).flatten(0, 1).flatten(2, 3).unsqueeze(1).repeat(Tk, 1, 1, 1) 56 | # n*n_heads*Tk, 1, n_query, in_points 57 | out *= weight 58 | 59 | return out.view(Tk, B, n_heads, Ck//n_heads, Hq, Wq, n_points).permute(1, 3, 2, 0, 6, 4, 5) 60 | 61 | 62 | def SAMPLE4D(sample_points: torch.Tensor, values: torch.Tensor, featmap_strides, n_points: int = 1, num_levels: int = None, mapping_stride=3.0, tau=2.0, ): 63 | B, Hq, Wq, n_heads_points, _ = sample_points.shape 64 | B, C, t, _, _ = values[0].shape 65 | 66 | n_heads = n_heads_points//n_points 67 | 68 | if num_levels is None: 69 | num_levels = len(values) 70 | 71 | sample_points_xy = sample_points[..., 0:2] 72 | # print(sample_points_xy.shape) torch.Size([2, 100, 1, 128=32*4, 2]) 73 | # [n, n_query, 1, in_points * n_heads, 2] 74 | 75 | sample_points_lvl = sample_points[..., 2].clone() 76 | 
# print(sample_points_lvl.shape) torch.Size([2, 100, 1, 128=32*4]) 77 | # [n, n_query, 1, in_points * n_heads] 78 | 79 | sample_points_lvl_mapped = sample_points_lvl - mapping_stride 80 | # print(sample_points_lvl_mapped.shape) torch.Size([2, 100, 1, 128=32*4]) 81 | # [n, n_query, 1, in_points * n_heads] 82 | 83 | sample_points_lvl_weight = translate_to_linear_weight(sample_points_lvl_mapped, num_levels, tau=tau) 84 | # print(sample_points_lvl_weight.shape) torch.Size([2, 100, 1, 128=32*4, 4]) 85 | # [n, n_query, 1, in_points * n_heads, num_levels] 86 | 87 | sample_points_lvl_weight_list = sample_points_lvl_weight.unbind(-1) 88 | # [[n, n_query, 1, in_points * n_heads],....] 89 | 90 | out = sample_points.new_zeros(B, C//n_heads, n_heads, t, n_points, Hq, Wq) 91 | # print(out.shape) torch.Size([2, 64=256//4, 4, 4, 32, 100, 1]) 92 | # n, dim//n_heads, n_heads, t, in_points, n_query, 1 93 | 94 | for i in range(num_levels): 95 | value = values[i] 96 | # B, C, T, H, W 97 | lvl_weights = sample_points_lvl_weight_list[i] 98 | stride = featmap_strides[i] 99 | 100 | mapping_size = value.new_tensor([value.size(4), value.size(3)]).view(1, 1, 1, 1, -1) * stride 101 | normalized_xy = sample_points_xy / mapping_size 102 | # [n, n_query, 1, in_points * n_heads, 2] 103 | 104 | out += MHAQ3D(normalized_xy, value, weight=lvl_weights, n_points=n_points) 105 | 106 | return out, None 107 | -------------------------------------------------------------------------------- /alphaction/modeling/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utility functions 3 | """ 4 | 5 | import torch 6 | from alphaction.structures.bounding_box import BoxList 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | 18 | def pad_sequence(sequence, targ_size, padding_value=0): 19 | tensor_size = sequence[0].size() 20 | trailing_dims = tensor_size[1:] 21 | out_dims = (len(sequence), targ_size) + trailing_dims 22 | 23 | out_tensor = sequence[0].new_full(out_dims, padding_value) 24 | for i, tensor in enumerate(sequence): 25 | length = tensor.size(0) 26 | out_tensor[i, :length, ...] 
= tensor 27 | 28 | return out_tensor 29 | 30 | def prepare_pooled_feature(x_pooled, boxes, detach=True): 31 | image_shapes = [box.size for box in boxes] 32 | boxes_per_image = [len(box) for box in boxes] 33 | box_tensors = [a.bbox for a in boxes] 34 | 35 | if detach: 36 | x_pooled = x_pooled.detach() 37 | pooled_feature = x_pooled.split(boxes_per_image, dim=0) 38 | 39 | boxes_result = [] 40 | for feature_per_image, boxes_per_image, image_shape in zip( 41 | pooled_feature, box_tensors, image_shapes 42 | ): 43 | boxlist = BoxList(boxes_per_image, image_shape, mode="xyxy") 44 | boxlist.add_field("pooled_feature", feature_per_image) 45 | boxes_result.append(boxlist) 46 | return boxes_result -------------------------------------------------------------------------------- /alphaction/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_optimizer 2 | from .build import make_lr_scheduler 3 | from .lr_scheduler import WarmupMultiStepLR 4 | -------------------------------------------------------------------------------- /alphaction/solver/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/solver/lr_scheduler.py 2 | from bisect import bisect_right 3 | 4 | import torch 5 | import math 6 | 7 | 8 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 9 | def __init__( 10 | self, 11 | optimizer, 12 | milestones, 13 | gamma=0.1, 14 | warmup_factor=1.0 / 3, 15 | warmup_iters=500, 16 | warmup_method="linear", 17 | last_epoch=-1, 18 | ): 19 | if not list(milestones) == sorted(milestones): 20 | raise ValueError( 21 | "Milestones should be a list of" " increasing integers. 
Got {}", 22 | milestones, 23 | ) 24 | 25 | if warmup_method not in ("constant", "linear"): 26 | raise ValueError( 27 | "Only 'constant' or 'linear' warmup_method accepted" 28 | "got {}".format(warmup_method) 29 | ) 30 | self.milestones = milestones 31 | self.gamma = gamma 32 | self.warmup_factor = warmup_factor 33 | self.warmup_iters = warmup_iters 34 | self.warmup_method = warmup_method 35 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 36 | 37 | def get_lr(self): 38 | warmup_factor = 1 39 | if self.last_epoch < self.warmup_iters: 40 | if self.warmup_method == "constant": 41 | warmup_factor = self.warmup_factor 42 | elif self.warmup_method == "linear": 43 | alpha = float(self.last_epoch) / self.warmup_iters 44 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 45 | return [ 46 | base_lr 47 | * warmup_factor 48 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 49 | for base_lr in self.base_lrs 50 | ] 51 | 52 | class HalfPeriodCosStepLR(torch.optim.lr_scheduler._LRScheduler): 53 | def __init__( 54 | self, 55 | optimizer, 56 | warmup_factor=1.0 / 3, 57 | warmup_iters=8000, 58 | max_iters=60000, 59 | warmup_method="linear", 60 | last_epoch=-1, 61 | ): 62 | if warmup_method not in ("constant", "linear"): 63 | raise ValueError( 64 | "Only 'constant' or 'linear' warmup_method accepted" 65 | "got {}".format(warmup_method) 66 | ) 67 | self.warmup_factor = warmup_factor 68 | self.warmup_iters = warmup_iters 69 | self.max_iters = max_iters 70 | self.warmup_method = warmup_method 71 | super(HalfPeriodCosStepLR, self).__init__(optimizer, last_epoch) 72 | 73 | def get_lr(self): 74 | warmup_factor = 1 75 | if self.last_epoch < self.warmup_iters: 76 | if self.warmup_method == "constant": 77 | warmup_factor = self.warmup_factor 78 | elif self.warmup_method == "linear": 79 | alpha = float(self.last_epoch) / self.warmup_iters 80 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 81 | else: 82 | warmup_factor = 0.5 * (math.cos(self.last_epoch / self.max_iters * math.pi) + 1) 83 | return [ 84 | base_lr 85 | * warmup_factor 86 | for base_lr in self.base_lrs 87 | ] -------------------------------------------------------------------------------- /alphaction/structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/structures/__init__.py -------------------------------------------------------------------------------- /alphaction/structures/memory_pool.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | class MemoryPool(object): 4 | def __init__(self): 5 | self.cache = defaultdict(dict) 6 | 7 | def update(self, update_info): 8 | for movie_id, feature_per_movie in update_info.items(): 9 | self.cache[movie_id].update(feature_per_movie) 10 | 11 | def update_list(self, update_info_list): 12 | for update_info in update_info_list: 13 | self.update(update_info) 14 | 15 | def __getitem__(self, item): 16 | if isinstance(item, tuple) and len(item)==2: 17 | return self.cache[item[0]][item[1]] 18 | return self.cache[item] 19 | 20 | def __setitem__(self, key, value): 21 | if isinstance(key, tuple) and len(key)==2: 22 | self.cache[key[0]][key[1]] = value 23 | else: 24 | self.cache[key] = value 25 | 26 | def __delitem__(self, item): 27 | if isinstance(item, tuple) and len(item)==2: 28 | del self.cache[item[0]][item[1]] 29 | else: 30 | del 
self.cache[item] 31 | 32 | def __contains__(self, item): 33 | if isinstance(item, tuple) and len(item)==2: 34 | return (item[0] in self.cache and item[1] in self.cache[item[0]]) 35 | return (item in self.cache) 36 | 37 | def items(self): 38 | return self.cache.items() -------------------------------------------------------------------------------- /alphaction/utils/IA_helper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | def _block_set(ia_blocks): 4 | if len(ia_blocks) > 0 and isinstance(ia_blocks[0], list): 5 | ia_blocks = list(itertools.chain.from_iterable(ia_blocks)) 6 | return ia_blocks 7 | 8 | def has_person(ia_config): 9 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 10 | return (ia_config.ACTIVE and 'P' in ia_blocks and ia_config.MAX_PERSON > 0) 11 | 12 | 13 | def has_object(ia_config): 14 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 15 | return (ia_config.ACTIVE and 'O' in ia_blocks and ia_config.MAX_OBJECT > 0) 16 | 17 | 18 | def has_memory(ia_config): 19 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 20 | return (ia_config.ACTIVE and 'M' in ia_blocks and ia_config.MAX_PER_SEC > 0) 21 | -------------------------------------------------------------------------------- /alphaction/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/utils/__init__.py -------------------------------------------------------------------------------- /alphaction/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/logger.py 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | 8 | def setup_logger(name, save_dir, distributed_rank, filename=None): 9 | logger = logging.getLogger(name) 10 | logger.setLevel(logging.DEBUG) 11 | logger.propagate = False 12 | # don't log results for the non-master process 13 | if distributed_rank > 0: 14 | return logger 15 | ch = logging.StreamHandler(stream=sys.stdout) 16 | ch.setLevel(logging.DEBUG) 17 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 18 | ch.setFormatter(formatter) 19 | logger.addHandler(ch) 20 | 21 | if save_dir: 22 | if filename is None: 23 | filename = time.strftime("%Y-%m-%d_%H.%M.%S", time.localtime()) + ".log" 24 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 25 | fh.setLevel(logging.DEBUG) 26 | fh.setFormatter(formatter) 27 | logger.addHandler(fh) 28 | 29 | return logger 30 | 31 | def setup_tblogger(save_dir, distributed_rank): 32 | if distributed_rank>0: 33 | return None 34 | from tensorboardX import SummaryWriter 35 | tbdir = os.path.join(save_dir,'tb') 36 | os.makedirs(tbdir,exist_ok=True) 37 | tblogger = SummaryWriter(tbdir) 38 | return tblogger -------------------------------------------------------------------------------- /alphaction/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/metric_logger.py 2 | from collections import defaultdict 3 | from collections import deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue(object): 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global 
series average. 11 | """ 12 | 13 | def __init__(self, window_size=10): 14 | self.deque = deque(maxlen=window_size) 15 | # self.series = [] 16 | self.total = 0.0 17 | self.count = 0 18 | 19 | def update(self, value): 20 | self.deque.append(value) 21 | # self.series.append(value) 22 | self.count += 1 23 | self.total += value 24 | 25 | @property 26 | def median(self): 27 | d = torch.tensor(list(self.deque)) 28 | return d.median().item() 29 | 30 | @property 31 | def avg(self): 32 | d = torch.tensor(list(self.deque)) 33 | return d.mean().item() 34 | 35 | @property 36 | def global_avg(self): 37 | return self.total / self.count 38 | 39 | 40 | class MetricLogger(object): 41 | def __init__(self, delimiter="\t"): 42 | self.meters = defaultdict(SmoothedValue) 43 | self.delimiter = delimiter 44 | 45 | def update(self, **kwargs): 46 | for k, v in kwargs.items(): 47 | if isinstance(v, torch.Tensor): 48 | v = v.item() 49 | assert isinstance(v, (float, int)) 50 | self.meters[k].update(v) 51 | 52 | def __getattr__(self, attr): 53 | if attr in self.meters: 54 | return self.meters[attr] 55 | if attr in self.__dict__: 56 | return self.__dict__[attr] 57 | raise AttributeError("'{}' object has no attribute '{}'".format( 58 | type(self).__name__, attr)) 59 | 60 | def __str__(self): 61 | loss_str = [] 62 | for name, meter in self.meters.items(): 63 | loss_str.append( 64 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 65 | ) 66 | return self.delimiter.join(loss_str) 67 | -------------------------------------------------------------------------------- /alphaction/utils/model_serialization.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/model_serialization.py 2 | from collections import OrderedDict 3 | import logging 4 | 5 | import torch 6 | 7 | 8 | def align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head): 9 | """ 10 | Strategy: suppose that the models that we will create will have prefixes appended 11 | to each of its keys, for example due to an extra level of nesting that the original 12 | pre-trained weights from ImageNet won't contain. For example, model.state_dict() 13 | might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains 14 | res2.conv1.weight. We thus want to match both parameters together. 15 | For that, we look for each model weight, look among all loaded keys if there is one 16 | that is a suffix of the current weight name, and use it if that's the case. 17 | If multiple matches exist, take the one with longest size 18 | of the corresponding name. For example, for the same model as before, the pretrained 19 | weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, 20 | we want to match backbone[0].body.conv1.weight to conv1.weight, and 21 | backbone[0].body.res2.conv1.weight to res2.conv1.weight. 
22 | """ 23 | current_keys = sorted(list(model_state_dict.keys())) 24 | loaded_keys = sorted(list(loaded_state_dict.keys())) 25 | # get a matrix of string matches, where each (i, j) entry correspond to the size of the 26 | # loaded_key string, if it matches 27 | match_matrix = [ 28 | len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys 29 | ] 30 | match_matrix = torch.as_tensor(match_matrix).view( 31 | len(current_keys), len(loaded_keys) 32 | ) 33 | max_match_size, idxs = match_matrix.max(1) 34 | # remove indices that correspond to no-match 35 | idxs[max_match_size == 0] = -1 36 | 37 | # used for logging 38 | max_size = max([len(key) for key in current_keys]) if current_keys else 1 39 | max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 40 | # log_str_template = "{: <{}} loaded from {: <{}} of shape {}" 41 | logger = logging.getLogger(__name__) 42 | for idx_new, idx_old in enumerate(idxs.tolist()): 43 | if idx_old == -1: 44 | continue 45 | key = current_keys[idx_new] 46 | key_old = loaded_keys[idx_old] 47 | 48 | if no_head and key_old.startswith("roi_heads."): 49 | logger.info("{} will not be loaded.".format(key)) 50 | continue 51 | 52 | model_state_dict[key] = loaded_state_dict[key_old] 53 | # logger.info( 54 | # log_str_template.format( 55 | # key, 56 | # max_size, 57 | # key_old, 58 | # max_size_loaded, 59 | # tuple(loaded_state_dict[key_old].shape), 60 | # ) 61 | # ) 62 | 63 | 64 | def strip_prefix_if_present(state_dict, prefix): 65 | keys = sorted(state_dict.keys()) 66 | if not all(key.startswith(prefix) for key in keys): 67 | return state_dict 68 | stripped_state_dict = OrderedDict() 69 | for key, value in state_dict.items(): 70 | stripped_state_dict[key.replace(prefix, "")] = value 71 | return stripped_state_dict 72 | 73 | 74 | def exclude_layers(model_state_dict, excluded): 75 | model_state_dict_new = OrderedDict() 76 | for key, value in model_state_dict.items(): 77 | if any([exc in key for exc in excluded]): 78 | continue 79 | model_state_dict_new[key] = value 80 | return model_state_dict_new 81 | 82 | 83 | def load_state_dict(model, loaded_state_dict, no_head, excluded=[]): 84 | model_state_dict = model.state_dict() 85 | # if the state_dict comes from a model that was wrapped in a 86 | # DataParallel or DistributedDataParallel during serialization, 87 | # remove the "module" prefix before performing the matching 88 | loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") 89 | 90 | if len(excluded) > 0: 91 | # exclude specified layers 92 | loaded_state_dict = exclude_layers(loaded_state_dict, excluded) 93 | 94 | align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head) 95 | 96 | # use strict loading 97 | model.load_state_dict(model_state_dict) 98 | -------------------------------------------------------------------------------- /alphaction/utils/random_seed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import numpy as np 4 | 5 | def set_seed(seed, rank, world_size): 6 | rng = random.Random(seed) 7 | seed_per_rank = [rng.randint(0, 2**32-1) for _ in range(world_size)] 8 | cur_seed = seed_per_rank[rank] 9 | random.seed(cur_seed) 10 | torch.manual_seed(cur_seed) 11 | torch.cuda.manual_seed(cur_seed) 12 | np.random.seed(cur_seed) -------------------------------------------------------------------------------- /alphaction/utils/registry.py: 
-------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/registry.py 2 | 3 | def _register_generic(module_dict, module_name, module): 4 | assert module_name not in module_dict 5 | module_dict[module_name] = module 6 | 7 | 8 | class Registry(dict): 9 | ''' 10 | A helper class for managing module registration. It extends a dictionary 11 | and provides a register function. 12 | 13 | Eg. creating a registry: 14 | some_registry = Registry({"default": default_module}) 15 | 16 | There are two ways of registering new modules: 17 | 1): the normal way is to call the register function: 18 | def foo(): 19 | ... 20 | some_registry.register("foo_module", foo) 21 | 2): used as a decorator when declaring the module: 22 | @some_registry.register("foo_module") 23 | @some_registry.register("foo_module_nickname") 24 | def foo(): 25 | ... 26 | 27 | Accessing a module is just like using a dictionary, eg: 28 | f = some_registry["foo_module"] 29 | ''' 30 | def __init__(self, *args, **kwargs): 31 | super(Registry, self).__init__(*args, **kwargs) 32 | 33 | def register(self, module_name, module=None): 34 | # used as function call 35 | if module is not None: 36 | _register_generic(self, module_name, module) 37 | return 38 | 39 | # used as decorator 40 | def register_fn(fn): 41 | _register_generic(self, module_name, fn) 42 | return fn 43 | 44 | return register_fn 45 | -------------------------------------------------------------------------------- /alphaction/utils/video_decode.py: -------------------------------------------------------------------------------- 1 | import av 2 | 3 | def av_decode_video(video_path): 4 | frames = [] 5 | try: 6 | with av.open(video_path) as container: 7 | for frame in container.decode(video=0): 8 | frames.append(frame.to_rgb().to_ndarray()) 9 | except Exception: 10 | # keep the frames decoded before the failure, but fail loudly if none were decoded 11 | assert len(frames) != 0, "Failed to decode video: {}".format(video_path) 12 | return frames -------------------------------------------------------------------------------- /alphaction/utils/visualize.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | import torch 3 | from torchvision.ops import box_convert 4 | import numpy as np 5 | import supervision as sv 6 | import cv2 7 | import imageio 8 | 9 | 10 | def annotate(image_source: np.ndarray, boxes: torch.Tensor, normalized=True, logits=None, phrases=[], is_xyxy=False, color=None, text_padding=10, set_text_color='black') -> np.ndarray: 11 | h, w, _ = image_source.shape 12 | if normalized: 13 | boxes = boxes * torch.Tensor([w, h, w, h]) 14 | if not is_xyxy: 15 | assert isinstance(boxes, torch.Tensor) 16 | xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() 17 | elif isinstance(boxes, torch.Tensor): 18 | xyxy = boxes.numpy() 19 | else: 20 | xyxy = boxes 21 | detections = sv.Detections(xyxy=xyxy) 22 | 23 | if logits is not None and len(phrases) == logits.size(0): 24 | labels = [ 25 | f"{phrase} {logit:.2f}" 26 | for phrase, logit 27 | in zip(phrases, logits) 28 | ] 29 | else: 30 | labels = phrases 31 | 32 | if color is None or (not isinstance(color, tuple)): 33 | svcolor = sv.ColorPalette.default() 34 | else: 35 | svcolor = sv.Color(*color) 36 | text_color = sv.Color.white() if set_text_color == 'white' else sv.Color.black() 37 | box_annotator = sv.BoxAnnotator(color=svcolor, text_padding=text_padding, text_color=text_color) 38 | annotated_frame = cv2.cvtColor(image_source, 
cv2.COLOR_RGB2BGR) 39 | annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) 40 | return annotated_frame 41 | 42 | 43 | def video_to_gif(video, giffile, fps=5.0, toBGR=False): 44 | assert giffile.endswith('.gif') 45 | with imageio.get_writer(giffile, mode='I', duration=1.0/fps, loop=0) as writer: 46 | for frame in video: 47 | frame_vis = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) if toBGR else np.copy(frame) 48 | writer.append_data(frame_vis) -------------------------------------------------------------------------------- /assets/wacv25_openmixer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/assets/wacv25_openmixer.png -------------------------------------------------------------------------------- /config_files/jhmdb/openmixer_e2e.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/JHMDB" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['jhmdb'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | JHMDB: 11 | FRAME_DIR: "Frames/" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | SAMPLES_SPLIT: 0 16 | VOCAB_REFINE: 'vocab_gpt3.5.json' 17 | MODEL: 18 | WEIGHT: null 19 | BACKBONE: 20 | CONV_BODY: "ViP-B/16" 21 | PATHWAYS: 1 22 | RESIDUAL_LATERAL: True 23 | STM: 24 | NUM_QUERIES: 100 25 | HIDDEN_DIM: 512 26 | NUM_STAGES: 3 27 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 28 | OBJECT_CLASSES: 1 29 | NUM_HEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | NUM_FCS: 2 33 | ACTIVATION: 'ReLU' 34 | SPATIAL_POINTS: 32 35 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 36 | OUT_MULTIPLIER: 4 37 | N_GROUPS: 4 38 | NUM_CLS: 1 39 | NUM_ACT: 1 40 | NUM_REG: 1 41 | OBJECT_WEIGHT: 2.0 42 | ACTION_WEIGHT: 48.0 43 | GIOU_WEIGHT: 2.0 44 | L1_WEIGHT: 2.0 45 | BACKGROUND_WEIGHT: 0.1 46 | INTERMEDIATE_SUPERVISION: True 47 | PERSON_THRESHOLD: 0.6 48 | USE_CLS_FEAT: True 49 | COND_CLS: True 50 | FUSE_CLS: True 51 | FUSE_METHOD: 'logit_fusion' 52 | FUSE_FACTOR: 0.99 53 | DeST: True 54 | TEXT_ENCODER: 'CLIPViP' 55 | CLIPViP: 56 | ARCH: ViP-B/16 57 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 58 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 59 | TEMPORAL_SIZE: 12 60 | USE_TEMPORAL_EMBED: True 61 | LOGIT_SCALE_INIT: 4.6 62 | ADD_CLS_NUM: 3 63 | CONTEXT_INIT: 'a ' 64 | LEN_CONTEXT: 24 65 | CAM_METHOD: 'RITSM' 66 | USE_ATTN: False 67 | MULTI_LABEL_ACTION: False # softmax 68 | ViT: 69 | LAYER_DECAY: 1.0 70 | WEIGHT_DECAY: 1e-5 71 | SOLVER: 72 | MAX_EPOCH: 12 73 | BASE_LR: 0.00001 74 | WEIGHT_DECAY: 1e-4 75 | STEPS: (5, 8) 76 | WARMUP_FACTOR: 0.1 77 | WARMUP_EPOCH: 2 78 | CHECKPOINT_PERIOD: 1 79 | EVAL_PERIOD: 1 80 | EVAL_AFTER: 2 81 | VIDEOS_PER_BATCH: 16 82 | OPTIMIZING_METHOD: 'adamw' 83 | TEST: 84 | VIDEOS_PER_BATCH: 16 85 | EVAL_OPEN: True 86 | METRIC: 'video_ap' 87 | SMALL_OPEN_WORLD: True 88 | INDEPENDENT_EVAL: True 89 | OUTPUT_DIR: "output/jhmdb/openmixer_e2e" 90 | -------------------------------------------------------------------------------- /config_files/jhmdb/openmixer_zsr_tl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/JHMDB" 3 
| NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['jhmdb'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | JHMDB: 11 | FRAME_DIR: "Frames/" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | SAMPLES_SPLIT: 0 16 | VOCAB_REFINE: 'vocab_gpt3.5.json' 17 | MODEL: 18 | WEIGHT: null 19 | BACKBONE: 20 | CONV_BODY: "ViP-B/16" 21 | PATHWAYS: 1 22 | STM: 23 | NUM_QUERIES: 100 24 | HIDDEN_DIM: 512 25 | NUM_STAGES: 3 26 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 27 | OBJECT_CLASSES: 1 28 | NUM_HEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | NUM_FCS: 2 32 | ACTIVATION: 'ReLU' 33 | SPATIAL_POINTS: 32 34 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 35 | OUT_MULTIPLIER: 4 36 | N_GROUPS: 4 37 | NUM_CLS: 1 38 | NUM_ACT: 1 39 | NUM_REG: 1 40 | OBJECT_WEIGHT: 2.0 41 | ACTION_WEIGHT: 48.0 42 | GIOU_WEIGHT: 2.0 43 | L1_WEIGHT: 2.0 44 | BACKGROUND_WEIGHT: 0.1 45 | INTERMEDIATE_SUPERVISION: True 46 | PERSON_THRESHOLD: 0.6 47 | USE_CLS_FEAT: True 48 | PRETRAIN_ACTION: True 49 | TEXT_ENCODER: 'CLIPViP' 50 | CLIPViP: 51 | ARCH: ViP-B/16 52 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 53 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 54 | TEMPORAL_SIZE: 12 55 | USE_TEMPORAL_EMBED: True 56 | LOGIT_SCALE_INIT: 4.6 57 | ADD_CLS_NUM: 3 58 | CONTEXT_INIT: 'a ' 59 | LEN_CONTEXT: 24 60 | CAM_METHOD: 'RITSM' 61 | USE_ATTN: False 62 | MULTI_LABEL_ACTION: False # softmax 63 | ViT: 64 | LAYER_DECAY: 1.0 65 | WEIGHT_DECAY: 1e-5 66 | SOLVER: 67 | MAX_EPOCH: 12 68 | BASE_LR: 0.00001 69 | WEIGHT_DECAY: 1e-4 70 | STEPS: (5, 8) 71 | WARMUP_FACTOR: 0.1 72 | WARMUP_EPOCH: 2 73 | CHECKPOINT_PERIOD: 1 74 | EVAL_PERIOD: 1 75 | EVAL_AFTER: 2 76 | VIDEOS_PER_BATCH: 16 77 | OPTIMIZING_METHOD: 'adamw' 78 | TEST: 79 | VIDEOS_PER_BATCH: 16 80 | EVAL_OPEN: True 81 | METRIC: 'video_ap' 82 | SMALL_OPEN_WORLD: True 83 | INDEPENDENT_EVAL: True 84 | OUTPUT_DIR: "output/jhmdb/openmixer_zsr_tl" 85 | -------------------------------------------------------------------------------- /config_files/jhmdb/openmixer_zsr_zsl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/JHMDB" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['jhmdb'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | JHMDB: 11 | FRAME_DIR: "Frames/" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | SAMPLES_SPLIT: 0 16 | VOCAB_REFINE: 'vocab_gpt3.5.json' 17 | PRIOR_BOX_FILE: 'JHMDB-MaskRCNN.pkl' 18 | MODEL: 19 | DET: NaiveBaseline 20 | MULTI_LABEL_ACTION: False 21 | PRIOR_BOXES_INIT: 'det' 22 | WEIGHT: null 23 | BACKBONE: 24 | CONV_BODY: "ViP-B/16" 25 | PATHWAYS: 1 26 | STM: 27 | USE_CLS_FEAT: True 28 | TEXT_ENCODER: 'CLIPViP' 29 | CLIPViP: 30 | ARCH: ViP-B/16 31 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 32 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 33 | TEMPORAL_SIZE: 12 34 | USE_TEMPORAL_EMBED: True 35 | LOGIT_SCALE_INIT: 4.6 36 | ADD_CLS_NUM: 3 37 | # CONTEXT_INIT: 'a video of ' 38 | LEN_CONTEXT: 24 39 | TEST: 40 | VIDEOS_PER_BATCH: 32 41 | EVAL_OPEN: True 42 | METRIC: 'video_ap' 43 | SMALL_OPEN_WORLD: 
True 44 | INDEPENDENT_EVAL: True 45 | OUTPUT_DIR: "output/jhmdb/openmixer_zsr_zsl" 46 | -------------------------------------------------------------------------------- /config_files/ucf24/openmixer_e2e.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/UCF24" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['ucf24'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | UCF24: 11 | FRAME_DIR: "rgb-images" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | VOCAB_REFINE: 'vocab_gpt4.json' 16 | # PRIOR_BOX_FILE: 'UCF24-GDINO-top10.pkl' # 'UCF24-MaskRCNN.pkl', 'UCF24-GDINO-top10.pkl' 17 | MODEL: 18 | WEIGHT: null 19 | BACKBONE: 20 | CONV_BODY: "ViP-B/16" 21 | PATHWAYS: 1 22 | RESIDUAL_LATERAL: True 23 | STM: 24 | NUM_QUERIES: 100 25 | HIDDEN_DIM: 512 26 | NUM_STAGES: 3 27 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 28 | OBJECT_CLASSES: 1 29 | NUM_HEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | NUM_FCS: 2 33 | ACTIVATION: 'ReLU' 34 | SPATIAL_POINTS: 32 35 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 36 | OUT_MULTIPLIER: 4 37 | N_GROUPS: 4 38 | NUM_CLS: 1 39 | NUM_ACT: 1 40 | NUM_REG: 1 41 | OBJECT_WEIGHT: 2.0 42 | ACTION_WEIGHT: 8.0 43 | GIOU_WEIGHT: 2.0 44 | L1_WEIGHT: 2.0 45 | BACKGROUND_WEIGHT: 0.1 46 | INTERMEDIATE_SUPERVISION: True 47 | PERSON_THRESHOLD: 0.6 48 | USE_CLS_FEAT: True 49 | COND_CLS: True 50 | FUSE_CLS: True 51 | FUSE_METHOD: 'logit_fusion' 52 | FUSE_FACTOR: 0.999 53 | DeST: True 54 | TEXT_ENCODER: 'CLIPViP' 55 | CLIPViP: 56 | ARCH: ViP-B/16 57 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 58 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 59 | TEMPORAL_SIZE: 12 60 | USE_TEMPORAL_EMBED: True 61 | LOGIT_SCALE_INIT: 4.6 62 | ADD_CLS_NUM: 3 63 | CONTEXT_INIT: '' 64 | LEN_CONTEXT: 24 65 | CAM_METHOD: 'RITSM' 66 | USE_ATTN: False 67 | MULTI_LABEL_ACTION: False # softmax 68 | ViT: 69 | LAYER_DECAY: 1.0 70 | WEIGHT_DECAY: 1e-5 71 | SOLVER: 72 | MAX_EPOCH: 12 73 | BASE_LR: 0.00001 74 | WEIGHT_DECAY: 1e-4 75 | STEPS: (5, 8) 76 | WARMUP_FACTOR: 0.1 77 | WARMUP_EPOCH: 2 78 | CHECKPOINT_PERIOD: 1 79 | EVAL_PERIOD: 1 80 | EVAL_AFTER: 2 81 | VIDEOS_PER_BATCH: 8 82 | OPTIMIZING_METHOD: 'adamw' 83 | TEST: 84 | VIDEOS_PER_BATCH: 16 85 | EVAL_OPEN: True 86 | METRIC: 'video_ap' 87 | SMALL_OPEN_WORLD: True 88 | INDEPENDENT_EVAL: True 89 | IOU_THRESH: 0.2 90 | # PRIOR_BOX_TEST: True 91 | OUTPUT_DIR: "output/ucf24/openmixer_e2e" 92 | -------------------------------------------------------------------------------- /config_files/ucf24/openmixer_zsr_tl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/UCF24" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['ucf24'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | UCF24: 11 | FRAME_DIR: "rgb-images" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | VOCAB_REFINE: 'vocab_gpt4.json' 16 | MODEL: 17 | WEIGHT: null 18 | BACKBONE: 19 | CONV_BODY: "ViP-B/16" 20 | PATHWAYS: 1 21 | RESIDUAL_LATERAL: True 22 | STM: 23 | NUM_QUERIES: 100 24 
| HIDDEN_DIM: 512 25 | NUM_STAGES: 3 26 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 27 | OBJECT_CLASSES: 1 28 | NUM_HEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | NUM_FCS: 2 32 | ACTIVATION: 'ReLU' 33 | SPATIAL_POINTS: 32 34 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 35 | OUT_MULTIPLIER: 4 36 | N_GROUPS: 4 37 | NUM_CLS: 1 38 | NUM_ACT: 1 39 | NUM_REG: 1 40 | OBJECT_WEIGHT: 2.0 41 | ACTION_WEIGHT: 48.0 42 | GIOU_WEIGHT: 2.0 43 | L1_WEIGHT: 2.0 44 | BACKGROUND_WEIGHT: 0.1 45 | INTERMEDIATE_SUPERVISION: True 46 | PERSON_THRESHOLD: 0.6 47 | USE_CLS_FEAT: True 48 | PRETRAIN_ACTION: True 49 | TEXT_ENCODER: 'CLIPViP' 50 | CLIPViP: 51 | ARCH: ViP-B/16 52 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 53 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 54 | TEMPORAL_SIZE: 12 55 | USE_TEMPORAL_EMBED: True 56 | LOGIT_SCALE_INIT: 4.6 57 | ADD_CLS_NUM: 3 58 | CONTEXT_INIT: '' 59 | LEN_CONTEXT: 24 60 | CAM_METHOD: 'RITSM' 61 | USE_ATTN: False 62 | MULTI_LABEL_ACTION: False # softmax 63 | ViT: 64 | LAYER_DECAY: 1.0 65 | WEIGHT_DECAY: 1e-5 66 | SOLVER: 67 | MAX_EPOCH: 12 68 | BASE_LR: 0.00001 69 | WEIGHT_DECAY: 1e-4 70 | STEPS: (5, 8) 71 | WARMUP_FACTOR: 0.1 72 | WARMUP_EPOCH: 2 73 | CHECKPOINT_PERIOD: 1 74 | EVAL_PERIOD: 1 75 | EVAL_AFTER: 2 76 | VIDEOS_PER_BATCH: 16 77 | OPTIMIZING_METHOD: 'adamw' 78 | TEST: 79 | VIDEOS_PER_BATCH: 16 80 | EVAL_OPEN: True 81 | METRIC: 'video_ap' 82 | SMALL_OPEN_WORLD: True 83 | INDEPENDENT_EVAL: True 84 | IOU_THRESH: 0.2 85 | OUTPUT_DIR: "output/ucf24/openmixer_zsr_tl" 86 | -------------------------------------------------------------------------------- /config_files/ucf24/openmixer_zsr_zsl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/UCF24" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['ucf24'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | UCF24: 11 | FRAME_DIR: "rgb-images" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | VOCAB_REFINE: 'vocab_gpt4.json' 16 | PRIOR_BOX_FILE: 'UCF24-MaskRCNN.pkl' 17 | MODEL: 18 | DET: NaiveBaseline 19 | MULTI_LABEL_ACTION: False 20 | PRIOR_BOXES_INIT: 'det' # prior boxes in testing 21 | WEIGHT: null 22 | BACKBONE: 23 | CONV_BODY: "ViP-B/16" 24 | PATHWAYS: 1 25 | STM: 26 | USE_CLS_FEAT: True 27 | TEXT_ENCODER: 'CLIPViP' 28 | CLIPViP: 29 | ARCH: ViP-B/16 30 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 31 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 32 | TEMPORAL_SIZE: 12 33 | USE_TEMPORAL_EMBED: True 34 | LOGIT_SCALE_INIT: 4.6 35 | ADD_CLS_NUM: 3 36 | # CONTEXT_INIT: 'a video of ' 37 | LEN_CONTEXT: 24 38 | TEST: 39 | VIDEOS_PER_BATCH: 64 40 | EVAL_OPEN: True 41 | METRIC: 'video_ap' 42 | SMALL_OPEN_WORLD: True 43 | INDEPENDENT_EVAL: True 44 | IOU_THRESH: 0.2 45 | OUTPUT_DIR: "output/ucf24/openmixer_zsr_zsl" 46 | -------------------------------------------------------------------------------- /preprocess/generate_vdt_jhmdb.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import os 4 | import copy 5 | import json 6 | 7 | import openai 8 | openai.api_key = "YOUR_OPENAI_KEY_HERE" 9 | 10 | 11 | def read_class_list(filepath): 12 | class_list = [] 13 | with open(filepath, 'r') as f: 14 | for line in 
f.readlines(): 15 | class_list.append(line.strip()) 16 | return class_list 17 | 18 | def read_class_description(filepath): 19 | with open(filepath, 'r') as f: 20 | refine_maps = json.load(f) 21 | return refine_maps 22 | 23 | 24 | def run_gpt4(class_name): 25 | prompt = """ 26 | What are the visual features for distinguishing {}? Please describe with a few short sentences. 27 | """ 28 | cls_name = re.sub("_", " ", class_name) 29 | message = [ 30 | {"role": "system", "content": "You are a useful assistant."}, 31 | {"role": "user", "content": prompt.format(cls_name)} 32 | ] 33 | 34 | response = openai.ChatCompletion.create( 35 | model="gpt-4-0613", 36 | max_tokens=1024, 37 | temperature=1.2, 38 | messages = message) 39 | 40 | # parse the response 41 | result = response['choices'][0]['message']['content'] 42 | return result 43 | 44 | 45 | def generate_different_meaning(): 46 | jhmdb_classes = read_class_list(os.path.join(data_path, 'vocab_open.txt')) 47 | 48 | results = {} 49 | for clsname in jhmdb_classes: 50 | print("\nProcessing action: {}...".format(clsname)) 51 | cls_name = re.sub("_", " ", clsname) 52 | prompt = f"Generate 16 unique sentences describing the action '{cls_name}':" 53 | message = [ 54 | {"role": "system", "content": "You are a useful assistant."}, 55 | {"role": "user", "content": prompt} 56 | ] 57 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed 58 | res = response.choices[0]['message']['content'].strip().split('\n') 59 | print(res) 60 | 61 | results[clsname] = res 62 | 63 | with open(os.path.join(data_path, "vocab_gpt4_m16.json"), "w") as outfile: 64 | json.dump(results, outfile) 65 | 66 | 67 | def generate_same_meaning(): 68 | class_descriptions = read_class_description(os.path.join(data_path, 'vocab_gpt3.5.json')) 69 | 70 | results = {} 71 | for clsname, desc in class_descriptions.items(): 72 | print("\nProcessing action: {}...".format(clsname)) 73 | cls_name = re.sub("_", " ", clsname) 74 | cap_prefix, cap = desc.split(": ") 75 | prompt = f"Given a sport action type from JHMDB dataset, such as '{cls_name}, please provide 16 different sentences that express the same meaning of the caption: '{cap}'." 76 | message = [ 77 | {"role": "system", "content": "You are a useful assistant."}, 78 | {"role": "user", "content": prompt} 79 | ] 80 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed 81 | res = response.choices[0]['message']['content'].strip().split('\n') 82 | res = [desc] + [re.sub(r'\d+.', f'{cap_prefix}:', cap) for cap in res] 83 | print(res) 84 | 85 | results[clsname] = res 86 | 87 | 88 | with open(os.path.join(data_path, "vocab_gpt4_m16new.json"), "w") as outfile: 89 | json.dump(results, outfile, indent=4) 90 | 91 | 92 | if __name__ == '__main__': 93 | random.seed(42) 94 | data_path = '../data/JHMDB/openworld' 95 | 96 | # generate_different_meaning() 97 | 98 | # generate_same_meaning() 99 | 100 | class_descriptions = read_class_description(os.path.join(data_path, 'vocab_gpt3.5.json')) 101 | 102 | # get candidate verbs 103 | seen_classes = read_class_list(os.path.join(data_path, 'train50%', 'vocab_closed_0.txt')) 104 | verbs_list = [clsname.split("_")[0] for clsname in seen_classes] 105 | 106 | prompt = """In this task, you are given an input sentence. 
107 | Your job is to tell me 16 output sentences with different meanings by only changing the action verbs using a list of candidate verbs. 108 | The output format should be a dictionary of key-value pair where keys are the verbs you are choosing, and values are the generated sentences.""" 109 | 110 | results = {} 111 | for clsname, desc in class_descriptions.items(): 112 | if clsname not in seen_classes: 113 | continue # only process the seen classes 114 | print("\nProcessing action: {}...".format(clsname)) 115 | cls_name = re.sub("_", " ", clsname) 116 | cap_prefix, cap = desc.split(": ") 117 | verbs_sub = copy.deepcopy(verbs_list) 118 | verbs_sub.remove(clsname.split("_")[0]) 119 | verbs_sub = ', '.join(verbs_sub) 120 | message = [ 121 | {"role": "system", "content": "You are a useful assistant."}, 122 | {"role": "user", "content": prompt + f" The input sentence: {cap} The candidate verb list: [{verbs_sub}]."} 123 | ] 124 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) 125 | res = response.choices[0]['message']['content'].strip().split('\n') 126 | result_list = [] 127 | for strline in res: 128 | if ': ' not in strline: 129 | continue 130 | strline = re.sub("\"", "", strline.strip(",")) 131 | prefix, sentence = strline.split(': ') 132 | result_list.append("{}: {}".format(prefix.capitalize(), sentence)) 133 | if len(result_list) == 8: 134 | break 135 | print(result_list) 136 | 137 | results[clsname] = result_list 138 | 139 | with open(os.path.join(data_path, 'train50%', "hardneg_closed_0.json"), "w") as outfile: 140 | json.dump(results, outfile, indent=4) 141 | 142 | -------------------------------------------------------------------------------- /preprocess/generate_vdt_ucf24.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import os 4 | import copy 5 | import json 6 | 7 | import openai 8 | openai.api_key = "YOUR_OPENAI_KEY_HERE" 9 | 10 | 11 | 12 | def read_class_list(filepath): 13 | class_list = [] 14 | with open(filepath, 'r') as f: 15 | for line in f.readlines(): 16 | class_list.append(line.strip()) 17 | return class_list 18 | 19 | 20 | if __name__ == '__main__': 21 | random.seed(42) 22 | data_path = '../data/UCF24/openworld' 23 | 24 | ucf24_classes = read_class_list(os.path.join(data_path, 'vocab_open.txt')) 25 | 26 | results = {} 27 | for clsname in ucf24_classes: 28 | print("\nProcessing action: {}...".format(clsname)) 29 | prompt = f"Generate 16 captions that describe the action '{clsname}'. For example, given the action dance, your output will be like: Dance: A person is dancing on the stage, with the body moving rhythmically to music." 30 | message = [ 31 | {"role": "system", "content": "You are a useful assistant."}, 32 | {"role": "user", "content": prompt} 33 | ] 34 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed 35 | res = response.choices[0]['message']['content'].strip().split('\n') 36 | res = [re.sub(r'\d+. 
', '', cap) for cap in res] 37 | print(res) 38 | results[clsname] = res 39 | 40 | with open(os.path.join(data_path, "vocab_gpt4.json"), "w") as outfile: 41 | json.dump(results, outfile, indent=4) 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | yacs 3 | opencv-python 4 | tensorboardX 5 | SciPy 6 | fvcore 7 | timm 8 | iopath 9 | git+https://github.com/openai/CLIP.git 10 | transformers 11 | ttach 12 | kornia 13 | scikit-learn 14 | scikit-image 15 | einops 16 | matplotlib 17 | supervision -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from alphaction.config import cfg 6 | from alphaction.dataset import make_data_loader 7 | from alphaction.engine.inference import inference 8 | from alphaction.modeling.detector import build_detection_model, build_naive_baseline 9 | from alphaction.utils.checkpoint import ActionCheckpointer 10 | from torch.utils.collect_env import get_pretty_env_info 11 | from alphaction.utils.comm import synchronize, get_rank 12 | from alphaction.utils.logger import setup_logger 13 | #pytorch issuse #973 14 | import resource 15 | 16 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 17 | resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1], rlimit[1])) 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") 22 | parser.add_argument( 23 | "--config-file", 24 | default="", 25 | metavar="FILE", 26 | help="path to config file", 27 | ) 28 | parser.add_argument("--local_rank", type=int, default=0) 29 | parser.add_argument( 30 | "opts", 31 | help="Modify config options using the command-line", 32 | default=None, 33 | nargs=argparse.REMAINDER, 34 | ) 35 | 36 | args = parser.parse_args() 37 | 38 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 39 | distributed = num_gpus > 1 40 | 41 | if distributed: 42 | torch.cuda.set_device(args.local_rank) 43 | torch.distributed.init_process_group( 44 | backend="nccl", init_method="env://" 45 | ) 46 | 47 | # Merge config file. 48 | cfg.merge_from_file(args.config_file) 49 | cfg.merge_from_list(args.opts) 50 | cfg.freeze() 51 | 52 | 53 | # Print experimental infos. 54 | save_dir = "" 55 | logger = setup_logger("alphaction", save_dir, get_rank()) 56 | logger.info("Using {} GPUs".format(num_gpus)) 57 | logger.info(cfg) 58 | 59 | logger.info("Collecting env info (might take some time)") 60 | logger.info("\n" + get_pretty_env_info()) 61 | 62 | # Build the model. 63 | if cfg.MODEL.DET == 'STMDetector': 64 | model = build_detection_model(cfg) 65 | elif cfg.MODEL.DET == 'NaiveBaseline': 66 | model = build_naive_baseline(cfg) 67 | model.to("cuda") 68 | 69 | if cfg.MODEL.DET != 'NaiveBaseline': 70 | # load weight. 
71 | output_dir = cfg.OUTPUT_DIR 72 | checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir) 73 | ckpt_file = os.path.join(output_dir, cfg.MODEL.WEIGHT) if cfg.MODEL.WEIGHT else None 74 | checkpointer.load(ckpt_file) 75 | 76 | output_folders = [None] * len(cfg.DATA.DATASETS) 77 | dataset_names = cfg.DATA.DATASETS 78 | if cfg.OUTPUT_DIR: 79 | for idx, dataset_name in enumerate(dataset_names): 80 | inf_folder = "inference" if not cfg.TEST.SMALL_OPEN_WORLD else "inference_small" 81 | output_folder = os.path.join(cfg.OUTPUT_DIR, inf_folder, dataset_name) 82 | os.makedirs(output_folder, exist_ok=True) 83 | output_folders[idx] = output_folder 84 | 85 | # Do inference. 86 | data_loaders_test, vocabularies_test, _ = make_data_loader(cfg, is_train=False, is_distributed=distributed) 87 | for i, (output_folder, dataset_name, data_loader_test) in enumerate(zip(output_folders, dataset_names, data_loaders_test)): 88 | # set open vocabulary 89 | if len(vocabularies_test) > 0: 90 | model.backbone.text_encoder.set_vocabulary(vocabularies_test[i]) 91 | 92 | inference( 93 | model, 94 | data_loader_test, 95 | dataset_name, 96 | output_folder=output_folder, 97 | metric=cfg.TEST.METRIC, 98 | use_cache=True 99 | ) 100 | synchronize() 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /third_party/eval_utils.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | LIB_PATH=['../alphaction/dataset/datasets/evaluation'] 3 | sys.path.extend(LIB_PATH) 4 | 5 | from pascal_evaluation.object_detection_evaluation import PascalDetectionEvaluator 6 | from pascal_evaluation.standard_fields import InputDataFields, DetectionResultFields 7 | 8 | import pickle 9 | import numpy as np 10 | 11 | 12 | 13 | def load_gt_data(anno_file, split=0): 14 | assert os.path.exists(anno_file), "Annotation file does not exist: {}".format(anno_file) 15 | with open(anno_file, 'rb') as fid: 16 | data = pickle.load(fid, encoding='iso-8859-1') 17 | return data 18 | 19 | 20 | def eval_person_boxes(results, gt_data): 21 | class_id = 1 22 | 23 | pascal_evaluator = PascalDetectionEvaluator([{'id': class_id, 'name': 'person'}], 24 | matching_iou_threshold=0.5) 25 | 26 | # prepare ground truth 27 | for vid, annos in gt_data['gttubes'].items(): 28 | # each video contains only one action type 29 | act_id = list(annos.keys())[0] 30 | act_annos = annos[act_id][0] 31 | height, width = gt_data['resolution'][vid] 32 | # each action type contains only one action box on a frame 33 | for fid_box in act_annos: 34 | img_key = "%s,%04d" % (vid, float(fid_box[0])) 35 | box_normed = fid_box[1:5] / np.array([width, height, width, height], dtype=np.float32) # (xyxy) 36 | box_normed = box_normed[[1, 0, 3, 2]] # (yxyx) 37 | pascal_evaluator.add_single_ground_truth_image_info( 38 | img_key, { 39 | InputDataFields.groundtruth_boxes: box_normed[None], 40 | InputDataFields.groundtruth_classes: np.array([class_id], dtype=int), 41 | InputDataFields.groundtruth_difficult: np.zeros(1, dtype=bool) 42 | }) 43 | 44 | # prepare detection results 45 | for vid, dets in results.items(): 46 | boxes, scores = dets['boxes'], dets['scores'] 47 | frame_ids = list(boxes.keys()) 48 | for fid in frame_ids: 49 | img_key = "%s,%04d" % (vid, float(fid)) 50 | boxes_pred = boxes[fid].copy() 51 | boxes_pred = boxes_pred[:, [1, 0, 3, 2]] 52 | pascal_evaluator.add_single_detected_image_info( 53 | img_key, { 54 | DetectionResultFields.detection_boxes: 
boxes_pred, 55 | DetectionResultFields.detection_classes: np.array([class_id]*len(boxes[fid]), dtype=int), 56 | DetectionResultFields.detection_scores: scores[fid].copy() 57 | }) 58 | 59 | eval_res = pascal_evaluator.evaluate() 60 | 61 | precisions = pascal_evaluator._evaluation.precisions_per_class 62 | recalls = pascal_evaluator._evaluation.recalls_per_class 63 | 64 | return eval_res, precisions, recalls -------------------------------------------------------------------------------- /third_party/maskrcnn_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops import box_convert 3 | from video_io import * 4 | from tqdm import tqdm 5 | 6 | 7 | def preprocess_clip(clip, device): 8 | """ clip: (T, H, W, 3) in uint8 format 9 | """ 10 | # preprocess video 11 | clip = torch.from_numpy(clip).to(device).float() / 255.0 12 | clip = clip.permute(0, 3, 1, 2).contiguous() # (T, C, H, W) 13 | return clip 14 | 15 | 16 | def maskrcnn_video(video_path, model, categories, box_thresh=0.35, topk=None, batch_size=16, fmt='%05d.png', device=torch.device('cuda')): 17 | # load video data 18 | if isinstance(video_path, list): 19 | video = read_video_from_list(video_path) 20 | video_name = os.path.dirname(video_path[0]).split("/")[-1] 21 | elif os.path.isfile(video_path): 22 | video = read_video_from_file(video_path) # (T, H, W, C) in RGB uint8 format 23 | video_name = video_path.split("/")[-1][:-4] 24 | else: 25 | video = read_video_from_folder(video_path, fmt=fmt) 26 | video_name = video_path.split("/")[-1] 27 | num_frames = len(video) 28 | 29 | if isinstance(video_path, list): 30 | frame_ids = [int(imgfile[:-4].split("/")[-1].split("_")[-1]) 31 | for imgfile in video_path] 32 | else: 33 | frame_ids = list(range(num_frames)) 34 | 35 | if num_frames > batch_size: 36 | video = np.array_split(video, int(num_frames // batch_size)) 37 | frame_ids = np.array_split(frame_ids, int(num_frames // batch_size)) 38 | else: 39 | video, frame_ids = [video], [frame_ids] 40 | 41 | results = {'boxes': dict(), 'scores': dict()} 42 | for fids, clip in tqdm(zip(frame_ids, video), total=len(video), desc="{}".format(video_name), ncols=0): 43 | # preprocess 44 | height, width = clip.shape[1:3] 45 | batch = preprocess_clip(clip, device) # (T, 3, H, W) 46 | with torch.no_grad(): 47 | outputs = model(batch) 48 | # get results 49 | for i, outs in enumerate(outputs): 50 | mask = outs['labels'] == categories.index('person') 51 | if not any(mask): 52 | continue # no person at all 53 | 54 | if box_thresh is not None: 55 | mask = mask & (outs['scores'] > box_thresh) 56 | if topk is not None: 57 | inds = torch.topk(outs['scores'], topk)[1] 58 | topk_mask = torch.zeros_like(outs['scores'], dtype=torch.bool).scatter_(0, inds, True) 59 | mask = mask & topk_mask 60 | if not any(mask): # no valid person 61 | continue 62 | 63 | # mask out and sort boxes and scores 64 | boxes = outs['boxes'][mask] # the predicted boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H. 
65 | scores = outs['scores'][mask] 66 | idx = torch.argsort(scores, descending=True) 67 | boxes, scores = boxes[idx], scores[idx] 68 | # save 69 | boxes[:, [0, 2]] = boxes[:, [0, 2]] / width 70 | boxes[:, [1, 3]] = boxes[:, [1, 3]] / height 71 | results['boxes'][fids[i]] = boxes.cpu().numpy() # normalized (x1, y1, x2, y2) 72 | results['scores'][fids[i]] = scores.cpu().numpy() 73 | 74 | return results 75 | -------------------------------------------------------------------------------- /third_party/run_maskrcnn.py: -------------------------------------------------------------------------------- 1 | import os, argparse, pickle 2 | from tqdm import tqdm 3 | 4 | import torch 5 | from torchvision.models.detection import maskrcnn_resnet50_fpn 6 | from torchvision.models._meta import _COCO_CATEGORIES 7 | from maskrcnn_utils import maskrcnn_video 8 | from video_io import vis_dets 9 | 10 | from eval_utils import eval_person_boxes, load_gt_data 11 | from pprint import pformat 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | def main(args): 16 | 17 | dataset = args.data.upper() 18 | if args.data == 'jhmdb': 19 | video_dir = f'../data/{dataset}/Frames' 20 | fmt = '%05d.png' 21 | box_thresh, topk = None, 1 22 | 23 | elif args.data == 'ucf24': 24 | video_dir = f'../data/{dataset}/rgb-images' 25 | fmt = '%05d.jpg' 26 | box_thresh, topk = 0.35, None 27 | 28 | else: 29 | raise NotImplementedError(f"Unsupported dataset: {args.data}") 30 | results_save_file = f'../data/{dataset}/{dataset}-MaskRCNN.pkl' 31 | 32 | if not os.path.exists(results_save_file): 33 | # setup device and model 34 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 35 | model = maskrcnn_resnet50_fpn(pretrained=True).to(device) 36 | model.eval() 37 | 38 | all_video_files = [] 39 | for folder in os.listdir(video_dir): 40 | videos_class_path = os.path.join(video_dir, folder) 41 | if not os.path.isdir(videos_class_path): 42 | continue 43 | vid_files = [folder + '/' + vid for vid in os.listdir(videos_class_path) if os.path.isdir(os.path.join(videos_class_path, vid))] 44 | all_video_files.extend(vid_files) 45 | 46 | results = dict() 47 | for vid in tqdm(all_video_files, total=len(all_video_files), ncols=0): 48 | print("\nRunning on the file: {}...".format(vid)) 49 | results[vid] = maskrcnn_video(os.path.join(video_dir, vid), model, _COCO_CATEGORIES, 50 | fmt=fmt, box_thresh=box_thresh, topk=topk, device=device) 51 | 52 | with open(results_save_file, 'wb') as fid: 53 | pickle.dump(results, fid, protocol=pickle.HIGHEST_PROTOCOL) 54 | 55 | else: 56 | with open(results_save_file, 'rb') as fid: 57 | results = pickle.load(fid, encoding='iso-8859-1') 58 | 59 | # evaluation 60 | if args.eval: 61 | # load the ground truth 62 | jhmdb_gt_file = '../data/JHMDB/JHMDB-GT.pkl' 63 | gt_data = load_gt_data(jhmdb_gt_file) 64 | 65 | eval_res, precisions, recalls = eval_person_boxes(results, gt_data) 66 | 67 | print(pformat(eval_res, indent=2)) 68 | 69 | plt.figure(figsize=(10, 6)) 70 | plt.plot(recalls[0], precisions[0], label="Precision-Recall curve") 71 | plt.xlabel("Recall") 72 | plt.ylabel("Precision") 73 | plt.legend(loc="lower left") 74 | plt.tight_layout() 75 | plt.savefig('../temp/jhmdb/precision_recall_curve_maskrcnn.png', bbox_inches='tight') 76 | plt.close() 77 | 78 | # visualize 79 | if args.vis: 80 | test_video = ['kick_ball/FIFA_11_Gamescom-Trailer_kick_ball_f_cm_np1_ba_med_4'] 81 | save_dir = os.path.join(os.path.dirname(results_save_file), 'VisMaskRCNN') 82 | os.makedirs(save_dir, exist_ok=True) 83 | for vid in test_video: 84 | savefile = 
os.path.join(save_dir, vid.replace('/', '-') + "_pred.mp4") 85 | vis_dets(results, vid, video_dir, savefile) 86 | 87 | 88 | if __name__ == '__main__': 89 | 90 | parser = argparse.ArgumentParser(description="Mask RCNN (ResNet-50 FPN) Experiments") 91 | parser.add_argument( 92 | "--data", type=str, default='jhmdb', choices=['jhmdb', 'ucf24'], help="dataset used for testing", 93 | ) 94 | parser.add_argument( 95 | "--vis", action='store_true', help="visualize the detection results", 96 | ) 97 | parser.add_argument( 98 | "--eval", action='store_true', help="evaluate the quality" 99 | ) 100 | args = parser.parse_args() 101 | 102 | main(args) -------------------------------------------------------------------------------- /third_party/video_io.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os 4 | import supervision as sv 5 | 6 | 7 | def read_video_from_file(video_file, toRGB=True): 8 | assert os.path.exists(video_file), "File does not exist! {}".format(video_file) 9 | cap = cv2.VideoCapture(video_file) 10 | success, frame = cap.read() 11 | video = [] 12 | while success: 13 | if toRGB: 14 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 15 | video.append(frame) 16 | success, frame = cap.read() 17 | video = np.array(video) 18 | return video 19 | 20 | 21 | def read_video_from_folder(video_path, fmt='%05d.png', start_frame=1, toRGB=True): 22 | frame_files = [name for name in os.listdir(video_path) if name.endswith(fmt[-4:])] 23 | vid_name = video_path.split("/")[-1] 24 | video = [] 25 | for i in range(len(frame_files)): 26 | if len(fmt.split("_")) == 1: 27 | frame_name = fmt%(i + start_frame) 28 | elif len(fmt.split("_")) == 2: 29 | frame_name = fmt%(vid_name, i + start_frame) 30 | frame_file = os.path.join(video_path, frame_name) # frame starting from 1 31 | frame = cv2.imread(frame_file) 32 | if toRGB: 33 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 34 | video.append(frame) 35 | video = np.array(video) 36 | return video 37 | 38 | 39 | def read_video_from_list(img_list, toRGB=True): 40 | video = [] 41 | for frame_file in img_list: 42 | frame = cv2.imread(frame_file) 43 | if toRGB: 44 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 45 | video.append(frame) 46 | video = np.array(video) 47 | return video 48 | 49 | 50 | def write_video(mat, video_file, fps=30, write_frames=True): 51 | """ mat: (T, H, W, C) 52 | """ 53 | video_writer = cv2.VideoWriter(video_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (mat.shape[2], mat.shape[1])) 54 | for frame in mat: 55 | video_writer.write(frame) 56 | 57 | if write_frames: 58 | os.makedirs(video_file[:-4], exist_ok=True) 59 | for i, frame in enumerate(mat): 60 | cv2.imwrite(os.path.join(video_file[:-4], '%06d.jpg'%(i)), frame) 61 | 62 | 63 | def vis_dets(results, vid, video_dir, savefile): 64 | # read frames 65 | video = read_video_from_folder(os.path.join(video_dir, vid), toRGB=False) # (T, H, W, C) 66 | # parse detections 67 | boxes = results[vid]['boxes'] # normalized (x1, y1, x2, y2) 68 | scores = results[vid]['scores'] 69 | video_vis = [] 70 | # visualize 71 | for i, frame in enumerate(video): 72 | h, w = frame.shape[:2] 73 | xyxy = boxes[i] * np.array([[w, h, w, h]]) 74 | detections = sv.Detections(xyxy=xyxy, confidence=scores[i]) 75 | labels = [f"person {s:.2f}" for s in scores[i]] 76 | # annotate on frame 77 | box_annotator = sv.BoxAnnotator() 78 | annotated_frame = box_annotator.annotate(scene=frame.copy(), detections=detections, labels=labels) 79 | 
video_vis.append(annotated_frame) 80 | video_vis = np.array(video_vis) 81 | # visualize 82 | write_video(video_vis, savefile, fps=20, write_frames=False) -------------------------------------------------------------------------------- /trainval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PHASE=$1 # train, eval 4 | DATASET=$2 # jhmdb, ucf24 5 | 6 | CFG_FILE="config_files/${DATASET}/openmixer_e2e.yaml" 7 | # CFG_FILE="config_files/${DATASET}/openmixer_zsr_tl.yaml" 8 | # CFG_FILE="config_files/${DATASET}/openmixer_zsr_zsl.yaml" # eval-only! 9 | 10 | TEST_WEIGHT=${3:-'checkpoints/model_final.pth'} 11 | 12 | eval "$(conda shell.bash hook)" 13 | conda activate openmixer 14 | 15 | if [ $PHASE == 'train' ] 16 | then 17 | python -m torch.distributed.launch --nproc_per_node=4 --master_port=2024 train_net.py \ 18 | --config-file ${CFG_FILE} \ 19 | --transfer \ 20 | --no-head \ 21 | --use-tfboard 22 | elif [ $PHASE == 'eval' ] 23 | then 24 | python -m torch.distributed.launch --nproc_per_node=4 --master_port=2405 test_net.py \ 25 | --config-file ${CFG_FILE} \ 26 | MODEL.WEIGHT ${TEST_WEIGHT} 27 | fi 28 | 29 | echo "${PHASE} finished!" --------------------------------------------------------------------------------
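A minimal usage sketch for trainval.sh above, assuming a machine with 4 GPUs and a conda environment named openmixer (both expected by the script); the phases, datasets, and default checkpoint path come directly from the script and from test_net.py:

# train the end-to-end OpenMixer model on JHMDB using config_files/jhmdb/openmixer_e2e.yaml
bash trainval.sh train jhmdb

# evaluate a trained checkpoint on UCF24; the optional third argument defaults to
# 'checkpoints/model_final.pth' and is resolved relative to OUTPUT_DIR inside test_net.py
bash trainval.sh eval ucf24 checkpoints/model_final.pth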