├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── alphaction
│   ├── __init__.py
│   ├── cam
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── cam.py
│   │   ├── clip_loader.py
│   │   ├── hilacam.py
│   │   ├── mhsa.py
│   │   └── ritsm.py
│   ├── config
│   │   ├── __init__.py
│   │   └── defaults.py
│   ├── dataset
│   │   ├── __init__.py
│   │   ├── build.py
│   │   ├── collate_batch.py
│   │   ├── datasets
│   │   │   ├── __init__.py
│   │   │   ├── ava.py
│   │   │   ├── ava_dataset.py
│   │   │   ├── ava_helper.py
│   │   │   ├── concat_dataset.py
│   │   │   ├── cv2_transform.py
│   │   │   ├── evaluation
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ava
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── ava_eval.py
│   │   │   │   ├── evaluate_map.py
│   │   │   │   ├── jhmdb
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── jhmdb_eval.py
│   │   │   │   ├── pascal_evaluation
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── label_map_util.py
│   │   │   │   │   ├── metrics.py
│   │   │   │   │   ├── np_box_list.py
│   │   │   │   │   ├── np_box_list_ops.py
│   │   │   │   │   ├── np_box_mask_list.py
│   │   │   │   │   ├── np_box_mask_list_ops.py
│   │   │   │   │   ├── np_box_ops.py
│   │   │   │   │   ├── np_mask_ops.py
│   │   │   │   │   ├── object_detection_evaluation.py
│   │   │   │   │   ├── per_image_evaluation.py
│   │   │   │   │   └── standard_fields.py
│   │   │   │   ├── pascal_wrapper.py
│   │   │   │   └── ucf24
│   │   │   │       ├── __init__.py
│   │   │   │       └── ucf24_eval.py
│   │   │   ├── jhmdb_dataset.py
│   │   │   ├── ucf24_dataset.py
│   │   │   └── utils.py
│   │   └── samplers
│   │       ├── __init__.py
│   │       ├── distributed.py
│   │       ├── grouped_batch_sampler.py
│   │       └── iteration_based_batch_sampler.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── feature_extraction.py
│   │   ├── inference.py
│   │   └── trainer.py
│   ├── layers
│   │   ├── __init__.py
│   │   └── batch_norm.py
│   ├── modeling
│   │   ├── __init__.py
│   │   ├── backbone
│   │   │   ├── __init__.py
│   │   │   ├── backbone.py
│   │   │   ├── i3d.py
│   │   │   ├── sfmodels
│   │   │   │   ├── common.py
│   │   │   │   ├── nonlocal_helper.py
│   │   │   │   ├── resnet_helper.py
│   │   │   │   └── stem_helper.py
│   │   │   ├── slowfast.py
│   │   │   ├── video_model_builder.py
│   │   │   └── vit_utils.py
│   │   ├── common_blocks.py
│   │   ├── detector
│   │   │   ├── __init__.py
│   │   │   ├── action_detector.py
│   │   │   ├── naive_baseline.py
│   │   │   └── stm_detector.py
│   │   ├── dict_model.py
│   │   ├── encoders
│   │   │   ├── clipvip
│   │   │   │   ├── CLIP_ViP.py
│   │   │   │   ├── clipvip_encoder.py
│   │   │   │   ├── custom_layers.py
│   │   │   │   └── loader.py
│   │   │   ├── openai_clip
│   │   │   │   ├── clip_encoder.py
│   │   │   │   └── clip_loader.py
│   │   │   └── viclip
│   │   │       ├── README.md
│   │   │       ├── __init__.py
│   │   │       ├── bpe_simple_vocab_16e6.txt.gz
│   │   │       ├── demo.py
│   │   │       ├── simple_tokenizer.py
│   │   │       ├── viclip.py
│   │   │       ├── viclip_encoder.py
│   │   │       ├── viclip_text.py
│   │   │       └── viclip_vision.py
│   │   ├── nonlocal_block.py
│   │   ├── registry.py
│   │   ├── roi_heads
│   │   │   ├── __init__.py
│   │   │   ├── action_head
│   │   │   │   ├── IA_structure.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── action_head.py
│   │   │   │   ├── inference.py
│   │   │   │   ├── loss.py
│   │   │   │   ├── metric.py
│   │   │   │   ├── roi_action_feature_extractor.py
│   │   │   │   └── roi_action_predictors.py
│   │   │   └── roi_heads_3d.py
│   │   ├── stm_decoder
│   │   │   ├── stm_decoder.py
│   │   │   └── util
│   │   │       ├── __init__.py
│   │   │       ├── adaptive_mixing_operator.py
│   │   │       ├── box_ops.py
│   │   │       ├── head_utils.py
│   │   │       ├── loss.py
│   │   │       ├── misc.py
│   │   │       └── msaq.py
│   │   └── utils.py
│   ├── solver
│   │   ├── __init__.py
│   │   ├── build.py
│   │   └── lr_scheduler.py
│   ├── structures
│   │   ├── __init__.py
│   │   ├── bounding_box.py
│   │   └── memory_pool.py
│   └── utils
│       ├── IA_helper.py
│       ├── __init__.py
│       ├── c2_model_loading.py
│       ├── checkpoint.py
│       ├── comm.py
│       ├── logger.py
│       ├── metric_logger.py
│       ├── model_serialization.py
│       ├── random_seed.py
│       ├── registry.py
│       ├── video_decode.py
│       └── visualize.py
├── assets
│   └── wacv25_openmixer.png
├── config_files
│   ├── jhmdb
│   │   ├── openmixer_e2e.yaml
│   │   ├── openmixer_zsr_tl.yaml
│   │   └── openmixer_zsr_zsl.yaml
│   └── ucf24
│       ├── openmixer_e2e.yaml
│       ├── openmixer_zsr_tl.yaml
│       └── openmixer_zsr_zsl.yaml
├── demo.py
├── preprocess
│   ├── generate_vdt_jhmdb.py
│   ├── generate_vdt_ucf24.py
│   ├── openworld_split_jhmdb.py
│   └── openworld_split_ucf24.py
├── requirements.txt
├── test_net.py
├── third_party
│   ├── eval_utils.py
│   ├── maskrcnn_utils.py
│   ├── run_maskrcnn.py
│   └── video_io.py
├── train_net.py
└── trainval.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | pretrained
3 | output/
4 | *.pyc
5 | *.vscode
6 | *.log
7 | *.egg-info
8 | *.out
9 | *.err
10 | *_temp.*
11 | *.jpg
12 | *.jpeg
13 | .nfs*
14 | alphaction/cam/demo.py
15 | *.pth
16 | backup/
17 | *backup*
18 | */figures/*
19 | *.zip
20 | data_zip/
21 | data-release
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third_party/GroundingDINO"]
2 | path = third_party/GroundingDINO
3 | url = https://github.com/IDEA-Research/GroundingDINO.git
4 | ignore = dirty
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Wentao Bao
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenMixer
2 | This repository releases the source code of the WACV 2025 paper [OpenMixer](https://arxiv.org/pdf/2411.10922), which builds heavily on the [STMixer](https://github.com/MCG-NJU/STMixer) codebase. OpenMixer is an open-vocabulary action detector that aims to detect any human action from videos in an open world. The figure below shows the model architecture.
3 |
4 |
5 |
6 |
7 |
8 | ## Installation
9 | - Create conda environment:
10 | ```bash
11 | conda create -n openmixer python=3.7
12 | ```
13 |
14 | - Install pytorch:
15 | ```bash
16 | pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117
17 | ```
18 |
19 | - Install other libraries (including the OpenAI-CLIP):
20 | ```bash
21 | pip install -r requirements.txt
22 | ```
23 |
24 | ## Data Preparation
25 | - First, please refer to the MMAction2 [JHMDB](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/jhmdb/README.md) and [UCF24](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/ucf101_24/README.md) dataset preparation steps.
26 |
27 | - Next, please download our released [Open-World splits](https://drive.google.com/drive/folders/1Bu5GNsGIfYD-4u_7WMjBOWZj_3zs-HbJ?usp=sharing). Make sure folders are structured as follows.
28 | ```bash
29 | data
30 | ├──JHMDB
31 | | ├── openworld
32 | | ├── Frames
33 | | ├── JHMDB-MaskRCNN.pkl
34 | | ├── JHMDB-GT.pkl
35 | ├──UCF24
36 | | ├── openworld
37 | | ├── rgb-images
38 | | ├── UCF24-MaskRCNN.pkl
39 | ```
40 |
41 | ## Models
42 |
43 | - Please download the pretrained `CLIP-ViP-B/16` checkpoint from [XPretrain/CLIP-ViP](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP), a video CLIP model that serves as the backbone of our model. After downloading, make sure the file is located at `./pretrained/pretrain_clipvip_base_16.pt`.
44 |
45 | - [Optional] We released three OpenMixer models and inference results for each of the JHMDB and UCF24 datasets here: [Google Drive](https://drive.google.com/drive/folders/1MDT_jcJolNZjuZ15cdhXyJmewMyVBKUP?usp=sharing). They correspond to the configurations in the folder `./config_files/`. Note that for the ZSR+ZSL setting, no model training is needed.
46 |
47 |
48 | ## Training
49 |
50 | We provide an easy-to-use bash script for training and evaluation across different settings and datasets. For example, to train the OpenMixer model under the end-to-end setting on the JHMDB dataset using 4 specified GPUs:
51 | ```bash
52 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash trainval.sh train jhmdb
53 | ```
54 | Optionally, you may change the GPU IDs and switch the dataset name to `ucf24`. For other settings, change `CFG_FILE` in `trainval.sh` to `openmixer_zsr_tl.yaml` to train the OpenMixer model under the ZSR+TL setting.
55 |
56 |
57 | ## Validation
58 | We use the same bash script for validation (inference + evaluation):
59 | ```bash
60 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash trainval.sh eval jhmdb
61 | ```
62 | Optionally, you may change the GPU IDs and switch the dataset name to `ucf24`. For other settings, change `CFG_FILE` in `trainval.sh` to `openmixer_zsr_tl.yaml` or `openmixer_zsr_zsl.yaml` to evaluate models under the ZSR+TL and ZSR+ZSL settings, respectively.
63 |
64 |
65 | ## Acknowledgements
66 | This project is built upon [STMixer](https://github.com/MCG-NJU/STMixer), [CLIP-ViP](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP), and [OpenAI-CLIP](https://github.com/openai/CLIP). We sincerely thank the contributors of all these great open-source repositories!
67 |
68 |
69 | ## Citation
70 |
71 | If this project helps you in your research or project, please cite
72 | our paper:
73 |
74 | ```
75 | @InProceedings{bao2025wacv,
76 | title={Exploiting VLM Localizability and Semantics for Open Vocabulary Action Detection},
77 | author={Wentao Bao and Kai Li and Yuxiao Chen and Deep Patel and Martin Renqiang Min and Yu Kong},
78 | booktitle = {IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
79 | year={2025}
80 | }
81 | ```
82 |
83 |
84 |
--------------------------------------------------------------------------------
/alphaction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/__init__.py
--------------------------------------------------------------------------------
/alphaction/cam/.gitignore:
--------------------------------------------------------------------------------
1 | demos/
2 | hila_clip/
3 | pytorch_grad_cam/
4 | *.gif
5 | *.mp4
--------------------------------------------------------------------------------
/alphaction/cam/README.md:
--------------------------------------------------------------------------------
1 | ## HilaCAM for CLIP Visual Attention
2 |
3 | Please go to [gScoreCAM](https://github.com/anguyen8/gScoreCAM), download the folders `hila_clip/` and `pytorch_grad_cam/`, and put them in this folder.
4 |
5 | The following commands show the steps:
6 | ```shell
7 | cd alphaction/cam
8 | git clone https://github.com/anguyen8/gScoreCAM
9 | cp -r gScoreCAM/hila_clip gScoreCAM/pytorch_grad_cam .
10 | rm -rf gScoreCAM
11 |
12 | ```
13 |
14 | After that, please ensure the following python packages are installed:
15 | ```shell
16 | pip install ttach kornia scikit-learn scikit-image
17 | ```
--------------------------------------------------------------------------------
/alphaction/cam/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/cam/__init__.py
--------------------------------------------------------------------------------
/alphaction/cam/clip_loader.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | #* For CLIP ViT
4 | def reshape_transform(tensor, height=None, width=None):
5 | if height is None or width is None:
6 | grid_square = len(tensor) - 1
7 | if grid_square ** 0.5 % 1 == 0:
8 | height = width = int(grid_square**0.5)
9 | else:
10 | raise ValueError("Heatmap is not square, please set height and width.")
11 | result = tensor[1:, :, :].reshape(
12 | height, width, tensor.size(2))
13 |
14 | # Bring the channels to the first dimension,
15 | # like in CNNs.
16 | result = result.permute(2, 0, 1)
17 | return result.unsqueeze(0)
18 |
19 | def load_clip(clip_version, attn_prob=True, attn_grad=True, attn_last_only=True, resize='adapt', custom=False, model_weight=None):
20 | device = "cuda" if torch.cuda.is_available() else "cpu"
21 | if 'vit' in clip_version.lower() and not custom: #* Not strictly necessary; for experimental usage, hila CLIP will hook all attentions.
22 | from hila_clip import clip
23 | clip_model, preprocess = clip.load(clip_version, device=device, jit=False)
24 |
25 | elif 'clip-vip' in clip_version.lower():
26 | import sys, clip
27 | sys.path.append("../../")
28 | from alphaction.modeling.encoders.clipvip import loader
29 | clip_model, preprocess = loader.load(clip_version,
30 | attn_prob=attn_prob,
31 | attn_grad=attn_grad,
32 | attn_last_only=attn_last_only,
33 | device=device, model_weight=model_weight)
34 |
35 | elif custom:
36 | from hila_clip import clip
37 | clip_model, preprocess = clip.load(clip_version, device=device, jit=False)
38 |
39 | else:
40 | import clip
41 | clip_model, preprocess = clip.load(clip_version, device=device)
42 |
43 | if clip_version.startswith("RN"):
44 | target_layer = clip_model.visual.layer4[-1]
45 | cam_trans = None
46 | elif 'clip-vip' in clip_version.lower():
47 | target_layer = clip_model.vision_model.encoder.layers[-1]
48 | cam_trans = reshape_transform
49 | else:
50 | target_layer = clip_model.visual.transformer.resblocks[-1]
51 | cam_trans = reshape_transform
52 |
53 | if resize == 'raw': # remove clip resizing
54 | if not custom:
55 | raise Exception("Raw input needs to use custom clip.")
56 | preprocess.transforms.pop(0)
57 | preprocess.transforms.pop(0)
58 | elif resize == 'adapt': # adapt to clip size
59 | from torchvision import transforms
60 | crop_size = preprocess.transforms[1].size # resize to crop size so that no information will be cropped
61 | preprocess.transforms.insert(0, transforms.Resize(crop_size))
62 | # clip_model = torch.nn.DataParallel(clip_model)
63 | return clip_model, preprocess, target_layer, cam_trans, clip
64 |
65 | def load_clip_from_checkpoint(checkpoint, model):
66 | checkpoint = torch.load(checkpoint, map_location='cpu')
67 |
68 | # # Use these 3 lines if you use the default model setting (not the training setting) of CLIP. For example, if you set context_length to 100 during training because your strings are very long, then assign 100 to checkpoint['model_state_dict']["context_length"]
69 | # checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224
70 | # checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77
71 | # checkpoint['model_state_dict']["vocab_size"] = model.vocab_size
72 |
73 | model.load_state_dict(checkpoint['model_state_dict'])
74 | return model
--------------------------------------------------------------------------------
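A minimal usage sketch (not part of the repository) for `reshape_transform` above, assuming the repo root is on `PYTHONPATH`: the helper drops the CLS token and reshapes CLIP-ViT patch tokens into a CNN-style `(1, C, H, W)` map that CAM code can consume.

```python
import torch

# Hypothetical usage; assumes the repository root is on PYTHONPATH.
from alphaction.cam.clip_loader import reshape_transform

# Dummy CLIP-ViT output: 1 CLS token + 7*7 patch tokens, batch size 1, 768 channels.
tokens = torch.randn(1 + 7 * 7, 1, 768)

# Drops the CLS token, infers the 7x7 grid, and returns a (1, C, H, W) feature map.
feature_map = reshape_transform(tokens)
print(feature_map.shape)  # torch.Size([1, 768, 7, 7])
```
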
/alphaction/cam/mhsa.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def get_multi_head_mask(attentions, threshold=0.6):
5 | nh, np = attentions.size(0), attentions.size(-1)
6 | # we keep only a certain percentage of the mass
7 | val, idx = torch.sort(attentions)
8 | val /= torch.sum(val, dim=-1, keepdim=True)
9 | cumval = torch.cumsum(val, dim=-1)
10 | th_attn = cumval > (1 - threshold)
11 | idx2 = torch.argsort(idx) # dim=-1 by default
12 | th_attn = th_attn.view(nh, -1)
13 | for head in range(nh):
14 | th_attn[head] = th_attn[head][idx2[head].view(-1)]
15 | if len(attentions.size()) == 3:
16 | th_attn = th_attn.view(nh, -1, np)
17 | return th_attn
18 |
19 |
20 | def get_masked_attention_map(attentions, nh, heatmap_size, cam_size, mask=None):
21 | if mask is not None:
22 | # apply mask on attention map
23 | attentions = attentions * mask.float() # (num_heads, N, L)
24 | # normalize within each frame
25 | attentions -= attentions.min(dim=-1, keepdim=True)[0]
26 | attentions /= attentions.max(dim=-1, keepdim=True)[0]
27 |
28 | num_frames = attentions.size(1) if len(attentions.size()) == 3 else 1
29 | # average over multi-heads as the final attention
30 | attentions = attentions.reshape(nh, num_frames, heatmap_size[0], heatmap_size[1]).mean(dim=0, keepdim=True)
31 |
32 | if cam_size is not None:
33 | # interpolate
34 | attentions = torch.nn.functional.interpolate(attentions, size=(cam_size[1], cam_size[0]), mode="bilinear")[0]
35 | return attentions.cpu().numpy()
36 |
37 |
38 | @torch.no_grad()
39 | def mhsa_clip(image, model, cam_size=None, threshold=0.6):
40 | # get patch token features
41 | _, attn_last = model.encode_image(image, last_attn_output=True) # (B, num_heads, L, D)
42 | nh = attn_last.shape[1] # number of heads
43 |
44 | # we keep only the output patch attention
45 | # assume batch_size = 1
46 | attentions = attn_last[0, :, 0, 1:].reshape(nh, -1) # (num_heads, 7*7)
47 | heatmap_size = [int(attentions.size(-1)**0.5), int(attentions.size(-1)**0.5)] # 7
48 |
49 | th_attn = get_multi_head_mask(attentions, threshold)
50 |
51 | attn_map = get_masked_attention_map(attentions, nh, heatmap_size, cam_size, mask=th_attn) # (1, H, W)
52 |
53 | return attn_map[0]
54 |
55 |
56 | @torch.no_grad()
57 | def mhsa_clipvip(video, model, cam_size=None, threshold=0.6):
58 | """ video: (B, T, C, H, W)
59 | text: (K, L)
60 | cam_size: (W, H)
61 | """
62 | num_proxy = model.config.vision_additional_config.add_cls_num + 1
63 | num_heads = model.config.vision_config.num_attention_heads
64 | num_frames = video.size(1)
65 |
66 | # run forward pass to get the last block attentions
67 | _, heatmap_size = model.get_image_features(video, return_ws=True) # (h,w)
68 | last_block = list(dict(model.vision_model.encoder.layers.named_children()).values())[-1]
69 | attn_inter = last_block.attn_probs['inter'] # [B*num_heads, M, M+N*L] where M=4
70 | attn_intra = last_block.attn_probs['intra'] # [B*num_heads*N, L, M+L] where L=196 if input_size=224
71 |
72 | num_patches = attn_intra.shape[-2] # L
73 | attentions_inter = attn_inter[:, 0, num_proxy:].reshape(-1, num_heads, num_frames, num_patches)[0] # [B*num_heads, N*L] --> [num_heads, N, L]
74 | # attentions_intra = attn_intra[:, 0, num_proxy:].reshape(-1, num_heads, num_frames, num_patches)[0] # [B*num_heads*N, L] --> [num_heads, N, L]
75 |
76 | th_attn = get_multi_head_mask(attentions_inter, threshold)
77 | attn_map = get_masked_attention_map(attentions_inter, num_heads, heatmap_size, cam_size, mask=th_attn) # (T, H, W)
78 |
79 | # temporal weights
80 | temporal_weights = attn_inter[:, 0, num_proxy:].reshape(-1, num_frames, num_patches).sum(dim=-1) # [B*num_heads, N]
81 | temporal_weights = temporal_weights.reshape(-1, num_heads, num_frames)[0].sum(dim=0) # [N]
82 | temporal_weights -= temporal_weights.min(dim=-1, keepdim=True)[0]
83 | temporal_weights /= temporal_weights.max(dim=-1, keepdim=True)[0]
84 | temporal_weights = temporal_weights.cpu().numpy()
85 | attn_map = temporal_weights[:, None, None] * attn_map
86 |
87 | # # visualize the weights
88 | # import matplotlib.pyplot as plt
89 | # import numpy as np
90 | # plt.bar(np.arange(num_frames) + 1, temporal_weights, 0.4)
91 | # plt.xlabel("video frames")
92 | # plt.ylabel("normalized attentions")
93 | # plt.xticks(np.arange(num_frames) + 1)
94 | # plt.tight_layout()
95 | # plt.savefig("../../_temp./temporal_weights.png")
96 |
97 | return attn_map
--------------------------------------------------------------------------------
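A minimal sketch (not part of the repository) of the two helpers above on random attentions, assuming the repo root is on `PYTHONPATH`: `get_multi_head_mask` keeps roughly the top 60% of the attention mass per head, and `get_masked_attention_map` averages the masked heads into a CAM-sized map.

```python
import torch

# Hypothetical usage; assumes the repository root is on PYTHONPATH.
from alphaction.cam.mhsa import get_multi_head_mask, get_masked_attention_map

num_heads, num_patches = 12, 7 * 7
attn = torch.rand(num_heads, num_patches).softmax(dim=-1)  # dummy CLS-to-patch attentions

mask = get_multi_head_mask(attn, threshold=0.6)            # bool mask keeping ~60% of the mass per head
cam = get_masked_attention_map(attn, num_heads, heatmap_size=(7, 7), cam_size=(224, 224), mask=mask)
print(mask.shape, cam.shape)  # torch.Size([12, 49]) (1, 224, 224)
```
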
/alphaction/cam/ritsm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import cv2
4 |
5 |
6 | def clip_forward(model, image, text):
7 | # get patch token features
8 | image_features, encoder_out = model.encode_image(image, transformer_output=True) # (N, L, D)
9 | text_features = model.encode_text(text)
10 |
11 | # cosine similarity as logits
12 | logit_scale = model.logit_scale.exp()
13 | logits_per_image = logit_scale * image_features @ text_features.t()
14 | logits_per_text = logit_scale * text_features @ image_features.t()
15 |
16 | return logits_per_image, encoder_out, text_features
17 |
18 |
19 | @torch.no_grad()
20 | def ritsm_clip(image, text, model, device, index=None, cam_size=None, return_logits=False, attn_grad=False):
21 | # forward pass
22 | logits_per_image, encoder_out, text_features = clip_forward(model, image, text)
23 | probs = logits_per_image.softmax(dim=-1)
24 | if index is None:
25 | # locate the largest score of img-text pair
26 | index = np.argmax(logits_per_image.cpu().data.numpy(), axis=-1)
27 |
28 | input_size = model.visual.input_resolution # 224
29 | patch_features = encoder_out[:, 1:, :] # (B, 7*7, 768)
30 | heatmap_size = int(patch_features.size(1)**0.5) # 7
31 |
32 | # projection
33 | patch_features = model.visual.ln_post(patch_features)
34 | if model.visual.proj is not None:
35 | patch_features = patch_features @ model.visual.proj # (B, 7*7, 512)
36 |
37 | # normalize
38 | patch_features = patch_features / patch_features.norm(dim=-1, keepdim=True)
39 | # text_features = text_features / text_features.norm(dim=-1, keepdim=True) # (K=1, 512)
40 |
41 | # image-text similarity
42 | it_sim = patch_features @ text_features.t() # (B, 7*7, K=1)
43 |
44 | # reshape & resize
45 | image_relevance_all = it_sim[:, :, index].view(-1, 1, heatmap_size, heatmap_size) # (B, 1, 7, 7)
46 | image_relevance_all = torch.nn.functional.interpolate(image_relevance_all.float(), size=input_size, mode='bilinear') # (B, 1, H, W)
47 | image_relevance = image_relevance_all[0] # assume batch_size = 1
48 | image_relevance = image_relevance.reshape(input_size, input_size).detach().cpu().numpy()
49 | image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min())
50 | # reverse
51 | image_relevance = np.fabs(1 - image_relevance)
52 |
53 | out = cv2.resize(image_relevance, cam_size) if cam_size is not None else image_relevance
54 | if return_logits:
55 | return out, logits_per_image
56 | return out
57 |
58 |
59 | @torch.no_grad()
60 | def ritsm_clipvip(video, text, model, device, index=None, cam_size=None, return_logits=False, attn_grad=False, use_mask=False):
61 | """ video: (B, T, C, H, W)
62 | text: (K, L)
63 | """
64 | num_proxy = model.config.vision_additional_config.add_cls_num + 1
65 | eos_idx = text.argmax(dim=-1)
66 | num_frames = video.size(1)
67 |
68 | input_size = model.config.vision_config.image_size # 224
69 | patch_size = model.config.vision_config.patch_size # 16
70 | num_patches = int(input_size // patch_size) # 14
71 |
72 | # run forward pass
73 | out_dict = model(text, video)
74 | logits_per_image = out_dict['logits_per_image']
75 |
76 | if index is None:
77 | # locate the largest score of img-text pair
78 | index = np.argmax(logits_per_image.cpu().data.numpy(), axis=-1)
79 |
80 | # get patch features from the last vision encoder block
81 | patch_features = out_dict['vision_model_output']['last_hidden_state'][:, num_proxy:, :] # (B, T*14*14, 768)
82 | assert num_frames * (num_patches ** 2) == patch_features.size(1)
83 |
84 | # layernorm, projection, and normalization
85 | patch_features = model.vision_model.post_layernorm(patch_features)
86 | patch_features = model.visual_projection(patch_features) # 768 --> 512
87 | patch_features = patch_features / patch_features.norm(dim=-1, keepdim=True) # (B, T*14*14, 512)
88 |
89 | # get the text features
90 | text_features = out_dict['text_embeds'] # after layernorm, projection, and normalization
91 |
92 | # image-text similarity
93 | it_sim = patch_features @ text_features.t() # (B, T*14*14, K=1)
94 |
95 | if use_mask:
96 | th_attn = get_attn_mask(it_sim[0, :, index].view(num_frames, -1), threshold=0.6)
97 | it_sim = it_sim * th_attn.view(-1).unsqueeze(0).unsqueeze(-1)
98 |
99 | # reshape & resize
100 | image_relevance_all = it_sim[:, :, index].view(-1, num_frames, num_patches, num_patches) # (B, T, 14, 14)
101 | image_relevance_all = torch.nn.functional.interpolate(image_relevance_all.float(), size=input_size, mode='bilinear') # (B, T, H, W)
102 |
103 | # assume batch_size = 1
104 | image_relevance_all = image_relevance_all[0]
105 |
106 | all_maps = []
107 | for image_relevance in image_relevance_all:
108 | image_relevance = image_relevance.reshape(input_size, input_size).detach().cpu().numpy()
109 | # normalize and reverse
110 | image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min())
111 | # reverse
112 | image_relevance = np.fabs(1 - image_relevance)
113 | atten_map = cv2.resize(image_relevance, cam_size) if cam_size is not None else image_relevance
114 | all_maps.append(atten_map)
115 |
116 | out = np.stack(all_maps, axis=0)
117 |
118 | if return_logits:
119 | return out, logits_per_image
120 | return out
121 |
122 |
123 | def get_attn_mask(attentions, threshold=0.6):
124 | """ attentions: (T, L)
125 | """
126 | nh = attentions.size(0)
127 | # we keep only a certain percentage of the mass
128 | val, idx = torch.sort(attentions)
129 | val /= torch.sum(val, dim=-1, keepdim=True)
130 | cumval = torch.cumsum(val, dim=-1)
131 | th_attn = cumval > (1 - threshold)
132 | idx2 = torch.argsort(idx) # dim=-1 by default
133 | th_attn = th_attn.view(nh, -1)
134 | for head in range(nh):
135 | th_attn[head] = th_attn[head][idx2[head].view(-1)]
136 | return th_attn
--------------------------------------------------------------------------------
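An illustrative sketch (not part of the repository) of the core step in `ritsm_clipvip`, with random tensors standing in for CLIP-ViP patch and text features: normalized patch features are matched against normalized text features, and the per-patch similarities are reshaped into per-frame relevance maps.

```python
import torch

# Random stand-ins for CLIP-ViP outputs: B=1 video, T=16 frames, 14x14 patches,
# 512-d joint embedding space, K=3 text prompts.
B, T, P, D, K = 1, 16, 14, 512, 3
patch_features = torch.nn.functional.normalize(torch.randn(B, T * P * P, D), dim=-1)
text_features = torch.nn.functional.normalize(torch.randn(K, D), dim=-1)

it_sim = patch_features @ text_features.t()        # (B, T*14*14, K) patch-text similarities
index = 0                                          # index of the best-matching prompt
relevance = it_sim[:, :, index].view(-1, T, P, P)  # (B, T, 14, 14) per-frame relevance maps
relevance = torch.nn.functional.interpolate(relevance, size=224, mode='bilinear')
print(relevance.shape)  # torch.Size([1, 16, 224, 224])
```
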
/alphaction/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .defaults import _C as cfg
2 |
--------------------------------------------------------------------------------
/alphaction/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import make_data_loader
2 |
--------------------------------------------------------------------------------
/alphaction/dataset/build.py:
--------------------------------------------------------------------------------
1 | import bisect
2 | import copy
3 | import torch.utils.data
4 | from alphaction.utils.comm import get_world_size
5 | from . import datasets as D
6 | from . import samplers
7 | from .collate_batch import BatchCollator
8 |
9 | def build_dataset(cfg, split):
10 | if cfg.DATA.DATASETS[0] == 'ucf24':
11 | dataset = D.UCF24(cfg, split)
12 | elif cfg.DATA.DATASETS[0] == 'jhmdb':
13 | dataset = D.Jhmdb(cfg, split)
14 | elif cfg.DATA.DATASETS[0] == 'ava_v2.2':
15 | dataset = D.Ava(cfg, split)
16 | else:
17 | raise NotImplementedError
18 |
19 | return [dataset]
20 |
21 | def make_data_sampler(dataset, shuffle, distributed):
22 | if distributed:
23 | return samplers.DistributedSampler(dataset, shuffle=shuffle)
24 | if shuffle:
25 | sampler = torch.utils.data.sampler.RandomSampler(dataset)
26 | else:
27 | sampler = torch.utils.data.sampler.SequentialSampler(dataset)
28 | return sampler
29 |
30 |
31 | def _quantize(x, bins):
32 | bins = copy.copy(bins)
33 | bins = sorted(bins)
34 | quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
35 | return quantized
36 |
37 |
38 | def _compute_aspect_ratios(dataset):
39 | aspect_ratios = []
40 | for i in range(len(dataset)):
41 | video_info = dataset.get_video_info(i)
42 | aspect_ratio = float(video_info["height"]) / float(video_info["width"])
43 | aspect_ratios.append(aspect_ratio)
44 | return aspect_ratios
45 |
46 |
47 | def make_batch_data_sampler(
48 | dataset, sampler, aspect_grouping, videos_per_batch, num_iters=None, start_iter=0, drop_last=False
49 | ):
50 | if aspect_grouping:
51 | if not isinstance(aspect_grouping, (list, tuple)):
52 | aspect_grouping = [aspect_grouping]
53 | aspect_ratios = _compute_aspect_ratios(dataset)
54 | group_ids = _quantize(aspect_ratios, aspect_grouping)
55 | batch_sampler = samplers.GroupedBatchSampler(
56 | sampler, group_ids, videos_per_batch, drop_uneven=drop_last
57 | )
58 | else:
59 | batch_sampler = torch.utils.data.sampler.BatchSampler(
60 | sampler, videos_per_batch, drop_last=drop_last
61 | )
62 | if num_iters is not None:
63 | batch_sampler = samplers.IterationBasedBatchSampler(
64 | batch_sampler, num_iters, start_iter
65 | )
66 | return batch_sampler
67 |
68 |
69 | def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
70 | num_gpus = get_world_size()
71 | if is_train:
72 | # for training
73 | videos_per_batch = cfg.SOLVER.VIDEOS_PER_BATCH
74 | assert (
75 | videos_per_batch % num_gpus == 0
76 | ), ("SOLVER.VIDEOS_PER_BATCH ({}) must be divisible by the number "
77 | "of GPUs ({}) used.".format(videos_per_batch, num_gpus))
78 | videos_per_gpu = videos_per_batch // num_gpus
79 | shuffle = True
80 | drop_last = True
81 | # num_iters = cfg.SOLVER.MAX_EPOCH*cfg.SOLVER.ITER_PER_EPOCH
82 | split = 'train'
83 | else:
84 | # for testing
85 | videos_per_batch = cfg.TEST.VIDEOS_PER_BATCH
86 | assert (
87 | videos_per_batch % num_gpus == 0
88 | ), ("TEST.VIDEOS_PER_BATCH ({}) must be divisible by the number "
89 | "of GPUs ({}) used.".format(videos_per_batch, num_gpus))
90 | videos_per_gpu = videos_per_batch // num_gpus
91 | shuffle = False if not is_distributed else True
92 | drop_last = False
93 | # num_iters = None
94 | start_iter = 0
95 | split = 'test'
96 |
97 | # group images which have similar aspect ratio. In this case, we only
98 | # group in two cases: those with width / height > 1, and the other way around,
99 | # but the code supports more general grouping strategy
100 | aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []
101 |
102 | # build dataset
103 | datasets = build_dataset(cfg, split=split)
104 |
105 | # build sampler and dataloader
106 | data_loaders, vocabularies, iter_per_epoch_all = [], [], []
107 | for dataset in datasets:
108 | if is_train:
109 | # number of iterations for all epochs
110 | iter_per_epoch = int(len(dataset) // cfg.SOLVER.VIDEOS_PER_BATCH) if cfg.SOLVER.ITER_PER_EPOCH == -1 else cfg.SOLVER.ITER_PER_EPOCH
111 | iter_per_epoch_all.append(iter_per_epoch)
112 | num_iters = cfg.SOLVER.MAX_EPOCH * iter_per_epoch if is_train else None
113 | # sampler
114 | sampler = make_data_sampler(dataset, shuffle, is_distributed)
115 | batch_sampler = make_batch_data_sampler(
116 | dataset, sampler, aspect_grouping, videos_per_gpu, num_iters, start_iter, drop_last
117 | )
118 | collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
119 | num_workers = cfg.DATALOADER.NUM_WORKERS
120 | data_loader = torch.utils.data.DataLoader(
121 | dataset,
122 | num_workers=num_workers,
123 | batch_sampler=batch_sampler,
124 | collate_fn=collator,
125 | )
126 | data_loaders.append(data_loader)
127 | if cfg.DATA.OPEN_VOCABULARY:
128 | vocabularies.append(dataset.text_input)
129 | else:
130 | vocabularies.append(None)
131 | if is_train:
132 | # during training, a single (possibly concatenated) data_loader is returned
133 | assert len(data_loaders) == 1
134 | return data_loaders[0], vocabularies[0]['closed'], iter_per_epoch_all[0]
135 |
136 | vocabularies_val = []
137 | if len(vocabularies) > 0:
138 | for vocab in vocabularies:
139 | if cfg.TEST.EVAL_OPEN and vocab is not None:
140 | vocabularies_val.append(vocab['open'])
141 | else:
142 | vocabularies_val.append(vocab['closed'])
143 |
144 | return data_loaders, vocabularies_val, iter_per_epoch_all
--------------------------------------------------------------------------------
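A small sketch (not part of the repository) of the aspect-ratio grouping used by `make_batch_data_sampler`, assuming the repo root is on `PYTHONPATH`: with `DATALOADER.ASPECT_RATIO_GROUPING` enabled, `aspect_grouping = [1]` and `_quantize` splits videos into two groups by their height/width ratio, so each batch contains clips of similar shape.

```python
# Hypothetical usage; assumes the repository root is on PYTHONPATH.
from alphaction.dataset.build import _quantize

# height / width ratios of five videos
aspect_ratios = [0.56, 0.75, 1.33, 1.0, 1.78]

# bins=[1] -> group 0 for ratio < 1 (landscape), group 1 for ratio >= 1 (portrait/square)
print(_quantize(aspect_ratios, bins=[1]))  # [0, 0, 1, 1, 1]
```
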
/alphaction/dataset/collate_batch.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 |
4 |
5 | def batch_different_videos(videos, size_divisible=0):
6 | '''
7 | :param videos: a list of video tensors
8 | :param size_divisible: output size (width and height) should be divisible by this param
9 | :return: batched videos as a single tensor
10 | '''
11 | assert isinstance(videos, (tuple, list))
12 | max_size = tuple(max(s) for s in zip(*[clip.shape for clip in videos]))
13 |
14 | if size_divisible > 0:
15 | stride = size_divisible
16 | max_size = list(max_size)
17 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride)
18 | max_size[3] = int(math.ceil(max_size[3] / stride) * stride)
19 | max_size = tuple(max_size)
20 |
21 | batch_shape = (len(videos),) + max_size
22 | batched_clips = videos[0].new(*batch_shape).zero_()
23 | for clip, pad_clip in zip(videos, batched_clips):
24 | pad_clip[:clip.shape[0], :clip.shape[1], :clip.shape[2], :clip.shape[3]].copy_(clip)
25 |
26 | return batched_clips
27 |
28 |
29 | class BatchCollator(object):
30 | """
31 | From a list of samples from the dataset,
32 | returns the batched video clips and targets.
33 | This should be passed to the DataLoader
34 | """
35 |
36 | def __init__(self, size_divisible=0):
37 | self.divisible = size_divisible
38 | self.size_divisible = self.divisible
39 |
40 | def __call__(self, batch):
41 | transposed_batch = list(zip(*batch))
42 | slow_clips = batch_different_videos(transposed_batch[0], self.size_divisible)
43 | if transposed_batch[1][0] is not None:
44 | fast_clips = batch_different_videos(transposed_batch[1], self.size_divisible)
45 | else:
46 | fast_clips = None
47 | whwh = torch.stack(transposed_batch[2])
48 | boxes = transposed_batch[3]
49 | label_arrs = transposed_batch[4]
50 | metadata = transposed_batch[5]
51 | clip_ids = transposed_batch[6]
52 | return slow_clips, fast_clips, whwh, boxes, label_arrs, metadata, clip_ids
--------------------------------------------------------------------------------
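A minimal sketch (not part of the repository) of `batch_different_videos`, assuming the repo root is on `PYTHONPATH`: clips with different spatial sizes are zero-padded to a common height and width that is divisible by `size_divisible`.

```python
import torch

# Hypothetical usage; assumes the repository root is on PYTHONPATH.
from alphaction.dataset.collate_batch import batch_different_videos

# Two clips of shape (C, T, H, W) with different spatial sizes.
clips = [torch.randn(3, 8, 224, 300), torch.randn(3, 8, 240, 256)]

# Zero-pads both clips to the max H/W, rounded up to a multiple of 32.
batched = batch_different_videos(clips, size_divisible=32)
print(batched.shape)  # torch.Size([2, 3, 8, 256, 320])
```
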
/alphaction/dataset/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .concat_dataset import ConcatDataset
2 | from .ava_dataset import Ava
3 | from .jhmdb_dataset import Jhmdb
4 | from .ucf24_dataset import UCF24
5 |
6 | __all__ = ["ConcatDataset", "Ava", "Jhmdb", "UCF24"]
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/concat_dataset.py:
--------------------------------------------------------------------------------
1 | import bisect
2 |
3 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
4 |
5 |
6 | class ConcatDataset(_ConcatDataset):
7 | """
8 | Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra
9 | method for querying the sizes of the videos
10 | """
11 |
12 | def get_idxs(self, idx):
13 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
14 | if dataset_idx == 0:
15 | sample_idx = idx
16 | else:
17 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
18 | return dataset_idx, sample_idx
19 |
20 | def get_video_info(self, idx):
21 | dataset_idx, sample_idx = self.get_idxs(idx)
22 | return self.datasets[dataset_idx].get_video_info(sample_idx)
23 |
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from alphaction.dataset import datasets
2 |
3 | from .ava import ava_evaluation
4 | from .jhmdb import jhmdb_evaluation
5 | from .ucf24 import ucf24_evaluation
6 |
7 |
8 | def evaluate(dataset, predictions, output_folder, **kwargs):
9 | """evaluate dataset using different methods based on dataset type.
10 | Args:
11 | dataset: Dataset object
12 | predictions(list[BoxList]): each item in the list represents the
13 | prediction results for one image.
14 | output_folder: output folder, to save evaluation files or results.
15 | **kwargs: other args.
16 | Returns:
17 | evaluation result
18 | """
19 | args = dict(
20 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs
21 | )
22 | if isinstance(dataset, datasets.Ava):
23 | return ava_evaluation(**args)
24 | elif isinstance(dataset, datasets.Jhmdb):
25 | return jhmdb_evaluation(**args)
26 | elif isinstance(dataset, datasets.UCF24):
27 | return ucf24_evaluation(**args)
28 | else:
29 | dataset_name = dataset.__class__.__name__
30 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name))
31 |
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/ava/README.md:
--------------------------------------------------------------------------------
1 | The evaluation code of AVA is modified from [https://github.com/activitynet/ActivityNet](https://github.com/activitynet/ActivityNet).
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/ava/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from .ava_eval import do_ava_evaluation
3 |
4 |
5 | def ava_evaluation(dataset, predictions, output_folder, **kwargs):
6 | logger = logging.getLogger("alphaction.inference")
7 | logger.info("performing ava evaluation.")
8 | return do_ava_evaluation(
9 | dataset=dataset,
10 | predictions=predictions,
11 | output_folder=output_folder,
12 | logger=logger,
13 | metric=kwargs.get('metric', 'frame_ap'),
14 | save_csv=kwargs.get('save_csv', False)
15 | )
16 |
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/jhmdb/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from .jhmdb_eval import do_jhmdb_evaluation
3 |
4 |
5 | def jhmdb_evaluation(dataset, predictions, output_folder, **kwargs):
6 | logger = logging.getLogger("alphaction.inference")
7 | logger.info("performing jhmdb evaluation.")
8 | return do_jhmdb_evaluation(
9 | dataset=dataset,
10 | predictions=predictions,
11 | output_folder=output_folder,
12 | logger=logger,
13 | metric=kwargs.get('metric', 'frame_ap'),
14 | save_csv=kwargs.get('save_csv', False)
15 | )
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/pascal_evaluation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/dataset/datasets/evaluation/pascal_evaluation/__init__.py
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/pascal_evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Functions for computing metrics like precision, recall, CorLoc and etc."""
17 | from __future__ import division
18 |
19 | import numpy as np
20 |
21 |
22 | def compute_precision_recall(scores, labels, num_gt):
23 | """Compute precision and recall.
24 |
25 | Args:
26 | scores: A float numpy array representing detection score
27 | labels: A boolean numpy array representing true/false positive labels
28 | num_gt: Number of ground truth instances
29 |
30 | Raises:
31 | ValueError: if the input is not of the correct format
32 |
33 | Returns:
34 | precision: Fraction of positive instances over detected ones. This value is
35 | None if no ground truth labels are present.
36 | recall: Fraction of detected positive instance over all positive instances.
37 | This value is None if no ground truth labels are present.
38 |
39 | """
40 | if not isinstance(
41 | labels, np.ndarray) or labels.dtype != np.bool or len(labels.shape) != 1:
42 | raise ValueError("labels must be single dimension bool numpy array")
43 |
44 | if not isinstance(
45 | scores, np.ndarray) or len(scores.shape) != 1:
46 | raise ValueError("scores must be single dimension numpy array")
47 |
48 | if num_gt < np.sum(labels):
49 | raise ValueError("Number of true positives must be smaller than num_gt.")
50 |
51 | if len(scores) != len(labels):
52 | raise ValueError("scores and labels must be of the same size.")
53 |
54 | if num_gt == 0:
55 | return None, None
56 |
57 | sorted_indices = np.argsort(scores)
58 | sorted_indices = sorted_indices[::-1]
59 | labels = labels.astype(int)
60 | true_positive_labels = labels[sorted_indices]
61 | false_positive_labels = 1 - true_positive_labels
62 | cum_true_positives = np.cumsum(true_positive_labels)
63 | cum_false_positives = np.cumsum(false_positive_labels)
64 | precision = cum_true_positives.astype(float) / (
65 | cum_true_positives + cum_false_positives)
66 | recall = cum_true_positives.astype(float) / num_gt
67 | return precision, recall
68 |
69 |
70 | def compute_average_precision(precision, recall):
71 | """Compute Average Precision according to the definition in VOCdevkit.
72 |
73 | Precision is modified to ensure that it does not decrease as recall
74 | decrease.
75 |
76 | Args:
77 | precision: A float [N, 1] numpy array of precisions
78 | recall: A float [N, 1] numpy array of recalls
79 |
80 | Raises:
81 | ValueError: if the input is not of the correct format
82 |
83 | Returns:
84 | average_precison: The area under the precision recall curve. NaN if
85 | precision and recall are None.
86 |
87 | """
88 | if precision is None:
89 | if recall is not None:
90 | raise ValueError("If precision is None, recall must also be None")
91 | return np.NAN
92 |
93 | if not isinstance(precision, np.ndarray) or not isinstance(recall,
94 | np.ndarray):
95 | raise ValueError("precision and recall must be numpy array")
96 | if precision.dtype != np.float or recall.dtype != np.float:
97 | raise ValueError("input must be float numpy array.")
98 | if len(precision) != len(recall):
99 | raise ValueError("precision and recall must be of the same size.")
100 | if not precision.size:
101 | return 0.0
102 | if np.amin(precision) < 0 or np.amax(precision) > 1:
103 | raise ValueError("Precision must be in the range of [0, 1].")
104 | if np.amin(recall) < 0 or np.amax(recall) > 1:
105 | raise ValueError("recall must be in the range of [0, 1].")
106 | if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)):
107 | raise ValueError("recall must be a non-decreasing array")
108 |
109 | recall = np.concatenate([[0], recall, [1]])
110 | precision = np.concatenate([[0], precision, [0]])
111 |
112 | # Preprocess precision to be a non-decreasing array
113 | for i in range(len(precision) - 2, -1, -1):
114 | precision[i] = np.maximum(precision[i], precision[i + 1])
115 |
116 | indices = np.where(recall[1:] != recall[:-1])[0] + 1
117 | average_precision = np.sum(
118 | (recall[indices] - recall[indices - 1]) * precision[indices])
119 | return average_precision
120 |
121 |
122 | def compute_cor_loc(num_gt_imgs_per_class,
123 | num_images_correctly_detected_per_class):
124 | """Compute CorLoc according to the definition in the following paper.
125 |
126 | https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf
127 |
128 | Returns nans if there are no ground truth images for a class.
129 |
130 | Args:
131 | num_gt_imgs_per_class: 1D array, representing number of images containing
132 | at least one object instance of a particular class
133 | num_images_correctly_detected_per_class: 1D array, representing number of
134 | images that are correctly detected at least one object instance of a
135 | particular class
136 |
137 | Returns:
138 | corloc_per_class: A float numpy array represents the corloc score of each
139 | class
140 | """
141 | # Divide by zero expected for classes with no gt examples.
142 | with np.errstate(divide="ignore", invalid="ignore"):
143 | return np.where(
144 | num_gt_imgs_per_class == 0, np.nan,
145 | num_images_correctly_detected_per_class / num_gt_imgs_per_class)
146 |
--------------------------------------------------------------------------------
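A small usage sketch (not part of the repository) of the two metric functions above, assuming the repo root is on `PYTHONPATH` and the older NumPy version these vendored files expect (they still reference `np.bool`/`np.float`):

```python
import numpy as np

# Hypothetical usage; assumes the repository root is on PYTHONPATH.
from alphaction.dataset.datasets.evaluation.pascal_evaluation.metrics import (
    compute_average_precision, compute_precision_recall)

scores = np.array([0.9, 0.8, 0.6, 0.3])       # detection confidences
labels = np.array([True, False, True, True])  # true/false positive flag per detection
precision, recall = compute_precision_recall(scores, labels, num_gt=4)
print(compute_average_precision(precision, recall))  # 0.625
```
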
/alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_list.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Numpy BoxList classes and functions."""
17 |
18 | import numpy as np
19 |
20 |
21 | class BoxList(object):
22 | """Box collection.
23 |
24 | BoxList represents a list of bounding boxes as numpy array, where each
25 | bounding box is represented as a row of 4 numbers,
26 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a
27 | given list correspond to a single image.
28 |
29 | Optionally, users can add additional related fields (such as
30 | objectness/classification scores).
31 | """
32 |
33 | def __init__(self, data):
34 | """Constructs box collection.
35 |
36 | Args:
37 | data: a numpy array of shape [N, 4] representing box coordinates
38 |
39 | Raises:
40 | ValueError: if bbox dataset is not a numpy array
41 | ValueError: if invalid dimensions for bbox dataset
42 | """
43 | if not isinstance(data, np.ndarray):
44 | raise ValueError('dataset must be a numpy array.')
45 | if len(data.shape) != 2 or data.shape[1] != 4:
46 | raise ValueError('Invalid dimensions for box dataset.')
47 | if data.dtype != np.float32 and data.dtype != np.float64:
48 | raise ValueError('Invalid dataset type for box dataset: float is required.')
49 | if not self._is_valid_boxes(data):
50 | raise ValueError('Invalid box dataset. dataset must be a numpy array of '
51 | 'N*[y_min, x_min, y_max, x_max]')
52 | self.data = {'boxes': data}
53 |
54 | def num_boxes(self):
55 | """Return number of boxes held in collections."""
56 | return self.data['boxes'].shape[0]
57 |
58 | def get_extra_fields(self):
59 | """Return all non-box fields."""
60 | return [k for k in self.data.keys() if k != 'boxes']
61 |
62 | def has_field(self, field):
63 | return field in self.data
64 |
65 | def add_field(self, field, field_data):
66 | """Add dataset to a specified field.
67 |
68 | Args:
69 | field: a string parameter used to specify a related field to be accessed.
70 | field_data: a numpy array of [N, ...] representing the dataset associated
71 | with the field.
72 | Raises:
73 | ValueError: if the field already exists or the dimension of the field
74 | dataset does not match the number of boxes.
75 | """
76 | if self.has_field(field):
77 | raise ValueError('Field ' + field + ' already exists')
78 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes():
79 | raise ValueError('Invalid dimensions for field dataset')
80 | self.data[field] = field_data
81 |
82 | def get(self):
83 | """Convenience function for accesssing box coordinates.
84 |
85 | Returns:
86 | a numpy array of shape [N, 4] representing box corners
87 | """
88 | return self.get_field('boxes')
89 |
90 | def get_field(self, field):
91 | """Accesses dataset associated with the specified field in the box collection.
92 |
93 | Args:
94 | field: a string parameter used to specify a related field to be accessed.
95 |
96 | Returns:
97 | a numpy 1-d array representing dataset of an associated field
98 |
99 | Raises:
100 | ValueError: if invalid field
101 | """
102 | if not self.has_field(field):
103 | raise ValueError('field {} does not exist'.format(field))
104 | return self.data[field]
105 |
106 | def get_coordinates(self):
107 | """Get corner coordinates of boxes.
108 |
109 | Returns:
110 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max]
111 | """
112 | box_coordinates = self.get()
113 | y_min = box_coordinates[:, 0]
114 | x_min = box_coordinates[:, 1]
115 | y_max = box_coordinates[:, 2]
116 | x_max = box_coordinates[:, 3]
117 | return [y_min, x_min, y_max, x_max]
118 |
119 | def _is_valid_boxes(self, data):
120 | """Check whether dataset fullfills the format of N*[ymin, xmin, ymax, xmin].
121 |
122 | Args:
123 | data: a numpy array of shape [N, 4] representing box coordinates
124 |
125 | Returns:
126 | a boolean indicating whether all ymax of boxes are equal or greater than
127 | ymin, and all xmax of boxes are equal or greater than xmin.
128 | """
129 | if data.shape[0] > 0:
130 | for i in range(data.shape[0]):
131 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]:
132 | return False
133 | return True
134 |
--------------------------------------------------------------------------------
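A brief sketch (not part of the repository) of the `BoxList` API above, assuming the repo root is on `PYTHONPATH`: boxes are stored as `[y_min, x_min, y_max, x_max]` rows, and extra per-box fields such as scores can be attached.

```python
import numpy as np

# Hypothetical usage; assumes the repository root is on PYTHONPATH.
from alphaction.dataset.datasets.evaluation.pascal_evaluation.np_box_list import BoxList

# Two boxes in [y_min, x_min, y_max, x_max] order.
boxes = BoxList(np.array([[10., 20., 50., 80.],
                          [0., 0., 30., 40.]], dtype=np.float32))
boxes.add_field('scores', np.array([0.9, 0.4], dtype=np.float32))
print(boxes.num_boxes(), boxes.get_extra_fields())  # 2 ['scores']
```
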
/alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_mask_list.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Numpy BoxMaskList classes and functions."""
17 |
18 | import numpy as np
19 | from . import np_box_list
20 |
21 |
22 | class BoxMaskList(np_box_list.BoxList):
23 | """Convenience wrapper for BoxList with masks.
24 |
25 | BoxMaskList extends the np_box_list.BoxList to contain masks as well.
26 | In particular, its constructor receives both boxes and masks. Note that the
27 | masks correspond to the full image.
28 | """
29 |
30 | def __init__(self, box_data, mask_data):
31 | """Constructs box collection.
32 |
33 | Args:
34 | box_data: a numpy array of shape [N, 4] representing box coordinates
35 | mask_data: a numpy array of shape [N, height, width] representing masks
36 | with values are in {0,1}. The masks correspond to the full
37 | image. The height and the width will be equal to image height and width.
38 |
39 | Raises:
40 | ValueError: if bbox dataset is not a numpy array
41 | ValueError: if invalid dimensions for bbox dataset
42 | ValueError: if mask dataset is not a numpy array
43 | ValueError: if invalid dimension for mask dataset
44 | """
45 | super(BoxMaskList, self).__init__(box_data)
46 | if not isinstance(mask_data, np.ndarray):
47 | raise ValueError('Mask dataset must be a numpy array.')
48 | if len(mask_data.shape) != 3:
49 | raise ValueError('Invalid dimensions for mask dataset.')
50 | if mask_data.dtype != np.uint8:
51 | raise ValueError('Invalid dataset type for mask dataset: uint8 is required.')
52 | if mask_data.shape[0] != box_data.shape[0]:
53 | raise ValueError('There should be the same number of boxes and masks.')
54 | self.data['masks'] = mask_data
55 |
56 | def get_masks(self):
57 | """Convenience function for accessing masks.
58 |
59 | Returns:
60 | a numpy array of shape [N, height, width] representing masks
61 | """
62 | return self.get_field('masks')
63 |
64 |
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Operations for [N, 4] numpy arrays representing bounding boxes.
17 |
18 | Example box operations that are supported:
19 | * Areas: compute bounding box areas
20 | * IOU: pairwise intersection-over-union scores
21 | """
22 | import numpy as np
23 |
24 |
25 | def area(boxes):
26 | """Computes area of boxes.
27 |
28 | Args:
29 | boxes: Numpy array with shape [N, 4] holding N boxes
30 |
31 | Returns:
32 | a numpy array with shape [N*1] representing box areas
33 | """
34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
35 |
36 |
37 | def intersection(boxes1, boxes2):
38 | """Compute pairwise intersection areas between boxes.
39 |
40 | Args:
41 | boxes1: a numpy array with shape [N, 4] holding N boxes
42 | boxes2: a numpy array with shape [M, 4] holding M boxes
43 |
44 | Returns:
45 | a numpy array with shape [N*M] representing pairwise intersection area
46 | """
47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
49 |
50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
52 | intersect_heights = np.maximum(
53 | np.zeros(all_pairs_max_ymin.shape),
54 | all_pairs_min_ymax - all_pairs_max_ymin)
55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
57 | intersect_widths = np.maximum(
58 | np.zeros(all_pairs_max_xmin.shape),
59 | all_pairs_min_xmax - all_pairs_max_xmin)
60 | return intersect_heights * intersect_widths
61 |
62 |
63 | def iou(boxes1, boxes2):
64 | """Computes pairwise intersection-over-union between box collections.
65 |
66 | Args:
67 | boxes1: a numpy array with shape [N, 4] holding N boxes.
68 | boxes2: a numpy array with shape [M, 4] holding M boxes.
69 |
70 | Returns:
71 | a numpy array with shape [N, M] representing pairwise iou scores.
72 | """
73 | intersect = intersection(boxes1, boxes2)
74 | area1 = area(boxes1)
75 | area2 = area(boxes2)
76 | union = np.expand_dims(area1, axis=1) + np.expand_dims(
77 | area2, axis=0) - intersect
78 | return intersect / union
79 |
80 |
81 | def ioa(boxes1, boxes2):
82 | """Computes pairwise intersection-over-area between box collections.
83 |
84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as
85 | their intersection area over box2's area. Note that ioa is not symmetric,
86 | that is, IOA(box1, box2) != IOA(box2, box1).
87 |
88 | Args:
89 | boxes1: a numpy array with shape [N, 4] holding N boxes.
90 | boxes2: a numpy array with shape [M, 4] holding M boxes.
91 |
92 | Returns:
93 | a numpy array with shape [N, M] representing pairwise ioa scores.
94 | """
95 | intersect = intersection(boxes1, boxes2)
96 | areas = np.expand_dims(area(boxes2), axis=0)
97 | return intersect / areas
98 |
--------------------------------------------------------------------------------
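A quick sketch (not part of the repository) of the box operations above, assuming the repo root is on `PYTHONPATH`: `iou` normalizes the pairwise intersection by the union, while `ioa` normalizes it by the area of the second box.

```python
import numpy as np

# Hypothetical usage; assumes the repository root is on PYTHONPATH.
from alphaction.dataset.datasets.evaluation.pascal_evaluation import np_box_ops

# Boxes in [y_min, x_min, y_max, x_max] order.
boxes1 = np.array([[0., 0., 10., 10.]])
boxes2 = np.array([[0., 0., 10., 10.], [5., 5., 15., 15.]])

print(np_box_ops.iou(boxes1, boxes2))  # [[1.         0.14285714]]
print(np_box_ops.ioa(boxes1, boxes2))  # [[1.   0.25]]
```
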
/alphaction/dataset/datasets/evaluation/pascal_evaluation/np_mask_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Operations for [N, height, width] numpy arrays representing masks.
17 |
18 | Example mask operations that are supported:
19 | * Areas: compute mask areas
20 | * IOU: pairwise intersection-over-union scores
21 | """
22 | import numpy as np
23 |
24 | EPSILON = 1e-7
25 |
26 |
27 | def area(masks):
28 | """Computes area of masks.
29 |
30 | Args:
31 | masks: Numpy array with shape [N, height, width] holding N masks. Masks
32 | values are of type np.uint8 and values are in {0,1}.
33 |
34 | Returns:
35 |     a numpy array with shape [N] representing mask areas.
36 |
37 | Raises:
38 | ValueError: If masks.dtype is not np.uint8
39 | """
40 | if masks.dtype != np.uint8:
41 | raise ValueError('Masks type should be np.uint8')
42 | return np.sum(masks, axis=(1, 2), dtype=np.float32)
43 |
44 |
45 | def intersection(masks1, masks2):
46 | """Compute pairwise intersection areas between masks.
47 |
48 | Args:
49 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks
50 | values are of type np.uint8 and values are in {0,1}.
51 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks
52 | values are of type np.uint8 and values are in {0,1}.
53 |
54 | Returns:
55 |     a numpy array with shape [N, M] representing pairwise intersection areas.
56 |
57 | Raises:
58 | ValueError: If masks1 and masks2 are not of type np.uint8.
59 | """
60 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8:
61 | raise ValueError('masks1 and masks2 should be of type np.uint8')
62 | n = masks1.shape[0]
63 | m = masks2.shape[0]
64 | answer = np.zeros([n, m], dtype=np.float32)
65 | for i in np.arange(n):
66 | for j in np.arange(m):
67 | answer[i, j] = np.sum(np.minimum(masks1[i], masks2[j]), dtype=np.float32)
68 | return answer
69 |
70 |
71 | def iou(masks1, masks2):
72 | """Computes pairwise intersection-over-union between mask collections.
73 |
74 | Args:
75 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks
76 | values are of type np.uint8 and values are in {0,1}.
77 |     masks2: a numpy array with shape [M, height, width] holding M masks. Masks
78 | values are of type np.uint8 and values are in {0,1}.
79 |
80 | Returns:
81 | a numpy array with shape [N, M] representing pairwise iou scores.
82 |
83 | Raises:
84 | ValueError: If masks1 and masks2 are not of type np.uint8.
85 | """
86 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8:
87 | raise ValueError('masks1 and masks2 should be of type np.uint8')
88 | intersect = intersection(masks1, masks2)
89 | area1 = area(masks1)
90 | area2 = area(masks2)
91 | union = np.expand_dims(area1, axis=1) + np.expand_dims(
92 | area2, axis=0) - intersect
93 | return intersect / np.maximum(union, EPSILON)
94 |
95 |
96 | def ioa(masks1, masks2):
97 | """Computes pairwise intersection-over-area between box collections.
98 |
99 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as
100 | their intersection area over mask2's area. Note that ioa is not symmetric,
101 | that is, IOA(mask1, mask2) != IOA(mask2, mask1).
102 |
103 | Args:
104 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks
105 | values are of type np.uint8 and values are in {0,1}.
106 |     masks2: a numpy array with shape [M, height, width] holding M masks. Masks
107 | values are of type np.uint8 and values are in {0,1}.
108 |
109 | Returns:
110 | a numpy array with shape [N, M] representing pairwise ioa scores.
111 |
112 | Raises:
113 | ValueError: If masks1 and masks2 are not of type np.uint8.
114 | """
115 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8:
116 | raise ValueError('masks1 and masks2 should be of type np.uint8')
117 | intersect = intersection(masks1, masks2)
118 | areas = np.expand_dims(area(masks2), axis=0)
119 | return intersect / (areas + EPSILON)
120 |
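Usage sketch (not part of the file above), assuming the repository is importable as `alphaction`; note the hard `np.uint8` dtype requirement and the EPSILON guard against empty masks.

import numpy as np
from alphaction.dataset.datasets.evaluation.pascal_evaluation import np_mask_ops

masks1 = np.zeros((1, 4, 4), dtype=np.uint8)
masks1[0, :2, :] = 1                              # top half of the frame
masks2 = np.ones((1, 4, 4), dtype=np.uint8)       # full frame

print(np_mask_ops.area(masks1))                   # [8.]
print(np_mask_ops.iou(masks1, masks2))            # [[0.5]]
print(np_mask_ops.ioa(masks1, masks2))            # [[0.5]], intersection over masks2's area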
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/pascal_wrapper.py:
--------------------------------------------------------------------------------
1 | from .pascal_evaluation import object_detection_evaluation, standard_fields
2 | import numpy as np
3 |
4 |
5 |
6 | def parse_id(activity_list=None, class_num=24):
7 | if activity_list is None: # use the class ID instead
8 | activity_list = ['Class{}'.format(i) for i in range(class_num)]
9 | # activity_list = ['Basketball', 'BasketballDunk', 'Biking', 'CliffDiving', 'CricketBowling', 'Diving', 'Fencing', 'FloorGymnastics', 'GolfSwing', 'HorseRiding', 'IceDancing', 'LongJump', 'PoleVault', 'RopeClimbing', 'SalsaSpin', 'SkateBoarding', 'Skiing', 'Skijet', 'SoccerJuggling', 'Surfing', 'TennisSwing', 'TrampolineJumping', 'VolleyballSpiking', 'WalkingWithDog']
10 | categories = []
11 | for i, act_name in enumerate(activity_list):
12 | categories.append({'id': i + 1, 'name': act_name})
13 | return categories
14 |
15 |
16 | class STDetectionEvaluaterUCF(object):
17 | '''
18 |     Evaluator class designed for multiple IoU thresholds,
19 |     based on https://github.com/activitynet/ActivityNet/blob/master/Evaluation/get_ava_performance.py
20 |     parameters:
21 |         dataset that provides GT annotations, in the format of AWSCVMotionDataset
22 |         tiou_thresholds: a list of IoU thresholds
23 |     attributes:
24 |         clear(): clear detection results; GT is kept
25 |         load_detection_from_path(): load annotations from a list of paths, in the format of [conf x1 y1 x2 y2 scores x 15]
26 |         evaluate(): run the evaluation code
27 | '''
28 |
29 | def __init__(self, tiou_thresholds=[0.5], load_from_dataset=False, activity_list=None, class_num=24):
30 | categories = parse_id(activity_list=activity_list, class_num=class_num)
31 | self.class_num = class_num
32 | self.categories = categories
33 | self.tiou_thresholds = tiou_thresholds
34 | self.lst_pascal_evaluator = []
35 | self.load_from_dataset = load_from_dataset
36 | self.exclude_key = []
37 | for iou in self.tiou_thresholds:
38 | self.lst_pascal_evaluator.append(
39 | object_detection_evaluation.PascalDetectionEvaluator(categories, matching_iou_threshold=iou))
40 |
41 | def clear(self):
42 | for evaluator in self.lst_pascal_evaluator:
43 | evaluator.clear()
44 |
45 | def load_ground_truth(self, ground_truth):
46 | # write into evaluator
47 | for image_key, info in ground_truth.items():
48 | boxes = info['bbox'].copy() # normalized coordinates
49 | resolution = info['resolution']
50 | boxes_eval = []
51 | for box in boxes:
52 | area = (box[3] - box[1]) * resolution[0] * (box[2] - box[0]) * resolution[1]
53 | if area < 10: continue # ignore too small boxes
54 | boxes_eval.append(box)
55 | if len(boxes_eval) == 0: # no boxes
56 | self.exclude_key.append(image_key) # mark the excluded frames to filter the detections later
57 | continue
58 |
59 | for evaluator in self.lst_pascal_evaluator:
60 | evaluator.add_single_ground_truth_image_info(
61 | image_key, {
62 | standard_fields.InputDataFields.groundtruth_boxes:
63 | np.vstack(boxes_eval),
64 | standard_fields.InputDataFields.groundtruth_classes:
65 | np.array(info['labels'], dtype=int),
66 | standard_fields.InputDataFields.groundtruth_difficult:
67 | np.zeros(len(boxes_eval), dtype=bool)
68 | })
69 |
70 |
71 | def load_detection(self, detections):
72 | """ Load detection results from dict memory
73 | """
74 | for image_key, info in detections.items():
75 | # filtering out results that are in the excluded frames
76 | if image_key in self.exclude_key or len(info['boxes']) == 0:
77 | continue
78 |
79 | # sorted by confidence:
80 | boxes, labels, scores = info['boxes'], info['action_ids'], info['scores']
81 | index = np.argsort(-scores)
82 | boxes, labels, scores = boxes[index], labels[index], scores[index]
83 |
84 | # add info into evaluator
85 | for evaluator in self.lst_pascal_evaluator:
86 | evaluator.add_single_detected_image_info(
87 | image_key, {
88 | standard_fields.DetectionResultFields.detection_boxes: boxes,
89 | standard_fields.DetectionResultFields.detection_classes: labels,
90 | standard_fields.DetectionResultFields.detection_scores: scores
91 | })
92 |
93 | def evaluate(self):
94 | result = {}
95 | for x, iou in enumerate(self.tiou_thresholds):
96 | evaluator = self.lst_pascal_evaluator[x]
97 | metrics = evaluator.evaluate()
98 | result.update(metrics)
99 | return result
100 |
101 |
102 | def frame_mAP_pascal(_results, _targets, vocab, logger, iou_list=[0.5]):
103 | evaluater = STDetectionEvaluaterUCF(tiou_thresholds=iou_list, activity_list=vocab, class_num=len(vocab))
104 |
105 | logger.info("Adding ground truth into evaluator")
106 | evaluater.load_ground_truth(_targets)
107 |
108 | logger.info("Adding predictions into evaluator")
109 | evaluater.load_detection(_results)
110 |
111 | eval_res = evaluater.evaluate()
112 |
113 | return eval_res
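Usage sketch (not part of the file above): the dictionary layouts are inferred from load_ground_truth/load_detection; the image keys, the normalized [x1, y1, x2, y2] box convention, the (height, width) resolution order, and the 1-indexed class IDs are assumptions made only for illustration.

import logging
import numpy as np
from alphaction.dataset.datasets.evaluation.pascal_wrapper import frame_mAP_pascal

logger = logging.getLogger("alphaction.inference")
vocab = ['Basketball', 'Biking']                      # class names (IDs assumed 1-indexed)

targets = {"video_0001,00001": {                      # one annotated frame
    'bbox': np.array([[0.1, 0.1, 0.5, 0.9]]),         # normalized boxes
    'labels': np.array([1]),
    'resolution': (240, 320)}}                        # (height, width), assumed order

results = {"video_0001,00001": {                      # detections for the same frame
    'boxes': np.array([[0.12, 0.1, 0.5, 0.88]]),
    'action_ids': np.array([1]),
    'scores': np.array([0.9])}}

print(frame_mAP_pascal(results, targets, vocab, logger, iou_list=[0.5]))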
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/evaluation/ucf24/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from .ucf24_eval import do_ucf24_evaluation
3 |
4 |
5 | def ucf24_evaluation(dataset, predictions, output_folder, **kwargs):
6 | logger = logging.getLogger("alphaction.inference")
7 | logger.info("performing UCF24 evaluation.")
8 | return do_ucf24_evaluation(
9 | dataset=dataset,
10 | predictions=predictions,
11 | output_folder=output_folder,
12 | logger=logger,
13 | metric=kwargs.get('metric', 'frame_ap'),
14 | save_csv=kwargs.get('save_csv', False)
15 | )
--------------------------------------------------------------------------------
/alphaction/dataset/datasets/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import logging
4 | import numpy as np
5 | import time
6 | import cv2
7 | import torch
8 | from iopath.common.file_io import g_pathmgr
9 | import os
10 | import pickle
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def retry_load_images(image_paths, retry=10, backend="pytorch"):
17 | """
18 |     Load images, with support for retrying when a load fails.
19 | Args:
20 | image_paths (list): paths of images needed to be loaded.
21 |         retry (int, optional): maximum number of load attempts. Defaults to 10.
22 | backend (str): `pytorch` or `cv2`.
23 | Returns:
24 | imgs (list): list of loaded images.
25 | """
26 | for i in range(retry):
27 | imgs = []
28 | for image_path in image_paths:
29 | with g_pathmgr.open(image_path, "rb") as f:
30 | img_str = np.frombuffer(f.read(), np.uint8)
31 | img = cv2.imdecode(img_str, flags=cv2.IMREAD_COLOR)
32 | imgs.append(img)
33 |
34 | if all(img is not None for img in imgs):
35 | if backend == "pytorch":
36 | imgs = torch.as_tensor(np.stack(imgs))
37 | return imgs
38 | else:
39 |             logger.warning("Reading failed. Will retry.")
40 | time.sleep(1.0)
41 | if i == retry - 1:
42 | raise Exception("Failed to load images {}".format(image_paths))
43 |
44 |
45 | def read_greyscale_image(img_file):
46 | assert os.path.exists(img_file), "File does not exist!\n{}".format(img_file)
47 | im = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
48 | im = im.astype(np.float32) / 255.0
49 | im = torch.from_numpy(im)
50 | return im
51 |
52 |
53 | def get_sequence(center_idx, half_len, sample_rate, num_frames):
54 | """
55 | Sample frames among the corresponding clip.
56 | Args:
57 | center_idx (int): center frame idx for current clip
58 | half_len (int): half of the clip length
59 | sample_rate (int): sampling rate for sampling frames inside of the clip
60 | num_frames (int): number of expected sampled frames
61 | Returns:
62 | seq (list): list of indexes of sampled frames in this clip.
63 | """
64 | seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate))
65 |
66 | for seq_idx in range(len(seq)):
67 | if seq[seq_idx] < 0:
68 | seq[seq_idx] = 0
69 | elif seq[seq_idx] >= num_frames:
70 | seq[seq_idx] = num_frames - 1
71 | return seq
72 |
73 | def pack_pathway_output(cfg, frames, pathways=2):
74 | """
75 |     Prepare output as a list of tensors, each corresponding to a
76 |     unique pathway (1 for single-stream models, 2 for SlowFast).
77 | Args:
78 | frames (tensor): frames of images sampled from the video. The
79 | dimension is `channel` x `num frames` x `height` x `width`.
80 | Returns:
81 | frame_list (list): list of tensors with the dimension of
82 | `channel` x `num frames` x `height` x `width`.
83 | """
84 | if cfg.DATA.REVERSE_INPUT_CHANNEL:
85 | frames = frames[[2, 1, 0], :, :, :]
86 | if pathways==1:
87 | frame_list = [frames]
88 | elif pathways==2:
89 | fast_pathway = frames
90 | # Perform temporal sampling from the fast pathway.
91 | slow_pathway = torch.index_select(
92 | frames,
93 | 1,
94 | torch.linspace(
95 | 0, frames.shape[1] - 1, frames.shape[1] // cfg.SLOWFAST.ALPHA
96 | ).long(),
97 | )
98 | frame_list = [slow_pathway, fast_pathway]
99 | else:
100 | raise NotImplementedError()
101 | return frame_list
102 |
103 |
104 | def load_dets_data(det_file, topk=None):
105 | assert os.path.exists(det_file), "detection file does not exist: {}".format(det_file)
106 | with open(det_file, 'rb') as fid:
107 | data = pickle.load(fid, encoding='iso-8859-1')
108 | # get list of all frames
109 | all_dets = dict()
110 | for vid, dets in data.items():
111 | for i in list(dets['boxes'].keys()):
112 | boxes, scores = dets['boxes'][i], dets['scores'][i]
113 | key = "%s,%05d" % (vid, i)
114 | if topk is None:
115 | all_dets[key] = np.hstack((boxes, scores[:, None])) # (n, 5)
116 | else:
117 | indices = np.argsort(scores)[::-1][:topk] # topK maximum indices
118 | all_dets[key] = np.hstack((boxes[indices], scores[indices, None])) # (n, 5)
119 | return all_dets
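Usage sketch (not part of the file above) for get_sequence and pack_pathway_output; the SimpleNamespace is a stand-in for the project's yacs config, populated only with the fields the function actually reads.

import torch
from types import SimpleNamespace
from alphaction.dataset.datasets.utils import get_sequence, pack_pathway_output

# 32 frame indices centered at frame 40, sampled every 2 frames, clamped to [0, 99]
seq = get_sequence(center_idx=40, half_len=32, sample_rate=2, num_frames=100)
print(len(seq), seq[0], seq[-1])

cfg = SimpleNamespace(
    DATA=SimpleNamespace(REVERSE_INPUT_CHANNEL=False),
    SLOWFAST=SimpleNamespace(ALPHA=4))
frames = torch.randn(3, 32, 224, 224)            # C x T x H x W
slow, fast = pack_pathway_output(cfg, frames)    # slow pathway keeps T // ALPHA frames
print(slow.shape, fast.shape)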
--------------------------------------------------------------------------------
/alphaction/dataset/samplers/__init__.py:
--------------------------------------------------------------------------------
1 | from .distributed import DistributedSampler
2 | from .grouped_batch_sampler import GroupedBatchSampler
3 | from .iteration_based_batch_sampler import IterationBasedBatchSampler
4 |
5 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"]
6 |
--------------------------------------------------------------------------------
/alphaction/dataset/samplers/distributed.py:
--------------------------------------------------------------------------------
1 | # Code is copy-pasted exactly as in torch.utils.data.distributed.
2 | # FIXME remove this once c10d fixes the bug it has
3 | import math
4 | import torch
5 | import torch.distributed as dist
6 | from torch.utils.data.sampler import Sampler
7 |
8 |
9 | class DistributedSampler(Sampler):
10 | """Sampler that restricts dataset loading to a subset of the dataset.
11 | It is especially useful in conjunction with
12 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
13 | process can pass a DistributedSampler instance as a DataLoader sampler,
14 | and load a subset of the original dataset that is exclusive to it.
15 | .. note::
16 | Dataset is assumed to be of constant size.
17 | Arguments:
18 | dataset: Dataset used for sampling.
19 | num_replicas (optional): Number of processes participating in
20 | distributed training.
21 | rank (optional): Rank of the current process within num_replicas.
22 | """
23 |
24 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
25 | if num_replicas is None:
26 | if not dist.is_available():
27 | raise RuntimeError("Requires distributed package to be available")
28 | num_replicas = dist.get_world_size()
29 | if rank is None:
30 | if not dist.is_available():
31 | raise RuntimeError("Requires distributed package to be available")
32 | rank = dist.get_rank()
33 | self.dataset = dataset
34 | self.num_replicas = num_replicas
35 | self.rank = rank
36 | self.epoch = 0
37 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
38 | self.total_size = self.num_samples * self.num_replicas
39 | self.shuffle = shuffle
40 |
41 | def __iter__(self):
42 | if self.shuffle:
43 | # deterministically shuffle based on epoch
44 | g = torch.Generator()
45 | g.manual_seed(self.epoch)
46 | indices = torch.randperm(len(self.dataset), generator=g).tolist()
47 | else:
48 | indices = torch.arange(len(self.dataset)).tolist()
49 |
50 | # add extra samples to make it evenly divisible
51 | indices += indices[: (self.total_size - len(indices))]
52 | assert len(indices) == self.total_size
53 |
54 | # subsample
55 | offset = self.num_samples * self.rank
56 | indices = indices[offset : offset + self.num_samples]
57 | assert len(indices) == self.num_samples
58 |
59 | return iter(indices)
60 |
61 | def __len__(self):
62 | return self.num_samples
63 |
64 | def set_epoch(self, epoch):
65 | self.epoch = epoch
66 |
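Usage sketch (not part of the file above): passing num_replicas/rank explicitly avoids needing an initialized process group in this toy example; set_epoch must be called every epoch so the shuffle changes across epochs while staying consistent across processes.

import torch
from torch.utils.data import DataLoader, TensorDataset
from alphaction.dataset.samplers import DistributedSampler

dataset = TensorDataset(torch.arange(10))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
loader = DataLoader(dataset, batch_size=2, sampler=sampler)

for epoch in range(2):
    sampler.set_epoch(epoch)       # reseed the deterministic shuffle
    for batch in loader:
        pass                       # 5 samples on this replica -> 3 batches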
--------------------------------------------------------------------------------
/alphaction/dataset/samplers/grouped_batch_sampler.py:
--------------------------------------------------------------------------------
1 | # Modified based on https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py
2 | import itertools
3 |
4 | import torch
5 | from torch.utils.data.sampler import BatchSampler
6 | from torch.utils.data.sampler import Sampler
7 |
8 |
9 | class GroupedBatchSampler(BatchSampler):
10 | """
11 | Wraps another sampler to yield a mini-batch of indices.
12 |     It enforces that elements from the same group appear in groups of batch_size.
13 |     It also tries to provide mini-batches that follow an ordering as close
14 |     as possible to the ordering from the original sampler.
15 |
16 | Arguments:
17 | sampler (Sampler): Base sampler.
18 | batch_size (int): Size of mini-batch.
19 | drop_uneven (bool): If ``True``, the sampler will drop the batches whose
20 | size is less than ``batch_size``
21 |
22 | """
23 |
24 | def __init__(self, sampler, group_ids, batch_size, drop_uneven=False):
25 | if not isinstance(sampler, Sampler):
26 | raise ValueError(
27 | "sampler should be an instance of "
28 | "torch.utils.dataset.Sampler, but got sampler={}".format(sampler)
29 | )
30 | self.sampler = sampler
31 | self.group_ids = torch.as_tensor(group_ids)
32 | assert self.group_ids.dim() == 1
33 | self.batch_size = batch_size
34 | self.drop_uneven = drop_uneven
35 |
36 | self.groups = torch.unique(self.group_ids).sort(0)[0]
37 |
38 | def _prepare_batches(self):
39 | dataset_size = len(self.group_ids)
40 | # get the sampled indices from the sampler
41 | sampled_ids = torch.as_tensor(list(self.sampler))
42 | # potentially not all elements of the dataset were sampled
43 | # by the sampler (e.g., DistributedSampler).
44 | # construct a tensor which contains -1 if the element was
45 | # not sampled, and a non-negative number indicating the
46 | # order where the element was sampled.
47 |         # for example, if sampled_ids = [3, 1] and dataset_size = 5,
48 | # the order is [-1, 1, -1, 0, -1]
49 | order = torch.full((dataset_size,), -1, dtype=torch.int64)
50 | order[sampled_ids] = torch.arange(len(sampled_ids))
51 |
52 | # get a mask with the elements that were sampled
53 | mask = order >= 0
54 |
55 | # find the elements that belong to each individual cluster
56 | clusters = [(self.group_ids == i) & mask for i in self.groups]
57 | # get relative order of the elements inside each cluster
58 | # that follows the order from the sampler
59 | relative_order = [order[cluster] for cluster in clusters]
60 | # with the relative order, find the absolute order in the
61 | # sampled space
62 | permutation_ids = [s[s.sort()[1]] for s in relative_order]
63 | # permute each cluster so that they follow the order from
64 | # the sampler
65 | permuted_clusters = [sampled_ids[idx] for idx in permutation_ids]
66 |
67 | # splits each cluster in batch_size, and merge as a list of tensors
68 | splits = [c.split(self.batch_size) for c in permuted_clusters]
69 | merged = tuple(itertools.chain.from_iterable(splits))
70 |
71 | # now each batch internally has the right order, but
72 | # they are grouped by clusters. Find the permutation between
73 | # different batches that brings them as close as possible to
74 | # the order that we have in the sampler. For that, we will consider the
75 | # ordering as coming from the first element of each batch, and sort
76 | # correspondingly
77 | first_element_of_batch = [t[0].item() for t in merged]
78 |         # get an inverse mapping from sampled indices to the position where
79 | # they occur (as returned by the sampler)
80 | inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())}
81 | # from the first element in each batch, get a relative ordering
82 | first_index_of_batch = torch.as_tensor(
83 | [inv_sampled_ids_map[s] for s in first_element_of_batch]
84 | )
85 |
86 | # permute the batches so that they approximately follow the order
87 | # from the sampler
88 | permutation_order = first_index_of_batch.sort(0)[1].tolist()
89 | # finally, permute the batches
90 | batches = [merged[i].tolist() for i in permutation_order]
91 |
92 | if self.drop_uneven:
93 | kept = []
94 | for batch in batches:
95 | if len(batch) == self.batch_size:
96 | kept.append(batch)
97 | batches = kept
98 | return batches
99 |
100 | def __iter__(self):
101 | batches = self._prepare_batches()
102 | self._batches = batches
103 | return iter(batches)
104 |
105 | def __len__(self):
106 | if not hasattr(self, "_batches"):
107 | self._batches = self._prepare_batches()
108 | return len(self._batches)
109 |
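Usage sketch (not part of the file above): group_ids is one integer per dataset element (e.g. an aspect-ratio group), and every yielded batch contains indices from a single group.

from torch.utils.data.sampler import SequentialSampler
from alphaction.dataset.samplers import GroupedBatchSampler

group_ids = [0, 1, 0, 1, 0, 1]                   # toy example: two groups, six samples
sampler = SequentialSampler(range(6))
batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size=2, drop_uneven=False)
print(list(batch_sampler))                       # e.g. [[0, 2], [1, 3], [4], [5]]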
--------------------------------------------------------------------------------
/alphaction/dataset/samplers/iteration_based_batch_sampler.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py
2 | from torch.utils.data.sampler import BatchSampler
3 |
4 |
5 | class IterationBasedBatchSampler(BatchSampler):
6 | """
7 | Wraps a BatchSampler, resampling from it until
8 | a specified number of iterations have been sampled
9 | """
10 |
11 | def __init__(self, batch_sampler, num_iterations, start_iter=0):
12 | self.batch_sampler = batch_sampler
13 | self.num_iterations = num_iterations
14 | self.start_iter = start_iter
15 |
16 | def __iter__(self):
17 | iteration = self.start_iter
18 | while iteration <= self.num_iterations:
19 | # if the underlying sampler has a set_epoch method, like
20 | # DistributedSampler, used for making each process see
21 | # a different split of the dataset, then set it
22 | if hasattr(self.batch_sampler.sampler, "set_epoch"):
23 | self.batch_sampler.sampler.set_epoch(iteration)
24 | for batch in self.batch_sampler:
25 | iteration += 1
26 | if iteration > self.num_iterations:
27 | break
28 | yield batch
29 |
30 | def __len__(self):
31 | return self.num_iterations
32 |
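Usage sketch (not part of the file above): wrapping a finite BatchSampler so that training is driven by an iteration budget (as with a MAX_ITER-style solver setting) rather than by epochs.

from torch.utils.data.sampler import BatchSampler, RandomSampler
from alphaction.dataset.samplers import IterationBasedBatchSampler

base = BatchSampler(RandomSampler(range(8)), batch_size=4, drop_last=False)
iter_sampler = IterationBasedBatchSampler(base, num_iterations=10, start_iter=0)
print(len(list(iter_sampler)))   # 10 batches, cycling over the underlying sampler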
--------------------------------------------------------------------------------
/alphaction/engine/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/engine/__init__.py
--------------------------------------------------------------------------------
/alphaction/engine/feature_extraction.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch
3 | from tqdm import tqdm
4 | import time
5 | import datetime
6 | from alphaction.utils.comm import get_rank, synchronize, get_world_size
7 |
8 |
9 | def do_feature_extraction(model_ddp, data_loader, distributed):
10 |
11 | device = torch.device("cuda")
12 | num_devices = get_world_size()
13 | dataset = data_loader.dataset
14 |
15 | if dataset.finished_feat_extraction():
16 | return
17 | logger = logging.getLogger("alphaction.feature_extraction.{}".format(dataset._split))
18 |
19 | logger.info("Start feature extraction on {} dataset({} videos).".format(dataset.__class__.__name__, len(dataset)))
20 | start_time = time.time()
21 | model = model_ddp.module if distributed else model_ddp
22 | model.eval()
23 |
24 | extra_args = {} if get_world_size() == 1 else dict(desc="feature extracting", disable=(not get_rank()==0))
25 |
26 | with torch.no_grad():
27 | for i, batch in tqdm(enumerate(data_loader), **extra_args):
28 | video, _, whwh, boxes, _, metadata, idx = batch
29 | video = video.to(device)
30 |
31 | # extract patch token features and CLS token feature
32 | features, cls_feat = model.backbone([video])
33 | # extract text features
34 | text_features = model.backbone.forward_text(device=device)
35 |
36 | # save torch tensors
37 | dataset.save_features(idx, features[0].cpu(), cls_feat.cpu(), text_features.cpu())
38 |
39 | if dataset.finished_feat_extraction():
40 | logger.info("Finished feature extraction. ")
41 | break # check if all samples are processed
42 |
43 | synchronize()
44 | total_time = time.time() - start_time
45 | total_time_str = str(datetime.timedelta(seconds=total_time))
46 | logger.info("Feature extraction time: {} ({} s / video per device, on {} devices)".format(
47 | total_time_str, total_time * num_devices / len(dataset), num_devices))
48 |
--------------------------------------------------------------------------------
/alphaction/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .batch_norm import FrozenBatchNorm1d, FrozenBatchNorm2d, FrozenBatchNorm3d
2 |
3 | __all__ = [ "FrozenBatchNorm1d", "FrozenBatchNorm2d", "FrozenBatchNorm3d"]
4 |
5 |
--------------------------------------------------------------------------------
/alphaction/layers/batch_norm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 |
4 |
5 | class _FrozenBatchNorm(nn.Module):
6 | def __init__(self, num_features, eps=1e-5, affine=True, track_running_stats=True):
7 | super(_FrozenBatchNorm, self).__init__()
8 | self.num_features = num_features
9 | self.eps = eps
10 | self.affine = affine
11 | self.track_running_stats = track_running_stats
12 | if self.affine:
13 | self.register_buffer("weight", torch.Tensor(num_features))
14 | self.register_buffer("bias", torch.Tensor(num_features))
15 | else:
16 | self.register_buffer("weight", None)
17 | self.register_buffer("bias", None)
18 | if self.track_running_stats:
19 | self.register_buffer('running_mean', torch.zeros(num_features))
20 | self.register_buffer('running_var', torch.ones(num_features))
21 | else:
22 | self.register_parameter('running_mean', None)
23 | self.register_parameter('running_var', None)
24 | self.reset_parameters()
25 |
26 | def reset_running_stats(self):
27 | if self.track_running_stats:
28 | self.running_mean.zero_()
29 | self.running_var.fill_(1)
30 |
31 | def reset_parameters(self):
32 | self.reset_running_stats()
33 | if self.affine:
34 | self.weight.data.uniform_()
35 | self.bias.data.zero_()
36 |
37 | def _check_input_dim(self, input):
38 | raise NotImplementedError
39 |
40 | def forward(self, input):
41 | self._check_input_dim(input)
42 | view_shape = (1, self.num_features) + (1,) * (input.dim() - 2)
43 |
44 | if self.track_running_stats:
45 | scale = self.weight / (self.running_var + self.eps).sqrt()
46 | bias = self.bias - self.running_mean * scale
47 | else:
48 | scale = self.weight
49 | bias = self.bias
50 |
51 | return scale.view(*view_shape) * input + bias.view(*view_shape)
52 |
53 | def extra_repr(self):
54 | return '{num_features}, eps={eps}, affine={affine}, ' \
55 | 'track_running_stats={track_running_stats}'.format(**self.__dict__)
56 |
57 | def _load_from_state_dict(self, state_dict, prefix, metadata, strict,
58 | missing_keys, unexpected_keys, error_msgs):
59 | num_batches_tracked_key = prefix + 'num_batches_tracked'
60 | if num_batches_tracked_key in state_dict:
61 | del state_dict[num_batches_tracked_key]
62 | super(_FrozenBatchNorm, self)._load_from_state_dict(
63 | state_dict, prefix, metadata, strict,
64 | missing_keys, unexpected_keys, error_msgs)
65 |
66 |
67 | class FrozenBatchNorm1d(_FrozenBatchNorm):
68 | def _check_input_dim(self, input):
69 | if input.dim() != 2 and input.dim() != 3:
70 | raise ValueError('expected 2D or 3D input (got {}D input)'
71 | .format(input.dim()))
72 |
73 |
74 | class FrozenBatchNorm2d(_FrozenBatchNorm):
75 | def _check_input_dim(self, input):
76 | if input.dim() != 4:
77 | raise ValueError('expected 4D input (got {}D input)'
78 | .format(input.dim()))
79 |
80 |
81 | class FrozenBatchNorm3d(_FrozenBatchNorm):
82 | def _check_input_dim(self, input):
83 | if input.dim() != 5:
84 | raise ValueError('expected 5D input (got {}D input)'
85 | .format(input.dim()))
86 |
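Usage sketch (not part of the file above): a frozen batch norm applies a fixed affine transform built from stored statistics, so in eval mode it matches a regular BatchNorm3d whose state it loads (num_batches_tracked is dropped automatically by _load_from_state_dict).

import torch
from alphaction.layers import FrozenBatchNorm3d

bn = torch.nn.BatchNorm3d(8).eval()
frozen = FrozenBatchNorm3d(8)
frozen.load_state_dict(bn.state_dict())          # running stats + affine weights

x = torch.randn(2, 8, 4, 16, 16)                 # N x C x T x H x W
with torch.no_grad():
    print(torch.allclose(bn(x), frozen(x), atol=1e-5))   # True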
--------------------------------------------------------------------------------
/alphaction/modeling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/__init__.py
--------------------------------------------------------------------------------
/alphaction/modeling/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | from .backbone import build_backbone
--------------------------------------------------------------------------------
/alphaction/modeling/backbone/backbone.py:
--------------------------------------------------------------------------------
1 | from alphaction.modeling import registry
2 | from . import slowfast, i3d, video_model_builder
3 |
4 | @registry.BACKBONES.register("Slowfast-Resnet50")
5 | @registry.BACKBONES.register("Slowfast-Resnet101")
6 | def build_slowfast_resnet_backbone(cfg):
7 | model = slowfast.SlowFast(cfg)
8 | return model
9 |
10 | @registry.BACKBONES.register("PySlowonly")
11 | def build_pyslowonly_resnet_backbone(cfg):
12 | model = video_model_builder.ResNet(cfg)
13 | return model
14 |
15 | @registry.BACKBONES.register("PySlowfast-R50")
16 | @registry.BACKBONES.register("PySlowfast-R101")
17 | def build_pyslowfast_resnet_backbone(cfg):
18 | model = video_model_builder.SlowFast(cfg)
19 | return model
20 |
21 | @registry.BACKBONES.register("MAE-ViT-B")
22 | @registry.BACKBONES.register("MAE-ViT-L")
23 | def build_mae_vit_backbone(cfg):
24 | model = video_model_builder.ViT(cfg)
25 | return model
26 |
27 | @registry.BACKBONES.register("I3D-Resnet50")
28 | @registry.BACKBONES.register("I3D-Resnet101")
29 | @registry.BACKBONES.register("I3D-Resnet50-Sparse")
30 | @registry.BACKBONES.register("I3D-Resnet101-Sparse")
31 | def build_i3d_resnet_backbone(cfg):
32 | model = i3d.I3D(cfg)
33 | return model
34 |
35 | # OpenAI CLIP
36 | @registry.BACKBONES.register("ViT-B/16")
37 | @registry.BACKBONES.register("ViT-B/32")
38 | @registry.BACKBONES.register("ViT-L/14")
39 | def build_clip_vit_backbone(cfg):
40 | from alphaction.modeling.encoders.openai_clip.clip_encoder import build_clip_backbone
41 | model = build_clip_backbone(cfg)
42 | return model
43 |
44 | # CLIP-ViP
45 | @registry.BACKBONES.register("ViP-B/16")
46 | @registry.BACKBONES.register("ViP-B/32")
47 | def build_clipvip_backbone(cfg):
48 | from alphaction.modeling.encoders.clipvip.clipvip_encoder import build_clipvip_backbone
49 | model = build_clipvip_backbone(cfg)
50 | return model
51 |
52 | # ViCLIP from InternVideo
53 | @registry.BACKBONES.register("ViCLIP-L/14")
54 | def build_viclip_backbone(cfg):
55 | from alphaction.modeling.encoders.viclip.viclip_encoder import build_viclip_backbone
56 | model = build_viclip_backbone(cfg)
57 | return model
58 |
59 |
60 | def build_backbone(cfg):
61 | assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \
62 | "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format(
63 | cfg.MODEL.BACKBONE.CONV_BODY
64 | )
65 | return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg)
66 |
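A sketch of how the registry above is extended (the backbone name and factory below are hypothetical, shown only to illustrate the pattern): any factory registered under a key can then be selected by setting MODEL.BACKBONE.CONV_BODY to that key in the config.

import torch.nn as nn
from alphaction.modeling import registry

@registry.BACKBONES.register("Dummy-Backbone")   # hypothetical key, for illustration
def build_dummy_backbone(cfg):
    # a real backbone would return a module exposing the interface the
    # detectors expect (e.g. dim_out, forward over the video pathways)
    return nn.Identity()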
--------------------------------------------------------------------------------
/alphaction/modeling/backbone/i3d.py:
--------------------------------------------------------------------------------
1 | from __future__ import (absolute_import, division, print_function,
2 | unicode_literals)
3 |
4 | import torch.nn as nn
5 | from alphaction.layers import FrozenBatchNorm3d
6 | from alphaction.modeling.common_blocks import ResNLBlock
7 |
8 |
9 | def get_model_cfg(cfg):
10 | backbone_strs = cfg.MODEL.BACKBONE.CONV_BODY.split('-')[1:]
11 | error_msg = 'Model backbone {} is not supported.'.format(cfg.MODEL.BACKBONE.CONV_BODY)
12 |
13 | use_temp_convs_1 = [2]
14 | temp_strides_1 = [2]
15 | max_pool_stride_1 = 2
16 |
17 | use_temp_convs_2 = [1, 1, 1]
18 | temp_strides_2 = [1, 1, 1]
19 | max_pool_stride_2 = 2
20 |
21 | use_temp_convs_3 = [1, 0, 1, 0]
22 | temp_strides_3 = [1, 1, 1, 1]
23 |
24 | use_temp_convs_5 = [0, 1, 0]
25 | temp_strides_5 = [1, 1, 1]
26 |
27 | avg_pool_stride = int(cfg.INPUT.FRAME_NUM / 8)
28 | if backbone_strs[0] == 'Resnet50':
29 | block_config = (3, 4, 6, 3)
30 |
31 | use_temp_convs_4 = [1, 0, 1, 0, 1, 0]
32 | temp_strides_4 = [1, 1, 1, 1, 1, 1]
33 | elif backbone_strs[0] == 'Resnet101':
34 | block_config = (3, 4, 23, 3)
35 |
36 | use_temp_convs_4 = []
37 | for i in range(23):
38 | if i % 2 == 0:
39 | use_temp_convs_4.append(1)
40 | else:
41 | use_temp_convs_4.append(0)
42 | temp_strides_4 = [1, ] * 23
43 | else:
44 | raise KeyError(error_msg)
45 |
46 | if len(backbone_strs) > 1:
47 | if len(backbone_strs) == 2 and backbone_strs[1] == 'Sparse':
48 | temp_strides_1 = [1]
49 | max_pool_stride_1 = 1
50 | avg_pool_stride = int(cfg.INPUT.FRAME_NUM / 2)
51 | else:
52 | raise KeyError(error_msg)
53 |
54 | use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5]
55 | temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5]
56 | pool_strides_set = [max_pool_stride_1, max_pool_stride_2, avg_pool_stride]
57 | return block_config, use_temp_convs_set, temp_strides_set, pool_strides_set
58 |
59 |
60 | class I3D(nn.Module):
61 | def __init__(self, cfg):
62 | super(I3D, self).__init__()
63 |
64 | self.cfg = cfg.clone()
65 |
66 | block_config, use_temp_convs_set, temp_strides_set, pool_strides_set = get_model_cfg(cfg)
67 | conv3_nonlocal = cfg.MODEL.BACKBONE.I3D.CONV3_NONLOCAL
68 | conv4_nonlocal = cfg.MODEL.BACKBONE.I3D.CONV4_NONLOCAL
69 |
70 | dim_inner = 64
71 | conv_dims = [64, 256, 512, 1024, 2048]
72 | self.dim_out = conv_dims[-1]
73 | n1, n2, n3, n4 = block_config
74 | layer_mod = 2
75 | conv3_nl_mod = layer_mod
76 | conv4_nl_mod = layer_mod
77 | if not conv3_nonlocal:
78 | conv3_nl_mod = 1000
79 | if not conv4_nonlocal:
80 | conv4_nl_mod = 1000
81 | self.c2_mapping = None
82 |
83 | data_dim = 3
84 | self.conv1 = nn.Conv3d(data_dim, conv_dims[0], (1 + use_temp_convs_set[0][0] * 2, 7, 7),
85 | stride=(temp_strides_set[0][0], 2, 2),
86 | padding=(use_temp_convs_set[0][0], 3, 3), bias=False)
87 | nn.init.kaiming_normal_(self.conv1.weight)
88 |
89 | if cfg.MODEL.BACKBONE.FROZEN_BN:
90 | self.bn1 = FrozenBatchNorm3d(conv_dims[0], eps=cfg.MODEL.BACKBONE.BN_EPSILON)
91 | nn.init.constant_(self.bn1.weight, 1.0)
92 | nn.init.constant_(self.bn1.bias, 0.0)
93 | else:
94 | self.bn1 = nn.BatchNorm3d(conv_dims[0], eps=cfg.MODEL.BACKBONE.BN_EPSILON, momentum=cfg.MODEL.BACKBONE.BN_MOMENTUM)
95 |
96 | self.relu = nn.ReLU(inplace=True)
97 | self.maxpool1 = nn.MaxPool3d((pool_strides_set[0], 3, 3), stride=(pool_strides_set[0], 2, 2))
98 |
99 | self.res_nl1 = ResNLBlock(cfg, conv_dims[0], conv_dims[1], stride=1, num_blocks=n1, dim_inner=dim_inner,
100 | use_temp_convs=use_temp_convs_set[1], temp_strides=temp_strides_set[1])
101 | self.maxpool2 = nn.MaxPool3d((pool_strides_set[1], 1, 1), stride=(pool_strides_set[1], 1, 1))
102 |
103 | self.res_nl2 = ResNLBlock(cfg, conv_dims[1], conv_dims[2], stride=2, num_blocks=n2,
104 | dim_inner=dim_inner * 2, use_temp_convs=use_temp_convs_set[2],
105 | temp_strides=temp_strides_set[2], nonlocal_mod=conv3_nl_mod,
106 | group_nonlocal=cfg.MODEL.BACKBONE.I3D.CONV3_GROUP_NL)
107 |
108 | self.res_nl3 = ResNLBlock(cfg, conv_dims[2], conv_dims[3], stride=2, num_blocks=n3,
109 | dim_inner=dim_inner * 4, use_temp_convs=use_temp_convs_set[3],
110 | temp_strides=temp_strides_set[3], nonlocal_mod=conv4_nl_mod)
111 |
112 | self.res_nl4 = ResNLBlock(cfg, conv_dims[3], conv_dims[4], stride=1, num_blocks=n4,
113 | dim_inner=dim_inner * 8, use_temp_convs=use_temp_convs_set[4],
114 | temp_strides=temp_strides_set[4],
115 | dilation=2)
116 |
117 | def forward(self, _, x):
118 | # We only use fast videos, which is the second input.
119 | out = self.conv1(x)
120 | out = self.bn1(out)
121 | out = self.relu(out)
122 | out = self.maxpool1(out)
123 |
124 | out = self.res_nl1(out)
125 | out = self.maxpool2(out)
126 |
127 | out = self.res_nl2(out)
128 |
129 | out = self.res_nl3(out)
130 |
131 | out = self.res_nl4(out)
132 | return None, out
133 |
134 | def c2_weight_mapping(self):
135 | if self.c2_mapping is None:
136 | weight_map = {'conv1.weight': 'conv1_w',
137 | 'bn1.weight': 'res_conv1_bn_s',
138 | 'bn1.bias': 'res_conv1_bn_b',
139 | 'bn1.running_mean': 'res_conv1_bn_rm',
140 | 'bn1.running_var': 'res_conv1_bn_riv'}
141 | for i in range(1, 5):
142 | name = 'res_nl{}'.format(i)
143 | child_map = getattr(self, name).c2_weight_mapping()
144 | for key, val in child_map.items():
145 | new_key = name + '.' + key
146 | weight_map[new_key] = val.format(i + 1)
147 | self.c2_mapping = weight_map
148 | return self.c2_mapping
149 |
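A small sketch (not part of the file above) of what get_model_cfg produces; the SimpleNamespace is a stand-in for the yacs config, populated only with the fields the function reads.

from types import SimpleNamespace
from alphaction.modeling.backbone.i3d import get_model_cfg

cfg = SimpleNamespace(
    MODEL=SimpleNamespace(BACKBONE=SimpleNamespace(CONV_BODY='I3D-Resnet50-Sparse')),
    INPUT=SimpleNamespace(FRAME_NUM=32))
block_config, temp_convs, temp_strides, pool_strides = get_model_cfg(cfg)
print(block_config)    # (3, 4, 6, 3) for the ResNet-50 variants
print(pool_strides)    # [1, 2, 16]: the Sparse variant keeps full temporal resolution in conv1/pool1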
--------------------------------------------------------------------------------
/alphaction/modeling/backbone/sfmodels/common.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 |
7 | class Mlp(nn.Module):
8 | def __init__(
9 | self,
10 | in_features,
11 | hidden_features=None,
12 | out_features=None,
13 | act_layer=nn.GELU,
14 | drop_rate=0.0,
15 | ):
16 | super().__init__()
17 | self.drop_rate = drop_rate
18 | out_features = out_features or in_features
19 | hidden_features = hidden_features or in_features
20 | self.fc1 = nn.Linear(in_features, hidden_features)
21 | self.act = act_layer()
22 | self.fc2 = nn.Linear(hidden_features, out_features)
23 | if self.drop_rate > 0.0:
24 | self.drop = nn.Dropout(drop_rate)
25 |
26 | def forward(self, x):
27 | x = self.fc1(x)
28 | x = self.act(x)
29 | if self.drop_rate > 0.0:
30 | x = self.drop(x)
31 | x = self.fc2(x)
32 | if self.drop_rate > 0.0:
33 | x = self.drop(x)
34 | return x
35 |
36 |
37 | class Permute(nn.Module):
38 | def __init__(self, dims):
39 | super().__init__()
40 | self.dims = dims
41 |
42 | def forward(self, x):
43 | return x.permute(*self.dims)
44 |
45 |
46 | def drop_path(x, drop_prob: float = 0.0, training: bool = False):
47 | """
48 | Stochastic Depth per sample.
49 | """
50 | if drop_prob == 0.0 or not training:
51 | return x
52 | keep_prob = 1 - drop_prob
53 | shape = (x.shape[0],) + (1,) * (
54 | x.ndim - 1
55 | ) # work with diff dim tensors, not just 2D ConvNets
56 | mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
57 | mask.floor_() # binarize
58 | output = x.div(keep_prob) * mask
59 | return output
60 |
61 |
62 | class DropPath(nn.Module):
63 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
64 |
65 | def __init__(self, drop_prob=None):
66 | super(DropPath, self).__init__()
67 | self.drop_prob = drop_prob
68 |
69 | def forward(self, x):
70 | return drop_path(x, self.drop_prob, self.training)
71 |
72 |
73 | class TwoStreamFusion(nn.Module):
74 | def __init__(self, mode, dim=None, kernel=3, padding=1):
75 | """
76 |         A general constructor for neural modules fusing two equal-sized tensors
77 |         in forward. The following options are supported:
78 | "add" / "max" / "min" / "avg" : respective operations on the two halves.
79 | "concat" : NOOP.
80 | "concat_linear_{dim_mult}_{drop_rate}" : MLP to fuse with hidden dim "dim_mult"
81 | (optional, def 1.) higher than input dim
82 | with optional dropout "drop_rate" (def: 0.)
83 | "ln+concat_linear_{dim_mult}_{drop_rate}" : perform MLP after layernorm on the input.
84 | """
85 | super().__init__()
86 | self.mode = mode
87 | if mode == "add":
88 | self.fuse_fn = lambda x: torch.stack(torch.chunk(x, 2, dim=2)).sum(
89 | dim=0
90 | )
91 | elif mode == "max":
92 | self.fuse_fn = (
93 | lambda x: torch.stack(torch.chunk(x, 2, dim=2))
94 | .max(dim=0)
95 | .values
96 | )
97 | elif mode == "min":
98 | self.fuse_fn = (
99 | lambda x: torch.stack(torch.chunk(x, 2, dim=2))
100 | .min(dim=0)
101 | .values
102 | )
103 | elif mode == "avg":
104 | self.fuse_fn = lambda x: torch.stack(torch.chunk(x, 2, dim=2)).mean(
105 | dim=0
106 | )
107 | elif mode == "concat":
108 | # x itself is the channel concat version
109 | self.fuse_fn = lambda x: x
110 | elif "concat_linear" in mode:
111 | if len(mode.split("_")) == 2:
112 | dim_mult = 1.0
113 | drop_rate = 0.0
114 | elif len(mode.split("_")) == 3:
115 | dim_mult = float(mode.split("_")[-1])
116 | drop_rate = 0.0
117 |
118 | elif len(mode.split("_")) == 4:
119 | dim_mult = float(mode.split("_")[-2])
120 | drop_rate = float(mode.split("_")[-1])
121 | else:
122 | raise NotImplementedError
123 |
124 | if mode.split("+")[0] == "ln":
125 | self.fuse_fn = nn.Sequential(
126 | nn.LayerNorm(dim),
127 | Mlp(
128 | in_features=dim,
129 | hidden_features=int(dim * dim_mult),
130 | act_layer=nn.GELU,
131 | out_features=dim,
132 | drop_rate=drop_rate,
133 | ),
134 | )
135 | else:
136 | self.fuse_fn = Mlp(
137 | in_features=dim,
138 | hidden_features=int(dim * dim_mult),
139 | act_layer=nn.GELU,
140 | out_features=dim,
141 | drop_rate=drop_rate,
142 | )
143 |
144 | else:
145 | raise NotImplementedError
146 |
147 | def forward(self, x):
148 | if "concat_linear" in self.mode:
149 | return self.fuse_fn(x) + x
150 |
151 | else:
152 | return self.fuse_fn(x)
--------------------------------------------------------------------------------
/alphaction/modeling/backbone/sfmodels/nonlocal_helper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 |
4 | """Non-local helper"""
5 |
6 | import torch
7 | import torch.nn as nn
8 |
9 |
10 | class Nonlocal(nn.Module):
11 | """
12 | Builds Non-local Neural Networks as a generic family of building
13 | blocks for capturing long-range dependencies. Non-local Network
14 | computes the response at a position as a weighted sum of the
15 | features at all positions. This building block can be plugged into
16 | many computer vision architectures.
17 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf
18 | """
19 |
20 | def __init__(
21 | self,
22 | dim,
23 | dim_inner,
24 | pool_size=None,
25 | instantiation="softmax",
26 | zero_init_final_conv=False,
27 | zero_init_final_norm=True,
28 | norm_eps=1e-5,
29 | norm_momentum=0.1,
30 | norm_module=nn.BatchNorm3d,
31 | ):
32 | """
33 | Args:
34 | dim (int): number of dimension for the input.
35 | dim_inner (int): number of dimension inside of the Non-local block.
36 |             pool_size (list): kernel size of the spatio-temporal pooling, given
37 |                 in order as [temporal kernel, spatial kernel (height),
38 |                 spatial kernel (width)]. By default pool_size is None,
39 |                 in which case no pooling is used.
40 | instantiation (string): supports two different instantiation method:
41 | "dot_product": normalizing correlation matrix with L2.
42 | "softmax": normalizing correlation matrix with Softmax.
43 | zero_init_final_conv (bool): If true, zero initializing the final
44 | convolution of the Non-local block.
45 | zero_init_final_norm (bool):
46 | If true, zero initializing the final batch norm of the Non-local
47 | block.
48 | norm_module (nn.Module): nn.Module for the normalization layer. The
49 | default is nn.BatchNorm3d.
50 | """
51 | super(Nonlocal, self).__init__()
52 | self.dim = dim
53 | self.dim_inner = dim_inner
54 | self.pool_size = pool_size
55 | self.instantiation = instantiation
56 | self.use_pool = (
57 | False
58 | if pool_size is None
59 | else any((size > 1 for size in pool_size))
60 | )
61 | self.norm_eps = norm_eps
62 | self.norm_momentum = norm_momentum
63 | self._construct_nonlocal(
64 | zero_init_final_conv, zero_init_final_norm, norm_module
65 | )
66 |
67 | def _construct_nonlocal(
68 | self, zero_init_final_conv, zero_init_final_norm, norm_module
69 | ):
70 | # Three convolution heads: theta, phi, and g.
71 | self.conv_theta = nn.Conv3d(
72 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
73 | )
74 | self.conv_phi = nn.Conv3d(
75 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
76 | )
77 | self.conv_g = nn.Conv3d(
78 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0
79 | )
80 |
81 | # Final convolution output.
82 | self.conv_out = nn.Conv3d(
83 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0
84 | )
85 | # Zero initializing the final convolution output.
86 | self.conv_out.zero_init = zero_init_final_conv
87 |
88 | # TODO: change the name to `norm`
89 | self.bn = norm_module(
90 | num_features=self.dim,
91 | eps=self.norm_eps,
92 | momentum=self.norm_momentum,
93 | )
94 | # Zero initializing the final bn.
95 | self.bn.transform_final_bn = zero_init_final_norm
96 |
97 | # Optional to add the spatial-temporal pooling.
98 | if self.use_pool:
99 | self.pool = nn.MaxPool3d(
100 | kernel_size=self.pool_size,
101 | stride=self.pool_size,
102 | padding=[0, 0, 0],
103 | )
104 |
105 | def forward(self, x):
106 | x_identity = x
107 | N, C, T, H, W = x.size()
108 |
109 | theta = self.conv_theta(x)
110 |
111 | # Perform temporal-spatial pooling to reduce the computation.
112 | if self.use_pool:
113 | x = self.pool(x)
114 |
115 | phi = self.conv_phi(x)
116 | g = self.conv_g(x)
117 |
118 | theta = theta.view(N, self.dim_inner, -1)
119 | phi = phi.view(N, self.dim_inner, -1)
120 | g = g.view(N, self.dim_inner, -1)
121 |
122 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW).
123 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi))
124 | # For original Non-local paper, there are two main ways to normalize
125 | # the affinity tensor:
126 | # 1) Softmax normalization (norm on exp).
127 | # 2) dot_product normalization.
128 | if self.instantiation == "softmax":
129 | # Normalizing the affinity tensor theta_phi before softmax.
130 | theta_phi = theta_phi * (self.dim_inner**-0.5)
131 | theta_phi = nn.functional.softmax(theta_phi, dim=2)
132 | elif self.instantiation == "dot_product":
133 | spatial_temporal_dim = theta_phi.shape[2]
134 | theta_phi = theta_phi / spatial_temporal_dim
135 | else:
136 | raise NotImplementedError(
137 | "Unknown norm type {}".format(self.instantiation)
138 | )
139 |
140 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW).
141 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g))
142 |
143 | # (N, C, TxHxW) => (N, C, T, H, W).
144 | theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W)
145 |
146 | p = self.conv_out(theta_phi_g)
147 | p = self.bn(p)
148 | return x_identity + p
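Usage sketch (not part of the file above): the block is residual, so the output has the same shape as the input; dim_inner is typically dim // 2, and pool_size only shrinks the phi/g branches.

import torch
from alphaction.modeling.backbone.sfmodels.nonlocal_helper import Nonlocal

block = Nonlocal(dim=64, dim_inner=32, pool_size=[1, 2, 2], instantiation="softmax")
x = torch.randn(2, 64, 4, 14, 14)    # N x C x T x H x W
print(block(x).shape)                # torch.Size([2, 64, 4, 14, 14])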
--------------------------------------------------------------------------------
/alphaction/modeling/detector/__init__.py:
--------------------------------------------------------------------------------
1 | from .stm_detector import build_detection_model
2 | from .naive_baseline import build_naive_baseline
--------------------------------------------------------------------------------
/alphaction/modeling/detector/action_detector.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from ..backbone import build_backbone
4 | from ..roi_heads.roi_heads_3d import build_3d_roi_heads
5 |
6 |
7 | class ActionDetector(nn.Module):
8 | def __init__(self, cfg):
9 | super(ActionDetector, self).__init__()
10 | self.backbone = build_backbone(cfg)
11 | self.roi_heads = build_3d_roi_heads(cfg, self.backbone.dim_out)
12 |
13 | def forward(self, slow_video, fast_video, boxes, objects=None, extras={}, part_forward=-1):
14 | # part_forward is used to split this model into two parts.
15 | # if part_forward<0, just use it as a single model
16 |         # if part_forward=0, use this model to extract pooled features (person and object, no memory features).
17 |         # if part_forward=1, use the IA structure to aggregate interactions and give the final result.
18 | # implemented in roi_heads
19 |
20 | if part_forward==1:
21 | slow_features = fast_features = None
22 | else:
23 | slow_features, fast_features = self.backbone(slow_video, fast_video)
24 |
25 | result, detector_losses, loss_weight, detector_metrics = self.roi_heads(slow_features, fast_features, boxes, objects, extras, part_forward)
26 |
27 | if self.training:
28 | return detector_losses, loss_weight, detector_metrics, result
29 |
30 | return result
31 |
32 | def c2_weight_mapping(self):
33 | if not hasattr(self, "c2_mapping"):
34 | weight_map = {}
35 | for name, m_child in self.named_children():
36 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"):
37 | child_map = m_child.c2_weight_mapping()
38 | for key, val in child_map.items():
39 | new_key = name + '.' + key
40 | weight_map[new_key] = val
41 | self.c2_mapping = weight_map
42 | return self.c2_mapping
43 |
44 | def build_detection_model(cfg):
45 | return ActionDetector(cfg)
--------------------------------------------------------------------------------
/alphaction/modeling/detector/naive_baseline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from ..backbone import build_backbone
5 | from alphaction.modeling.stm_decoder.util.box_ops import clip_boxes_tensor
6 | from torchvision.ops import roi_align
7 | from einops import rearrange
8 | import numpy as np
9 |
10 |
11 |
12 | class NaiveBaseline(nn.Module):
13 | def __init__(self, cfg):
14 | super(NaiveBaseline, self).__init__()
15 | self.backbone = build_backbone(cfg)
16 | assert self.backbone.visual_encoder.use_cls_feat
17 | assert cfg.DATA.OPEN_VOCABULARY
18 |
19 | self.use_roi_feat = cfg.MODEL.USE_ROI_FEAT
20 | self.multi_label_action = cfg.MODEL.MULTI_LABEL_ACTION
21 |
22 |
23 | def roi_align_pool(self, patch_feats, batch_boxes, raw_sizes, out_size=(7, 7), spatial_scale=1.0/16):
24 | """ patch_feats: (B, D, T, h, w)
25 | boxes: list of boxes, not normalized
26 | raw_sizes: (B, 2) in (width, height)
27 | """
28 | B, D, T, h, w = patch_feats.size()
29 | device = patch_feats.device
30 | feat_maps = patch_feats.mean(dim=2) # (B, D, h, w) temporally mean pooling
31 | boxes_list = [np.hstack([np.ones((boxes.shape[0], 1)) * i, boxes]) for i, boxes in enumerate(batch_boxes)]
32 | boxes_tensor = torch.from_numpy(np.vstack(boxes_list)).type(patch_feats.dtype).to(device)
33 | roi_feat = roi_align(feat_maps, boxes_tensor, out_size, spatial_scale) # (BN, D, 7, 7)
34 | roi_feat = rearrange(roi_feat, 'm d h w -> m (h w) d')
35 |
36 | # get meanpooled roi features
37 | roi_align_features = []
38 | batch_indices = boxes_tensor[:, 0].long()
39 | for i in range(B):
40 | rois = roi_feat[batch_indices == i].mean(dim=1) # (n, d)
41 | roi_align_features.append(rois)
42 |
43 | return roi_align_features
44 |
45 |
46 | def forward(self, slow_video, fast_video, whwh, boxes=None, labels=None, extras={}, part_forward=-1):
47 |
48 | assert not self.training, "NaiveBaseline does not need training!"
49 | assert 'prior_boxes' in extras, "NaiveBaseline use loaded boxes for testing!"
50 | device = slow_video.device
51 |
52 | prior_boxes = extras['prior_boxes']
53 | box_list = []
54 | for i in range(len(prior_boxes)):
55 | box = torch.tensor(prior_boxes[i], dtype=torch.float32, device=device)
56 | cur_whwh = whwh[i]
57 | box = clip_boxes_tensor(box, cur_whwh[1], cur_whwh[0])
58 | box[:, 0::2] /= cur_whwh[0]
59 | box[:, 1::2] /= cur_whwh[1]
60 | box_list.append(box)
61 |
62 | if self.backbone.num_pathways == 1:
63 | features = self.backbone([slow_video])
64 | else:
65 | features = self.backbone([slow_video, fast_video])
66 |
67 | patch_feats, cls_feat_visual = features # (B, 512)
68 | B = cls_feat_visual.size(0)
69 |
70 | if self.use_roi_feat:
71 | # feature projection & RoIAlign Pooling
72 | patch_feats = self.backbone.visual_encoder.project_patch_features(patch_feats[0])
73 | roi_features = self.roi_align_pool(patch_feats, prior_boxes, whwh[:, :2])
74 |
75 | # get the current text feature embeddings
76 | text_features = self.backbone.forward_text(device=slow_video.device) # (K, 512)
77 | tau_inv = self.backbone.tau_inv
78 |
79 | if isinstance(text_features, list):
80 | text_features = torch.stack(text_features).mean(1)
81 | text_features_normed = text_features / text_features.norm(dim=-1, keepdim=True) # (K, D)
82 |
83 | action_score_list = []
84 | if self.use_roi_feat:
85 | # return self.forward_roi_cls(roi_features, text_features, tau_inv, whwh)
86 | for roi_feat in roi_features:
87 | # action recognition
88 | vis_features_normed = roi_feat / roi_feat.norm(dim=-1, keepdim=True) # (N, D)
89 | action_score = tau_inv * vis_features_normed @ text_features_normed.t() # (N, K)
90 | action_score_list.append(action_score)
91 | else:
92 | vis_features_normed = cls_feat_visual / cls_feat_visual.norm(dim=-1, keepdim=True) # (B, D)
93 | action_score = tau_inv * vis_features_normed @ text_features_normed.t() # (B, K)
94 | for i in range(B):
95 |                 # with full-frame input, we only have one score vector, which needs to be repeated.
96 |                 scores = action_score[[i]].repeat(box_list[i].size(0), 1)  # (n, K)
97 | action_score_list.append(scores)
98 |
99 | return action_score_list, box_list
100 |
101 |
102 | def build_naive_baseline(cfg):
103 | return NaiveBaseline(cfg)
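A standalone sketch (not part of the file above) of the zero-shot scoring used in forward: cosine similarity between L2-normalized visual features and per-class text embeddings, scaled by the backbone's inverse temperature (the value 100.0 below is only a typical CLIP-style placeholder).

import torch

def zero_shot_scores(vis_feats, text_feats, tau_inv=100.0):
    vis = vis_feats / vis_feats.norm(dim=-1, keepdim=True)    # (N, D)
    txt = text_feats / text_feats.norm(dim=-1, keepdim=True)  # (K, D)
    return tau_inv * vis @ txt.t()                            # (N, K) action logits

scores = zero_shot_scores(torch.randn(5, 512), torch.randn(24, 512))
print(scores.argmax(dim=1))          # predicted class index per box / frame feature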
--------------------------------------------------------------------------------
/alphaction/modeling/dict_model.py:
--------------------------------------------------------------------------------
1 | # Simple pytorch implementation of Dictionary Learning based on stochastic gradient descent
2 | #
3 | # June 2018
4 | # Jeremias Sulam
5 |
6 |
7 | import torch
8 | import torch.nn as nn
9 | from torch.autograd import Variable
10 | import torch.nn.functional as F
11 | import numpy as np
12 |
13 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
14 |
15 |
16 | ####################################
17 | ## Dict. Learning ##
18 | ####################################
19 |
20 | class DictLearn(nn.Module):
21 | def __init__(self, num_basis, dim_basis, SC='FISTA', sc_iters=None):
22 | super(DictLearn, self).__init__()
23 |
24 | self.W = nn.Parameter(torch.randn(dim_basis, num_basis, requires_grad=False))
25 |
26 | # normalization
27 | self.W.data = NormDict(self.W.data)
28 | self.SC = SC
29 | self.sc_iters = sc_iters
30 |
31 | if self.sc_iters is None:
32 | self.sc_iters = 20 if SC=='FISTA' else 50
33 |
34 |
35 |
36 | def forward(self, Y, K):
37 |
38 | # normalizing Dict
39 | self.W.requires_grad_(False)
40 | self.W.data = NormDict(self.W.data)
41 |
42 | # Sparse Coding
43 | if self.SC == 'IHT':
44 | Gamma, residual, errIHT = IHT(Y,self.W,K, self.sc_iters)
45 | elif self.SC == 'FISTA':
46 | Gamma, residual, errIHT = FISTA(Y,self.W,K, self.sc_iters)
47 | else: print("Oops!")
48 |
49 | # Reconstructing
50 | self.W.requires_grad_(True)
51 | X = torch.mm(Gamma,self.W.transpose(1,0))
52 |
53 | # sparsity
54 | # NNZ = np.count_nonzero(Gamma.cpu().data.numpy())/Gamma.shape[0]
55 |
56 | return X, Gamma, errIHT
57 |
58 |
59 |
60 | #--------------------------------------------------------------
61 | # Auxiliary Functions
62 | #--------------------------------------------------------------
63 |
64 | def hard_threshold_k(X, k):
65 | Gamma = X.clone()
66 | m = X.data.shape[1]
67 | a,_ = torch.abs(Gamma).data.sort(dim=1,descending=True)
68 | T = torch.mm(a[:,k].unsqueeze(1),torch.Tensor(np.ones((1,m))).to(device))
69 | mask = Variable(torch.Tensor((np.abs(Gamma.data.cpu().numpy())>T.cpu().numpy()) + 0.)).to(device)
70 | Gamma = Gamma * mask
71 | return Gamma#, mask.data.nonzero()
72 |
73 | #--------------------------------------------------------------
74 |
75 |
76 | def soft_threshold(X, lamda):
77 | #pdb.set_trace()
78 | Gamma = X.clone()
79 | Gamma = torch.sign(Gamma) * F.relu(torch.abs(Gamma)-lamda)
80 | return Gamma.to(device)
81 |
82 |
83 | #--------------------------------------------------------------
84 |
85 |
86 | def IHT(Y,W,K, ITER=50):
87 |
88 | c = PowerMethod(W)
89 | eta = 1/c
90 | Gamma = hard_threshold_k(torch.mm(Y,eta*W),K)
91 | residual = torch.mm(Gamma, W.transpose(1,0)) - Y
92 |
93 | norms = np.zeros((ITER,))
94 |
95 | for i in range(ITER):
96 | Gamma = hard_threshold_k(Gamma - eta * torch.mm(residual, W), K)
97 | residual = torch.mm(Gamma, W.transpose(1,0)) - Y
98 | norms[i] = np.linalg.norm(residual.cpu().numpy(),'fro')/ np.linalg.norm(Y.cpu().numpy(),'fro')
99 |
100 | return Gamma, residual, norms
101 |
102 |
103 | #--------------------------------------------------------------
104 |
105 |
106 | def FISTA(Y,W,lamda, ITER=20):
107 |
108 | c = PowerMethod(W)
109 | eta = 1/c
110 | norms = np.zeros((ITER,))
111 |
112 | Gamma = soft_threshold(torch.mm(Y,eta*W),lamda)
113 | Z = Gamma.clone()
114 | Gamma_1 = Gamma.clone()
115 | t = 1
116 |
117 | for i in range(ITER):
118 | Gamma_1 = Gamma.clone()
119 | residual = torch.mm(Z, W.transpose(1,0)) - Y
120 | Gamma = soft_threshold(Z - eta * torch.mm(residual, W), lamda/c)
121 |
122 | t_1 = t
123 | t = (1+np.sqrt(1 + 4*t**2))/2
124 | #pdb.set_trace()
125 | Z = Gamma + ((t_1 - 1)/t * (Gamma - Gamma_1)).to(device)
126 |
127 | norms[i] = np.linalg.norm(residual.cpu().numpy(),'fro')/ np.linalg.norm(Y.cpu().numpy(),'fro')
128 |
129 | return Gamma, residual, norms
130 |
131 |
132 | #--------------------------------------------------------------
133 |
134 | def NormDict(W):
135 | Wn = torch.norm(W, p=2, dim=0).detach()
136 | W = W.div(Wn.expand_as(W))
137 | return W
138 |
139 | #--------------------------------------------------------------
140 |
141 | def PowerMethod(W):
142 | ITER = 100
143 | m = W.shape[1]
144 | X = torch.randn(1, m).to(device)
145 | for i in range(ITER):
146 | Dgamma = torch.mm(X,W.transpose(1,0))
147 | X = torch.mm(Dgamma,W)
148 | nm = torch.norm(X,p=2)
149 | X = X/nm
150 |
151 | return nm
152 |
153 | #--------------------------------------------------------------
154 |
155 |
156 | def showFilters(W,ncol,nrows):
157 | p = int(np.sqrt(W.shape[0]))+2
158 | Nimages = W.shape[1]
159 | Mosaic = np.zeros((p*ncol,p*nrows))
160 | indx = 0
161 | for i in range(ncol):
162 | for j in range(nrows):
163 | im = W[:,indx].reshape(p-2,p-2)
164 | im = (im-np.min(im))
165 | im = im/np.max(im)
166 | Mosaic[ i*p : (i+1)*p , j*p : (j+1)*p ] = np.pad(im,(1,1),mode='constant')
167 | indx += 1
168 |
169 | return Mosaic
170 |
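
A minimal usage sketch of DictLearn (shapes are illustrative; the second forward() argument is read from the code above as the IHT sparsity level K or the FISTA soft threshold lamda):

import torch
from alphaction.modeling.dict_model import DictLearn, device

dl = DictLearn(num_basis=128, dim_basis=64, SC='FISTA').to(device)   # dictionary of 128 atoms in R^64
Y = torch.randn(16, 64, device=device)                               # 16 signals to encode
X, Gamma, errs = dl(Y, 0.1)                                          # 0.1 acts as the FISTA soft threshold
print(X.shape, Gamma.shape, errs[-1])                                # (16, 64), (16, 128), final relative residual
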
--------------------------------------------------------------------------------
/alphaction/modeling/encoders/clipvip/custom_layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from collections import OrderedDict
4 |
5 |
6 |
7 | class LayerNorm(nn.LayerNorm):
8 | """Subclass torch's LayerNorm to handle fp16."""
9 |
10 | def forward(self, x: torch.Tensor):
11 | orig_type = x.dtype
12 | ret = super().forward(x.type(torch.float32))
13 | return ret.type(orig_type)
14 |
15 |
16 | class QuickGELU(nn.Module):
17 | def forward(self, x: torch.Tensor):
18 | return x * torch.sigmoid(1.702 * x)
19 |
20 |
21 | class CrossAttnBlock(nn.Module):
22 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop: float = 0., return_kv=False):
23 | super().__init__()
24 |
25 | self.attn = nn.MultiheadAttention(d_model, n_head, dropout=drop)
26 | self.ln_x = LayerNorm(d_model)
27 | self.ln_y = LayerNorm(d_model)
28 | self.mlp = nn.Sequential(OrderedDict([
29 | ("c_fc", nn.Linear(d_model, d_model * 4)),
30 | ("fc_drop", nn.Dropout(drop)),
31 | ("gelu", QuickGELU()),
32 | ("c_proj", nn.Linear(d_model * 4, d_model)),
33 | ("proj_drop", nn.Dropout(drop)),
34 | ]))
35 | self.ln_2 = LayerNorm(d_model)
36 | self.attn_mask = attn_mask
37 | self.return_kv = return_kv
38 |
39 | def attention(self, x: torch.Tensor, y: torch.Tensor):
40 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
41 | return self.attn(x, y, y, need_weights=False, attn_mask=self.attn_mask)[0]
42 |
43 | def forward(self, x: torch.Tensor, y: torch.Tensor):
44 | """ x: query (T=1, B, d)
45 | y: key & value (T=64, B, d)
46 | """
47 | if len(x.size()) == 2:
48 | x = x.unsqueeze(0)
49 | x = x + self.attention(self.ln_x(x), self.ln_y(y))
50 | x = x + self.mlp(self.ln_2(x))
51 | if x.size(0) == 1:
52 | x = x.squeeze(0)
53 | if self.return_kv:
54 | return x, y
55 | return x
56 |
57 |
58 | class CrossAttnModules(nn.Sequential):
59 | def forward(self, *input):
60 | for module in self._modules.values():
61 | input = module(*input)
62 | return input[0]
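
A shape-only sketch of CrossAttnBlock with one query token cross-attending to a 64-token key/value sequence (dimensions are illustrative). Note that CrossAttnModules returns input[0], so blocks chained inside it are apparently expected to set return_kv=True and pass (x, y) along:

import torch
from alphaction.modeling.encoders.clipvip.custom_layers import CrossAttnBlock

d_model, n_head, B, T = 512, 8, 4, 64
block = CrossAttnBlock(d_model, n_head, drop=0.1)

x = torch.randn(B, d_model)      # query; forward() adds the missing time axis
y = torch.randn(T, B, d_model)   # key/value sequence in (T, B, d) layout
out = block(x, y)                # (B, d_model): the singleton time axis is squeezed back out
print(out.shape)
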
--------------------------------------------------------------------------------
/alphaction/modeling/encoders/viclip/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: mit
3 | ---
4 |
--------------------------------------------------------------------------------
/alphaction/modeling/encoders/viclip/__init__.py:
--------------------------------------------------------------------------------
1 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer
2 | from .viclip import ViCLIP
3 | import torch
4 | import numpy as np
5 | import cv2
6 |
7 | clip_candidates = {'viclip':None, 'clip':None}
8 |
9 | def get_clip(name='viclip', weight_file=None):
10 | global clip_candidates
11 | m = clip_candidates[name]
12 | if m is None:
13 | if name == 'viclip':
14 | tokenizer = _Tokenizer()
15 | vclip = ViCLIP(tokenizer, pretrain=weight_file)
16 | # m = vclip
17 | m = (vclip, tokenizer)
18 | else:
19 |             raise Exception('The target CLIP model "{}" is not found.'.format(name))
20 |
21 | return m
22 |
23 | def get_text_feat_dict(texts, clip, tokenizer, text_feat_d={}):
24 | for t in texts:
25 | feat = clip.get_text_features(t, tokenizer, text_feat_d)
26 | text_feat_d[t] = feat
27 | return text_feat_d
28 |
29 | def get_vid_feat(frames, clip):
30 | return clip.get_vid_features(frames)
31 |
32 | def _frame_from_video(video):
33 | while video.isOpened():
34 | success, frame = video.read()
35 | if success:
36 | yield frame
37 | else:
38 | break
39 |
40 | v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3)
41 | v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3)
42 | def normalize(data):
43 | return (data/255.0-v_mean)/v_std
44 |
45 | def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')):
46 | assert(len(vid_list) >= fnum)
47 | step = len(vid_list) // fnum
48 | vid_list = vid_list[::step][:fnum]
49 | vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list]
50 | vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list]
51 | vid_tube = np.concatenate(vid_tube, axis=1)
52 | vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))
53 | vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float()
54 | return vid_tube
55 |
56 | def retrieve_text(frames, texts, name='viclip', weight_file=None, topk=5, device=torch.device('cuda')):
57 | clip, tokenizer = get_clip(name, weight_file)
58 | clip = clip.to(device)
59 | frames_tensor = frames2tensor(frames, device=device)
60 | vid_feat = get_vid_feat(frames_tensor, clip)
61 |
62 | text_feat_d = {}
63 | text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d)
64 | text_feats = [text_feat_d[t] for t in texts]
65 | text_feats_tensor = torch.cat(text_feats, 0)
66 |
67 | probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk)
68 |
69 | ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()]
70 | return ret_texts, probs.numpy()[0]
71 |
72 |
--------------------------------------------------------------------------------
/alphaction/modeling/encoders/viclip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/encoders/viclip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/alphaction/modeling/encoders/viclip/demo.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import cv2
4 | import argparse
5 | import torch
6 |
7 | import sys
8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..', '..'))
9 | from alphaction.modeling.encoders.viclip import retrieve_text, _frame_from_video
10 | from alphaction.config import cfg
11 | from alphaction.dataset import make_data_loader
12 | from alphaction.dataset.datasets import utils as utils
13 | from alphaction.utils.random_seed import set_seed
14 |
15 |
16 |
17 | def get_cfg():
18 | parser = argparse.ArgumentParser(description="PyTorch Action Detection Training")
19 | parser.add_argument(
20 | "--config-file",
21 | default="",
22 | metavar="FILE",
23 | help="path to config file",
24 | type=str,
25 | )
26 | parser.add_argument("--local_rank", type=int, default=0)
27 | parser.add_argument(
28 | "--skip-final-test",
29 | dest="skip_test",
30 | help="Do not test the final model",
31 | action="store_true",
32 | )
33 | parser.add_argument(
34 | "--skip-val-in-train",
35 | dest="skip_val",
36 | help="Do not validate during training",
37 | action="store_true",
38 | )
39 | parser.add_argument(
40 | "--transfer",
41 | dest="transfer_weight",
42 | help="Transfer weight from a pretrained model",
43 | action="store_true"
44 | )
45 | parser.add_argument(
46 | "--adjust-lr",
47 | dest="adjust_lr",
48 | help="Adjust learning rate scheduler from old checkpoint",
49 | action="store_true"
50 | )
51 | parser.add_argument(
52 | "--no-head",
53 | dest="no_head",
54 |         help="Do not load the head layer parameters from the weight file",
55 | action="store_true"
56 | )
57 | parser.add_argument(
58 | "--use-tfboard",
59 | action='store_true',
60 | dest='tfboard',
61 | help='Use tensorboard to log stats'
62 | )
63 | parser.add_argument(
64 | "--seed",
65 | type=int,
66 | default=2,
67 |         help="Manual seed at the beginning."
68 | )
69 | parser.add_argument(
70 | "opts",
71 | help="Modify config options using the command-line",
72 | default=None,
73 | nargs=argparse.REMAINDER,
74 | )
75 |
76 | args = parser.parse_args()
77 |
78 | num_gpus = 1
79 | args.distributed = False
80 |
81 | torch.backends.cudnn.deterministic = True
82 | torch.backends.cudnn.benchmark = False
83 |
84 | # Merge config.
85 | cfg.merge_from_file(args.config_file)
86 | cfg.merge_from_list(args.opts)
87 | cfg.freeze()
88 |
89 | set_seed(args.seed, 0, num_gpus)
90 |
91 | return cfg
92 |
93 |
94 | def get_one_sample(dataset):
95 | idx = int(np.random.choice(list(range(len(dataset))), 1))
96 | video_idx, sec_idx, sec, center_idx = dataset._keyframe_indices[idx]
97 | # Get the frame idxs for current clip.
98 | seq = utils.get_sequence(
99 | center_idx,
100 | dataset._seq_len // 2,
101 | dataset._sample_rate,
102 | num_frames=len(dataset._image_paths[video_idx]),
103 | )
104 |
105 | # Load images of current clip.
106 | image_paths = [dataset._image_paths[video_idx][frame] for frame in seq]
107 | imgs = utils.retry_load_images(
108 | image_paths, backend='cv2'
109 | )
110 |
111 | clip_label_list = dataset._keyframe_boxes_and_labels[video_idx][sec_idx]
112 | assert len(clip_label_list) > 0
113 | labels = []
114 | for box_labels in clip_label_list:
115 | for label in box_labels[1]:
116 | if label == -1:
117 | continue
118 | label = dataset.id_to_indices['closed'][label]
119 | labels.append(label)
120 |
121 | return imgs, labels
122 |
123 |
124 |
125 | if __name__ == '__main__':
126 |
127 | cfg = get_cfg()
128 |
129 | data_loader, vocabulary_train, iter_per_epoch = make_data_loader(
130 | cfg,
131 | is_train=True,
132 | is_distributed=False,
133 | start_iter=0,
134 | )
135 |
136 | for n in range(10):
137 | print("Trial {}...".format(n + 1))
138 | frames, labels = get_one_sample(data_loader.dataset)
139 | class_texts = [elems['caption'] for clsname, elems in data_loader.dataset.text_input['closed'].items()]
140 | gt_texts = [class_texts[clsid] for clsid in labels]
141 |
142 | texts, probs = retrieve_text(frames, class_texts, name='viclip', topk=5, weight_file='pretrained/ViClip-InternVid-10M-FLT.pth')
143 |
144 | for t, p in zip(texts, probs):
145 | print(f'text: {t} ~ prob: {p:.4f}')
146 |
147 | print("Ground Truth class texts: ", gt_texts)
--------------------------------------------------------------------------------
/alphaction/modeling/encoders/viclip/simple_tokenizer.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import html
3 | import os
4 | from functools import lru_cache
5 |
6 | import ftfy
7 | import regex as re
8 |
9 |
10 | @lru_cache()
11 | def default_bpe():
12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13 | # @lru_cache()
14 | # def default_bpe():
15 | # return "bpe_simple_vocab_16e6.txt.gz"
16 |
17 |
18 | @lru_cache()
19 | def bytes_to_unicode():
20 | """
21 |     Returns a list of utf-8 bytes and a corresponding list of unicode strings.
22 | The reversible bpe codes work on unicode strings.
23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
25 |     This is a significant percentage of your normal, say, 32K bpe vocab.
26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
27 | And avoids mapping to whitespace/control characters the bpe code barfs on.
28 | """
29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
30 | cs = bs[:]
31 | n = 0
32 | for b in range(2**8):
33 | if b not in bs:
34 | bs.append(b)
35 | cs.append(2**8+n)
36 | n += 1
37 | cs = [chr(n) for n in cs]
38 | return dict(zip(bs, cs))
39 |
40 |
41 | def get_pairs(word):
42 | """Return set of symbol pairs in a word.
43 | Word is represented as tuple of symbols (symbols being variable-length strings).
44 | """
45 | pairs = set()
46 | prev_char = word[0]
47 | for char in word[1:]:
48 | pairs.add((prev_char, char))
49 | prev_char = char
50 | return pairs
51 |
52 |
53 | def basic_clean(text):
54 | text = ftfy.fix_text(text)
55 | text = html.unescape(html.unescape(text))
56 | return text.strip()
57 |
58 |
59 | def whitespace_clean(text):
60 | text = re.sub(r'\s+', ' ', text)
61 | text = text.strip()
62 | return text
63 |
64 |
65 | class SimpleTokenizer(object):
66 | def __init__(self, bpe_path: str = default_bpe()):
67 | self.byte_encoder = bytes_to_unicode()
68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
70 | merges = merges[1:49152-256-2+1]
71 | merges = [tuple(merge.split()) for merge in merges]
72 | vocab = list(bytes_to_unicode().values())
73 |         vocab = vocab + [v+'</w>' for v in vocab]
74 | for merge in merges:
75 | vocab.append(''.join(merge))
76 | vocab.extend(['<|startoftext|>', '<|endoftext|>'])
77 | self.encoder = dict(zip(vocab, range(len(vocab))))
78 | self.decoder = {v: k for k, v in self.encoder.items()}
79 | self.bpe_ranks = dict(zip(merges, range(len(merges))))
80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
82 |
83 | def bpe(self, token):
84 | if token in self.cache:
85 | return self.cache[token]
86 |         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
87 | pairs = get_pairs(word)
88 |
89 | if not pairs:
90 |             return token+'</w>'
91 |
92 | while True:
93 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
94 | if bigram not in self.bpe_ranks:
95 | break
96 | first, second = bigram
97 | new_word = []
98 | i = 0
99 | while i < len(word):
100 | try:
101 | j = word.index(first, i)
102 | new_word.extend(word[i:j])
103 | i = j
104 | except:
105 | new_word.extend(word[i:])
106 | break
107 |
108 | if word[i] == first and i < len(word)-1 and word[i+1] == second:
109 | new_word.append(first+second)
110 | i += 2
111 | else:
112 | new_word.append(word[i])
113 | i += 1
114 | new_word = tuple(new_word)
115 | word = new_word
116 | if len(word) == 1:
117 | break
118 | else:
119 | pairs = get_pairs(word)
120 | word = ' '.join(word)
121 | self.cache[token] = word
122 | return word
123 |
124 | def encode(self, text):
125 | bpe_tokens = []
126 | text = whitespace_clean(basic_clean(text)).lower()
127 | for token in re.findall(self.pat, text):
128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
130 | return bpe_tokens
131 |
132 | def decode(self, tokens):
133 | text = ''.join([self.decoder[token] for token in tokens])
134 |         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
135 | return text
136 |
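
A quick round-trip through SimpleTokenizer (assumes the bundled bpe_simple_vocab_16e6.txt.gz is resolvable via default_bpe()); encode returns plain BPE ids without start/end tokens, which callers add themselves:

from alphaction.modeling.encoders.viclip.simple_tokenizer import SimpleTokenizer

tok = SimpleTokenizer()
ids = tok.encode("a person riding a horse")
print(ids)              # BPE token ids; no <|startoftext|>/<|endoftext|> added here
print(tok.decode(ids))  # "a person riding a horse " (decode maps '</w>' back to spaces)
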
--------------------------------------------------------------------------------
/alphaction/modeling/nonlocal_block.py:
--------------------------------------------------------------------------------
1 | from __future__ import (absolute_import, division, print_function,
2 | unicode_literals)
3 |
4 | import torch
5 | import torch.nn as nn
6 | from alphaction.layers import FrozenBatchNorm3d
7 |
8 |
9 | class NLBlock(nn.Module):
10 | def __init__(self, dim_in, dim_out, dim_inner, nl_cfg, group=False):
11 | super(NLBlock, self).__init__()
12 |
13 | self.nl_cfg = nl_cfg.clone()
14 | self.group = group
15 | self.group_size = 4
16 |
17 | init_std = nl_cfg.CONV_INIT_STD
18 | bias = not nl_cfg.NO_BIAS
19 | pool_stride = 2
20 |
21 | self.scale_value = dim_inner ** (-0.5)
22 | self.dim_inner = dim_inner
23 |
24 | self.theta = nn.Conv3d(dim_in, dim_inner, 1, bias=bias)
25 | nn.init.normal_(self.theta.weight, std=init_std)
26 | if bias:
27 | nn.init.constant_(self.theta.bias, 0)
28 |
29 | if nl_cfg.USE_MAXPOOL:
30 | self.maxpool = nn.MaxPool3d((1, pool_stride, pool_stride),
31 | stride=(1, pool_stride, pool_stride))
32 |
33 | self.phi = nn.Conv3d(dim_in, dim_inner, 1, bias=bias)
34 | nn.init.normal_(self.phi.weight, std=init_std)
35 | if bias:
36 | nn.init.constant_(self.phi.bias, 0)
37 |
38 | self.g = nn.Conv3d(dim_in, dim_inner, 1, bias=bias)
39 | nn.init.normal_(self.g.weight, std=init_std)
40 | if bias:
41 | nn.init.constant_(self.g.bias, 0)
42 |
43 | if nl_cfg.USE_SOFTMAX:
44 | self.softmax = nn.Softmax(dim=2)
45 |
46 | self.out = nn.Conv3d(dim_inner, dim_out, 1, bias=bias)
47 | if nl_cfg.USE_ZERO_INIT_CONV:
48 | nn.init.constant_(self.out.weight, 0)
49 | else:
50 | nn.init.normal_(self.out.weight, std=init_std)
51 | if bias:
52 | nn.init.constant_(self.out.bias, 0)
53 |
54 | if nl_cfg.USE_BN:
55 | if nl_cfg.FROZEN_BN:
56 | self.bn = FrozenBatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON)
57 | else:
58 | self.bn = nn.BatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON, momentum=nl_cfg.BN_MOMENTUM)
59 | nn.init.constant_(self.bn.weight, nl_cfg.BN_INIT_GAMMA)
60 |
61 | def forward(self, x):
62 | if x.dim() != 5:
63 |             raise ValueError('expected 5D input (got {}D input)'
64 | .format(x.dim()))
65 |
66 | if self.group:
67 | x = x.transpose(1, 2)
68 | sz_before_group = list(x.shape)
69 | sz_after_group = sz_before_group.copy()
70 | sz_after_group[0] = -1
71 | sz_after_group[1] = self.group_size
72 | x = x.contiguous().view(*sz_after_group)
73 | x = x.transpose(1, 2)
74 |
75 | batch_size = x.shape[0]
76 |
77 | theta = self.theta(x)
78 |
79 | if self.nl_cfg.USE_MAXPOOL:
80 | max_pool = self.maxpool(x)
81 | else:
82 | max_pool = x
83 |
84 | phi = self.phi(max_pool)
85 |
86 | g = self.g(max_pool)
87 |
88 | org_size = theta.size()
89 | mat_size = [batch_size, self.dim_inner, -1]
90 | theta = theta.view(*mat_size)
91 | phi = phi.view(*mat_size)
92 | g = g.view(*mat_size)
93 |
94 | theta_phi = torch.bmm(theta.transpose(1, 2), phi)
95 |
96 | if self.nl_cfg.USE_SOFTMAX:
97 | if self.nl_cfg.USE_SCALE:
98 | theta_phi_sc = theta_phi * self.scale_value
99 | else:
100 | theta_phi_sc = theta_phi
101 | p = self.softmax(theta_phi_sc)
102 | else:
103 | p = theta_phi / theta_phi.shape[-1]
104 |
105 | t = torch.bmm(g, p.transpose(1, 2))
106 |
107 | t = t.view(org_size)
108 |
109 | out = self.out(t)
110 |
111 | if self.nl_cfg.USE_BN:
112 | out = self.bn(out)
113 | out = out + x
114 |
115 | if self.group:
116 | out = out.transpose(1, 2)
117 | out = out.contiguous().view(*sz_before_group)
118 | out = out.transpose(1, 2)
119 |
120 | return out
121 |
122 | def c2_weight_mapping(self):
123 | weight_map = {}
124 | for name, m_child in self.named_children():
125 | if m_child.state_dict():
126 | if isinstance(m_child, (nn.BatchNorm3d, FrozenBatchNorm3d)):
127 | weight_map[name + '.weight'] = '{}_s'.format(name)
128 | weight_map[name + '.running_mean'] = '{}_rm'.format(name)
129 | weight_map[name + '.running_var'] = '{}_riv'.format(name)
130 | elif isinstance(m_child, nn.GroupNorm):
131 | weight_map[name + '.weight'] = '{}_s'.format(name)
132 | else:
133 | weight_map[name + '.weight'] = '{}_w'.format(name)
134 | weight_map[name + '.bias'] = '{}_b'.format(name)
135 | return weight_map
136 |
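
Stripped of grouping, max-pooling and batch-norm, the core of NLBlock.forward is an embedded-Gaussian non-local operation; a minimal sketch on assumed shapes:

import torch

B, C_inner, THW = 2, 128, 4 * 14 * 14
theta = torch.randn(B, C_inner, THW)   # theta(x), flattened over T*H*W
phi   = torch.randn(B, C_inner, THW)   # phi(x)
g     = torch.randn(B, C_inner, THW)   # g(x)

attn = torch.softmax(torch.bmm(theta.transpose(1, 2), phi) * C_inner ** -0.5, dim=2)
t = torch.bmm(g, attn.transpose(1, 2))  # (B, C_inner, THW), later mapped back to dim_out by self.out
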
--------------------------------------------------------------------------------
/alphaction/modeling/registry.py:
--------------------------------------------------------------------------------
1 | from alphaction.utils.registry import Registry
2 |
3 | BACKBONES = Registry()
4 | ROI_ACTION_FEATURE_EXTRACTORS = Registry()
5 | ROI_ACTION_PREDICTORS = Registry()
6 | INTERACTION_AGGREGATION_STRUCTURES = Registry()
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/roi_heads/__init__.py
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/action_head/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/roi_heads/action_head/__init__.py
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/action_head/action_head.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .roi_action_feature_extractor import make_roi_action_feature_extractor
4 | from .roi_action_predictors import make_roi_action_predictor
5 | from .inference import make_roi_action_post_processor
6 | from .loss import make_roi_action_loss_evaluator
7 | from .metric import make_roi_action_accuracy_evaluator
8 | from alphaction.modeling.utils import prepare_pooled_feature
9 | from alphaction.utils.comm import all_reduce
10 |
11 |
12 | class ROIActionHead(torch.nn.Module):
13 | """
14 | Generic Action Head class.
15 | """
16 |
17 | def __init__(self, cfg, dim_in):
18 | super(ROIActionHead, self).__init__()
19 | self.feature_extractor = make_roi_action_feature_extractor(cfg, dim_in)
20 | self.predictor = make_roi_action_predictor(cfg, self.feature_extractor.dim_out)
21 | self.post_processor = make_roi_action_post_processor(cfg)
22 | self.loss_evaluator = make_roi_action_loss_evaluator(cfg)
23 | self.accuracy_evaluator = make_roi_action_accuracy_evaluator(cfg)
24 | self.test_ext = cfg.TEST.EXTEND_SCALE
25 |
26 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1):
27 |         # In the training stage, boxes come from ground truth.
28 |         # In the testing stage, boxes are detected by a human detector and proposals are
29 |         # the enlarged boxes.
30 | assert not (self.training and part_forward >= 0)
31 |
32 | if part_forward == 1:
33 | boxes = extras["current_feat_p"]
34 | objects = extras["current_feat_o"]
35 |
36 | if self.training:
37 | proposals = self.loss_evaluator.sample_box(boxes)
38 | else:
39 | proposals = [box.extend(self.test_ext) for box in boxes]
40 |
41 | x, x_pooled, x_objects = self.feature_extractor(slow_features, fast_features, proposals, objects, extras, part_forward)
42 |
43 | if part_forward == 0:
44 | pooled_feature = prepare_pooled_feature(x_pooled, boxes)
45 | if x_objects is None:
46 | object_pooled_feature = None
47 | else:
48 | object_pooled_feature = prepare_pooled_feature(x_objects, objects)
49 | return [pooled_feature, object_pooled_feature], {}, {}, {}
50 |
51 | action_logits = self.predictor(x)
52 |
53 | if not self.training:
54 | result = self.post_processor((action_logits,), boxes)
55 | return result, {}, {}, {}
56 |
57 | box_num = action_logits.size(0)
58 | box_num = torch.as_tensor([box_num], dtype=torch.float32, device=action_logits.device)
59 | all_reduce(box_num, average=True)
60 |
61 | loss_dict, loss_weight = self.loss_evaluator(
62 | [action_logits], box_num.item(),
63 | )
64 |
65 | metric_dict = self.accuracy_evaluator(
66 | [action_logits], proposals, box_num.item(),
67 | )
68 |
69 | pooled_feature = prepare_pooled_feature(x_pooled, proposals)
70 | if x_objects is None:
71 | object_pooled_feature = []
72 | else:
73 | object_pooled_feature = prepare_pooled_feature(x_objects, objects)
74 |
75 | return (
76 | [pooled_feature, object_pooled_feature],
77 | loss_dict,
78 | loss_weight,
79 | metric_dict,
80 | )
81 |
82 | def c2_weight_mapping(self):
83 | weight_map = {}
84 | for name, m_child in self.named_children():
85 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"):
86 | child_map = m_child.c2_weight_mapping()
87 | for key, val in child_map.items():
88 | new_key = name + '.' + key
89 | weight_map[new_key] = val
90 | return weight_map
91 |
92 |
93 | def build_roi_action_head(cfg, dim_in):
94 | return ROIActionHead(cfg, dim_in)
95 |
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/action_head/inference.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import torch.nn.functional as F
4 |
5 | from alphaction.structures.bounding_box import BoxList
6 |
7 |
8 | class PostProcessor(nn.Module):
9 | def __init__(self, pose_action_num):
10 | super(PostProcessor, self).__init__()
11 | self.pose_action_num = pose_action_num
12 |
13 | def forward(self, x, boxes):
14 | # boxes should be (#detections,4)
15 |         # prob should be calculated in a different way.
16 | class_logits, = x
17 | pose_action_prob = F.softmax(class_logits[:,:self.pose_action_num],-1)
18 | interaction_action_prob = torch.sigmoid(class_logits[:,self.pose_action_num:])
19 |
20 | action_prob = torch.cat((pose_action_prob,interaction_action_prob),1)
21 |
22 | image_shapes = [box.size for box in boxes]
23 | boxes_per_image = [len(box) for box in boxes]
24 | box_tensors = [a.bbox for a in boxes]
25 |
26 | action_prob = action_prob.split(boxes_per_image, dim=0)
27 |
28 | results = []
29 | for prob, boxes_per_image, image_shape in zip(
30 | action_prob, box_tensors, image_shapes
31 | ):
32 | boxlist = self.prepare_boxlist(boxes_per_image, prob, image_shape)
33 | results.append(boxlist)
34 | return results
35 |
36 | def prepare_boxlist(self, boxes, scores, image_shape):
37 | boxlist = BoxList(boxes, image_shape, mode="xyxy")
38 | boxlist.add_field("scores", scores)
39 | return boxlist
40 |
41 |
42 | def make_roi_action_post_processor(cfg):
43 | softmax_num = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES
44 | postprocessor = PostProcessor(softmax_num)
45 | return postprocessor
46 |
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/action_head/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from alphaction.layers import SigmoidFocalLoss, SoftmaxFocalLoss
3 | from alphaction.modeling.utils import cat
4 |
5 |
6 | class ActionLossComputation(object):
7 | def __init__(self, cfg):
8 | self.proposal_per_clip = cfg.MODEL.ROI_ACTION_HEAD.PROPOSAL_PER_CLIP
9 | self.num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES
10 | self.num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES
11 | self.num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES
12 |
13 | self.weight_dict = dict(
14 | loss_pose_action = cfg.MODEL.ROI_ACTION_HEAD.POSE_LOSS_WEIGHT,
15 | loss_object_interaction = cfg.MODEL.ROI_ACTION_HEAD.OBJECT_LOSS_WEIGHT,
16 | loss_person_interaction = cfg.MODEL.ROI_ACTION_HEAD.PERSON_LOSS_WEIGHT,
17 | )
18 |
19 | gamma = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.GAMMA
20 | alpha = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.ALPHA
21 | self.sigmoid_focal_loss = SigmoidFocalLoss(gamma, alpha, reduction="none")
22 | self.softmax_focal_loss = SoftmaxFocalLoss(gamma, alpha, reduction="sum")
23 |
24 | def sample_box(self, boxes):
25 | proposals = []
26 | num_proposals = self.proposal_per_clip
27 | for boxes_per_image in boxes:
28 | num_boxes = len(boxes_per_image)
29 |
30 | if num_boxes > num_proposals:
31 | choice_inds = torch.randperm(num_boxes)[:num_proposals]
32 | proposals_per_image = boxes_per_image[choice_inds]
33 | else:
34 | proposals_per_image = boxes_per_image
35 | proposals_per_image = proposals_per_image.random_aug(0.2, 0.1, 0.1, 0.05)
36 | proposals.append(proposals_per_image)
37 | self._proposals = proposals
38 | return proposals
39 |
40 | def __call__(self, class_logits, avg_box_num):
41 | class_logits = cat(class_logits, dim=0)
42 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \
43 | "The shape of tensor class logits doesn't match total number of action classes."
44 |
45 | if not hasattr(self, "_proposals"):
46 | raise RuntimeError("sample_box needs to be called before")
47 |
48 | proposals = self._proposals
49 |
50 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
51 | assert class_logits.shape[1] == labels.shape[1], \
52 | "The shape of tensor class logits doesn't match the label tensor."
53 |
54 | loss_dict = {}
55 |
56 | if self.num_pose > 0:
57 | pose_label = labels[:, :self.num_pose].argmax(dim=1)
58 | pose_logits = class_logits[:, :self.num_pose]
59 | pose_loss = self.softmax_focal_loss(pose_logits, pose_label) / avg_box_num
60 | loss_dict["loss_pose_action"] = pose_loss
61 |
62 | interaction_label = labels[:, self.num_pose:].to(dtype=torch.float32)
63 | object_label = interaction_label[:, :self.num_object]
64 | person_label = interaction_label[:, self.num_object:]
65 |
66 | interaction_logits = class_logits[:, self.num_pose:]
67 | object_logits = interaction_logits[:, :self.num_object]
68 | person_logits = interaction_logits[:, self.num_object:]
69 |
70 | if self.num_object > 0:
71 | object_loss = self.sigmoid_focal_loss(object_logits, object_label).mean(dim=1).sum() / avg_box_num
72 | loss_dict["loss_object_interaction"] = object_loss
73 | if self.num_person > 0:
74 | person_loss = self.sigmoid_focal_loss(person_logits, person_label).mean(dim=1).sum() / avg_box_num
75 | loss_dict["loss_person_interaction"] = person_loss
76 |
77 | return loss_dict, self.weight_dict
78 |
79 |
80 | def make_roi_action_loss_evaluator(cfg):
81 | loss_evaluator = ActionLossComputation(cfg)
82 |
83 | return loss_evaluator
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/action_head/metric.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from alphaction.modeling.utils import cat
3 |
4 |
5 | class ActionAccuracyComputation(object):
6 | def __init__(self, num_pose, num_object, num_person):
7 | self.num_pose = num_pose
8 | self.num_object = num_object
9 | self.num_person = num_person
10 |
11 | def logic_iou(self, pred, label):
12 | device = pred.device
13 |
14 | version = torch.__version__
15 |         if tuple(int(v) for v in version.split('.')[:2]) >= (1, 3):
16 | pred = pred.bool()
17 | label = label.bool()
18 |
19 | label_union = (pred | label).float().sum(dim=1)
20 | label_inter = (pred & label).float().sum(dim=1)
21 | replacer = torch.ones_like(label_union, device=device)
22 | zero_mask = label_union == 0
23 | label_inter = torch.where(zero_mask, replacer, label_inter)
24 | label_union = torch.where(zero_mask, replacer, label_union)
25 | return label_inter / label_union
26 |
27 | def __call__(self, class_logits, proposals, avg_box_num):
28 | class_logits = [logits.detach() for logits in class_logits]
29 | class_logits = cat(class_logits, dim=0)
30 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \
31 | "The shape of tensor class logits doesn't match total number of action classes."
32 |
33 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
34 |
35 | metric_dict = {}
36 | if self.num_pose>0:
37 | pose_label = labels[:, :self.num_pose].argmax(dim=1)
38 | pose_pred = class_logits[:, :self.num_pose].argmax(dim=1)
39 | accuracy_pose_action = pose_label.eq(pose_pred).float().sum()
40 | metric_dict["accuracy_pose_action"] = accuracy_pose_action / avg_box_num
41 |
42 | interaction_label = labels[:, self.num_pose:]
43 | interaction_logits = class_logits[:, self.num_pose:]
44 | interaction_pred = interaction_logits.sigmoid() > 0.5
45 |
46 | if self.num_object>0:
47 | object_label = interaction_label[:, :self.num_object]
48 | object_pred = interaction_pred[:, :self.num_object]
49 | accuracy_object_interaction = self.logic_iou(object_pred, object_label)
50 | metric_dict["accuracy_object_interaction"] = accuracy_object_interaction.sum() / avg_box_num
51 |
52 | if self.num_person>0:
53 | person_label = interaction_label[:, self.num_object:]
54 | person_pred = interaction_pred[:, self.num_object:]
55 | accuracy_person_interaction = self.logic_iou(person_pred, person_label)
56 | metric_dict["accuracy_person_interaction"] = accuracy_person_interaction.sum() / avg_box_num
57 |
58 | return metric_dict
59 |
60 |
61 | def make_roi_action_accuracy_evaluator(cfg):
62 | num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES
63 | num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES
64 | num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES
65 | return ActionAccuracyComputation(num_pose, num_object, num_person)
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/action_head/roi_action_predictors.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from alphaction.modeling import registry
3 |
4 |
5 | @registry.ROI_ACTION_PREDICTORS.register("FCPredictor")
6 | class FCPredictor(nn.Module):
7 | def __init__(self, config, dim_in):
8 | super(FCPredictor, self).__init__()
9 |
10 | num_classes = config.MODEL.ROI_ACTION_HEAD.NUM_CLASSES
11 |
12 | dropout_rate = config.MODEL.ROI_ACTION_HEAD.DROPOUT_RATE
13 | if dropout_rate > 0:
14 | self.dropout = nn.Dropout(p=dropout_rate, inplace=True)
15 |
16 | self.cls_score = nn.Linear(dim_in, num_classes)
17 |
18 | nn.init.normal_(self.cls_score.weight, std=0.01)
19 | nn.init.constant_(self.cls_score.bias, 0)
20 |
21 | def forward(self, x):
22 | x = x.view(x.size(0), -1)
23 | if hasattr(self, "dropout"):
24 | x = self.dropout(x)
25 | scores = self.cls_score(x)
26 |
27 | return scores
28 |
29 | def c2_weight_mapping(self):
30 | return {"cls_score.weight": "pred_w",
31 | "cls_score.bias": "pred_b"}
32 |
33 |
34 | def make_roi_action_predictor(cfg, dim_in):
35 | func = registry.ROI_ACTION_PREDICTORS[cfg.MODEL.ROI_ACTION_HEAD.PREDICTOR]
36 | return func(cfg, dim_in)
37 |
--------------------------------------------------------------------------------
/alphaction/modeling/roi_heads/roi_heads_3d.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .action_head.action_head import build_roi_action_head
4 |
5 |
6 | class Combined3dROIHeads(torch.nn.ModuleDict):
7 | def __init__(self, cfg, heads):
8 | super(Combined3dROIHeads, self).__init__(heads)
9 | self.cfg = cfg.clone()
10 |
11 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1):
12 | result, loss_action, loss_weight, accuracy_action = self.action(slow_features, fast_features, boxes, objects, extras, part_forward)
13 |
14 | return result, loss_action, loss_weight, accuracy_action
15 |
16 | def c2_weight_mapping(self):
17 | weight_map = {}
18 | for name, m_child in self.named_children():
19 | if m_child.state_dict() and hasattr(m_child,"c2_weight_mapping"):
20 | child_map = m_child.c2_weight_mapping()
21 | for key, val in child_map.items():
22 | new_key = name + '.' + key
23 | weight_map[new_key] = val
24 | return weight_map
25 |
26 |
27 | def build_3d_roi_heads(cfg, dim_in):
28 | roi_heads = []
29 | roi_heads.append(("action", build_roi_action_head(cfg, dim_in)))
30 |
31 | if roi_heads:
32 | roi_heads = Combined3dROIHeads(cfg, roi_heads)
33 |
34 | return roi_heads
35 |
--------------------------------------------------------------------------------
/alphaction/modeling/stm_decoder/util/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 |
--------------------------------------------------------------------------------
/alphaction/modeling/stm_decoder/util/adaptive_mixing_operator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class AdaptiveMixing(nn.Module):
7 | def __init__(self, in_dim, in_points, n_groups, query_dim=None,
8 | out_dim=None, out_points=None, sampling_rate=None):
9 | super(AdaptiveMixing, self).__init__()
10 | out_dim = out_dim if out_dim is not None else in_dim
11 | out_points = out_points if out_points is not None else in_points
12 | query_dim = query_dim if query_dim is not None else in_dim
13 | sampling_rate = sampling_rate if sampling_rate is not None else 1
14 |
15 | self.query_dim = query_dim
16 | self.in_dim = in_dim
17 | self.in_points = in_points//sampling_rate
18 | self.n_groups = n_groups
19 | self.out_dim = out_dim
20 | self.out_points = out_points
21 |
22 | self.eff_in_dim = in_dim//n_groups
23 | self.eff_out_dim = out_dim//n_groups
24 |
25 | self.pad_bias_dim = 0
26 | self.pad_bias_points = 0
27 |
28 | self.eff_in_dim = self.eff_in_dim + self.pad_bias_dim
29 | self.in_points = self.in_points + self.pad_bias_points
30 |
31 | self.REDUCTION = 1
32 |
33 | self.m_parameters = self.eff_in_dim * self.eff_out_dim
34 | self.s_parameters = self.in_points * self.out_points
35 |
36 | self.total_parameters = self.m_parameters + self.s_parameters
37 |
38 | self.parameter_generator = nn.Sequential(
39 | nn.Linear(self.query_dim, self.n_groups*self.total_parameters),
40 | )
41 |
42 | self.out_proj = nn.Linear(
43 | self.eff_out_dim*self.out_points*self.n_groups, self.query_dim, bias=True
44 | )
45 |
46 | self.act = nn.ReLU(inplace=True)
47 |
48 | self._init_weights()
49 |
50 | @torch.no_grad()
51 | def _init_weights(self):
52 | nn.init.zeros_(self.parameter_generator[-1].weight)
53 |
54 | def forward(self, x, query):
55 |
56 | B, N, g, P, C = x.size()
57 | G = self.n_groups
58 | assert g == G
59 |
60 |
61 | '''generate mixing parameters'''
62 | params = self.parameter_generator(query)
63 | params = params.reshape(B*N, G, -1)
64 | out = x.reshape(B*N, G, P, C)
65 |
66 | M, S = params.split(
67 | [self.m_parameters, self.s_parameters], 2)
68 |
69 | M = M.reshape(B*N, G, self.eff_in_dim, self.eff_in_dim)
70 | S = S.reshape(B*N, G, self.out_points, self.in_points)
71 |
72 |
73 | '''adaptive channel mixing
74 | the process also can be done with torch.bmm
75 | but for clarity, we use torch.matmul
76 | '''
77 | out = torch.matmul(out, M)
78 | out = F.layer_norm(out, [out.size(-2), out.size(-1)])
79 | out = self.act(out)
80 |
81 | '''adaptive spatial mixing'''
82 | out = torch.matmul(S, out) # implicitly transpose and matmul
83 | out = F.layer_norm(out, [out.size(-2), out.size(-1)])
84 | out = self.act(out)
85 |
86 | '''linear transfomation to query dim'''
87 | out = out.reshape(B, N, -1)
88 | out = self.out_proj(out)
89 |
90 | out = query + out
91 |
92 | return out
93 |
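
A shape walkthrough of AdaptiveMixing.forward with illustrative sizes (the grouped input must carry in_dim // n_groups channels per group):

import torch
from alphaction.modeling.stm_decoder.util.adaptive_mixing_operator import AdaptiveMixing

B, N, G, P, C = 2, 100, 4, 32, 256        # batch, queries, groups, in_points, channels
mixer = AdaptiveMixing(in_dim=C, in_points=P, n_groups=G, query_dim=C)

x = torch.randn(B, N, G, P, C // G)       # grouped sampled features
query = torch.randn(B, N, C)
out = mixer(x, query)                     # residual update of the query
print(out.shape)                          # torch.Size([2, 100, 256])
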
--------------------------------------------------------------------------------
/alphaction/modeling/stm_decoder/util/msaq.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | def translate_to_linear_weight(ref: torch.Tensor, num_total, tau=2.0):
6 | # ref: [n, n_query, 1, in_points * n_heads]
7 | # num_total: feature levels (typically 4)
8 | grid = torch.arange(num_total, device=ref.device, dtype=ref.dtype).view(
9 | *[len(ref.shape)*[1, ]+[-1, ]])
10 | # [1, 1, 1, 1, num_total]
11 |
12 | ref = ref.unsqueeze(-1).clone()
13 | # [n, n_query, 1, in_points * n_heads, 1]
14 | l2 = (ref-grid).pow(2.0).div(tau).abs().neg()
15 | # [n, n_query, 1, in_points * n_heads, num_total]
16 | weight = torch.softmax(l2, dim=-1)
17 |
18 | return weight
19 |
20 |
21 | def MHAQ3D(sample_points: torch.Tensor, value: torch.Tensor, weight=None, n_points=1):
22 | '''
23 | Args:
24 | sample_points: [n, n_query, 1, in_points * n_heads, 2]
25 | value: [n, c, t, h, w]
26 | weight: [n, n_query, 1, in_points * n_heads]
27 | n_points: in_points
28 |
29 | Returns:
30 | [B,c//n_heads,n_heads,t,in_points,n_query,1]
31 | '''
32 | B, Hq, Wq, n_heads_points, _ = sample_points.shape
33 | # print(value.shape)
34 | B, Ck, Tk, Hk, Wk = value.shape
35 |
36 | n_heads = n_heads_points//n_points
37 |
38 | sample_points = sample_points.view(B, Hq, Wq, n_heads, n_points, 2) \
39 | .permute(0, 3, 1, 2, 4, 5).contiguous().flatten(0, 1)
40 | # n*n_heads, n_query, 1, in_points, 2
41 | sample_points = sample_points.repeat(Tk, 1, 1, 1, 1)
42 | # n*n_heads*Tk, n_query, 1, in_points, 2
43 | sample_points = sample_points.flatten(2, 3)
44 | # n*n_heads*Tk, n_query, in_points, 2
45 | sample_points = sample_points*2.0-1.0
46 | value = value.view(B*n_heads, Ck//n_heads, Tk, Hk, Wk).permute(2, 0, 1, 3, 4).flatten(0, 1)
47 | out = F.grid_sample(
48 | value, sample_points,
49 | mode='bilinear', padding_mode='zeros', align_corners=False,
50 | )
51 | # n*n_heads*Tk, c//n_heads, n_query, in_points
52 |
53 | if weight is not None:
54 | weight = weight.view(B, Hq, Wq, n_heads, n_points) \
55 | .permute(0, 3, 1, 2, 4).flatten(0, 1).flatten(2, 3).unsqueeze(1).repeat(Tk, 1, 1, 1)
56 | # n*n_heads*Tk, 1, n_query, in_points
57 | out *= weight
58 |
59 | return out.view(Tk, B, n_heads, Ck//n_heads, Hq, Wq, n_points).permute(1, 3, 2, 0, 6, 4, 5)
60 |
61 |
62 | def SAMPLE4D(sample_points: torch.Tensor, values: torch.Tensor, featmap_strides, n_points: int = 1, num_levels: int = None, mapping_stride=3.0, tau=2.0, ):
63 | B, Hq, Wq, n_heads_points, _ = sample_points.shape
64 | B, C, t, _, _ = values[0].shape
65 |
66 | n_heads = n_heads_points//n_points
67 |
68 | if num_levels is None:
69 | num_levels = len(values)
70 |
71 | sample_points_xy = sample_points[..., 0:2]
72 | # print(sample_points_xy.shape) torch.Size([2, 100, 1, 128=32*4, 2])
73 | # [n, n_query, 1, in_points * n_heads, 2]
74 |
75 | sample_points_lvl = sample_points[..., 2].clone()
76 | # print(sample_points_lvl.shape) torch.Size([2, 100, 1, 128=32*4])
77 | # [n, n_query, 1, in_points * n_heads]
78 |
79 | sample_points_lvl_mapped = sample_points_lvl - mapping_stride
80 | # print(sample_points_lvl_mapped.shape) torch.Size([2, 100, 1, 128=32*4])
81 | # [n, n_query, 1, in_points * n_heads]
82 |
83 | sample_points_lvl_weight = translate_to_linear_weight(sample_points_lvl_mapped, num_levels, tau=tau)
84 | # print(sample_points_lvl_weight.shape) torch.Size([2, 100, 1, 128=32*4, 4])
85 | # [n, n_query, 1, in_points * n_heads, num_levels]
86 |
87 | sample_points_lvl_weight_list = sample_points_lvl_weight.unbind(-1)
88 | # [[n, n_query, 1, in_points * n_heads],....]
89 |
90 | out = sample_points.new_zeros(B, C//n_heads, n_heads, t, n_points, Hq, Wq)
91 | # print(out.shape) torch.Size([2, 64=256//4, 4, 4, 32, 100, 1])
92 | # n, dim//n_heads, n_heads, t, in_points, n_query, 1
93 |
94 | for i in range(num_levels):
95 | value = values[i]
96 | # B, C, T, H, W
97 | lvl_weights = sample_points_lvl_weight_list[i]
98 | stride = featmap_strides[i]
99 |
100 | mapping_size = value.new_tensor([value.size(4), value.size(3)]).view(1, 1, 1, 1, -1) * stride
101 | normalized_xy = sample_points_xy / mapping_size
102 | # [n, n_query, 1, in_points * n_heads, 2]
103 |
104 | out += MHAQ3D(normalized_xy, value, weight=lvl_weights, n_points=n_points)
105 |
106 | return out, None
107 |
--------------------------------------------------------------------------------
/alphaction/modeling/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Miscellaneous utility functions
3 | """
4 |
5 | import torch
6 | from alphaction.structures.bounding_box import BoxList
7 |
8 |
9 | def cat(tensors, dim=0):
10 | """
11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list
12 | """
13 | assert isinstance(tensors, (list, tuple))
14 | if len(tensors) == 1:
15 | return tensors[0]
16 | return torch.cat(tensors, dim)
17 |
18 | def pad_sequence(sequence, targ_size, padding_value=0):
19 | tensor_size = sequence[0].size()
20 | trailing_dims = tensor_size[1:]
21 | out_dims = (len(sequence), targ_size) + trailing_dims
22 |
23 | out_tensor = sequence[0].new_full(out_dims, padding_value)
24 | for i, tensor in enumerate(sequence):
25 | length = tensor.size(0)
26 | out_tensor[i, :length, ...] = tensor
27 |
28 | return out_tensor
29 |
30 | def prepare_pooled_feature(x_pooled, boxes, detach=True):
31 | image_shapes = [box.size for box in boxes]
32 | boxes_per_image = [len(box) for box in boxes]
33 | box_tensors = [a.bbox for a in boxes]
34 |
35 | if detach:
36 | x_pooled = x_pooled.detach()
37 | pooled_feature = x_pooled.split(boxes_per_image, dim=0)
38 |
39 | boxes_result = []
40 | for feature_per_image, boxes_per_image, image_shape in zip(
41 | pooled_feature, box_tensors, image_shapes
42 | ):
43 | boxlist = BoxList(boxes_per_image, image_shape, mode="xyxy")
44 | boxlist.add_field("pooled_feature", feature_per_image)
45 | boxes_result.append(boxlist)
46 | return boxes_result
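
An illustrative call to pad_sequence above, padding per-image box features to a common first-dimension size (names and shapes are assumptions):

import torch
from alphaction.modeling.utils import pad_sequence

feats = [torch.randn(3, 256), torch.randn(5, 256)]   # 3 and 5 boxes per image
padded = pad_sequence(feats, targ_size=8)
print(padded.shape)                                  # torch.Size([2, 8, 256])
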
--------------------------------------------------------------------------------
/alphaction/solver/__init__.py:
--------------------------------------------------------------------------------
1 | from .build import make_optimizer
2 | from .build import make_lr_scheduler
3 | from .lr_scheduler import WarmupMultiStepLR
4 |
--------------------------------------------------------------------------------
/alphaction/solver/lr_scheduler.py:
--------------------------------------------------------------------------------
1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/solver/lr_scheduler.py
2 | from bisect import bisect_right
3 |
4 | import torch
5 | import math
6 |
7 |
8 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
9 | def __init__(
10 | self,
11 | optimizer,
12 | milestones,
13 | gamma=0.1,
14 | warmup_factor=1.0 / 3,
15 | warmup_iters=500,
16 | warmup_method="linear",
17 | last_epoch=-1,
18 | ):
19 |         if list(milestones) != sorted(milestones):
20 |             raise ValueError(
21 |                 "Milestones should be a list of increasing integers. "
22 |                 "Got {}".format(milestones)
23 |             )
24 |
25 | if warmup_method not in ("constant", "linear"):
26 | raise ValueError(
27 |                 "Only 'constant' or 'linear' warmup_method accepted, "
28 | "got {}".format(warmup_method)
29 | )
30 | self.milestones = milestones
31 | self.gamma = gamma
32 | self.warmup_factor = warmup_factor
33 | self.warmup_iters = warmup_iters
34 | self.warmup_method = warmup_method
35 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)
36 |
37 | def get_lr(self):
38 | warmup_factor = 1
39 | if self.last_epoch < self.warmup_iters:
40 | if self.warmup_method == "constant":
41 | warmup_factor = self.warmup_factor
42 | elif self.warmup_method == "linear":
43 | alpha = float(self.last_epoch) / self.warmup_iters
44 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha
45 | return [
46 | base_lr
47 | * warmup_factor
48 | * self.gamma ** bisect_right(self.milestones, self.last_epoch)
49 | for base_lr in self.base_lrs
50 | ]
51 |
52 | class HalfPeriodCosStepLR(torch.optim.lr_scheduler._LRScheduler):
53 | def __init__(
54 | self,
55 | optimizer,
56 | warmup_factor=1.0 / 3,
57 | warmup_iters=8000,
58 | max_iters=60000,
59 | warmup_method="linear",
60 | last_epoch=-1,
61 | ):
62 | if warmup_method not in ("constant", "linear"):
63 | raise ValueError(
64 |                 "Only 'constant' or 'linear' warmup_method accepted, "
65 | "got {}".format(warmup_method)
66 | )
67 | self.warmup_factor = warmup_factor
68 | self.warmup_iters = warmup_iters
69 | self.max_iters = max_iters
70 | self.warmup_method = warmup_method
71 | super(HalfPeriodCosStepLR, self).__init__(optimizer, last_epoch)
72 |
73 | def get_lr(self):
74 | warmup_factor = 1
75 | if self.last_epoch < self.warmup_iters:
76 | if self.warmup_method == "constant":
77 | warmup_factor = self.warmup_factor
78 | elif self.warmup_method == "linear":
79 | alpha = float(self.last_epoch) / self.warmup_iters
80 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha
81 | else:
82 | warmup_factor = 0.5 * (math.cos(self.last_epoch / self.max_iters * math.pi) + 1)
83 | return [
84 | base_lr
85 | * warmup_factor
86 | for base_lr in self.base_lrs
87 | ]
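
How WarmupMultiStepLR composes linear warmup with step decay, shown on a throwaway optimizer and hypothetical milestones:

import torch
from alphaction.solver.lr_scheduler import WarmupMultiStepLR

opt = torch.optim.SGD(torch.nn.Linear(8, 2).parameters(), lr=0.1)
sched = WarmupMultiStepLR(opt, milestones=[10, 15], gamma=0.1,
                          warmup_factor=1.0 / 3, warmup_iters=5)
for it in range(20):
    # iters 0-4: lr ramps linearly from 0.1/3 to 0.1; from iter 10: x0.1; from iter 15: x0.01
    opt.step()
    sched.step()
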
--------------------------------------------------------------------------------
/alphaction/structures/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/structures/__init__.py
--------------------------------------------------------------------------------
/alphaction/structures/memory_pool.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | class MemoryPool(object):
4 | def __init__(self):
5 | self.cache = defaultdict(dict)
6 |
7 | def update(self, update_info):
8 | for movie_id, feature_per_movie in update_info.items():
9 | self.cache[movie_id].update(feature_per_movie)
10 |
11 | def update_list(self, update_info_list):
12 | for update_info in update_info_list:
13 | self.update(update_info)
14 |
15 | def __getitem__(self, item):
16 | if isinstance(item, tuple) and len(item)==2:
17 | return self.cache[item[0]][item[1]]
18 | return self.cache[item]
19 |
20 | def __setitem__(self, key, value):
21 | if isinstance(key, tuple) and len(key)==2:
22 | self.cache[key[0]][key[1]] = value
23 | else:
24 | self.cache[key] = value
25 |
26 | def __delitem__(self, item):
27 | if isinstance(item, tuple) and len(item)==2:
28 | del self.cache[item[0]][item[1]]
29 | else:
30 | del self.cache[item]
31 |
32 | def __contains__(self, item):
33 | if isinstance(item, tuple) and len(item)==2:
34 | return (item[0] in self.cache and item[1] in self.cache[item[0]])
35 | return (item in self.cache)
36 |
37 | def items(self):
38 | return self.cache.items()
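
MemoryPool stores per-movie features in a two-level dict keyed by (movie_id, timestamp); a quick sketch with made-up keys:

from alphaction.structures.memory_pool import MemoryPool

pool = MemoryPool()
pool["movie_0001", 902] = "feat_at_902"            # tuple key -> nested dict entry
pool.update({"movie_0001": {903: "feat_at_903"}})  # merge a whole per-movie dict

print(("movie_0001", 902) in pool)                 # True
print(pool["movie_0001", 903])                     # "feat_at_903"
del pool["movie_0001", 902]
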
--------------------------------------------------------------------------------
/alphaction/utils/IA_helper.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 | def _block_set(ia_blocks):
4 | if len(ia_blocks) > 0 and isinstance(ia_blocks[0], list):
5 | ia_blocks = list(itertools.chain.from_iterable(ia_blocks))
6 | return ia_blocks
7 |
8 | def has_person(ia_config):
9 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST)
10 | return (ia_config.ACTIVE and 'P' in ia_blocks and ia_config.MAX_PERSON > 0)
11 |
12 |
13 | def has_object(ia_config):
14 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST)
15 | return (ia_config.ACTIVE and 'O' in ia_blocks and ia_config.MAX_OBJECT > 0)
16 |
17 |
18 | def has_memory(ia_config):
19 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST)
20 | return (ia_config.ACTIVE and 'M' in ia_blocks and ia_config.MAX_PER_SEC > 0)
21 |
--------------------------------------------------------------------------------
/alphaction/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/utils/__init__.py
--------------------------------------------------------------------------------
/alphaction/utils/logger.py:
--------------------------------------------------------------------------------
1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/logger.py
2 | import logging
3 | import os
4 | import sys
5 | import time
6 |
7 |
8 | def setup_logger(name, save_dir, distributed_rank, filename=None):
9 | logger = logging.getLogger(name)
10 | logger.setLevel(logging.DEBUG)
11 | logger.propagate = False
12 | # don't log results for the non-master process
13 | if distributed_rank > 0:
14 | return logger
15 | ch = logging.StreamHandler(stream=sys.stdout)
16 | ch.setLevel(logging.DEBUG)
17 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
18 | ch.setFormatter(formatter)
19 | logger.addHandler(ch)
20 |
21 | if save_dir:
22 | if filename is None:
23 | filename = time.strftime("%Y-%m-%d_%H.%M.%S", time.localtime()) + ".log"
24 | fh = logging.FileHandler(os.path.join(save_dir, filename))
25 | fh.setLevel(logging.DEBUG)
26 | fh.setFormatter(formatter)
27 | logger.addHandler(fh)
28 |
29 | return logger
30 |
31 | def setup_tblogger(save_dir, distributed_rank):
32 | if distributed_rank>0:
33 | return None
34 | from tensorboardX import SummaryWriter
35 | tbdir = os.path.join(save_dir,'tb')
36 | os.makedirs(tbdir,exist_ok=True)
37 | tblogger = SummaryWriter(tbdir)
38 | return tblogger
--------------------------------------------------------------------------------
/alphaction/utils/metric_logger.py:
--------------------------------------------------------------------------------
1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/metric_logger.py
2 | from collections import defaultdict
3 | from collections import deque
4 |
5 | import torch
6 |
7 |
8 | class SmoothedValue(object):
9 | """Track a series of values and provide access to smoothed values over a
10 | window or the global series average.
11 | """
12 |
13 | def __init__(self, window_size=10):
14 | self.deque = deque(maxlen=window_size)
15 | # self.series = []
16 | self.total = 0.0
17 | self.count = 0
18 |
19 | def update(self, value):
20 | self.deque.append(value)
21 | # self.series.append(value)
22 | self.count += 1
23 | self.total += value
24 |
25 | @property
26 | def median(self):
27 | d = torch.tensor(list(self.deque))
28 | return d.median().item()
29 |
30 | @property
31 | def avg(self):
32 | d = torch.tensor(list(self.deque))
33 | return d.mean().item()
34 |
35 | @property
36 | def global_avg(self):
37 | return self.total / self.count
38 |
39 |
40 | class MetricLogger(object):
41 | def __init__(self, delimiter="\t"):
42 | self.meters = defaultdict(SmoothedValue)
43 | self.delimiter = delimiter
44 |
45 | def update(self, **kwargs):
46 | for k, v in kwargs.items():
47 | if isinstance(v, torch.Tensor):
48 | v = v.item()
49 | assert isinstance(v, (float, int))
50 | self.meters[k].update(v)
51 |
52 | def __getattr__(self, attr):
53 | if attr in self.meters:
54 | return self.meters[attr]
55 | if attr in self.__dict__:
56 | return self.__dict__[attr]
57 | raise AttributeError("'{}' object has no attribute '{}'".format(
58 | type(self).__name__, attr))
59 |
60 | def __str__(self):
61 | loss_str = []
62 | for name, meter in self.meters.items():
63 | loss_str.append(
64 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg)
65 | )
66 | return self.delimiter.join(loss_str)
67 |
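
A small usage sketch of MetricLogger with made-up metric names; tensor values are converted to floats and smoothed over a 10-step window:

import torch
from alphaction.utils.metric_logger import MetricLogger

meters = MetricLogger(delimiter="  ")
for step in range(25):
    meters.update(loss_action=0.5 / (step + 1), lr=torch.tensor(1e-4))
print(str(meters))              # "loss_action: median (global_avg)  lr: ..."
print(meters.loss_action.avg)   # windowed average over the last 10 updates
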
--------------------------------------------------------------------------------
/alphaction/utils/model_serialization.py:
--------------------------------------------------------------------------------
1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/model_serialization.py
2 | from collections import OrderedDict
3 | import logging
4 |
5 | import torch
6 |
7 |
8 | def align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head):
9 | """
10 |     Strategy: the model we create may have prefixes prepended to each of its keys,
11 |     for example due to an extra level of nesting that the original pre-trained
12 |     weights from ImageNet do not contain. For example, model.state_dict() might
13 |     return backbone[0].body.res2.conv1.weight, while the pre-trained model contains
14 |     res2.conv1.weight. We want to match these two parameters.
15 |     To do so, for each model weight we look among all loaded keys for one that is a
16 |     suffix of the current weight name, and use it if such a key exists.
17 |     If multiple matches exist, we take the one with the longest matching name.
18 |     For example, for the same model as before, the pretrained weight file can
19 |     contain both res2.conv1.weight and conv1.weight. In this case, we want to
20 |     match backbone[0].body.conv1.weight to conv1.weight, and
21 |     backbone[0].body.res2.conv1.weight to res2.conv1.weight.
22 | """
23 | current_keys = sorted(list(model_state_dict.keys()))
24 | loaded_keys = sorted(list(loaded_state_dict.keys()))
25 |     # get a matrix of string matches, where each (i, j) entry corresponds to the size
26 |     # of the loaded_key string, if it matches
27 | match_matrix = [
28 | len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys
29 | ]
30 | match_matrix = torch.as_tensor(match_matrix).view(
31 | len(current_keys), len(loaded_keys)
32 | )
33 | max_match_size, idxs = match_matrix.max(1)
34 | # remove indices that correspond to no-match
35 | idxs[max_match_size == 0] = -1
36 |
37 | # used for logging
38 | max_size = max([len(key) for key in current_keys]) if current_keys else 1
39 | max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1
40 | # log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
41 | logger = logging.getLogger(__name__)
42 | for idx_new, idx_old in enumerate(idxs.tolist()):
43 | if idx_old == -1:
44 | continue
45 | key = current_keys[idx_new]
46 | key_old = loaded_keys[idx_old]
47 |
48 | if no_head and key_old.startswith("roi_heads."):
49 | logger.info("{} will not be loaded.".format(key))
50 | continue
51 |
52 | model_state_dict[key] = loaded_state_dict[key_old]
53 | # logger.info(
54 | # log_str_template.format(
55 | # key,
56 | # max_size,
57 | # key_old,
58 | # max_size_loaded,
59 | # tuple(loaded_state_dict[key_old].shape),
60 | # )
61 | # )
62 |
63 |
64 | def strip_prefix_if_present(state_dict, prefix):
65 | keys = sorted(state_dict.keys())
66 | if not all(key.startswith(prefix) for key in keys):
67 | return state_dict
68 | stripped_state_dict = OrderedDict()
69 | for key, value in state_dict.items():
70 | stripped_state_dict[key.replace(prefix, "")] = value
71 | return stripped_state_dict
72 |
73 |
74 | def exclude_layers(model_state_dict, excluded):
75 | model_state_dict_new = OrderedDict()
76 | for key, value in model_state_dict.items():
77 | if any([exc in key for exc in excluded]):
78 | continue
79 | model_state_dict_new[key] = value
80 | return model_state_dict_new
81 |
82 |
83 | def load_state_dict(model, loaded_state_dict, no_head, excluded=[]):
84 | model_state_dict = model.state_dict()
85 | # if the state_dict comes from a model that was wrapped in a
86 | # DataParallel or DistributedDataParallel during serialization,
87 | # remove the "module" prefix before performing the matching
88 | loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.")
89 |
90 | if len(excluded) > 0:
91 | # exclude specified layers
92 | loaded_state_dict = exclude_layers(loaded_state_dict, excluded)
93 |
94 | align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head)
95 |
96 | # use strict loading
97 | model.load_state_dict(model_state_dict)
98 |
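To make the suffix-matching strategy above concrete, here is a toy sketch; the wrapper module and checkpoint keys are invented for illustration. A checkpoint saved from a DataParallel-wrapped linear layer carries the "module." prefix, which is stripped before each remaining key is matched as a suffix of a model key.

import torch
import torch.nn as nn
from alphaction.utils.model_serialization import load_state_dict

class Wrapper(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(4, 2)   # model keys: backbone.weight, backbone.bias

model = Wrapper()
# pretend this came from a DataParallel-wrapped nn.Linear(4, 2)
ckpt = {"module.weight": torch.randn(2, 4), "module.bias": torch.randn(2)}
# "module." is stripped, then "weight" matches as a suffix of "backbone.weight"
load_state_dict(model, ckpt, no_head=False)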
--------------------------------------------------------------------------------
/alphaction/utils/random_seed.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import random
3 | import numpy as np
4 |
5 | def set_seed(seed, rank, world_size):
6 | rng = random.Random(seed)
7 | seed_per_rank = [rng.randint(0, 2**32-1) for _ in range(world_size)]
8 | cur_seed = seed_per_rank[rank]
9 | random.seed(cur_seed)
10 | torch.manual_seed(cur_seed)
11 | torch.cuda.manual_seed(cur_seed)
12 | np.random.seed(cur_seed)
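Every process calls this helper with the same base seed and its own rank; the per-rank seeds are drawn from a local random.Random(seed) stream, so they differ across ranks yet stay reproducible across runs. A minimal sketch (the values are illustrative):

from alphaction.utils.random_seed import set_seed

# rank 0 of a 4-process job; ranks 1-3 make the same call with their own rank
set_seed(seed=42, rank=0, world_size=4)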
--------------------------------------------------------------------------------
/alphaction/utils/registry.py:
--------------------------------------------------------------------------------
1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/registry.py
2 |
3 | def _register_generic(module_dict, module_name, module):
4 | assert module_name not in module_dict
5 | module_dict[module_name] = module
6 |
7 |
8 | class Registry(dict):
9 | '''
10 |     A helper class for managing the registration of modules. It extends a
11 |     dictionary and provides a register function.
12 |
13 |     E.g. creating a registry:
14 |         some_registry = Registry({"default": default_module})
15 |
16 |     There are two ways of registering new modules:
17 |     1): normal way is just calling the register function:
18 |         def foo():
19 |             ...
20 |         some_registry.register("foo_module", foo)
21 |     2): used as a decorator when declaring the module:
22 |         @some_registry.register("foo_module")
23 |         @some_registry.register("foo_module_nickname")
24 |         def foo():
25 |             ...
26 |
27 |     Access of a module is just like using a dictionary, e.g.:
28 |         f = some_registry["foo_module"]
29 | '''
30 | def __init__(self, *args, **kwargs):
31 | super(Registry, self).__init__(*args, **kwargs)
32 |
33 | def register(self, module_name, module=None):
34 | # used as function call
35 | if module is not None:
36 | _register_generic(self, module_name, module)
37 | return
38 |
39 | # used as decorator
40 | def register_fn(fn):
41 | _register_generic(self, module_name, fn)
42 | return fn
43 |
44 | return register_fn
45 |
--------------------------------------------------------------------------------
/alphaction/utils/video_decode.py:
--------------------------------------------------------------------------------
1 | import av
2 |
3 | def av_decode_video(video_path):
4 |     frames = []
5 |     try:
6 |         with av.open(video_path) as container:
7 |             for frame in container.decode(video=0):
8 |                 frames.append(frame.to_rgb().to_ndarray())
9 |         return frames
10 |     except Exception:
11 |         # return whatever frames were decoded before the failure
12 |         assert len(frames) != 0
13 |         return frames
--------------------------------------------------------------------------------
/alphaction/utils/visualize.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, List
2 | import torch
3 | from torchvision.ops import box_convert
4 | import numpy as np
5 | import supervision as sv
6 | import cv2
7 | import imageio
8 |
9 |
10 | def annotate(image_source: np.ndarray, boxes: torch.Tensor, normalized=True, logits=None, phrases=[], is_xyxy=False, color=None, text_padding=10, set_text_color='black') -> np.ndarray:
11 | h, w, _ = image_source.shape
12 | if normalized:
13 | boxes = boxes * torch.Tensor([w, h, w, h])
14 | if not is_xyxy:
15 | assert isinstance(boxes, torch.Tensor)
16 | xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
17 | elif isinstance(boxes, torch.Tensor):
18 | xyxy = boxes.numpy()
19 | else:
20 | xyxy = boxes
21 | detections = sv.Detections(xyxy=xyxy)
22 |
23 | if logits is not None and len(phrases) == logits.size(0):
24 | labels = [
25 | f"{phrase} {logit:.2f}"
26 | for phrase, logit
27 | in zip(phrases, logits)
28 | ]
29 | else:
30 | labels = phrases
31 |
32 | if color is None or (not isinstance(color, tuple)):
33 | svcolor = sv.ColorPalette.default()
34 | else:
35 | svcolor = sv.Color(*color)
36 | text_color = sv.Color.white() if set_text_color == 'white' else sv.Color.black()
37 | box_annotator = sv.BoxAnnotator(color=svcolor, text_padding=text_padding, text_color=text_color)
38 | annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
39 | annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
40 | return annotated_frame
41 |
42 |
43 | def video_to_gif(video, giffile, fps=5.0, toBGR=False):
44 | assert giffile.endswith('.gif')
45 | with imageio.get_writer(giffile, mode='I', duration=1.0/fps, loop=0) as writer:
46 | for frame in video:
47 | frame_vis = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) if toBGR else np.copy(frame)
48 | writer.append_data(frame_vis)
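A brief usage sketch for the two helpers above, assuming the same supervision/imageio versions the repository targets (the frame, box, and output path are illustrative):

import numpy as np
import torch
from alphaction.utils.visualize import annotate, video_to_gif

frame = np.zeros((240, 320, 3), dtype=np.uint8)                    # H x W x 3, RGB
boxes = torch.tensor([[0.5, 0.5, 0.4, 0.6]])                       # normalized cxcywh
vis = annotate(frame, boxes, normalized=True, phrases=["person"])  # returns a BGR frame
# toBGR=True swaps the R/B channels, turning the BGR frame back into RGB for the GIF
video_to_gif([vis, vis], "demo.gif", fps=2.0, toBGR=True)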
--------------------------------------------------------------------------------
/assets/wacv25_openmixer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/assets/wacv25_openmixer.png
--------------------------------------------------------------------------------
/config_files/jhmdb/openmixer_e2e.yaml:
--------------------------------------------------------------------------------
1 | DATA:
2 | PATH_TO_DATA_DIR: "data/JHMDB"
3 | NUM_FRAMES: 16
4 | SAMPLING_RATE: 1
5 | MEAN: [0.48145466, 0.4578275, 0.40821073]
6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP)
7 | DATASETS: ['jhmdb']
8 | OPEN_VOCABULARY: True
9 | REFINE_VOCAB: True
10 | JHMDB:
11 | FRAME_DIR: "Frames/"
12 | OPEN_WORLD_DIR: 'openworld'
13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl'
14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl'
15 | SAMPLES_SPLIT: 0
16 | VOCAB_REFINE: 'vocab_gpt3.5.json'
17 | MODEL:
18 | WEIGHT: null
19 | BACKBONE:
20 | CONV_BODY: "ViP-B/16"
21 | PATHWAYS: 1
22 | RESIDUAL_LATERAL: True
23 | STM:
24 | NUM_QUERIES: 100
25 | HIDDEN_DIM: 512
26 | NUM_STAGES: 3
27 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15
28 | OBJECT_CLASSES: 1
29 | NUM_HEADS: 8
30 | DROPOUT: 0.0
31 | DIM_FEEDFORWARD: 2048
32 | NUM_FCS: 2
33 | ACTIVATION: 'ReLU'
34 | SPATIAL_POINTS: 32
35 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES
36 | OUT_MULTIPLIER: 4
37 | N_GROUPS: 4
38 | NUM_CLS: 1
39 | NUM_ACT: 1
40 | NUM_REG: 1
41 | OBJECT_WEIGHT: 2.0
42 | ACTION_WEIGHT: 48.0
43 | GIOU_WEIGHT: 2.0
44 | L1_WEIGHT: 2.0
45 | BACKGROUND_WEIGHT: 0.1
46 | INTERMEDIATE_SUPERVISION: True
47 | PERSON_THRESHOLD: 0.6
48 | USE_CLS_FEAT: True
49 | COND_CLS: True
50 | FUSE_CLS: True
51 | FUSE_METHOD: 'logit_fusion'
52 | FUSE_FACTOR: 0.99
53 | DeST: True
54 | TEXT_ENCODER: 'CLIPViP'
55 | CLIPViP:
56 | ARCH: ViP-B/16
57 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface
58 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt"
59 | TEMPORAL_SIZE: 12
60 | USE_TEMPORAL_EMBED: True
61 | LOGIT_SCALE_INIT: 4.6
62 | ADD_CLS_NUM: 3
63 | CONTEXT_INIT: 'a '
64 | LEN_CONTEXT: 24
65 | CAM_METHOD: 'RITSM'
66 | USE_ATTN: False
67 | MULTI_LABEL_ACTION: False # softmax
68 | ViT:
69 | LAYER_DECAY: 1.0
70 | WEIGHT_DECAY: 1e-5
71 | SOLVER:
72 | MAX_EPOCH: 12
73 | BASE_LR: 0.00001
74 | WEIGHT_DECAY: 1e-4
75 | STEPS: (5, 8)
76 | WARMUP_FACTOR: 0.1
77 | WARMUP_EPOCH: 2
78 | CHECKPOINT_PERIOD: 1
79 | EVAL_PERIOD: 1
80 | EVAL_AFTER: 2
81 | VIDEOS_PER_BATCH: 16
82 | OPTIMIZING_METHOD: 'adamw'
83 | TEST:
84 | VIDEOS_PER_BATCH: 16
85 | EVAL_OPEN: True
86 | METRIC: 'video_ap'
87 | SMALL_OPEN_WORLD: True
88 | INDEPENDENT_EVAL: True
89 | OUTPUT_DIR: "output/jhmdb/openmixer_e2e"
90 |
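These YAML files are not consumed directly: they are merged into the yacs defaults from alphaction/config/defaults.py and can be further overridden from the command line, exactly as test_net.py does later in this listing. A minimal sketch of that flow (the override values are only examples):

from alphaction.config import cfg

cfg.merge_from_file("config_files/jhmdb/openmixer_e2e.yaml")
# command-line style overrides come as alternating KEY, VALUE pairs
cfg.merge_from_list(["SOLVER.BASE_LR", 0.00002, "TEST.VIDEOS_PER_BATCH", 8])
cfg.freeze()
print(cfg.MODEL.STM.NUM_QUERIES)   # 100, taken from the YAML above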
--------------------------------------------------------------------------------
/config_files/jhmdb/openmixer_zsr_tl.yaml:
--------------------------------------------------------------------------------
1 | DATA:
2 | PATH_TO_DATA_DIR: "data/JHMDB"
3 | NUM_FRAMES: 16
4 | SAMPLING_RATE: 1
5 | MEAN: [0.48145466, 0.4578275, 0.40821073]
6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP)
7 | DATASETS: ['jhmdb']
8 | OPEN_VOCABULARY: True
9 | REFINE_VOCAB: True
10 | JHMDB:
11 | FRAME_DIR: "Frames/"
12 | OPEN_WORLD_DIR: 'openworld'
13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl'
14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl'
15 | SAMPLES_SPLIT: 0
16 | VOCAB_REFINE: 'vocab_gpt3.5.json'
17 | MODEL:
18 | WEIGHT: null
19 | BACKBONE:
20 | CONV_BODY: "ViP-B/16"
21 | PATHWAYS: 1
22 | STM:
23 | NUM_QUERIES: 100
24 | HIDDEN_DIM: 512
25 | NUM_STAGES: 3
26 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15
27 | OBJECT_CLASSES: 1
28 | NUM_HEADS: 8
29 | DROPOUT: 0.0
30 | DIM_FEEDFORWARD: 2048
31 | NUM_FCS: 2
32 | ACTIVATION: 'ReLU'
33 | SPATIAL_POINTS: 32
34 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES
35 | OUT_MULTIPLIER: 4
36 | N_GROUPS: 4
37 | NUM_CLS: 1
38 | NUM_ACT: 1
39 | NUM_REG: 1
40 | OBJECT_WEIGHT: 2.0
41 | ACTION_WEIGHT: 48.0
42 | GIOU_WEIGHT: 2.0
43 | L1_WEIGHT: 2.0
44 | BACKGROUND_WEIGHT: 0.1
45 | INTERMEDIATE_SUPERVISION: True
46 | PERSON_THRESHOLD: 0.6
47 | USE_CLS_FEAT: True
48 | PRETRAIN_ACTION: True
49 | TEXT_ENCODER: 'CLIPViP'
50 | CLIPViP:
51 | ARCH: ViP-B/16
52 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface
53 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt"
54 | TEMPORAL_SIZE: 12
55 | USE_TEMPORAL_EMBED: True
56 | LOGIT_SCALE_INIT: 4.6
57 | ADD_CLS_NUM: 3
58 | CONTEXT_INIT: 'a '
59 | LEN_CONTEXT: 24
60 | CAM_METHOD: 'RITSM'
61 | USE_ATTN: False
62 | MULTI_LABEL_ACTION: False # softmax
63 | ViT:
64 | LAYER_DECAY: 1.0
65 | WEIGHT_DECAY: 1e-5
66 | SOLVER:
67 | MAX_EPOCH: 12
68 | BASE_LR: 0.00001
69 | WEIGHT_DECAY: 1e-4
70 | STEPS: (5, 8)
71 | WARMUP_FACTOR: 0.1
72 | WARMUP_EPOCH: 2
73 | CHECKPOINT_PERIOD: 1
74 | EVAL_PERIOD: 1
75 | EVAL_AFTER: 2
76 | VIDEOS_PER_BATCH: 16
77 | OPTIMIZING_METHOD: 'adamw'
78 | TEST:
79 | VIDEOS_PER_BATCH: 16
80 | EVAL_OPEN: True
81 | METRIC: 'video_ap'
82 | SMALL_OPEN_WORLD: True
83 | INDEPENDENT_EVAL: True
84 | OUTPUT_DIR: "output/jhmdb/openmixer_zsr_tl"
85 |
--------------------------------------------------------------------------------
/config_files/jhmdb/openmixer_zsr_zsl.yaml:
--------------------------------------------------------------------------------
1 | DATA:
2 | PATH_TO_DATA_DIR: "data/JHMDB"
3 | NUM_FRAMES: 16
4 | SAMPLING_RATE: 1
5 | MEAN: [0.48145466, 0.4578275, 0.40821073]
6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP)
7 | DATASETS: ['jhmdb']
8 | OPEN_VOCABULARY: True
9 | REFINE_VOCAB: True
10 | JHMDB:
11 | FRAME_DIR: "Frames/"
12 | OPEN_WORLD_DIR: 'openworld'
13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl'
14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl'
15 | SAMPLES_SPLIT: 0
16 | VOCAB_REFINE: 'vocab_gpt3.5.json'
17 | PRIOR_BOX_FILE: 'JHMDB-MaskRCNN.pkl'
18 | MODEL:
19 | DET: NaiveBaseline
20 | MULTI_LABEL_ACTION: False
21 | PRIOR_BOXES_INIT: 'det'
22 | WEIGHT: null
23 | BACKBONE:
24 | CONV_BODY: "ViP-B/16"
25 | PATHWAYS: 1
26 | STM:
27 | USE_CLS_FEAT: True
28 | TEXT_ENCODER: 'CLIPViP'
29 | CLIPViP:
30 | ARCH: ViP-B/16
31 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface
32 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt"
33 | TEMPORAL_SIZE: 12
34 | USE_TEMPORAL_EMBED: True
35 | LOGIT_SCALE_INIT: 4.6
36 | ADD_CLS_NUM: 3
37 | # CONTEXT_INIT: 'a video of '
38 | LEN_CONTEXT: 24
39 | TEST:
40 | VIDEOS_PER_BATCH: 32
41 | EVAL_OPEN: True
42 | METRIC: 'video_ap'
43 | SMALL_OPEN_WORLD: True
44 | INDEPENDENT_EVAL: True
45 | OUTPUT_DIR: "output/jhmdb/openmixer_zsr_zsl"
46 |
--------------------------------------------------------------------------------
/config_files/ucf24/openmixer_e2e.yaml:
--------------------------------------------------------------------------------
1 | DATA:
2 | PATH_TO_DATA_DIR: "data/UCF24"
3 | NUM_FRAMES: 16
4 | SAMPLING_RATE: 1
5 | MEAN: [0.48145466, 0.4578275, 0.40821073]
6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP)
7 | DATASETS: ['ucf24']
8 | OPEN_VOCABULARY: True
9 | REFINE_VOCAB: True
10 | UCF24:
11 | FRAME_DIR: "rgb-images"
12 | OPEN_WORLD_DIR: 'openworld'
13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl'
14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl'
15 | VOCAB_REFINE: 'vocab_gpt4.json'
16 | # PRIOR_BOX_FILE: 'UCF24-GDINO-top10.pkl' # 'UCF24-MaskRCNN.pkl', 'UCF24-GDINO-top10.pkl'
17 | MODEL:
18 | WEIGHT: null
19 | BACKBONE:
20 | CONV_BODY: "ViP-B/16"
21 | PATHWAYS: 1
22 | RESIDUAL_LATERAL: True
23 | STM:
24 | NUM_QUERIES: 100
25 | HIDDEN_DIM: 512
26 | NUM_STAGES: 3
27 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15
28 | OBJECT_CLASSES: 1
29 | NUM_HEADS: 8
30 | DROPOUT: 0.0
31 | DIM_FEEDFORWARD: 2048
32 | NUM_FCS: 2
33 | ACTIVATION: 'ReLU'
34 | SPATIAL_POINTS: 32
35 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES
36 | OUT_MULTIPLIER: 4
37 | N_GROUPS: 4
38 | NUM_CLS: 1
39 | NUM_ACT: 1
40 | NUM_REG: 1
41 | OBJECT_WEIGHT: 2.0
42 | ACTION_WEIGHT: 8.0
43 | GIOU_WEIGHT: 2.0
44 | L1_WEIGHT: 2.0
45 | BACKGROUND_WEIGHT: 0.1
46 | INTERMEDIATE_SUPERVISION: True
47 | PERSON_THRESHOLD: 0.6
48 | USE_CLS_FEAT: True
49 | COND_CLS: True
50 | FUSE_CLS: True
51 | FUSE_METHOD: 'logit_fusion'
52 | FUSE_FACTOR: 0.999
53 | DeST: True
54 | TEXT_ENCODER: 'CLIPViP'
55 | CLIPViP:
56 | ARCH: ViP-B/16
57 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface
58 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt"
59 | TEMPORAL_SIZE: 12
60 | USE_TEMPORAL_EMBED: True
61 | LOGIT_SCALE_INIT: 4.6
62 | ADD_CLS_NUM: 3
63 | CONTEXT_INIT: ''
64 | LEN_CONTEXT: 24
65 | CAM_METHOD: 'RITSM'
66 | USE_ATTN: False
67 | MULTI_LABEL_ACTION: False # softmax
68 | ViT:
69 | LAYER_DECAY: 1.0
70 | WEIGHT_DECAY: 1e-5
71 | SOLVER:
72 | MAX_EPOCH: 12
73 | BASE_LR: 0.00001
74 | WEIGHT_DECAY: 1e-4
75 | STEPS: (5, 8)
76 | WARMUP_FACTOR: 0.1
77 | WARMUP_EPOCH: 2
78 | CHECKPOINT_PERIOD: 1
79 | EVAL_PERIOD: 1
80 | EVAL_AFTER: 2
81 | VIDEOS_PER_BATCH: 8
82 | OPTIMIZING_METHOD: 'adamw'
83 | TEST:
84 | VIDEOS_PER_BATCH: 16
85 | EVAL_OPEN: True
86 | METRIC: 'video_ap'
87 | SMALL_OPEN_WORLD: True
88 | INDEPENDENT_EVAL: True
89 | IOU_THRESH: 0.2
90 | # PRIOR_BOX_TEST: True
91 | OUTPUT_DIR: "output/ucf24/openmixer_e2e"
92 |
--------------------------------------------------------------------------------
/config_files/ucf24/openmixer_zsr_tl.yaml:
--------------------------------------------------------------------------------
1 | DATA:
2 | PATH_TO_DATA_DIR: "data/UCF24"
3 | NUM_FRAMES: 16
4 | SAMPLING_RATE: 1
5 | MEAN: [0.48145466, 0.4578275, 0.40821073]
6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP)
7 | DATASETS: ['ucf24']
8 | OPEN_VOCABULARY: True
9 | REFINE_VOCAB: True
10 | UCF24:
11 | FRAME_DIR: "rgb-images"
12 | OPEN_WORLD_DIR: 'openworld'
13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl'
14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl'
15 | VOCAB_REFINE: 'vocab_gpt4.json'
16 | MODEL:
17 | WEIGHT: null
18 | BACKBONE:
19 | CONV_BODY: "ViP-B/16"
20 | PATHWAYS: 1
21 | RESIDUAL_LATERAL: True
22 | STM:
23 | NUM_QUERIES: 100
24 | HIDDEN_DIM: 512
25 | NUM_STAGES: 3
26 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15
27 | OBJECT_CLASSES: 1
28 | NUM_HEADS: 8
29 | DROPOUT: 0.0
30 | DIM_FEEDFORWARD: 2048
31 | NUM_FCS: 2
32 | ACTIVATION: 'ReLU'
33 | SPATIAL_POINTS: 32
34 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES
35 | OUT_MULTIPLIER: 4
36 | N_GROUPS: 4
37 | NUM_CLS: 1
38 | NUM_ACT: 1
39 | NUM_REG: 1
40 | OBJECT_WEIGHT: 2.0
41 | ACTION_WEIGHT: 48.0
42 | GIOU_WEIGHT: 2.0
43 | L1_WEIGHT: 2.0
44 | BACKGROUND_WEIGHT: 0.1
45 | INTERMEDIATE_SUPERVISION: True
46 | PERSON_THRESHOLD: 0.6
47 | USE_CLS_FEAT: True
48 | PRETRAIN_ACTION: True
49 | TEXT_ENCODER: 'CLIPViP'
50 | CLIPViP:
51 | ARCH: ViP-B/16
52 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface
53 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt"
54 | TEMPORAL_SIZE: 12
55 | USE_TEMPORAL_EMBED: True
56 | LOGIT_SCALE_INIT: 4.6
57 | ADD_CLS_NUM: 3
58 | CONTEXT_INIT: ''
59 | LEN_CONTEXT: 24
60 | CAM_METHOD: 'RITSM'
61 | USE_ATTN: False
62 | MULTI_LABEL_ACTION: False # softmax
63 | ViT:
64 | LAYER_DECAY: 1.0
65 | WEIGHT_DECAY: 1e-5
66 | SOLVER:
67 | MAX_EPOCH: 12
68 | BASE_LR: 0.00001
69 | WEIGHT_DECAY: 1e-4
70 | STEPS: (5, 8)
71 | WARMUP_FACTOR: 0.1
72 | WARMUP_EPOCH: 2
73 | CHECKPOINT_PERIOD: 1
74 | EVAL_PERIOD: 1
75 | EVAL_AFTER: 2
76 | VIDEOS_PER_BATCH: 16
77 | OPTIMIZING_METHOD: 'adamw'
78 | TEST:
79 | VIDEOS_PER_BATCH: 16
80 | EVAL_OPEN: True
81 | METRIC: 'video_ap'
82 | SMALL_OPEN_WORLD: True
83 | INDEPENDENT_EVAL: True
84 | IOU_THRESH: 0.2
85 | OUTPUT_DIR: "output/ucf24/openmixer_zsr_tl"
86 |
--------------------------------------------------------------------------------
/config_files/ucf24/openmixer_zsr_zsl.yaml:
--------------------------------------------------------------------------------
1 | DATA:
2 | PATH_TO_DATA_DIR: "data/UCF24"
3 | NUM_FRAMES: 16
4 | SAMPLING_RATE: 1
5 | MEAN: [0.48145466, 0.4578275, 0.40821073]
6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP)
7 | DATASETS: ['ucf24']
8 | OPEN_VOCABULARY: True
9 | REFINE_VOCAB: True
10 | UCF24:
11 | FRAME_DIR: "rgb-images"
12 | OPEN_WORLD_DIR: 'openworld'
13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl'
14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl'
15 | VOCAB_REFINE: 'vocab_gpt4.json'
16 | PRIOR_BOX_FILE: 'UCF24-MaskRCNN.pkl'
17 | MODEL:
18 | DET: NaiveBaseline
19 | MULTI_LABEL_ACTION: False
20 | PRIOR_BOXES_INIT: 'det' # prior boxes in testing
21 | WEIGHT: null
22 | BACKBONE:
23 | CONV_BODY: "ViP-B/16"
24 | PATHWAYS: 1
25 | STM:
26 | USE_CLS_FEAT: True
27 | TEXT_ENCODER: 'CLIPViP'
28 | CLIPViP:
29 | ARCH: ViP-B/16
30 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface
31 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt"
32 | TEMPORAL_SIZE: 12
33 | USE_TEMPORAL_EMBED: True
34 | LOGIT_SCALE_INIT: 4.6
35 | ADD_CLS_NUM: 3
36 | # CONTEXT_INIT: 'a video of '
37 | LEN_CONTEXT: 24
38 | TEST:
39 | VIDEOS_PER_BATCH: 64
40 | EVAL_OPEN: True
41 | METRIC: 'video_ap'
42 | SMALL_OPEN_WORLD: True
43 | INDEPENDENT_EVAL: True
44 | IOU_THRESH: 0.2
45 | OUTPUT_DIR: "output/ucf24/openmixer_zsr_zsl"
46 |
--------------------------------------------------------------------------------
/preprocess/generate_vdt_jhmdb.py:
--------------------------------------------------------------------------------
1 | import random
2 | import re
3 | import os
4 | import copy
5 | import json
6 |
7 | import openai
8 | openai.api_key = "YOUR_OPENAI_KEY_HERE"
9 |
10 |
11 | def read_class_list(filepath):
12 | class_list = []
13 | with open(filepath, 'r') as f:
14 | for line in f.readlines():
15 | class_list.append(line.strip())
16 | return class_list
17 |
18 | def read_class_description(filepath):
19 | with open(filepath, 'r') as f:
20 | refine_maps = json.load(f)
21 | return refine_maps
22 |
23 |
24 | def run_gpt4(class_name):
25 | prompt = """
26 | What are the visual features for distinguishing {}? Please describe with a few short sentences.
27 | """
28 | cls_name = re.sub("_", " ", class_name)
29 | message = [
30 | {"role": "system", "content": "You are a useful assistant."},
31 | {"role": "user", "content": prompt.format(cls_name)}
32 | ]
33 |
34 | response = openai.ChatCompletion.create(
35 | model="gpt-4-0613",
36 | max_tokens=1024,
37 | temperature=1.2,
38 | messages = message)
39 |
40 | # parse the response
41 | result = response['choices'][0]['message']['content']
42 | return result
43 |
44 |
45 | def generate_different_meaning():
46 | jhmdb_classes = read_class_list(os.path.join(data_path, 'vocab_open.txt'))
47 |
48 | results = {}
49 | for clsname in jhmdb_classes:
50 | print("\nProcessing action: {}...".format(clsname))
51 | cls_name = re.sub("_", " ", clsname)
52 | prompt = f"Generate 16 unique sentences describing the action '{cls_name}':"
53 | message = [
54 | {"role": "system", "content": "You are a useful assistant."},
55 | {"role": "user", "content": prompt}
56 | ]
57 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed
58 | res = response.choices[0]['message']['content'].strip().split('\n')
59 | print(res)
60 |
61 | results[clsname] = res
62 |
63 | with open(os.path.join(data_path, "vocab_gpt4_m16.json"), "w") as outfile:
64 | json.dump(results, outfile)
65 |
66 |
67 | def generate_same_meaning():
68 | class_descriptions = read_class_description(os.path.join(data_path, 'vocab_gpt3.5.json'))
69 |
70 | results = {}
71 | for clsname, desc in class_descriptions.items():
72 | print("\nProcessing action: {}...".format(clsname))
73 | cls_name = re.sub("_", " ", clsname)
74 | cap_prefix, cap = desc.split(": ")
75 |         prompt = f"Given a sport action type from JHMDB dataset, such as '{cls_name}', please provide 16 different sentences that express the same meaning of the caption: '{cap}'."
76 | message = [
77 | {"role": "system", "content": "You are a useful assistant."},
78 | {"role": "user", "content": prompt}
79 | ]
80 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed
81 | res = response.choices[0]['message']['content'].strip().split('\n')
82 |         res = [desc] + [re.sub(r'\d+\.', f'{cap_prefix}:', cap) for cap in res]
83 | print(res)
84 |
85 | results[clsname] = res
86 |
87 |
88 | with open(os.path.join(data_path, "vocab_gpt4_m16new.json"), "w") as outfile:
89 | json.dump(results, outfile, indent=4)
90 |
91 |
92 | if __name__ == '__main__':
93 | random.seed(42)
94 | data_path = '../data/JHMDB/openworld'
95 |
96 | # generate_different_meaning()
97 |
98 | # generate_same_meaning()
99 |
100 | class_descriptions = read_class_description(os.path.join(data_path, 'vocab_gpt3.5.json'))
101 |
102 | # get candidate verbs
103 | seen_classes = read_class_list(os.path.join(data_path, 'train50%', 'vocab_closed_0.txt'))
104 | verbs_list = [clsname.split("_")[0] for clsname in seen_classes]
105 |
106 | prompt = """In this task, you are given an input sentence.
107 | Your job is to tell me 16 output sentences with different meanings by only changing the action verbs using a list of candidate verbs.
108 | The output format should be a dictionary of key-value pair where keys are the verbs you are choosing, and values are the generated sentences."""
109 |
110 | results = {}
111 | for clsname, desc in class_descriptions.items():
112 | if clsname not in seen_classes:
113 | continue # only process the seen classes
114 | print("\nProcessing action: {}...".format(clsname))
115 | cls_name = re.sub("_", " ", clsname)
116 | cap_prefix, cap = desc.split(": ")
117 | verbs_sub = copy.deepcopy(verbs_list)
118 | verbs_sub.remove(clsname.split("_")[0])
119 | verbs_sub = ', '.join(verbs_sub)
120 | message = [
121 | {"role": "system", "content": "You are a useful assistant."},
122 | {"role": "user", "content": prompt + f" The input sentence: {cap} The candidate verb list: [{verbs_sub}]."}
123 | ]
124 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60)
125 | res = response.choices[0]['message']['content'].strip().split('\n')
126 | result_list = []
127 | for strline in res:
128 | if ': ' not in strline:
129 | continue
130 | strline = re.sub("\"", "", strline.strip(","))
131 | prefix, sentence = strline.split(': ')
132 | result_list.append("{}: {}".format(prefix.capitalize(), sentence))
133 | if len(result_list) == 8:
134 | break
135 | print(result_list)
136 |
137 | results[clsname] = result_list
138 |
139 | with open(os.path.join(data_path, 'train50%', "hardneg_closed_0.json"), "w") as outfile:
140 | json.dump(results, outfile, indent=4)
141 |
142 |
--------------------------------------------------------------------------------
/preprocess/generate_vdt_ucf24.py:
--------------------------------------------------------------------------------
1 | import random
2 | import re
3 | import os
4 | import copy
5 | import json
6 |
7 | import openai
8 | openai.api_key = "YOUR_OPENAI_KEY_HERE"
9 |
10 |
11 |
12 | def read_class_list(filepath):
13 | class_list = []
14 | with open(filepath, 'r') as f:
15 | for line in f.readlines():
16 | class_list.append(line.strip())
17 | return class_list
18 |
19 |
20 | if __name__ == '__main__':
21 | random.seed(42)
22 | data_path = '../data/UCF24/openworld'
23 |
24 | ucf24_classes = read_class_list(os.path.join(data_path, 'vocab_open.txt'))
25 |
26 | results = {}
27 | for clsname in ucf24_classes:
28 | print("\nProcessing action: {}...".format(clsname))
29 | prompt = f"Generate 16 captions that describe the action '{clsname}'. For example, given the action dance, your output will be like: Dance: A person is dancing on the stage, with the body moving rhythmically to music."
30 | message = [
31 | {"role": "system", "content": "You are a useful assistant."},
32 | {"role": "user", "content": prompt}
33 | ]
34 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed
35 | res = response.choices[0]['message']['content'].strip().split('\n')
36 |         res = [re.sub(r'\d+\. ', '', cap) for cap in res]
37 | print(res)
38 | results[clsname] = res
39 |
40 | with open(os.path.join(data_path, "vocab_gpt4.json"), "w") as outfile:
41 | json.dump(results, outfile, indent=4)
42 |
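Both vocabulary-generation scripts rely on the legacy openai.ChatCompletion interface, which only exists in the pre-1.0 openai package. For reference, a hedged sketch of the same request with openai>=1.0 (the client-based call shown here is not part of the repository; the model name and parameters are copied from the script, and the action name is just an example):

from openai import OpenAI

client = OpenAI(api_key="YOUR_OPENAI_KEY_HERE")
response = client.chat.completions.create(
    model="gpt-4-0613",
    max_tokens=800,
    messages=[
        {"role": "system", "content": "You are a useful assistant."},
        {"role": "user", "content": "Generate 16 captions that describe the action 'Basketball'."},
    ],
)
captions = response.choices[0].message.content.strip().split("\n")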
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | yacs
3 | opencv-python
4 | tensorboardX
5 | SciPy
6 | fvcore
7 | timm
8 | iopath
9 | git+https://github.com/openai/CLIP.git
10 | transformers
11 | ttach
12 | kornia
13 | scikit-learn
14 | scikit-image
15 | einops
16 | matplotlib
17 | supervision
--------------------------------------------------------------------------------
/test_net.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import torch
5 | from alphaction.config import cfg
6 | from alphaction.dataset import make_data_loader
7 | from alphaction.engine.inference import inference
8 | from alphaction.modeling.detector import build_detection_model, build_naive_baseline
9 | from alphaction.utils.checkpoint import ActionCheckpointer
10 | from torch.utils.collect_env import get_pretty_env_info
11 | from alphaction.utils.comm import synchronize, get_rank
12 | from alphaction.utils.logger import setup_logger
13 | # PyTorch issue #973: raise the open-file limit (RLIMIT_NOFILE)
14 | import resource
15 |
16 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
17 | resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1], rlimit[1]))
18 |
19 |
20 | def main():
21 | parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
22 | parser.add_argument(
23 | "--config-file",
24 | default="",
25 | metavar="FILE",
26 | help="path to config file",
27 | )
28 | parser.add_argument("--local_rank", type=int, default=0)
29 | parser.add_argument(
30 | "opts",
31 | help="Modify config options using the command-line",
32 | default=None,
33 | nargs=argparse.REMAINDER,
34 | )
35 |
36 | args = parser.parse_args()
37 |
38 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
39 | distributed = num_gpus > 1
40 |
41 | if distributed:
42 | torch.cuda.set_device(args.local_rank)
43 | torch.distributed.init_process_group(
44 | backend="nccl", init_method="env://"
45 | )
46 |
47 | # Merge config file.
48 | cfg.merge_from_file(args.config_file)
49 | cfg.merge_from_list(args.opts)
50 | cfg.freeze()
51 |
52 |
53 | # Print experimental infos.
54 | save_dir = ""
55 | logger = setup_logger("alphaction", save_dir, get_rank())
56 | logger.info("Using {} GPUs".format(num_gpus))
57 | logger.info(cfg)
58 |
59 | logger.info("Collecting env info (might take some time)")
60 | logger.info("\n" + get_pretty_env_info())
61 |
62 | # Build the model.
63 | if cfg.MODEL.DET == 'STMDetector':
64 | model = build_detection_model(cfg)
65 | elif cfg.MODEL.DET == 'NaiveBaseline':
66 | model = build_naive_baseline(cfg)
67 | model.to("cuda")
68 |
69 | if cfg.MODEL.DET != 'NaiveBaseline':
70 | # load weight.
71 | output_dir = cfg.OUTPUT_DIR
72 | checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir)
73 | ckpt_file = os.path.join(output_dir, cfg.MODEL.WEIGHT) if cfg.MODEL.WEIGHT else None
74 | checkpointer.load(ckpt_file)
75 |
76 | output_folders = [None] * len(cfg.DATA.DATASETS)
77 | dataset_names = cfg.DATA.DATASETS
78 | if cfg.OUTPUT_DIR:
79 | for idx, dataset_name in enumerate(dataset_names):
80 | inf_folder = "inference" if not cfg.TEST.SMALL_OPEN_WORLD else "inference_small"
81 | output_folder = os.path.join(cfg.OUTPUT_DIR, inf_folder, dataset_name)
82 | os.makedirs(output_folder, exist_ok=True)
83 | output_folders[idx] = output_folder
84 |
85 | # Do inference.
86 | data_loaders_test, vocabularies_test, _ = make_data_loader(cfg, is_train=False, is_distributed=distributed)
87 | for i, (output_folder, dataset_name, data_loader_test) in enumerate(zip(output_folders, dataset_names, data_loaders_test)):
88 | # set open vocabulary
89 | if len(vocabularies_test) > 0:
90 | model.backbone.text_encoder.set_vocabulary(vocabularies_test[i])
91 |
92 | inference(
93 | model,
94 | data_loader_test,
95 | dataset_name,
96 | output_folder=output_folder,
97 | metric=cfg.TEST.METRIC,
98 | use_cache=True
99 | )
100 | synchronize()
101 |
102 |
103 | if __name__ == "__main__":
104 | main()
105 |
--------------------------------------------------------------------------------
/third_party/eval_utils.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | LIB_PATH=['../alphaction/dataset/datasets/evaluation']
3 | sys.path.extend(LIB_PATH)
4 |
5 | from pascal_evaluation.object_detection_evaluation import PascalDetectionEvaluator
6 | from pascal_evaluation.standard_fields import InputDataFields, DetectionResultFields
7 |
8 | import pickle
9 | import numpy as np
10 |
11 |
12 |
13 | def load_gt_data(anno_file, split=0):
14 | assert os.path.exists(anno_file), "Annotation file does not exist: {}".format(anno_file)
15 | with open(anno_file, 'rb') as fid:
16 | data = pickle.load(fid, encoding='iso-8859-1')
17 | return data
18 |
19 |
20 | def eval_person_boxes(results, gt_data):
21 | class_id = 1
22 |
23 | pascal_evaluator = PascalDetectionEvaluator([{'id': class_id, 'name': 'person'}],
24 | matching_iou_threshold=0.5)
25 |
26 | # prepare ground truth
27 | for vid, annos in gt_data['gttubes'].items():
28 | # each video contains only one action type
29 | act_id = list(annos.keys())[0]
30 | act_annos = annos[act_id][0]
31 | height, width = gt_data['resolution'][vid]
32 | # each action type contains only one action box on a frame
33 | for fid_box in act_annos:
34 | img_key = "%s,%04d" % (vid, float(fid_box[0]))
35 | box_normed = fid_box[1:5] / np.array([width, height, width, height], dtype=np.float32) # (xyxy)
36 | box_normed = box_normed[[1, 0, 3, 2]] # (yxyx)
37 | pascal_evaluator.add_single_ground_truth_image_info(
38 | img_key, {
39 | InputDataFields.groundtruth_boxes: box_normed[None],
40 | InputDataFields.groundtruth_classes: np.array([class_id], dtype=int),
41 | InputDataFields.groundtruth_difficult: np.zeros(1, dtype=bool)
42 | })
43 |
44 | # prepare detection results
45 | for vid, dets in results.items():
46 | boxes, scores = dets['boxes'], dets['scores']
47 | frame_ids = list(boxes.keys())
48 | for fid in frame_ids:
49 | img_key = "%s,%04d" % (vid, float(fid))
50 | boxes_pred = boxes[fid].copy()
51 | boxes_pred = boxes_pred[:, [1, 0, 3, 2]]
52 | pascal_evaluator.add_single_detected_image_info(
53 | img_key, {
54 | DetectionResultFields.detection_boxes: boxes_pred,
55 | DetectionResultFields.detection_classes: np.array([class_id]*len(boxes[fid]), dtype=int),
56 | DetectionResultFields.detection_scores: scores[fid].copy()
57 | })
58 |
59 | eval_res = pascal_evaluator.evaluate()
60 |
61 | precisions = pascal_evaluator._evaluation.precisions_per_class
62 | recalls = pascal_evaluator._evaluation.recalls_per_class
63 |
64 | return eval_res, precisions, recalls
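Note that both the ground truth and the detections are handed to the Pascal evaluator as normalized [ymin, xmin, ymax, xmax] boxes, which is why the code above divides by (width, height) and then reindexes with [1, 0, 3, 2]. A small worked example with toy numbers:

import numpy as np

width, height = 320, 240
box_xyxy = np.array([40.0, 60.0, 120.0, 200.0], dtype=np.float32)             # x1, y1, x2, y2 in pixels
box_normed = box_xyxy / np.array([width, height, width, height], np.float32)  # normalized xyxy
box_yxyx = box_normed[[1, 0, 3, 2]]                                            # y1, x1, y2, x2
print(box_yxyx)   # [0.25, 0.125, 0.8333, 0.375]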
--------------------------------------------------------------------------------
/third_party/maskrcnn_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchvision.ops import box_convert
3 | from video_io import *
4 | from tqdm import tqdm
5 |
6 |
7 | def preprocess_clip(clip, device):
8 | """ clip: (T, H, W, 3) in uint8 format
9 | """
10 | # preprocess video
11 | clip = torch.from_numpy(clip).to(device).float() / 255.0
12 | clip = clip.permute(0, 3, 1, 2).contiguous() # (T, C, H, W)
13 | return clip
14 |
15 |
16 | def maskrcnn_video(video_path, model, categories, box_thresh=0.35, topk=None, batch_size=16, fmt='%05d.png', device=torch.device('cuda')):
17 | # load video data
18 | if isinstance(video_path, list):
19 | video = read_video_from_list(video_path)
20 | video_name = os.path.dirname(video_path[0]).split("/")[-1]
21 | elif os.path.isfile(video_path):
22 | video = read_video_from_file(video_path) # (T, H, W, C) in RGB uint8 format
23 | video_name = video_path.split("/")[-1][:-4]
24 | else:
25 | video = read_video_from_folder(video_path, fmt=fmt)
26 | video_name = video_path.split("/")[-1]
27 | num_frames = len(video)
28 |
29 | if isinstance(video_path, list):
30 | frame_ids = [int(imgfile[:-4].split("/")[-1].split("_")[-1])
31 | for imgfile in video_path]
32 | else:
33 | frame_ids = list(range(num_frames))
34 |
35 | if num_frames > batch_size:
36 | video = np.array_split(video, int(num_frames // batch_size))
37 | frame_ids = np.array_split(frame_ids, int(num_frames // batch_size))
38 | else:
39 | video, frame_ids = [video], [frame_ids]
40 |
41 | results = {'boxes': dict(), 'scores': dict()}
42 | for fids, clip in tqdm(zip(frame_ids, video), total=len(video), desc="{}".format(video_name), ncols=0):
43 | # preprocess
44 | height, width = clip.shape[1:3]
45 | batch = preprocess_clip(clip, device) # (T, 3, H, W)
46 | with torch.no_grad():
47 | outputs = model(batch)
48 | # get results
49 | for i, outs in enumerate(outputs):
50 | mask = outs['labels'] == categories.index('person')
51 | if not any(mask):
52 | continue # no person at all
53 |
54 | if box_thresh is not None:
55 | mask = mask & (outs['scores'] > box_thresh)
56 | if topk is not None:
57 | inds = torch.topk(outs['scores'], topk)[1]
58 | topk_mask = torch.zeros_like(outs['scores'], dtype=torch.bool).scatter_(0, inds, True)
59 | mask = mask & topk_mask
60 | if not any(mask): # no valid person
61 | continue
62 |
63 | # mask out and sort boxes and scores
64 | boxes = outs['boxes'][mask] # the predicted boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
65 | scores = outs['scores'][mask]
66 | idx = torch.argsort(scores, descending=True)
67 | boxes, scores = boxes[idx], scores[idx]
68 | # save
69 | boxes[:, [0, 2]] = boxes[:, [0, 2]] / width
70 | boxes[:, [1, 3]] = boxes[:, [1, 3]] / height
71 | results['boxes'][fids[i]] = boxes.cpu().numpy() # normalized (x1, y1, x2, y2)
72 | results['scores'][fids[i]] = scores.cpu().numpy()
73 |
74 | return results
75 |
--------------------------------------------------------------------------------
/third_party/run_maskrcnn.py:
--------------------------------------------------------------------------------
1 | import os, argparse, pickle
2 | from tqdm import tqdm
3 |
4 | import torch
5 | from torchvision.models.detection import maskrcnn_resnet50_fpn
6 | from torchvision.models._meta import _COCO_CATEGORIES
7 | from maskrcnn_utils import maskrcnn_video
8 | from video_io import vis_dets
9 |
10 | from eval_utils import eval_person_boxes, load_gt_data
11 | from pprint import pformat
12 | import matplotlib.pyplot as plt
13 |
14 |
15 | def main(args):
16 |
17 | dataset = args.data.upper()
18 | if args.data == 'jhmdb':
19 | video_dir = f'../data/{dataset}/Frames'
20 | fmt = '%05d.png'
21 | box_thresh, topk = None, 1
22 |
23 | elif args.data == 'ucf24':
24 | video_dir = f'../data/{dataset}/rgb-images'
25 | fmt = '%05d.jpg'
26 | box_thresh, topk = 0.35, None
27 |
28 | else:
29 |         raise NotImplementedError(f"Unsupported dataset: {args.data}")
30 | results_save_file = f'../data/{dataset}/{dataset}-MaskRCNN.pkl'
31 |
32 | if not os.path.exists(results_save_file):
33 | # setup device and model
34 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
35 | model = maskrcnn_resnet50_fpn(pretrained=True).to(device)
36 | model.eval()
37 |
38 | all_video_files = []
39 | for folder in os.listdir(video_dir):
40 | videos_class_path = os.path.join(video_dir, folder)
41 | if not os.path.isdir(videos_class_path):
42 | continue
43 | vid_files = [folder + '/' + vid for vid in os.listdir(videos_class_path) if os.path.isdir(os.path.join(videos_class_path, vid))]
44 | all_video_files.extend(vid_files)
45 |
46 | results = dict()
47 | for vid in tqdm(all_video_files, total=len(all_video_files), ncols=0):
48 |         print("\nRunning on the file: {}...".format(vid))
49 | results[vid] = maskrcnn_video(os.path.join(video_dir, vid), model, _COCO_CATEGORIES,
50 | fmt=fmt, box_thresh=box_thresh, topk=topk, device=device)
51 |
52 | with open(results_save_file, 'wb') as fid:
53 | pickle.dump(results, fid, protocol=pickle.HIGHEST_PROTOCOL)
54 |
55 | else:
56 | with open(results_save_file, 'rb') as fid:
57 | results = pickle.load(fid, encoding='iso-8859-1')
58 |
59 | # evaluation
60 | if args.eval:
61 | # load the ground truth
62 | jhmdb_gt_file = '../data/JHMDB/JHMDB-GT.pkl'
63 | gt_data = load_gt_data(jhmdb_gt_file)
64 |
65 | eval_res, precisions, recalls = eval_person_boxes(results, gt_data)
66 |
67 | print(pformat(eval_res, indent=2))
68 |
69 | plt.figure(figsize=(10, 6))
70 | plt.plot(recalls[0], precisions[0], label="Precision-Recall curve")
71 | plt.xlabel("Recall")
72 | plt.ylabel("Precision")
73 | plt.legend(loc="lower left")
74 | plt.tight_layout()
75 | plt.savefig('../temp/jhmdb/precision_recall_curve_maskrcnn.png', bbox_inches='tight')
76 | plt.close()
77 |
78 | # visualize
79 | if args.vis:
80 | test_video = ['kick_ball/FIFA_11_Gamescom-Trailer_kick_ball_f_cm_np1_ba_med_4']
81 | save_dir = os.path.join(os.path.dirname(results_save_file), 'VisMaskRCNN')
82 | os.makedirs(save_dir, exist_ok=True)
83 | for vid in test_video:
84 | savefile = os.path.join(save_dir, vid.replace('/', '-') + "_pred.mp4")
85 | vis_dets(results, vid, video_dir, savefile)
86 |
87 |
88 | if __name__ == '__main__':
89 |
90 | parser = argparse.ArgumentParser(description="Mask RCNN (ResNet-50 FPN) Experiments")
91 | parser.add_argument(
92 | "--data", type=str, default='jhmdb', choices=['jhmdb', 'ucf24'], help="dataset used for testing",
93 | )
94 | parser.add_argument(
95 | "--vis", action='store_true', help="visualize the detection results",
96 | )
97 | parser.add_argument(
98 | "--eval", action='store_true', help="evaluate the quality"
99 | )
100 | args = parser.parse_args()
101 |
102 | main(args)
--------------------------------------------------------------------------------
/third_party/video_io.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import os
4 | import supervision as sv
5 |
6 |
7 | def read_video_from_file(video_file, toRGB=True):
8 | assert os.path.exists(video_file), "File does not exist! {}".format(video_file)
9 | cap = cv2.VideoCapture(video_file)
10 | success, frame = cap.read()
11 | video = []
12 | while success:
13 | if toRGB:
14 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
15 | video.append(frame)
16 | success, frame = cap.read()
17 | video = np.array(video)
18 | return video
19 |
20 |
21 | def read_video_from_folder(video_path, fmt='%05d.png', start_frame=1, toRGB=True):
22 | frame_files = [name for name in os.listdir(video_path) if name.endswith(fmt[-4:])]
23 | vid_name = video_path.split("/")[-1]
24 | video = []
25 | for i in range(len(frame_files)):
26 | if len(fmt.split("_")) == 1:
27 | frame_name = fmt%(i + start_frame)
28 | elif len(fmt.split("_")) == 2:
29 | frame_name = fmt%(vid_name, i + start_frame)
30 | frame_file = os.path.join(video_path, frame_name) # frame starting from 1
31 | frame = cv2.imread(frame_file)
32 | if toRGB:
33 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
34 | video.append(frame)
35 | video = np.array(video)
36 | return video
37 |
38 |
39 | def read_video_from_list(img_list, toRGB=True):
40 | video = []
41 | for frame_file in img_list:
42 | frame = cv2.imread(frame_file)
43 | if toRGB:
44 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
45 | video.append(frame)
46 | video = np.array(video)
47 | return video
48 |
49 |
50 | def write_video(mat, video_file, fps=30, write_frames=True):
51 | """ mat: (T, H, W, C)
52 | """
53 | video_writer = cv2.VideoWriter(video_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (mat.shape[2], mat.shape[1]))
54 | for frame in mat:
55 |         video_writer.write(frame)
56 |     video_writer.release()
57 | if write_frames:
58 | os.makedirs(video_file[:-4], exist_ok=True)
59 | for i, frame in enumerate(mat):
60 | cv2.imwrite(os.path.join(video_file[:-4], '%06d.jpg'%(i)), frame)
61 |
62 |
63 | def vis_dets(results, vid, video_dir, savefile):
64 | # read frames
65 | video = read_video_from_folder(os.path.join(video_dir, vid), toRGB=False) # (T, H, W, C)
66 | # parse detections
67 | boxes = results[vid]['boxes'] # normalized (x1, y1, x2, y2)
68 | scores = results[vid]['scores']
69 | video_vis = []
70 | # visualize
71 | for i, frame in enumerate(video):
72 | h, w = frame.shape[:2]
73 | xyxy = boxes[i] * np.array([[w, h, w, h]])
74 | detections = sv.Detections(xyxy=xyxy, confidence=scores[i])
75 | labels = [f"person {s:.2f}" for s in scores[i]]
76 | # annotate on frame
77 | box_annotator = sv.BoxAnnotator()
78 | annotated_frame = box_annotator.annotate(scene=frame.copy(), detections=detections, labels=labels)
79 | video_vis.append(annotated_frame)
80 | video_vis = np.array(video_vis)
81 | # visualize
82 | write_video(video_vis, savefile, fps=20, write_frames=False)
--------------------------------------------------------------------------------
/trainval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PHASE=$1 # train, eval
4 | DATASET=$2 # jhmdb, ucf24
5 |
6 | CFG_FILE="config_files/${DATASET}/openmixer_e2e.yaml"
7 | # CFG_FILE="config_files/${DATASET}/openmixer_zsr_tl.yaml"
8 | # CFG_FILE="config_files/${DATASET}/openmixer_zsr_zsl.yaml" # eval-only!
9 |
10 | TEST_WEIGHT=${3:-'checkpoints/model_final.pth'}
11 |
12 | eval "$(conda shell.bash hook)"
13 | conda activate openmixer
14 |
15 | if [ "$PHASE" == "train" ]
16 | then
17 | python -m torch.distributed.launch --nproc_per_node=4 --master_port=2024 train_net.py \
18 | --config-file ${CFG_FILE} \
19 | --transfer \
20 | --no-head \
21 | --use-tfboard
22 | elif [ "$PHASE" == "eval" ]
23 | then
24 | python -m torch.distributed.launch --nproc_per_node=4 --master_port=2405 test_net.py \
25 | --config-file ${CFG_FILE} \
26 | MODEL.WEIGHT ${TEST_WEIGHT}
27 | fi
28 |
29 | echo "${PHASE} finished!"
--------------------------------------------------------------------------------