├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── alphaction ├── __init__.py ├── cam │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── cam.py │ ├── clip_loader.py │ ├── hilacam.py │ ├── mhsa.py │ └── ritsm.py ├── config │ ├── __init__.py │ └── defaults.py ├── dataset │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ava.py │ │ ├── ava_dataset.py │ │ ├── ava_helper.py │ │ ├── concat_dataset.py │ │ ├── cv2_transform.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── ava │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ └── ava_eval.py │ │ │ ├── evaluate_map.py │ │ │ ├── jhmdb │ │ │ │ ├── __init__.py │ │ │ │ └── jhmdb_eval.py │ │ │ ├── pascal_evaluation │ │ │ │ ├── __init__.py │ │ │ │ ├── label_map_util.py │ │ │ │ ├── metrics.py │ │ │ │ ├── np_box_list.py │ │ │ │ ├── np_box_list_ops.py │ │ │ │ ├── np_box_mask_list.py │ │ │ │ ├── np_box_mask_list_ops.py │ │ │ │ ├── np_box_ops.py │ │ │ │ ├── np_mask_ops.py │ │ │ │ ├── object_detection_evaluation.py │ │ │ │ ├── per_image_evaluation.py │ │ │ │ └── standard_fields.py │ │ │ ├── pascal_wrapper.py │ │ │ └── ucf24 │ │ │ │ ├── __init__.py │ │ │ │ └── ucf24_eval.py │ │ ├── jhmdb_dataset.py │ │ ├── ucf24_dataset.py │ │ └── utils.py │ └── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ ├── grouped_batch_sampler.py │ │ └── iteration_based_batch_sampler.py ├── engine │ ├── __init__.py │ ├── feature_extraction.py │ ├── inference.py │ └── trainer.py ├── layers │ ├── __init__.py │ └── batch_norm.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── i3d.py │ │ ├── sfmodels │ │ │ ├── common.py │ │ │ ├── nonlocal_helper.py │ │ │ ├── resnet_helper.py │ │ │ └── stem_helper.py │ │ ├── slowfast.py │ │ ├── video_model_builder.py │ │ └── vit_utils.py │ ├── common_blocks.py │ ├── detector │ │ ├── __init__.py │ │ ├── action_detector.py │ │ ├── naive_baseline.py │ │ └── stm_detector.py │ ├── dict_model.py │ ├── encoders │ │ ├── clipvip │ │ │ ├── CLIP_ViP.py │ │ │ ├── clipvip_encoder.py │ │ │ ├── custom_layers.py │ │ │ └── loader.py │ │ ├── openai_clip │ │ │ ├── clip_encoder.py │ │ │ └── clip_loader.py │ │ └── viclip │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── demo.py │ │ │ ├── simple_tokenizer.py │ │ │ ├── viclip.py │ │ │ ├── viclip_encoder.py │ │ │ ├── viclip_text.py │ │ │ └── viclip_vision.py │ ├── nonlocal_block.py │ ├── registry.py │ ├── roi_heads │ │ ├── __init__.py │ │ ├── action_head │ │ │ ├── IA_structure.py │ │ │ ├── __init__.py │ │ │ ├── action_head.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── metric.py │ │ │ ├── roi_action_feature_extractor.py │ │ │ └── roi_action_predictors.py │ │ └── roi_heads_3d.py │ ├── stm_decoder │ │ ├── stm_decoder.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── adaptive_mixing_operator.py │ │ │ ├── box_ops.py │ │ │ ├── head_utils.py │ │ │ ├── loss.py │ │ │ ├── misc.py │ │ │ └── msaq.py │ └── utils.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── structures │ ├── __init__.py │ ├── bounding_box.py │ └── memory_pool.py └── utils │ ├── IA_helper.py │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── comm.py │ ├── logger.py │ ├── metric_logger.py │ ├── model_serialization.py │ ├── random_seed.py │ ├── registry.py │ ├── video_decode.py │ └── visualize.py ├── assets └── wacv25_openmixer.png ├── config_files ├── jhmdb │ ├── openmixer_e2e.yaml │ ├── openmixer_zsr_tl.yaml │ └── openmixer_zsr_zsl.yaml └── ucf24 │ ├── 
openmixer_e2e.yaml │ ├── openmixer_zsr_tl.yaml │ └── openmixer_zsr_zsl.yaml ├── demo.py ├── preprocess ├── generate_vdt_jhmdb.py ├── generate_vdt_ucf24.py ├── openworld_split_jhmdb.py └── openworld_split_ucf24.py ├── requirements.txt ├── test_net.py ├── third_party ├── eval_utils.py ├── maskrcnn_utils.py ├── run_maskrcnn.py └── video_io.py ├── train_net.py └── trainval.sh /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | pretrained 3 | output/ 4 | *.pyc 5 | *.vscode 6 | *.log 7 | *.egg-info 8 | *.out 9 | *.err 10 | *_temp.* 11 | *.jpg 12 | *.jpeg 13 | .nfs* 14 | alphaction/cam/demo.py 15 | *.pth 16 | backup/ 17 | *backup* 18 | */figures/* 19 | *.zip 20 | data_zip/ 21 | data-release -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/GroundingDINO"] 2 | path = third_party/GroundingDINO 3 | url = https://github.com/IDEA-Research/GroundingDINO.git 4 | ignore = dirty 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Wentao Bao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenMixer 2 | This repository releases the source code of the WACV 2025 paper [OpenMixer](https://arxiv.org/pdf/2411.10922), which builds heavily on the [STMixer](https://github.com/MCG-NJU/STMixer) codebase. OpenMixer is an open-vocabulary action detector that aims to detect any human action from videos in an open world. The figure below shows the model architecture. 3 | 4 |

5 | ![OpenMixer](assets/wacv25_openmixer.png) 6 |

7 | 8 | ## Installation 9 | - Create a conda environment: 10 | ```bash 11 | conda create -n openmixer python=3.7 12 | ``` 13 | 14 | - Install PyTorch: 15 | ```bash 16 | pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117 17 | ``` 18 | 19 | - Install other libraries (including OpenAI-CLIP): 20 | ```bash 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ## Data Preparation 25 | - First, please refer to the MMAction2 [JHMDB](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/jhmdb/README.md) and [UCF24](https://github.com/open-mmlab/mmaction2/blob/main/tools/data/ucf101_24/README.md) dataset preparation steps. 26 | 27 | - Next, please download our released [Open-World splits](https://drive.google.com/drive/folders/1Bu5GNsGIfYD-4u_7WMjBOWZj_3zs-HbJ?usp=sharing). Make sure the folders are structured as follows. 28 | ```bash 29 | data 30 | ├──JHMDB 31 | | ├── openworld 32 | | ├── Frames 33 | | ├── JHMDB-MaskRCNN.pkl 34 | | ├── JHMDB-GT.pkl 35 | ├──UCF24 36 | | ├── openworld 37 | | ├── rgb-images 38 | | ├── UCF24-MaskRCNN.pkl 39 | ``` 40 | 41 | ## Models 42 | 43 | - Please download the pretrained `CLIP-ViP-B/16` checkpoint from [XPretrain/CLIP-ViP](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP), a video CLIP model that serves as the backbone of our model. After downloading, make sure the file is located at `./pretrained/pretrain_clipvip_base_16.pt`. 44 | 45 | - [Optional] We released three OpenMixer models and inference results for each of the JHMDB and UCF24 datasets here: [Google Drive](https://drive.google.com/drive/folders/1MDT_jcJolNZjuZ15cdhXyJmewMyVBKUP?usp=sharing). They correspond to the configurations in the folder `./config_files/`. Note that for the ZSR+ZSL setting, no model training is needed. 46 | 47 | 48 | ## Training 49 | 50 | We provide an easy-to-use bash script for training and evaluation across different settings and datasets. For example, to train the OpenMixer model under the end-to-end setting on the JHMDB dataset using 4 specified GPUs: 51 | ```bash 52 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash trainval.sh train jhmdb 53 | ``` 54 | Optionally, you may change the GPU IDs or set the dataset name to `ucf24`. For other settings, change `CFG_FILE` in `trainval.sh` to `openmixer_zsr_tl.yaml` to train the OpenMixer model under the ZSR+TL setting. 55 | 56 | 57 | ## Validation 58 | We use the same bash script for validation (inference + evaluation): 59 | ```bash 60 | CUDA_VISIBLE_DEVICES=0,1,2,3 bash trainval.sh eval jhmdb 61 | ``` 62 | Optionally, you may change the GPU IDs or set the dataset name to `ucf24`. For other settings, change `CFG_FILE` in `trainval.sh` to `openmixer_zsr_tl.yaml` or `openmixer_zsr_zsl.yaml` to evaluate models under the ZSR+TL and ZSR+ZSL settings, respectively. 63 | 64 | 65 | ## Acknowledgements 66 | This project is built upon [STMixer](https://github.com/MCG-NJU/STMixer), [CLIP-ViP](https://github.com/microsoft/XPretrain/CLIP-ViP), and [OpenAI-CLIP](https://github.com/openai/CLIP). We sincerely thank the contributors of all these great open-source repositories!
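Editor's note: the config and data-loading interfaces under `alphaction/` (shown later in this dump) can also be driven directly from Python. The snippet below is a minimal, illustrative sketch rather than an official entry point; it assumes the yacs-style config object behind `alphaction/config/defaults.py` (hence `merge_from_file`/`freeze`) and mirrors the `make_data_loader` signature and return values defined in `alphaction/dataset/build.py`.

```python
# Illustrative sketch only (not part of the released scripts).
from alphaction.config import cfg                 # CfgNode from alphaction/config/defaults.py (yacs-style, assumed)
from alphaction.dataset import make_data_loader   # defined in alphaction/dataset/build.py

# Load one of the released configs (path taken from config_files/ in this repo).
cfg.merge_from_file("config_files/jhmdb/openmixer_e2e.yaml")
cfg.freeze()

# For testing, make_data_loader returns per-dataset loaders, the evaluation
# vocabularies (open or closed, depending on cfg.TEST.EVAL_OPEN), and the
# per-dataset iterations-per-epoch list.
data_loaders, vocabularies, _ = make_data_loader(cfg, is_train=False, is_distributed=False)
```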
67 | 68 | 69 | ## Citation 70 | 71 | If this project helps your research or project, please cite 72 | our paper: 73 | 74 | ``` 75 | @InProceedings{bao2025wacv, 76 | title={Exploiting VLM Localizability and Semantics for Open Vocabulary Action Detection}, 77 | author={Wentao Bao and Kai Li and Yuxiao Chen and Deep Patel and Martin Renqiang Min and Yu Kong}, 78 | booktitle = {IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)}, 79 | year={2025} 80 | } 81 | ``` 82 | 83 | 84 | -------------------------------------------------------------------------------- /alphaction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/__init__.py -------------------------------------------------------------------------------- /alphaction/cam/.gitignore: -------------------------------------------------------------------------------- 1 | demos/ 2 | hila_clip/ 3 | pytorch_grad_cam/ 4 | *.gif 5 | *.mp4 -------------------------------------------------------------------------------- /alphaction/cam/README.md: -------------------------------------------------------------------------------- 1 | ## HilaCAM for CLIP Visual Attention 2 | 3 | Please go to [gScoreCAM](https://github.com/anguyen8/gScoreCAM), download the folders `hila_clip/` and `pytorch_grad_cam/`, and put them in this folder. 4 | 5 | The following commands show the steps: 6 | ```shell 7 | cd alphaction/cam 8 | git clone https://github.com/anguyen8/gScoreCAM 9 | cp -r gScoreCAM/hila_clip gScoreCAM/pytorch_grad_cam . 10 | rm -rf gScoreCAM 11 | 12 | ``` 13 | 14 | After that, please ensure the following Python packages are installed: 15 | ```shell 16 | pip install ttach kornia scikit-learn scikit-image 17 | ``` -------------------------------------------------------------------------------- /alphaction/cam/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/cam/__init__.py -------------------------------------------------------------------------------- /alphaction/cam/clip_loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | #* For CLIP ViT 4 | def reshape_transform(tensor, height=None, width=None): 5 | if height is None or width is None: 6 | grid_square = len(tensor) - 1 7 | if grid_square ** 0.5 % 1 == 0: 8 | height = width = int(grid_square**0.5) 9 | else: 10 | raise ValueError("Heatmap is not square, please set height and width.") 11 | result = tensor[1:, :, :].reshape( 12 | height, width, tensor.size(2)) 13 | 14 | # Bring the channels to the first dimension, 15 | # like in CNNs. 16 | result = result.permute(2, 0, 1) 17 | return result.unsqueeze(0) 18 | 19 | def load_clip(clip_version, attn_prob=True, attn_grad=True, attn_last_only=True, resize='adapt', custom=False, model_weight=None): 20 | device = "cuda" if torch.cuda.is_available() else "cpu" 21 | if 'vit' in clip_version.lower() and not custom: #* This is not necessary; for experimental usage, hila CLIP will hook all attentions.
22 | from hila_clip import clip 23 | clip_model, preprocess = clip.load(clip_version, device=device, jit=False) 24 | 25 | elif 'clip-vip' in clip_version.lower(): 26 | import sys, clip 27 | sys.path.append("../../") 28 | from alphaction.modeling.encoders.clipvip import loader 29 | clip_model, preprocess = loader.load(clip_version, 30 | attn_prob=attn_prob, 31 | attn_grad=attn_grad, 32 | attn_last_only=attn_last_only, 33 | device=device, model_weight=model_weight) 34 | 35 | elif custom: 36 | from hila_clip import clip 37 | clip_model, preprocess = clip.load(clip_version, device=device, jit=False) 38 | 39 | else: 40 | import clip 41 | clip_model, preprocess = clip.load(clip_version, device=device) 42 | 43 | if clip_version.startswith("RN"): 44 | target_layer = clip_model.visual.layer4[-1] 45 | cam_trans = None 46 | elif 'clip-vip' in clip_version.lower(): 47 | target_layer = clip_model.vision_model.encoder.layers[-1] 48 | cam_trans = reshape_transform 49 | else: 50 | target_layer = clip_model.visual.transformer.resblocks[-1] 51 | cam_trans = reshape_transform 52 | 53 | if resize == 'raw': # remove clip resizing 54 | if not custom: 55 | raise Exception("Raw input needs to use custom clip.") 56 | preprocess.transforms.pop(0) 57 | preprocess.transforms.pop(0) 58 | elif resize == 'adapt': # adapt to clip size 59 | from torchvision import transforms 60 | crop_size = preprocess.transforms[1].size # resize to crop size so that no information will be cropped 61 | preprocess.transforms.insert(0, transforms.Resize(crop_size)) 62 | # clip_model = torch.nn.DataParallel(clip_model) 63 | return clip_model, preprocess, target_layer, cam_trans, clip 64 | 65 | def load_clip_from_checkpoint(checkpoint, model): 66 | checkpoint = torch.load(checkpoint, map_location='cpu') 67 | 68 | # # Use these 3 lines if you use default model setting(not training setting) of the clip. 
For example, if you set context_length to 100 since your string is very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"] 69 | # checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224 70 | # checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77 71 | # checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 72 | 73 | model.load_state_dict(checkpoint['model_state_dict']) 74 | return model -------------------------------------------------------------------------------- /alphaction/cam/mhsa.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def get_multi_head_mask(attentions, threshold=0.6): 5 | nh, np = attentions.size(0), attentions.size(-1) 6 | # we keep only a certain percentage of the mass 7 | val, idx = torch.sort(attentions) 8 | val /= torch.sum(val, dim=-1, keepdim=True) 9 | cumval = torch.cumsum(val, dim=-1) 10 | th_attn = cumval > (1 - threshold) 11 | idx2 = torch.argsort(idx) # dim=-1 by default 12 | th_attn = th_attn.view(nh, -1) 13 | for head in range(nh): 14 | th_attn[head] = th_attn[head][idx2[head].view(-1)] 15 | if len(attentions.size()) == 3: 16 | th_attn = th_attn.view(nh, -1, np) 17 | return th_attn 18 | 19 | 20 | def get_masked_attention_map(attentions, nh, heatmap_size, cam_size, mask=None): 21 | if mask is not None: 22 | # apply mask on attention map 23 | attentions = attentions * mask.float() # (num_heads, N, L) 24 | # normalize within each frame 25 | attentions -= attentions.min(dim=-1, keepdim=True)[0] 26 | attentions /= attentions.max(dim=-1, keepdim=True)[0] 27 | 28 | num_frames = attentions.size(1) if len(attentions.size()) == 3 else 1 29 | # average over multi-heads as the final attention 30 | attentions = attentions.reshape(nh, num_frames, heatmap_size[0], heatmap_size[1]).mean(dim=0, keepdim=True) 31 | 32 | if cam_size is not None: 33 | # interpolate 34 | attentions = torch.nn.functional.interpolate(attentions, size=(cam_size[1], cam_size[0]), mode="bilinear")[0] 35 | return attentions.cpu().numpy() 36 | 37 | 38 | @torch.no_grad() 39 | def mhsa_clip(image, model, cam_size=None, threshold=0.6): 40 | # get patch token features 41 | _, attn_last = model.encode_image(image, last_attn_output=True) # (B, num_heads, L, D) 42 | nh = attn_last.shape[1] # number of head 43 | 44 | # we keep only the output patch attention 45 | # assume batch_size = 1 46 | attentions = attn_last[0, :, 0, 1:].reshape(nh, -1) # (num_heads, 7*7) 47 | heatmap_size = [int(attentions.size(-1)**0.5), int(attentions.size(-1)**0.5)] # 7 48 | 49 | th_attn = get_multi_head_mask(attentions, threshold) 50 | 51 | attn_map = get_masked_attention_map(attentions, nh, heatmap_size, cam_size, mask=th_attn) # (1, H, W) 52 | 53 | return attn_map[0] 54 | 55 | 56 | @torch.no_grad() 57 | def mhsa_clipvip(video, model, cam_size=None, threshold=0.6): 58 | """ video: (B, T, C, H, W) 59 | text: (K, L) 60 | cam_size: (W, H) 61 | """ 62 | num_proxy = model.config.vision_additional_config.add_cls_num + 1 63 | num_heads = model.config.vision_config.num_attention_heads 64 | num_frames = video.size(1) 65 | 66 | # run forward pass to get the last block attentions 67 | _, heatmap_size = model.get_image_features(video, return_ws=True) # (h,w) 68 | last_block = list(dict(model.vision_model.encoder.layers.named_children()).values())[-1] 69 | attn_inter = last_block.attn_probs['inter'] # [B*num_heads, M, M+N*L] where M=4 70 | attn_intra = 
last_block.attn_probs['intra'] # [B*num_heads*N, L, M+L] where L=196 if input_size=224 71 | 72 | num_patches = attn_intra.shape[-2] # L 73 | attentions_inter = attn_inter[:, 0, num_proxy:].reshape(-1, num_heads, num_frames, num_patches)[0] # [B*num_heads, N*L] --> [num_heads, N, L] 74 | # attentions_intra = attn_intra[:, 0, num_proxy:].reshape(-1, num_heads, num_frames, num_patches)[0] # [B*num_heads*N, L] --> [num_heads, N, L] 75 | 76 | th_attn = get_multi_head_mask(attentions_inter, threshold) 77 | attn_map = get_masked_attention_map(attentions_inter, num_heads, heatmap_size, cam_size, mask=th_attn) # (T, H, W) 78 | 79 | # temporal weights 80 | temporal_weights = attn_inter[:, 0, num_proxy:].reshape(-1, num_frames, num_patches).sum(dim=-1) # [B*num_heads, N] 81 | temporal_weights = temporal_weights.reshape(-1, num_heads, num_frames)[0].sum(dim=0) # [N] 82 | temporal_weights -= temporal_weights.min(dim=-1, keepdim=True)[0] 83 | temporal_weights /= temporal_weights.max(dim=-1, keepdim=True)[0] 84 | temporal_weights = temporal_weights.cpu().numpy() 85 | attn_map = temporal_weights[:, None, None] * attn_map 86 | 87 | # # visualize the weights 88 | # import matplotlib.pyplot as plt 89 | # import numpy as np 90 | # plt.bar(np.arange(num_frames) + 1, temporal_weights, 0.4) 91 | # plt.xlabel("video frames") 92 | # plt.ylabel("normalized attentions") 93 | # plt.xticks(np.arange(num_frames) + 1) 94 | # plt.tight_layout() 95 | # plt.savefig("../../_temp./temporal_weights.png") 96 | 97 | return attn_map -------------------------------------------------------------------------------- /alphaction/cam/ritsm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import cv2 4 | 5 | 6 | def clip_forward(model, image, text): 7 | # get patch token features 8 | image_features, encoder_out = model.encode_image(image, transformer_output=True) # (N, L, D) 9 | text_features = model.encode_text(text) 10 | 11 | # cosine similarity as logits 12 | logit_scale = model.logit_scale.exp() 13 | logits_per_image = logit_scale * image_features @ text_features.t() 14 | logits_per_text = logit_scale * text_features @ image_features.t() 15 | 16 | return logits_per_image, encoder_out, text_features 17 | 18 | 19 | @torch.no_grad() 20 | def ritsm_clip(image, text, model, device, index=None, cam_size=None, return_logits=False, attn_grad=False): 21 | # forward pass 22 | logits_per_image, encoder_out, text_features = clip_forward(model, image, text) 23 | probs = logits_per_image.softmax(dim=-1) 24 | if index is None: 25 | # locate the largest score of img-text pair 26 | index = np.argmax(logits_per_image.cpu().data.numpy(), axis=-1) 27 | 28 | input_size = model.visual.input_resolution # 224 29 | patch_features = encoder_out[:, 1:, :] # (B, 7*7, 768) 30 | heatmap_size = int(patch_features.size(1)**0.5) # 7 31 | 32 | # projection 33 | patch_features = model.visual.ln_post(patch_features) 34 | if model.visual.proj is not None: 35 | patch_features = patch_features @ model.visual.proj # (B, 7*7, 512) 36 | 37 | # normalize 38 | patch_features = patch_features / patch_features.norm(dim=-1, keepdim=True) 39 | # text_features = text_features / text_features.norm(dim=-1, keepdim=True) # (K=1, 512) 40 | 41 | # image-text similarity 42 | it_sim = patch_features @ text_features.t() # (B, 7*7, K=1) 43 | 44 | # reshape & resize 45 | image_relevance_all = it_sim[:, :, index].view(-1, 1, heatmap_size, heatmap_size) # (B, 1, 7, 7) 46 | image_relevance_all = 
torch.nn.functional.interpolate(image_relevance_all.float(), size=input_size, mode='bilinear') # (B, 1, H, W) 47 | image_relevance = image_relevance_all[0] # assume batch_size = 1 48 | image_relevance = image_relevance.reshape(input_size, input_size).detach().cpu().numpy() 49 | image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min()) 50 | # reverse 51 | image_relevance = np.fabs(1 - image_relevance) 52 | 53 | out = cv2.resize(image_relevance, cam_size) if cam_size is not None else image_relevance 54 | if return_logits: 55 | return out, logits_per_image 56 | return out 57 | 58 | 59 | @torch.no_grad() 60 | def ritsm_clipvip(video, text, model, device, index=None, cam_size=None, return_logits=False, attn_grad=False, use_mask=False): 61 | """ video: (B, T, C, H, W) 62 | text: (K, L) 63 | """ 64 | num_proxy = model.config.vision_additional_config.add_cls_num + 1 65 | eos_idx = text.argmax(dim=-1) 66 | num_frames = video.size(1) 67 | 68 | input_size = model.config.vision_config.image_size # 224 69 | patch_size = model.config.vision_config.patch_size # 16 70 | num_patches = int(input_size // patch_size) # 14 71 | 72 | # run forward pass 73 | out_dict = model(text, video) 74 | logits_per_image = out_dict['logits_per_image'] 75 | 76 | if index is None: 77 | # locate the largest score of img-text pair 78 | index = np.argmax(logits_per_image.cpu().data.numpy(), axis=-1) 79 | 80 | # get patch features from the last vision encoder block 81 | patch_features = out_dict['vision_model_output']['last_hidden_state'][:, num_proxy:, :] # (B, T*14*14, 768) 82 | assert num_frames * (num_patches ** 2) == patch_features.size(1) 83 | 84 | # layernorm, projection, and normalization 85 | patch_features = model.vision_model.post_layernorm(patch_features) 86 | patch_features = model.visual_projection(patch_features) # 768 --> 512 87 | patch_features = patch_features / patch_features.norm(dim=-1, keepdim=True) # (B, T*14*14, 512) 88 | 89 | # get the text features 90 | text_features = out_dict['text_embeds'] # after layernorm, projection, and normalization 91 | 92 | # image-text similarity 93 | it_sim = patch_features @ text_features.t() # (B, T*14*14, K=1) 94 | 95 | if use_mask: 96 | th_attn = get_attn_mask(it_sim[0, :, index].view(num_frames, -1), threshold=0.6) 97 | it_sim = it_sim * th_attn.view(-1).unsqueeze(0).unsqueeze(-1) 98 | 99 | # reshape & resize 100 | image_relevance_all = it_sim[:, :, index].view(-1, num_frames, num_patches, num_patches) # (B, T, 14, 14) 101 | image_relevance_all = torch.nn.functional.interpolate(image_relevance_all.float(), size=input_size, mode='bilinear') # (B, T, H, W) 102 | 103 | # assume batch_size = 1 104 | image_relevance_all = image_relevance_all[0] 105 | 106 | all_maps = [] 107 | for image_relevance in image_relevance_all: 108 | image_relevance = image_relevance.reshape(input_size, input_size).detach().cpu().numpy() 109 | # normalize and reverse 110 | image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min()) 111 | # reverse 112 | image_relevance = np.fabs(1 - image_relevance) 113 | atten_map = cv2.resize(image_relevance, cam_size) if cam_size is not None else image_relevance 114 | all_maps.append(atten_map) 115 | 116 | out = np.stack(all_maps, axis=0) 117 | 118 | if return_logits: 119 | return out, logits_per_image 120 | return out 121 | 122 | 123 | def get_attn_mask(attentions, threshold=0.6): 124 | """ attentions: (T, L) 125 | """ 126 | nh = attentions.size(0) 127 | # we keep 
only a certain percentage of the mass 128 | val, idx = torch.sort(attentions) 129 | val /= torch.sum(val, dim=-1, keepdim=True) 130 | cumval = torch.cumsum(val, dim=-1) 131 | th_attn = cumval > (1 - threshold) 132 | idx2 = torch.argsort(idx) # dim=-1 by default 133 | th_attn = th_attn.view(nh, -1) 134 | for head in range(nh): 135 | th_attn[head] = th_attn[head][idx2[head].view(-1)] 136 | return th_attn -------------------------------------------------------------------------------- /alphaction/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .defaults import _C as cfg 2 | -------------------------------------------------------------------------------- /alphaction/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_data_loader 2 | -------------------------------------------------------------------------------- /alphaction/dataset/build.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import copy 3 | import torch.utils.data 4 | from alphaction.utils.comm import get_world_size 5 | from . import datasets as D 6 | from . import samplers 7 | from .collate_batch import BatchCollator 8 | 9 | def build_dataset(cfg, split): 10 | if cfg.DATA.DATASETS[0] == 'ucf24': 11 | dataset = D.UCF24(cfg, split) 12 | elif cfg.DATA.DATASETS[0] == 'jhmdb': 13 | dataset = D.Jhmdb(cfg, split) 14 | elif cfg.DATA.DATASETS[0] == 'ava_v2.2': 15 | dataset = D.Ava(cfg, split) 16 | else: 17 | raise NotImplementedError 18 | 19 | return [dataset] 20 | 21 | def make_data_sampler(dataset, shuffle, distributed): 22 | if distributed: 23 | return samplers.DistributedSampler(dataset, shuffle=shuffle) 24 | if shuffle: 25 | sampler = torch.utils.data.sampler.RandomSampler(dataset) 26 | else: 27 | sampler = torch.utils.data.sampler.SequentialSampler(dataset) 28 | return sampler 29 | 30 | 31 | def _quantize(x, bins): 32 | bins = copy.copy(bins) 33 | bins = sorted(bins) 34 | quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) 35 | return quantized 36 | 37 | 38 | def _compute_aspect_ratios(dataset): 39 | aspect_ratios = [] 40 | for i in range(len(dataset)): 41 | video_info = dataset.get_video_info(i) 42 | aspect_ratio = float(video_info["height"]) / float(video_info["width"]) 43 | aspect_ratios.append(aspect_ratio) 44 | return aspect_ratios 45 | 46 | 47 | def make_batch_data_sampler( 48 | dataset, sampler, aspect_grouping, videos_per_batch, num_iters=None, start_iter=0, drop_last=False 49 | ): 50 | if aspect_grouping: 51 | if not isinstance(aspect_grouping, (list, tuple)): 52 | aspect_grouping = [aspect_grouping] 53 | aspect_ratios = _compute_aspect_ratios(dataset) 54 | group_ids = _quantize(aspect_ratios, aspect_grouping) 55 | batch_sampler = samplers.GroupedBatchSampler( 56 | sampler, group_ids, videos_per_batch, drop_uneven=drop_last 57 | ) 58 | else: 59 | batch_sampler = torch.utils.data.sampler.BatchSampler( 60 | sampler, videos_per_batch, drop_last=drop_last 61 | ) 62 | if num_iters is not None: 63 | batch_sampler = samplers.IterationBasedBatchSampler( 64 | batch_sampler, num_iters, start_iter 65 | ) 66 | return batch_sampler 67 | 68 | 69 | def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): 70 | num_gpus = get_world_size() 71 | if is_train: 72 | # for training 73 | videos_per_batch = cfg.SOLVER.VIDEOS_PER_BATCH 74 | assert ( 75 | videos_per_batch % num_gpus == 0 76 | ), "SOLVER.VIDEOS_PER_BATCH ({}) 
must be divisible by the number " 77 | "of GPUs ({}) used.".format(videos_per_batch, num_gpus) 78 | videos_per_gpu = videos_per_batch // num_gpus 79 | shuffle = True 80 | drop_last = True 81 | # num_iters = cfg.SOLVER.MAX_EPOCH*cfg.SOLVER.ITER_PER_EPOCH 82 | split = 'train' 83 | else: 84 | # for testing 85 | videos_per_batch = cfg.TEST.VIDEOS_PER_BATCH 86 | assert ( 87 | videos_per_batch % num_gpus == 0 88 | ), "TEST.VIDEOS_PER_BATCH ({}) must be divisible by the number " 89 | "of GPUs ({}) used.".format(videos_per_batch, num_gpus) 90 | videos_per_gpu = videos_per_batch // num_gpus 91 | shuffle = False if not is_distributed else True 92 | drop_last = False 93 | # num_iters = None 94 | start_iter = 0 95 | split = 'test' 96 | 97 | # group images which have similar aspect ratio. In this case, we only 98 | # group in two cases: those with width / height > 1, and the other way around, 99 | # but the code supports more general grouping strategy 100 | aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] 101 | 102 | # build dataset 103 | datasets = build_dataset(cfg, split=split) 104 | 105 | # build sampler and dataloader 106 | data_loaders, vocabularies, iter_per_epoch_all = [], [], [] 107 | for dataset in datasets: 108 | if is_train: 109 | # number of iterations for all epochs 110 | iter_per_epoch = int(len(dataset) // cfg.SOLVER.VIDEOS_PER_BATCH) if cfg.SOLVER.ITER_PER_EPOCH == -1 else cfg.SOLVER.ITER_PER_EPOCH 111 | iter_per_epoch_all.append(iter_per_epoch) 112 | num_iters = cfg.SOLVER.MAX_EPOCH * iter_per_epoch if is_train else None 113 | # sampler 114 | sampler = make_data_sampler(dataset, shuffle, is_distributed) 115 | batch_sampler = make_batch_data_sampler( 116 | dataset, sampler, aspect_grouping, videos_per_gpu, num_iters, start_iter, drop_last 117 | ) 118 | collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) 119 | num_workers = cfg.DATALOADER.NUM_WORKERS 120 | data_loader = torch.utils.data.DataLoader( 121 | dataset, 122 | num_workers=num_workers, 123 | batch_sampler=batch_sampler, 124 | collate_fn=collator, 125 | ) 126 | data_loaders.append(data_loader) 127 | if cfg.DATA.OPEN_VOCABULARY: 128 | vocabularies.append(dataset.text_input) 129 | else: 130 | vocabularies.append(None) 131 | if is_train: 132 | # during training, a single (possibly concatenated) data_loader is returned 133 | assert len(data_loaders) == 1 134 | return data_loaders[0], vocabularies[0]['closed'], iter_per_epoch_all[0] 135 | 136 | vocabularies_val = [] 137 | if len(vocabularies) > 0: 138 | for vocab in vocabularies: 139 | if cfg.TEST.EVAL_OPEN and vocab is not None: 140 | vocabularies_val.append(vocab['open']) 141 | else: 142 | vocabularies_val.append(vocab['closed']) 143 | 144 | return data_loaders, vocabularies_val, iter_per_epoch_all -------------------------------------------------------------------------------- /alphaction/dataset/collate_batch.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | def batch_different_videos(videos, size_divisible=0): 6 | ''' 7 | :param videos: a list of video tensors 8 | :param size_divisible: output_size(width and height) should be divisble by this param 9 | :return: batched videos as a single tensor 10 | ''' 11 | assert isinstance(videos, (tuple, list)) 12 | max_size = tuple(max(s) for s in zip(*[clip.shape for clip in videos])) 13 | 14 | if size_divisible > 0: 15 | stride = size_divisible 16 | max_size = list(max_size) 17 | max_size[2] = int(math.ceil(max_size[2] / stride) * 
stride) 18 | max_size[3] = int(math.ceil(max_size[3] / stride) * stride) 19 | max_size = tuple(max_size) 20 | 21 | batch_shape = (len(videos),) + max_size 22 | batched_clips = videos[0].new(*batch_shape).zero_() 23 | for clip, pad_clip in zip(videos, batched_clips): 24 | pad_clip[:clip.shape[0], :clip.shape[1], :clip.shape[2], :clip.shape[3]].copy_(clip) 25 | 26 | return batched_clips 27 | 28 | 29 | class BatchCollator(object): 30 | """ 31 | From a list of samples from the dataset, 32 | returns the batched objectimages and targets. 33 | This should be passed to the DataLoader 34 | """ 35 | 36 | def __init__(self, size_divisible=0): 37 | self.divisible = size_divisible 38 | self.size_divisible = self.divisible 39 | 40 | def __call__(self, batch): 41 | transposed_batch = list(zip(*batch)) 42 | slow_clips = batch_different_videos(transposed_batch[0], self.size_divisible) 43 | if transposed_batch[1][0] is not None: 44 | fast_clips = batch_different_videos(transposed_batch[1], self.size_divisible) 45 | else: 46 | fast_clips = None 47 | whwh = torch.stack(transposed_batch[2]) 48 | boxes = transposed_batch[3] 49 | label_arrs = transposed_batch[4] 50 | metadata = transposed_batch[5] 51 | clip_ids = transposed_batch[6] 52 | return slow_clips, fast_clips, whwh, boxes, label_arrs, metadata, clip_ids -------------------------------------------------------------------------------- /alphaction/dataset/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .concat_dataset import ConcatDataset 2 | from .ava_dataset import Ava 3 | from .jhmdb_dataset import Jhmdb 4 | from .ucf24_dataset import UCF24 5 | 6 | __all__ = ["ConcatDataset", "Ava", "Jhmdb", "UCF24"] -------------------------------------------------------------------------------- /alphaction/dataset/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | 3 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 4 | 5 | 6 | class ConcatDataset(_ConcatDataset): 7 | """ 8 | Same as torch.utils.dataset.dataset.ConcatDataset, but exposes an extra 9 | method for querying the sizes of the image 10 | """ 11 | 12 | def get_idxs(self, idx): 13 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 14 | if dataset_idx == 0: 15 | sample_idx = idx 16 | else: 17 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 18 | return dataset_idx, sample_idx 19 | 20 | def get_video_info(self, idx): 21 | dataset_idx, sample_idx = self.get_idxs(idx) 22 | return self.datasets[dataset_idx].get_video_info(sample_idx) 23 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from alphaction.dataset import datasets 2 | 3 | from .ava import ava_evaluation 4 | from .jhmdb import jhmdb_evaluation 5 | from .ucf24 import ucf24_evaluation 6 | 7 | 8 | def evaluate(dataset, predictions, output_folder, **kwargs): 9 | """evaluate dataset using different methods based on dataset type. 10 | Args: 11 | dataset: Dataset object 12 | predictions(list[BoxList]): each item in the list represents the 13 | prediction results for one image. 14 | output_folder: output folder, to save evaluation files or results. 15 | **kwargs: other args. 
16 | Returns: 17 | evaluation result 18 | """ 19 | args = dict( 20 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 21 | ) 22 | if isinstance(dataset, datasets.Ava): 23 | return ava_evaluation(**args) 24 | elif isinstance(dataset, datasets.Jhmdb): 25 | return jhmdb_evaluation(**args) 26 | elif isinstance(dataset, datasets.UCF24): 27 | return ucf24_evaluation(**args) 28 | else: 29 | dataset_name = dataset.__class__.__name__ 30 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 31 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/README.md: -------------------------------------------------------------------------------- 1 | The evaluation code of AVA is modified from [https://github.com/activitynet/ActivityNet](https://github.com/activitynet/ActivityNet). -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .ava_eval import do_ava_evaluation 3 | 4 | 5 | def ava_evaluation(dataset, predictions, output_folder, **kwargs): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing ava evaluation.") 8 | return do_ava_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | metric=kwargs.get('metric', 'frame_ap'), 14 | save_csv=kwargs.get('save_csv', False) 15 | ) 16 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/jhmdb/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .jhmdb_eval import do_jhmdb_evaluation 3 | 4 | 5 | def jhmdb_evaluation(dataset, predictions, output_folder, **kwargs): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing jhmdb evaluation.") 8 | return do_jhmdb_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | metric=kwargs.get('metric', 'frame_ap'), 14 | save_csv=kwargs.get('save_csv', False) 15 | ) -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/dataset/datasets/evaluation/pascal_evaluation/__init__.py -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Functions for computing metrics like precision, recall, CorLoc and etc.""" 17 | from __future__ import division 18 | 19 | import numpy as np 20 | 21 | 22 | def compute_precision_recall(scores, labels, num_gt): 23 | """Compute precision and recall. 24 | 25 | Args: 26 | scores: A float numpy array representing detection score 27 | labels: A boolean numpy array representing true/false positive labels 28 | num_gt: Number of ground truth instances 29 | 30 | Raises: 31 | ValueError: if the input is not of the correct format 32 | 33 | Returns: 34 | precision: Fraction of positive instances over detected ones. This value is 35 | None if no ground truth labels are present. 36 | recall: Fraction of detected positive instance over all positive instances. 37 | This value is None if no ground truth labels are present. 38 | 39 | """ 40 | if not isinstance( 41 | labels, np.ndarray) or labels.dtype != np.bool or len(labels.shape) != 1: 42 | raise ValueError("labels must be single dimension bool numpy array") 43 | 44 | if not isinstance( 45 | scores, np.ndarray) or len(scores.shape) != 1: 46 | raise ValueError("scores must be single dimension numpy array") 47 | 48 | if num_gt < np.sum(labels): 49 | raise ValueError("Number of true positives must be smaller than num_gt.") 50 | 51 | if len(scores) != len(labels): 52 | raise ValueError("scores and labels must be of the same size.") 53 | 54 | if num_gt == 0: 55 | return None, None 56 | 57 | sorted_indices = np.argsort(scores) 58 | sorted_indices = sorted_indices[::-1] 59 | labels = labels.astype(int) 60 | true_positive_labels = labels[sorted_indices] 61 | false_positive_labels = 1 - true_positive_labels 62 | cum_true_positives = np.cumsum(true_positive_labels) 63 | cum_false_positives = np.cumsum(false_positive_labels) 64 | precision = cum_true_positives.astype(float) / ( 65 | cum_true_positives + cum_false_positives) 66 | recall = cum_true_positives.astype(float) / num_gt 67 | return precision, recall 68 | 69 | 70 | def compute_average_precision(precision, recall): 71 | """Compute Average Precision according to the definition in VOCdevkit. 72 | 73 | Precision is modified to ensure that it does not decrease as recall 74 | decrease. 75 | 76 | Args: 77 | precision: A float [N, 1] numpy array of precisions 78 | recall: A float [N, 1] numpy array of recalls 79 | 80 | Raises: 81 | ValueError: if the input is not of the correct format 82 | 83 | Returns: 84 | average_precison: The area under the precision recall curve. NaN if 85 | precision and recall are None. 
86 | 87 | """ 88 | if precision is None: 89 | if recall is not None: 90 | raise ValueError("If precision is None, recall must also be None") 91 | return np.NAN 92 | 93 | if not isinstance(precision, np.ndarray) or not isinstance(recall, 94 | np.ndarray): 95 | raise ValueError("precision and recall must be numpy array") 96 | if precision.dtype != np.float or recall.dtype != np.float: 97 | raise ValueError("input must be float numpy array.") 98 | if len(precision) != len(recall): 99 | raise ValueError("precision and recall must be of the same size.") 100 | if not precision.size: 101 | return 0.0 102 | if np.amin(precision) < 0 or np.amax(precision) > 1: 103 | raise ValueError("Precision must be in the range of [0, 1].") 104 | if np.amin(recall) < 0 or np.amax(recall) > 1: 105 | raise ValueError("recall must be in the range of [0, 1].") 106 | if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): 107 | raise ValueError("recall must be a non-decreasing array") 108 | 109 | recall = np.concatenate([[0], recall, [1]]) 110 | precision = np.concatenate([[0], precision, [0]]) 111 | 112 | # Preprocess precision to be a non-decreasing array 113 | for i in range(len(precision) - 2, -1, -1): 114 | precision[i] = np.maximum(precision[i], precision[i + 1]) 115 | 116 | indices = np.where(recall[1:] != recall[:-1])[0] + 1 117 | average_precision = np.sum( 118 | (recall[indices] - recall[indices - 1]) * precision[indices]) 119 | return average_precision 120 | 121 | 122 | def compute_cor_loc(num_gt_imgs_per_class, 123 | num_images_correctly_detected_per_class): 124 | """Compute CorLoc according to the definition in the following paper. 125 | 126 | https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf 127 | 128 | Returns nans if there are no ground truth images for a class. 129 | 130 | Args: 131 | num_gt_imgs_per_class: 1D array, representing number of images containing 132 | at least one object instance of a particular class 133 | num_images_correctly_detected_per_class: 1D array, representing number of 134 | images that are correctly detected at least one object instance of a 135 | particular class 136 | 137 | Returns: 138 | corloc_per_class: A float numpy array represents the corloc score of each 139 | class 140 | """ 141 | # Divide by zero expected for classes with no gt examples. 142 | with np.errstate(divide="ignore", invalid="ignore"): 143 | return np.where( 144 | num_gt_imgs_per_class == 0, np.nan, 145 | num_images_correctly_detected_per_class / num_gt_imgs_per_class) 146 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | import numpy as np 19 | 20 | 21 | class BoxList(object): 22 | """Box collection. 23 | 24 | BoxList represents a list of bounding boxes as numpy array, where each 25 | bounding box is represented as a row of 4 numbers, 26 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 27 | given list correspond to a single image. 28 | 29 | Optionally, users can add additional related fields (such as 30 | objectness/classification scores). 31 | """ 32 | 33 | def __init__(self, data): 34 | """Constructs box collection. 35 | 36 | Args: 37 | data: a numpy array of shape [N, 4] representing box coordinates 38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | """ 43 | if not isinstance(data, np.ndarray): 44 | raise ValueError('dataset must be a numpy array.') 45 | if len(data.shape) != 2 or data.shape[1] != 4: 46 | raise ValueError('Invalid dimensions for box dataset.') 47 | if data.dtype != np.float32 and data.dtype != np.float64: 48 | raise ValueError('Invalid dataset type for box dataset: float is required.') 49 | if not self._is_valid_boxes(data): 50 | raise ValueError('Invalid box dataset. dataset must be a numpy array of ' 51 | 'N*[y_min, x_min, y_max, x_max]') 52 | self.data = {'boxes': data} 53 | 54 | def num_boxes(self): 55 | """Return number of boxes held in collections.""" 56 | return self.data['boxes'].shape[0] 57 | 58 | def get_extra_fields(self): 59 | """Return all non-box fields.""" 60 | return [k for k in self.data.keys() if k != 'boxes'] 61 | 62 | def has_field(self, field): 63 | return field in self.data 64 | 65 | def add_field(self, field, field_data): 66 | """Add dataset to a specified field. 67 | 68 | Args: 69 | field: a string parameter used to speficy a related field to be accessed. 70 | field_data: a numpy array of [N, ...] representing the dataset associated 71 | with the field. 72 | Raises: 73 | ValueError: if the field is already exist or the dimension of the field 74 | dataset does not matches the number of boxes. 75 | """ 76 | if self.has_field(field): 77 | raise ValueError('Field ' + field + 'already exists') 78 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 79 | raise ValueError('Invalid dimensions for field dataset') 80 | self.data[field] = field_data 81 | 82 | def get(self): 83 | """Convenience function for accesssing box coordinates. 84 | 85 | Returns: 86 | a numpy array of shape [N, 4] representing box corners 87 | """ 88 | return self.get_field('boxes') 89 | 90 | def get_field(self, field): 91 | """Accesses dataset associated with the specified field in the box collection. 92 | 93 | Args: 94 | field: a string parameter used to speficy a related field to be accessed. 95 | 96 | Returns: 97 | a numpy 1-d array representing dataset of an associated field 98 | 99 | Raises: 100 | ValueError: if invalid field 101 | """ 102 | if not self.has_field(field): 103 | raise ValueError('field {} does not exist'.format(field)) 104 | return self.data[field] 105 | 106 | def get_coordinates(self): 107 | """Get corner coordinates of boxes. 
108 | 109 | Returns: 110 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 111 | """ 112 | box_coordinates = self.get() 113 | y_min = box_coordinates[:, 0] 114 | x_min = box_coordinates[:, 1] 115 | y_max = box_coordinates[:, 2] 116 | x_max = box_coordinates[:, 3] 117 | return [y_min, x_min, y_max, x_max] 118 | 119 | def _is_valid_boxes(self, data): 120 | """Check whether dataset fullfills the format of N*[ymin, xmin, ymax, xmin]. 121 | 122 | Args: 123 | data: a numpy array of shape [N, 4] representing box coordinates 124 | 125 | Returns: 126 | a boolean indicating whether all ymax of boxes are equal or greater than 127 | ymin, and all xmax of boxes are equal or greater than xmin. 128 | """ 129 | if data.shape[0] > 0: 130 | for i in range(data.shape[0]): 131 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 132 | return False 133 | return True 134 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | import numpy as np 19 | from . import np_box_list 20 | 21 | 22 | class BoxMaskList(np_box_list.BoxList): 23 | """Convenience wrapper for BoxList with masks. 24 | 25 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 26 | In particular, its constructor receives both boxes and masks. Note that the 27 | masks correspond to the full image. 28 | """ 29 | 30 | def __init__(self, box_data, mask_data): 31 | """Constructs box collection. 32 | 33 | Args: 34 | box_data: a numpy array of shape [N, 4] representing box coordinates 35 | mask_data: a numpy array of shape [N, height, width] representing masks 36 | with values are in {0,1}. The masks correspond to the full 37 | image. The height and the width will be equal to image height and width. 
38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | ValueError: if mask dataset is not a numpy array 43 | ValueError: if invalid dimension for mask dataset 44 | """ 45 | super(BoxMaskList, self).__init__(box_data) 46 | if not isinstance(mask_data, np.ndarray): 47 | raise ValueError('Mask dataset must be a numpy array.') 48 | if len(mask_data.shape) != 3: 49 | raise ValueError('Invalid dimensions for mask dataset.') 50 | if mask_data.dtype != np.uint8: 51 | raise ValueError('Invalid dataset type for mask dataset: uint8 is required.') 52 | if mask_data.shape[0] != box_data.shape[0]: 53 | raise ValueError('There should be the same number of boxes and masks.') 54 | self.data['masks'] = mask_data 55 | 56 | def get_masks(self): 57 | """Convenience function for accessing masks. 58 | 59 | Returns: 60 | a numpy array of shape [N, height, width] representing masks 61 | """ 62 | return self.get_field('masks') 63 | 64 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | 25 | def area(boxes): 26 | """Computes area of boxes. 27 | 28 | Args: 29 | boxes: Numpy array with shape [N, 4] holding N boxes 30 | 31 | Returns: 32 | a numpy array with shape [N*1] representing box areas 33 | """ 34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 35 | 36 | 37 | def intersection(boxes1, boxes2): 38 | """Compute pairwise intersection areas between boxes. 
39 | 40 | Args: 41 | boxes1: a numpy array with shape [N, 4] holding N boxes 42 | boxes2: a numpy array with shape [M, 4] holding M boxes 43 | 44 | Returns: 45 | a numpy array with shape [N*M] representing pairwise intersection area 46 | """ 47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 49 | 50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 52 | intersect_heights = np.maximum( 53 | np.zeros(all_pairs_max_ymin.shape), 54 | all_pairs_min_ymax - all_pairs_max_ymin) 55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 57 | intersect_widths = np.maximum( 58 | np.zeros(all_pairs_max_xmin.shape), 59 | all_pairs_min_xmax - all_pairs_max_xmin) 60 | return intersect_heights * intersect_widths 61 | 62 | 63 | def iou(boxes1, boxes2): 64 | """Computes pairwise intersection-over-union between box collections. 65 | 66 | Args: 67 | boxes1: a numpy array with shape [N, 4] holding N boxes. 68 | boxes2: a numpy array with shape [M, 4] holding N boxes. 69 | 70 | Returns: 71 | a numpy array with shape [N, M] representing pairwise iou scores. 72 | """ 73 | intersect = intersection(boxes1, boxes2) 74 | area1 = area(boxes1) 75 | area2 = area(boxes2) 76 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 77 | area2, axis=0) - intersect 78 | return intersect / union 79 | 80 | 81 | def ioa(boxes1, boxes2): 82 | """Computes pairwise intersection-over-area between box collections. 83 | 84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 85 | their intersection area over box2's area. Note that ioa is not symmetric, 86 | that is, IOA(box1, box2) != IOA(box2, box1). 87 | 88 | Args: 89 | boxes1: a numpy array with shape [N, 4] holding N boxes. 90 | boxes2: a numpy array with shape [M, 4] holding N boxes. 91 | 92 | Returns: 93 | a numpy array with shape [N, M] representing pairwise ioa scores. 94 | """ 95 | intersect = intersection(boxes1, boxes2) 96 | areas = np.expand_dims(area(boxes2), axis=0) 97 | return intersect / areas 98 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | EPSILON = 1e-7 25 | 26 | 27 | def area(masks): 28 | """Computes area of masks. 
29 | 30 | Args: 31 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 32 | values are of type np.uint8 and values are in {0,1}. 33 | 34 | Returns: 35 | a numpy array with shape [N*1] representing mask areas. 36 | 37 | Raises: 38 | ValueError: If masks.dtype is not np.uint8 39 | """ 40 | if masks.dtype != np.uint8: 41 | raise ValueError('Masks type should be np.uint8') 42 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 43 | 44 | 45 | def intersection(masks1, masks2): 46 | """Compute pairwise intersection areas between masks. 47 | 48 | Args: 49 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 50 | values are of type np.uint8 and values are in {0,1}. 51 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 52 | values are of type np.uint8 and values are in {0,1}. 53 | 54 | Returns: 55 | a numpy array with shape [N*M] representing pairwise intersection area. 56 | 57 | Raises: 58 | ValueError: If masks1 and masks2 are not of type np.uint8. 59 | """ 60 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 61 | raise ValueError('masks1 and masks2 should be of type np.uint8') 62 | n = masks1.shape[0] 63 | m = masks2.shape[0] 64 | answer = np.zeros([n, m], dtype=np.float32) 65 | for i in np.arange(n): 66 | for j in np.arange(m): 67 | answer[i, j] = np.sum(np.minimum(masks1[i], masks2[j]), dtype=np.float32) 68 | return answer 69 | 70 | 71 | def iou(masks1, masks2): 72 | """Computes pairwise intersection-over-union between mask collections. 73 | 74 | Args: 75 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 76 | values are of type np.uint8 and values are in {0,1}. 77 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 78 | values are of type np.uint8 and values are in {0,1}. 79 | 80 | Returns: 81 | a numpy array with shape [N, M] representing pairwise iou scores. 82 | 83 | Raises: 84 | ValueError: If masks1 and masks2 are not of type np.uint8. 85 | """ 86 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 87 | raise ValueError('masks1 and masks2 should be of type np.uint8') 88 | intersect = intersection(masks1, masks2) 89 | area1 = area(masks1) 90 | area2 = area(masks2) 91 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 92 | area2, axis=0) - intersect 93 | return intersect / np.maximum(union, EPSILON) 94 | 95 | 96 | def ioa(masks1, masks2): 97 | """Computes pairwise intersection-over-area between box collections. 98 | 99 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 100 | their intersection area over mask2's area. Note that ioa is not symmetric, 101 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 102 | 103 | Args: 104 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 105 | values are of type np.uint8 and values are in {0,1}. 106 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 107 | values are of type np.uint8 and values are in {0,1}. 108 | 109 | Returns: 110 | a numpy array with shape [N, M] representing pairwise ioa scores. 111 | 112 | Raises: 113 | ValueError: If masks1 and masks2 are not of type np.uint8. 
114 | """ 115 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 116 | raise ValueError('masks1 and masks2 should be of type np.uint8') 117 | intersect = intersection(masks1, masks2) 118 | areas = np.expand_dims(area(masks2), axis=0) 119 | return intersect / (areas + EPSILON) 120 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/pascal_wrapper.py: -------------------------------------------------------------------------------- 1 | from .pascal_evaluation import object_detection_evaluation, standard_fields 2 | import numpy as np 3 | 4 | 5 | 6 | def parse_id(activity_list=None, class_num=24): 7 | if activity_list is None: # use the class ID instead 8 | activity_list = ['Class{}'.format(i) for i in range(class_num)] 9 | # activity_list = ['Basketball', 'BasketballDunk', 'Biking', 'CliffDiving', 'CricketBowling', 'Diving', 'Fencing', 'FloorGymnastics', 'GolfSwing', 'HorseRiding', 'IceDancing', 'LongJump', 'PoleVault', 'RopeClimbing', 'SalsaSpin', 'SkateBoarding', 'Skiing', 'Skijet', 'SoccerJuggling', 'Surfing', 'TennisSwing', 'TrampolineJumping', 'VolleyballSpiking', 'WalkingWithDog'] 10 | categories = [] 11 | for i, act_name in enumerate(activity_list): 12 | categories.append({'id': i + 1, 'name': act_name}) 13 | return categories 14 | 15 | 16 | class STDetectionEvaluaterUCF(object): 17 | ''' 18 | evaluater class designed for multi-iou thresholds 19 | based on https://github.com/activitynet/ActivityNet/blob/master/Evaluation/get_ava_performance.py 20 | parameters: 21 | dataset that provide GT annos, in the format of AWSCVMotionDataset 22 | tiou_thresholds: a list of iou thresholds 23 | attributes: 24 | clear(): clear detection results, GT is kept 25 | load_detection_from_path(), load anno from a list of path, in the format of [confi x1 y1 x2 y2 scoresx15] 26 | evaluate(): run evaluation code 27 | ''' 28 | 29 | def __init__(self, tiou_thresholds=[0.5], load_from_dataset=False, activity_list=None, class_num=24): 30 | categories = parse_id(activity_list=activity_list, class_num=class_num) 31 | self.class_num = class_num 32 | self.categories = categories 33 | self.tiou_thresholds = tiou_thresholds 34 | self.lst_pascal_evaluator = [] 35 | self.load_from_dataset = load_from_dataset 36 | self.exclude_key = [] 37 | for iou in self.tiou_thresholds: 38 | self.lst_pascal_evaluator.append( 39 | object_detection_evaluation.PascalDetectionEvaluator(categories, matching_iou_threshold=iou)) 40 | 41 | def clear(self): 42 | for evaluator in self.lst_pascal_evaluator: 43 | evaluator.clear() 44 | 45 | def load_ground_truth(self, ground_truth): 46 | # write into evaluator 47 | for image_key, info in ground_truth.items(): 48 | boxes = info['bbox'].copy() # normalized coordinates 49 | resolution = info['resolution'] 50 | boxes_eval = [] 51 | for box in boxes: 52 | area = (box[3] - box[1]) * resolution[0] * (box[2] - box[0]) * resolution[1] 53 | if area < 10: continue # ignore too small boxes 54 | boxes_eval.append(box) 55 | if len(boxes_eval) == 0: # no boxes 56 | self.exclude_key.append(image_key) # mark the excluded frames to filter the detections later 57 | continue 58 | 59 | for evaluator in self.lst_pascal_evaluator: 60 | evaluator.add_single_ground_truth_image_info( 61 | image_key, { 62 | standard_fields.InputDataFields.groundtruth_boxes: 63 | np.vstack(boxes_eval), 64 | standard_fields.InputDataFields.groundtruth_classes: 65 | np.array(info['labels'], dtype=int), 66 | standard_fields.InputDataFields.groundtruth_difficult: 67 | 
np.zeros(len(boxes_eval), dtype=bool) 68 | }) 69 | 70 | 71 | def load_detection(self, detections): 72 | """ Load detection results from dict memory 73 | """ 74 | for image_key, info in detections.items(): 75 | # filtering out results that are in the excluded frames 76 | if image_key in self.exclude_key or len(info['boxes']) == 0: 77 | continue 78 | 79 | # sorted by confidence: 80 | boxes, labels, scores = info['boxes'], info['action_ids'], info['scores'] 81 | index = np.argsort(-scores) 82 | boxes, labels, scores = boxes[index], labels[index], scores[index] 83 | 84 | # add info into evaluator 85 | for evaluator in self.lst_pascal_evaluator: 86 | evaluator.add_single_detected_image_info( 87 | image_key, { 88 | standard_fields.DetectionResultFields.detection_boxes: boxes, 89 | standard_fields.DetectionResultFields.detection_classes: labels, 90 | standard_fields.DetectionResultFields.detection_scores: scores 91 | }) 92 | 93 | def evaluate(self): 94 | result = {} 95 | for x, iou in enumerate(self.tiou_thresholds): 96 | evaluator = self.lst_pascal_evaluator[x] 97 | metrics = evaluator.evaluate() 98 | result.update(metrics) 99 | return result 100 | 101 | 102 | def frame_mAP_pascal(_results, _targets, vocab, logger, iou_list=[0.5]): 103 | evaluater = STDetectionEvaluaterUCF(tiou_thresholds=iou_list, activity_list=vocab, class_num=len(vocab)) 104 | 105 | logger.info("Adding ground truth into evaluator") 106 | evaluater.load_ground_truth(_targets) 107 | 108 | logger.info("Adding predictions into evaluator") 109 | evaluater.load_detection(_results) 110 | 111 | eval_res = evaluater.evaluate() 112 | 113 | return eval_res -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ucf24/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .ucf24_eval import do_ucf24_evaluation 3 | 4 | 5 | def ucf24_evaluation(dataset, predictions, output_folder, **kwargs): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing UCF24 evaluation.") 8 | return do_ucf24_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | metric=kwargs.get('metric', 'frame_ap'), 14 | save_csv=kwargs.get('save_csv', False) 15 | ) -------------------------------------------------------------------------------- /alphaction/dataset/datasets/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import logging 4 | import numpy as np 5 | import time 6 | import cv2 7 | import torch 8 | from iopath.common.file_io import g_pathmgr 9 | import os 10 | import pickle 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def retry_load_images(image_paths, retry=10, backend="pytorch"): 17 | """ 18 | This function is to load images with support of retrying for failed load. 19 | Args: 20 | image_paths (list): paths of images needed to be loaded. 21 | retry (int, optional): maximum time of loading retrying. Defaults to 10. 22 | backend (str): `pytorch` or `cv2`. 23 | Returns: 24 | imgs (list): list of loaded images. 
25 | """ 26 | for i in range(retry): 27 | imgs = [] 28 | for image_path in image_paths: 29 | with g_pathmgr.open(image_path, "rb") as f: 30 | img_str = np.frombuffer(f.read(), np.uint8) 31 | img = cv2.imdecode(img_str, flags=cv2.IMREAD_COLOR) 32 | imgs.append(img) 33 | 34 | if all(img is not None for img in imgs): 35 | if backend == "pytorch": 36 | imgs = torch.as_tensor(np.stack(imgs)) 37 | return imgs 38 | else: 39 | logger.warn("Reading failed. Will retry.") 40 | time.sleep(1.0) 41 | if i == retry - 1: 42 | raise Exception("Failed to load images {}".format(image_paths)) 43 | 44 | 45 | def read_greyscale_image(img_file): 46 | assert os.path.exists(img_file), "File does not exist!\n{}".format(img_file) 47 | im = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) 48 | im = im.astype(np.float32) / 255.0 49 | im = torch.from_numpy(im) 50 | return im 51 | 52 | 53 | def get_sequence(center_idx, half_len, sample_rate, num_frames): 54 | """ 55 | Sample frames among the corresponding clip. 56 | Args: 57 | center_idx (int): center frame idx for current clip 58 | half_len (int): half of the clip length 59 | sample_rate (int): sampling rate for sampling frames inside of the clip 60 | num_frames (int): number of expected sampled frames 61 | Returns: 62 | seq (list): list of indexes of sampled frames in this clip. 63 | """ 64 | seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate)) 65 | 66 | for seq_idx in range(len(seq)): 67 | if seq[seq_idx] < 0: 68 | seq[seq_idx] = 0 69 | elif seq[seq_idx] >= num_frames: 70 | seq[seq_idx] = num_frames - 1 71 | return seq 72 | 73 | def pack_pathway_output(cfg, frames, pathways=2): 74 | """ 75 | Prepare output as a list of tensors. Each tensor corresponding to a 76 | unique pathway. 77 | Args: 78 | frames (tensor): frames of images sampled from the video. The 79 | dimension is `channel` x `num frames` x `height` x `width`. 80 | Returns: 81 | frame_list (list): list of tensors with the dimension of 82 | `channel` x `num frames` x `height` x `width`. 83 | """ 84 | if cfg.DATA.REVERSE_INPUT_CHANNEL: 85 | frames = frames[[2, 1, 0], :, :, :] 86 | if pathways==1: 87 | frame_list = [frames] 88 | elif pathways==2: 89 | fast_pathway = frames 90 | # Perform temporal sampling from the fast pathway. 
91 | slow_pathway = torch.index_select( 92 | frames, 93 | 1, 94 | torch.linspace( 95 | 0, frames.shape[1] - 1, frames.shape[1] // cfg.SLOWFAST.ALPHA 96 | ).long(), 97 | ) 98 | frame_list = [slow_pathway, fast_pathway] 99 | else: 100 | raise NotImplementedError() 101 | return frame_list 102 | 103 | 104 | def load_dets_data(det_file, topk=None): 105 | assert os.path.exists(det_file), "detection file does not exist: {}".format(det_file) 106 | with open(det_file, 'rb') as fid: 107 | data = pickle.load(fid, encoding='iso-8859-1') 108 | # get list of all frames 109 | all_dets = dict() 110 | for vid, dets in data.items(): 111 | for i in list(dets['boxes'].keys()): 112 | boxes, scores = dets['boxes'][i], dets['scores'][i] 113 | key = "%s,%05d" % (vid, i) 114 | if topk is None: 115 | all_dets[key] = np.hstack((boxes, scores[:, None])) # (n, 5) 116 | else: 117 | indices = np.argsort(scores)[::-1][:topk] # topK maximum indices 118 | all_dets[key] = np.hstack((boxes[indices], scores[indices, None])) # (n, 5) 119 | return all_dets -------------------------------------------------------------------------------- /alphaction/dataset/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 4 | 5 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 6 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Code is copy-pasted exactly as in torch.utils.dataset.distributed. 2 | # FIXME remove this once c10d fixes the bug it has 3 | import math 4 | import torch 5 | import torch.distributed as dist 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class DistributedSampler(Sampler): 10 | """Sampler that restricts dataset loading to a subset of the dataset. 11 | It is especially useful in conjunction with 12 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 13 | process can pass a DistributedSampler instance as a DataLoader sampler, 14 | and load a subset of the original dataset that is exclusive to it. 15 | .. note:: 16 | Dataset is assumed to be of constant size. 17 | Arguments: 18 | dataset: Dataset used for sampling. 19 | num_replicas (optional): Number of processes participating in 20 | distributed training. 21 | rank (optional): Rank of the current process within num_replicas. 
22 | """ 23 | 24 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 25 | if num_replicas is None: 26 | if not dist.is_available(): 27 | raise RuntimeError("Requires distributed package to be available") 28 | num_replicas = dist.get_world_size() 29 | if rank is None: 30 | if not dist.is_available(): 31 | raise RuntimeError("Requires distributed package to be available") 32 | rank = dist.get_rank() 33 | self.dataset = dataset 34 | self.num_replicas = num_replicas 35 | self.rank = rank 36 | self.epoch = 0 37 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 38 | self.total_size = self.num_samples * self.num_replicas 39 | self.shuffle = shuffle 40 | 41 | def __iter__(self): 42 | if self.shuffle: 43 | # deterministically shuffle based on epoch 44 | g = torch.Generator() 45 | g.manual_seed(self.epoch) 46 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 47 | else: 48 | indices = torch.arange(len(self.dataset)).tolist() 49 | 50 | # add extra samples to make it evenly divisible 51 | indices += indices[: (self.total_size - len(indices))] 52 | assert len(indices) == self.total_size 53 | 54 | # subsample 55 | offset = self.num_samples * self.rank 56 | indices = indices[offset : offset + self.num_samples] 57 | assert len(indices) == self.num_samples 58 | 59 | return iter(indices) 60 | 61 | def __len__(self): 62 | return self.num_samples 63 | 64 | def set_epoch(self, epoch): 65 | self.epoch = epoch 66 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Modified based on https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py 2 | import itertools 3 | 4 | import torch 5 | from torch.utils.data.sampler import BatchSampler 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class GroupedBatchSampler(BatchSampler): 10 | """ 11 | Wraps another sampler to yield a mini-batch of indices. 12 | It enforces that elements from the same group should appear in groups of batch_size. 13 | It also tries to provide mini-batches which follows an ordering which is 14 | as close as possible to the ordering from the original sampler. 15 | 16 | Arguments: 17 | sampler (Sampler): Base sampler. 18 | batch_size (int): Size of mini-batch. 19 | drop_uneven (bool): If ``True``, the sampler will drop the batches whose 20 | size is less than ``batch_size`` 21 | 22 | """ 23 | 24 | def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): 25 | if not isinstance(sampler, Sampler): 26 | raise ValueError( 27 | "sampler should be an instance of " 28 | "torch.utils.dataset.Sampler, but got sampler={}".format(sampler) 29 | ) 30 | self.sampler = sampler 31 | self.group_ids = torch.as_tensor(group_ids) 32 | assert self.group_ids.dim() == 1 33 | self.batch_size = batch_size 34 | self.drop_uneven = drop_uneven 35 | 36 | self.groups = torch.unique(self.group_ids).sort(0)[0] 37 | 38 | def _prepare_batches(self): 39 | dataset_size = len(self.group_ids) 40 | # get the sampled indices from the sampler 41 | sampled_ids = torch.as_tensor(list(self.sampler)) 42 | # potentially not all elements of the dataset were sampled 43 | # by the sampler (e.g., DistributedSampler). 
44 | # construct a tensor which contains -1 if the element was 45 | # not sampled, and a non-negative number indicating the 46 | # order where the element was sampled. 47 | # for example. if sampled_ids = [3, 1] and dataset_size = 5, 48 | # the order is [-1, 1, -1, 0, -1] 49 | order = torch.full((dataset_size,), -1, dtype=torch.int64) 50 | order[sampled_ids] = torch.arange(len(sampled_ids)) 51 | 52 | # get a mask with the elements that were sampled 53 | mask = order >= 0 54 | 55 | # find the elements that belong to each individual cluster 56 | clusters = [(self.group_ids == i) & mask for i in self.groups] 57 | # get relative order of the elements inside each cluster 58 | # that follows the order from the sampler 59 | relative_order = [order[cluster] for cluster in clusters] 60 | # with the relative order, find the absolute order in the 61 | # sampled space 62 | permutation_ids = [s[s.sort()[1]] for s in relative_order] 63 | # permute each cluster so that they follow the order from 64 | # the sampler 65 | permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] 66 | 67 | # splits each cluster in batch_size, and merge as a list of tensors 68 | splits = [c.split(self.batch_size) for c in permuted_clusters] 69 | merged = tuple(itertools.chain.from_iterable(splits)) 70 | 71 | # now each batch internally has the right order, but 72 | # they are grouped by clusters. Find the permutation between 73 | # different batches that brings them as close as possible to 74 | # the order that we have in the sampler. For that, we will consider the 75 | # ordering as coming from the first element of each batch, and sort 76 | # correspondingly 77 | first_element_of_batch = [t[0].item() for t in merged] 78 | # get and inverse mapping from sampled indices and the position where 79 | # they occur (as returned by the sampler) 80 | inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} 81 | # from the first element in each batch, get a relative ordering 82 | first_index_of_batch = torch.as_tensor( 83 | [inv_sampled_ids_map[s] for s in first_element_of_batch] 84 | ) 85 | 86 | # permute the batches so that they approximately follow the order 87 | # from the sampler 88 | permutation_order = first_index_of_batch.sort(0)[1].tolist() 89 | # finally, permute the batches 90 | batches = [merged[i].tolist() for i in permutation_order] 91 | 92 | if self.drop_uneven: 93 | kept = [] 94 | for batch in batches: 95 | if len(batch) == self.batch_size: 96 | kept.append(batch) 97 | batches = kept 98 | return batches 99 | 100 | def __iter__(self): 101 | batches = self._prepare_batches() 102 | self._batches = batches 103 | return iter(batches) 104 | 105 | def __len__(self): 106 | if not hasattr(self, "_batches"): 107 | self._batches = self._prepare_batches() 108 | return len(self._batches) 109 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 
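        # num_iterations is the total number of batches to yield over the whole run;
        # __iter__ below keeps cycling the wrapped batch_sampler, starting from start_iter,
        # until that count is reached.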
13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /alphaction/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/engine/__init__.py -------------------------------------------------------------------------------- /alphaction/engine/feature_extraction.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from tqdm import tqdm 4 | import time 5 | import datetime 6 | from alphaction.utils.comm import get_rank, synchronize, get_world_size 7 | 8 | 9 | def do_feature_extraction(model_ddp, data_loader, distributed): 10 | 11 | device = torch.device("cuda") 12 | num_devices = get_world_size() 13 | dataset = data_loader.dataset 14 | 15 | if dataset.finished_feat_extraction(): 16 | return 17 | logger = logging.getLogger("alphaction.feature_extraction.{}".format(dataset._split)) 18 | 19 | logger.info("Start feature extraction on {} dataset({} videos).".format(dataset.__class__.__name__, len(dataset))) 20 | start_time = time.time() 21 | model = model_ddp.module if distributed else model_ddp 22 | model.eval() 23 | 24 | extra_args = {} if get_world_size() == 1 else dict(desc="feature extracting", disable=(not get_rank()==0)) 25 | 26 | with torch.no_grad(): 27 | for i, batch in tqdm(enumerate(data_loader), **extra_args): 28 | video, _, whwh, boxes, _, metadata, idx = batch 29 | video = video.to(device) 30 | 31 | # extract patch token features and CLS token feature 32 | features, cls_feat = model.backbone([video]) 33 | # extract text features 34 | text_features = model.backbone.forward_text(device=device) 35 | 36 | # save torch tensors 37 | dataset.save_features(idx, features[0].cpu(), cls_feat.cpu(), text_features.cpu()) 38 | 39 | if dataset.finished_feat_extraction(): 40 | logger.info("Finished feature extraction. 
") 41 | break # check if all samples are processed 42 | 43 | synchronize() 44 | total_time = time.time() - start_time 45 | total_time_str = str(datetime.timedelta(seconds=total_time)) 46 | logger.info("Feature extraction time: {} ({} s / video per device, on {} devices)".format( 47 | total_time_str, total_time * num_devices / len(dataset), num_devices)) 48 | -------------------------------------------------------------------------------- /alphaction/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .batch_norm import FrozenBatchNorm1d, FrozenBatchNorm2d, FrozenBatchNorm3d 2 | 3 | __all__ = [ "FrozenBatchNorm1d", "FrozenBatchNorm2d", "FrozenBatchNorm3d"] 4 | 5 | -------------------------------------------------------------------------------- /alphaction/layers/batch_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class _FrozenBatchNorm(nn.Module): 6 | def __init__(self, num_features, eps=1e-5, affine=True, track_running_stats=True): 7 | super(_FrozenBatchNorm, self).__init__() 8 | self.num_features = num_features 9 | self.eps = eps 10 | self.affine = affine 11 | self.track_running_stats = track_running_stats 12 | if self.affine: 13 | self.register_buffer("weight", torch.Tensor(num_features)) 14 | self.register_buffer("bias", torch.Tensor(num_features)) 15 | else: 16 | self.register_buffer("weight", None) 17 | self.register_buffer("bias", None) 18 | if self.track_running_stats: 19 | self.register_buffer('running_mean', torch.zeros(num_features)) 20 | self.register_buffer('running_var', torch.ones(num_features)) 21 | else: 22 | self.register_parameter('running_mean', None) 23 | self.register_parameter('running_var', None) 24 | self.reset_parameters() 25 | 26 | def reset_running_stats(self): 27 | if self.track_running_stats: 28 | self.running_mean.zero_() 29 | self.running_var.fill_(1) 30 | 31 | def reset_parameters(self): 32 | self.reset_running_stats() 33 | if self.affine: 34 | self.weight.data.uniform_() 35 | self.bias.data.zero_() 36 | 37 | def _check_input_dim(self, input): 38 | raise NotImplementedError 39 | 40 | def forward(self, input): 41 | self._check_input_dim(input) 42 | view_shape = (1, self.num_features) + (1,) * (input.dim() - 2) 43 | 44 | if self.track_running_stats: 45 | scale = self.weight / (self.running_var + self.eps).sqrt() 46 | bias = self.bias - self.running_mean * scale 47 | else: 48 | scale = self.weight 49 | bias = self.bias 50 | 51 | return scale.view(*view_shape) * input + bias.view(*view_shape) 52 | 53 | def extra_repr(self): 54 | return '{num_features}, eps={eps}, affine={affine}, ' \ 55 | 'track_running_stats={track_running_stats}'.format(**self.__dict__) 56 | 57 | def _load_from_state_dict(self, state_dict, prefix, metadata, strict, 58 | missing_keys, unexpected_keys, error_msgs): 59 | num_batches_tracked_key = prefix + 'num_batches_tracked' 60 | if num_batches_tracked_key in state_dict: 61 | del state_dict[num_batches_tracked_key] 62 | super(_FrozenBatchNorm, self)._load_from_state_dict( 63 | state_dict, prefix, metadata, strict, 64 | missing_keys, unexpected_keys, error_msgs) 65 | 66 | 67 | class FrozenBatchNorm1d(_FrozenBatchNorm): 68 | def _check_input_dim(self, input): 69 | if input.dim() != 2 and input.dim() != 3: 70 | raise ValueError('expected 2D or 3D input (got {}D input)' 71 | .format(input.dim())) 72 | 73 | 74 | class FrozenBatchNorm2d(_FrozenBatchNorm): 75 | def _check_input_dim(self, input): 76 
| if input.dim() != 4: 77 | raise ValueError('expected 4D input (got {}D input)' 78 | .format(input.dim())) 79 | 80 | 81 | class FrozenBatchNorm3d(_FrozenBatchNorm): 82 | def _check_input_dim(self, input): 83 | if input.dim() != 5: 84 | raise ValueError('expected 5D input (got {}D input)' 85 | .format(input.dim())) 86 | -------------------------------------------------------------------------------- /alphaction/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone -------------------------------------------------------------------------------- /alphaction/modeling/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from alphaction.modeling import registry 2 | from . import slowfast, i3d, video_model_builder 3 | 4 | @registry.BACKBONES.register("Slowfast-Resnet50") 5 | @registry.BACKBONES.register("Slowfast-Resnet101") 6 | def build_slowfast_resnet_backbone(cfg): 7 | model = slowfast.SlowFast(cfg) 8 | return model 9 | 10 | @registry.BACKBONES.register("PySlowonly") 11 | def build_pyslowonly_resnet_backbone(cfg): 12 | model = video_model_builder.ResNet(cfg) 13 | return model 14 | 15 | @registry.BACKBONES.register("PySlowfast-R50") 16 | @registry.BACKBONES.register("PySlowfast-R101") 17 | def build_pyslowfast_resnet_backbone(cfg): 18 | model = video_model_builder.SlowFast(cfg) 19 | return model 20 | 21 | @registry.BACKBONES.register("MAE-ViT-B") 22 | @registry.BACKBONES.register("MAE-ViT-L") 23 | def build_mae_vit_backbone(cfg): 24 | model = video_model_builder.ViT(cfg) 25 | return model 26 | 27 | @registry.BACKBONES.register("I3D-Resnet50") 28 | @registry.BACKBONES.register("I3D-Resnet101") 29 | @registry.BACKBONES.register("I3D-Resnet50-Sparse") 30 | @registry.BACKBONES.register("I3D-Resnet101-Sparse") 31 | def build_i3d_resnet_backbone(cfg): 32 | model = i3d.I3D(cfg) 33 | return model 34 | 35 | # OpenAI CLIP 36 | @registry.BACKBONES.register("ViT-B/16") 37 | @registry.BACKBONES.register("ViT-B/32") 38 | @registry.BACKBONES.register("ViT-L/14") 39 | def build_clip_vit_backbone(cfg): 40 | from alphaction.modeling.encoders.openai_clip.clip_encoder import build_clip_backbone 41 | model = build_clip_backbone(cfg) 42 | return model 43 | 44 | # CLIP-ViP 45 | @registry.BACKBONES.register("ViP-B/16") 46 | @registry.BACKBONES.register("ViP-B/32") 47 | def build_clipvip_backbone(cfg): 48 | from alphaction.modeling.encoders.clipvip.clipvip_encoder import build_clipvip_backbone 49 | model = build_clipvip_backbone(cfg) 50 | return model 51 | 52 | # ViCLIP from InternVideo 53 | @registry.BACKBONES.register("ViCLIP-L/14") 54 | def build_viclip_backbone(cfg): 55 | from alphaction.modeling.encoders.viclip.viclip_encoder import build_viclip_backbone 56 | model = build_viclip_backbone(cfg) 57 | return model 58 | 59 | 60 | def build_backbone(cfg): 61 | assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ 62 | "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( 63 | cfg.MODEL.BACKBONE.CONV_BODY 64 | ) 65 | return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) 66 | 
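# Hypothetical usage sketch (an editorial example, not part of the original file): how a
# backbone is obtained through the registry above. The config path, the yacs-style
# merge_from_file call, and the chosen CONV_BODY value are illustrative assumptions;
# any key registered above is resolved the same way.
#
#   from alphaction.config import cfg
#   from alphaction.modeling.backbone import build_backbone
#
#   cfg.merge_from_file("config_files/jhmdb/openmixer_zsr_zsl.yaml")
#   cfg.MODEL.BACKBONE.CONV_BODY = "ViCLIP-L/14"   # must match a registered key
#   backbone = build_backbone(cfg)                 # dispatched via registry.BACKBONES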
-------------------------------------------------------------------------------- /alphaction/modeling/backbone/i3d.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | 4 | import torch.nn as nn 5 | from alphaction.layers import FrozenBatchNorm3d 6 | from alphaction.modeling.common_blocks import ResNLBlock 7 | 8 | 9 | def get_model_cfg(cfg): 10 | backbone_strs = cfg.MODEL.BACKBONE.CONV_BODY.split('-')[1:] 11 | error_msg = 'Model backbone {} is not supported.'.format(cfg.MODEL.BACKBONE.CONV_BODY) 12 | 13 | use_temp_convs_1 = [2] 14 | temp_strides_1 = [2] 15 | max_pool_stride_1 = 2 16 | 17 | use_temp_convs_2 = [1, 1, 1] 18 | temp_strides_2 = [1, 1, 1] 19 | max_pool_stride_2 = 2 20 | 21 | use_temp_convs_3 = [1, 0, 1, 0] 22 | temp_strides_3 = [1, 1, 1, 1] 23 | 24 | use_temp_convs_5 = [0, 1, 0] 25 | temp_strides_5 = [1, 1, 1] 26 | 27 | avg_pool_stride = int(cfg.INPUT.FRAME_NUM / 8) 28 | if backbone_strs[0] == 'Resnet50': 29 | block_config = (3, 4, 6, 3) 30 | 31 | use_temp_convs_4 = [1, 0, 1, 0, 1, 0] 32 | temp_strides_4 = [1, 1, 1, 1, 1, 1] 33 | elif backbone_strs[0] == 'Resnet101': 34 | block_config = (3, 4, 23, 3) 35 | 36 | use_temp_convs_4 = [] 37 | for i in range(23): 38 | if i % 2 == 0: 39 | use_temp_convs_4.append(1) 40 | else: 41 | use_temp_convs_4.append(0) 42 | temp_strides_4 = [1, ] * 23 43 | else: 44 | raise KeyError(error_msg) 45 | 46 | if len(backbone_strs) > 1: 47 | if len(backbone_strs) == 2 and backbone_strs[1] == 'Sparse': 48 | temp_strides_1 = [1] 49 | max_pool_stride_1 = 1 50 | avg_pool_stride = int(cfg.INPUT.FRAME_NUM / 2) 51 | else: 52 | raise KeyError(error_msg) 53 | 54 | use_temp_convs_set = [use_temp_convs_1, use_temp_convs_2, use_temp_convs_3, use_temp_convs_4, use_temp_convs_5] 55 | temp_strides_set = [temp_strides_1, temp_strides_2, temp_strides_3, temp_strides_4, temp_strides_5] 56 | pool_strides_set = [max_pool_stride_1, max_pool_stride_2, avg_pool_stride] 57 | return block_config, use_temp_convs_set, temp_strides_set, pool_strides_set 58 | 59 | 60 | class I3D(nn.Module): 61 | def __init__(self, cfg): 62 | super(I3D, self).__init__() 63 | 64 | self.cfg = cfg.clone() 65 | 66 | block_config, use_temp_convs_set, temp_strides_set, pool_strides_set = get_model_cfg(cfg) 67 | conv3_nonlocal = cfg.MODEL.BACKBONE.I3D.CONV3_NONLOCAL 68 | conv4_nonlocal = cfg.MODEL.BACKBONE.I3D.CONV4_NONLOCAL 69 | 70 | dim_inner = 64 71 | conv_dims = [64, 256, 512, 1024, 2048] 72 | self.dim_out = conv_dims[-1] 73 | n1, n2, n3, n4 = block_config 74 | layer_mod = 2 75 | conv3_nl_mod = layer_mod 76 | conv4_nl_mod = layer_mod 77 | if not conv3_nonlocal: 78 | conv3_nl_mod = 1000 79 | if not conv4_nonlocal: 80 | conv4_nl_mod = 1000 81 | self.c2_mapping = None 82 | 83 | data_dim = 3 84 | self.conv1 = nn.Conv3d(data_dim, conv_dims[0], (1 + use_temp_convs_set[0][0] * 2, 7, 7), 85 | stride=(temp_strides_set[0][0], 2, 2), 86 | padding=(use_temp_convs_set[0][0], 3, 3), bias=False) 87 | nn.init.kaiming_normal_(self.conv1.weight) 88 | 89 | if cfg.MODEL.BACKBONE.FROZEN_BN: 90 | self.bn1 = FrozenBatchNorm3d(conv_dims[0], eps=cfg.MODEL.BACKBONE.BN_EPSILON) 91 | nn.init.constant_(self.bn1.weight, 1.0) 92 | nn.init.constant_(self.bn1.bias, 0.0) 93 | else: 94 | self.bn1 = nn.BatchNorm3d(conv_dims[0], eps=cfg.MODEL.BACKBONE.BN_EPSILON, momentum=cfg.MODEL.BACKBONE.BN_MOMENTUM) 95 | 96 | self.relu = nn.ReLU(inplace=True) 97 | self.maxpool1 = nn.MaxPool3d((pool_strides_set[0], 3, 3), 
stride=(pool_strides_set[0], 2, 2)) 98 | 99 | self.res_nl1 = ResNLBlock(cfg, conv_dims[0], conv_dims[1], stride=1, num_blocks=n1, dim_inner=dim_inner, 100 | use_temp_convs=use_temp_convs_set[1], temp_strides=temp_strides_set[1]) 101 | self.maxpool2 = nn.MaxPool3d((pool_strides_set[1], 1, 1), stride=(pool_strides_set[1], 1, 1)) 102 | 103 | self.res_nl2 = ResNLBlock(cfg, conv_dims[1], conv_dims[2], stride=2, num_blocks=n2, 104 | dim_inner=dim_inner * 2, use_temp_convs=use_temp_convs_set[2], 105 | temp_strides=temp_strides_set[2], nonlocal_mod=conv3_nl_mod, 106 | group_nonlocal=cfg.MODEL.BACKBONE.I3D.CONV3_GROUP_NL) 107 | 108 | self.res_nl3 = ResNLBlock(cfg, conv_dims[2], conv_dims[3], stride=2, num_blocks=n3, 109 | dim_inner=dim_inner * 4, use_temp_convs=use_temp_convs_set[3], 110 | temp_strides=temp_strides_set[3], nonlocal_mod=conv4_nl_mod) 111 | 112 | self.res_nl4 = ResNLBlock(cfg, conv_dims[3], conv_dims[4], stride=1, num_blocks=n4, 113 | dim_inner=dim_inner * 8, use_temp_convs=use_temp_convs_set[4], 114 | temp_strides=temp_strides_set[4], 115 | dilation=2) 116 | 117 | def forward(self, _, x): 118 | # We only use fast videos, which is the second input. 119 | out = self.conv1(x) 120 | out = self.bn1(out) 121 | out = self.relu(out) 122 | out = self.maxpool1(out) 123 | 124 | out = self.res_nl1(out) 125 | out = self.maxpool2(out) 126 | 127 | out = self.res_nl2(out) 128 | 129 | out = self.res_nl3(out) 130 | 131 | out = self.res_nl4(out) 132 | return None, out 133 | 134 | def c2_weight_mapping(self): 135 | if self.c2_mapping is None: 136 | weight_map = {'conv1.weight': 'conv1_w', 137 | 'bn1.weight': 'res_conv1_bn_s', 138 | 'bn1.bias': 'res_conv1_bn_b', 139 | 'bn1.running_mean': 'res_conv1_bn_rm', 140 | 'bn1.running_var': 'res_conv1_bn_riv'} 141 | for i in range(1, 5): 142 | name = 'res_nl{}'.format(i) 143 | child_map = getattr(self, name).c2_weight_mapping() 144 | for key, val in child_map.items(): 145 | new_key = name + '.' + key 146 | weight_map[new_key] = val.format(i + 1) 147 | self.c2_mapping = weight_map 148 | return self.c2_mapping 149 | -------------------------------------------------------------------------------- /alphaction/modeling/backbone/sfmodels/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Mlp(nn.Module): 8 | def __init__( 9 | self, 10 | in_features, 11 | hidden_features=None, 12 | out_features=None, 13 | act_layer=nn.GELU, 14 | drop_rate=0.0, 15 | ): 16 | super().__init__() 17 | self.drop_rate = drop_rate 18 | out_features = out_features or in_features 19 | hidden_features = hidden_features or in_features 20 | self.fc1 = nn.Linear(in_features, hidden_features) 21 | self.act = act_layer() 22 | self.fc2 = nn.Linear(hidden_features, out_features) 23 | if self.drop_rate > 0.0: 24 | self.drop = nn.Dropout(drop_rate) 25 | 26 | def forward(self, x): 27 | x = self.fc1(x) 28 | x = self.act(x) 29 | if self.drop_rate > 0.0: 30 | x = self.drop(x) 31 | x = self.fc2(x) 32 | if self.drop_rate > 0.0: 33 | x = self.drop(x) 34 | return x 35 | 36 | 37 | class Permute(nn.Module): 38 | def __init__(self, dims): 39 | super().__init__() 40 | self.dims = dims 41 | 42 | def forward(self, x): 43 | return x.permute(*self.dims) 44 | 45 | 46 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 47 | """ 48 | Stochastic Depth per sample. 
49 | """ 50 | if drop_prob == 0.0 or not training: 51 | return x 52 | keep_prob = 1 - drop_prob 53 | shape = (x.shape[0],) + (1,) * ( 54 | x.ndim - 1 55 | ) # work with diff dim tensors, not just 2D ConvNets 56 | mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 57 | mask.floor_() # binarize 58 | output = x.div(keep_prob) * mask 59 | return output 60 | 61 | 62 | class DropPath(nn.Module): 63 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 64 | 65 | def __init__(self, drop_prob=None): 66 | super(DropPath, self).__init__() 67 | self.drop_prob = drop_prob 68 | 69 | def forward(self, x): 70 | return drop_path(x, self.drop_prob, self.training) 71 | 72 | 73 | class TwoStreamFusion(nn.Module): 74 | def __init__(self, mode, dim=None, kernel=3, padding=1): 75 | """ 76 | A general constructor for neural modules fusing two equal sized tensors 77 | in forward. Following options are supported: 78 | "add" / "max" / "min" / "avg" : respective operations on the two halves. 79 | "concat" : NOOP. 80 | "concat_linear_{dim_mult}_{drop_rate}" : MLP to fuse with hidden dim "dim_mult" 81 | (optional, def 1.) higher than input dim 82 | with optional dropout "drop_rate" (def: 0.) 83 | "ln+concat_linear_{dim_mult}_{drop_rate}" : perform MLP after layernorm on the input. 84 | """ 85 | super().__init__() 86 | self.mode = mode 87 | if mode == "add": 88 | self.fuse_fn = lambda x: torch.stack(torch.chunk(x, 2, dim=2)).sum( 89 | dim=0 90 | ) 91 | elif mode == "max": 92 | self.fuse_fn = ( 93 | lambda x: torch.stack(torch.chunk(x, 2, dim=2)) 94 | .max(dim=0) 95 | .values 96 | ) 97 | elif mode == "min": 98 | self.fuse_fn = ( 99 | lambda x: torch.stack(torch.chunk(x, 2, dim=2)) 100 | .min(dim=0) 101 | .values 102 | ) 103 | elif mode == "avg": 104 | self.fuse_fn = lambda x: torch.stack(torch.chunk(x, 2, dim=2)).mean( 105 | dim=0 106 | ) 107 | elif mode == "concat": 108 | # x itself is the channel concat version 109 | self.fuse_fn = lambda x: x 110 | elif "concat_linear" in mode: 111 | if len(mode.split("_")) == 2: 112 | dim_mult = 1.0 113 | drop_rate = 0.0 114 | elif len(mode.split("_")) == 3: 115 | dim_mult = float(mode.split("_")[-1]) 116 | drop_rate = 0.0 117 | 118 | elif len(mode.split("_")) == 4: 119 | dim_mult = float(mode.split("_")[-2]) 120 | drop_rate = float(mode.split("_")[-1]) 121 | else: 122 | raise NotImplementedError 123 | 124 | if mode.split("+")[0] == "ln": 125 | self.fuse_fn = nn.Sequential( 126 | nn.LayerNorm(dim), 127 | Mlp( 128 | in_features=dim, 129 | hidden_features=int(dim * dim_mult), 130 | act_layer=nn.GELU, 131 | out_features=dim, 132 | drop_rate=drop_rate, 133 | ), 134 | ) 135 | else: 136 | self.fuse_fn = Mlp( 137 | in_features=dim, 138 | hidden_features=int(dim * dim_mult), 139 | act_layer=nn.GELU, 140 | out_features=dim, 141 | drop_rate=drop_rate, 142 | ) 143 | 144 | else: 145 | raise NotImplementedError 146 | 147 | def forward(self, x): 148 | if "concat_linear" in self.mode: 149 | return self.fuse_fn(x) + x 150 | 151 | else: 152 | return self.fuse_fn(x) -------------------------------------------------------------------------------- /alphaction/modeling/backbone/sfmodels/nonlocal_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | 4 | """Non-local helper""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class Nonlocal(nn.Module): 11 | """ 12 | Builds Non-local Neural Networks as a generic family of building 13 | blocks for capturing long-range dependencies. Non-local Network 14 | computes the response at a position as a weighted sum of the 15 | features at all positions. This building block can be plugged into 16 | many computer vision architectures. 17 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf 18 | """ 19 | 20 | def __init__( 21 | self, 22 | dim, 23 | dim_inner, 24 | pool_size=None, 25 | instantiation="softmax", 26 | zero_init_final_conv=False, 27 | zero_init_final_norm=True, 28 | norm_eps=1e-5, 29 | norm_momentum=0.1, 30 | norm_module=nn.BatchNorm3d, 31 | ): 32 | """ 33 | Args: 34 | dim (int): number of dimension for the input. 35 | dim_inner (int): number of dimension inside of the Non-local block. 36 | pool_size (list): the kernel size of spatial temporal pooling, 37 | temporal pool kernel size, spatial pool kernel size, spatial 38 | pool kernel size in order. By default pool_size is None, 39 | then there would be no pooling used. 40 | instantiation (string): supports two different instantiation method: 41 | "dot_product": normalizing correlation matrix with L2. 42 | "softmax": normalizing correlation matrix with Softmax. 43 | zero_init_final_conv (bool): If true, zero initializing the final 44 | convolution of the Non-local block. 45 | zero_init_final_norm (bool): 46 | If true, zero initializing the final batch norm of the Non-local 47 | block. 48 | norm_module (nn.Module): nn.Module for the normalization layer. The 49 | default is nn.BatchNorm3d. 50 | """ 51 | super(Nonlocal, self).__init__() 52 | self.dim = dim 53 | self.dim_inner = dim_inner 54 | self.pool_size = pool_size 55 | self.instantiation = instantiation 56 | self.use_pool = ( 57 | False 58 | if pool_size is None 59 | else any((size > 1 for size in pool_size)) 60 | ) 61 | self.norm_eps = norm_eps 62 | self.norm_momentum = norm_momentum 63 | self._construct_nonlocal( 64 | zero_init_final_conv, zero_init_final_norm, norm_module 65 | ) 66 | 67 | def _construct_nonlocal( 68 | self, zero_init_final_conv, zero_init_final_norm, norm_module 69 | ): 70 | # Three convolution heads: theta, phi, and g. 71 | self.conv_theta = nn.Conv3d( 72 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 73 | ) 74 | self.conv_phi = nn.Conv3d( 75 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 76 | ) 77 | self.conv_g = nn.Conv3d( 78 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 79 | ) 80 | 81 | # Final convolution output. 82 | self.conv_out = nn.Conv3d( 83 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 84 | ) 85 | # Zero initializing the final convolution output. 86 | self.conv_out.zero_init = zero_init_final_conv 87 | 88 | # TODO: change the name to `norm` 89 | self.bn = norm_module( 90 | num_features=self.dim, 91 | eps=self.norm_eps, 92 | momentum=self.norm_momentum, 93 | ) 94 | # Zero initializing the final bn. 95 | self.bn.transform_final_bn = zero_init_final_norm 96 | 97 | # Optional to add the spatial-temporal pooling. 98 | if self.use_pool: 99 | self.pool = nn.MaxPool3d( 100 | kernel_size=self.pool_size, 101 | stride=self.pool_size, 102 | padding=[0, 0, 0], 103 | ) 104 | 105 | def forward(self, x): 106 | x_identity = x 107 | N, C, T, H, W = x.size() 108 | 109 | theta = self.conv_theta(x) 110 | 111 | # Perform temporal-spatial pooling to reduce the computation. 
112 | if self.use_pool: 113 | x = self.pool(x) 114 | 115 | phi = self.conv_phi(x) 116 | g = self.conv_g(x) 117 | 118 | theta = theta.view(N, self.dim_inner, -1) 119 | phi = phi.view(N, self.dim_inner, -1) 120 | g = g.view(N, self.dim_inner, -1) 121 | 122 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). 123 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) 124 | # For original Non-local paper, there are two main ways to normalize 125 | # the affinity tensor: 126 | # 1) Softmax normalization (norm on exp). 127 | # 2) dot_product normalization. 128 | if self.instantiation == "softmax": 129 | # Normalizing the affinity tensor theta_phi before softmax. 130 | theta_phi = theta_phi * (self.dim_inner**-0.5) 131 | theta_phi = nn.functional.softmax(theta_phi, dim=2) 132 | elif self.instantiation == "dot_product": 133 | spatial_temporal_dim = theta_phi.shape[2] 134 | theta_phi = theta_phi / spatial_temporal_dim 135 | else: 136 | raise NotImplementedError( 137 | "Unknown norm type {}".format(self.instantiation) 138 | ) 139 | 140 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). 141 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) 142 | 143 | # (N, C, TxHxW) => (N, C, T, H, W). 144 | theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W) 145 | 146 | p = self.conv_out(theta_phi_g) 147 | p = self.bn(p) 148 | return x_identity + p -------------------------------------------------------------------------------- /alphaction/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | from .stm_detector import build_detection_model 2 | from .naive_baseline import build_naive_baseline -------------------------------------------------------------------------------- /alphaction/modeling/detector/action_detector.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from ..backbone import build_backbone 4 | from ..roi_heads.roi_heads_3d import build_3d_roi_heads 5 | 6 | 7 | class ActionDetector(nn.Module): 8 | def __init__(self, cfg): 9 | super(ActionDetector, self).__init__() 10 | self.backbone = build_backbone(cfg) 11 | self.roi_heads = build_3d_roi_heads(cfg, self.backbone.dim_out) 12 | 13 | def forward(self, slow_video, fast_video, boxes, objects=None, extras={}, part_forward=-1): 14 | # part_forward is used to split this model into two parts. 15 | # if part_forward<0, just use it as a single model 16 | # if part_forward=0, use this model to extract pooled feature(person and object, no memory features). 17 | # if part_forward=1, use the ia structure to aggregate interactions and give final result. 18 | # implemented in roi_heads 19 | 20 | if part_forward==1: 21 | slow_features = fast_features = None 22 | else: 23 | slow_features, fast_features = self.backbone(slow_video, fast_video) 24 | 25 | result, detector_losses, loss_weight, detector_metrics = self.roi_heads(slow_features, fast_features, boxes, objects, extras, part_forward) 26 | 27 | if self.training: 28 | return detector_losses, loss_weight, detector_metrics, result 29 | 30 | return result 31 | 32 | def c2_weight_mapping(self): 33 | if not hasattr(self, "c2_mapping"): 34 | weight_map = {} 35 | for name, m_child in self.named_children(): 36 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 37 | child_map = m_child.c2_weight_mapping() 38 | for key, val in child_map.items(): 39 | new_key = name + '.' 
+ key 40 | weight_map[new_key] = val 41 | self.c2_mapping = weight_map 42 | return self.c2_mapping 43 | 44 | def build_detection_model(cfg): 45 | return ActionDetector(cfg) -------------------------------------------------------------------------------- /alphaction/modeling/detector/naive_baseline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ..backbone import build_backbone 5 | from alphaction.modeling.stm_decoder.util.box_ops import clip_boxes_tensor 6 | from torchvision.ops import roi_align 7 | from einops import rearrange 8 | import numpy as np 9 | 10 | 11 | 12 | class NaiveBaseline(nn.Module): 13 | def __init__(self, cfg): 14 | super(NaiveBaseline, self).__init__() 15 | self.backbone = build_backbone(cfg) 16 | assert self.backbone.visual_encoder.use_cls_feat 17 | assert cfg.DATA.OPEN_VOCABULARY 18 | 19 | self.use_roi_feat = cfg.MODEL.USE_ROI_FEAT 20 | self.multi_label_action = cfg.MODEL.MULTI_LABEL_ACTION 21 | 22 | 23 | def roi_align_pool(self, patch_feats, batch_boxes, raw_sizes, out_size=(7, 7), spatial_scale=1.0/16): 24 | """ patch_feats: (B, D, T, h, w) 25 | boxes: list of boxes, not normalized 26 | raw_sizes: (B, 2) in (width, height) 27 | """ 28 | B, D, T, h, w = patch_feats.size() 29 | device = patch_feats.device 30 | feat_maps = patch_feats.mean(dim=2) # (B, D, h, w) temporally mean pooling 31 | boxes_list = [np.hstack([np.ones((boxes.shape[0], 1)) * i, boxes]) for i, boxes in enumerate(batch_boxes)] 32 | boxes_tensor = torch.from_numpy(np.vstack(boxes_list)).type(patch_feats.dtype).to(device) 33 | roi_feat = roi_align(feat_maps, boxes_tensor, out_size, spatial_scale) # (BN, D, 7, 7) 34 | roi_feat = rearrange(roi_feat, 'm d h w -> m (h w) d') 35 | 36 | # get meanpooled roi features 37 | roi_align_features = [] 38 | batch_indices = boxes_tensor[:, 0].long() 39 | for i in range(B): 40 | rois = roi_feat[batch_indices == i].mean(dim=1) # (n, d) 41 | roi_align_features.append(rois) 42 | 43 | return roi_align_features 44 | 45 | 46 | def forward(self, slow_video, fast_video, whwh, boxes=None, labels=None, extras={}, part_forward=-1): 47 | 48 | assert not self.training, "NaiveBaseline does not need training!" 49 | assert 'prior_boxes' in extras, "NaiveBaseline use loaded boxes for testing!" 
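        # What follows: normalize the externally provided person boxes to [0, 1], encode the
        # clip with the video-text backbone, then score each box (RoIAlign-pooled patch
        # features, or the global CLS feature repeated per box) against the text embeddings
        # of the open-vocabulary class prompts via temperature-scaled cosine similarity.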
50 | device = slow_video.device 51 | 52 | prior_boxes = extras['prior_boxes'] 53 | box_list = [] 54 | for i in range(len(prior_boxes)): 55 | box = torch.tensor(prior_boxes[i], dtype=torch.float32, device=device) 56 | cur_whwh = whwh[i] 57 | box = clip_boxes_tensor(box, cur_whwh[1], cur_whwh[0]) 58 | box[:, 0::2] /= cur_whwh[0] 59 | box[:, 1::2] /= cur_whwh[1] 60 | box_list.append(box) 61 | 62 | if self.backbone.num_pathways == 1: 63 | features = self.backbone([slow_video]) 64 | else: 65 | features = self.backbone([slow_video, fast_video]) 66 | 67 | patch_feats, cls_feat_visual = features # (B, 512) 68 | B = cls_feat_visual.size(0) 69 | 70 | if self.use_roi_feat: 71 | # feature projection & RoIAlign Pooling 72 | patch_feats = self.backbone.visual_encoder.project_patch_features(patch_feats[0]) 73 | roi_features = self.roi_align_pool(patch_feats, prior_boxes, whwh[:, :2]) 74 | 75 | # get the current text feature embeddings 76 | text_features = self.backbone.forward_text(device=slow_video.device) # (K, 512) 77 | tau_inv = self.backbone.tau_inv 78 | 79 | if isinstance(text_features, list): 80 | text_features = torch.stack(text_features).mean(1) 81 | text_features_normed = text_features / text_features.norm(dim=-1, keepdim=True) # (K, D) 82 | 83 | action_score_list = [] 84 | if self.use_roi_feat: 85 | # return self.forward_roi_cls(roi_features, text_features, tau_inv, whwh) 86 | for roi_feat in roi_features: 87 | # action recognition 88 | vis_features_normed = roi_feat / roi_feat.norm(dim=-1, keepdim=True) # (N, D) 89 | action_score = tau_inv * vis_features_normed @ text_features_normed.t() # (N, K) 90 | action_score_list.append(action_score) 91 | else: 92 | vis_features_normed = cls_feat_visual / cls_feat_visual.norm(dim=-1, keepdim=True) # (B, D) 93 | action_score = tau_inv * vis_features_normed @ text_features_normed.t() # (B, K) 94 | for i in range(B): 95 | # with full frame input, we only have one score vector, which need to be repeated. 96 | scores = action_score[[i]].repeat(box_list[i].size(0), 1) # (1, K) 97 | action_score_list.append(scores) 98 | 99 | return action_score_list, box_list 100 | 101 | 102 | def build_naive_baseline(cfg): 103 | return NaiveBaseline(cfg) -------------------------------------------------------------------------------- /alphaction/modeling/dict_model.py: -------------------------------------------------------------------------------- 1 | # Simple pytorch implementation of Dictionary Learning based on stochastic gradient descent 2 | # 3 | # June 2018 4 | # Jeremias Sulam 5 | 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | import torch.nn.functional as F 11 | import numpy as np 12 | 13 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 14 | 15 | 16 | #################################### 17 | ## Dict. 
Learning ## 18 | #################################### 19 | 20 | class DictLearn(nn.Module): 21 | def __init__(self, num_basis, dim_basis, SC='FISTA', sc_iters=None): 22 | super(DictLearn, self).__init__() 23 | 24 | self.W = nn.Parameter(torch.randn(dim_basis, num_basis, requires_grad=False)) 25 | 26 | # normalization 27 | self.W.data = NormDict(self.W.data) 28 | self.SC = SC 29 | self.sc_iters = sc_iters 30 | 31 | if self.sc_iters is None: 32 | self.sc_iters = 20 if SC=='FISTA' else 50 33 | 34 | 35 | 36 | def forward(self, Y, K): 37 | 38 | # normalizing Dict 39 | self.W.requires_grad_(False) 40 | self.W.data = NormDict(self.W.data) 41 | 42 | # Sparse Coding 43 | if self.SC == 'IHT': 44 | Gamma, residual, errIHT = IHT(Y,self.W,K, self.sc_iters) 45 | elif self.SC == 'FISTA': 46 | Gamma, residual, errIHT = FISTA(Y,self.W,K, self.sc_iters) 47 | else: print("Oops!") 48 | 49 | # Reconstructing 50 | self.W.requires_grad_(True) 51 | X = torch.mm(Gamma,self.W.transpose(1,0)) 52 | 53 | # sparsity 54 | # NNZ = np.count_nonzero(Gamma.cpu().data.numpy())/Gamma.shape[0] 55 | 56 | return X, Gamma, errIHT 57 | 58 | 59 | 60 | #-------------------------------------------------------------- 61 | # Auxiliary Functions 62 | #-------------------------------------------------------------- 63 | 64 | def hard_threshold_k(X, k): 65 | Gamma = X.clone() 66 | m = X.data.shape[1] 67 | a,_ = torch.abs(Gamma).data.sort(dim=1,descending=True) 68 | T = torch.mm(a[:,k].unsqueeze(1),torch.Tensor(np.ones((1,m))).to(device)) 69 | mask = Variable(torch.Tensor((np.abs(Gamma.data.cpu().numpy())>T.cpu().numpy()) + 0.)).to(device) 70 | Gamma = Gamma * mask 71 | return Gamma#, mask.data.nonzero() 72 | 73 | #-------------------------------------------------------------- 74 | 75 | 76 | def soft_threshold(X, lamda): 77 | #pdb.set_trace() 78 | Gamma = X.clone() 79 | Gamma = torch.sign(Gamma) * F.relu(torch.abs(Gamma)-lamda) 80 | return Gamma.to(device) 81 | 82 | 83 | #-------------------------------------------------------------- 84 | 85 | 86 | def IHT(Y,W,K, ITER=50): 87 | 88 | c = PowerMethod(W) 89 | eta = 1/c 90 | Gamma = hard_threshold_k(torch.mm(Y,eta*W),K) 91 | residual = torch.mm(Gamma, W.transpose(1,0)) - Y 92 | 93 | norms = np.zeros((ITER,)) 94 | 95 | for i in range(ITER): 96 | Gamma = hard_threshold_k(Gamma - eta * torch.mm(residual, W), K) 97 | residual = torch.mm(Gamma, W.transpose(1,0)) - Y 98 | norms[i] = np.linalg.norm(residual.cpu().numpy(),'fro')/ np.linalg.norm(Y.cpu().numpy(),'fro') 99 | 100 | return Gamma, residual, norms 101 | 102 | 103 | #-------------------------------------------------------------- 104 | 105 | 106 | def FISTA(Y,W,lamda, ITER=20): 107 | 108 | c = PowerMethod(W) 109 | eta = 1/c 110 | norms = np.zeros((ITER,)) 111 | 112 | Gamma = soft_threshold(torch.mm(Y,eta*W),lamda) 113 | Z = Gamma.clone() 114 | Gamma_1 = Gamma.clone() 115 | t = 1 116 | 117 | for i in range(ITER): 118 | Gamma_1 = Gamma.clone() 119 | residual = torch.mm(Z, W.transpose(1,0)) - Y 120 | Gamma = soft_threshold(Z - eta * torch.mm(residual, W), lamda/c) 121 | 122 | t_1 = t 123 | t = (1+np.sqrt(1 + 4*t**2))/2 124 | #pdb.set_trace() 125 | Z = Gamma + ((t_1 - 1)/t * (Gamma - Gamma_1)).to(device) 126 | 127 | norms[i] = np.linalg.norm(residual.cpu().numpy(),'fro')/ np.linalg.norm(Y.cpu().numpy(),'fro') 128 | 129 | return Gamma, residual, norms 130 | 131 | 132 | #-------------------------------------------------------------- 133 | 134 | def NormDict(W): 135 | Wn = torch.norm(W, p=2, dim=0).detach() 136 | W = W.div(Wn.expand_as(W)) 137 | 
return W 138 | 139 | #-------------------------------------------------------------- 140 | 141 | def PowerMethod(W): 142 | ITER = 100 143 | m = W.shape[1] 144 | X = torch.randn(1, m).to(device) 145 | for i in range(ITER): 146 | Dgamma = torch.mm(X,W.transpose(1,0)) 147 | X = torch.mm(Dgamma,W) 148 | nm = torch.norm(X,p=2) 149 | X = X/nm 150 | 151 | return nm 152 | 153 | #-------------------------------------------------------------- 154 | 155 | 156 | def showFilters(W,ncol,nrows): 157 | p = int(np.sqrt(W.shape[0]))+2 158 | Nimages = W.shape[1] 159 | Mosaic = np.zeros((p*ncol,p*nrows)) 160 | indx = 0 161 | for i in range(ncol): 162 | for j in range(nrows): 163 | im = W[:,indx].reshape(p-2,p-2) 164 | im = (im-np.min(im)) 165 | im = im/np.max(im) 166 | Mosaic[ i*p : (i+1)*p , j*p : (j+1)*p ] = np.pad(im,(1,1),mode='constant') 167 | indx += 1 168 | 169 | return Mosaic 170 | -------------------------------------------------------------------------------- /alphaction/modeling/encoders/clipvip/custom_layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | 5 | 6 | 7 | class LayerNorm(nn.LayerNorm): 8 | """Subclass torch's LayerNorm to handle fp16.""" 9 | 10 | def forward(self, x: torch.Tensor): 11 | orig_type = x.dtype 12 | ret = super().forward(x.type(torch.float32)) 13 | return ret.type(orig_type) 14 | 15 | 16 | class QuickGELU(nn.Module): 17 | def forward(self, x: torch.Tensor): 18 | return x * torch.sigmoid(1.702 * x) 19 | 20 | 21 | class CrossAttnBlock(nn.Module): 22 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop: float = 0., return_kv=False): 23 | super().__init__() 24 | 25 | self.attn = nn.MultiheadAttention(d_model, n_head, dropout=drop) 26 | self.ln_x = LayerNorm(d_model) 27 | self.ln_y = LayerNorm(d_model) 28 | self.mlp = nn.Sequential(OrderedDict([ 29 | ("c_fc", nn.Linear(d_model, d_model * 4)), 30 | ("fc_drop", nn.Dropout(drop)), 31 | ("gelu", QuickGELU()), 32 | ("c_proj", nn.Linear(d_model * 4, d_model)), 33 | ("proj_drop", nn.Dropout(drop)), 34 | ])) 35 | self.ln_2 = LayerNorm(d_model) 36 | self.attn_mask = attn_mask 37 | self.return_kv = return_kv 38 | 39 | def attention(self, x: torch.Tensor, y: torch.Tensor): 40 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 41 | return self.attn(x, y, y, need_weights=False, attn_mask=self.attn_mask)[0] 42 | 43 | def forward(self, x: torch.Tensor, y: torch.Tensor): 44 | """ x: query (T=1, B, d) 45 | y: key & value (T=64, B, d) 46 | """ 47 | if len(x.size()) == 2: 48 | x = x.unsqueeze(0) 49 | x = x + self.attention(self.ln_x(x), self.ln_y(y)) 50 | x = x + self.mlp(self.ln_2(x)) 51 | if x.size(0) == 1: 52 | x = x.squeeze(0) 53 | if self.return_kv: 54 | return x, y 55 | return x 56 | 57 | 58 | class CrossAttnModules(nn.Sequential): 59 | def forward(self, *input): 60 | for module in self._modules.values(): 61 | input = module(*input) 62 | return input[0] -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: mit 3 | --- 4 | -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_tokenizer 
import SimpleTokenizer as _Tokenizer 2 | from .viclip import ViCLIP 3 | import torch 4 | import numpy as np 5 | import cv2 6 | 7 | clip_candidates = {'viclip':None, 'clip':None} 8 | 9 | def get_clip(name='viclip', weight_file=None): 10 | global clip_candidates 11 | m = clip_candidates[name] 12 | if m is None: 13 | if name == 'viclip': 14 | tokenizer = _Tokenizer() 15 | vclip = ViCLIP(tokenizer, pretrain=weight_file) 16 | # m = vclip 17 | m = (vclip, tokenizer) 18 | else: 19 | raise Exception('the target clip model is not found.') 20 | 21 | return m 22 | 23 | def get_text_feat_dict(texts, clip, tokenizer, text_feat_d={}): 24 | for t in texts: 25 | feat = clip.get_text_features(t, tokenizer, text_feat_d) 26 | text_feat_d[t] = feat 27 | return text_feat_d 28 | 29 | def get_vid_feat(frames, clip): 30 | return clip.get_vid_features(frames) 31 | 32 | def _frame_from_video(video): 33 | while video.isOpened(): 34 | success, frame = video.read() 35 | if success: 36 | yield frame 37 | else: 38 | break 39 | 40 | v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3) 41 | v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3) 42 | def normalize(data): 43 | return (data/255.0-v_mean)/v_std 44 | 45 | def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')): 46 | assert(len(vid_list) >= fnum) 47 | step = len(vid_list) // fnum 48 | vid_list = vid_list[::step][:fnum] 49 | vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list] 50 | vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list] 51 | vid_tube = np.concatenate(vid_tube, axis=1) 52 | vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3)) 53 | vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float() 54 | return vid_tube 55 | 56 | def retrieve_text(frames, texts, name='viclip', weight_file=None, topk=5, device=torch.device('cuda')): 57 | clip, tokenizer = get_clip(name, weight_file) 58 | clip = clip.to(device) 59 | frames_tensor = frames2tensor(frames, device=device) 60 | vid_feat = get_vid_feat(frames_tensor, clip) 61 | 62 | text_feat_d = {} 63 | text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d) 64 | text_feats = [text_feat_d[t] for t in texts] 65 | text_feats_tensor = torch.cat(text_feats, 0) 66 | 67 | probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk) 68 | 69 | ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()] 70 | return ret_texts, probs.numpy()[0] 71 | 72 | -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/encoders/viclip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import cv2 4 | import argparse 5 | import torch 6 | 7 | import sys 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..', '..')) 9 | from alphaction.modeling.encoders.viclip import retrieve_text, _frame_from_video 10 | from alphaction.config import cfg 11 | from alphaction.dataset import make_data_loader 12 | from alphaction.dataset.datasets import utils as utils 13 | from alphaction.utils.random_seed import set_seed 
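# NOTE: the demo below randomly samples keyframe clips from the training split and runs
# zero-shot text-to-video retrieval with ViCLIP against the closed-world class captions;
# it expects the pretrained checkpoint 'pretrained/ViClip-InternVid-10M-FLT.pth'
# referenced in the retrieve_text(...) call in __main__.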
14 | 15 | 16 | 17 | def get_cfg(): 18 | parser = argparse.ArgumentParser(description="PyTorch Action Detection Training") 19 | parser.add_argument( 20 | "--config-file", 21 | default="", 22 | metavar="FILE", 23 | help="path to config file", 24 | type=str, 25 | ) 26 | parser.add_argument("--local_rank", type=int, default=0) 27 | parser.add_argument( 28 | "--skip-final-test", 29 | dest="skip_test", 30 | help="Do not test the final model", 31 | action="store_true", 32 | ) 33 | parser.add_argument( 34 | "--skip-val-in-train", 35 | dest="skip_val", 36 | help="Do not validate during training", 37 | action="store_true", 38 | ) 39 | parser.add_argument( 40 | "--transfer", 41 | dest="transfer_weight", 42 | help="Transfer weight from a pretrained model", 43 | action="store_true" 44 | ) 45 | parser.add_argument( 46 | "--adjust-lr", 47 | dest="adjust_lr", 48 | help="Adjust learning rate scheduler from old checkpoint", 49 | action="store_true" 50 | ) 51 | parser.add_argument( 52 | "--no-head", 53 | dest="no_head", 54 | help="Not load the head layer parameters from weight file", 55 | action="store_true" 56 | ) 57 | parser.add_argument( 58 | "--use-tfboard", 59 | action='store_true', 60 | dest='tfboard', 61 | help='Use tensorboard to log stats' 62 | ) 63 | parser.add_argument( 64 | "--seed", 65 | type=int, 66 | default=2, 67 | help="Manual seed at the begining." 68 | ) 69 | parser.add_argument( 70 | "opts", 71 | help="Modify config options using the command-line", 72 | default=None, 73 | nargs=argparse.REMAINDER, 74 | ) 75 | 76 | args = parser.parse_args() 77 | 78 | num_gpus = 1 79 | args.distributed = False 80 | 81 | torch.backends.cudnn.deterministic = True 82 | torch.backends.cudnn.benchmark = False 83 | 84 | # Merge config. 85 | cfg.merge_from_file(args.config_file) 86 | cfg.merge_from_list(args.opts) 87 | cfg.freeze() 88 | 89 | set_seed(args.seed, 0, num_gpus) 90 | 91 | return cfg 92 | 93 | 94 | def get_one_sample(dataset): 95 | idx = int(np.random.choice(list(range(len(dataset))), 1)) 96 | video_idx, sec_idx, sec, center_idx = dataset._keyframe_indices[idx] 97 | # Get the frame idxs for current clip. 98 | seq = utils.get_sequence( 99 | center_idx, 100 | dataset._seq_len // 2, 101 | dataset._sample_rate, 102 | num_frames=len(dataset._image_paths[video_idx]), 103 | ) 104 | 105 | # Load images of current clip. 
106 | image_paths = [dataset._image_paths[video_idx][frame] for frame in seq] 107 | imgs = utils.retry_load_images( 108 | image_paths, backend='cv2' 109 | ) 110 | 111 | clip_label_list = dataset._keyframe_boxes_and_labels[video_idx][sec_idx] 112 | assert len(clip_label_list) > 0 113 | labels = [] 114 | for box_labels in clip_label_list: 115 | for label in box_labels[1]: 116 | if label == -1: 117 | continue 118 | label = dataset.id_to_indices['closed'][label] 119 | labels.append(label) 120 | 121 | return imgs, labels 122 | 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | cfg = get_cfg() 128 | 129 | data_loader, vocabulary_train, iter_per_epoch = make_data_loader( 130 | cfg, 131 | is_train=True, 132 | is_distributed=False, 133 | start_iter=0, 134 | ) 135 | 136 | for n in range(10): 137 | print("Trial {}...".format(n + 1)) 138 | frames, labels = get_one_sample(data_loader.dataset) 139 | class_texts = [elems['caption'] for clsname, elems in data_loader.dataset.text_input['closed'].items()] 140 | gt_texts = [class_texts[clsid] for clsid in labels] 141 | 142 | texts, probs = retrieve_text(frames, class_texts, name='viclip', topk=5, weight_file='pretrained/ViClip-InternVid-10M-FLT.pth') 143 | 144 | for t, p in zip(texts, probs): 145 | print(f'text: {t} ~ prob: {p:.4f}') 146 | 147 | print("Ground Truth class texts: ", gt_texts) -------------------------------------------------------------------------------- /alphaction/modeling/encoders/viclip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | # @lru_cache() 14 | # def default_bpe(): 15 | # return "bpe_simple_vocab_16e6.txt.gz" 16 | 17 | 18 | @lru_cache() 19 | def bytes_to_unicode(): 20 | """ 21 | Returns list of utf-8 byte and a corresponding list of unicode strings. 22 | The reversible bpe codes work on unicode strings. 23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 25 | This is a signficant percentage of your normal, say, 32K bpe vocab. 26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 27 | And avoids mapping to whitespace/control characters the bpe code barfs on. 28 | """ 29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 30 | cs = bs[:] 31 | n = 0 32 | for b in range(2**8): 33 | if b not in bs: 34 | bs.append(b) 35 | cs.append(2**8+n) 36 | n += 1 37 | cs = [chr(n) for n in cs] 38 | return dict(zip(bs, cs)) 39 | 40 | 41 | def get_pairs(word): 42 | """Return set of symbol pairs in a word. 43 | Word is represented as tuple of symbols (symbols being variable-length strings). 
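Example: get_pairs(('h', 'e', 'l', 'l', 'o')) -> {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.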
44 | """ 45 | pairs = set() 46 | prev_char = word[0] 47 | for char in word[1:]: 48 | pairs.add((prev_char, char)) 49 | prev_char = char 50 | return pairs 51 | 52 | 53 | def basic_clean(text): 54 | text = ftfy.fix_text(text) 55 | text = html.unescape(html.unescape(text)) 56 | return text.strip() 57 | 58 | 59 | def whitespace_clean(text): 60 | text = re.sub(r'\s+', ' ', text) 61 | text = text.strip() 62 | return text 63 | 64 | 65 | class SimpleTokenizer(object): 66 | def __init__(self, bpe_path: str = default_bpe()): 67 | self.byte_encoder = bytes_to_unicode() 68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 70 | merges = merges[1:49152-256-2+1] 71 | merges = [tuple(merge.split()) for merge in merges] 72 | vocab = list(bytes_to_unicode().values()) 73 | vocab = vocab + [v+'' for v in vocab] 74 | for merge in merges: 75 | vocab.append(''.join(merge)) 76 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 77 | self.encoder = dict(zip(vocab, range(len(vocab)))) 78 | self.decoder = {v: k for k, v in self.encoder.items()} 79 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + ( token[-1] + '',) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token+'' 91 | 92 | while True: 93 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except: 105 | new_word.extend(word[i:]) 106 | break 107 | 108 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 109 | new_word.append(first+second) 110 | i += 2 111 | else: 112 | new_word.append(word[i]) 113 | i += 1 114 | new_word = tuple(new_word) 115 | word = new_word 116 | if len(word) == 1: 117 | break 118 | else: 119 | pairs = get_pairs(word) 120 | word = ' '.join(word) 121 | self.cache[token] = word 122 | return word 123 | 124 | def encode(self, text): 125 | bpe_tokens = [] 126 | text = whitespace_clean(basic_clean(text)).lower() 127 | for token in re.findall(self.pat, text): 128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 130 | return bpe_tokens 131 | 132 | def decode(self, tokens): 133 | text = ''.join([self.decoder[token] for token in tokens]) 134 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 135 | return text 136 | -------------------------------------------------------------------------------- /alphaction/modeling/nonlocal_block.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alphaction.layers import FrozenBatchNorm3d 7 | 8 | 9 | class NLBlock(nn.Module): 10 | def __init__(self, dim_in, dim_out, dim_inner, nl_cfg, group=False): 11 | super(NLBlock, self).__init__() 12 | 13 | 
self.nl_cfg = nl_cfg.clone() 14 | self.group = group 15 | self.group_size = 4 16 | 17 | init_std = nl_cfg.CONV_INIT_STD 18 | bias = not nl_cfg.NO_BIAS 19 | pool_stride = 2 20 | 21 | self.scale_value = dim_inner ** (-0.5) 22 | self.dim_inner = dim_inner 23 | 24 | self.theta = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 25 | nn.init.normal_(self.theta.weight, std=init_std) 26 | if bias: 27 | nn.init.constant_(self.theta.bias, 0) 28 | 29 | if nl_cfg.USE_MAXPOOL: 30 | self.maxpool = nn.MaxPool3d((1, pool_stride, pool_stride), 31 | stride=(1, pool_stride, pool_stride)) 32 | 33 | self.phi = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 34 | nn.init.normal_(self.phi.weight, std=init_std) 35 | if bias: 36 | nn.init.constant_(self.phi.bias, 0) 37 | 38 | self.g = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 39 | nn.init.normal_(self.g.weight, std=init_std) 40 | if bias: 41 | nn.init.constant_(self.g.bias, 0) 42 | 43 | if nl_cfg.USE_SOFTMAX: 44 | self.softmax = nn.Softmax(dim=2) 45 | 46 | self.out = nn.Conv3d(dim_inner, dim_out, 1, bias=bias) 47 | if nl_cfg.USE_ZERO_INIT_CONV: 48 | nn.init.constant_(self.out.weight, 0) 49 | else: 50 | nn.init.normal_(self.out.weight, std=init_std) 51 | if bias: 52 | nn.init.constant_(self.out.bias, 0) 53 | 54 | if nl_cfg.USE_BN: 55 | if nl_cfg.FROZEN_BN: 56 | self.bn = FrozenBatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON) 57 | else: 58 | self.bn = nn.BatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON, momentum=nl_cfg.BN_MOMENTUM) 59 | nn.init.constant_(self.bn.weight, nl_cfg.BN_INIT_GAMMA) 60 | 61 | def forward(self, x): 62 | if x.dim() != 5: 63 | raise ValueError('expected 4D or 5D input (got {}D input)' 64 | .format(x.dim())) 65 | 66 | if self.group: 67 | x = x.transpose(1, 2) 68 | sz_before_group = list(x.shape) 69 | sz_after_group = sz_before_group.copy() 70 | sz_after_group[0] = -1 71 | sz_after_group[1] = self.group_size 72 | x = x.contiguous().view(*sz_after_group) 73 | x = x.transpose(1, 2) 74 | 75 | batch_size = x.shape[0] 76 | 77 | theta = self.theta(x) 78 | 79 | if self.nl_cfg.USE_MAXPOOL: 80 | max_pool = self.maxpool(x) 81 | else: 82 | max_pool = x 83 | 84 | phi = self.phi(max_pool) 85 | 86 | g = self.g(max_pool) 87 | 88 | org_size = theta.size() 89 | mat_size = [batch_size, self.dim_inner, -1] 90 | theta = theta.view(*mat_size) 91 | phi = phi.view(*mat_size) 92 | g = g.view(*mat_size) 93 | 94 | theta_phi = torch.bmm(theta.transpose(1, 2), phi) 95 | 96 | if self.nl_cfg.USE_SOFTMAX: 97 | if self.nl_cfg.USE_SCALE: 98 | theta_phi_sc = theta_phi * self.scale_value 99 | else: 100 | theta_phi_sc = theta_phi 101 | p = self.softmax(theta_phi_sc) 102 | else: 103 | p = theta_phi / theta_phi.shape[-1] 104 | 105 | t = torch.bmm(g, p.transpose(1, 2)) 106 | 107 | t = t.view(org_size) 108 | 109 | out = self.out(t) 110 | 111 | if self.nl_cfg.USE_BN: 112 | out = self.bn(out) 113 | out = out + x 114 | 115 | if self.group: 116 | out = out.transpose(1, 2) 117 | out = out.contiguous().view(*sz_before_group) 118 | out = out.transpose(1, 2) 119 | 120 | return out 121 | 122 | def c2_weight_mapping(self): 123 | weight_map = {} 124 | for name, m_child in self.named_children(): 125 | if m_child.state_dict(): 126 | if isinstance(m_child, (nn.BatchNorm3d, FrozenBatchNorm3d)): 127 | weight_map[name + '.weight'] = '{}_s'.format(name) 128 | weight_map[name + '.running_mean'] = '{}_rm'.format(name) 129 | weight_map[name + '.running_var'] = '{}_riv'.format(name) 130 | elif isinstance(m_child, nn.GroupNorm): 131 | weight_map[name + '.weight'] = '{}_s'.format(name) 132 | else: 133 | weight_map[name + 
'.weight'] = '{}_w'.format(name) 134 | weight_map[name + '.bias'] = '{}_b'.format(name) 135 | return weight_map 136 | -------------------------------------------------------------------------------- /alphaction/modeling/registry.py: -------------------------------------------------------------------------------- 1 | from alphaction.utils.registry import Registry 2 | 3 | BACKBONES = Registry() 4 | ROI_ACTION_FEATURE_EXTRACTORS = Registry() 5 | ROI_ACTION_PREDICTORS = Registry() 6 | INTERACTION_AGGREGATION_STRUCTURES = Registry() -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/roi_heads/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/modeling/roi_heads/action_head/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/action_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_action_feature_extractor import make_roi_action_feature_extractor 4 | from .roi_action_predictors import make_roi_action_predictor 5 | from .inference import make_roi_action_post_processor 6 | from .loss import make_roi_action_loss_evaluator 7 | from .metric import make_roi_action_accuracy_evaluator 8 | from alphaction.modeling.utils import prepare_pooled_feature 9 | from alphaction.utils.comm import all_reduce 10 | 11 | 12 | class ROIActionHead(torch.nn.Module): 13 | """ 14 | Generic Action Head class. 15 | """ 16 | 17 | def __init__(self, cfg, dim_in): 18 | super(ROIActionHead, self).__init__() 19 | self.feature_extractor = make_roi_action_feature_extractor(cfg, dim_in) 20 | self.predictor = make_roi_action_predictor(cfg, self.feature_extractor.dim_out) 21 | self.post_processor = make_roi_action_post_processor(cfg) 22 | self.loss_evaluator = make_roi_action_loss_evaluator(cfg) 23 | self.accuracy_evaluator = make_roi_action_accuracy_evaluator(cfg) 24 | self.test_ext = cfg.TEST.EXTEND_SCALE 25 | 26 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 27 | # In training stage, boxes are from gt. 28 | # In testing stage, boxes are detected by human detector and proposals should be 29 | # enlarged boxes. 
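# part_forward selects the two-pass memory-feature pipeline:
#   0  -> run the feature extractor only and return pooled person/object features,
#   1  -> skip extraction and predict from cached pooled features passed in via `extras`,
#   <0 -> ordinary single-pass forward (the only mode used during training, see the assert below).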
30 | assert not (self.training and part_forward >= 0) 31 | 32 | if part_forward == 1: 33 | boxes = extras["current_feat_p"] 34 | objects = extras["current_feat_o"] 35 | 36 | if self.training: 37 | proposals = self.loss_evaluator.sample_box(boxes) 38 | else: 39 | proposals = [box.extend(self.test_ext) for box in boxes] 40 | 41 | x, x_pooled, x_objects = self.feature_extractor(slow_features, fast_features, proposals, objects, extras, part_forward) 42 | 43 | if part_forward == 0: 44 | pooled_feature = prepare_pooled_feature(x_pooled, boxes) 45 | if x_objects is None: 46 | object_pooled_feature = None 47 | else: 48 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 49 | return [pooled_feature, object_pooled_feature], {}, {}, {} 50 | 51 | action_logits = self.predictor(x) 52 | 53 | if not self.training: 54 | result = self.post_processor((action_logits,), boxes) 55 | return result, {}, {}, {} 56 | 57 | box_num = action_logits.size(0) 58 | box_num = torch.as_tensor([box_num], dtype=torch.float32, device=action_logits.device) 59 | all_reduce(box_num, average=True) 60 | 61 | loss_dict, loss_weight = self.loss_evaluator( 62 | [action_logits], box_num.item(), 63 | ) 64 | 65 | metric_dict = self.accuracy_evaluator( 66 | [action_logits], proposals, box_num.item(), 67 | ) 68 | 69 | pooled_feature = prepare_pooled_feature(x_pooled, proposals) 70 | if x_objects is None: 71 | object_pooled_feature = [] 72 | else: 73 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 74 | 75 | return ( 76 | [pooled_feature, object_pooled_feature], 77 | loss_dict, 78 | loss_weight, 79 | metric_dict, 80 | ) 81 | 82 | def c2_weight_mapping(self): 83 | weight_map = {} 84 | for name, m_child in self.named_children(): 85 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 86 | child_map = m_child.c2_weight_mapping() 87 | for key, val in child_map.items(): 88 | new_key = name + '.' + key 89 | weight_map[new_key] = val 90 | return weight_map 91 | 92 | 93 | def build_roi_action_head(cfg, dim_in): 94 | return ROIActionHead(cfg, dim_in) 95 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from alphaction.structures.bounding_box import BoxList 6 | 7 | 8 | class PostProcessor(nn.Module): 9 | def __init__(self, pose_action_num): 10 | super(PostProcessor, self).__init__() 11 | self.pose_action_num = pose_action_num 12 | 13 | def forward(self, x, boxes): 14 | # boxes should be (#detections,4) 15 | # prob should be calculated in different way. 
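# The first `pose_action_num` logits are mutually exclusive pose/movement classes
# (softmax); the remaining interaction logits are multi-label (per-class sigmoid).
# Both probability blocks are concatenated back into one score tensor per box.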
16 | class_logits, = x 17 | pose_action_prob = F.softmax(class_logits[:,:self.pose_action_num],-1) 18 | interaction_action_prob = torch.sigmoid(class_logits[:,self.pose_action_num:]) 19 | 20 | action_prob = torch.cat((pose_action_prob,interaction_action_prob),1) 21 | 22 | image_shapes = [box.size for box in boxes] 23 | boxes_per_image = [len(box) for box in boxes] 24 | box_tensors = [a.bbox for a in boxes] 25 | 26 | action_prob = action_prob.split(boxes_per_image, dim=0) 27 | 28 | results = [] 29 | for prob, boxes_per_image, image_shape in zip( 30 | action_prob, box_tensors, image_shapes 31 | ): 32 | boxlist = self.prepare_boxlist(boxes_per_image, prob, image_shape) 33 | results.append(boxlist) 34 | return results 35 | 36 | def prepare_boxlist(self, boxes, scores, image_shape): 37 | boxlist = BoxList(boxes, image_shape, mode="xyxy") 38 | boxlist.add_field("scores", scores) 39 | return boxlist 40 | 41 | 42 | def make_roi_action_post_processor(cfg): 43 | softmax_num = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 44 | postprocessor = PostProcessor(softmax_num) 45 | return postprocessor 46 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.layers import SigmoidFocalLoss, SoftmaxFocalLoss 3 | from alphaction.modeling.utils import cat 4 | 5 | 6 | class ActionLossComputation(object): 7 | def __init__(self, cfg): 8 | self.proposal_per_clip = cfg.MODEL.ROI_ACTION_HEAD.PROPOSAL_PER_CLIP 9 | self.num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 10 | self.num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 11 | self.num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 12 | 13 | self.weight_dict = dict( 14 | loss_pose_action = cfg.MODEL.ROI_ACTION_HEAD.POSE_LOSS_WEIGHT, 15 | loss_object_interaction = cfg.MODEL.ROI_ACTION_HEAD.OBJECT_LOSS_WEIGHT, 16 | loss_person_interaction = cfg.MODEL.ROI_ACTION_HEAD.PERSON_LOSS_WEIGHT, 17 | ) 18 | 19 | gamma = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.GAMMA 20 | alpha = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.ALPHA 21 | self.sigmoid_focal_loss = SigmoidFocalLoss(gamma, alpha, reduction="none") 22 | self.softmax_focal_loss = SoftmaxFocalLoss(gamma, alpha, reduction="sum") 23 | 24 | def sample_box(self, boxes): 25 | proposals = [] 26 | num_proposals = self.proposal_per_clip 27 | for boxes_per_image in boxes: 28 | num_boxes = len(boxes_per_image) 29 | 30 | if num_boxes > num_proposals: 31 | choice_inds = torch.randperm(num_boxes)[:num_proposals] 32 | proposals_per_image = boxes_per_image[choice_inds] 33 | else: 34 | proposals_per_image = boxes_per_image 35 | proposals_per_image = proposals_per_image.random_aug(0.2, 0.1, 0.1, 0.05) 36 | proposals.append(proposals_per_image) 37 | self._proposals = proposals 38 | return proposals 39 | 40 | def __call__(self, class_logits, avg_box_num): 41 | class_logits = cat(class_logits, dim=0) 42 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 43 | "The shape of tensor class logits doesn't match total number of action classes." 
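# The loss layout mirrors the logit layout: softmax focal loss over the single-label
# pose/movement block, sigmoid focal loss over the multi-label object/person
# interaction blocks, each normalized by the (all-reduced) average box count.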
44 | 45 | if not hasattr(self, "_proposals"): 46 | raise RuntimeError("sample_box needs to be called before") 47 | 48 | proposals = self._proposals 49 | 50 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 51 | assert class_logits.shape[1] == labels.shape[1], \ 52 | "The shape of tensor class logits doesn't match the label tensor." 53 | 54 | loss_dict = {} 55 | 56 | if self.num_pose > 0: 57 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 58 | pose_logits = class_logits[:, :self.num_pose] 59 | pose_loss = self.softmax_focal_loss(pose_logits, pose_label) / avg_box_num 60 | loss_dict["loss_pose_action"] = pose_loss 61 | 62 | interaction_label = labels[:, self.num_pose:].to(dtype=torch.float32) 63 | object_label = interaction_label[:, :self.num_object] 64 | person_label = interaction_label[:, self.num_object:] 65 | 66 | interaction_logits = class_logits[:, self.num_pose:] 67 | object_logits = interaction_logits[:, :self.num_object] 68 | person_logits = interaction_logits[:, self.num_object:] 69 | 70 | if self.num_object > 0: 71 | object_loss = self.sigmoid_focal_loss(object_logits, object_label).mean(dim=1).sum() / avg_box_num 72 | loss_dict["loss_object_interaction"] = object_loss 73 | if self.num_person > 0: 74 | person_loss = self.sigmoid_focal_loss(person_logits, person_label).mean(dim=1).sum() / avg_box_num 75 | loss_dict["loss_person_interaction"] = person_loss 76 | 77 | return loss_dict, self.weight_dict 78 | 79 | 80 | def make_roi_action_loss_evaluator(cfg): 81 | loss_evaluator = ActionLossComputation(cfg) 82 | 83 | return loss_evaluator -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.modeling.utils import cat 3 | 4 | 5 | class ActionAccuracyComputation(object): 6 | def __init__(self, num_pose, num_object, num_person): 7 | self.num_pose = num_pose 8 | self.num_object = num_object 9 | self.num_person = num_person 10 | 11 | def logic_iou(self, pred, label): 12 | device = pred.device 13 | 14 | version = torch.__version__ 15 | if eval('.'.join(version.split('.')[:2]))>=1.3: 16 | pred = pred.bool() 17 | label = label.bool() 18 | 19 | label_union = (pred | label).float().sum(dim=1) 20 | label_inter = (pred & label).float().sum(dim=1) 21 | replacer = torch.ones_like(label_union, device=device) 22 | zero_mask = label_union == 0 23 | label_inter = torch.where(zero_mask, replacer, label_inter) 24 | label_union = torch.where(zero_mask, replacer, label_union) 25 | return label_inter / label_union 26 | 27 | def __call__(self, class_logits, proposals, avg_box_num): 28 | class_logits = [logits.detach() for logits in class_logits] 29 | class_logits = cat(class_logits, dim=0) 30 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 31 | "The shape of tensor class logits doesn't match total number of action classes." 
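# Metrics follow the same split as the loss: top-1 accuracy for the pose/movement
# block, and a per-box "logic IoU" (intersection over union of the sigmoid > 0.5
# predictions against the ground-truth label set) for the interaction blocks.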
32 | 33 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 34 | 35 | metric_dict = {} 36 | if self.num_pose>0: 37 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 38 | pose_pred = class_logits[:, :self.num_pose].argmax(dim=1) 39 | accuracy_pose_action = pose_label.eq(pose_pred).float().sum() 40 | metric_dict["accuracy_pose_action"] = accuracy_pose_action / avg_box_num 41 | 42 | interaction_label = labels[:, self.num_pose:] 43 | interaction_logits = class_logits[:, self.num_pose:] 44 | interaction_pred = interaction_logits.sigmoid() > 0.5 45 | 46 | if self.num_object>0: 47 | object_label = interaction_label[:, :self.num_object] 48 | object_pred = interaction_pred[:, :self.num_object] 49 | accuracy_object_interaction = self.logic_iou(object_pred, object_label) 50 | metric_dict["accuracy_object_interaction"] = accuracy_object_interaction.sum() / avg_box_num 51 | 52 | if self.num_person>0: 53 | person_label = interaction_label[:, self.num_object:] 54 | person_pred = interaction_pred[:, self.num_object:] 55 | accuracy_person_interaction = self.logic_iou(person_pred, person_label) 56 | metric_dict["accuracy_person_interaction"] = accuracy_person_interaction.sum() / avg_box_num 57 | 58 | return metric_dict 59 | 60 | 61 | def make_roi_action_accuracy_evaluator(cfg): 62 | num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 63 | num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 64 | num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 65 | return ActionAccuracyComputation(num_pose, num_object, num_person) -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/roi_action_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from alphaction.modeling import registry 3 | 4 | 5 | @registry.ROI_ACTION_PREDICTORS.register("FCPredictor") 6 | class FCPredictor(nn.Module): 7 | def __init__(self, config, dim_in): 8 | super(FCPredictor, self).__init__() 9 | 10 | num_classes = config.MODEL.ROI_ACTION_HEAD.NUM_CLASSES 11 | 12 | dropout_rate = config.MODEL.ROI_ACTION_HEAD.DROPOUT_RATE 13 | if dropout_rate > 0: 14 | self.dropout = nn.Dropout(p=dropout_rate, inplace=True) 15 | 16 | self.cls_score = nn.Linear(dim_in, num_classes) 17 | 18 | nn.init.normal_(self.cls_score.weight, std=0.01) 19 | nn.init.constant_(self.cls_score.bias, 0) 20 | 21 | def forward(self, x): 22 | x = x.view(x.size(0), -1) 23 | if hasattr(self, "dropout"): 24 | x = self.dropout(x) 25 | scores = self.cls_score(x) 26 | 27 | return scores 28 | 29 | def c2_weight_mapping(self): 30 | return {"cls_score.weight": "pred_w", 31 | "cls_score.bias": "pred_b"} 32 | 33 | 34 | def make_roi_action_predictor(cfg, dim_in): 35 | func = registry.ROI_ACTION_PREDICTORS[cfg.MODEL.ROI_ACTION_HEAD.PREDICTOR] 36 | return func(cfg, dim_in) 37 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/roi_heads_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .action_head.action_head import build_roi_action_head 4 | 5 | 6 | class Combined3dROIHeads(torch.nn.ModuleDict): 7 | def __init__(self, cfg, heads): 8 | super(Combined3dROIHeads, self).__init__(heads) 9 | self.cfg = cfg.clone() 10 | 11 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 12 | result, loss_action, loss_weight, 
accuracy_action = self.action(slow_features, fast_features, boxes, objects, extras, part_forward) 13 | 14 | return result, loss_action, loss_weight, accuracy_action 15 | 16 | def c2_weight_mapping(self): 17 | weight_map = {} 18 | for name, m_child in self.named_children(): 19 | if m_child.state_dict() and hasattr(m_child,"c2_weight_mapping"): 20 | child_map = m_child.c2_weight_mapping() 21 | for key, val in child_map.items(): 22 | new_key = name + '.' + key 23 | weight_map[new_key] = val 24 | return weight_map 25 | 26 | 27 | def build_3d_roi_heads(cfg, dim_in): 28 | roi_heads = [] 29 | roi_heads.append(("action", build_roi_action_head(cfg, dim_in))) 30 | 31 | if roi_heads: 32 | roi_heads = Combined3dROIHeads(cfg, roi_heads) 33 | 34 | return roi_heads 35 | -------------------------------------------------------------------------------- /alphaction/modeling/stm_decoder/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /alphaction/modeling/stm_decoder/util/adaptive_mixing_operator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AdaptiveMixing(nn.Module): 7 | def __init__(self, in_dim, in_points, n_groups, query_dim=None, 8 | out_dim=None, out_points=None, sampling_rate=None): 9 | super(AdaptiveMixing, self).__init__() 10 | out_dim = out_dim if out_dim is not None else in_dim 11 | out_points = out_points if out_points is not None else in_points 12 | query_dim = query_dim if query_dim is not None else in_dim 13 | sampling_rate = sampling_rate if sampling_rate is not None else 1 14 | 15 | self.query_dim = query_dim 16 | self.in_dim = in_dim 17 | self.in_points = in_points//sampling_rate 18 | self.n_groups = n_groups 19 | self.out_dim = out_dim 20 | self.out_points = out_points 21 | 22 | self.eff_in_dim = in_dim//n_groups 23 | self.eff_out_dim = out_dim//n_groups 24 | 25 | self.pad_bias_dim = 0 26 | self.pad_bias_points = 0 27 | 28 | self.eff_in_dim = self.eff_in_dim + self.pad_bias_dim 29 | self.in_points = self.in_points + self.pad_bias_points 30 | 31 | self.REDUCTION = 1 32 | 33 | self.m_parameters = self.eff_in_dim * self.eff_out_dim 34 | self.s_parameters = self.in_points * self.out_points 35 | 36 | self.total_parameters = self.m_parameters + self.s_parameters 37 | 38 | self.parameter_generator = nn.Sequential( 39 | nn.Linear(self.query_dim, self.n_groups*self.total_parameters), 40 | ) 41 | 42 | self.out_proj = nn.Linear( 43 | self.eff_out_dim*self.out_points*self.n_groups, self.query_dim, bias=True 44 | ) 45 | 46 | self.act = nn.ReLU(inplace=True) 47 | 48 | self._init_weights() 49 | 50 | @torch.no_grad() 51 | def _init_weights(self): 52 | nn.init.zeros_(self.parameter_generator[-1].weight) 53 | 54 | def forward(self, x, query): 55 | 56 | B, N, g, P, C = x.size() 57 | G = self.n_groups 58 | assert g == G 59 | 60 | 61 | '''generate mixing parameters''' 62 | params = self.parameter_generator(query) 63 | params = params.reshape(B*N, G, -1) 64 | out = x.reshape(B*N, G, P, C) 65 | 66 | M, S = params.split( 67 | [self.m_parameters, self.s_parameters], 2) 68 | 69 | M = M.reshape(B*N, G, self.eff_in_dim, self.eff_in_dim) 70 | S = S.reshape(B*N, G, self.out_points, self.in_points) 71 | 72 | 73 | '''adaptive channel mixing 74 | the process also can be done with 
torch.bmm 75 | but for clarity, we use torch.matmul 76 | ''' 77 | out = torch.matmul(out, M) 78 | out = F.layer_norm(out, [out.size(-2), out.size(-1)]) 79 | out = self.act(out) 80 | 81 | '''adaptive spatial mixing''' 82 | out = torch.matmul(S, out) # implicitly transpose and matmul 83 | out = F.layer_norm(out, [out.size(-2), out.size(-1)]) 84 | out = self.act(out) 85 | 86 | '''linear transfomation to query dim''' 87 | out = out.reshape(B, N, -1) 88 | out = self.out_proj(out) 89 | 90 | out = query + out 91 | 92 | return out 93 | -------------------------------------------------------------------------------- /alphaction/modeling/stm_decoder/util/msaq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def translate_to_linear_weight(ref: torch.Tensor, num_total, tau=2.0): 6 | # ref: [n, n_query, 1, in_points * n_heads] 7 | # num_total: feature levels (typically 4) 8 | grid = torch.arange(num_total, device=ref.device, dtype=ref.dtype).view( 9 | *[len(ref.shape)*[1, ]+[-1, ]]) 10 | # [1, 1, 1, 1, num_total] 11 | 12 | ref = ref.unsqueeze(-1).clone() 13 | # [n, n_query, 1, in_points * n_heads, 1] 14 | l2 = (ref-grid).pow(2.0).div(tau).abs().neg() 15 | # [n, n_query, 1, in_points * n_heads, num_total] 16 | weight = torch.softmax(l2, dim=-1) 17 | 18 | return weight 19 | 20 | 21 | def MHAQ3D(sample_points: torch.Tensor, value: torch.Tensor, weight=None, n_points=1): 22 | ''' 23 | Args: 24 | sample_points: [n, n_query, 1, in_points * n_heads, 2] 25 | value: [n, c, t, h, w] 26 | weight: [n, n_query, 1, in_points * n_heads] 27 | n_points: in_points 28 | 29 | Returns: 30 | [B,c//n_heads,n_heads,t,in_points,n_query,1] 31 | ''' 32 | B, Hq, Wq, n_heads_points, _ = sample_points.shape 33 | # print(value.shape) 34 | B, Ck, Tk, Hk, Wk = value.shape 35 | 36 | n_heads = n_heads_points//n_points 37 | 38 | sample_points = sample_points.view(B, Hq, Wq, n_heads, n_points, 2) \ 39 | .permute(0, 3, 1, 2, 4, 5).contiguous().flatten(0, 1) 40 | # n*n_heads, n_query, 1, in_points, 2 41 | sample_points = sample_points.repeat(Tk, 1, 1, 1, 1) 42 | # n*n_heads*Tk, n_query, 1, in_points, 2 43 | sample_points = sample_points.flatten(2, 3) 44 | # n*n_heads*Tk, n_query, in_points, 2 45 | sample_points = sample_points*2.0-1.0 46 | value = value.view(B*n_heads, Ck//n_heads, Tk, Hk, Wk).permute(2, 0, 1, 3, 4).flatten(0, 1) 47 | out = F.grid_sample( 48 | value, sample_points, 49 | mode='bilinear', padding_mode='zeros', align_corners=False, 50 | ) 51 | # n*n_heads*Tk, c//n_heads, n_query, in_points 52 | 53 | if weight is not None: 54 | weight = weight.view(B, Hq, Wq, n_heads, n_points) \ 55 | .permute(0, 3, 1, 2, 4).flatten(0, 1).flatten(2, 3).unsqueeze(1).repeat(Tk, 1, 1, 1) 56 | # n*n_heads*Tk, 1, n_query, in_points 57 | out *= weight 58 | 59 | return out.view(Tk, B, n_heads, Ck//n_heads, Hq, Wq, n_points).permute(1, 3, 2, 0, 6, 4, 5) 60 | 61 | 62 | def SAMPLE4D(sample_points: torch.Tensor, values: torch.Tensor, featmap_strides, n_points: int = 1, num_levels: int = None, mapping_stride=3.0, tau=2.0, ): 63 | B, Hq, Wq, n_heads_points, _ = sample_points.shape 64 | B, C, t, _, _ = values[0].shape 65 | 66 | n_heads = n_heads_points//n_points 67 | 68 | if num_levels is None: 69 | num_levels = len(values) 70 | 71 | sample_points_xy = sample_points[..., 0:2] 72 | # print(sample_points_xy.shape) torch.Size([2, 100, 1, 128=32*4, 2]) 73 | # [n, n_query, 1, in_points * n_heads, 2] 74 | 75 | sample_points_lvl = sample_points[..., 2].clone() 76 | 
# print(sample_points_lvl.shape) torch.Size([2, 100, 1, 128=32*4]) 77 | # [n, n_query, 1, in_points * n_heads] 78 | 79 | sample_points_lvl_mapped = sample_points_lvl - mapping_stride 80 | # print(sample_points_lvl_mapped.shape) torch.Size([2, 100, 1, 128=32*4]) 81 | # [n, n_query, 1, in_points * n_heads] 82 | 83 | sample_points_lvl_weight = translate_to_linear_weight(sample_points_lvl_mapped, num_levels, tau=tau) 84 | # print(sample_points_lvl_weight.shape) torch.Size([2, 100, 1, 128=32*4, 4]) 85 | # [n, n_query, 1, in_points * n_heads, num_levels] 86 | 87 | sample_points_lvl_weight_list = sample_points_lvl_weight.unbind(-1) 88 | # [[n, n_query, 1, in_points * n_heads],....] 89 | 90 | out = sample_points.new_zeros(B, C//n_heads, n_heads, t, n_points, Hq, Wq) 91 | # print(out.shape) torch.Size([2, 64=256//4, 4, 4, 32, 100, 1]) 92 | # n, dim//n_heads, n_heads, t, in_points, n_query, 1 93 | 94 | for i in range(num_levels): 95 | value = values[i] 96 | # B, C, T, H, W 97 | lvl_weights = sample_points_lvl_weight_list[i] 98 | stride = featmap_strides[i] 99 | 100 | mapping_size = value.new_tensor([value.size(4), value.size(3)]).view(1, 1, 1, 1, -1) * stride 101 | normalized_xy = sample_points_xy / mapping_size 102 | # [n, n_query, 1, in_points * n_heads, 2] 103 | 104 | out += MHAQ3D(normalized_xy, value, weight=lvl_weights, n_points=n_points) 105 | 106 | return out, None 107 | -------------------------------------------------------------------------------- /alphaction/modeling/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utility functions 3 | """ 4 | 5 | import torch 6 | from alphaction.structures.bounding_box import BoxList 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | 18 | def pad_sequence(sequence, targ_size, padding_value=0): 19 | tensor_size = sequence[0].size() 20 | trailing_dims = tensor_size[1:] 21 | out_dims = (len(sequence), targ_size) + trailing_dims 22 | 23 | out_tensor = sequence[0].new_full(out_dims, padding_value) 24 | for i, tensor in enumerate(sequence): 25 | length = tensor.size(0) 26 | out_tensor[i, :length, ...] 
= tensor 27 | 28 | return out_tensor 29 | 30 | def prepare_pooled_feature(x_pooled, boxes, detach=True): 31 | image_shapes = [box.size for box in boxes] 32 | boxes_per_image = [len(box) for box in boxes] 33 | box_tensors = [a.bbox for a in boxes] 34 | 35 | if detach: 36 | x_pooled = x_pooled.detach() 37 | pooled_feature = x_pooled.split(boxes_per_image, dim=0) 38 | 39 | boxes_result = [] 40 | for feature_per_image, boxes_per_image, image_shape in zip( 41 | pooled_feature, box_tensors, image_shapes 42 | ): 43 | boxlist = BoxList(boxes_per_image, image_shape, mode="xyxy") 44 | boxlist.add_field("pooled_feature", feature_per_image) 45 | boxes_result.append(boxlist) 46 | return boxes_result -------------------------------------------------------------------------------- /alphaction/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_optimizer 2 | from .build import make_lr_scheduler 3 | from .lr_scheduler import WarmupMultiStepLR 4 | -------------------------------------------------------------------------------- /alphaction/solver/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/solver/lr_scheduler.py 2 | from bisect import bisect_right 3 | 4 | import torch 5 | import math 6 | 7 | 8 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 9 | def __init__( 10 | self, 11 | optimizer, 12 | milestones, 13 | gamma=0.1, 14 | warmup_factor=1.0 / 3, 15 | warmup_iters=500, 16 | warmup_method="linear", 17 | last_epoch=-1, 18 | ): 19 | if not list(milestones) == sorted(milestones): 20 | raise ValueError( 21 | "Milestones should be a list of" " increasing integers. 
Got {}", 22 | milestones, 23 | ) 24 | 25 | if warmup_method not in ("constant", "linear"): 26 | raise ValueError( 27 | "Only 'constant' or 'linear' warmup_method accepted" 28 | "got {}".format(warmup_method) 29 | ) 30 | self.milestones = milestones 31 | self.gamma = gamma 32 | self.warmup_factor = warmup_factor 33 | self.warmup_iters = warmup_iters 34 | self.warmup_method = warmup_method 35 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 36 | 37 | def get_lr(self): 38 | warmup_factor = 1 39 | if self.last_epoch < self.warmup_iters: 40 | if self.warmup_method == "constant": 41 | warmup_factor = self.warmup_factor 42 | elif self.warmup_method == "linear": 43 | alpha = float(self.last_epoch) / self.warmup_iters 44 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 45 | return [ 46 | base_lr 47 | * warmup_factor 48 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 49 | for base_lr in self.base_lrs 50 | ] 51 | 52 | class HalfPeriodCosStepLR(torch.optim.lr_scheduler._LRScheduler): 53 | def __init__( 54 | self, 55 | optimizer, 56 | warmup_factor=1.0 / 3, 57 | warmup_iters=8000, 58 | max_iters=60000, 59 | warmup_method="linear", 60 | last_epoch=-1, 61 | ): 62 | if warmup_method not in ("constant", "linear"): 63 | raise ValueError( 64 | "Only 'constant' or 'linear' warmup_method accepted" 65 | "got {}".format(warmup_method) 66 | ) 67 | self.warmup_factor = warmup_factor 68 | self.warmup_iters = warmup_iters 69 | self.max_iters = max_iters 70 | self.warmup_method = warmup_method 71 | super(HalfPeriodCosStepLR, self).__init__(optimizer, last_epoch) 72 | 73 | def get_lr(self): 74 | warmup_factor = 1 75 | if self.last_epoch < self.warmup_iters: 76 | if self.warmup_method == "constant": 77 | warmup_factor = self.warmup_factor 78 | elif self.warmup_method == "linear": 79 | alpha = float(self.last_epoch) / self.warmup_iters 80 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 81 | else: 82 | warmup_factor = 0.5 * (math.cos(self.last_epoch / self.max_iters * math.pi) + 1) 83 | return [ 84 | base_lr 85 | * warmup_factor 86 | for base_lr in self.base_lrs 87 | ] -------------------------------------------------------------------------------- /alphaction/structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/structures/__init__.py -------------------------------------------------------------------------------- /alphaction/structures/memory_pool.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | class MemoryPool(object): 4 | def __init__(self): 5 | self.cache = defaultdict(dict) 6 | 7 | def update(self, update_info): 8 | for movie_id, feature_per_movie in update_info.items(): 9 | self.cache[movie_id].update(feature_per_movie) 10 | 11 | def update_list(self, update_info_list): 12 | for update_info in update_info_list: 13 | self.update(update_info) 14 | 15 | def __getitem__(self, item): 16 | if isinstance(item, tuple) and len(item)==2: 17 | return self.cache[item[0]][item[1]] 18 | return self.cache[item] 19 | 20 | def __setitem__(self, key, value): 21 | if isinstance(key, tuple) and len(key)==2: 22 | self.cache[key[0]][key[1]] = value 23 | else: 24 | self.cache[key] = value 25 | 26 | def __delitem__(self, item): 27 | if isinstance(item, tuple) and len(item)==2: 28 | del self.cache[item[0]][item[1]] 29 | else: 30 | del 
self.cache[item] 31 | 32 | def __contains__(self, item): 33 | if isinstance(item, tuple) and len(item)==2: 34 | return (item[0] in self.cache and item[1] in self.cache[item[0]]) 35 | return (item in self.cache) 36 | 37 | def items(self): 38 | return self.cache.items() -------------------------------------------------------------------------------- /alphaction/utils/IA_helper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | def _block_set(ia_blocks): 4 | if len(ia_blocks) > 0 and isinstance(ia_blocks[0], list): 5 | ia_blocks = list(itertools.chain.from_iterable(ia_blocks)) 6 | return ia_blocks 7 | 8 | def has_person(ia_config): 9 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 10 | return (ia_config.ACTIVE and 'P' in ia_blocks and ia_config.MAX_PERSON > 0) 11 | 12 | 13 | def has_object(ia_config): 14 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 15 | return (ia_config.ACTIVE and 'O' in ia_blocks and ia_config.MAX_OBJECT > 0) 16 | 17 | 18 | def has_memory(ia_config): 19 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 20 | return (ia_config.ACTIVE and 'M' in ia_blocks and ia_config.MAX_PER_SEC > 0) 21 | -------------------------------------------------------------------------------- /alphaction/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/alphaction/utils/__init__.py -------------------------------------------------------------------------------- /alphaction/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/logger.py 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | 8 | def setup_logger(name, save_dir, distributed_rank, filename=None): 9 | logger = logging.getLogger(name) 10 | logger.setLevel(logging.DEBUG) 11 | logger.propagate = False 12 | # don't log results for the non-master process 13 | if distributed_rank > 0: 14 | return logger 15 | ch = logging.StreamHandler(stream=sys.stdout) 16 | ch.setLevel(logging.DEBUG) 17 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 18 | ch.setFormatter(formatter) 19 | logger.addHandler(ch) 20 | 21 | if save_dir: 22 | if filename is None: 23 | filename = time.strftime("%Y-%m-%d_%H.%M.%S", time.localtime()) + ".log" 24 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 25 | fh.setLevel(logging.DEBUG) 26 | fh.setFormatter(formatter) 27 | logger.addHandler(fh) 28 | 29 | return logger 30 | 31 | def setup_tblogger(save_dir, distributed_rank): 32 | if distributed_rank>0: 33 | return None 34 | from tensorboardX import SummaryWriter 35 | tbdir = os.path.join(save_dir,'tb') 36 | os.makedirs(tbdir,exist_ok=True) 37 | tblogger = SummaryWriter(tbdir) 38 | return tblogger -------------------------------------------------------------------------------- /alphaction/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/metric_logger.py 2 | from collections import defaultdict 3 | from collections import deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue(object): 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global 
series average. 11 | """ 12 | 13 | def __init__(self, window_size=10): 14 | self.deque = deque(maxlen=window_size) 15 | # self.series = [] 16 | self.total = 0.0 17 | self.count = 0 18 | 19 | def update(self, value): 20 | self.deque.append(value) 21 | # self.series.append(value) 22 | self.count += 1 23 | self.total += value 24 | 25 | @property 26 | def median(self): 27 | d = torch.tensor(list(self.deque)) 28 | return d.median().item() 29 | 30 | @property 31 | def avg(self): 32 | d = torch.tensor(list(self.deque)) 33 | return d.mean().item() 34 | 35 | @property 36 | def global_avg(self): 37 | return self.total / self.count 38 | 39 | 40 | class MetricLogger(object): 41 | def __init__(self, delimiter="\t"): 42 | self.meters = defaultdict(SmoothedValue) 43 | self.delimiter = delimiter 44 | 45 | def update(self, **kwargs): 46 | for k, v in kwargs.items(): 47 | if isinstance(v, torch.Tensor): 48 | v = v.item() 49 | assert isinstance(v, (float, int)) 50 | self.meters[k].update(v) 51 | 52 | def __getattr__(self, attr): 53 | if attr in self.meters: 54 | return self.meters[attr] 55 | if attr in self.__dict__: 56 | return self.__dict__[attr] 57 | raise AttributeError("'{}' object has no attribute '{}'".format( 58 | type(self).__name__, attr)) 59 | 60 | def __str__(self): 61 | loss_str = [] 62 | for name, meter in self.meters.items(): 63 | loss_str.append( 64 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 65 | ) 66 | return self.delimiter.join(loss_str) 67 | -------------------------------------------------------------------------------- /alphaction/utils/model_serialization.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/model_serialization.py 2 | from collections import OrderedDict 3 | import logging 4 | 5 | import torch 6 | 7 | 8 | def align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head): 9 | """ 10 | Strategy: suppose that the models that we will create will have prefixes appended 11 | to each of its keys, for example due to an extra level of nesting that the original 12 | pre-trained weights from ImageNet won't contain. For example, model.state_dict() 13 | might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains 14 | res2.conv1.weight. We thus want to match both parameters together. 15 | For that, we look for each model weight, look among all loaded keys if there is one 16 | that is a suffix of the current weight name, and use it if that's the case. 17 | If multiple matches exist, take the one with longest size 18 | of the corresponding name. For example, for the same model as before, the pretrained 19 | weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, 20 | we want to match backbone[0].body.conv1.weight to conv1.weight, and 21 | backbone[0].body.res2.conv1.weight to res2.conv1.weight. 
22 | """ 23 | current_keys = sorted(list(model_state_dict.keys())) 24 | loaded_keys = sorted(list(loaded_state_dict.keys())) 25 | # get a matrix of string matches, where each (i, j) entry correspond to the size of the 26 | # loaded_key string, if it matches 27 | match_matrix = [ 28 | len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys 29 | ] 30 | match_matrix = torch.as_tensor(match_matrix).view( 31 | len(current_keys), len(loaded_keys) 32 | ) 33 | max_match_size, idxs = match_matrix.max(1) 34 | # remove indices that correspond to no-match 35 | idxs[max_match_size == 0] = -1 36 | 37 | # used for logging 38 | max_size = max([len(key) for key in current_keys]) if current_keys else 1 39 | max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 40 | # log_str_template = "{: <{}} loaded from {: <{}} of shape {}" 41 | logger = logging.getLogger(__name__) 42 | for idx_new, idx_old in enumerate(idxs.tolist()): 43 | if idx_old == -1: 44 | continue 45 | key = current_keys[idx_new] 46 | key_old = loaded_keys[idx_old] 47 | 48 | if no_head and key_old.startswith("roi_heads."): 49 | logger.info("{} will not be loaded.".format(key)) 50 | continue 51 | 52 | model_state_dict[key] = loaded_state_dict[key_old] 53 | # logger.info( 54 | # log_str_template.format( 55 | # key, 56 | # max_size, 57 | # key_old, 58 | # max_size_loaded, 59 | # tuple(loaded_state_dict[key_old].shape), 60 | # ) 61 | # ) 62 | 63 | 64 | def strip_prefix_if_present(state_dict, prefix): 65 | keys = sorted(state_dict.keys()) 66 | if not all(key.startswith(prefix) for key in keys): 67 | return state_dict 68 | stripped_state_dict = OrderedDict() 69 | for key, value in state_dict.items(): 70 | stripped_state_dict[key.replace(prefix, "")] = value 71 | return stripped_state_dict 72 | 73 | 74 | def exclude_layers(model_state_dict, excluded): 75 | model_state_dict_new = OrderedDict() 76 | for key, value in model_state_dict.items(): 77 | if any([exc in key for exc in excluded]): 78 | continue 79 | model_state_dict_new[key] = value 80 | return model_state_dict_new 81 | 82 | 83 | def load_state_dict(model, loaded_state_dict, no_head, excluded=[]): 84 | model_state_dict = model.state_dict() 85 | # if the state_dict comes from a model that was wrapped in a 86 | # DataParallel or DistributedDataParallel during serialization, 87 | # remove the "module" prefix before performing the matching 88 | loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") 89 | 90 | if len(excluded) > 0: 91 | # exclude specified layers 92 | loaded_state_dict = exclude_layers(loaded_state_dict, excluded) 93 | 94 | align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head) 95 | 96 | # use strict loading 97 | model.load_state_dict(model_state_dict) 98 | -------------------------------------------------------------------------------- /alphaction/utils/random_seed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import numpy as np 4 | 5 | def set_seed(seed, rank, world_size): 6 | rng = random.Random(seed) 7 | seed_per_rank = [rng.randint(0, 2**32-1) for _ in range(world_size)] 8 | cur_seed = seed_per_rank[rank] 9 | random.seed(cur_seed) 10 | torch.manual_seed(cur_seed) 11 | torch.cuda.manual_seed(cur_seed) 12 | np.random.seed(cur_seed) -------------------------------------------------------------------------------- /alphaction/utils/registry.py: 
-------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/registry.py 2 | 3 | def _register_generic(module_dict, module_name, module): 4 | assert module_name not in module_dict 5 | module_dict[module_name] = module 6 | 7 | 8 | class Registry(dict): 9 | ''' 10 | A helper class for managing module registration. It extends a dictionary 11 | and provides a register function. 12 | 13 | Eg. creating a registry: 14 | some_registry = Registry({"default": default_module}) 15 | 16 | There are two ways of registering new modules: 17 | 1): the normal way is to call the register function: 18 | def foo(): 19 | ... 20 | some_registry.register("foo_module", foo) 21 | 2): used as a decorator when declaring the module: 22 | @some_registry.register("foo_module") 23 | @some_registry.register("foo_module_nickname") 24 | def foo(): 25 | ... 26 | 27 | Accessing a module is just like using a dictionary, eg: 28 | f = some_registry["foo_module"] 29 | ''' 30 | def __init__(self, *args, **kwargs): 31 | super(Registry, self).__init__(*args, **kwargs) 32 | 33 | def register(self, module_name, module=None): 34 | # used as function call 35 | if module is not None: 36 | _register_generic(self, module_name, module) 37 | return 38 | 39 | # used as decorator 40 | def register_fn(fn): 41 | _register_generic(self, module_name, fn) 42 | return fn 43 | 44 | return register_fn 45 | -------------------------------------------------------------------------------- /alphaction/utils/video_decode.py: -------------------------------------------------------------------------------- 1 | import av 2 | 3 | def av_decode_video(video_path): 4 | frames = [] 5 | try: 6 | with av.open(video_path) as container: 7 | for frame in container.decode(video=0): 8 | frames.append(frame.to_rgb().to_ndarray()) 9 | except Exception: 10 | # keep the frames decoded before the failure, but fail loudly if none were decoded 11 | assert len(frames) != 0, "Failed to decode video: {}".format(video_path) 12 | return frames -------------------------------------------------------------------------------- /alphaction/utils/visualize.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | import torch 3 | from torchvision.ops import box_convert 4 | import numpy as np 5 | import supervision as sv 6 | import cv2 7 | import imageio 8 | 9 | 10 | def annotate(image_source: np.ndarray, boxes: torch.Tensor, normalized=True, logits=None, phrases=[], is_xyxy=False, color=None, text_padding=10, set_text_color='black') -> np.ndarray: 11 | h, w, _ = image_source.shape 12 | if normalized: 13 | boxes = boxes * torch.Tensor([w, h, w, h]) 14 | if not is_xyxy: 15 | assert isinstance(boxes, torch.Tensor) 16 | xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() 17 | elif isinstance(boxes, torch.Tensor): 18 | xyxy = boxes.numpy() 19 | else: 20 | xyxy = boxes 21 | detections = sv.Detections(xyxy=xyxy) 22 | 23 | if logits is not None and len(phrases) == logits.size(0): 24 | labels = [ 25 | f"{phrase} {logit:.2f}" 26 | for phrase, logit 27 | in zip(phrases, logits) 28 | ] 29 | else: 30 | labels = phrases 31 | 32 | if color is None or (not isinstance(color, tuple)): 33 | svcolor = sv.ColorPalette.default() 34 | else: 35 | svcolor = sv.Color(*color) 36 | text_color = sv.Color.white() if set_text_color == 'white' else sv.Color.black() 37 | box_annotator = sv.BoxAnnotator(color=svcolor, text_padding=text_padding, text_color=text_color) 38 | annotated_frame = cv2.cvtColor(image_source, 
cv2.COLOR_RGB2BGR) 39 | annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) 40 | return annotated_frame 41 | 42 | 43 | def video_to_gif(video, giffile, fps=5.0, toBGR=False): 44 | assert giffile.endswith('.gif') 45 | with imageio.get_writer(giffile, mode='I', duration=1.0/fps, loop=0) as writer: 46 | for frame in video: 47 | frame_vis = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) if toBGR else np.copy(frame) 48 | writer.append_data(frame_vis) -------------------------------------------------------------------------------- /assets/wacv25_openmixer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cogito2012/OpenMixer/3328de29d2cf75f0ae69c2abbafc2dadb5d1e629/assets/wacv25_openmixer.png -------------------------------------------------------------------------------- /config_files/jhmdb/openmixer_e2e.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/JHMDB" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['jhmdb'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | JHMDB: 11 | FRAME_DIR: "Frames/" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | SAMPLES_SPLIT: 0 16 | VOCAB_REFINE: 'vocab_gpt3.5.json' 17 | MODEL: 18 | WEIGHT: null 19 | BACKBONE: 20 | CONV_BODY: "ViP-B/16" 21 | PATHWAYS: 1 22 | RESIDUAL_LATERAL: True 23 | STM: 24 | NUM_QUERIES: 100 25 | HIDDEN_DIM: 512 26 | NUM_STAGES: 3 27 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 28 | OBJECT_CLASSES: 1 29 | NUM_HEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | NUM_FCS: 2 33 | ACTIVATION: 'ReLU' 34 | SPATIAL_POINTS: 32 35 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 36 | OUT_MULTIPLIER: 4 37 | N_GROUPS: 4 38 | NUM_CLS: 1 39 | NUM_ACT: 1 40 | NUM_REG: 1 41 | OBJECT_WEIGHT: 2.0 42 | ACTION_WEIGHT: 48.0 43 | GIOU_WEIGHT: 2.0 44 | L1_WEIGHT: 2.0 45 | BACKGROUND_WEIGHT: 0.1 46 | INTERMEDIATE_SUPERVISION: True 47 | PERSON_THRESHOLD: 0.6 48 | USE_CLS_FEAT: True 49 | COND_CLS: True 50 | FUSE_CLS: True 51 | FUSE_METHOD: 'logit_fusion' 52 | FUSE_FACTOR: 0.99 53 | DeST: True 54 | TEXT_ENCODER: 'CLIPViP' 55 | CLIPViP: 56 | ARCH: ViP-B/16 57 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 58 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 59 | TEMPORAL_SIZE: 12 60 | USE_TEMPORAL_EMBED: True 61 | LOGIT_SCALE_INIT: 4.6 62 | ADD_CLS_NUM: 3 63 | CONTEXT_INIT: 'a ' 64 | LEN_CONTEXT: 24 65 | CAM_METHOD: 'RITSM' 66 | USE_ATTN: False 67 | MULTI_LABEL_ACTION: False # softmax 68 | ViT: 69 | LAYER_DECAY: 1.0 70 | WEIGHT_DECAY: 1e-5 71 | SOLVER: 72 | MAX_EPOCH: 12 73 | BASE_LR: 0.00001 74 | WEIGHT_DECAY: 1e-4 75 | STEPS: (5, 8) 76 | WARMUP_FACTOR: 0.1 77 | WARMUP_EPOCH: 2 78 | CHECKPOINT_PERIOD: 1 79 | EVAL_PERIOD: 1 80 | EVAL_AFTER: 2 81 | VIDEOS_PER_BATCH: 16 82 | OPTIMIZING_METHOD: 'adamw' 83 | TEST: 84 | VIDEOS_PER_BATCH: 16 85 | EVAL_OPEN: True 86 | METRIC: 'video_ap' 87 | SMALL_OPEN_WORLD: True 88 | INDEPENDENT_EVAL: True 89 | OUTPUT_DIR: "output/jhmdb/openmixer_e2e" 90 | -------------------------------------------------------------------------------- /config_files/jhmdb/openmixer_zsr_tl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/JHMDB" 3 
| NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['jhmdb'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | JHMDB: 11 | FRAME_DIR: "Frames/" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | SAMPLES_SPLIT: 0 16 | VOCAB_REFINE: 'vocab_gpt3.5.json' 17 | MODEL: 18 | WEIGHT: null 19 | BACKBONE: 20 | CONV_BODY: "ViP-B/16" 21 | PATHWAYS: 1 22 | STM: 23 | NUM_QUERIES: 100 24 | HIDDEN_DIM: 512 25 | NUM_STAGES: 3 26 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 27 | OBJECT_CLASSES: 1 28 | NUM_HEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | NUM_FCS: 2 32 | ACTIVATION: 'ReLU' 33 | SPATIAL_POINTS: 32 34 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 35 | OUT_MULTIPLIER: 4 36 | N_GROUPS: 4 37 | NUM_CLS: 1 38 | NUM_ACT: 1 39 | NUM_REG: 1 40 | OBJECT_WEIGHT: 2.0 41 | ACTION_WEIGHT: 48.0 42 | GIOU_WEIGHT: 2.0 43 | L1_WEIGHT: 2.0 44 | BACKGROUND_WEIGHT: 0.1 45 | INTERMEDIATE_SUPERVISION: True 46 | PERSON_THRESHOLD: 0.6 47 | USE_CLS_FEAT: True 48 | PRETRAIN_ACTION: True 49 | TEXT_ENCODER: 'CLIPViP' 50 | CLIPViP: 51 | ARCH: ViP-B/16 52 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 53 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 54 | TEMPORAL_SIZE: 12 55 | USE_TEMPORAL_EMBED: True 56 | LOGIT_SCALE_INIT: 4.6 57 | ADD_CLS_NUM: 3 58 | CONTEXT_INIT: 'a ' 59 | LEN_CONTEXT: 24 60 | CAM_METHOD: 'RITSM' 61 | USE_ATTN: False 62 | MULTI_LABEL_ACTION: False # softmax 63 | ViT: 64 | LAYER_DECAY: 1.0 65 | WEIGHT_DECAY: 1e-5 66 | SOLVER: 67 | MAX_EPOCH: 12 68 | BASE_LR: 0.00001 69 | WEIGHT_DECAY: 1e-4 70 | STEPS: (5, 8) 71 | WARMUP_FACTOR: 0.1 72 | WARMUP_EPOCH: 2 73 | CHECKPOINT_PERIOD: 1 74 | EVAL_PERIOD: 1 75 | EVAL_AFTER: 2 76 | VIDEOS_PER_BATCH: 16 77 | OPTIMIZING_METHOD: 'adamw' 78 | TEST: 79 | VIDEOS_PER_BATCH: 16 80 | EVAL_OPEN: True 81 | METRIC: 'video_ap' 82 | SMALL_OPEN_WORLD: True 83 | INDEPENDENT_EVAL: True 84 | OUTPUT_DIR: "output/jhmdb/openmixer_zsr_tl" 85 | -------------------------------------------------------------------------------- /config_files/jhmdb/openmixer_zsr_zsl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/JHMDB" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['jhmdb'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | JHMDB: 11 | FRAME_DIR: "Frames/" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | SAMPLES_SPLIT: 0 16 | VOCAB_REFINE: 'vocab_gpt3.5.json' 17 | PRIOR_BOX_FILE: 'JHMDB-MaskRCNN.pkl' 18 | MODEL: 19 | DET: NaiveBaseline 20 | MULTI_LABEL_ACTION: False 21 | PRIOR_BOXES_INIT: 'det' 22 | WEIGHT: null 23 | BACKBONE: 24 | CONV_BODY: "ViP-B/16" 25 | PATHWAYS: 1 26 | STM: 27 | USE_CLS_FEAT: True 28 | TEXT_ENCODER: 'CLIPViP' 29 | CLIPViP: 30 | ARCH: ViP-B/16 31 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 32 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 33 | TEMPORAL_SIZE: 12 34 | USE_TEMPORAL_EMBED: True 35 | LOGIT_SCALE_INIT: 4.6 36 | ADD_CLS_NUM: 3 37 | # CONTEXT_INIT: 'a video of ' 38 | LEN_CONTEXT: 24 39 | TEST: 40 | VIDEOS_PER_BATCH: 32 41 | EVAL_OPEN: True 42 | METRIC: 'video_ap' 43 | SMALL_OPEN_WORLD: 
True 44 | INDEPENDENT_EVAL: True 45 | OUTPUT_DIR: "output/jhmdb/openmixer_zsr_zsl" 46 | -------------------------------------------------------------------------------- /config_files/ucf24/openmixer_e2e.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/UCF24" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['ucf24'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | UCF24: 11 | FRAME_DIR: "rgb-images" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | VOCAB_REFINE: 'vocab_gpt4.json' 16 | # PRIOR_BOX_FILE: 'UCF24-GDINO-top10.pkl' # 'UCF24-MaskRCNN.pkl', 'UCF24-GDINO-top10.pkl' 17 | MODEL: 18 | WEIGHT: null 19 | BACKBONE: 20 | CONV_BODY: "ViP-B/16" 21 | PATHWAYS: 1 22 | RESIDUAL_LATERAL: True 23 | STM: 24 | NUM_QUERIES: 100 25 | HIDDEN_DIM: 512 26 | NUM_STAGES: 3 27 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 28 | OBJECT_CLASSES: 1 29 | NUM_HEADS: 8 30 | DROPOUT: 0.0 31 | DIM_FEEDFORWARD: 2048 32 | NUM_FCS: 2 33 | ACTIVATION: 'ReLU' 34 | SPATIAL_POINTS: 32 35 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 36 | OUT_MULTIPLIER: 4 37 | N_GROUPS: 4 38 | NUM_CLS: 1 39 | NUM_ACT: 1 40 | NUM_REG: 1 41 | OBJECT_WEIGHT: 2.0 42 | ACTION_WEIGHT: 8.0 43 | GIOU_WEIGHT: 2.0 44 | L1_WEIGHT: 2.0 45 | BACKGROUND_WEIGHT: 0.1 46 | INTERMEDIATE_SUPERVISION: True 47 | PERSON_THRESHOLD: 0.6 48 | USE_CLS_FEAT: True 49 | COND_CLS: True 50 | FUSE_CLS: True 51 | FUSE_METHOD: 'logit_fusion' 52 | FUSE_FACTOR: 0.999 53 | DeST: True 54 | TEXT_ENCODER: 'CLIPViP' 55 | CLIPViP: 56 | ARCH: ViP-B/16 57 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 58 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 59 | TEMPORAL_SIZE: 12 60 | USE_TEMPORAL_EMBED: True 61 | LOGIT_SCALE_INIT: 4.6 62 | ADD_CLS_NUM: 3 63 | CONTEXT_INIT: '' 64 | LEN_CONTEXT: 24 65 | CAM_METHOD: 'RITSM' 66 | USE_ATTN: False 67 | MULTI_LABEL_ACTION: False # softmax 68 | ViT: 69 | LAYER_DECAY: 1.0 70 | WEIGHT_DECAY: 1e-5 71 | SOLVER: 72 | MAX_EPOCH: 12 73 | BASE_LR: 0.00001 74 | WEIGHT_DECAY: 1e-4 75 | STEPS: (5, 8) 76 | WARMUP_FACTOR: 0.1 77 | WARMUP_EPOCH: 2 78 | CHECKPOINT_PERIOD: 1 79 | EVAL_PERIOD: 1 80 | EVAL_AFTER: 2 81 | VIDEOS_PER_BATCH: 8 82 | OPTIMIZING_METHOD: 'adamw' 83 | TEST: 84 | VIDEOS_PER_BATCH: 16 85 | EVAL_OPEN: True 86 | METRIC: 'video_ap' 87 | SMALL_OPEN_WORLD: True 88 | INDEPENDENT_EVAL: True 89 | IOU_THRESH: 0.2 90 | # PRIOR_BOX_TEST: True 91 | OUTPUT_DIR: "output/ucf24/openmixer_e2e" 92 | -------------------------------------------------------------------------------- /config_files/ucf24/openmixer_zsr_tl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/UCF24" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['ucf24'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | UCF24: 11 | FRAME_DIR: "rgb-images" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | VOCAB_REFINE: 'vocab_gpt4.json' 16 | MODEL: 17 | WEIGHT: null 18 | BACKBONE: 19 | CONV_BODY: "ViP-B/16" 20 | PATHWAYS: 1 21 | RESIDUAL_LATERAL: True 22 | STM: 23 | NUM_QUERIES: 100 24 
| HIDDEN_DIM: 512 25 | NUM_STAGES: 3 26 | ACTION_CLASSES: 10 # 50%: 10, 75%: 15 27 | OBJECT_CLASSES: 1 28 | NUM_HEADS: 8 29 | DROPOUT: 0.0 30 | DIM_FEEDFORWARD: 2048 31 | NUM_FCS: 2 32 | ACTIVATION: 'ReLU' 33 | SPATIAL_POINTS: 32 34 | TEMPORAL_POINTS: 16 # must be the same as NUM_FRAMES 35 | OUT_MULTIPLIER: 4 36 | N_GROUPS: 4 37 | NUM_CLS: 1 38 | NUM_ACT: 1 39 | NUM_REG: 1 40 | OBJECT_WEIGHT: 2.0 41 | ACTION_WEIGHT: 48.0 42 | GIOU_WEIGHT: 2.0 43 | L1_WEIGHT: 2.0 44 | BACKGROUND_WEIGHT: 0.1 45 | INTERMEDIATE_SUPERVISION: True 46 | PERSON_THRESHOLD: 0.6 47 | USE_CLS_FEAT: True 48 | PRETRAIN_ACTION: True 49 | TEXT_ENCODER: 'CLIPViP' 50 | CLIPViP: 51 | ARCH: ViP-B/16 52 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 53 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 54 | TEMPORAL_SIZE: 12 55 | USE_TEMPORAL_EMBED: True 56 | LOGIT_SCALE_INIT: 4.6 57 | ADD_CLS_NUM: 3 58 | CONTEXT_INIT: '' 59 | LEN_CONTEXT: 24 60 | CAM_METHOD: 'RITSM' 61 | USE_ATTN: False 62 | MULTI_LABEL_ACTION: False # softmax 63 | ViT: 64 | LAYER_DECAY: 1.0 65 | WEIGHT_DECAY: 1e-5 66 | SOLVER: 67 | MAX_EPOCH: 12 68 | BASE_LR: 0.00001 69 | WEIGHT_DECAY: 1e-4 70 | STEPS: (5, 8) 71 | WARMUP_FACTOR: 0.1 72 | WARMUP_EPOCH: 2 73 | CHECKPOINT_PERIOD: 1 74 | EVAL_PERIOD: 1 75 | EVAL_AFTER: 2 76 | VIDEOS_PER_BATCH: 16 77 | OPTIMIZING_METHOD: 'adamw' 78 | TEST: 79 | VIDEOS_PER_BATCH: 16 80 | EVAL_OPEN: True 81 | METRIC: 'video_ap' 82 | SMALL_OPEN_WORLD: True 83 | INDEPENDENT_EVAL: True 84 | IOU_THRESH: 0.2 85 | OUTPUT_DIR: "output/ucf24/openmixer_zsr_tl" 86 | -------------------------------------------------------------------------------- /config_files/ucf24/openmixer_zsr_zsl.yaml: -------------------------------------------------------------------------------- 1 | DATA: 2 | PATH_TO_DATA_DIR: "data/UCF24" 3 | NUM_FRAMES: 16 4 | SAMPLING_RATE: 1 5 | MEAN: [0.48145466, 0.4578275, 0.40821073] 6 | STD: [0.26862954, 0.26130258, 0.27577711] # from CLIPViP (same as CLIP) 7 | DATASETS: ['ucf24'] 8 | OPEN_VOCABULARY: True 9 | REFINE_VOCAB: True 10 | UCF24: 11 | FRAME_DIR: "rgb-images" 12 | OPEN_WORLD_DIR: 'openworld' 13 | CW_SPLIT_FILE: 'train50%/closed_world_0.pkl' 14 | OW_SPLIT_FILE: 'train50%/open_world_0_small.pkl' 15 | VOCAB_REFINE: 'vocab_gpt4.json' 16 | PRIOR_BOX_FILE: 'UCF24-MaskRCNN.pkl' 17 | MODEL: 18 | DET: NaiveBaseline 19 | MULTI_LABEL_ACTION: False 20 | PRIOR_BOXES_INIT: 'det' # prior boxes in testing 21 | WEIGHT: null 22 | BACKBONE: 23 | CONV_BODY: "ViP-B/16" 24 | PATHWAYS: 1 25 | STM: 26 | USE_CLS_FEAT: True 27 | TEXT_ENCODER: 'CLIPViP' 28 | CLIPViP: 29 | ARCH: ViP-B/16 30 | CLIP_NAME: "openai/clip-vit-base-patch16" # load from huggingface 31 | WEIGHT: "pretrained/pretrain_clipvip_base_16.pt" 32 | TEMPORAL_SIZE: 12 33 | USE_TEMPORAL_EMBED: True 34 | LOGIT_SCALE_INIT: 4.6 35 | ADD_CLS_NUM: 3 36 | # CONTEXT_INIT: 'a video of ' 37 | LEN_CONTEXT: 24 38 | TEST: 39 | VIDEOS_PER_BATCH: 64 40 | EVAL_OPEN: True 41 | METRIC: 'video_ap' 42 | SMALL_OPEN_WORLD: True 43 | INDEPENDENT_EVAL: True 44 | IOU_THRESH: 0.2 45 | OUTPUT_DIR: "output/ucf24/openmixer_zsr_zsl" 46 | -------------------------------------------------------------------------------- /preprocess/generate_vdt_jhmdb.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import os 4 | import copy 5 | import json 6 | 7 | import openai 8 | openai.api_key = "YOUR_OPENAI_KEY_HERE" 9 | 10 | 11 | def read_class_list(filepath): 12 | class_list = [] 13 | with open(filepath, 'r') as f: 14 | for line in 
f.readlines(): 15 | class_list.append(line.strip()) 16 | return class_list 17 | 18 | def read_class_description(filepath): 19 | with open(filepath, 'r') as f: 20 | refine_maps = json.load(f) 21 | return refine_maps 22 | 23 | 24 | def run_gpt4(class_name): 25 | prompt = """ 26 | What are the visual features for distinguishing {}? Please describe with a few short sentences. 27 | """ 28 | cls_name = re.sub("_", " ", class_name) 29 | message = [ 30 | {"role": "system", "content": "You are a useful assistant."}, 31 | {"role": "user", "content": prompt.format(cls_name)} 32 | ] 33 | 34 | response = openai.ChatCompletion.create( 35 | model="gpt-4-0613", 36 | max_tokens=1024, 37 | temperature=1.2, 38 | messages = message) 39 | 40 | # parse the response 41 | result = response['choices'][0]['message']['content'] 42 | return result 43 | 44 | 45 | def generate_different_meaning(): 46 | jhmdb_classes = read_class_list(os.path.join(data_path, 'vocab_open.txt')) 47 | 48 | results = {} 49 | for clsname in jhmdb_classes: 50 | print("\nProcessing action: {}...".format(clsname)) 51 | cls_name = re.sub("_", " ", clsname) 52 | prompt = f"Generate 16 unique sentences describing the action '{cls_name}':" 53 | message = [ 54 | {"role": "system", "content": "You are a useful assistant."}, 55 | {"role": "user", "content": prompt} 56 | ] 57 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed 58 | res = response.choices[0]['message']['content'].strip().split('\n') 59 | print(res) 60 | 61 | results[clsname] = res 62 | 63 | with open(os.path.join(data_path, "vocab_gpt4_m16.json"), "w") as outfile: 64 | json.dump(results, outfile) 65 | 66 | 67 | def generate_same_meaning(): 68 | class_descriptions = read_class_description(os.path.join(data_path, 'vocab_gpt3.5.json')) 69 | 70 | results = {} 71 | for clsname, desc in class_descriptions.items(): 72 | print("\nProcessing action: {}...".format(clsname)) 73 | cls_name = re.sub("_", " ", clsname) 74 | cap_prefix, cap = desc.split(": ") 75 | prompt = f"Given a sport action type from JHMDB dataset, such as '{cls_name}, please provide 16 different sentences that express the same meaning of the caption: '{cap}'." 76 | message = [ 77 | {"role": "system", "content": "You are a useful assistant."}, 78 | {"role": "user", "content": prompt} 79 | ] 80 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed 81 | res = response.choices[0]['message']['content'].strip().split('\n') 82 | res = [desc] + [re.sub(r'\d+.', f'{cap_prefix}:', cap) for cap in res] 83 | print(res) 84 | 85 | results[clsname] = res 86 | 87 | 88 | with open(os.path.join(data_path, "vocab_gpt4_m16new.json"), "w") as outfile: 89 | json.dump(results, outfile, indent=4) 90 | 91 | 92 | if __name__ == '__main__': 93 | random.seed(42) 94 | data_path = '../data/JHMDB/openworld' 95 | 96 | # generate_different_meaning() 97 | 98 | # generate_same_meaning() 99 | 100 | class_descriptions = read_class_description(os.path.join(data_path, 'vocab_gpt3.5.json')) 101 | 102 | # get candidate verbs 103 | seen_classes = read_class_list(os.path.join(data_path, 'train50%', 'vocab_closed_0.txt')) 104 | verbs_list = [clsname.split("_")[0] for clsname in seen_classes] 105 | 106 | prompt = """In this task, you are given an input sentence. 
107 | Your job is to tell me 16 output sentences with different meanings by only changing the action verbs using a list of candidate verbs. 108 | The output format should be a dictionary of key-value pair where keys are the verbs you are choosing, and values are the generated sentences.""" 109 | 110 | results = {} 111 | for clsname, desc in class_descriptions.items(): 112 | if clsname not in seen_classes: 113 | continue # only process the seen classes 114 | print("\nProcessing action: {}...".format(clsname)) 115 | cls_name = re.sub("_", " ", clsname) 116 | cap_prefix, cap = desc.split(": ") 117 | verbs_sub = copy.deepcopy(verbs_list) 118 | verbs_sub.remove(clsname.split("_")[0]) 119 | verbs_sub = ', '.join(verbs_sub) 120 | message = [ 121 | {"role": "system", "content": "You are a useful assistant."}, 122 | {"role": "user", "content": prompt + f" The input sentence: {cap} The candidate verb list: [{verbs_sub}]."} 123 | ] 124 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) 125 | res = response.choices[0]['message']['content'].strip().split('\n') 126 | result_list = [] 127 | for strline in res: 128 | if ': ' not in strline: 129 | continue 130 | strline = re.sub("\"", "", strline.strip(",")) 131 | prefix, sentence = strline.split(': ') 132 | result_list.append("{}: {}".format(prefix.capitalize(), sentence)) 133 | if len(result_list) == 8: 134 | break 135 | print(result_list) 136 | 137 | results[clsname] = result_list 138 | 139 | with open(os.path.join(data_path, 'train50%', "hardneg_closed_0.json"), "w") as outfile: 140 | json.dump(results, outfile, indent=4) 141 | 142 | -------------------------------------------------------------------------------- /preprocess/generate_vdt_ucf24.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import os 4 | import copy 5 | import json 6 | 7 | import openai 8 | openai.api_key = "YOUR_OPENAI_KEY_HERE" 9 | 10 | 11 | 12 | def read_class_list(filepath): 13 | class_list = [] 14 | with open(filepath, 'r') as f: 15 | for line in f.readlines(): 16 | class_list.append(line.strip()) 17 | return class_list 18 | 19 | 20 | if __name__ == '__main__': 21 | random.seed(42) 22 | data_path = '../data/UCF24/openworld' 23 | 24 | ucf24_classes = read_class_list(os.path.join(data_path, 'vocab_open.txt')) 25 | 26 | results = {} 27 | for clsname in ucf24_classes: 28 | print("\nProcessing action: {}...".format(clsname)) 29 | prompt = f"Generate 16 captions that describe the action '{clsname}'. For example, given the action dance, your output will be like: Dance: A person is dancing on the stage, with the body moving rhythmically to music." 30 | message = [ 31 | {"role": "system", "content": "You are a useful assistant."}, 32 | {"role": "user", "content": prompt} 33 | ] 34 | response = openai.ChatCompletion.create(model="gpt-4-0613", messages=message, max_tokens=800, request_timeout=60) # Adjust max_tokens as needed 35 | res = response.choices[0]['message']['content'].strip().split('\n') 36 | res = [re.sub(r'\d+. 
', '', cap) for cap in res] 37 | print(res) 38 | results[clsname] = res 39 | 40 | with open(os.path.join(data_path, "vocab_gpt4.json"), "w") as outfile: 41 | json.dump(results, outfile, indent=4) 42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | yacs 3 | opencv-python 4 | tensorboardX 5 | SciPy 6 | fvcore 7 | timm 8 | iopath 9 | git+https://github.com/openai/CLIP.git 10 | transformers 11 | ttach 12 | kornia 13 | scikit-learn 14 | scikit-image 15 | einops 16 | matplotlib 17 | supervision -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from alphaction.config import cfg 6 | from alphaction.dataset import make_data_loader 7 | from alphaction.engine.inference import inference 8 | from alphaction.modeling.detector import build_detection_model, build_naive_baseline 9 | from alphaction.utils.checkpoint import ActionCheckpointer 10 | from torch.utils.collect_env import get_pretty_env_info 11 | from alphaction.utils.comm import synchronize, get_rank 12 | from alphaction.utils.logger import setup_logger 13 | #pytorch issuse #973 14 | import resource 15 | 16 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 17 | resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1], rlimit[1])) 18 | 19 | 20 | def main(): 21 | parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") 22 | parser.add_argument( 23 | "--config-file", 24 | default="", 25 | metavar="FILE", 26 | help="path to config file", 27 | ) 28 | parser.add_argument("--local_rank", type=int, default=0) 29 | parser.add_argument( 30 | "opts", 31 | help="Modify config options using the command-line", 32 | default=None, 33 | nargs=argparse.REMAINDER, 34 | ) 35 | 36 | args = parser.parse_args() 37 | 38 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 39 | distributed = num_gpus > 1 40 | 41 | if distributed: 42 | torch.cuda.set_device(args.local_rank) 43 | torch.distributed.init_process_group( 44 | backend="nccl", init_method="env://" 45 | ) 46 | 47 | # Merge config file. 48 | cfg.merge_from_file(args.config_file) 49 | cfg.merge_from_list(args.opts) 50 | cfg.freeze() 51 | 52 | 53 | # Print experimental infos. 54 | save_dir = "" 55 | logger = setup_logger("alphaction", save_dir, get_rank()) 56 | logger.info("Using {} GPUs".format(num_gpus)) 57 | logger.info(cfg) 58 | 59 | logger.info("Collecting env info (might take some time)") 60 | logger.info("\n" + get_pretty_env_info()) 61 | 62 | # Build the model. 63 | if cfg.MODEL.DET == 'STMDetector': 64 | model = build_detection_model(cfg) 65 | elif cfg.MODEL.DET == 'NaiveBaseline': 66 | model = build_naive_baseline(cfg) 67 | model.to("cuda") 68 | 69 | if cfg.MODEL.DET != 'NaiveBaseline': 70 | # load weight. 
71 | output_dir = cfg.OUTPUT_DIR 72 | checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir) 73 | ckpt_file = os.path.join(output_dir, cfg.MODEL.WEIGHT) if cfg.MODEL.WEIGHT else None 74 | checkpointer.load(ckpt_file) 75 | 76 | output_folders = [None] * len(cfg.DATA.DATASETS) 77 | dataset_names = cfg.DATA.DATASETS 78 | if cfg.OUTPUT_DIR: 79 | for idx, dataset_name in enumerate(dataset_names): 80 | inf_folder = "inference" if not cfg.TEST.SMALL_OPEN_WORLD else "inference_small" 81 | output_folder = os.path.join(cfg.OUTPUT_DIR, inf_folder, dataset_name) 82 | os.makedirs(output_folder, exist_ok=True) 83 | output_folders[idx] = output_folder 84 | 85 | # Do inference. 86 | data_loaders_test, vocabularies_test, _ = make_data_loader(cfg, is_train=False, is_distributed=distributed) 87 | for i, (output_folder, dataset_name, data_loader_test) in enumerate(zip(output_folders, dataset_names, data_loaders_test)): 88 | # set open vocabulary 89 | if len(vocabularies_test) > 0: 90 | model.backbone.text_encoder.set_vocabulary(vocabularies_test[i]) 91 | 92 | inference( 93 | model, 94 | data_loader_test, 95 | dataset_name, 96 | output_folder=output_folder, 97 | metric=cfg.TEST.METRIC, 98 | use_cache=True 99 | ) 100 | synchronize() 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /third_party/eval_utils.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | LIB_PATH=['../alphaction/dataset/datasets/evaluation'] 3 | sys.path.extend(LIB_PATH) 4 | 5 | from pascal_evaluation.object_detection_evaluation import PascalDetectionEvaluator 6 | from pascal_evaluation.standard_fields import InputDataFields, DetectionResultFields 7 | 8 | import pickle 9 | import numpy as np 10 | 11 | 12 | 13 | def load_gt_data(anno_file, split=0): 14 | assert os.path.exists(anno_file), "Annotation file does not exist: {}".format(anno_file) 15 | with open(anno_file, 'rb') as fid: 16 | data = pickle.load(fid, encoding='iso-8859-1') 17 | return data 18 | 19 | 20 | def eval_person_boxes(results, gt_data): 21 | class_id = 1 22 | 23 | pascal_evaluator = PascalDetectionEvaluator([{'id': class_id, 'name': 'person'}], 24 | matching_iou_threshold=0.5) 25 | 26 | # prepare ground truth 27 | for vid, annos in gt_data['gttubes'].items(): 28 | # each video contains only one action type 29 | act_id = list(annos.keys())[0] 30 | act_annos = annos[act_id][0] 31 | height, width = gt_data['resolution'][vid] 32 | # each action type contains only one action box on a frame 33 | for fid_box in act_annos: 34 | img_key = "%s,%04d" % (vid, float(fid_box[0])) 35 | box_normed = fid_box[1:5] / np.array([width, height, width, height], dtype=np.float32) # (xyxy) 36 | box_normed = box_normed[[1, 0, 3, 2]] # (yxyx) 37 | pascal_evaluator.add_single_ground_truth_image_info( 38 | img_key, { 39 | InputDataFields.groundtruth_boxes: box_normed[None], 40 | InputDataFields.groundtruth_classes: np.array([class_id], dtype=int), 41 | InputDataFields.groundtruth_difficult: np.zeros(1, dtype=bool) 42 | }) 43 | 44 | # prepare detection results 45 | for vid, dets in results.items(): 46 | boxes, scores = dets['boxes'], dets['scores'] 47 | frame_ids = list(boxes.keys()) 48 | for fid in frame_ids: 49 | img_key = "%s,%04d" % (vid, float(fid)) 50 | boxes_pred = boxes[fid].copy() 51 | boxes_pred = boxes_pred[:, [1, 0, 3, 2]] 52 | pascal_evaluator.add_single_detected_image_info( 53 | img_key, { 54 | DetectionResultFields.detection_boxes: 
boxes_pred, 55 | DetectionResultFields.detection_classes: np.array([class_id]*len(boxes[fid]), dtype=int), 56 | DetectionResultFields.detection_scores: scores[fid].copy() 57 | }) 58 | 59 | eval_res = pascal_evaluator.evaluate() 60 | 61 | precisions = pascal_evaluator._evaluation.precisions_per_class 62 | recalls = pascal_evaluator._evaluation.recalls_per_class 63 | 64 | return eval_res, precisions, recalls -------------------------------------------------------------------------------- /third_party/maskrcnn_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.ops import box_convert 3 | from video_io import * 4 | from tqdm import tqdm 5 | 6 | 7 | def preprocess_clip(clip, device): 8 | """ clip: (T, H, W, 3) in uint8 format 9 | """ 10 | # preprocess video 11 | clip = torch.from_numpy(clip).to(device).float() / 255.0 12 | clip = clip.permute(0, 3, 1, 2).contiguous() # (T, C, H, W) 13 | return clip 14 | 15 | 16 | def maskrcnn_video(video_path, model, categories, box_thresh=0.35, topk=None, batch_size=16, fmt='%05d.png', device=torch.device('cuda')): 17 | # load video data 18 | if isinstance(video_path, list): 19 | video = read_video_from_list(video_path) 20 | video_name = os.path.dirname(video_path[0]).split("/")[-1] 21 | elif os.path.isfile(video_path): 22 | video = read_video_from_file(video_path) # (T, H, W, C) in RGB uint8 format 23 | video_name = video_path.split("/")[-1][:-4] 24 | else: 25 | video = read_video_from_folder(video_path, fmt=fmt) 26 | video_name = video_path.split("/")[-1] 27 | num_frames = len(video) 28 | 29 | if isinstance(video_path, list): 30 | frame_ids = [int(imgfile[:-4].split("/")[-1].split("_")[-1]) 31 | for imgfile in video_path] 32 | else: 33 | frame_ids = list(range(num_frames)) 34 | 35 | if num_frames > batch_size: 36 | video = np.array_split(video, int(num_frames // batch_size)) 37 | frame_ids = np.array_split(frame_ids, int(num_frames // batch_size)) 38 | else: 39 | video, frame_ids = [video], [frame_ids] 40 | 41 | results = {'boxes': dict(), 'scores': dict()} 42 | for fids, clip in tqdm(zip(frame_ids, video), total=len(video), desc="{}".format(video_name), ncols=0): 43 | # preprocess 44 | height, width = clip.shape[1:3] 45 | batch = preprocess_clip(clip, device) # (T, 3, H, W) 46 | with torch.no_grad(): 47 | outputs = model(batch) 48 | # get results 49 | for i, outs in enumerate(outputs): 50 | mask = outs['labels'] == categories.index('person') 51 | if not any(mask): 52 | continue # no person at all 53 | 54 | if box_thresh is not None: 55 | mask = mask & (outs['scores'] > box_thresh) 56 | if topk is not None: 57 | inds = torch.topk(outs['scores'], topk)[1] 58 | topk_mask = torch.zeros_like(outs['scores'], dtype=torch.bool).scatter_(0, inds, True) 59 | mask = mask & topk_mask 60 | if not any(mask): # no valid person 61 | continue 62 | 63 | # mask out and sort boxes and scores 64 | boxes = outs['boxes'][mask] # the predicted boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H. 
65 | scores = outs['scores'][mask] 66 | idx = torch.argsort(scores, descending=True) 67 | boxes, scores = boxes[idx], scores[idx] 68 | # save 69 | boxes[:, [0, 2]] = boxes[:, [0, 2]] / width 70 | boxes[:, [1, 3]] = boxes[:, [1, 3]] / height 71 | results['boxes'][fids[i]] = boxes.cpu().numpy() # normalized (x1, y1, x2, y2) 72 | results['scores'][fids[i]] = scores.cpu().numpy() 73 | 74 | return results 75 | -------------------------------------------------------------------------------- /third_party/run_maskrcnn.py: -------------------------------------------------------------------------------- 1 | import os, argparse, pickle 2 | from tqdm import tqdm 3 | 4 | import torch 5 | from torchvision.models.detection import maskrcnn_resnet50_fpn 6 | from torchvision.models._meta import _COCO_CATEGORIES 7 | from maskrcnn_utils import maskrcnn_video 8 | from video_io import vis_dets 9 | 10 | from eval_utils import eval_person_boxes, load_gt_data 11 | from pprint import pformat 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | def main(args): 16 | 17 | dataset = args.data.upper() 18 | if args.data == 'jhmdb': 19 | video_dir = f'../data/{dataset}/Frames' 20 | fmt = '%05d.png' 21 | box_thresh, topk = None, 1 22 | 23 | elif args.data == 'ucf24': 24 | video_dir = f'../data/{dataset}/rgb-images' 25 | fmt = '%05d.jpg' 26 | box_thresh, topk = 0.35, None 27 | 28 | else: 29 | raise NotImplementedError(f"Unsupported dataset: {args.data}") 30 | results_save_file = f'../data/{dataset}/{dataset}-MaskRCNN.pkl' 31 | 32 | if not os.path.exists(results_save_file): 33 | # setup device and model 34 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 35 | model = maskrcnn_resnet50_fpn(pretrained=True).to(device) 36 | model.eval() 37 | 38 | all_video_files = [] 39 | for folder in os.listdir(video_dir): 40 | videos_class_path = os.path.join(video_dir, folder) 41 | if not os.path.isdir(videos_class_path): 42 | continue 43 | vid_files = [folder + '/' + vid for vid in os.listdir(videos_class_path) if os.path.isdir(os.path.join(videos_class_path, vid))] 44 | all_video_files.extend(vid_files) 45 | 46 | results = dict() 47 | for vid in tqdm(all_video_files, total=len(all_video_files), ncols=0): 48 | print("\nRunning on the file: {}...".format(vid)) 49 | results[vid] = maskrcnn_video(os.path.join(video_dir, vid), model, _COCO_CATEGORIES, 50 | fmt=fmt, box_thresh=box_thresh, topk=topk, device=device) 51 | 52 | with open(results_save_file, 'wb') as fid: 53 | pickle.dump(results, fid, protocol=pickle.HIGHEST_PROTOCOL) 54 | 55 | else: 56 | with open(results_save_file, 'rb') as fid: 57 | results = pickle.load(fid, encoding='iso-8859-1') 58 | 59 | # evaluation 60 | if args.eval: 61 | # load the ground truth 62 | jhmdb_gt_file = '../data/JHMDB/JHMDB-GT.pkl' 63 | gt_data = load_gt_data(jhmdb_gt_file) 64 | 65 | eval_res, precisions, recalls = eval_person_boxes(results, gt_data) 66 | 67 | print(pformat(eval_res, indent=2)) 68 | 69 | plt.figure(figsize=(10, 6)) 70 | plt.plot(recalls[0], precisions[0], label="Precision-Recall curve") 71 | plt.xlabel("Recall") 72 | plt.ylabel("Precision") 73 | plt.legend(loc="lower left") 74 | plt.tight_layout() 75 | plt.savefig('../temp/jhmdb/precision_recall_curve_maskrcnn.png', bbox_inches='tight') 76 | plt.close() 77 | 78 | # visualize 79 | if args.vis: 80 | test_video = ['kick_ball/FIFA_11_Gamescom-Trailer_kick_ball_f_cm_np1_ba_med_4'] 81 | save_dir = os.path.join(os.path.dirname(results_save_file), 'VisMaskRCNN') 82 | os.makedirs(save_dir, exist_ok=True) 83 | for vid in test_video: 84 | savefile = 
os.path.join(save_dir, vid.replace('/', '-') + "_pred.mp4") 85 | vis_dets(results, vid, video_dir, savefile) 86 | 87 | 88 | if __name__ == '__main__': 89 | 90 | parser = argparse.ArgumentParser(description="Mask RCNN (ResNet-50 FPN) Experiments") 91 | parser.add_argument( 92 | "--data", type=str, default='jhmdb', choices=['jhmdb', 'ucf24'], help="dataset used for testing", 93 | ) 94 | parser.add_argument( 95 | "--vis", action='store_true', help="visualize the detection results", 96 | ) 97 | parser.add_argument( 98 | "--eval", action='store_true', help="evaluate the quality" 99 | ) 100 | args = parser.parse_args() 101 | 102 | main(args) -------------------------------------------------------------------------------- /third_party/video_io.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os 4 | import supervision as sv 5 | 6 | 7 | def read_video_from_file(video_file, toRGB=True): 8 | assert os.path.exists(video_file), "File does not exist! {}".format(video_file) 9 | cap = cv2.VideoCapture(video_file) 10 | success, frame = cap.read() 11 | video = [] 12 | while success: 13 | if toRGB: 14 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 15 | video.append(frame) 16 | success, frame = cap.read() 17 | video = np.array(video) 18 | return video 19 | 20 | 21 | def read_video_from_folder(video_path, fmt='%05d.png', start_frame=1, toRGB=True): 22 | frame_files = [name for name in os.listdir(video_path) if name.endswith(fmt[-4:])] 23 | vid_name = video_path.split("/")[-1] 24 | video = [] 25 | for i in range(len(frame_files)): 26 | if len(fmt.split("_")) == 1: 27 | frame_name = fmt%(i + start_frame) 28 | elif len(fmt.split("_")) == 2: 29 | frame_name = fmt%(vid_name, i + start_frame) 30 | frame_file = os.path.join(video_path, frame_name) # frame starting from 1 31 | frame = cv2.imread(frame_file) 32 | if toRGB: 33 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 34 | video.append(frame) 35 | video = np.array(video) 36 | return video 37 | 38 | 39 | def read_video_from_list(img_list, toRGB=True): 40 | video = [] 41 | for frame_file in img_list: 42 | frame = cv2.imread(frame_file) 43 | if toRGB: 44 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 45 | video.append(frame) 46 | video = np.array(video) 47 | return video 48 | 49 | 50 | def write_video(mat, video_file, fps=30, write_frames=True): 51 | """ mat: (T, H, W, C) 52 | """ 53 | video_writer = cv2.VideoWriter(video_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (mat.shape[2], mat.shape[1])) 54 | for frame in mat: 55 | video_writer.write(frame) 56 | 57 | if write_frames: 58 | os.makedirs(video_file[:-4], exist_ok=True) 59 | for i, frame in enumerate(mat): 60 | cv2.imwrite(os.path.join(video_file[:-4], '%06d.jpg'%(i)), frame) 61 | 62 | 63 | def vis_dets(results, vid, video_dir, savefile): 64 | # read frames 65 | video = read_video_from_folder(os.path.join(video_dir, vid), toRGB=False) # (T, H, W, C) 66 | # parse detections 67 | boxes = results[vid]['boxes'] # normalized (x1, y1, x2, y2) 68 | scores = results[vid]['scores'] 69 | video_vis = [] 70 | # visualize 71 | for i, frame in enumerate(video): 72 | h, w = frame.shape[:2] 73 | xyxy = boxes[i] * np.array([[w, h, w, h]]) 74 | detections = sv.Detections(xyxy=xyxy, confidence=scores[i]) 75 | labels = [f"person {s:.2f}" for s in scores[i]] 76 | # annotate on frame 77 | box_annotator = sv.BoxAnnotator() 78 | annotated_frame = box_annotator.annotate(scene=frame.copy(), detections=detections, labels=labels) 79 | 
video_vis.append(annotated_frame) 80 | video_vis = np.array(video_vis) 81 | # visualize 82 | write_video(video_vis, savefile, fps=20, write_frames=False) -------------------------------------------------------------------------------- /trainval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PHASE=$1 # train, eval 4 | DATASET=$2 # jhmdb, ucf24 5 | 6 | CFG_FILE="config_files/${DATASET}/openmixer_e2e.yaml" 7 | # CFG_FILE="config_files/${DATASET}/openmixer_zsr_tl.yaml" 8 | # CFG_FILE="config_files/${DATASET}/openmixer_zsr_zsl.yaml" # eval-only! 9 | 10 | TEST_WEIGHT=${3:-'checkpoints/model_final.pth'} 11 | 12 | eval "$(conda shell.bash hook)" 13 | conda activate openmixer 14 | 15 | if [ $PHASE == 'train' ] 16 | then 17 | python -m torch.distributed.launch --nproc_per_node=4 --master_port=2024 train_net.py \ 18 | --config-file ${CFG_FILE} \ 19 | --transfer \ 20 | --no-head \ 21 | --use-tfboard 22 | elif [ $PHASE == 'eval' ] 23 | then 24 | python -m torch.distributed.launch --nproc_per_node=4 --master_port=2405 test_net.py \ 25 | --config-file ${CFG_FILE} \ 26 | MODEL.WEIGHT ${TEST_WEIGHT} 27 | fi 28 | 29 | echo "${PHASE} finished!" --------------------------------------------------------------------------------
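A minimal usage sketch for trainval.sh above, assuming a machine with 4 GPUs and a conda environment named openmixer (both expected by the script); the phases, datasets, and default checkpoint path come directly from the script and from test_net.py:

# train the end-to-end OpenMixer model on JHMDB using config_files/jhmdb/openmixer_e2e.yaml
bash trainval.sh train jhmdb

# evaluate a trained checkpoint on UCF24; the optional third argument defaults to
# 'checkpoints/model_final.pth' and is resolved relative to OUTPUT_DIR inside test_net.py
bash trainval.sh eval ucf24 checkpoints/model_final.pth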