├── .flake8 ├── .gitignore ├── DATA.md ├── GETTING_STARTED.md ├── INSTALL.md ├── MODEL_ZOO.md ├── README.md ├── alphaction ├── __init__.py ├── config │ ├── __init__.py │ ├── defaults.py │ └── paths_catalog.py ├── csrc │ ├── ROIAlign3d.h │ ├── ROIPool3d.h │ ├── SigmoidFocalLoss.h │ ├── SoftmaxFocalLoss.h │ ├── cpu │ │ └── vision.h │ ├── cuda │ │ ├── ROIAlign3d_cuda.cu │ │ ├── ROIPool3d_cuda.cu │ │ ├── SigmoidFocalLoss_cuda.cu │ │ ├── SoftmaxFocalLoss_cuda.cu │ │ └── vision.h │ └── vision.cpp ├── dataset │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ava.py │ │ ├── concat_dataset.py │ │ └── evaluation │ │ │ ├── __init__.py │ │ │ └── ava │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── ava_eval.py │ │ │ └── pascal_evaluation │ │ │ ├── __init__.py │ │ │ ├── label_map_util.py │ │ │ ├── metrics.py │ │ │ ├── np_box_list.py │ │ │ ├── np_box_list_ops.py │ │ │ ├── np_box_mask_list.py │ │ │ ├── np_box_mask_list_ops.py │ │ │ ├── np_box_ops.py │ │ │ ├── np_mask_ops.py │ │ │ ├── object_detection_evaluation.py │ │ │ ├── per_image_evaluation.py │ │ │ └── standard_fields.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ ├── grouped_batch_sampler.py │ │ └── iteration_based_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ ├── object_transforms.py │ │ └── video_transforms.py ├── engine │ ├── __init__.py │ ├── inference.py │ └── trainer.py ├── layers │ ├── __init__.py │ ├── batch_norm.py │ ├── roi_align_3d.py │ ├── roi_pool_3d.py │ ├── sigmoid_focal_loss.py │ └── softmax_focal_loss.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── i3d.py │ │ └── slowfast.py │ ├── common_blocks.py │ ├── detector │ │ ├── __init__.py │ │ └── action_detector.py │ ├── nonlocal_block.py │ ├── poolers.py │ ├── registry.py │ ├── roi_heads │ │ ├── __init__.py │ │ ├── action_head │ │ │ ├── IA_structure.py │ │ │ ├── __init__.py │ │ │ ├── action_head.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── metric.py │ │ │ ├── roi_action_feature_extractor.py │ │ │ └── roi_action_predictors.py │ │ └── roi_heads_3d.py │ └── utils.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── structures │ ├── __init__.py │ ├── bounding_box.py │ └── memory_pool.py └── utils │ ├── IA_helper.py │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── comm.py │ ├── logger.py │ ├── metric_logger.py │ ├── model_serialization.py │ ├── random_seed.py │ ├── registry.py │ └── video_decode.py ├── config_files ├── resnet101_8x8f_baseline.yaml ├── resnet101_8x8f_denseserial.yaml ├── resnet50_4x16f_baseline.yaml ├── resnet50_4x16f_denseserial.yaml ├── resnet50_4x16f_parallel.yaml └── resnet50_4x16f_serial.yaml ├── demo ├── README.md ├── Roboto-Bold.ttf ├── action_predictor.py ├── demo.py ├── video_detection_loader.py └── visualizer.py ├── detector ├── __init__.py ├── apis.py ├── nms │ ├── __init__.py │ ├── nms_wrapper.py │ └── src │ │ ├── nms_cpu.cpp │ │ ├── nms_cuda.cpp │ │ ├── nms_kernel.cu │ │ └── soft_nms_cpu.pyx ├── tracker │ ├── README.md │ ├── __init__.py │ ├── cfg │ │ ├── ccmcpe.json │ │ └── yolov3.cfg │ ├── models.py │ ├── preprocess.py │ ├── tracker │ │ ├── __init__.py │ │ ├── basetrack.py │ │ ├── matching.py │ │ └── multitracker.py │ └── utils │ │ ├── __init__.py │ │ ├── datasets.py │ │ ├── evaluation.py │ │ ├── io.py │ │ ├── kalman_filter.py │ │ ├── log.py │ │ ├── nms.py │ │ ├── parse_config.py │ │ ├── timer.py │ │ ├── utils.py │ │ └── visualization.py ├── tracker_api.py ├── 
tracker_cfg.py ├── yolo │ ├── README.md │ ├── __init__.py │ ├── bbox.py │ ├── cam_demo.py │ ├── cfg │ │ ├── tiny-yolo-voc.cfg │ │ ├── yolo-voc.cfg │ │ ├── yolo.cfg │ │ ├── yolov3-spp.cfg │ │ └── yolov3.cfg │ ├── darknet.py │ ├── detect.py │ ├── pallete │ ├── preprocess.py │ ├── util.py │ ├── video_demo.py │ └── video_demo_half.py ├── yolo_api.py └── yolo_cfg.py ├── setup.py ├── test_net.py ├── tools └── ava │ ├── csv2COCO.py │ └── process_ava_videos.py └── train_net.py /.flake8: -------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 3 | 4 | [flake8] 5 | ignore = E203, E266, E501, W503 6 | max-line-length = 80 7 | max-complexity = 18 8 | select = B,C,E,F,W,T4,B9 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # compilation and distribution 2 | __pycache__ 3 | _ext 4 | *.pyc 5 | *.so 6 | *.egg-info/ 7 | build/ 8 | dist/ 9 | 10 | # pytorch/python/numpy formats 11 | *.pth 12 | *.pkl 13 | *.npy 14 | *.pt 15 | 16 | # ipython/jupyter notebooks 17 | *.ipynb 18 | **/.ipynb_checkpoints/ 19 | 20 | # Editor temporaries 21 | *.swn 22 | *.swo 23 | *.swp 24 | *~ 25 | .DS_Store 26 | 27 | # Pycharm editor settings 28 | .idea 29 | 30 | # project dirs 31 | /datasets 32 | /models 33 | /data 34 | /detector/nms/src/soft_nms_cpu.cpp -------------------------------------------------------------------------------- /DATA.md: -------------------------------------------------------------------------------- 1 | ## Data Preparation 2 | 3 | ### Easy Version 4 | 5 | 1. Download the tar.gz file from [[here]](https://pan.baidu.com/s/1UrflK4IgiVbVBOP5fDHdKA) with code `q5v5`. 6 | 7 | 2. run following commands to unzip the file and create a 8 | symbolic link to the extracted files. 9 | 10 | ```bash 11 | tar zxvf AVA_compress.tar.gz -C /some/path/ 12 | cd /path/to/AlphAction/ 13 | mkdir data 14 | ln -s /some/path/AVA data/AVA 15 | ``` 16 | 17 | ### Step-by-step Version 18 | 19 | 1. **Download Annotations.** Donwload AVA Actions annotations from the 20 | [official dataset website](https://research.google.com/ava/download.html). 21 | Organize those annotations file as following structure: 22 | 23 | ``` 24 | AVA/ 25 | |_ annotations/ 26 | | |_ ava_action_list_v2.2.pbtxt 27 | | |_ ava_action_list_v2.2_for_activitynet_2019.pbtxt 28 | | |_ ava_include_timestamps_v2.2.txt 29 | | |_ ava_train_excluded_timestamps_v2.2.csv 30 | | |_ ava_val_excluded_timestamps_v2.2.csv 31 | | |_ ava_train_v2.2.csv 32 | | |_ ava_val_v2.2.csv 33 | ``` 34 | 35 | 2. **Download Videos.** Download the list of training/validation file names 36 | from [CVDF repository](https://github.com/cvdfoundation/ava-dataset) and 37 | download all videos following those links provided there. Place 38 | the list file and video files as follows: 39 | 40 | ``` 41 | AVA/ 42 | |_ annotations/ 43 | | |_ ava_file_names_trainval_v2.1.txt 44 | |_ movies/ 45 | | |_ trainval/ 46 | | | |_ .mp4 47 | | | |_ ... 48 | | | |_ .mp4 49 | ``` 50 | 51 | 3. **Create Symbolic Link.** Create a symbolic link that 52 | references the AVA dataset directory by running following 53 | commands. 54 | 55 | ```shell 56 | cd /path/to/AlphAction 57 | mkdir data 58 | ln -s /path/to/AVA data/AVA 59 | ``` 60 | 61 | 4. **Preprocess Videos.** Running following commands to 62 | process raw movies. 
63 | 64 | ```shell 65 | python tools/process_ava_videos.py \ 66 | --movie_root data/AVA/movies/trainval \ 67 | --clip_root data/AVA/clips/trainval \ 68 | --kframe_root data/AVA/keyframes/trainval \ 69 | --process_num $[`nproc`/2] 70 | ``` 71 | 72 | This script extracts video clips and key frames from 73 | those raw movies. Each video clip lasts exactly one 74 | second and ranges from second 895 to second 1805. 75 | All video clips are scaled such that the shortest side 76 | becomes no larger than 360 and transcoded to have fps 25. 77 | The first frame of each video clip is extracted as key 78 | frame, which follows the definition in AVA dataset. 79 | (Key frames are only used to detect persons and objects.) 80 | The output video clips and key frames will be saved as follows: 81 | 82 | ``` 83 | AVA/ 84 | |_ clips/ 85 | | |_ trainval/ 86 | | | |_ 87 | | | | |_ [895~1805].mp4 88 | | | |_ ... 89 | | | |_ 90 | | | | |_ [895~1805].mp4 91 | |_ keyframes/ 92 | | |_ trainval/ 93 | | | |_ 94 | | | | |_ [895~1805].jpg 95 | | | |_ ... 96 | | | |_ 97 | | | | |_ [895~1805].jpg 98 | ``` 99 | 100 | 5. **Convert Annotations.** Our codes use COCO-style anntations, 101 | so we have to convert official csv annotations into COCO json format 102 | by running following commands. 103 | 104 | ```shell 105 | python tools/csv2COCO.py \ 106 | --csv_path data/AVA/annotations/ava_train_v2.2.csv \ 107 | --movie_list data/AVA/annotations/ava_file_names_trainval_v2.1.txt \ 108 | --img_root data/AVA/keyframes/trainval 109 | python tools/csv2COCO.py \ 110 | --csv_path data/AVA/annotations/ava_val_v2.2.csv \ 111 | --movie_list data/AVA/annotations/ava_file_names_trainval_v2.1.txt \ 112 | --img_root data/AVA/keyframes/trainval 113 | ``` 114 | 115 | The converted json files will be stored in `AVA/annotations` directory 116 | as follows, `*_min.json` means that the json file has no space indent. 117 | 118 | Alternatively, you could just download our json files 119 | here([train](https://drive.google.com/file/d/1BLCMkcnWusaqhHNrDjxzTOMFNT_W-rbr/view?usp=sharing), 120 | [val](https://drive.google.com/file/d/1A9_ywPZA4kr3qM8e27yvxkAF5idcQRme/view?usp=sharing)). 121 | 122 | ``` 123 | AVA/ 124 | |_ annotations/ 125 | | |_ ava_train_v2.2.json 126 | | |_ ava_train_v2.2_min.json 127 | | |_ ava_val_v2.2.json 128 | | |_ ava_val_v2.2_min.json 129 | ``` 130 | 131 | 6. **Detect Persons and Objects.** The predicted person boxes 132 | for AVA validation set can be donwloaded [[here]](https://drive.google.com/file/d/1XnPoJqTVtBVF3XxpFtvDTZQ6EFW8b2S4/view?usp=sharing). 133 | Note that we only use ground truth person boxes for training. 134 | The object boxes files are also available for download([train](https://drive.google.com/file/d/17nH47vH4q9fCs-fs4lQ9QV1POGWzQloh/view?usp=sharing), 135 | [val](https://drive.google.com/file/d/1DcXdaSkwR5Ga50kowe1OEbSQy3AKp57L/view?usp=sharing)). 136 | These files should be placed at following locations. 137 | 138 | ``` 139 | AVA/ 140 | |_ boxes/ 141 | | |_ ava_val_det_person_bbox.json 142 | | |_ ava_train_det_object_bbox.json 143 | | |_ ava_val_det_object_bbox.json 144 | ``` 145 | 146 | For person detector, we first trained it on MSCOCO 147 | keypoint dataset and then fine-tuned it on AVA dataset. 148 | The final model weight is available [[here]](https://drive.google.com/file/d/1T6kx1AJe0IA-aqrpLyeRblq2uYlaopow/view?usp=sharing). 
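    As a quick, optional sanity check after downloading, the snippet below counts the boxes in each JSON file placed under `AVA/boxes/`. It assumes these files are flat COCO-style detection lists whose entries carry `image_id` and `bbox` fields; treat those key names as an illustrative assumption rather than the documented schema.

    ```python
    import json
    from collections import Counter

    # Hypothetical sanity check for the detection-box files placed above.
    # Assumption: each file is a flat COCO-style detection list, i.e. a JSON
    # array of dicts with at least "image_id" and "bbox" ([x, y, w, h]) keys.
    for path in [
        "data/AVA/boxes/ava_val_det_person_bbox.json",
        "data/AVA/boxes/ava_train_det_object_bbox.json",
        "data/AVA/boxes/ava_val_det_object_bbox.json",
    ]:
        with open(path) as f:
            dets = json.load(f)
        per_image = Counter(d["image_id"] for d in dets)
        print(f"{path}: {len(dets)} boxes across {len(per_image)} keyframes")
    ```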
149 | 150 | For object detector, we use the model provided in 151 | [maskrcnn-benchmark repository](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth), 152 | which is trained on MSCOCO dataset. Person boxes are removed 153 | from the predicted results. -------------------------------------------------------------------------------- /GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with AlphAction 2 | 3 | The hyper-parameters of each experiment are controlled by 4 | a .yaml config file, which is located in the directory 5 | `config_files`. All of these configuration files assume 6 | that we are running on 8 GPUs. We need to create a symbolic 7 | link to the directory `output`, where the output (logs and checkpoints) 8 | will be saved. Besides, we recommend to create a directory `models` to place 9 | model weights. These can be done with following commands. 10 | 11 | ```shell 12 | mkdir -p /path/to/output 13 | ln -s /path/to/output data/output 14 | mkdir -p /path/to/models 15 | ln -s /path/to/models data/models 16 | ``` 17 | 18 | ### Training 19 | 20 | Download pre-trained models from [MODEL_ZOO.md](MODEL_ZOO.md#pre-trained-models). 21 | Then place pre-trained models in `data/models` directory with following structure: 22 | 23 | ``` 24 | models/ 25 | |_ pretrained_models/ 26 | | |_ SlowFast-ResNet50-4x16.pth 27 | | |_ SlowFast-ResNet101-8x8.pth 28 | ``` 29 | 30 | To train on a single GPU, you only need to run following command. The 31 | argument `--use-tfboard` enables tensorboard to log training process. 32 | Because the config files assume that we are using 8 GPUs, the global 33 | batch size `SOLVER.VIDEOS_PER_BATCH` and `TEST.VIDEOS_PER_BATCH` can 34 | be too large for a single GPU. Therefore, in the following command, we 35 | modify the batch size and also adjust the learning rate and schedule 36 | length according to the linear scaling rule. 37 | 38 | ```shell 39 | python train_net.py --config-file "path/to/config/file.yaml" \ 40 | --transfer --no-head --use-tfboard \ 41 | SOLVER.BASE_LR 0.000125 \ 42 | SOLVER.STEPS '(560000, 720000)' \ 43 | SOLVER.MAX_ITER 880000 \ 44 | SOLVER.VIDEOS_PER_BATCH 2 \ 45 | TEST.VIDEOS_PER_BATCH 2 46 | ``` 47 | 48 | We use the launch utility `torch.distributed.launch` to launch multiple 49 | processes for distributed training on multiple gpus. `GPU_NUM` should be 50 | replaced by the number of gpus to use. Hyper-parameters in the config file 51 | can still be modified in the way used in single-GPU training. 52 | 53 | ```shell 54 | python -m torch.distributed.launch --nproc_per_node=GPU_NUM \ 55 | train_net.py --config-file "path/to/config/file.yaml" \ 56 | --transfer --no-head --use-tfboard 57 | ``` 58 | 59 | ### Inference 60 | 61 | To do inference on multiple GPUs, you should run the following command. Note that 62 | our code first trys to load the `last_checkpoint` in the `OUTPUT_DIR`. If there 63 | is no such file in `OUTPUT_DIR`, it will then load the model from the 64 | path specified in `MODEL.WEIGHT`. To use `MODEL.WEIGHT` to do the inference, 65 | you need to ensure that there is no `last_checkpoint` in `OUTPUT_DIR`. 66 | You can download the model weights from [MODEL_ZOO.md](MODEL_ZOO.md#ava-models). 
67 | 68 | ```shell 69 | python -m torch.distributed.launch --nproc_per_node=GPU_NUM \ 70 | test_net.py --config-file "path/to/config/file.yaml" \ 71 | MODEL.WEIGHT "path/to/model/weight" 72 | ``` -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | **Requirements** 4 | 5 | - Python >= 3.5 6 | - [Pytorch](https://pytorch.org/) == 1.4.0 (other versions are not tested) 7 | - [PyAV](https://github.com/mikeboers/PyAV) >= 6.2.0 8 | - [yacs](https://github.com/rbgirshick/yacs) 9 | - [OpenCV](https://opencv.org/) 10 | - [tensorboardX](https://github.com/lanpa/tensorboardX) 11 | - [tqdm](https://github.com/tqdm/tqdm) 12 | - [FFmpeg](https://www.ffmpeg.org/) 13 | - [Cython](https://cython.org/), [cython_bbox](https://github.com/samson-wang/cython_bbox), [SciPy](https://scipy.org/scipylib/), [matplotlib](https://matplotlib.org/), [easydict](https://github.com/makinacorpus/easydict) (for running demo) 14 | - Linux + Nvidia GPUs 15 | 16 | We recommend to setup the environment with Anaconda, 17 | the step-by-step installation script is shown below. 18 | 19 | ```bash 20 | conda create -n alphaction python=3.7 21 | conda activate alphaction 22 | 23 | # install pytorch with the same cuda version as in your environment 24 | cuda_version=$(nvcc --version | grep -oP '(?<=release )[\d\.]*?(?=,)') 25 | conda install pytorch=1.4.0 torchvision cudatoolkit=$cuda_version -c pytorch 26 | # you should check manually if you successfully install pytorch here, there may be no such package for some cuda versions. 27 | 28 | conda install av -c conda-forge 29 | conda install cython 30 | 31 | git clone https://github.com/MVIG-SJTU/AlphAction.git 32 | cd AlphAction 33 | pip install -e . # Other dependicies will be installed here 34 | 35 | ``` 36 | -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | ## AlphAction Model Zoo 2 | 3 | ### Pre-trained Models 4 | 5 | We provide backbone models pre-trained on Kinetics dataset, used for further 6 | fine-tuning on AVA dataset. The reported accuracy are obtained by 30-view testing. 
7 | 8 | | backbone | pre-train | frame length | sample rate | top-1 | top-5 | model | 9 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 10 | | SlowFast-R50 | Kinetics-700 | 4 | 16 | 66.34 | 86.66 | [[link]](https://drive.google.com/file/d/1hqFuhD1p0lMpl3Yi5paIGY-hlPTVYgyi/view?usp=sharing) | 11 | | SlowFast-R101 | Kinetics-700 | 8 | 8 | 69.32 | 88.84 | [[link]](https://drive.google.com/file/d/1JDQLyyL-GFd3qi0S31Mdt5oNmUXnyJza/view?usp=sharing) | 12 | 13 | ### AVA Models 14 | 15 | | config | backbone | IA structure | mAP | in paper | model | 16 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 17 | | [resnet50_4x16f_baseline](config_files/resnet50_4x16f_baseline.yaml) | SlowFast-R50-4x16 | w/o | 26.7 | 26.5 | [[link]](https://drive.google.com/file/d/1HmFVEe_wsOP9WUNdU_W7PkgWsHzZtgDf/view?usp=sharing) | 18 | | [resnet50_4x16f_parallel](config_files/resnet50_4x16f_parallel.yaml) | SlowFast-R50-4x16 | Parallel | 29.0 | 28.9 | [[link]](https://drive.google.com/file/d/1CdgwZk6HQGryBssVhE7E48HZ3CCubBg0/view?usp=sharing) | 19 | | [resnet50_4x16f_serial](config_files/resnet50_4x16f_serial.yaml) | SlowFast-R50-4x16 | Serial | 29.8 | 29.6 | [[link]](https://drive.google.com/file/d/1RTUi_ARCtar1r-u7UaTxCaLyJFnVe9Ro/view?usp=sharing) | 20 | | [resnet50_4x16f_denseserial](config_files/resnet50_4x16f_denseserial.yaml) | SlowFast-R50-4x16 | Dense Serial | 30.0 | 29.8 | [[link]](https://drive.google.com/file/d/1bYxGyf6kptfUBNAHtFcG7x4Ryp7mcWxH/view?usp=sharing) | 21 | | [resnet101_8x8f_baseline](config_files/resnet101_8x8f_baseline.yaml) | SlowFast-R101-8x8 | w/o | 29.3 | 29.3 | [[link]](https://drive.google.com/file/d/1oVGRV82iIaxm7XJqAXw7AoDTxFtdqvfv/view?usp=sharing) | 22 | | [resnet101_8x8f_denseserial](config_files/resnet101_8x8f_denseserial.yaml) | SlowFast-R101-8x8 | Dense Serial | 32.4 | 32.3 | [[link]](https://drive.google.com/file/d/1yqqc2_X6Ywi165PIuq68NdTs2WwMygHh/view?usp=sharing) | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AlphAction 2 | 3 | AlphAction aims to detect the actions of multiple persons in videos. It is 4 | **the first open-source project that achieves 30+ mAP (32.4 mAP) with single 5 | model on AVA dataset.** 6 | 7 | This project is the official implementation of paper 8 | [Asynchronous Interaction Aggregation for Action Detection](https://arxiv.org/abs/2004.07485) (**ECCV 2020**), authored 9 | by Jiajun Tang*, Jin Xia* (equal contribution), Xinzhi Mu, [Bo Pang](https://bopang1996.github.io/), 10 | [Cewu Lu](http://mvig.sjtu.edu.cn/) (corresponding author). 11 | 12 |
13 | *Demo GIFs: demo1, demo2, demo3.*
21 | 22 | ## Demo Video 23 | 24 | [![AlphAction demo video](https://user-images.githubusercontent.com/22748802/94115680-a83a1500-fe7c-11ea-878c-536db277fba7.jpg)](https://www.youtube.com/watch?v=TdGmbOJ9hoE "AlphAction demo video") 25 | [[YouTube]](https://www.youtube.com/watch?v=TdGmbOJ9hoE) [[BiliBili]](https://www.bilibili.com/video/BV14A411J7Xv) 26 | 27 | ## Installation 28 | 29 | You need first to install this project, please check [INSTALL.md](INSTALL.md) 30 | 31 | ## Data Preparation 32 | 33 | To do training or inference on AVA dataset, please check [DATA.md](DATA.md) 34 | for data preparation instructions. If you have difficulty accessing Google Drive, you can instead find most files (including models) on Baidu NetDisk([[link]](https://pan.baidu.com/s/1MmYiZ4Vyeznke5_3L4WjYw), code: `smti`). 35 | 36 | ## Model Zoo 37 | 38 | Please see [MODEL_ZOO.md](MODEL_ZOO.md) for downloading models. 39 | 40 | ## Training and Inference 41 | 42 | To do training or inference with AlphAction, please refer to [GETTING_STARTED.md](GETTING_STARTED.md). 43 | 44 | ## Demo Program 45 | 46 | To run the demo program on video or webcam, please check the folder [demo](demo). 47 | We select 15 common categories from the 80 action categories of AVA, and 48 | provide a practical model which achieves high accuracy (about 70 mAP) on these categories. 49 | 50 | ## Acknowledgement 51 | We thankfully acknowledge the computing resource support of Huawei Corporation 52 | for this project. 53 | 54 | ## Citation 55 | 56 | If this project helps you in your research or project, please cite 57 | this paper: 58 | 59 | ``` 60 | @inproceedings{tang2020asynchronous, 61 | title={Asynchronous Interaction Aggregation for Action Detection}, 62 | author={Tang, Jiajun and Xia, Jin and Mu, Xinzhi and Pang, Bo and Lu, Cewu}, 63 | booktitle={Proceedings of the European conference on computer vision (ECCV)}, 64 | year={2020} 65 | } 66 | ``` 67 | -------------------------------------------------------------------------------- /alphaction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/__init__.py -------------------------------------------------------------------------------- /alphaction/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .defaults import _C as cfg 2 | -------------------------------------------------------------------------------- /alphaction/config/paths_catalog.py: -------------------------------------------------------------------------------- 1 | """Centralized catalog of paths.""" 2 | 3 | import os 4 | 5 | 6 | class DatasetCatalog(object): 7 | DATA_DIR = "data" 8 | DATASETS = { 9 | "ava_video_train_v2.2": { 10 | "video_root": "AVA/clips/trainval", 11 | "ann_file": "AVA/annotations/ava_train_v2.2_min.json", 12 | "box_file": "", 13 | "eval_file_paths": { 14 | "csv_gt_file": "AVA/annotations/ava_train_v2.2.csv", 15 | "labelmap_file": "AVA/annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt", 16 | "exclusion_file": "AVA/annotations/ava_train_excluded_timestamps_v2.2.csv", 17 | }, 18 | "object_file": "AVA/boxes/ava_train_det_object_bbox.json", 19 | }, 20 | "ava_video_val_v2.2": { 21 | "video_root": "AVA/clips/trainval", 22 | "ann_file": "AVA/annotations/ava_val_v2.2_min.json", 23 | "box_file": "AVA/boxes/ava_val_det_person_bbox.json", 24 | "eval_file_paths": { 25 | "csv_gt_file": 
"AVA/annotations/ava_val_v2.2.csv", 26 | "labelmap_file": "AVA/annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt", 27 | "exclusion_file": "AVA/annotations/ava_val_excluded_timestamps_v2.2.csv", 28 | }, 29 | "object_file": "AVA/boxes/ava_val_det_object_bbox.json", 30 | }, 31 | } 32 | 33 | @staticmethod 34 | def get(name): 35 | if "ava_video" in name: 36 | data_dir = DatasetCatalog.DATA_DIR 37 | attrs = DatasetCatalog.DATASETS[name] 38 | if attrs["box_file"]=="": 39 | box_file = "" 40 | else: 41 | box_file = os.path.join(data_dir, attrs["box_file"]) 42 | args = dict( 43 | video_root=os.path.join(data_dir, attrs["video_root"]), 44 | ann_file=os.path.join(data_dir, attrs["ann_file"]), 45 | box_file=box_file, 46 | eval_file_paths={key: os.path.join(data_dir, attrs["eval_file_paths"][key]) for key in 47 | attrs["eval_file_paths"]}, 48 | object_file=os.path.join(data_dir, attrs["object_file"]), 49 | ) 50 | return dict( 51 | factory="AVAVideoDataset", 52 | args=args 53 | ) 54 | raise RuntimeError("Dataset not available: {}".format(name)) -------------------------------------------------------------------------------- /alphaction/csrc/ROIAlign3d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor ROIAlign3d_forward(const at::Tensor& input, 11 | const at::Tensor& rois, 12 | const float spatial_scale, 13 | const int pooled_height, 14 | const int pooled_width, 15 | const int sampling_ratio) { 16 | if (input.is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIAlign3d_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIAlign3d_backward(const at::Tensor& grad, 27 | const at::Tensor& rois, 28 | const float spatial_scale, 29 | const int pooled_height, 30 | const int pooled_width, 31 | const int batch_size, 32 | const int channels, 33 | const int length, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign3d_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, length, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /alphaction/csrc/ROIPool3d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | std::tuple ROIPool3d_forward(const at::Tensor& input, 11 | const at::Tensor& rois, 12 | const float spatial_scale, 13 | const int pooled_height, 14 | const int pooled_width) { 15 | if (input.is_cuda()) { 16 | #ifdef WITH_CUDA 17 | return ROIPool3d_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 18 | #else 19 | AT_ERROR("Not compiled with GPU support"); 20 | #endif 21 | } 22 | AT_ERROR("Not implemented on the CPU"); 23 | } 24 | 25 | at::Tensor ROIPool3d_backward(const at::Tensor& grad, 26 | const at::Tensor& input, 27 | const at::Tensor& rois, 28 | const at::Tensor& argmax, 29 | const float spatial_scale, 30 | const int 
pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int length, 35 | const int height, 36 | const int width) { 37 | if (grad.is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool3d_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, length, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } -------------------------------------------------------------------------------- /alphaction/csrc/SigmoidFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const float gamma, 14 | const float alpha) { 15 | if (logits.is_cuda()) { 16 | #ifdef WITH_CUDA 17 | return SigmoidFocalLoss_forward_cuda(logits, targets, gamma, alpha); 18 | #else 19 | AT_ERROR("Not compiled with GPU support"); 20 | #endif 21 | } 22 | AT_ERROR("Not implemented on the CPU"); 23 | } 24 | 25 | at::Tensor SigmoidFocalLoss_backward( 26 | const at::Tensor& logits, 27 | const at::Tensor& targets, 28 | const at::Tensor& d_losses, 29 | const float gamma, 30 | const float alpha) { 31 | if (logits.is_cuda()) { 32 | #ifdef WITH_CUDA 33 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, gamma, alpha); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } -------------------------------------------------------------------------------- /alphaction/csrc/SoftmaxFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | std::tuple SoftmaxFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const float gamma, 14 | const float alpha) { 15 | if (logits.is_cuda()) { 16 | #ifdef WITH_CUDA 17 | return SoftmaxFocalLoss_forward_cuda(logits, targets, gamma, alpha); 18 | #else 19 | AT_ERROR("Not compiled with GPU support"); 20 | #endif 21 | } 22 | AT_ERROR("Not implemented on the CPU"); 23 | } 24 | 25 | at::Tensor SoftmaxFocalLoss_backward( 26 | const at::Tensor& logits, 27 | const at::Tensor& targets, 28 | const at::Tensor& P, 29 | const at::Tensor& d_losses, 30 | const float gamma, 31 | const float alpha) { 32 | if (logits.is_cuda()) { 33 | #ifdef WITH_CUDA 34 | return SoftmaxFocalLoss_backward_cuda(logits, targets, P, d_losses, gamma, alpha); 35 | #else 36 | AT_ERROR("Not compiled with GPU support"); 37 | #endif 38 | } 39 | AT_ERROR("Not implemented on the CPU"); 40 | } -------------------------------------------------------------------------------- /alphaction/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | -------------------------------------------------------------------------------- /alphaction/csrc/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor ROIAlign3d_forward_cuda(const at::Tensor& input, 5 | const at::Tensor& rois, 6 | const float spatial_scale, 7 | const int 
pooled_height, 8 | const int pooled_width, 9 | const int sampling_ratio); 10 | 11 | at::Tensor ROIAlign3d_backward_cuda(const at::Tensor& grad, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int batch_size, 17 | const int channels, 18 | const int length, 19 | const int height, 20 | const int width, 21 | const int sampling_ratio); 22 | 23 | std::tuple ROIPool3d_forward_cuda(const at::Tensor& input, 24 | const at::Tensor& rois, 25 | const float spatial_scale, 26 | const int pooled_height, 27 | const int pooled_width); 28 | 29 | at::Tensor ROIPool3d_backward_cuda(const at::Tensor& grad, 30 | const at::Tensor& input, 31 | const at::Tensor& rois, 32 | const at::Tensor& argmax, 33 | const float spatial_scale, 34 | const int pooled_height, 35 | const int pooled_width, 36 | const int batch_size, 37 | const int channels, 38 | const int length, 39 | const int height, 40 | const int width); 41 | 42 | 43 | at::Tensor SigmoidFocalLoss_forward_cuda( 44 | const at::Tensor& logits, 45 | const at::Tensor& targets, 46 | const float gamma, 47 | const float alpha); 48 | 49 | at::Tensor SigmoidFocalLoss_backward_cuda( 50 | const at::Tensor& logits, 51 | const at::Tensor& targets, 52 | const at::Tensor& d_losses, 53 | const float gamma, 54 | const float alpha); 55 | 56 | std::tuple SoftmaxFocalLoss_forward_cuda( 57 | const at::Tensor& logits, 58 | const at::Tensor& targets, 59 | const float gamma, 60 | const float alpha); 61 | 62 | at::Tensor SoftmaxFocalLoss_backward_cuda( 63 | const at::Tensor& logits, 64 | const at::Tensor& targets, 65 | const at::Tensor& P, 66 | const at::Tensor& d_losses, 67 | const float gamma, 68 | const float alpha); -------------------------------------------------------------------------------- /alphaction/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | #include "ROIAlign3d.h" 2 | #include "ROIPool3d.h" 3 | #include "SoftmaxFocalLoss.h" 4 | #include "SigmoidFocalLoss.h" 5 | 6 | 7 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 8 | m.def("roi_align_3d_forward",&ROIAlign3d_forward, "ROIAlign3d_forward"); 9 | m.def("roi_align_3d_backward",&ROIAlign3d_backward, "ROIAlign3d_backward"); 10 | m.def("roi_pool_3d_forward", &ROIPool3d_forward, "ROIPool3d_forward"); 11 | m.def("roi_pool_3d_backward", &ROIPool3d_backward, "ROIPool3d_backward"); 12 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 13 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 14 | m.def("softmax_focalloss_forward", &SoftmaxFocalLoss_forward, "SoftmaxFocalLoss_forward"); 15 | m.def("softmax_focalloss_backward", &SoftmaxFocalLoss_backward, "SoftmaxFocalLoss_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /alphaction/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_data_loader 2 | -------------------------------------------------------------------------------- /alphaction/dataset/collate_batch.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def batch_different_videos(videos, size_divisible=0): 5 | ''' 6 | :param videos: a list of video tensors 7 | :param size_divisible: output_size(width and height) should be divisble by this param 8 | :return: batched videos as a single tensor 9 | ''' 10 | assert 
isinstance(videos, (tuple, list)) 11 | max_size = tuple(max(s) for s in zip(*[clip.shape for clip in videos])) 12 | 13 | if size_divisible > 0: 14 | stride = size_divisible 15 | max_size = list(max_size) 16 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride) 17 | max_size[3] = int(math.ceil(max_size[3] / stride) * stride) 18 | max_size = tuple(max_size) 19 | 20 | batch_shape = (len(videos),) + max_size 21 | batched_clips = videos[0].new(*batch_shape).zero_() 22 | for clip, pad_clip in zip(videos, batched_clips): 23 | pad_clip[:clip.shape[0], :clip.shape[1], :clip.shape[2], :clip.shape[3]].copy_(clip) 24 | 25 | return batched_clips 26 | 27 | 28 | class BatchCollator(object): 29 | """ 30 | From a list of samples from the dataset, 31 | returns the batched objectimages and targets. 32 | This should be passed to the DataLoader 33 | """ 34 | 35 | def __init__(self, size_divisible=0): 36 | self.divisible = size_divisible 37 | self.size_divisible = self.divisible 38 | 39 | def __call__(self, batch): 40 | transposed_batch = list(zip(*batch)) 41 | slow_clips = batch_different_videos(transposed_batch[0], self.size_divisible) 42 | fast_clips = batch_different_videos(transposed_batch[1], self.size_divisible) 43 | boxes = transposed_batch[2] 44 | objects = transposed_batch[3] 45 | extras = transposed_batch[4] 46 | clip_ids = transposed_batch[5] 47 | return slow_clips, fast_clips, boxes, objects, extras, clip_ids 48 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .concat_dataset import ConcatDataset 2 | from .ava import AVAVideoDataset 3 | 4 | __all__ = ["ConcatDataset", "AVAVideoDataset"] -------------------------------------------------------------------------------- /alphaction/dataset/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | 3 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 4 | 5 | 6 | class ConcatDataset(_ConcatDataset): 7 | """ 8 | Same as torch.utils.dataset.dataset.ConcatDataset, but exposes an extra 9 | method for querying the sizes of the image 10 | """ 11 | 12 | def get_idxs(self, idx): 13 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 14 | if dataset_idx == 0: 15 | sample_idx = idx 16 | else: 17 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 18 | return dataset_idx, sample_idx 19 | 20 | def get_video_info(self, idx): 21 | dataset_idx, sample_idx = self.get_idxs(idx) 22 | return self.datasets[dataset_idx].get_video_info(sample_idx) 23 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from alphaction.dataset import datasets 2 | 3 | from .ava import ava_evaluation 4 | 5 | 6 | def evaluate(dataset, predictions, output_folder, **kwargs): 7 | """evaluate dataset using different methods based on dataset type. 8 | Args: 9 | dataset: Dataset object 10 | predictions(list[BoxList]): each item in the list represents the 11 | prediction results for one image. 12 | output_folder: output folder, to save evaluation files or results. 13 | **kwargs: other args. 
14 | Returns: 15 | evaluation result 16 | """ 17 | args = dict( 18 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 19 | ) 20 | if isinstance(dataset, datasets.AVAVideoDataset): 21 | return ava_evaluation(**args) 22 | else: 23 | dataset_name = dataset.__class__.__name__ 24 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 25 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/README.md: -------------------------------------------------------------------------------- 1 | The evaluation code of AVA is modified from [https://github.com/activitynet/ActivityNet](https://github.com/activitynet/ActivityNet). -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .ava_eval import do_ava_evaluation 3 | 4 | 5 | def ava_evaluation(dataset, predictions, output_folder, **_): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing ava evaluation.") 8 | return do_ava_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | ) 14 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/__init__.py -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | import numpy as np 19 | 20 | 21 | class BoxList(object): 22 | """Box collection. 23 | 24 | BoxList represents a list of bounding boxes as numpy array, where each 25 | bounding box is represented as a row of 4 numbers, 26 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 27 | given list correspond to a single image. 28 | 29 | Optionally, users can add additional related fields (such as 30 | objectness/classification scores). 31 | """ 32 | 33 | def __init__(self, data): 34 | """Constructs box collection. 
35 | 36 | Args: 37 | data: a numpy array of shape [N, 4] representing box coordinates 38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | """ 43 | if not isinstance(data, np.ndarray): 44 | raise ValueError('dataset must be a numpy array.') 45 | if len(data.shape) != 2 or data.shape[1] != 4: 46 | raise ValueError('Invalid dimensions for box dataset.') 47 | if data.dtype != np.float32 and data.dtype != np.float64: 48 | raise ValueError('Invalid dataset type for box dataset: float is required.') 49 | if not self._is_valid_boxes(data): 50 | raise ValueError('Invalid box dataset. dataset must be a numpy array of ' 51 | 'N*[y_min, x_min, y_max, x_max]') 52 | self.data = {'boxes': data} 53 | 54 | def num_boxes(self): 55 | """Return number of boxes held in collections.""" 56 | return self.data['boxes'].shape[0] 57 | 58 | def get_extra_fields(self): 59 | """Return all non-box fields.""" 60 | return [k for k in self.data.keys() if k != 'boxes'] 61 | 62 | def has_field(self, field): 63 | return field in self.data 64 | 65 | def add_field(self, field, field_data): 66 | """Add dataset to a specified field. 67 | 68 | Args: 69 | field: a string parameter used to speficy a related field to be accessed. 70 | field_data: a numpy array of [N, ...] representing the dataset associated 71 | with the field. 72 | Raises: 73 | ValueError: if the field is already exist or the dimension of the field 74 | dataset does not matches the number of boxes. 75 | """ 76 | if self.has_field(field): 77 | raise ValueError('Field ' + field + 'already exists') 78 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 79 | raise ValueError('Invalid dimensions for field dataset') 80 | self.data[field] = field_data 81 | 82 | def get(self): 83 | """Convenience function for accesssing box coordinates. 84 | 85 | Returns: 86 | a numpy array of shape [N, 4] representing box corners 87 | """ 88 | return self.get_field('boxes') 89 | 90 | def get_field(self, field): 91 | """Accesses dataset associated with the specified field in the box collection. 92 | 93 | Args: 94 | field: a string parameter used to speficy a related field to be accessed. 95 | 96 | Returns: 97 | a numpy 1-d array representing dataset of an associated field 98 | 99 | Raises: 100 | ValueError: if invalid field 101 | """ 102 | if not self.has_field(field): 103 | raise ValueError('field {} does not exist'.format(field)) 104 | return self.data[field] 105 | 106 | def get_coordinates(self): 107 | """Get corner coordinates of boxes. 108 | 109 | Returns: 110 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 111 | """ 112 | box_coordinates = self.get() 113 | y_min = box_coordinates[:, 0] 114 | x_min = box_coordinates[:, 1] 115 | y_max = box_coordinates[:, 2] 116 | x_max = box_coordinates[:, 3] 117 | return [y_min, x_min, y_max, x_max] 118 | 119 | def _is_valid_boxes(self, data): 120 | """Check whether dataset fullfills the format of N*[ymin, xmin, ymax, xmin]. 121 | 122 | Args: 123 | data: a numpy array of shape [N, 4] representing box coordinates 124 | 125 | Returns: 126 | a boolean indicating whether all ymax of boxes are equal or greater than 127 | ymin, and all xmax of boxes are equal or greater than xmin. 
128 | """ 129 | if data.shape[0] > 0: 130 | for i in range(data.shape[0]): 131 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 132 | return False 133 | return True 134 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | import numpy as np 19 | from . import np_box_list 20 | 21 | 22 | class BoxMaskList(np_box_list.BoxList): 23 | """Convenience wrapper for BoxList with masks. 24 | 25 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 26 | In particular, its constructor receives both boxes and masks. Note that the 27 | masks correspond to the full image. 28 | """ 29 | 30 | def __init__(self, box_data, mask_data): 31 | """Constructs box collection. 32 | 33 | Args: 34 | box_data: a numpy array of shape [N, 4] representing box coordinates 35 | mask_data: a numpy array of shape [N, height, width] representing masks 36 | with values are in {0,1}. The masks correspond to the full 37 | image. The height and the width will be equal to image height and width. 38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | ValueError: if mask dataset is not a numpy array 43 | ValueError: if invalid dimension for mask dataset 44 | """ 45 | super(BoxMaskList, self).__init__(box_data) 46 | if not isinstance(mask_data, np.ndarray): 47 | raise ValueError('Mask dataset must be a numpy array.') 48 | if len(mask_data.shape) != 3: 49 | raise ValueError('Invalid dimensions for mask dataset.') 50 | if mask_data.dtype != np.uint8: 51 | raise ValueError('Invalid dataset type for mask dataset: uint8 is required.') 52 | if mask_data.shape[0] != box_data.shape[0]: 53 | raise ValueError('There should be the same number of boxes and masks.') 54 | self.data['masks'] = mask_data 55 | 56 | def get_masks(self): 57 | """Convenience function for accessing masks. 58 | 59 | Returns: 60 | a numpy array of shape [N, height, width] representing masks 61 | """ 62 | return self.get_field('masks') 63 | 64 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | 25 | def area(boxes): 26 | """Computes area of boxes. 27 | 28 | Args: 29 | boxes: Numpy array with shape [N, 4] holding N boxes 30 | 31 | Returns: 32 | a numpy array with shape [N*1] representing box areas 33 | """ 34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 35 | 36 | 37 | def intersection(boxes1, boxes2): 38 | """Compute pairwise intersection areas between boxes. 39 | 40 | Args: 41 | boxes1: a numpy array with shape [N, 4] holding N boxes 42 | boxes2: a numpy array with shape [M, 4] holding M boxes 43 | 44 | Returns: 45 | a numpy array with shape [N*M] representing pairwise intersection area 46 | """ 47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 49 | 50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 52 | intersect_heights = np.maximum( 53 | np.zeros(all_pairs_max_ymin.shape), 54 | all_pairs_min_ymax - all_pairs_max_ymin) 55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 57 | intersect_widths = np.maximum( 58 | np.zeros(all_pairs_max_xmin.shape), 59 | all_pairs_min_xmax - all_pairs_max_xmin) 60 | return intersect_heights * intersect_widths 61 | 62 | 63 | def iou(boxes1, boxes2): 64 | """Computes pairwise intersection-over-union between box collections. 65 | 66 | Args: 67 | boxes1: a numpy array with shape [N, 4] holding N boxes. 68 | boxes2: a numpy array with shape [M, 4] holding N boxes. 69 | 70 | Returns: 71 | a numpy array with shape [N, M] representing pairwise iou scores. 72 | """ 73 | intersect = intersection(boxes1, boxes2) 74 | area1 = area(boxes1) 75 | area2 = area(boxes2) 76 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 77 | area2, axis=0) - intersect 78 | return intersect / union 79 | 80 | 81 | def ioa(boxes1, boxes2): 82 | """Computes pairwise intersection-over-area between box collections. 83 | 84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 85 | their intersection area over box2's area. Note that ioa is not symmetric, 86 | that is, IOA(box1, box2) != IOA(box2, box1). 87 | 88 | Args: 89 | boxes1: a numpy array with shape [N, 4] holding N boxes. 90 | boxes2: a numpy array with shape [M, 4] holding N boxes. 91 | 92 | Returns: 93 | a numpy array with shape [N, M] representing pairwise ioa scores. 
94 | """ 95 | intersect = intersection(boxes1, boxes2) 96 | areas = np.expand_dims(area(boxes2), axis=0) 97 | return intersect / areas 98 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | EPSILON = 1e-7 25 | 26 | 27 | def area(masks): 28 | """Computes area of masks. 29 | 30 | Args: 31 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 32 | values are of type np.uint8 and values are in {0,1}. 33 | 34 | Returns: 35 | a numpy array with shape [N*1] representing mask areas. 36 | 37 | Raises: 38 | ValueError: If masks.dtype is not np.uint8 39 | """ 40 | if masks.dtype != np.uint8: 41 | raise ValueError('Masks type should be np.uint8') 42 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 43 | 44 | 45 | def intersection(masks1, masks2): 46 | """Compute pairwise intersection areas between masks. 47 | 48 | Args: 49 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 50 | values are of type np.uint8 and values are in {0,1}. 51 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 52 | values are of type np.uint8 and values are in {0,1}. 53 | 54 | Returns: 55 | a numpy array with shape [N*M] representing pairwise intersection area. 56 | 57 | Raises: 58 | ValueError: If masks1 and masks2 are not of type np.uint8. 59 | """ 60 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 61 | raise ValueError('masks1 and masks2 should be of type np.uint8') 62 | n = masks1.shape[0] 63 | m = masks2.shape[0] 64 | answer = np.zeros([n, m], dtype=np.float32) 65 | for i in np.arange(n): 66 | for j in np.arange(m): 67 | answer[i, j] = np.sum(np.minimum(masks1[i], masks2[j]), dtype=np.float32) 68 | return answer 69 | 70 | 71 | def iou(masks1, masks2): 72 | """Computes pairwise intersection-over-union between mask collections. 73 | 74 | Args: 75 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 76 | values are of type np.uint8 and values are in {0,1}. 77 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 78 | values are of type np.uint8 and values are in {0,1}. 79 | 80 | Returns: 81 | a numpy array with shape [N, M] representing pairwise iou scores. 82 | 83 | Raises: 84 | ValueError: If masks1 and masks2 are not of type np.uint8. 
85 | """ 86 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 87 | raise ValueError('masks1 and masks2 should be of type np.uint8') 88 | intersect = intersection(masks1, masks2) 89 | area1 = area(masks1) 90 | area2 = area(masks2) 91 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 92 | area2, axis=0) - intersect 93 | return intersect / np.maximum(union, EPSILON) 94 | 95 | 96 | def ioa(masks1, masks2): 97 | """Computes pairwise intersection-over-area between box collections. 98 | 99 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 100 | their intersection area over mask2's area. Note that ioa is not symmetric, 101 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 102 | 103 | Args: 104 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 105 | values are of type np.uint8 and values are in {0,1}. 106 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 107 | values are of type np.uint8 and values are in {0,1}. 108 | 109 | Returns: 110 | a numpy array with shape [N, M] representing pairwise ioa scores. 111 | 112 | Raises: 113 | ValueError: If masks1 and masks2 are not of type np.uint8. 114 | """ 115 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 116 | raise ValueError('masks1 and masks2 should be of type np.uint8') 117 | intersect = intersection(masks1, masks2) 118 | areas = np.expand_dims(area(masks2), axis=0) 119 | return intersect / (areas + EPSILON) 120 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 4 | 5 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 6 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Code is copy-pasted exactly as in torch.utils.dataset.distributed. 2 | # FIXME remove this once c10d fixes the bug it has 3 | import math 4 | import torch 5 | import torch.distributed as dist 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class DistributedSampler(Sampler): 10 | """Sampler that restricts dataset loading to a subset of the dataset. 11 | It is especially useful in conjunction with 12 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 13 | process can pass a DistributedSampler instance as a DataLoader sampler, 14 | and load a subset of the original dataset that is exclusive to it. 15 | .. note:: 16 | Dataset is assumed to be of constant size. 17 | Arguments: 18 | dataset: Dataset used for sampling. 19 | num_replicas (optional): Number of processes participating in 20 | distributed training. 21 | rank (optional): Rank of the current process within num_replicas. 
22 | """ 23 | 24 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 25 | if num_replicas is None: 26 | if not dist.is_available(): 27 | raise RuntimeError("Requires distributed package to be available") 28 | num_replicas = dist.get_world_size() 29 | if rank is None: 30 | if not dist.is_available(): 31 | raise RuntimeError("Requires distributed package to be available") 32 | rank = dist.get_rank() 33 | self.dataset = dataset 34 | self.num_replicas = num_replicas 35 | self.rank = rank 36 | self.epoch = 0 37 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 38 | self.total_size = self.num_samples * self.num_replicas 39 | self.shuffle = shuffle 40 | 41 | def __iter__(self): 42 | if self.shuffle: 43 | # deterministically shuffle based on epoch 44 | g = torch.Generator() 45 | g.manual_seed(self.epoch) 46 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 47 | else: 48 | indices = torch.arange(len(self.dataset)).tolist() 49 | 50 | # add extra samples to make it evenly divisible 51 | indices += indices[: (self.total_size - len(indices))] 52 | assert len(indices) == self.total_size 53 | 54 | # subsample 55 | offset = self.num_samples * self.rank 56 | indices = indices[offset : offset + self.num_samples] 57 | assert len(indices) == self.num_samples 58 | 59 | return iter(indices) 60 | 61 | def __len__(self): 62 | return self.num_samples 63 | 64 | def set_epoch(self, epoch): 65 | self.epoch = epoch 66 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Modified based on https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py 2 | import itertools 3 | 4 | import torch 5 | from torch.utils.data.sampler import BatchSampler 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class GroupedBatchSampler(BatchSampler): 10 | """ 11 | Wraps another sampler to yield a mini-batch of indices. 12 | It enforces that elements from the same group should appear in groups of batch_size. 13 | It also tries to provide mini-batches which follows an ordering which is 14 | as close as possible to the ordering from the original sampler. 15 | 16 | Arguments: 17 | sampler (Sampler): Base sampler. 18 | batch_size (int): Size of mini-batch. 19 | drop_uneven (bool): If ``True``, the sampler will drop the batches whose 20 | size is less than ``batch_size`` 21 | 22 | """ 23 | 24 | def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): 25 | if not isinstance(sampler, Sampler): 26 | raise ValueError( 27 | "sampler should be an instance of " 28 | "torch.utils.dataset.Sampler, but got sampler={}".format(sampler) 29 | ) 30 | self.sampler = sampler 31 | self.group_ids = torch.as_tensor(group_ids) 32 | assert self.group_ids.dim() == 1 33 | self.batch_size = batch_size 34 | self.drop_uneven = drop_uneven 35 | 36 | self.groups = torch.unique(self.group_ids).sort(0)[0] 37 | 38 | def _prepare_batches(self): 39 | dataset_size = len(self.group_ids) 40 | # get the sampled indices from the sampler 41 | sampled_ids = torch.as_tensor(list(self.sampler)) 42 | # potentially not all elements of the dataset were sampled 43 | # by the sampler (e.g., DistributedSampler). 
44 | # construct a tensor which contains -1 if the element was 45 | # not sampled, and a non-negative number indicating the 46 | # order where the element was sampled. 47 | # for example. if sampled_ids = [3, 1] and dataset_size = 5, 48 | # the order is [-1, 1, -1, 0, -1] 49 | order = torch.full((dataset_size,), -1, dtype=torch.int64) 50 | order[sampled_ids] = torch.arange(len(sampled_ids)) 51 | 52 | # get a mask with the elements that were sampled 53 | mask = order >= 0 54 | 55 | # find the elements that belong to each individual cluster 56 | clusters = [(self.group_ids == i) & mask for i in self.groups] 57 | # get relative order of the elements inside each cluster 58 | # that follows the order from the sampler 59 | relative_order = [order[cluster] for cluster in clusters] 60 | # with the relative order, find the absolute order in the 61 | # sampled space 62 | permutation_ids = [s[s.sort()[1]] for s in relative_order] 63 | # permute each cluster so that they follow the order from 64 | # the sampler 65 | permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] 66 | 67 | # splits each cluster in batch_size, and merge as a list of tensors 68 | splits = [c.split(self.batch_size) for c in permuted_clusters] 69 | merged = tuple(itertools.chain.from_iterable(splits)) 70 | 71 | # now each batch internally has the right order, but 72 | # they are grouped by clusters. Find the permutation between 73 | # different batches that brings them as close as possible to 74 | # the order that we have in the sampler. For that, we will consider the 75 | # ordering as coming from the first element of each batch, and sort 76 | # correspondingly 77 | first_element_of_batch = [t[0].item() for t in merged] 78 | # get and inverse mapping from sampled indices and the position where 79 | # they occur (as returned by the sampler) 80 | inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} 81 | # from the first element in each batch, get a relative ordering 82 | first_index_of_batch = torch.as_tensor( 83 | [inv_sampled_ids_map[s] for s in first_element_of_batch] 84 | ) 85 | 86 | # permute the batches so that they approximately follow the order 87 | # from the sampler 88 | permutation_order = first_index_of_batch.sort(0)[1].tolist() 89 | # finally, permute the batches 90 | batches = [merged[i].tolist() for i in permutation_order] 91 | 92 | if self.drop_uneven: 93 | kept = [] 94 | for batch in batches: 95 | if len(batch) == self.batch_size: 96 | kept.append(batch) 97 | batches = kept 98 | return batches 99 | 100 | def __iter__(self): 101 | batches = self._prepare_batches() 102 | self._batches = batches 103 | return iter(batches) 104 | 105 | def __len__(self): 106 | if not hasattr(self, "_batches"): 107 | self._batches = self._prepare_batches() 108 | return len(self._batches) 109 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 
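A small sanity check of the `GroupedBatchSampler` completed above (again assuming the `alphaction` package is importable): every emitted batch stays within a single group, while the batch order still tracks the base sampler as closely as possible.

```python
from torch.utils.data.sampler import SequentialSampler
from alphaction.dataset.samplers import GroupedBatchSampler

group_ids = [0, 1, 0, 1, 0, 1, 0, 1]       # e.g. two aspect-ratio buckets
sampler = GroupedBatchSampler(SequentialSampler(range(8)), group_ids, batch_size=2)
print(list(sampler))
# -> [[0, 2], [1, 3], [4, 6], [5, 7]]
# each batch draws from a single group, and the batches are re-ordered so that
# their first elements follow the base sampler's order as closely as possible.
```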
13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /alphaction/dataset/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_transforms, build_object_transforms -------------------------------------------------------------------------------- /alphaction/dataset/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . import video_transforms as T 2 | from . import object_transforms as OT 3 | 4 | 5 | def build_transforms(cfg, is_train=True): 6 | # build transforms for training of testing 7 | if is_train: 8 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 9 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 10 | color_jitter = cfg.INPUT.COLOR_JITTER 11 | flip_prob = 0.5 12 | slow_jitter = cfg.INPUT.SLOW_JITTER 13 | else: 14 | min_size = cfg.INPUT.MIN_SIZE_TEST 15 | max_size = cfg.INPUT.MAX_SIZE_TEST 16 | color_jitter = False 17 | flip_prob = 0 18 | slow_jitter = False 19 | 20 | frame_num = cfg.INPUT.FRAME_NUM 21 | sample_rate = cfg.INPUT.FRAME_SAMPLE_RATE 22 | 23 | if color_jitter: 24 | color_transform = T.ColorJitter( 25 | cfg.INPUT.HUE_JITTER, cfg.INPUT.SAT_JITTER, cfg.INPUT.VAL_JITTER 26 | ) 27 | else: 28 | color_transform = T.Identity() 29 | 30 | to_bgr = cfg.INPUT.TO_BGR 31 | normalize_transform = T.Normalize( 32 | mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr=to_bgr 33 | ) 34 | 35 | tau = cfg.INPUT.TAU 36 | alpha = cfg.INPUT.ALPHA 37 | 38 | transform = T.Compose( 39 | [ 40 | T.TemporalCrop(frame_num, sample_rate), 41 | T.Resize(min_size, max_size), 42 | color_transform, 43 | T.RandomHorizontalFlip(flip_prob), 44 | T.ToTensor(), 45 | normalize_transform, 46 | T.SlowFastCrop(tau, alpha, slow_jitter), 47 | ] 48 | ) 49 | 50 | return transform 51 | 52 | 53 | def build_object_transforms(cfg, is_train=True): 54 | # build transforms for object boxes, should be kept consistent with video transforms. 
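Looking back at the `IterationBasedBatchSampler` completed above: it simply re-iterates the wrapped `BatchSampler` until the requested number of batches has been produced, so training length is controlled by an iteration count rather than by epochs. A minimal check, assuming the package is importable:

```python
from torch.utils.data.sampler import SequentialSampler, BatchSampler
from alphaction.dataset.samplers import IterationBasedBatchSampler

base = BatchSampler(SequentialSampler(range(6)), batch_size=2, drop_last=False)  # 3 batches per pass
it_sampler = IterationBasedBatchSampler(base, num_iterations=5)
print(list(it_sampler))
# -> [[0, 1], [2, 3], [4, 5], [0, 1], [2, 3]]
# the wrapped sampler is cycled until exactly num_iterations batches have been yielded.
```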
55 | if is_train: 56 | flip_prob = 0.5 57 | else: 58 | flip_prob = 0 59 | 60 | transform = OT.Compose([ 61 | OT.PickTop(cfg.MODEL.IA_STRUCTURE.MAX_OBJECT), 62 | OT.Resize(), 63 | OT.RandomHorizontalFlip(flip_prob) 64 | ]) 65 | return transform -------------------------------------------------------------------------------- /alphaction/dataset/transforms/object_transforms.py: -------------------------------------------------------------------------------- 1 | 2 | class Compose(object): 3 | # Class used to compose different kinds of object transforms 4 | def __init__(self, transforms): 5 | self.transforms = transforms 6 | 7 | def __call__(self, object_boxes, transform_randoms): 8 | #should reuse the random varaible in video transforms 9 | for t in self.transforms: 10 | object_boxes = t(object_boxes, transform_randoms) 11 | return object_boxes 12 | 13 | 14 | class PickTop(object): 15 | # pick top scored object boxes. 16 | def __init__(self, top_k): 17 | self.top_k = top_k 18 | 19 | def __call__(self, objects, _): 20 | objects = objects.top_k(self.top_k) 21 | return objects 22 | 23 | 24 | class Resize(object): 25 | def __call__(self, object_boxes, transform_randoms): 26 | # resize according to video transforms 27 | size = transform_randoms["Resize"] 28 | if object_boxes is not None: 29 | object_boxes = object_boxes.resize(size) 30 | return object_boxes 31 | 32 | 33 | class RandomHorizontalFlip(object): 34 | def __init__(self, prob=0.5): 35 | self.prob = prob 36 | 37 | def __call__(self, object_boxes, transform_randoms): 38 | # flip according to video transforms 39 | flip_random = transform_randoms["Flip"] 40 | if flip_random < self.prob: 41 | object_boxes.transpose(0) 42 | return object_boxes 43 | -------------------------------------------------------------------------------- /alphaction/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/engine/__init__.py -------------------------------------------------------------------------------- /alphaction/layers/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_align_3d import ROIAlign3d 4 | from .roi_align_3d import roi_align_3d 5 | from .roi_pool_3d import ROIPool3d 6 | from .roi_pool_3d import roi_pool_3d 7 | from .batch_norm import FrozenBatchNorm1d, FrozenBatchNorm2d, FrozenBatchNorm3d 8 | from .sigmoid_focal_loss import SigmoidFocalLoss 9 | from .softmax_focal_loss import SoftmaxFocalLoss 10 | 11 | __all__ = ["roi_align_3d", "ROIAlign3d", "roi_pool_3d", "ROIPool3d", 12 | "SigmoidFocalLoss", "SoftmaxFocalLoss", "FrozenBatchNorm1d", 13 | "FrozenBatchNorm2d", "FrozenBatchNorm3d", 14 | ] 15 | 16 | -------------------------------------------------------------------------------- /alphaction/layers/batch_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class _FrozenBatchNorm(nn.Module): 6 | def __init__(self, num_features, eps=1e-5, affine=True, track_running_stats=True): 7 | super(_FrozenBatchNorm, self).__init__() 8 | self.num_features = num_features 9 | self.eps = eps 10 | self.affine = affine 11 | self.track_running_stats = track_running_stats 12 | if self.affine: 13 | self.register_buffer("weight", torch.Tensor(num_features)) 14 | self.register_buffer("bias", torch.Tensor(num_features)) 15 | else: 16 | 
self.register_buffer("weight", None) 17 | self.register_buffer("bias", None) 18 | if self.track_running_stats: 19 | self.register_buffer('running_mean', torch.zeros(num_features)) 20 | self.register_buffer('running_var', torch.ones(num_features)) 21 | else: 22 | self.register_parameter('running_mean', None) 23 | self.register_parameter('running_var', None) 24 | self.reset_parameters() 25 | 26 | def reset_running_stats(self): 27 | if self.track_running_stats: 28 | self.running_mean.zero_() 29 | self.running_var.fill_(1) 30 | 31 | def reset_parameters(self): 32 | self.reset_running_stats() 33 | if self.affine: 34 | self.weight.data.uniform_() 35 | self.bias.data.zero_() 36 | 37 | def _check_input_dim(self, input): 38 | raise NotImplementedError 39 | 40 | def forward(self, input): 41 | self._check_input_dim(input) 42 | view_shape = (1, self.num_features) + (1,) * (input.dim() - 2) 43 | 44 | if self.track_running_stats: 45 | scale = self.weight / (self.running_var + self.eps).sqrt() 46 | bias = self.bias - self.running_mean * scale 47 | else: 48 | scale = self.weight 49 | bias = self.bias 50 | 51 | return scale.view(*view_shape) * input + bias.view(*view_shape) 52 | 53 | def extra_repr(self): 54 | return '{num_features}, eps={eps}, affine={affine}, ' \ 55 | 'track_running_stats={track_running_stats}'.format(**self.__dict__) 56 | 57 | def _load_from_state_dict(self, state_dict, prefix, metadata, strict, 58 | missing_keys, unexpected_keys, error_msgs): 59 | num_batches_tracked_key = prefix + 'num_batches_tracked' 60 | if num_batches_tracked_key in state_dict: 61 | del state_dict[num_batches_tracked_key] 62 | super(_FrozenBatchNorm, self)._load_from_state_dict( 63 | state_dict, prefix, metadata, strict, 64 | missing_keys, unexpected_keys, error_msgs) 65 | 66 | 67 | class FrozenBatchNorm1d(_FrozenBatchNorm): 68 | def _check_input_dim(self, input): 69 | if input.dim() != 2 and input.dim() != 3: 70 | raise ValueError('expected 2D or 3D input (got {}D input)' 71 | .format(input.dim())) 72 | 73 | 74 | class FrozenBatchNorm2d(_FrozenBatchNorm): 75 | def _check_input_dim(self, input): 76 | if input.dim() != 4: 77 | raise ValueError('expected 4D input (got {}D input)' 78 | .format(input.dim())) 79 | 80 | 81 | class FrozenBatchNorm3d(_FrozenBatchNorm): 82 | def _check_input_dim(self, input): 83 | if input.dim() != 5: 84 | raise ValueError('expected 5D input (got {}D input)' 85 | .format(input.dim())) 86 | -------------------------------------------------------------------------------- /alphaction/layers/roi_align_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | from torch.nn.modules.utils import _pair 6 | 7 | import alphaction._custom_cuda_ext as _C 8 | 9 | 10 | class _ROIAlign3d(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 13 | ctx.save_for_backward(roi) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.input_shape = input.size() 18 | output = _C.roi_align_3d_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 20 | ) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = 
ctx.sampling_ratio 30 | bs, ch, l, h, w = ctx.input_shape 31 | grad_input = _C.roi_align_3d_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | l, 40 | h, 41 | w, 42 | sampling_ratio, 43 | ) 44 | return grad_input, None, None, None, None 45 | 46 | 47 | roi_align_3d = _ROIAlign3d.apply 48 | 49 | 50 | class ROIAlign3d(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio): 52 | super(ROIAlign3d, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | self.sampling_ratio = sampling_ratio 56 | 57 | def forward(self, input, rois): 58 | return roi_align_3d( 59 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 60 | ) 61 | 62 | def __repr__(self): 63 | tmpstr = self.__class__.__name__ + "(" 64 | tmpstr += "output_size=" + str(self.output_size) 65 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 66 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 67 | tmpstr += ")" 68 | return tmpstr 69 | -------------------------------------------------------------------------------- /alphaction/layers/roi_pool_3d.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.autograd import Function 3 | from torch.autograd.function import once_differentiable 4 | from torch.nn.modules.utils import _pair 5 | 6 | import alphaction._custom_cuda_ext as _C 7 | 8 | 9 | class _ROIPool3d(Function): 10 | @staticmethod 11 | def forward(ctx, input, roi, output_size, spatial_scale): 12 | ctx.output_size = _pair(output_size) 13 | ctx.spatial_scale = spatial_scale 14 | ctx.input_shape = input.size() 15 | output, argmax = _C.roi_pool_3d_forward( 16 | input, roi, spatial_scale, output_size[0], output_size[1] 17 | ) 18 | ctx.save_for_backward(input, roi, argmax) 19 | return output 20 | 21 | @staticmethod 22 | @once_differentiable 23 | def backward(ctx, grad_output): 24 | input, rois, argmax = ctx.saved_tensors 25 | output_size = ctx.output_size 26 | spatial_scale = ctx.spatial_scale 27 | bs, ch, l, h, w = ctx.input_shape 28 | grad_input = _C.roi_pool_3d_backward( 29 | grad_output, 30 | input, 31 | rois, 32 | argmax, 33 | spatial_scale, 34 | output_size[0], 35 | output_size[1], 36 | bs, 37 | ch, 38 | l, 39 | h, 40 | w, 41 | ) 42 | return grad_input, None, None, None 43 | 44 | 45 | roi_pool_3d = _ROIPool3d.apply 46 | 47 | 48 | class ROIPool3d(nn.Module): 49 | def __init__(self, output_size, spatial_scale): 50 | super(ROIPool3d, self).__init__() 51 | self.output_size = output_size 52 | self.spatial_scale = spatial_scale 53 | 54 | def forward(self, input, rois): 55 | return roi_pool_3d(input, rois, self.output_size, self.spatial_scale) 56 | 57 | def __repr__(self): 58 | tmpstr = self.__class__.__name__ + "(" 59 | tmpstr += "output_size=" + str(self.output_size) 60 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 61 | tmpstr += ")" 62 | return tmpstr -------------------------------------------------------------------------------- /alphaction/layers/sigmoid_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | 6 | import alphaction._custom_cuda_ext as _C 7 | 8 | 9 | class _SigmoidFocalLoss(Function): 10 | @staticmethod 11 | def forward(ctx, logits, targets, gamma, alpha): 12 | ctx.save_for_backward(logits, 
targets) 13 | ctx.gamma = gamma 14 | ctx.alpha = alpha 15 | 16 | losses = _C.sigmoid_focalloss_forward( 17 | logits, targets, gamma, alpha 18 | ) 19 | return losses 20 | 21 | @staticmethod 22 | @once_differentiable 23 | def backward(ctx, d_loss): 24 | logits, targets = ctx.saved_tensors 25 | gamma = ctx.gamma 26 | alpha = ctx.alpha 27 | d_logits = _C.sigmoid_focalloss_backward( 28 | logits, targets, d_loss, gamma, alpha 29 | ) 30 | return d_logits, None, None, None 31 | 32 | 33 | def sigmoid_focal_loss(logits, targets, gamma, alpha, reduction='mean'): 34 | assert reduction in ["none", "mean", "sum"], "Unsupported reduction type \"{}\"".format(reduction) 35 | logits = logits.float() 36 | targets = targets.float() 37 | 38 | ret = _SigmoidFocalLoss.apply(logits, targets, gamma, alpha) 39 | if reduction != "none": 40 | ret = torch.mean(ret) if reduction == "mean" else torch.sum(ret) 41 | 42 | return ret 43 | 44 | 45 | class SigmoidFocalLoss(nn.Module): 46 | def __init__(self, gamma, alpha, reduction="mean"): 47 | super(SigmoidFocalLoss, self).__init__() 48 | self.gamma = gamma 49 | self.alpha = alpha 50 | self.reduction = reduction 51 | 52 | def forward(self, logits, targets): 53 | loss = sigmoid_focal_loss(logits, targets, self.gamma, self.alpha, self.reduction) 54 | return loss 55 | 56 | def __repr__(self): 57 | tmpstr = self.__class__.__name__ + "(" 58 | tmpstr += "gamma=" + str(self.gamma) 59 | tmpstr += ", alpha=" + str(self.alpha) 60 | tmpstr += ")" 61 | return tmpstr 62 | -------------------------------------------------------------------------------- /alphaction/layers/softmax_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | 6 | import alphaction._custom_cuda_ext as _C 7 | 8 | 9 | class _SoftmaxFocalLoss(Function): 10 | @staticmethod 11 | def forward(ctx, logits, targets, gamma, alpha): 12 | ctx.gamma = gamma 13 | ctx.alpha = alpha 14 | 15 | losses, P = _C.softmax_focalloss_forward( 16 | logits, targets, gamma, alpha 17 | ) 18 | ctx.save_for_backward(logits, targets, P) 19 | return losses 20 | 21 | @staticmethod 22 | @once_differentiable 23 | def backward(ctx, d_loss): 24 | logits, targets, P = ctx.saved_tensors 25 | gamma = ctx.gamma 26 | alpha = ctx.alpha 27 | d_logits = _C.softmax_focalloss_backward( 28 | logits, targets, P, d_loss, gamma, alpha 29 | ) 30 | return d_logits, None, None, None 31 | 32 | 33 | def softmax_focal_loss(logits, targets, gamma, alpha, reduction='mean'): 34 | assert reduction in ["none", "mean", "sum"], "Unsupported reduction type \"{}\"".format(reduction) 35 | logits = logits.float() 36 | targets = targets.int() 37 | 38 | ret = _SoftmaxFocalLoss.apply(logits, targets, gamma, alpha) 39 | if reduction != "none": 40 | ret = torch.mean(ret) if reduction == "mean" else torch.sum(ret) 41 | 42 | return ret 43 | 44 | 45 | class SoftmaxFocalLoss(nn.Module): 46 | def __init__(self, gamma, alpha, reduction="mean"): 47 | super(SoftmaxFocalLoss, self).__init__() 48 | self.gamma = gamma 49 | self.alpha = alpha 50 | self.reduction = reduction 51 | 52 | def forward(self, logits, targets): 53 | loss = softmax_focal_loss(logits, targets, self.gamma, self.alpha, self.reduction) 54 | return loss 55 | 56 | def __repr__(self): 57 | tmpstr = self.__class__.__name__ + "(" 58 | tmpstr += "gamma=" + str(self.gamma) 59 | tmpstr += ", alpha=" + str(self.alpha) 60 | tmpstr += ")" 61 | return tmpstr 
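Both focal-loss layers above dispatch to the compiled `alphaction._custom_cuda_ext` kernels, so they require the CUDA extension to be built. For reading or CPU-side debugging, the following is a plain-PyTorch sketch of the standard sigmoid focal loss formulation, FL = -alpha_t * (1 - p_t)^gamma * log(p_t); it is a textbook reference, not a bit-exact re-implementation of the kernel (whose exact alpha convention is not visible here).

```python
import torch
import torch.nn.functional as F

def sigmoid_focal_loss_reference(logits, targets, gamma=2.0, alpha=0.25, reduction="mean"):
    """Reference focal loss: FL = -alpha_t * (1 - p_t)**gamma * log(p_t)."""
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)              # probability of the true label
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    loss = alpha_t * (1 - p_t) ** gamma * ce
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss

logits = torch.randn(4, 80)
targets = torch.randint(0, 2, (4, 80)).float()
print(sigmoid_focal_loss_reference(logits, targets))
```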
62 | -------------------------------------------------------------------------------- /alphaction/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/modeling/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone -------------------------------------------------------------------------------- /alphaction/modeling/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from alphaction.modeling import registry 2 | from . import slowfast, i3d 3 | 4 | @registry.BACKBONES.register("Slowfast-Resnet50") 5 | @registry.BACKBONES.register("Slowfast-Resnet101") 6 | def build_slowfast_resnet_backbone(cfg): 7 | model = slowfast.SlowFast(cfg) 8 | return model 9 | 10 | @registry.BACKBONES.register("I3D-Resnet50") 11 | @registry.BACKBONES.register("I3D-Resnet101") 12 | @registry.BACKBONES.register("I3D-Resnet50-Sparse") 13 | @registry.BACKBONES.register("I3D-Resnet101-Sparse") 14 | def build_i3d_resnet_backbone(cfg): 15 | model = i3d.I3D(cfg) 16 | return model 17 | 18 | def build_backbone(cfg): 19 | assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ 20 | "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( 21 | cfg.MODEL.BACKBONE.CONV_BODY 22 | ) 23 | return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) -------------------------------------------------------------------------------- /alphaction/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | from .action_detector import build_detection_model -------------------------------------------------------------------------------- /alphaction/modeling/detector/action_detector.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from ..backbone import build_backbone 4 | from ..roi_heads.roi_heads_3d import build_3d_roi_heads 5 | 6 | 7 | class ActionDetector(nn.Module): 8 | def __init__(self, cfg): 9 | super(ActionDetector, self).__init__() 10 | self.backbone = build_backbone(cfg) 11 | self.roi_heads = build_3d_roi_heads(cfg, self.backbone.dim_out) 12 | 13 | def forward(self, slow_video, fast_video, boxes, objects=None, extras={}, part_forward=-1): 14 | # part_forward is used to split this model into two parts. 15 | # if part_forward<0, just use it as a single model 16 | # if part_forward=0, use this model to extract pooled feature(person and object, no memory features). 17 | # if part_forward=1, use the ia structure to aggregate interactions and give final result. 
18 | # implemented in roi_heads 19 | 20 | if part_forward==1: 21 | slow_features = fast_features = None 22 | else: 23 | slow_features, fast_features = self.backbone(slow_video, fast_video) 24 | 25 | result, detector_losses, loss_weight, detector_metrics = self.roi_heads(slow_features, fast_features, boxes, objects, extras, part_forward) 26 | 27 | if self.training: 28 | return detector_losses, loss_weight, detector_metrics, result 29 | 30 | return result 31 | 32 | def c2_weight_mapping(self): 33 | if not hasattr(self, "c2_mapping"): 34 | weight_map = {} 35 | for name, m_child in self.named_children(): 36 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 37 | child_map = m_child.c2_weight_mapping() 38 | for key, val in child_map.items(): 39 | new_key = name + '.' + key 40 | weight_map[new_key] = val 41 | self.c2_mapping = weight_map 42 | return self.c2_mapping 43 | 44 | def build_detection_model(cfg): 45 | return ActionDetector(cfg) -------------------------------------------------------------------------------- /alphaction/modeling/nonlocal_block.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alphaction.layers import FrozenBatchNorm3d 7 | 8 | 9 | class NLBlock(nn.Module): 10 | def __init__(self, dim_in, dim_out, dim_inner, nl_cfg, group=False): 11 | super(NLBlock, self).__init__() 12 | 13 | self.nl_cfg = nl_cfg.clone() 14 | self.group = group 15 | self.group_size = 4 16 | 17 | init_std = nl_cfg.CONV_INIT_STD 18 | bias = not nl_cfg.NO_BIAS 19 | pool_stride = 2 20 | 21 | self.scale_value = dim_inner ** (-0.5) 22 | self.dim_inner = dim_inner 23 | 24 | self.theta = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 25 | nn.init.normal_(self.theta.weight, std=init_std) 26 | if bias: 27 | nn.init.constant_(self.theta.bias, 0) 28 | 29 | if nl_cfg.USE_MAXPOOL: 30 | self.maxpool = nn.MaxPool3d((1, pool_stride, pool_stride), 31 | stride=(1, pool_stride, pool_stride)) 32 | 33 | self.phi = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 34 | nn.init.normal_(self.phi.weight, std=init_std) 35 | if bias: 36 | nn.init.constant_(self.phi.bias, 0) 37 | 38 | self.g = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 39 | nn.init.normal_(self.g.weight, std=init_std) 40 | if bias: 41 | nn.init.constant_(self.g.bias, 0) 42 | 43 | if nl_cfg.USE_SOFTMAX: 44 | self.softmax = nn.Softmax(dim=2) 45 | 46 | self.out = nn.Conv3d(dim_inner, dim_out, 1, bias=bias) 47 | if nl_cfg.USE_ZERO_INIT_CONV: 48 | nn.init.constant_(self.out.weight, 0) 49 | else: 50 | nn.init.normal_(self.out.weight, std=init_std) 51 | if bias: 52 | nn.init.constant_(self.out.bias, 0) 53 | 54 | if nl_cfg.USE_BN: 55 | if nl_cfg.FROZEN_BN: 56 | self.bn = FrozenBatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON) 57 | else: 58 | self.bn = nn.BatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON, momentum=nl_cfg.BN_MOMENTUM) 59 | nn.init.constant_(self.bn.weight, nl_cfg.BN_INIT_GAMMA) 60 | 61 | def forward(self, x): 62 | if x.dim() != 5: 63 | raise ValueError('expected 4D or 5D input (got {}D input)' 64 | .format(x.dim())) 65 | 66 | if self.group: 67 | x = x.transpose(1, 2) 68 | sz_before_group = list(x.shape) 69 | sz_after_group = sz_before_group.copy() 70 | sz_after_group[0] = -1 71 | sz_after_group[1] = self.group_size 72 | x = x.contiguous().view(*sz_after_group) 73 | x = x.transpose(1, 2) 74 | 75 | batch_size = x.shape[0] 76 | 77 | theta = self.theta(x) 78 | 79 | if 
self.nl_cfg.USE_MAXPOOL: 80 | max_pool = self.maxpool(x) 81 | else: 82 | max_pool = x 83 | 84 | phi = self.phi(max_pool) 85 | 86 | g = self.g(max_pool) 87 | 88 | org_size = theta.size() 89 | mat_size = [batch_size, self.dim_inner, -1] 90 | theta = theta.view(*mat_size) 91 | phi = phi.view(*mat_size) 92 | g = g.view(*mat_size) 93 | 94 | theta_phi = torch.bmm(theta.transpose(1, 2), phi) 95 | 96 | if self.nl_cfg.USE_SOFTMAX: 97 | if self.nl_cfg.USE_SCALE: 98 | theta_phi_sc = theta_phi * self.scale_value 99 | else: 100 | theta_phi_sc = theta_phi 101 | p = self.softmax(theta_phi_sc) 102 | else: 103 | p = theta_phi / theta_phi.shape[-1] 104 | 105 | t = torch.bmm(g, p.transpose(1, 2)) 106 | 107 | t = t.view(org_size) 108 | 109 | out = self.out(t) 110 | 111 | if self.nl_cfg.USE_BN: 112 | out = self.bn(out) 113 | out = out + x 114 | 115 | if self.group: 116 | out = out.transpose(1, 2) 117 | out = out.contiguous().view(*sz_before_group) 118 | out = out.transpose(1, 2) 119 | 120 | return out 121 | 122 | def c2_weight_mapping(self): 123 | weight_map = {} 124 | for name, m_child in self.named_children(): 125 | if m_child.state_dict(): 126 | if isinstance(m_child, (nn.BatchNorm3d, FrozenBatchNorm3d)): 127 | weight_map[name + '.weight'] = '{}_s'.format(name) 128 | weight_map[name + '.running_mean'] = '{}_rm'.format(name) 129 | weight_map[name + '.running_var'] = '{}_riv'.format(name) 130 | elif isinstance(m_child, nn.GroupNorm): 131 | weight_map[name + '.weight'] = '{}_s'.format(name) 132 | else: 133 | weight_map[name + '.weight'] = '{}_w'.format(name) 134 | weight_map[name + '.bias'] = '{}_b'.format(name) 135 | return weight_map 136 | -------------------------------------------------------------------------------- /alphaction/modeling/poolers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from alphaction.layers import ROIAlign3d, ROIPool3d 5 | 6 | 7 | class Pooler3d(nn.Module): 8 | def __init__(self, output_size, scale, sampling_ratio=None, pooler_type='align3d'): 9 | super(Pooler3d, self).__init__() 10 | if pooler_type == 'align3d': 11 | assert sampling_ratio is not None, 'Sampling ratio should be specified for 3d roi align.' 
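Stripped of the optional max-pooling, scaling switches and batch norm, the `NLBlock` forward above is embedded-Gaussian self-attention over all space-time positions. A minimal standalone sketch of that core computation, with illustrative shapes rather than the repo's module:

```python
import torch

B, C, C_inner, T, H, W = 1, 64, 32, 4, 8, 8            # dim_in=64, dim_inner=32
x = torch.randn(B, C, T, H, W)
theta_conv = torch.nn.Conv3d(C, C_inner, 1)
phi_conv = torch.nn.Conv3d(C, C_inner, 1)
g_conv = torch.nn.Conv3d(C, C_inner, 1)
out_conv = torch.nn.Conv3d(C_inner, C, 1)

theta = theta_conv(x).view(B, C_inner, -1)              # [B, C_inner, THW]
phi = phi_conv(x).view(B, C_inner, -1)
g = g_conv(x).view(B, C_inner, -1)

attn = torch.softmax(theta.transpose(1, 2) @ phi * C_inner ** -0.5, dim=-1)  # [B, THW, THW]
y = (g @ attn.transpose(1, 2)).view(B, C_inner, T, H, W)                      # weighted sum of values
out = out_conv(y) + x                                    # 1x1x1 projection + residual, same shape as input
print(out.shape)
```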
12 | self.pooler = ROIAlign3d( 13 | output_size, spatial_scale=scale, sampling_ratio=sampling_ratio 14 | ) 15 | elif pooler_type == 'pooling3d': 16 | self.pooler = ROIPool3d( 17 | output_size, spatial_scale=scale 18 | ) 19 | self.output_size = output_size 20 | 21 | def convert_to_roi_format(self, boxes, dtype, device): 22 | bbox_list = list() 23 | ids_list = list() 24 | for i, b in enumerate(boxes): 25 | if not b: 26 | bbox_list.append(torch.zeros((0, 4), dtype=dtype, device=device)) 27 | ids_list.append(torch.zeros((0, 1), dtype=dtype, device=device)) 28 | else: 29 | bbox_list.append(b.bbox) 30 | ids_list.append(torch.full((len(b), 1), i, dtype=dtype, device=device)) 31 | concat_boxes = torch.cat(bbox_list, dim=0) 32 | ids = torch.cat(ids_list, dim=0) 33 | rois = torch.cat([ids, concat_boxes], dim=1) 34 | 35 | return rois 36 | 37 | def forward(self, x, boxes): 38 | rois = self.convert_to_roi_format(boxes, x.dtype, x.device) 39 | return self.pooler(x, rois) 40 | 41 | 42 | def make_3d_pooler(head_cfg): 43 | resolution = head_cfg.POOLER_RESOLUTION 44 | scale = head_cfg.POOLER_SCALE 45 | sampling_ratio = head_cfg.POOLER_SAMPLING_RATIO 46 | pooler_type = head_cfg.POOLER_TYPE 47 | pooler = Pooler3d( 48 | output_size=(resolution, resolution), 49 | scale=scale, 50 | sampling_ratio=sampling_ratio, 51 | pooler_type=pooler_type, 52 | ) 53 | return pooler -------------------------------------------------------------------------------- /alphaction/modeling/registry.py: -------------------------------------------------------------------------------- 1 | from alphaction.utils.registry import Registry 2 | 3 | BACKBONES = Registry() 4 | ROI_ACTION_FEATURE_EXTRACTORS = Registry() 5 | ROI_ACTION_PREDICTORS = Registry() 6 | INTERACTION_AGGREGATION_STRUCTURES = Registry() -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/modeling/roi_heads/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/modeling/roi_heads/action_head/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/action_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_action_feature_extractor import make_roi_action_feature_extractor 4 | from .roi_action_predictors import make_roi_action_predictor 5 | from .inference import make_roi_action_post_processor 6 | from .loss import make_roi_action_loss_evaluator 7 | from .metric import make_roi_action_accuracy_evaluator 8 | from alphaction.modeling.utils import prepare_pooled_feature 9 | from alphaction.utils.comm import all_reduce 10 | 11 | 12 | class ROIActionHead(torch.nn.Module): 13 | """ 14 | Generic Action Head class. 
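The `convert_to_roi_format` method above flattens the per-clip `BoxList`s into the `[batch_index, x1, y1, x2, y2]` layout that `ROIAlign3d`/`ROIPool3d` consume. Since `BoxList` itself is defined elsewhere, here is a plain-tensor sketch of the same packing:

```python
import torch

# Detected boxes (xyxy) for a batch of two clips, mimicking what
# convert_to_roi_format extracts from the per-clip BoxList objects.
boxes_clip0 = torch.tensor([[10., 20., 60., 90.],
                            [30., 15., 80., 100.]])
boxes_clip1 = torch.tensor([[5., 5., 50., 70.]])

ids = torch.cat([torch.full((len(b), 1), i, dtype=torch.float32)
                 for i, b in enumerate([boxes_clip0, boxes_clip1])])
rois = torch.cat([ids, torch.cat([boxes_clip0, boxes_clip1])], dim=1)
print(rois)   # each row: [batch_index, x1, y1, x2, y2]
```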
15 | """ 16 | 17 | def __init__(self, cfg, dim_in): 18 | super(ROIActionHead, self).__init__() 19 | self.feature_extractor = make_roi_action_feature_extractor(cfg, dim_in) 20 | self.predictor = make_roi_action_predictor(cfg, self.feature_extractor.dim_out) 21 | self.post_processor = make_roi_action_post_processor(cfg) 22 | self.loss_evaluator = make_roi_action_loss_evaluator(cfg) 23 | self.accuracy_evaluator = make_roi_action_accuracy_evaluator(cfg) 24 | self.test_ext = cfg.TEST.EXTEND_SCALE 25 | 26 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 27 | # In training stage, boxes are from gt. 28 | # In testing stage, boxes are detected by human detector and proposals should be 29 | # enlarged boxes. 30 | assert not (self.training and part_forward >= 0) 31 | 32 | if part_forward == 1: 33 | boxes = extras["current_feat_p"] 34 | objects = extras["current_feat_o"] 35 | 36 | if self.training: 37 | proposals = self.loss_evaluator.sample_box(boxes) 38 | else: 39 | proposals = [box.extend(self.test_ext) for box in boxes] 40 | 41 | x, x_pooled, x_objects = self.feature_extractor(slow_features, fast_features, proposals, objects, extras, part_forward) 42 | 43 | if part_forward == 0: 44 | pooled_feature = prepare_pooled_feature(x_pooled, boxes) 45 | if x_objects is None: 46 | object_pooled_feature = None 47 | else: 48 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 49 | return [pooled_feature, object_pooled_feature], {}, {}, {} 50 | 51 | action_logits = self.predictor(x) 52 | 53 | if not self.training: 54 | result = self.post_processor((action_logits,), boxes) 55 | return result, {}, {}, {} 56 | 57 | box_num = action_logits.size(0) 58 | box_num = torch.as_tensor([box_num], dtype=torch.float32, device=action_logits.device) 59 | all_reduce(box_num, average=True) 60 | 61 | loss_dict, loss_weight = self.loss_evaluator( 62 | [action_logits], box_num.item(), 63 | ) 64 | 65 | metric_dict = self.accuracy_evaluator( 66 | [action_logits], proposals, box_num.item(), 67 | ) 68 | 69 | pooled_feature = prepare_pooled_feature(x_pooled, proposals) 70 | if x_objects is None: 71 | object_pooled_feature = [] 72 | else: 73 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 74 | 75 | return ( 76 | [pooled_feature, object_pooled_feature], 77 | loss_dict, 78 | loss_weight, 79 | metric_dict, 80 | ) 81 | 82 | def c2_weight_mapping(self): 83 | weight_map = {} 84 | for name, m_child in self.named_children(): 85 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 86 | child_map = m_child.c2_weight_mapping() 87 | for key, val in child_map.items(): 88 | new_key = name + '.' + key 89 | weight_map[new_key] = val 90 | return weight_map 91 | 92 | 93 | def build_roi_action_head(cfg, dim_in): 94 | return ROIActionHead(cfg, dim_in) 95 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from alphaction.structures.bounding_box import BoxList 6 | 7 | 8 | class PostProcessor(nn.Module): 9 | def __init__(self, pose_action_num): 10 | super(PostProcessor, self).__init__() 11 | self.pose_action_num = pose_action_num 12 | 13 | def forward(self, x, boxes): 14 | # boxes should be (#detections,4) 15 | # prob should be calculated in different way. 
16 | class_logits, = x 17 | pose_action_prob = F.softmax(class_logits[:,:self.pose_action_num],-1) 18 | interaction_action_prob = torch.sigmoid(class_logits[:,self.pose_action_num:]) 19 | 20 | action_prob = torch.cat((pose_action_prob,interaction_action_prob),1) 21 | 22 | image_shapes = [box.size for box in boxes] 23 | boxes_per_image = [len(box) for box in boxes] 24 | box_tensors = [a.bbox for a in boxes] 25 | 26 | action_prob = action_prob.split(boxes_per_image, dim=0) 27 | 28 | results = [] 29 | for prob, boxes_per_image, image_shape in zip( 30 | action_prob, box_tensors, image_shapes 31 | ): 32 | boxlist = self.prepare_boxlist(boxes_per_image, prob, image_shape) 33 | results.append(boxlist) 34 | return results 35 | 36 | def prepare_boxlist(self, boxes, scores, image_shape): 37 | boxlist = BoxList(boxes, image_shape, mode="xyxy") 38 | boxlist.add_field("scores", scores) 39 | return boxlist 40 | 41 | 42 | def make_roi_action_post_processor(cfg): 43 | softmax_num = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 44 | postprocessor = PostProcessor(softmax_num) 45 | return postprocessor 46 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.layers import SigmoidFocalLoss, SoftmaxFocalLoss 3 | from alphaction.modeling.utils import cat 4 | 5 | 6 | class ActionLossComputation(object): 7 | def __init__(self, cfg): 8 | self.proposal_per_clip = cfg.MODEL.ROI_ACTION_HEAD.PROPOSAL_PER_CLIP 9 | self.num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 10 | self.num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 11 | self.num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 12 | 13 | self.weight_dict = dict( 14 | loss_pose_action = cfg.MODEL.ROI_ACTION_HEAD.POSE_LOSS_WEIGHT, 15 | loss_object_interaction = cfg.MODEL.ROI_ACTION_HEAD.OBJECT_LOSS_WEIGHT, 16 | loss_person_interaction = cfg.MODEL.ROI_ACTION_HEAD.PERSON_LOSS_WEIGHT, 17 | ) 18 | 19 | gamma = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.GAMMA 20 | alpha = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.ALPHA 21 | self.sigmoid_focal_loss = SigmoidFocalLoss(gamma, alpha, reduction="none") 22 | self.softmax_focal_loss = SoftmaxFocalLoss(gamma, alpha, reduction="sum") 23 | 24 | def sample_box(self, boxes): 25 | proposals = [] 26 | num_proposals = self.proposal_per_clip 27 | for boxes_per_image in boxes: 28 | num_boxes = len(boxes_per_image) 29 | 30 | if num_boxes > num_proposals: 31 | choice_inds = torch.randperm(num_boxes)[:num_proposals] 32 | proposals_per_image = boxes_per_image[choice_inds] 33 | else: 34 | proposals_per_image = boxes_per_image 35 | proposals_per_image = proposals_per_image.random_aug(0.2, 0.1, 0.1, 0.05) 36 | proposals.append(proposals_per_image) 37 | self._proposals = proposals 38 | return proposals 39 | 40 | def __call__(self, class_logits, avg_box_num): 41 | class_logits = cat(class_logits, dim=0) 42 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 43 | "The shape of tensor class logits doesn't match total number of action classes." 
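The `PostProcessor` shown above treats the first `num_pose` logits as mutually exclusive pose actions (softmax) and the remaining logits as independent multi-label interactions (sigmoid). A standalone sketch with illustrative class counts (the real ones come from the `cfg.MODEL.ROI_ACTION_HEAD.*` settings):

```python
import torch
import torch.nn.functional as F

num_pose, num_interaction = 14, 66                            # illustrative class counts
class_logits = torch.randn(3, num_pose + num_interaction)     # 3 detected persons

pose_prob = F.softmax(class_logits[:, :num_pose], dim=-1)     # mutually exclusive pose actions
interaction_prob = torch.sigmoid(class_logits[:, num_pose:])  # independent multi-label actions
action_prob = torch.cat((pose_prob, interaction_prob), dim=1)
print(action_prob.shape, pose_prob.sum(dim=1))  # the pose block sums to 1 per box, the rest does not
```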
44 | 45 | if not hasattr(self, "_proposals"): 46 | raise RuntimeError("sample_box needs to be called before") 47 | 48 | proposals = self._proposals 49 | 50 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 51 | assert class_logits.shape[1] == labels.shape[1], \ 52 | "The shape of tensor class logits doesn't match the label tensor." 53 | 54 | loss_dict = {} 55 | 56 | if self.num_pose > 0: 57 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 58 | pose_logits = class_logits[:, :self.num_pose] 59 | pose_loss = self.softmax_focal_loss(pose_logits, pose_label) / avg_box_num 60 | loss_dict["loss_pose_action"] = pose_loss 61 | 62 | interaction_label = labels[:, self.num_pose:].to(dtype=torch.float32) 63 | object_label = interaction_label[:, :self.num_object] 64 | person_label = interaction_label[:, self.num_object:] 65 | 66 | interaction_logits = class_logits[:, self.num_pose:] 67 | object_logits = interaction_logits[:, :self.num_object] 68 | person_logits = interaction_logits[:, self.num_object:] 69 | 70 | if self.num_object > 0: 71 | object_loss = self.sigmoid_focal_loss(object_logits, object_label).mean(dim=1).sum() / avg_box_num 72 | loss_dict["loss_object_interaction"] = object_loss 73 | if self.num_person > 0: 74 | person_loss = self.sigmoid_focal_loss(person_logits, person_label).mean(dim=1).sum() / avg_box_num 75 | loss_dict["loss_person_interaction"] = person_loss 76 | 77 | return loss_dict, self.weight_dict 78 | 79 | 80 | def make_roi_action_loss_evaluator(cfg): 81 | loss_evaluator = ActionLossComputation(cfg) 82 | 83 | return loss_evaluator -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.modeling.utils import cat 3 | 4 | 5 | class ActionAccuracyComputation(object): 6 | def __init__(self, num_pose, num_object, num_person): 7 | self.num_pose = num_pose 8 | self.num_object = num_object 9 | self.num_person = num_person 10 | 11 | def logic_iou(self, pred, label): 12 | device = pred.device 13 | 14 | version = torch.__version__ 15 | if eval('.'.join(version.split('.')[:2]))>=1.3: 16 | pred = pred.bool() 17 | label = label.bool() 18 | 19 | label_union = (pred | label).float().sum(dim=1) 20 | label_inter = (pred & label).float().sum(dim=1) 21 | replacer = torch.ones_like(label_union, device=device) 22 | zero_mask = label_union == 0 23 | label_inter = torch.where(zero_mask, replacer, label_inter) 24 | label_union = torch.where(zero_mask, replacer, label_union) 25 | return label_inter / label_union 26 | 27 | def __call__(self, class_logits, proposals, avg_box_num): 28 | class_logits = [logits.detach() for logits in class_logits] 29 | class_logits = cat(class_logits, dim=0) 30 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 31 | "The shape of tensor class logits doesn't match total number of action classes." 
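`logic_iou` above scores multi-label predictions by the Jaccard overlap between the predicted and ground-truth label sets, counting an all-empty row as a perfect match. A small standalone reproduction of that rule:

```python
import torch

# Predicted and ground-truth multi-hot interaction labels for 3 boxes, 5 classes.
pred  = torch.tensor([[1, 0, 1, 0, 0],
                      [0, 0, 0, 0, 0],
                      [1, 1, 0, 0, 0]], dtype=torch.bool)
label = torch.tensor([[1, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0],
                      [1, 1, 0, 0, 0]], dtype=torch.bool)

inter = (pred & label).float().sum(dim=1)
union = (pred | label).float().sum(dim=1)
ones = torch.ones_like(union)
zero_mask = union == 0
iou = torch.where(zero_mask, ones, inter) / torch.where(zero_mask, ones, union)
print(iou)  # tensor([0.5000, 1.0000, 1.0000]); an all-empty row counts as a perfect match
```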
32 | 33 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 34 | 35 | metric_dict = {} 36 | if self.num_pose>0: 37 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 38 | pose_pred = class_logits[:, :self.num_pose].argmax(dim=1) 39 | accuracy_pose_action = pose_label.eq(pose_pred).float().sum() 40 | metric_dict["accuracy_pose_action"] = accuracy_pose_action / avg_box_num 41 | 42 | interaction_label = labels[:, self.num_pose:] 43 | interaction_logits = class_logits[:, self.num_pose:] 44 | interaction_pred = interaction_logits.sigmoid() > 0.5 45 | 46 | if self.num_object>0: 47 | object_label = interaction_label[:, :self.num_object] 48 | object_pred = interaction_pred[:, :self.num_object] 49 | accuracy_object_interaction = self.logic_iou(object_pred, object_label) 50 | metric_dict["accuracy_object_interaction"] = accuracy_object_interaction.sum() / avg_box_num 51 | 52 | if self.num_person>0: 53 | person_label = interaction_label[:, self.num_object:] 54 | person_pred = interaction_pred[:, self.num_object:] 55 | accuracy_person_interaction = self.logic_iou(person_pred, person_label) 56 | metric_dict["accuracy_person_interaction"] = accuracy_person_interaction.sum() / avg_box_num 57 | 58 | return metric_dict 59 | 60 | 61 | def make_roi_action_accuracy_evaluator(cfg): 62 | num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 63 | num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 64 | num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 65 | return ActionAccuracyComputation(num_pose, num_object, num_person) -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/roi_action_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from alphaction.modeling import registry 3 | 4 | 5 | @registry.ROI_ACTION_PREDICTORS.register("FCPredictor") 6 | class FCPredictor(nn.Module): 7 | def __init__(self, config, dim_in): 8 | super(FCPredictor, self).__init__() 9 | 10 | num_classes = config.MODEL.ROI_ACTION_HEAD.NUM_CLASSES 11 | 12 | dropout_rate = config.MODEL.ROI_ACTION_HEAD.DROPOUT_RATE 13 | if dropout_rate > 0: 14 | self.dropout = nn.Dropout(p=dropout_rate, inplace=True) 15 | 16 | self.cls_score = nn.Linear(dim_in, num_classes) 17 | 18 | nn.init.normal_(self.cls_score.weight, std=0.01) 19 | nn.init.constant_(self.cls_score.bias, 0) 20 | 21 | def forward(self, x): 22 | x = x.view(x.size(0), -1) 23 | if hasattr(self, "dropout"): 24 | x = self.dropout(x) 25 | scores = self.cls_score(x) 26 | 27 | return scores 28 | 29 | def c2_weight_mapping(self): 30 | return {"cls_score.weight": "pred_w", 31 | "cls_score.bias": "pred_b"} 32 | 33 | 34 | def make_roi_action_predictor(cfg, dim_in): 35 | func = registry.ROI_ACTION_PREDICTORS[cfg.MODEL.ROI_ACTION_HEAD.PREDICTOR] 36 | return func(cfg, dim_in) 37 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/roi_heads_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .action_head.action_head import build_roi_action_head 4 | 5 | 6 | class Combined3dROIHeads(torch.nn.ModuleDict): 7 | def __init__(self, cfg, heads): 8 | super(Combined3dROIHeads, self).__init__(heads) 9 | self.cfg = cfg.clone() 10 | 11 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 12 | result, loss_action, loss_weight, 
accuracy_action = self.action(slow_features, fast_features, boxes, objects, extras, part_forward) 13 | 14 | return result, loss_action, loss_weight, accuracy_action 15 | 16 | def c2_weight_mapping(self): 17 | weight_map = {} 18 | for name, m_child in self.named_children(): 19 | if m_child.state_dict() and hasattr(m_child,"c2_weight_mapping"): 20 | child_map = m_child.c2_weight_mapping() 21 | for key, val in child_map.items(): 22 | new_key = name + '.' + key 23 | weight_map[new_key] = val 24 | return weight_map 25 | 26 | 27 | def build_3d_roi_heads(cfg, dim_in): 28 | roi_heads = [] 29 | roi_heads.append(("action", build_roi_action_head(cfg, dim_in))) 30 | 31 | if roi_heads: 32 | roi_heads = Combined3dROIHeads(cfg, roi_heads) 33 | 34 | return roi_heads 35 | -------------------------------------------------------------------------------- /alphaction/modeling/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utility functions 3 | """ 4 | 5 | import torch 6 | from alphaction.structures.bounding_box import BoxList 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | 18 | def pad_sequence(sequence, targ_size, padding_value=0): 19 | tensor_size = sequence[0].size() 20 | trailing_dims = tensor_size[1:] 21 | out_dims = (len(sequence), targ_size) + trailing_dims 22 | 23 | out_tensor = sequence[0].new_full(out_dims, padding_value) 24 | for i, tensor in enumerate(sequence): 25 | length = tensor.size(0) 26 | out_tensor[i, :length, ...] = tensor 27 | 28 | return out_tensor 29 | 30 | def prepare_pooled_feature(x_pooled, boxes, detach=True): 31 | image_shapes = [box.size for box in boxes] 32 | boxes_per_image = [len(box) for box in boxes] 33 | box_tensors = [a.bbox for a in boxes] 34 | 35 | if detach: 36 | x_pooled = x_pooled.detach() 37 | pooled_feature = x_pooled.split(boxes_per_image, dim=0) 38 | 39 | boxes_result = [] 40 | for feature_per_image, boxes_per_image, image_shape in zip( 41 | pooled_feature, box_tensors, image_shapes 42 | ): 43 | boxlist = BoxList(boxes_per_image, image_shape, mode="xyxy") 44 | boxlist.add_field("pooled_feature", feature_per_image) 45 | boxes_result.append(boxlist) 46 | return boxes_result -------------------------------------------------------------------------------- /alphaction/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_optimizer 2 | from .build import make_lr_scheduler 3 | from .lr_scheduler import WarmupMultiStepLR 4 | -------------------------------------------------------------------------------- /alphaction/solver/build.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .lr_scheduler import WarmupMultiStepLR, HalfPeriodCosStepLR 4 | 5 | import torch.nn as nn 6 | from alphaction.modeling.roi_heads.action_head.IA_structure import IAStructure 7 | 8 | 9 | def make_optimizer(cfg, model): 10 | params = [] 11 | bn_param_set = set() 12 | transformer_param_set = set() 13 | for name, module in model.named_modules(): 14 | if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): 15 | bn_param_set.add(name + ".weight") 16 | bn_param_set.add(name + ".bias") 17 | elif isinstance(module, IAStructure): 18 | for 
param_name, _ in module.named_parameters(name): 19 | transformer_param_set.add(param_name) 20 | for key, value in model.named_parameters(): 21 | if not value.requires_grad: 22 | continue 23 | lr = cfg.SOLVER.BASE_LR 24 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 25 | if key in bn_param_set: 26 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BN 27 | elif "bias" in key: 28 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 29 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 30 | if key in transformer_param_set: 31 | lr = lr * cfg.SOLVER.IA_LR_FACTOR 32 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 33 | 34 | optimizer = torch.optim.SGD(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM) 35 | return optimizer 36 | 37 | 38 | def make_lr_scheduler(cfg, optimizer): 39 | scheduler = cfg.SOLVER.SCHEDULER 40 | if scheduler not in ("half_period_cosine", "warmup_multi_step"): 41 | raise ValueError('Scheduler not available') 42 | if scheduler == 'warmup_multi_step': 43 | return WarmupMultiStepLR( 44 | optimizer, 45 | cfg.SOLVER.STEPS, 46 | cfg.SOLVER.GAMMA, 47 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 48 | warmup_iters=cfg.SOLVER.WARMUP_ITERS if cfg.SOLVER.WARMUP_ON else 0, 49 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 50 | ) 51 | elif scheduler == 'half_period_cosine': 52 | return HalfPeriodCosStepLR( 53 | optimizer, 54 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 55 | warmup_iters=cfg.SOLVER.WARMUP_ITERS if cfg.SOLVER.WARMUP_ON else 0, 56 | max_iters=cfg.SOLVER.MAX_ITER, 57 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 58 | ) 59 | -------------------------------------------------------------------------------- /alphaction/solver/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/solver/lr_scheduler.py 2 | from bisect import bisect_right 3 | 4 | import torch 5 | import math 6 | 7 | 8 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 9 | def __init__( 10 | self, 11 | optimizer, 12 | milestones, 13 | gamma=0.1, 14 | warmup_factor=1.0 / 3, 15 | warmup_iters=500, 16 | warmup_method="linear", 17 | last_epoch=-1, 18 | ): 19 | if not list(milestones) == sorted(milestones): 20 | raise ValueError( 21 | "Milestones should be a list of" " increasing integers. 
Got {}", 22 | milestones, 23 | ) 24 | 25 | if warmup_method not in ("constant", "linear"): 26 | raise ValueError( 27 | "Only 'constant' or 'linear' warmup_method accepted" 28 | "got {}".format(warmup_method) 29 | ) 30 | self.milestones = milestones 31 | self.gamma = gamma 32 | self.warmup_factor = warmup_factor 33 | self.warmup_iters = warmup_iters 34 | self.warmup_method = warmup_method 35 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 36 | 37 | def get_lr(self): 38 | warmup_factor = 1 39 | if self.last_epoch < self.warmup_iters: 40 | if self.warmup_method == "constant": 41 | warmup_factor = self.warmup_factor 42 | elif self.warmup_method == "linear": 43 | alpha = float(self.last_epoch) / self.warmup_iters 44 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 45 | return [ 46 | base_lr 47 | * warmup_factor 48 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 49 | for base_lr in self.base_lrs 50 | ] 51 | 52 | class HalfPeriodCosStepLR(torch.optim.lr_scheduler._LRScheduler): 53 | def __init__( 54 | self, 55 | optimizer, 56 | warmup_factor=1.0 / 3, 57 | warmup_iters=8000, 58 | max_iters=60000, 59 | warmup_method="linear", 60 | last_epoch=-1, 61 | ): 62 | if warmup_method not in ("constant", "linear"): 63 | raise ValueError( 64 | "Only 'constant' or 'linear' warmup_method accepted" 65 | "got {}".format(warmup_method) 66 | ) 67 | self.warmup_factor = warmup_factor 68 | self.warmup_iters = warmup_iters 69 | self.max_iters = max_iters 70 | self.warmup_method = warmup_method 71 | super(HalfPeriodCosStepLR, self).__init__(optimizer, last_epoch) 72 | 73 | def get_lr(self): 74 | warmup_factor = 1 75 | if self.last_epoch < self.warmup_iters: 76 | if self.warmup_method == "constant": 77 | warmup_factor = self.warmup_factor 78 | elif self.warmup_method == "linear": 79 | alpha = float(self.last_epoch) / self.warmup_iters 80 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 81 | else: 82 | warmup_factor = 0.5 * (math.cos(self.last_epoch / self.max_iters * math.pi) + 1) 83 | return [ 84 | base_lr 85 | * warmup_factor 86 | for base_lr in self.base_lrs 87 | ] -------------------------------------------------------------------------------- /alphaction/structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/structures/__init__.py -------------------------------------------------------------------------------- /alphaction/structures/memory_pool.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | class MemoryPool(object): 4 | def __init__(self): 5 | self.cache = defaultdict(dict) 6 | 7 | def update(self, update_info): 8 | for movie_id, feature_per_movie in update_info.items(): 9 | self.cache[movie_id].update(feature_per_movie) 10 | 11 | def update_list(self, update_info_list): 12 | for update_info in update_info_list: 13 | self.update(update_info) 14 | 15 | def __getitem__(self, item): 16 | if isinstance(item, tuple) and len(item)==2: 17 | return self.cache[item[0]][item[1]] 18 | return self.cache[item] 19 | 20 | def __setitem__(self, key, value): 21 | if isinstance(key, tuple) and len(key)==2: 22 | self.cache[key[0]][key[1]] = value 23 | else: 24 | self.cache[key] = value 25 | 26 | def __delitem__(self, item): 27 | if isinstance(item, tuple) and len(item)==2: 28 | del self.cache[item[0]][item[1]] 29 | else: 30 | del 
self.cache[item] 31 | 32 | def __contains__(self, item): 33 | if isinstance(item, tuple) and len(item)==2: 34 | return (item[0] in self.cache and item[1] in self.cache[item[0]]) 35 | return (item in self.cache) 36 | 37 | def items(self): 38 | return self.cache.items() -------------------------------------------------------------------------------- /alphaction/utils/IA_helper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | def _block_set(ia_blocks): 4 | if len(ia_blocks) > 0 and isinstance(ia_blocks[0], list): 5 | ia_blocks = list(itertools.chain.from_iterable(ia_blocks)) 6 | return ia_blocks 7 | 8 | def has_person(ia_config): 9 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 10 | return (ia_config.ACTIVE and 'P' in ia_blocks and ia_config.MAX_PERSON > 0) 11 | 12 | 13 | def has_object(ia_config): 14 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 15 | return (ia_config.ACTIVE and 'O' in ia_blocks and ia_config.MAX_OBJECT > 0) 16 | 17 | 18 | def has_memory(ia_config): 19 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 20 | return (ia_config.ACTIVE and 'M' in ia_blocks and ia_config.MAX_PER_SEC > 0) 21 | -------------------------------------------------------------------------------- /alphaction/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/utils/__init__.py -------------------------------------------------------------------------------- /alphaction/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import pickle 4 | from collections import OrderedDict 5 | 6 | 7 | def _rename_weights(weights, weight_map): 8 | logger = logging.getLogger(__name__) 9 | logger.info("Remapping C2 weights") 10 | max_c2_key_size = max([len(k) for k in weight_map.values()]) 11 | new_weights = OrderedDict() 12 | for k in weight_map: 13 | c2_name = weight_map[k] 14 | logger.info("C2 name: {: <{}} mapped name: {}".format(c2_name, max_c2_key_size, k)) 15 | if c2_name not in weights: 16 | logger.info("{} not found in C2 weights file, skipped.".format(c2_name)) 17 | continue 18 | v = weights[c2_name] 19 | w = torch.from_numpy(v) 20 | new_weights[k] = w 21 | return new_weights 22 | 23 | 24 | def _load_c2_pickled_weights(file_path): 25 | with open(file_path, "rb") as f: 26 | if torch._six.PY3: 27 | data = pickle.load(f, encoding="latin1") 28 | else: 29 | data = pickle.load(f) 30 | if "blobs" in data: 31 | weights = data["blobs"] 32 | else: 33 | weights = data 34 | return weights 35 | 36 | 37 | def load_c2_format(f, weight_map): 38 | # We also support load from caffe2 weights. 
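The `MemoryPool` above is a two-level cache: the outer key is a movie id and the inner key is presumably the clip timestamp, and both can be addressed at once with a tuple. A quick usage sketch, assuming the package is importable; string placeholders stand in for the pooled-feature `BoxList`s stored in real use.

```python
from alphaction.structures.memory_pool import MemoryPool

pool = MemoryPool()
pool["movie_0001", 902] = "pooled features @ t=902"   # tuple key -> nested dict entry
pool["movie_0001", 903] = "pooled features @ t=903"

print(("movie_0001", 902) in pool)          # True
print(pool["movie_0001"])                   # the whole per-movie cache
pool.update({"movie_0002": {900: "feat"}})  # merge another movie's entries
del pool["movie_0001", 902]
```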
39 | state_dict = _load_c2_pickled_weights(f) 40 | state_dict = _rename_weights(state_dict, weight_map) 41 | return dict(model=state_dict) 42 | -------------------------------------------------------------------------------- /alphaction/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/checkpoint.py 2 | import logging 3 | import os 4 | 5 | import torch 6 | 7 | from alphaction.utils.model_serialization import load_state_dict 8 | from alphaction.utils.c2_model_loading import load_c2_format 9 | from alphaction.structures.memory_pool import MemoryPool 10 | 11 | 12 | class Checkpointer(object): 13 | def __init__( 14 | self, 15 | model, 16 | optimizer=None, 17 | scheduler=None, 18 | save_dir="", 19 | save_to_disk=None, 20 | logger=None, 21 | ): 22 | self.model = model 23 | self.optimizer = optimizer 24 | self.scheduler = scheduler 25 | self.save_dir = save_dir 26 | self.save_to_disk = save_to_disk 27 | if logger is None: 28 | logger = logging.getLogger(__name__) 29 | self.logger = logger 30 | 31 | def save(self, name, **kwargs): 32 | if not self.save_dir: 33 | return 34 | 35 | if not self.save_to_disk: 36 | return 37 | 38 | data = {} 39 | data["model"] = self.model.state_dict() 40 | if self.optimizer is not None: 41 | data["optimizer"] = self.optimizer.state_dict() 42 | if self.scheduler is not None: 43 | data["scheduler"] = self.scheduler.state_dict() 44 | data.update(kwargs) 45 | 46 | save_file = os.path.join(self.save_dir, "{}.pth".format(name)) 47 | self.logger.info("Saving checkpoint to {}".format(save_file)) 48 | torch.save(data, save_file) 49 | self.tag_last_checkpoint(save_file) 50 | 51 | def load(self, f=None, model_weight_only=False, adjust_scheduler=False, no_head=False): 52 | if self.has_checkpoint(): 53 | # override argument with existing checkpoint 54 | f = self.get_checkpoint_file() 55 | if not f: 56 | # no checkpoint could be found 57 | self.logger.info("No checkpoint found. 
Initializing model from scratch") 58 | return {} 59 | self.logger.info("Loading checkpoint from {}".format(f)) 60 | checkpoint = self._load_file(f) 61 | self._load_model(checkpoint, no_head) 62 | if "optimizer" in checkpoint and self.optimizer: 63 | if model_weight_only: 64 | del checkpoint['optimizer'] 65 | else: 66 | self.logger.info("Loading optimizer from {}".format(f)) 67 | self.optimizer.load_state_dict(checkpoint.pop("optimizer")) 68 | if "scheduler" in checkpoint and self.scheduler: 69 | if model_weight_only: 70 | del checkpoint['scheduler'] 71 | elif adjust_scheduler: 72 | last_epoch = checkpoint.pop("scheduler")['last_epoch'] 73 | self.logger.info("Adjust scheduler at iteration {}".format(last_epoch)) 74 | self.scheduler.step(last_epoch) 75 | else: 76 | self.logger.info("Loading scheduler from {}".format(f)) 77 | self.scheduler.load_state_dict(checkpoint.pop("scheduler")) 78 | 79 | if model_weight_only: 80 | checkpoint["iteration"] = 0 81 | checkpoint["person_pool"] = MemoryPool() 82 | # return any further checkpoint dataset 83 | return checkpoint 84 | 85 | def has_checkpoint(self): 86 | save_file = os.path.join(self.save_dir, "last_checkpoint") 87 | return os.path.exists(save_file) 88 | 89 | def get_checkpoint_file(self): 90 | save_file = os.path.join(self.save_dir, "last_checkpoint") 91 | try: 92 | with open(save_file, "r") as f: 93 | last_saved = f.read() 94 | last_saved = last_saved.strip() 95 | except IOError: 96 | # if file doesn't exist, maybe because it has just been 97 | # deleted by a separate process 98 | last_saved = "" 99 | return last_saved 100 | 101 | def tag_last_checkpoint(self, last_filename): 102 | save_file = os.path.join(self.save_dir, "last_checkpoint") 103 | with open(save_file, "w") as f: 104 | f.write(last_filename) 105 | 106 | def _load_file(self, f): 107 | return torch.load(f, map_location=torch.device("cpu")) 108 | 109 | def _load_model(self, checkpoint, no_head): 110 | load_state_dict(self.model, checkpoint.pop("model"), no_head) 111 | 112 | 113 | class ActionCheckpointer(Checkpointer): 114 | def __init__( 115 | self, 116 | cfg, 117 | model, 118 | optimizer=None, 119 | scheduler=None, 120 | save_dir="", 121 | save_to_disk=None, 122 | logger=None, 123 | ): 124 | super(ActionCheckpointer, self).__init__( 125 | model, optimizer, scheduler, save_dir, save_to_disk, logger 126 | ) 127 | self.cfg = cfg.clone() 128 | 129 | def _load_file(self, f): 130 | if f.endswith(".pkl"): 131 | return load_c2_format(f, self._get_c2_weight_map()) 132 | loaded = super(ActionCheckpointer, self)._load_file(f) 133 | if "model" not in loaded: 134 | loaded = dict(model=loaded) 135 | return loaded 136 | 137 | def _get_c2_weight_map(self): 138 | if hasattr(self.model, "c2_weight_mapping"): 139 | return self.model.c2_weight_mapping() 140 | elif hasattr(self.model, "module") and hasattr(self.model.module, "c2_weight_mapping"): 141 | return self.model.module.c2_weight_mapping() 142 | else: 143 | raise RuntimeError("Cannot get C2 weight mapping from current model definition.") -------------------------------------------------------------------------------- /alphaction/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/logger.py 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | 8 | def setup_logger(name, save_dir, distributed_rank, filename=None): 9 | logger = logging.getLogger(name) 10 | 
logger.setLevel(logging.DEBUG) 11 | logger.propagate = False 12 | # don't log results for the non-master process 13 | if distributed_rank > 0: 14 | return logger 15 | ch = logging.StreamHandler(stream=sys.stdout) 16 | ch.setLevel(logging.DEBUG) 17 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 18 | ch.setFormatter(formatter) 19 | logger.addHandler(ch) 20 | 21 | if save_dir: 22 | if filename is None: 23 | filename = time.strftime("%Y-%m-%d_%H.%M.%S", time.localtime()) + ".log" 24 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 25 | fh.setLevel(logging.DEBUG) 26 | fh.setFormatter(formatter) 27 | logger.addHandler(fh) 28 | 29 | return logger 30 | 31 | def setup_tblogger(save_dir, distributed_rank): 32 | if distributed_rank>0: 33 | return None 34 | from tensorboardX import SummaryWriter 35 | tbdir = os.path.join(save_dir,'tb') 36 | os.makedirs(tbdir,exist_ok=True) 37 | tblogger = SummaryWriter(tbdir) 38 | return tblogger -------------------------------------------------------------------------------- /alphaction/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/metric_logger.py 2 | from collections import defaultdict 3 | from collections import deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue(object): 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global series average. 11 | """ 12 | 13 | def __init__(self, window_size=20): 14 | self.deque = deque(maxlen=window_size) 15 | self.series = [] 16 | self.total = 0.0 17 | self.count = 0 18 | 19 | def update(self, value): 20 | self.deque.append(value) 21 | self.series.append(value) 22 | self.count += 1 23 | self.total += value 24 | 25 | @property 26 | def median(self): 27 | d = torch.tensor(list(self.deque)) 28 | return d.median().item() 29 | 30 | @property 31 | def avg(self): 32 | d = torch.tensor(list(self.deque)) 33 | return d.mean().item() 34 | 35 | @property 36 | def global_avg(self): 37 | return self.total / self.count 38 | 39 | 40 | class MetricLogger(object): 41 | def __init__(self, delimiter="\t"): 42 | self.meters = defaultdict(SmoothedValue) 43 | self.delimiter = delimiter 44 | 45 | def update(self, **kwargs): 46 | for k, v in kwargs.items(): 47 | if isinstance(v, torch.Tensor): 48 | v = v.item() 49 | assert isinstance(v, (float, int)) 50 | self.meters[k].update(v) 51 | 52 | def __getattr__(self, attr): 53 | if attr in self.meters: 54 | return self.meters[attr] 55 | if attr in self.__dict__: 56 | return self.__dict__[attr] 57 | raise AttributeError("'{}' object has no attribute '{}'".format( 58 | type(self).__name__, attr)) 59 | 60 | def __str__(self): 61 | loss_str = [] 62 | for name, meter in self.meters.items(): 63 | loss_str.append( 64 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 65 | ) 66 | return self.delimiter.join(loss_str) 67 | -------------------------------------------------------------------------------- /alphaction/utils/model_serialization.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/model_serialization.py 2 | from collections import OrderedDict 3 | import logging 4 | 5 | import torch 6 | 7 | 8 | def align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head): 9 | """ 10 
| Strategy: suppose that the models that we will create will have prefixes appended 11 | to each of its keys, for example due to an extra level of nesting that the original 12 | pre-trained weights from ImageNet won't contain. For example, model.state_dict() 13 | might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains 14 | res2.conv1.weight. We thus want to match both parameters together. 15 | For that, we look for each model weight, look among all loaded keys if there is one 16 | that is a suffix of the current weight name, and use it if that's the case. 17 | If multiple matches exist, take the one with longest size 18 | of the corresponding name. For example, for the same model as before, the pretrained 19 | weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, 20 | we want to match backbone[0].body.conv1.weight to conv1.weight, and 21 | backbone[0].body.res2.conv1.weight to res2.conv1.weight. 22 | """ 23 | current_keys = sorted(list(model_state_dict.keys())) 24 | loaded_keys = sorted(list(loaded_state_dict.keys())) 25 | # get a matrix of string matches, where each (i, j) entry correspond to the size of the 26 | # loaded_key string, if it matches 27 | match_matrix = [ 28 | len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys 29 | ] 30 | match_matrix = torch.as_tensor(match_matrix).view( 31 | len(current_keys), len(loaded_keys) 32 | ) 33 | max_match_size, idxs = match_matrix.max(1) 34 | # remove indices that correspond to no-match 35 | idxs[max_match_size == 0] = -1 36 | 37 | # used for logging 38 | max_size = max([len(key) for key in current_keys]) if current_keys else 1 39 | max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 40 | log_str_template = "{: <{}} loaded from {: <{}} of shape {}" 41 | logger = logging.getLogger(__name__) 42 | for idx_new, idx_old in enumerate(idxs.tolist()): 43 | if idx_old == -1: 44 | continue 45 | key = current_keys[idx_new] 46 | key_old = loaded_keys[idx_old] 47 | 48 | if no_head and key_old.startswith("roi_heads."): 49 | logger.info("{} will not be loaded.".format(key)) 50 | continue 51 | 52 | model_state_dict[key] = loaded_state_dict[key_old] 53 | logger.info( 54 | log_str_template.format( 55 | key, 56 | max_size, 57 | key_old, 58 | max_size_loaded, 59 | tuple(loaded_state_dict[key_old].shape), 60 | ) 61 | ) 62 | 63 | 64 | def strip_prefix_if_present(state_dict, prefix): 65 | keys = sorted(state_dict.keys()) 66 | if not all(key.startswith(prefix) for key in keys): 67 | return state_dict 68 | stripped_state_dict = OrderedDict() 69 | for key, value in state_dict.items(): 70 | stripped_state_dict[key.replace(prefix, "")] = value 71 | return stripped_state_dict 72 | 73 | 74 | def load_state_dict(model, loaded_state_dict, no_head): 75 | model_state_dict = model.state_dict() 76 | # if the state_dict comes from a model that was wrapped in a 77 | # DataParallel or DistributedDataParallel during serialization, 78 | # remove the "module" prefix before performing the matching 79 | loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") 80 | align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head) 81 | 82 | # use strict loading 83 | model.load_state_dict(model_state_dict) 84 | -------------------------------------------------------------------------------- /alphaction/utils/random_seed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import 
numpy as np 4 | 5 | def set_seed(seed, rank, world_size): 6 | rng = random.Random(seed) 7 | seed_per_rank = [rng.randint(0, 2**32-1) for _ in range(world_size)] 8 | cur_seed = seed_per_rank[rank] 9 | random.seed(cur_seed) 10 | torch.manual_seed(cur_seed) 11 | torch.cuda.manual_seed(cur_seed) 12 | np.random.seed(cur_seed) -------------------------------------------------------------------------------- /alphaction/utils/registry.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/registry.py 2 | 3 | def _register_generic(module_dict, module_name, module): 4 | assert module_name not in module_dict 5 | module_dict[module_name] = module 6 | 7 | 8 | class Registry(dict): 9 | ''' 10 | A helper class for managing registering modules, it extends a dictionary 11 | and provides a register functions. 12 | 13 | Eg. creeting a registry: 14 | some_registry = Registry({"default": default_module}) 15 | 16 | There're two ways of registering new modules: 17 | 1): normal way is just calling register function: 18 | def foo(): 19 | ... 20 | some_registry.register("foo_module", foo) 21 | 2): used as decorator when declaring the module: 22 | @some_registry.register("foo_module") 23 | @some_registry.register("foo_modeul_nickname") 24 | def foo(): 25 | ... 26 | 27 | Access of module is just like using a dictionary, eg: 28 | f = some_registry["foo_modeul"] 29 | ''' 30 | def __init__(self, *args, **kwargs): 31 | super(Registry, self).__init__(*args, **kwargs) 32 | 33 | def register(self, module_name, module=None): 34 | # used as function call 35 | if module is not None: 36 | _register_generic(self, module_name, module) 37 | return 38 | 39 | # used as decorator 40 | def register_fn(fn): 41 | _register_generic(self, module_name, fn) 42 | return fn 43 | 44 | return register_fn 45 | -------------------------------------------------------------------------------- /alphaction/utils/video_decode.py: -------------------------------------------------------------------------------- 1 | import av 2 | 3 | def av_decode_video(video_path): 4 | with av.open(video_path) as container: 5 | frames = [] 6 | for frame in container.decode(video=0): 7 | frames.append(frame.to_rgb().to_ndarray()) 8 | return frames -------------------------------------------------------------------------------- /config_files/resnet101_8x8f_baseline.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet101-8x8.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet101" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: False 33 | INPUT: 34 | FRAME_NUM: 64 35 | FRAME_SAMPLE_RATE: 1 36 | TAU: 8 37 | ALPHA: 4 38 | SLOW_JITTER: True 39 | COLOR_JITTER: True 40 | DATASETS: 41 | TRAIN: ("ava_video_train_v2.2",) 42 | TEST: ("ava_video_val_v2.2",) 43 | 
DATALOADER: 44 | NUM_WORKERS: 4 45 | SIZE_DIVISIBILITY: 16 46 | SOLVER: 47 | BASE_LR: 0.0004 48 | WARMUP_FACTOR: 0.25 49 | BIAS_LR_FACTOR: 2 50 | WEIGHT_DECAY: 1e-7 51 | STEPS: (50000, 70000) 52 | WARMUP_ITERS: 2000 53 | MAX_ITER: 90000 54 | CHECKPOINT_PERIOD: 10000 55 | EVAL_PERIOD: 10000 56 | VIDEOS_PER_BATCH: 16 57 | TEST: 58 | BOX_THRESH: 0.8 59 | ACTION_THRESH: 0. 60 | VIDEOS_PER_BATCH: 16 61 | OUTPUT_DIR: "data/output/resnet101_8x8f_baseline" -------------------------------------------------------------------------------- /config_files/resnet101_8x8f_denseserial.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet101-8x8.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet101" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | STRUCTURE: "dense" 34 | DROPOUT: 0.2 35 | FUSION: "add" 36 | TEMPORAL_POSITION: True 37 | USE_ZERO_INIT_CONV: True 38 | LAYER_NORM: True 39 | MAX_PERSON: 25 40 | MAX_OBJECT: 5 41 | MAX_PER_SEC: 5 42 | DIM_INNER: 1024 43 | DIM_OUT: 2304 44 | I_BLOCK_LIST: [ "P", "O", "M", "P", "O", "M" ] 45 | INPUT: 46 | FRAME_NUM: 64 47 | FRAME_SAMPLE_RATE: 1 48 | TAU: 8 49 | ALPHA: 4 50 | SLOW_JITTER: True 51 | COLOR_JITTER: True 52 | DATASETS: 53 | TRAIN: ("ava_video_train_v2.2",) 54 | TEST: ("ava_video_val_v2.2",) 55 | DATALOADER: 56 | NUM_WORKERS: 4 57 | SIZE_DIVISIBILITY: 16 58 | SOLVER: 59 | BASE_LR: 0.0004 60 | WARMUP_FACTOR: 0.25 61 | BIAS_LR_FACTOR: 2 62 | IA_LR_FACTOR: 10.0 63 | WEIGHT_DECAY: 1e-7 64 | STEPS: (70000, 90000) 65 | WARMUP_ITERS: 2000 66 | MAX_ITER: 110000 67 | CHECKPOINT_PERIOD: 10000 68 | EVAL_PERIOD: 10000 69 | VIDEOS_PER_BATCH: 16 70 | TEST: 71 | BOX_THRESH: 0.8 72 | ACTION_THRESH: 0. 
73 | VIDEOS_PER_BATCH: 16 74 | OUTPUT_DIR: "data/output/resnet101_8x8f_denseserial" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_baseline.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: False 33 | INPUT: 34 | FRAME_NUM: 64 35 | FRAME_SAMPLE_RATE: 1 36 | TAU: 16 37 | ALPHA: 8 38 | SLOW_JITTER: True 39 | COLOR_JITTER: True 40 | DATASETS: 41 | TRAIN: ("ava_video_train_v2.2",) 42 | TEST: ("ava_video_val_v2.2",) 43 | DATALOADER: 44 | NUM_WORKERS: 4 45 | SIZE_DIVISIBILITY: 16 46 | SOLVER: 47 | BASE_LR: 0.0004 48 | WARMUP_FACTOR: 0.25 49 | BIAS_LR_FACTOR: 2 50 | WEIGHT_DECAY: 1e-7 51 | STEPS: (50000, 70000) 52 | WARMUP_ITERS: 2000 53 | MAX_ITER: 90000 54 | CHECKPOINT_PERIOD: 10000 55 | EVAL_PERIOD: 10000 56 | VIDEOS_PER_BATCH: 16 57 | TEST: 58 | BOX_THRESH: 0.8 59 | ACTION_THRESH: 0. 60 | VIDEOS_PER_BATCH: 16 61 | OUTPUT_DIR: "data/output/resnet50_4x16f_baseline" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_denseserial.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | STRUCTURE: "dense" 34 | DROPOUT: 0.2 35 | FUSION: "add" 36 | TEMPORAL_POSITION: True 37 | USE_ZERO_INIT_CONV: True 38 | LAYER_NORM: True 39 | MAX_PERSON: 25 40 | MAX_OBJECT: 5 41 | MAX_PER_SEC: 5 42 | DIM_INNER: 1024 43 | DIM_OUT: 2304 44 | I_BLOCK_LIST: [ "P", "O", "M", "P", "O", "M" ] 45 | INPUT: 46 | FRAME_NUM: 64 47 | FRAME_SAMPLE_RATE: 1 48 | TAU: 16 49 | ALPHA: 8 50 | SLOW_JITTER: True 51 | COLOR_JITTER: True 52 | DATASETS: 53 | TRAIN: ("ava_video_train_v2.2",) 54 | TEST: ("ava_video_val_v2.2",) 55 | DATALOADER: 56 | NUM_WORKERS: 4 57 | SIZE_DIVISIBILITY: 16 58 | SOLVER: 59 | BASE_LR: 0.0004 60 | WARMUP_FACTOR: 0.25 61 | BIAS_LR_FACTOR: 2 62 | IA_LR_FACTOR: 10.0 63 | WEIGHT_DECAY: 1e-7 64 | STEPS: (70000, 90000) 65 | WARMUP_ITERS: 2000 66 | MAX_ITER: 110000 67 | CHECKPOINT_PERIOD: 10000 68 | EVAL_PERIOD: 
10000 69 | VIDEOS_PER_BATCH: 16 70 | TEST: 71 | BOX_THRESH: 0.8 72 | ACTION_THRESH: 0. 73 | VIDEOS_PER_BATCH: 16 74 | OUTPUT_DIR: "data/output/resnet50_4x16f_denseserial" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_parallel.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | STRUCTURE: "parallel" 34 | DROPOUT: 0.2 35 | FUSION: "add" 36 | TEMPORAL_POSITION: True 37 | USE_ZERO_INIT_CONV: True 38 | LAYER_NORM: True 39 | MAX_PERSON: 25 40 | MAX_OBJECT: 5 41 | MAX_PER_SEC: 5 42 | DIM_INNER: 1024 43 | DIM_OUT: 2304 44 | I_BLOCK_LIST: [ [ "P", "O", "M" ], [ "P", "O", "M" ] ] 45 | INPUT: 46 | FRAME_NUM: 64 47 | FRAME_SAMPLE_RATE: 1 48 | TAU: 16 49 | ALPHA: 8 50 | SLOW_JITTER: True 51 | COLOR_JITTER: True 52 | DATASETS: 53 | TRAIN: ("ava_video_train_v2.2",) 54 | TEST: ("ava_video_val_v2.2",) 55 | DATALOADER: 56 | NUM_WORKERS: 4 57 | SIZE_DIVISIBILITY: 16 58 | SOLVER: 59 | BASE_LR: 0.0004 60 | WARMUP_FACTOR: 0.25 61 | BIAS_LR_FACTOR: 2 62 | IA_LR_FACTOR: 10.0 63 | WEIGHT_DECAY: 1e-7 64 | STEPS: (70000, 90000) 65 | WARMUP_ITERS: 2000 66 | MAX_ITER: 110000 67 | CHECKPOINT_PERIOD: 10000 68 | EVAL_PERIOD: 10000 69 | VIDEOS_PER_BATCH: 16 70 | TEST: 71 | BOX_THRESH: 0.8 72 | ACTION_THRESH: 0. 
73 | VIDEOS_PER_BATCH: 16 74 | OUTPUT_DIR: "data/output/resnet50_4x16f_parallel" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_serial.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | DROPOUT: 0.2 34 | FUSION: "add" 35 | TEMPORAL_POSITION: True 36 | USE_ZERO_INIT_CONV: True 37 | LAYER_NORM: True 38 | MAX_PERSON: 25 39 | MAX_OBJECT: 5 40 | MAX_PER_SEC: 5 41 | DIM_OUT: 2304 42 | DIM_INNER: 1024 43 | I_BLOCK_LIST: [ "P", "O", "M", "P", "O", "M" ] 44 | INPUT: 45 | FRAME_NUM: 64 46 | FRAME_SAMPLE_RATE: 1 47 | TAU: 16 48 | ALPHA: 8 49 | SLOW_JITTER: True 50 | COLOR_JITTER: True 51 | DATASETS: 52 | TRAIN: ("ava_video_train_v2.2",) 53 | TEST: ("ava_video_val_v2.2",) 54 | DATALOADER: 55 | NUM_WORKERS: 4 56 | SIZE_DIVISIBILITY: 16 57 | SOLVER: 58 | BASE_LR: 0.0004 59 | WARMUP_FACTOR: 0.25 60 | BIAS_LR_FACTOR: 2 61 | IA_LR_FACTOR: 10.0 62 | WEIGHT_DECAY: 1e-7 63 | STEPS: (70000, 90000) 64 | WARMUP_ITERS: 2000 65 | MAX_ITER: 110000 66 | CHECKPOINT_PERIOD: 10000 67 | EVAL_PERIOD: 10000 68 | VIDEOS_PER_BATCH: 16 69 | TEST: 70 | BOX_THRESH: 0.8 71 | ACTION_THRESH: 0. 72 | VIDEOS_PER_BATCH: 16 73 | OUTPUT_DIR: "data/output/resnet50_4x16f_serial" -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Demo 2 | 3 | ### Installation 4 | 5 | To run this demo, make sure that you install all requirements following [INSTALL.md](../INSTALL.md). 6 | 7 | ### Preparation 8 | 9 | 1. Download the object detection model manually: **yolov3-spp.weights** ([Google Drive](https://drive.google.com/file/d/1260DRQM5XtSF7W213AWxk6RX2zfa3Zq6/view?usp=sharing)). Place it into `data/models/detector_models`. 10 | 2. Download the person tracking model manually: **jde.uncertainty.pt** ([Google Drive](https://drive.google.com/file/d/1nuCX5bR-1-HGZ0_WoH4xZzPiV_jgBphC/view?usp=sharing)). Place it into `data/models/detector_models`. 11 | 3. Please download our action models. Place them into ```data/models/aia_models```. All models are available in [MODEL_ZOO.md](../MODEL_ZOO.md). 12 | 4. We also provide a practical model ([Google Drive](https://drive.google.com/file/d/1gi6oKLj3wBGCOwwIiI9L4mS8pznFj7L1/view?usp=sharing)) trained on 15 common action categories in AVA. This 13 | model achieves about 70 mAP on these categories. Please use [`resnet101_8x8f_denseserial.yaml`](../config_files/resnet101_8x8f_denseserial.yaml) 14 | and eable `--common-cate` to apply this model. 15 | 16 | ### Usage 17 | 18 | 1. 
Video Input 19 | 20 | ``` 21 | cd demo 22 | python demo.py --video-path path/to/your/video --output-path path/to/the/output \ 23 | --cfg-path path/to/cfg/file --weight-path path/to/the/weight [--common-cate] 24 | ``` 25 | 26 | 2. Webcam Input 27 | 28 | ``` 29 | cd demo 30 | python demo.py --webcam --output-path path/to/the/output \ 31 | --cfg-path path/to/cfg/file --weight-path path/to/the/weight [--common-cate] 32 | ``` 33 | -------------------------------------------------------------------------------- /demo/Roboto-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/demo/Roboto-Bold.ttf -------------------------------------------------------------------------------- /detector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/__init__.py -------------------------------------------------------------------------------- /detector/apis.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------- 2 | # Copyright (c) Shanghai Jiao Tong University. All rights reserved. 3 | # Written by Chao Xu (xuchao.19962007@sjtu.edu.cn) 4 | # ----------------------------------------------------- 5 | 6 | """API of detector""" 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | def get_detector(opt=None): 11 | if opt.detector == 'yolo': 12 | from detector.yolo_api import YOLODetector 13 | from detector.yolo_cfg import cfg 14 | return YOLODetector(cfg, opt) 15 | elif opt.detector == 'tracker': 16 | from detector.tracker_api import Tracker 17 | from detector.tracker_cfg import cfg 18 | return Tracker(cfg, opt) 19 | else: 20 | raise NotImplementedError 21 | 22 | 23 | class BaseDetector(ABC): 24 | def __init__(self): 25 | pass 26 | 27 | @abstractmethod 28 | def image_preprocess(self, img_name): 29 | pass 30 | 31 | @abstractmethod 32 | def images_detection(self, imgs, orig_dim_list): 33 | pass 34 | 35 | @abstractmethod 36 | def detect_one_img(self, img_name): 37 | pass 38 | -------------------------------------------------------------------------------- /detector/nms/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms_wrapper import nms, soft_nms 2 | 3 | __all__ = ['nms', 'soft_nms'] 4 | -------------------------------------------------------------------------------- /detector/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from . import nms_cpu, nms_cuda 5 | from .soft_nms_cpu import soft_nms_cpu 6 | 7 | 8 | def nms(dets, iou_thr, device_id=None): 9 | """Dispatch to either CPU or GPU NMS implementations. 10 | 11 | The input can be either a torch tensor or numpy array. GPU NMS will be used 12 | if the input is a gpu tensor or device_id is specified, otherwise CPU NMS 13 | will be used. The returned type will always be the same as inputs. 14 | 15 | Arguments: 16 | dets (torch.Tensor or np.ndarray): bboxes with scores. 17 | iou_thr (float): IoU threshold for NMS. 18 | device_id (int, optional): when `dets` is a numpy array, if `device_id` 19 | is None, then cpu nms is used, otherwise gpu_nms will be used. 
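    Example (shapes and values are illustrative only):
        >>> import numpy as np
        >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9],
        ...                  [49.3, 32.9, 51.0, 35.3, 0.6]], dtype=np.float32)
        >>> kept, inds = nms(dets, iou_thr=0.7)
        >>> # `kept` is dets[inds, :]; both outputs stay numpy arrays for numpy input.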
20 | 21 | Returns: 22 | tuple: kept bboxes and indice, which is always the same data type as 23 | the input. 24 | """ 25 | # convert dets (tensor or numpy array) to tensor 26 | if isinstance(dets, torch.Tensor): 27 | is_numpy = False 28 | dets_th = dets.to('cpu') 29 | elif isinstance(dets, np.ndarray): 30 | is_numpy = True 31 | device = 'cpu' if device_id is None else 'cuda:{}'.format(device_id) 32 | dets_th = torch.from_numpy(dets).to(device) 33 | else: 34 | raise TypeError( 35 | 'dets must be either a Tensor or numpy array, but got {}'.format( 36 | type(dets))) 37 | 38 | # execute cpu or cuda nms 39 | if dets_th.shape[0] == 0: 40 | inds = dets_th.new_zeros(0, dtype=torch.long) 41 | else: 42 | if dets_th.is_cuda: 43 | inds = nms_cuda.nms(dets_th, iou_thr) 44 | else: 45 | inds = nms_cpu.nms(dets_th, iou_thr) 46 | 47 | if is_numpy: 48 | inds = inds.cpu().numpy() 49 | return dets[inds, :], inds 50 | 51 | 52 | def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3): 53 | if isinstance(dets, torch.Tensor): 54 | is_tensor = True 55 | dets_np = dets.detach().cpu().numpy() 56 | elif isinstance(dets, np.ndarray): 57 | is_tensor = False 58 | dets_np = dets 59 | else: 60 | raise TypeError( 61 | 'dets must be either a Tensor or numpy array, but got {}'.format( 62 | type(dets))) 63 | 64 | method_codes = {'linear': 1, 'gaussian': 2} 65 | if method not in method_codes: 66 | raise ValueError('Invalid method for SoftNMS: {}'.format(method)) 67 | new_dets, inds = soft_nms_cpu( 68 | dets_np, 69 | iou_thr, 70 | method=method_codes[method], 71 | sigma=sigma, 72 | min_score=min_score) 73 | 74 | if is_tensor: 75 | return dets.new_tensor(new_dets), dets.new_tensor( 76 | inds, dtype=torch.long) 77 | else: 78 | return new_dets.astype(np.float32), inds.astype(np.int64) 79 | -------------------------------------------------------------------------------- /detector/nms/src/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
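// Greedy CPU NMS: boxes are visited in descending score order, and every later box
// whose IoU with the current kept box reaches `threshold` is flagged in `suppressed_t`.
// Areas and overlaps use the legacy integer-pixel convention (x2 - x1 + 1).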
2 | #include 3 | 4 | template 5 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) { 6 | AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor"); 7 | 8 | if (dets.numel() == 0) { 9 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 10 | } 11 | 12 | auto x1_t = dets.select(1, 0).contiguous(); 13 | auto y1_t = dets.select(1, 1).contiguous(); 14 | auto x2_t = dets.select(1, 2).contiguous(); 15 | auto y2_t = dets.select(1, 3).contiguous(); 16 | auto scores = dets.select(1, 4).contiguous(); 17 | 18 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 19 | 20 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 21 | 22 | auto ndets = dets.size(0); 23 | at::Tensor suppressed_t = 24 | at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 25 | 26 | auto suppressed = suppressed_t.data_ptr(); 27 | auto order = order_t.data_ptr(); 28 | auto x1 = x1_t.data_ptr(); 29 | auto y1 = y1_t.data_ptr(); 30 | auto x2 = x2_t.data_ptr(); 31 | auto y2 = y2_t.data_ptr(); 32 | auto areas = areas_t.data_ptr(); 33 | 34 | for (int64_t _i = 0; _i < ndets; _i++) { 35 | auto i = order[_i]; 36 | if (suppressed[i] == 1) continue; 37 | auto ix1 = x1[i]; 38 | auto iy1 = y1[i]; 39 | auto ix2 = x2[i]; 40 | auto iy2 = y2[i]; 41 | auto iarea = areas[i]; 42 | 43 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 44 | auto j = order[_j]; 45 | if (suppressed[j] == 1) continue; 46 | auto xx1 = std::max(ix1, x1[j]); 47 | auto yy1 = std::max(iy1, y1[j]); 48 | auto xx2 = std::min(ix2, x2[j]); 49 | auto yy2 = std::min(iy2, y2[j]); 50 | 51 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 52 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 53 | auto inter = w * h; 54 | auto ovr = inter / (iarea + areas[j] - inter); 55 | if (ovr >= threshold) suppressed[j] = 1; 56 | } 57 | } 58 | return at::nonzero(suppressed_t == 0).squeeze(1); 59 | } 60 | 61 | at::Tensor nms(const at::Tensor& dets, const float threshold) { 62 | at::Tensor result; 63 | AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { 64 | result = nms_cpu_kernel(dets, threshold); 65 | }); 66 | return result; 67 | } 68 | 69 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 70 | m.def("nms", &nms, "non-maximum suppression"); 71 | } -------------------------------------------------------------------------------- /detector/nms/src/nms_cuda.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ") 5 | 6 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); 7 | 8 | at::Tensor nms(const at::Tensor& dets, const float threshold) { 9 | CHECK_CUDA(dets); 10 | if (dets.numel() == 0) 11 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 12 | return nms_cuda(dets, threshold); 13 | } 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("nms", &nms, "non-maximum suppression"); 17 | } -------------------------------------------------------------------------------- /detector/nms/src/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
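// GPU NMS: boxes are first sorted by score, then tiled into groups of
// threadsPerBlock = 64 (one bit per box in an unsigned long long). Each CUDA block
// compares a row tile against a column tile and writes a 64-bit suppression mask;
// the host walks these masks once to assemble the final keep list, so the result
// matches the greedy CPU implementation above.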
2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 12 | 13 | __device__ inline float devIoU(float const * const a, float const * const b) { 14 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 15 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 16 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 17 | float interS = width * height; 18 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 19 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 20 | return interS / (Sa + Sb - interS); 21 | } 22 | 23 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 24 | const float *dev_boxes, unsigned long long *dev_mask) { 25 | const int row_start = blockIdx.y; 26 | const int col_start = blockIdx.x; 27 | 28 | // if (row_start > col_start) return; 29 | 30 | const int row_size = 31 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 32 | const int col_size = 33 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 34 | 35 | __shared__ float block_boxes[threadsPerBlock * 5]; 36 | if (threadIdx.x < col_size) { 37 | block_boxes[threadIdx.x * 5 + 0] = 38 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 39 | block_boxes[threadIdx.x * 5 + 1] = 40 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 41 | block_boxes[threadIdx.x * 5 + 2] = 42 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 43 | block_boxes[threadIdx.x * 5 + 3] = 44 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 45 | block_boxes[threadIdx.x * 5 + 4] = 46 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 47 | } 48 | __syncthreads(); 49 | 50 | if (threadIdx.x < row_size) { 51 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 52 | const float *cur_box = dev_boxes + cur_box_idx * 5; 53 | int i = 0; 54 | unsigned long long t = 0; 55 | int start = 0; 56 | if (row_start == col_start) { 57 | start = threadIdx.x + 1; 58 | } 59 | for (i = start; i < col_size; i++) { 60 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 61 | t |= 1ULL << i; 62 | } 63 | } 64 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 65 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 66 | } 67 | } 68 | 69 | // boxes is a N x 5 tensor 70 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { 71 | using scalar_t = float; 72 | AT_ASSERTM(boxes.is_cuda(), "boxes must be a CUDA tensor"); 73 | auto scores = boxes.select(1, 4); 74 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 75 | auto boxes_sorted = boxes.index_select(0, order_t); 76 | 77 | int boxes_num = boxes.size(0); 78 | 79 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 80 | 81 | scalar_t* boxes_dev = boxes_sorted.data_ptr(); 82 | 83 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 84 | 85 | unsigned long long* mask_dev = NULL; 86 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 87 | // boxes_num * col_blocks * sizeof(unsigned long long))); 88 | 89 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 90 | 91 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 92 | THCCeilDiv(boxes_num, threadsPerBlock)); 93 | dim3 threads(threadsPerBlock); 94 | nms_kernel<<>>(boxes_num, 95 | nms_overlap_thresh, 96 | boxes_dev, 97 | mask_dev); 98 | 99 | std::vector 
mask_host(boxes_num * col_blocks); 100 | THCudaCheck(cudaMemcpy(&mask_host[0], 101 | mask_dev, 102 | sizeof(unsigned long long) * boxes_num * col_blocks, 103 | cudaMemcpyDeviceToHost)); 104 | 105 | std::vector remv(col_blocks); 106 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 107 | 108 | at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); 109 | int64_t* keep_out = keep.data_ptr(); 110 | 111 | int num_to_keep = 0; 112 | for (int i = 0; i < boxes_num; i++) { 113 | int nblock = i / threadsPerBlock; 114 | int inblock = i % threadsPerBlock; 115 | 116 | if (!(remv[nblock] & (1ULL << inblock))) { 117 | keep_out[num_to_keep++] = i; 118 | unsigned long long *p = &mask_host[0] + i * col_blocks; 119 | for (int j = nblock; j < col_blocks; j++) { 120 | remv[j] |= p[j]; 121 | } 122 | } 123 | } 124 | 125 | THCudaFree(state, mask_dev); 126 | // TODO improve this part 127 | return std::get<0>(order_t.index({ 128 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 129 | order_t.device(), keep.scalar_type()) 130 | }).sort(0, false)); 131 | } -------------------------------------------------------------------------------- /detector/nms/src/soft_nms_cpu.pyx: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------- 2 | # Soft-NMS: Improving Object Detection With One Line of Code 3 | # Copyright (c) University of Maryland, College Park 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Navaneeth Bodla and Bharat Singh 6 | # Modified by Kai Chen 7 | # ---------------------------------------------------------- 8 | 9 | # cython: language_level=3, boundscheck=False 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | 15 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 16 | return a if a >= b else b 17 | 18 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 19 | return a if a <= b else b 20 | 21 | 22 | def soft_nms_cpu( 23 | np.ndarray[float, ndim=2] boxes_in, 24 | float iou_thr, 25 | unsigned int method=1, 26 | float sigma=0.5, 27 | float min_score=0.001, 28 | ): 29 | boxes = boxes_in.copy() 30 | cdef int N = boxes.shape[0] 31 | cdef float iw, ih, box_area 32 | cdef float ua 33 | cdef int pos = 0 34 | cdef float maxscore = 0 35 | cdef int maxpos = 0 36 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 37 | inds = np.arange(N) 38 | 39 | for i in range(N): 40 | maxscore = boxes[i, 4] 41 | maxpos = i 42 | 43 | tx1 = boxes[i, 0] 44 | ty1 = boxes[i, 1] 45 | tx2 = boxes[i, 2] 46 | ty2 = boxes[i, 3] 47 | ts = boxes[i, 4] 48 | ti = inds[i] 49 | 50 | pos = i + 1 51 | # get max box 52 | while pos < N: 53 | if maxscore < boxes[pos, 4]: 54 | maxscore = boxes[pos, 4] 55 | maxpos = pos 56 | pos = pos + 1 57 | 58 | # add max box as a detection 59 | boxes[i, 0] = boxes[maxpos, 0] 60 | boxes[i, 1] = boxes[maxpos, 1] 61 | boxes[i, 2] = boxes[maxpos, 2] 62 | boxes[i, 3] = boxes[maxpos, 3] 63 | boxes[i, 4] = boxes[maxpos, 4] 64 | inds[i] = inds[maxpos] 65 | 66 | # swap ith box with position of max box 67 | boxes[maxpos, 0] = tx1 68 | boxes[maxpos, 1] = ty1 69 | boxes[maxpos, 2] = tx2 70 | boxes[maxpos, 3] = ty2 71 | boxes[maxpos, 4] = ts 72 | inds[maxpos] = ti 73 | 74 | tx1 = boxes[i, 0] 75 | ty1 = boxes[i, 1] 76 | tx2 = boxes[i, 2] 77 | ty2 = boxes[i, 3] 78 | ts = boxes[i, 4] 79 | 80 | pos = i + 1 81 | # NMS iterations, note that N changes if detection boxes fall below 82 | # threshold 83 
| while pos < N: 84 | x1 = boxes[pos, 0] 85 | y1 = boxes[pos, 1] 86 | x2 = boxes[pos, 2] 87 | y2 = boxes[pos, 3] 88 | s = boxes[pos, 4] 89 | 90 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 91 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 92 | if iw > 0: 93 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 94 | if ih > 0: 95 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 96 | ov = iw * ih / ua # iou between max box and detection box 97 | 98 | if method == 1: # linear 99 | if ov > iou_thr: 100 | weight = 1 - ov 101 | else: 102 | weight = 1 103 | elif method == 2: # gaussian 104 | weight = np.exp(-(ov * ov) / sigma) 105 | else: # original NMS 106 | if ov > iou_thr: 107 | weight = 0 108 | else: 109 | weight = 1 110 | 111 | boxes[pos, 4] = weight * boxes[pos, 4] 112 | 113 | # if box score falls below threshold, discard the box by 114 | # swapping with last box update N 115 | if boxes[pos, 4] < min_score: 116 | boxes[pos, 0] = boxes[N-1, 0] 117 | boxes[pos, 1] = boxes[N-1, 1] 118 | boxes[pos, 2] = boxes[N-1, 2] 119 | boxes[pos, 3] = boxes[N-1, 3] 120 | boxes[pos, 4] = boxes[N-1, 4] 121 | inds[pos] = inds[N - 1] 122 | N = N - 1 123 | pos = pos - 1 124 | 125 | pos = pos + 1 126 | 127 | return boxes[:N], inds[:N] 128 | -------------------------------------------------------------------------------- /detector/tracker/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | MOT Tracker adapted from [Towards-Realtime-MOT](https://github.com/Zhongdao/Towards-Realtime-MOT), many thanks to their wonderful work! 3 | -------------------------------------------------------------------------------- /detector/tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/tracker/__init__.py -------------------------------------------------------------------------------- /detector/tracker/cfg/ccmcpe.json: -------------------------------------------------------------------------------- 1 | { 2 | "root":"/home/wangzd/datasets/MOT", 3 | "train": 4 | { 5 | "mot17":"./data/mot17.train", 6 | "caltech":"./data/caltech.train", 7 | "citypersons":"./data/citypersons.train", 8 | "cuhksysu":"./data/cuhksysu.train", 9 | "prw":"./data/prw.train", 10 | "eth":"./data/eth.train" 11 | }, 12 | "test_emb": 13 | { 14 | "caltech":"./data/caltech.10k.val", 15 | "cuhksysu":"./data/cuhksysu.val", 16 | "prw":"./data/prw.val" 17 | }, 18 | "test": 19 | { 20 | "mot19":"./data/mot19.train", 21 | "caltech":"./data/caltech.val", 22 | "citypersons":"./data/citypersons.val" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /detector/tracker/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import cv2 5 | 6 | try: 7 | from util import count_parameters as count 8 | from util import convert2cpu as cpu 9 | except ImportError: 10 | from yolo.util import count_parameters as count 11 | from yolo.util import convert2cpu as cpu 12 | from PIL import Image, ImageDraw 13 | 14 | 15 | def letterbox_image(img, img_size=(1088, 608), color=(127.5, 127.5, 127.5)): 16 | # resize a rectangular image to a padded rectangular 17 | height=img_size[1] 18 | width=img_size[0] 19 | shape = img.shape[:2] # shape = [height, width] 20 | ratio = min(float(height)/shape[0], float(width)/shape[1]) 21 | new_shape = 
(round(shape[1] * ratio), round(shape[0] * ratio)) # new_shape = [width, height] 22 | dw = (width - new_shape[0]) / 2 # width padding 23 | dh = (height - new_shape[1]) / 2 # height padding 24 | top, bottom = round(dh - 0.1), round(dh + 0.1) 25 | left, right = round(dw - 0.1), round(dw + 0.1) 26 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 27 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular 28 | return img 29 | 30 | 31 | def prep_image(img, img_size=(1088, 608)): 32 | """ 33 | Prepare image for inputting to the neural network. 34 | 35 | Returns a Variable 36 | """ 37 | 38 | orig_im = cv2.imread(img) 39 | dim = orig_im.shape[1], orig_im.shape[0] 40 | img = (letterbox_image(orig_im, img_size)) 41 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 43 | return img_, orig_im, dim 44 | 45 | 46 | def prep_frame(img, img_size=(1088, 608)): 47 | """ 48 | Prepare image for inputting to the neural network. 49 | 50 | Returns a Variable 51 | """ 52 | 53 | orig_im = img 54 | dim = orig_im.shape[1], orig_im.shape[0] 55 | img = (letterbox_image(orig_im, img_size)) 56 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 57 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 58 | return img_, orig_im, dim 59 | 60 | -------------------------------------------------------------------------------- /detector/tracker/tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/tracker/tracker/__init__.py -------------------------------------------------------------------------------- /detector/tracker/tracker/basetrack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | 4 | 5 | class TrackState(object): 6 | New = 0 7 | Tracked = 1 8 | Lost = 2 9 | Removed = 3 10 | 11 | 12 | class BaseTrack(object): 13 | _count = 0 14 | 15 | track_id = 0 16 | is_activated = False 17 | state = TrackState.New 18 | 19 | history = OrderedDict() 20 | features = [] 21 | curr_feature = None 22 | score = 0 23 | start_frame = 0 24 | frame_id = 0 25 | time_since_update = 0 26 | 27 | # multi-camera 28 | location = (np.inf, np.inf) 29 | 30 | @property 31 | def end_frame(self): 32 | return self.frame_id 33 | 34 | @staticmethod 35 | def next_id(): 36 | BaseTrack._count += 1 37 | return BaseTrack._count 38 | 39 | def activate(self, *args): 40 | raise NotImplementedError 41 | 42 | def predict(self): 43 | raise NotImplementedError 44 | 45 | def update(self, *args, **kwargs): 46 | raise NotImplementedError 47 | 48 | def mark_lost(self): 49 | self.state = TrackState.Lost 50 | 51 | def mark_removed(self): 52 | self.state = TrackState.Removed 53 | 54 | -------------------------------------------------------------------------------- /detector/tracker/tracker/matching.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import scipy 4 | from scipy.spatial.distance import cdist 5 | from scipy.optimize import linear_sum_assignment 6 | 7 | from cython_bbox import bbox_overlaps as bbox_ious 8 | from tracker.utils import kalman_filter 9 | import time 10 | 11 | def merge_matches(m1, m2, shape): 12 | O,P,Q = shape 13 | m1 = np.asarray(m1) 14 | m2 = np.asarray(m2) 
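    # m1 holds (i, j) pairs of an O x P matching and m2 holds (j, k) pairs of a P x Q
    # matching; the sparse product M1 * M2 below is nonzero exactly where a chained
    # match i -> j -> k exists, which is how the two partial matchings are merged.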
15 | 16 | M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) 17 | M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) 18 | 19 | mask = M1*M2 20 | match = mask.nonzero() 21 | match = list(zip(match[0], match[1])) 22 | unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) 23 | unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) 24 | 25 | return match, unmatched_O, unmatched_Q 26 | 27 | 28 | def _indices_to_matches(cost_matrix, indices, thresh): 29 | matched_cost = cost_matrix[tuple(zip(*indices))] 30 | matched_mask = (matched_cost <= thresh) 31 | 32 | matches = indices[matched_mask] 33 | unmatched_a = tuple(set(range(cost_matrix.shape[0])) - set(matches[:, 0])) 34 | unmatched_b = tuple(set(range(cost_matrix.shape[1])) - set(matches[:, 1])) 35 | 36 | return matches, unmatched_a, unmatched_b 37 | 38 | 39 | def linear_assignment(cost_matrix, thresh): 40 | """ 41 | Simple linear assignment 42 | :type cost_matrix: np.ndarray 43 | :type thresh: float 44 | :return: matches, unmatched_a, unmatched_b 45 | """ 46 | if cost_matrix.size == 0: 47 | return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) 48 | 49 | cost_matrix[cost_matrix > thresh] = thresh + 1e-4 50 | row_ind, col_ind = linear_sum_assignment(cost_matrix) 51 | indices = np.column_stack((row_ind, col_ind)) 52 | 53 | return _indices_to_matches(cost_matrix, indices, thresh) 54 | 55 | 56 | def ious(atlbrs, btlbrs): 57 | """ 58 | Compute cost based on IoU 59 | :type atlbrs: list[tlbr] | np.ndarray 60 | :type atlbrs: list[tlbr] | np.ndarray 61 | 62 | :rtype ious np.ndarray 63 | """ 64 | ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) 65 | if ious.size == 0: 66 | return ious 67 | 68 | ious = bbox_ious( 69 | np.ascontiguousarray(atlbrs, dtype=np.float), 70 | np.ascontiguousarray(btlbrs, dtype=np.float) 71 | ) 72 | 73 | return ious 74 | 75 | 76 | def iou_distance(atracks, btracks): 77 | """ 78 | Compute cost based on IoU 79 | :type atracks: list[STrack] 80 | :type btracks: list[STrack] 81 | 82 | :rtype cost_matrix np.ndarray 83 | """ 84 | 85 | if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): 86 | atlbrs = atracks 87 | btlbrs = btracks 88 | else: 89 | atlbrs = [track.tlbr for track in atracks] 90 | btlbrs = [track.tlbr for track in btracks] 91 | _ious = ious(atlbrs, btlbrs) 92 | cost_matrix = 1 - _ious 93 | 94 | return cost_matrix 95 | 96 | def embedding_distance(tracks, detections, metric='cosine'): 97 | """ 98 | :param tracks: list[STrack] 99 | :param detections: list[BaseTrack] 100 | :param metric: 101 | :return: cost_matrix np.ndarray 102 | """ 103 | 104 | cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) 105 | if cost_matrix.size == 0: 106 | return cost_matrix 107 | det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) 108 | for i, track in enumerate(tracks): 109 | cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1,-1), det_features, metric)) 110 | return cost_matrix 111 | 112 | 113 | def gate_cost_matrix(kf, cost_matrix, tracks, detections, only_position=False): 114 | if cost_matrix.size == 0: 115 | return cost_matrix 116 | gating_dim = 2 if only_position else 4 117 | gating_threshold = kalman_filter.chi2inv95[gating_dim] 118 | measurements = np.asarray([det.to_xyah() for det in detections]) 119 | for row, track in enumerate(tracks): 120 | 
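        # Squared Mahalanobis distance from the track's Kalman-filter state to each detection;
        # candidates beyond the chi-square 95% gate (kalman_filter.chi2inv95) get an infinite
        # assignment cost so the linear assignment can never pick them.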
gating_distance = kf.gating_distance( 121 | track.mean, track.covariance, measurements, only_position) 122 | cost_matrix[row, gating_distance > gating_threshold] = np.inf 123 | return cost_matrix 124 | -------------------------------------------------------------------------------- /detector/tracker/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/tracker/utils/__init__.py -------------------------------------------------------------------------------- /detector/tracker/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import copy 4 | import motmetrics as mm 5 | 6 | from utils.io import read_results, unzip_objs 7 | 8 | 9 | class Evaluator(object): 10 | 11 | def __init__(self, data_root, seq_name, data_type): 12 | self.data_root = data_root 13 | self.seq_name = seq_name 14 | self.data_type = data_type 15 | 16 | self.load_annotations() 17 | self.reset_accumulator() 18 | 19 | def load_annotations(self): 20 | assert self.data_type == 'mot' 21 | 22 | gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') 23 | self.gt_frame_dict = read_results(gt_filename, self.data_type, is_gt=True) 24 | self.gt_ignore_frame_dict = read_results(gt_filename, self.data_type, is_ignore=True) 25 | 26 | def reset_accumulator(self): 27 | self.acc = mm.MOTAccumulator(auto_id=True) 28 | 29 | def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): 30 | # results 31 | trk_tlwhs = np.copy(trk_tlwhs) 32 | trk_ids = np.copy(trk_ids) 33 | 34 | # gts 35 | gt_objs = self.gt_frame_dict.get(frame_id, []) 36 | gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] 37 | 38 | # ignore boxes 39 | ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) 40 | ignore_tlwhs = unzip_objs(ignore_objs)[0] 41 | 42 | # remove ignored results 43 | keep = np.ones(len(trk_tlwhs), dtype=bool) 44 | iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) 45 | match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) 46 | match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) 47 | match_ious = iou_distance[match_is, match_js] 48 | 49 | match_js = np.asarray(match_js, dtype=int) 50 | match_js = match_js[np.logical_not(np.isnan(match_ious))] 51 | keep[match_js] = False 52 | trk_tlwhs = trk_tlwhs[keep] 53 | trk_ids = trk_ids[keep] 54 | 55 | # get distance matrix 56 | iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) 57 | 58 | # acc 59 | self.acc.update(gt_ids, trk_ids, iou_distance) 60 | 61 | if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): 62 | events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics 63 | else: 64 | events = None 65 | return events 66 | 67 | def eval_file(self, filename): 68 | self.reset_accumulator() 69 | 70 | result_frame_dict = read_results(filename, self.data_type, is_gt=False) 71 | frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys()))) 72 | for frame_id in frames: 73 | trk_objs = result_frame_dict.get(frame_id, []) 74 | trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] 75 | self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) 76 | 77 | return self.acc 78 | 79 | @staticmethod 80 | def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 
'recall')): 81 | names = copy.deepcopy(names) 82 | if metrics is None: 83 | metrics = mm.metrics.motchallenge_metrics 84 | metrics = copy.deepcopy(metrics) 85 | 86 | mh = mm.metrics.create() 87 | summary = mh.compute_many( 88 | accs, 89 | metrics=metrics, 90 | names=names, 91 | generate_overall=True 92 | ) 93 | 94 | return summary 95 | 96 | @staticmethod 97 | def save_summary(summary, filename): 98 | import pandas as pd 99 | writer = pd.ExcelWriter(filename) 100 | summary.to_excel(writer) 101 | writer.save() 102 | -------------------------------------------------------------------------------- /detector/tracker/utils/io.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | import numpy as np 4 | 5 | from utils.log import logger 6 | 7 | 8 | def write_results(filename, results_dict: Dict, data_type: str): 9 | if not filename: 10 | return 11 | path = os.path.dirname(filename) 12 | if not os.path.exists(path): 13 | os.makedirs(path) 14 | 15 | if data_type in ('mot', 'mcmot', 'lab'): 16 | save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' 17 | elif data_type == 'kitti': 18 | save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' 19 | else: 20 | raise ValueError(data_type) 21 | 22 | with open(filename, 'w') as f: 23 | for frame_id, frame_data in results_dict.items(): 24 | if data_type == 'kitti': 25 | frame_id -= 1 26 | for tlwh, track_id in frame_data: 27 | if track_id < 0: 28 | continue 29 | x1, y1, w, h = tlwh 30 | x2, y2 = x1 + w, y1 + h 31 | line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) 32 | f.write(line) 33 | logger.info('Save results to {}'.format(filename)) 34 | 35 | 36 | def read_results(filename, data_type: str, is_gt=False, is_ignore=False): 37 | if data_type in ('mot', 'lab'): 38 | read_fun = read_mot_results 39 | else: 40 | raise ValueError('Unknown data type: {}'.format(data_type)) 41 | 42 | return read_fun(filename, is_gt, is_ignore) 43 | 44 | 45 | """ 46 | labels={'ped', ... % 1 47 | 'person_on_vhcl', ... % 2 48 | 'car', ... % 3 49 | 'bicycle', ... % 4 50 | 'mbike', ... % 5 51 | 'non_mot_vhcl', ... % 6 52 | 'static_person', ... % 7 53 | 'distractor', ... % 8 54 | 'occluder', ... % 9 55 | 'occluder_on_grnd', ... %10 56 | 'occluder_full', ... % 11 57 | 'reflection', ... % 12 58 | 'crowd' ... 
% 13 59 | }; 60 | """ 61 | 62 | 63 | def read_mot_results(filename, is_gt, is_ignore): 64 | valid_labels = {1} 65 | ignore_labels = {2, 7, 8, 12} 66 | results_dict = dict() 67 | if os.path.isfile(filename): 68 | with open(filename, 'r') as f: 69 | for line in f.readlines(): 70 | linelist = line.split(',') 71 | if len(linelist) < 7: 72 | continue 73 | fid = int(linelist[0]) 74 | if fid < 1: 75 | continue 76 | results_dict.setdefault(fid, list()) 77 | 78 | if is_gt: 79 | if 'MOT16-' in filename or 'MOT17-' in filename: 80 | label = int(float(linelist[7])) 81 | mark = int(float(linelist[6])) 82 | if mark == 0 or label not in valid_labels: 83 | continue 84 | score = 1 85 | elif is_ignore: 86 | if 'MOT16-' in filename or 'MOT17-' in filename: 87 | label = int(float(linelist[7])) 88 | vis_ratio = float(linelist[8]) 89 | if label not in ignore_labels and vis_ratio >= 0: 90 | continue 91 | else: 92 | continue 93 | score = 1 94 | else: 95 | score = float(linelist[6]) 96 | 97 | tlwh = tuple(map(float, linelist[2:6])) 98 | target_id = int(linelist[1]) 99 | 100 | results_dict[fid].append((tlwh, target_id, score)) 101 | 102 | return results_dict 103 | 104 | 105 | def unzip_objs(objs): 106 | if len(objs) > 0: 107 | tlwhs, ids, scores = zip(*objs) 108 | else: 109 | tlwhs, ids, scores = [], [], [] 110 | tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) 111 | 112 | return tlwhs, ids, scores -------------------------------------------------------------------------------- /detector/tracker/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(name='root'): 5 | formatter = logging.Formatter( 6 | # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') 7 | fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') 8 | 9 | handler = logging.StreamHandler() 10 | handler.setFormatter(formatter) 11 | 12 | logger = logging.getLogger(name) 13 | logger.setLevel(logging.DEBUG) 14 | logger.addHandler(handler) 15 | return logger 16 | 17 | 18 | logger = get_logger('root') 19 | -------------------------------------------------------------------------------- /detector/tracker/utils/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
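# NOTE: `_C` below must be a compiled extension module exposing an `nms`
# function; this file only re-exports it. In this repository the NMS
# extensions are built by setup.py under `detector.nms` (soft_nms_cpu,
# nms_cpu, nms_cuda), so `from utils import _C` assumes a matching `utils._C`
# build is importable at runtime.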
2 | # from ._utils import _C 3 | from utils import _C 4 | 5 | nms = _C.nms 6 | # nms.__doc__ = """ 7 | # This function performs Non-maximum suppresion""" 8 | -------------------------------------------------------------------------------- /detector/tracker/utils/parse_config.py: -------------------------------------------------------------------------------- 1 | def parse_model_cfg(path): 2 | """Parses the yolo-v3 layer configuration file and returns module definitions""" 3 | file = open(path, 'r') 4 | lines = file.read().split('\n') 5 | lines = [x for x in lines if x and not x.startswith('#')] 6 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces 7 | module_defs = [] 8 | for line in lines: 9 | if line.startswith('['): # This marks the start of a new block 10 | module_defs.append({}) 11 | module_defs[-1]['type'] = line[1:-1].rstrip() 12 | if module_defs[-1]['type'] == 'convolutional': 13 | module_defs[-1]['batch_normalize'] = 0 14 | else: 15 | key, value = line.split("=") 16 | value = value.strip() 17 | module_defs[-1][key.rstrip()] = value.strip() 18 | 19 | return module_defs 20 | 21 | 22 | def parse_data_cfg(path): 23 | """Parses the data configuration file""" 24 | options = dict() 25 | options['gpus'] = '0' 26 | options['num_workers'] = '10' 27 | with open(path, 'r') as fp: 28 | lines = fp.readlines() 29 | for line in lines: 30 | line = line.strip() 31 | if line == '' or line.startswith('#'): 32 | continue 33 | key, value = line.split('=') 34 | options[key.strip()] = value.strip() 35 | return options 36 | -------------------------------------------------------------------------------- /detector/tracker/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | self.duration = 0. 21 | 22 | def tic(self): 23 | # using time.time instead of time.clock because time time.clock 24 | # does not normalize for multithreading 25 | self.start_time = time.time() 26 | 27 | def toc(self, average=True): 28 | self.diff = time.time() - self.start_time 29 | self.total_time += self.diff 30 | self.calls += 1 31 | self.average_time = self.total_time / self.calls 32 | if average: 33 | self.duration = self.average_time 34 | else: 35 | self.duration = self.diff 36 | return self.duration 37 | 38 | def clear(self): 39 | self.total_time = 0. 40 | self.calls = 0 41 | self.start_time = 0. 42 | self.diff = 0. 43 | self.average_time = 0. 44 | self.duration = 0. 
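    # Usage sketch (assumes a hypothetical `run_step` workload): wrap a timed
    # region with tic()/toc(). toc(average=True) returns the running average
    # over all calls so far, while toc(average=False) returns the duration of
    # the most recent tic()/toc() pair.
    #
    #     timer = Timer()
    #     for _ in range(5):
    #         timer.tic()
    #         run_step()                      # hypothetical workload
    #         last = timer.toc(average=False)
    #     print('average per call:', timer.average_time)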
45 | 46 | -------------------------------------------------------------------------------- /detector/tracker/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | def tlwhs_to_tlbrs(tlwhs): 6 | tlbrs = np.copy(tlwhs) 7 | if len(tlbrs) == 0: 8 | return tlbrs 9 | tlbrs[:, 2] += tlwhs[:, 0] 10 | tlbrs[:, 3] += tlwhs[:, 1] 11 | return tlbrs 12 | 13 | 14 | def get_color(idx): 15 | idx = idx * 3 16 | color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) 17 | 18 | return color 19 | 20 | 21 | def resize_image(image, max_size=800): 22 | if max(image.shape[:2]) > max_size: 23 | scale = float(max_size) / max(image.shape[:2]) 24 | image = cv2.resize(image, None, fx=scale, fy=scale) 25 | return image 26 | 27 | 28 | def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None): 29 | im = np.ascontiguousarray(np.copy(image)) 30 | im_h, im_w = im.shape[:2] 31 | 32 | top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 33 | 34 | text_scale = max(1, image.shape[1] / 1600.) 35 | text_thickness = 1 if text_scale > 1.1 else 1 36 | line_thickness = max(1, int(image.shape[1] / 500.)) 37 | 38 | radius = max(5, int(im_w/140.)) 39 | cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), 40 | (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) 41 | 42 | for i, tlwh in enumerate(tlwhs): 43 | x1, y1, w, h = tlwh 44 | intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) 45 | obj_id = int(obj_ids[i]) 46 | id_text = '{}'.format(int(obj_id)) 47 | if ids2 is not None: 48 | id_text = id_text + ', {}'.format(int(ids2[i])) 49 | _line_thickness = 1 if obj_id <= 0 else line_thickness 50 | color = get_color(abs(obj_id)) 51 | cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) 52 | cv2.putText(im, id_text, (intbox[0], intbox[1] + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), 53 | thickness=text_thickness) 54 | return im 55 | 56 | 57 | def plot_trajectory(image, tlwhs, track_ids): 58 | image = image.copy() 59 | for one_tlwhs, track_id in zip(tlwhs, track_ids): 60 | color = get_color(int(track_id)) 61 | for tlwh in one_tlwhs: 62 | x1, y1, w, h = tuple(map(int, tlwh)) 63 | cv2.circle(image, (int(x1 + 0.5 * w), int(y1 + h)), 2, color, thickness=2) 64 | 65 | return image 66 | 67 | 68 | def plot_detections(image, tlbrs, scores=None, color=(255, 0, 0), ids=None): 69 | im = np.copy(image) 70 | text_scale = max(1, image.shape[1] / 800.) 
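    # Each row of `tlbrs` is (x1, y1, x2, y2, ...); when a detection row has 7+
    # entries, column 5 (> 0 or not) distinguishes 'det' from 'trk' boxes and
    # column 6 carries the confidence score rendered next to the box.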
71 | thickness = 2 if text_scale > 1.3 else 1 72 | for i, det in enumerate(tlbrs): 73 | x1, y1, x2, y2 = np.asarray(det[:4], dtype=np.int) 74 | if len(det) >= 7: 75 | label = 'det' if det[5] > 0 else 'trk' 76 | if ids is not None: 77 | text = '{}# {:.2f}: {:d}'.format(label, det[6], ids[i]) 78 | cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), 79 | thickness=thickness) 80 | else: 81 | text = '{}# {:.2f}'.format(label, det[6]) 82 | 83 | if scores is not None: 84 | text = '{:.2f}'.format(scores[i]) 85 | cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), 86 | thickness=thickness) 87 | 88 | cv2.rectangle(im, (x1, y1), (x2, y2), color, 2) 89 | 90 | return im 91 | -------------------------------------------------------------------------------- /detector/tracker_cfg.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | cfg = edict() 4 | cfg.CONFIG = '../detector/tracker/cfg/yolov3.cfg' 5 | cfg.WEIGHTS = '../data/models/detector_models/jde.uncertainty.pt' 6 | cfg.IMG_SIZE = (1088, 608) 7 | cfg.NMS_THRES = 0.4 8 | cfg.CONFIDENCE = 0.2 9 | cfg.BUFFER_SIZE = 30 # frame buffer 10 | -------------------------------------------------------------------------------- /detector/yolo/README.md: -------------------------------------------------------------------------------- 1 | # A PyTorch implementation of a YOLO v3 Object Detector 2 | 3 | Forked from https://github.com/ayooshkathuria/pytorch-yolo-v3 4 | -------------------------------------------------------------------------------- /detector/yolo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/yolo/__init__.py -------------------------------------------------------------------------------- /detector/yolo/bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = (box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2, args=None): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 
| #get the corrdinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | if not args: 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) 70 | else: 71 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).to(args.device))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).to(args.device)) 72 | #Union Area 73 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 74 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 75 | 76 | iou = inter_area / (b1_area + b2_area - inter_area) 77 | 78 | return iou 79 | 80 | 81 | def pred_corner_coord(prediction): 82 | #Get indices of non-zero confidence bboxes 83 | ind_nz = torch.nonzero(prediction[:,:,4], as_tuple=False).transpose(0,1).contiguous() 84 | 85 | box = prediction[ind_nz[0], ind_nz[1]] 86 | 87 | 88 | box_a = box.new(box.shape) 89 | box_a[:,0] = (box[:,0] - box[:,2]/2) 90 | box_a[:,1] = (box[:,1] - box[:,3]/2) 91 | box_a[:,2] = (box[:,0] + box[:,2]/2) 92 | box_a[:,3] = (box[:,1] + box[:,3]/2) 93 | box[:,:4] = box_a[:,:4] 94 | 95 | prediction[ind_nz[0], ind_nz[1]] = box 96 | 97 | return prediction 98 | 99 | 100 | 101 | 102 | def write(x, batches, results, colors, classes): 103 | c1 = tuple(x[1:3].int()) 104 | c2 = tuple(x[3:5].int()) 105 | img = results[int(x[0])] 106 | cls = int(x[-1]) 107 | label = "{0}".format(classes[cls]) 108 | color = random.choice(colors) 109 | cv2.rectangle(img, c1, c2,color, 1) 110 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 111 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 112 | cv2.rectangle(img, c1, c2,color, -1) 113 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 114 | return img 115 | -------------------------------------------------------------------------------- /detector/yolo/cam_demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | from darknet import Darknet 10 | from preprocess import prep_image, inp_to_image 11 | import pandas as pd 12 | import random 13 | import argparse 14 | import pickle as pkl 15 | 16 | def get_test_input(input_dim, CUDA): 17 | img = cv2.imread("imgs/messi.jpg") 18 | img = cv2.resize(img, (input_dim, input_dim)) 19 | img_ = img[:,:,::-1].transpose((2,0,1)) 20 | img_ = img_[np.newaxis,:,:,:]/255.0 21 | img_ = torch.from_numpy(img_).float() 22 | img_ = Variable(img_) 23 | 24 | if CUDA: 25 | img_ = img_.cuda() 26 | 27 | return img_ 28 | 29 | def prep_image(img, inp_dim): 30 | """ 31 | Prepare image for inputting to the neural network. 
32 | 33 | Returns a Variable 34 | """ 35 | 36 | orig_im = img 37 | dim = orig_im.shape[1], orig_im.shape[0] 38 | img = cv2.resize(orig_im, (inp_dim, inp_dim)) 39 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 40 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 41 | return img_, orig_im, dim 42 | 43 | def write(x, img): 44 | c1 = tuple(x[1:3].int()) 45 | c2 = tuple(x[3:5].int()) 46 | cls = int(x[-1]) 47 | label = "{0}".format(classes[cls]) 48 | color = random.choice(colors) 49 | cv2.rectangle(img, c1, c2,color, 1) 50 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 51 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 52 | cv2.rectangle(img, c1, c2,color, -1) 53 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 54 | return img 55 | 56 | def arg_parse(): 57 | """ 58 | Parse arguements to the detect module 59 | 60 | """ 61 | 62 | 63 | parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') 64 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.25) 65 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 66 | parser.add_argument("--reso", dest = 'reso', help = 67 | "Input resolution of the network. Increase to increase accuracy. Decrease to increase speed", 68 | default = "160", type = str) 69 | return parser.parse_args() 70 | 71 | 72 | 73 | if __name__ == '__main__': 74 | cfgfile = "cfg/yolov3-spp.cfg" 75 | weightsfile = "yolov3-spp.weights" 76 | num_classes = 80 77 | 78 | args = arg_parse() 79 | confidence = float(args.confidence) 80 | nms_thesh = float(args.nms_thresh) 81 | start = 0 82 | CUDA = torch.cuda.is_available() 83 | 84 | 85 | 86 | 87 | num_classes = 80 88 | bbox_attrs = 5 + num_classes 89 | 90 | model = Darknet(cfgfile) 91 | model.load_weights(weightsfile) 92 | 93 | model.net_info["height"] = args.reso 94 | inp_dim = int(model.net_info["height"]) 95 | 96 | assert inp_dim % 32 == 0 97 | assert inp_dim > 32 98 | 99 | if CUDA: 100 | model.cuda() 101 | 102 | model.eval() 103 | 104 | videofile = 'video.avi' 105 | 106 | cap = cv2.VideoCapture(0) 107 | 108 | assert cap.isOpened(), 'Cannot capture source' 109 | 110 | frames = 0 111 | start = time.time() 112 | while cap.isOpened(): 113 | 114 | ret, frame = cap.read() 115 | if ret: 116 | 117 | img, orig_im, dim = prep_image(frame, inp_dim) 118 | 119 | # im_dim = torch.FloatTensor(dim).repeat(1,2) 120 | 121 | 122 | if CUDA: 123 | im_dim = im_dim.cuda() 124 | img = img.cuda() 125 | 126 | 127 | output = model(Variable(img), CUDA) 128 | output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh) 129 | 130 | if type(output) == int: 131 | frames += 1 132 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 133 | cv2.imshow("frame", orig_im) 134 | key = cv2.waitKey(1) 135 | if key & 0xFF == ord('q'): 136 | break 137 | continue 138 | 139 | 140 | 141 | output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(inp_dim))/inp_dim 142 | 143 | # im_dim = im_dim.repeat(output.size(0), 1) 144 | output[:,[1,3]] *= frame.shape[1] 145 | output[:,[2,4]] *= frame.shape[0] 146 | 147 | 148 | classes = load_classes('data/coco.names') 149 | colors = pkl.load(open("pallete", "rb")) 150 | 151 | list(map(lambda x: write(x, orig_im), output)) 152 | 153 | 154 | cv2.imshow("frame", orig_im) 155 | key = cv2.waitKey(1) 156 | if key & 0xFF == ord('q'): 157 | break 158 | frames += 1 159 | print("FPS of the video 
is {:5.2f}".format( frames / (time.time() - start))) 160 | 161 | 162 | else: 163 | break 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /detector/yolo/cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /detector/yolo/cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | 
stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /detector/yolo/cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 
12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | 
activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /detector/yolo/detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | import argparse 10 | import os 11 | import os.path as osp 12 | from darknet import Darknet 13 | from preprocess import prep_image, inp_to_image 14 | import pandas as pd 15 | import random 16 | import pickle as pkl 17 | import itertools 18 | 19 | 20 | if __name__ == '__main__': 21 | 22 | scales = "1,2,3" 23 | images = "imgs/messi.jpg" 24 | batch_size = 1 25 | confidence = 0.5 26 | nms_thesh = 0.4 27 | 28 | CUDA = torch.cuda.is_available() 29 | 30 | num_classes = 80 31 | classes = load_classes('data/coco.names') 32 | 33 | #Set up the neural network 34 | print("Loading network.....") 35 | model = Darknet("cfg/yolov3-spp.cfg") 36 | model.load_weights("yolov3-spp.weights") 37 | print("Network successfully loaded") 38 | 39 | model.net_info["height"] = "608" 40 | inp_dim = int(model.net_info["height"]) 41 | assert inp_dim % 32 == 0 42 | assert inp_dim > 32 43 | 44 | #If there's a GPU availible, put the model on GPU 45 | if CUDA: 46 | model.cuda() 47 | 48 | #Set the model in evaluation mode 49 | model.eval() 50 | 51 | #Detection phase 52 | try: 53 | imlist = [] 54 | imlist.append(osp.join(osp.realpath('.'), images)) 55 | except FileNotFoundError: 56 | print ("No file or directory with the name {}".format(images)) 57 | exit() 58 | 59 | batches = list(map(prep_image, imlist, [inp_dim for x in range(len(imlist))])) 60 | im_batches = [x[0] for x in batches] 61 | orig_ims = [x[1] for x in batches] 62 | im_dim_list = [x[2] for x in batches] 63 | im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2) 64 | 65 | if CUDA: 66 | im_dim_list = im_dim_list.cuda() 67 | 68 | 69 | for batch in im_batches: 70 | #load the image 71 | if CUDA: 72 | batch = batch.cuda() 73 | with torch.no_grad(): 74 | prediction = model(Variable(batch), CUDA) 75 | 76 | prediction = write_results(prediction, confidence, num_classes, nms=True, nms_conf=nms_thesh) 77 | output = prediction 78 | 79 | if CUDA: 80 | torch.cuda.synchronize() 81 | 82 | try: 83 | output 84 | except NameError: 85 | print("No detections were made") 86 | exit() 87 | print(im_dim_list.shape) 88 | im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long()) 89 | 90 | scaling_factor = torch.min(inp_dim/im_dim_list,1)[0].view(-1,1) 91 | 92 | 93 | output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2 94 | output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2 95 | 96 | output[:,1:5] /= scaling_factor 97 | 98 | for i in range(output.shape[0]): 99 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0]) 100 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1]) 101 | 102 | print(output) 103 | print(output.shape) 104 | -------------------------------------------------------------------------------- 
/detector/yolo/pallete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/yolo/pallete -------------------------------------------------------------------------------- /detector/yolo/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import numpy as np 5 | import cv2 6 | 7 | try: 8 | from util import count_parameters as count 9 | from util import convert2cpu as cpu 10 | except ImportError: 11 | from yolo.util import count_parameters as count 12 | from yolo.util import convert2cpu as cpu 13 | from PIL import Image, ImageDraw 14 | 15 | 16 | def letterbox_image(img, inp_dim): 17 | '''resize image with unchanged aspect ratio using padding''' 18 | img_w, img_h = img.shape[1], img.shape[0] 19 | w, h = inp_dim 20 | new_w = int(img_w * min(w / img_w, h / img_h)) 21 | new_h = int(img_h * min(w / img_w, h / img_h)) 22 | resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC) 23 | 24 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 25 | 26 | canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image 27 | 28 | return canvas 29 | 30 | 31 | def prep_image(img, inp_dim): 32 | """ 33 | Prepare image for inputting to the neural network. 34 | 35 | Returns a Variable 36 | """ 37 | 38 | orig_im = cv2.imread(img) 39 | dim = orig_im.shape[1], orig_im.shape[0] 40 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 41 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 43 | return img_, orig_im, dim 44 | 45 | 46 | def prep_frame(img, inp_dim): 47 | """ 48 | Prepare image for inputting to the neural network. 
49 | 50 | Returns a Variable 51 | """ 52 | 53 | orig_im = img 54 | dim = orig_im.shape[1], orig_im.shape[0] 55 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 56 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 57 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 58 | return img_, orig_im, dim 59 | 60 | 61 | def prep_image_pil(img, network_dim): 62 | orig_im = Image.open(img) 63 | img = orig_im.convert('RGB') 64 | dim = img.size 65 | img = img.resize(network_dim) 66 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) 67 | img = img.view(*network_dim, 3).transpose(0, 1).transpose(0, 2).contiguous() 68 | img = img.view(1, 3, *network_dim) 69 | img = img.float().div(255.0) 70 | return (img, orig_im, dim) 71 | 72 | 73 | def inp_to_image(inp): 74 | inp = inp.cpu().squeeze() 75 | inp = inp * 255 76 | try: 77 | inp = inp.data.numpy() 78 | except RuntimeError: 79 | inp = inp.numpy() 80 | inp = inp.transpose(1, 2, 0) 81 | 82 | inp = inp[:, :, ::-1] 83 | return inp 84 | -------------------------------------------------------------------------------- /detector/yolo_cfg.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | cfg = edict() 4 | cfg.CONFIG = '../detector/yolo/cfg/yolov3-spp.cfg' 5 | cfg.WEIGHTS = '../data/models/detector_models/yolov3-spp.weights' 6 | cfg.INP_DIM = 608 7 | cfg.NMS_THRES = 0.6 8 | cfg.CONFIDENCE = 0.01 9 | cfg.NUM_CLASSES = 80 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | 6 | import numpy as np 7 | import torch 8 | from setuptools import setup, Extension, find_packages 9 | from torch.utils.cpp_extension import CUDA_HOME 10 | from torch.utils.cpp_extension import CppExtension 11 | from torch.utils.cpp_extension import CUDAExtension 12 | from Cython.Build import cythonize 13 | import platform 14 | 15 | def make_cython_ext(name, module, sources): 16 | extra_compile_args = None 17 | if platform.system() != 'Windows': 18 | extra_compile_args = { 19 | 'cxx': ['-Wno-unused-function', '-Wno-write-strings'] 20 | } 21 | 22 | extension = Extension( 23 | '{}.{}'.format(module, name), 24 | [os.path.join(*module.split('.'), p) for p in sources], 25 | include_dirs=[np.get_include()], 26 | language='c++', 27 | extra_compile_args=extra_compile_args) 28 | extension, = cythonize(extension) 29 | return extension 30 | 31 | 32 | def make_cuda_ext(name, module, sources): 33 | 34 | return CUDAExtension( 35 | name='{}.{}'.format(module, name), 36 | sources=[os.path.join(*module.split('.'), p) for p in sources], 37 | extra_compile_args={ 38 | 'cxx': [], 39 | 'nvcc': [ 40 | '-D__CUDA_NO_HALF_OPERATORS__', 41 | '-D__CUDA_NO_HALF_CONVERSIONS__', 42 | '-D__CUDA_NO_HALF2_OPERATORS__', 43 | ] 44 | }) 45 | 46 | 47 | def get_extensions(): 48 | this_dir = os.path.dirname(os.path.abspath(__file__)) 49 | extensions_dir = os.path.join(this_dir, "alphaction/csrc") 50 | 51 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 52 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 53 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 54 | 55 | sources = main_file + source_cpu 56 | extension = CppExtension 57 | 58 | extra_compile_args = {"cxx": []} 59 | define_macros = [] 60 | 61 | if (torch.cuda.is_available() and CUDA_HOME is not None) or 
os.getenv("FORCE_CUDA", "0") == "1": 62 | extension = CUDAExtension 63 | sources += source_cuda 64 | define_macros += [("WITH_CUDA", None)] 65 | extra_compile_args["nvcc"] = [ 66 | "-O3", 67 | "-DCUDA_HAS_FP16=1", 68 | "-D__CUDA_NO_HALF_OPERATORS__", 69 | "-D__CUDA_NO_HALF_CONVERSIONS__", 70 | "-D__CUDA_NO_HALF2_OPERATORS__", 71 | ] 72 | 73 | sources = [os.path.join(extensions_dir, s) for s in sources] 74 | 75 | include_dirs = [extensions_dir] 76 | 77 | ext_modules = [ 78 | extension( 79 | "alphaction._custom_cuda_ext", 80 | sources, 81 | include_dirs=include_dirs, 82 | define_macros=define_macros, 83 | extra_compile_args=extra_compile_args, 84 | ), 85 | make_cython_ext( 86 | name='soft_nms_cpu', 87 | module='detector.nms', 88 | sources=['src/soft_nms_cpu.pyx']), 89 | make_cuda_ext( 90 | name='nms_cpu', 91 | module='detector.nms', 92 | sources=['src/nms_cpu.cpp']), 93 | make_cuda_ext( 94 | name='nms_cuda', 95 | module='detector.nms', 96 | sources=['src/nms_cuda.cpp', 'src/nms_kernel.cu']), 97 | 98 | ] 99 | 100 | return ext_modules 101 | 102 | 103 | setup( 104 | name="alphaction", 105 | author="yelantf", 106 | url="https://github.com/MVIG-SJTU/AlphAction", 107 | ext_modules=get_extensions(), 108 | packages=find_packages(".", exclude=[ 109 | "config_files", "demo", "gifs", "tools", "data", 110 | ]), 111 | install_requires=[ 112 | "tqdm", 113 | "yacs", 114 | "opencv-python", 115 | "tensorboardX", 116 | "SciPy", 117 | "matplotlib", 118 | "cython-bbox", 119 | "easydict", 120 | ], 121 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 122 | ) 123 | -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from alphaction.config import cfg 6 | from alphaction.dataset import make_data_loader 7 | from alphaction.engine.inference import inference 8 | from alphaction.modeling.detector import build_detection_model 9 | from alphaction.utils.checkpoint import ActionCheckpointer 10 | from torch.utils.collect_env import get_pretty_env_info 11 | from alphaction.utils.comm import synchronize, get_rank 12 | from alphaction.utils.IA_helper import has_memory 13 | from alphaction.utils.logger import setup_logger 14 | #pytorch issuse #973 15 | import resource 16 | 17 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 18 | resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1], rlimit[1])) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") 23 | parser.add_argument( 24 | "--config-file", 25 | default="", 26 | metavar="FILE", 27 | help="path to config file", 28 | ) 29 | parser.add_argument("--local_rank", type=int, default=0) 30 | parser.add_argument( 31 | "opts", 32 | help="Modify config options using the command-line", 33 | default=None, 34 | nargs=argparse.REMAINDER, 35 | ) 36 | 37 | args = parser.parse_args() 38 | 39 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 40 | distributed = num_gpus > 1 41 | 42 | if distributed: 43 | torch.cuda.set_device(args.local_rank) 44 | torch.distributed.init_process_group( 45 | backend="nccl", init_method="env://" 46 | ) 47 | 48 | # Merge config file. 49 | cfg.merge_from_file(args.config_file) 50 | cfg.merge_from_list(args.opts) 51 | cfg.freeze() 52 | 53 | # Print experimental infos. 
54 | save_dir = "" 55 | logger = setup_logger("alphaction", save_dir, get_rank()) 56 | logger.info("Using {} GPUs".format(num_gpus)) 57 | logger.info(cfg) 58 | 59 | logger.info("Collecting env info (might take some time)") 60 | logger.info("\n" + get_pretty_env_info()) 61 | 62 | # Build the model. 63 | model = build_detection_model(cfg) 64 | model.to("cuda") 65 | 66 | # load weight. 67 | output_dir = cfg.OUTPUT_DIR 68 | checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir) 69 | checkpointer.load(cfg.MODEL.WEIGHT) 70 | 71 | output_folders = [None] * len(cfg.DATASETS.TEST) 72 | dataset_names = cfg.DATASETS.TEST 73 | mem_active = has_memory(cfg.MODEL.IA_STRUCTURE) 74 | if cfg.OUTPUT_DIR: 75 | for idx, dataset_name in enumerate(dataset_names): 76 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) 77 | os.makedirs(output_folder, exist_ok=True) 78 | output_folders[idx] = output_folder 79 | 80 | # Do inference. 81 | data_loaders_test = make_data_loader(cfg, is_train=False, is_distributed=distributed) 82 | for output_folder, dataset_name, data_loader_test in zip(output_folders, dataset_names, data_loaders_test): 83 | inference( 84 | model, 85 | data_loader_test, 86 | dataset_name, 87 | mem_active=mem_active, 88 | output_folder=output_folder, 89 | ) 90 | synchronize() 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | --------------------------------------------------------------------------------
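Usage note for the config plumbing in test_net.py above — a minimal sketch, assuming
only the yacs-style `cfg` exposed by `alphaction.config` and using a placeholder
checkpoint path: the script first merges a YAML file from `config_files/` and then
applies the trailing `KEY VALUE` pairs collected in `args.opts`, which is equivalent to

    from alphaction.config import cfg

    cfg.merge_from_file("config_files/resnet101_8x8f_denseserial.yaml")
    cfg.merge_from_list(["MODEL.WEIGHT", "path/to/checkpoint.pth"])  # placeholder path
    cfg.freeze()

When executed as a script, test_net.py is typically driven by the PyTorch distributed
launcher, e.g. `python -m torch.distributed.launch --nproc_per_node=<num_gpus>
test_net.py --config-file <yaml> [KEY VALUE ...]`, which supplies the `--local_rank`
argument and the `WORLD_SIZE` environment variable that the code above reads.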