├── .flake8 ├── .gitignore ├── DATA.md ├── GETTING_STARTED.md ├── INSTALL.md ├── MODEL_ZOO.md ├── README.md ├── alphaction ├── __init__.py ├── config │ ├── __init__.py │ ├── defaults.py │ └── paths_catalog.py ├── csrc │ ├── ROIAlign3d.h │ ├── ROIPool3d.h │ ├── SigmoidFocalLoss.h │ ├── SoftmaxFocalLoss.h │ ├── cpu │ │ └── vision.h │ ├── cuda │ │ ├── ROIAlign3d_cuda.cu │ │ ├── ROIPool3d_cuda.cu │ │ ├── SigmoidFocalLoss_cuda.cu │ │ ├── SoftmaxFocalLoss_cuda.cu │ │ └── vision.h │ └── vision.cpp ├── dataset │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── ava.py │ │ ├── concat_dataset.py │ │ └── evaluation │ │ │ ├── __init__.py │ │ │ └── ava │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── ava_eval.py │ │ │ └── pascal_evaluation │ │ │ ├── __init__.py │ │ │ ├── label_map_util.py │ │ │ ├── metrics.py │ │ │ ├── np_box_list.py │ │ │ ├── np_box_list_ops.py │ │ │ ├── np_box_mask_list.py │ │ │ ├── np_box_mask_list_ops.py │ │ │ ├── np_box_ops.py │ │ │ ├── np_mask_ops.py │ │ │ ├── object_detection_evaluation.py │ │ │ ├── per_image_evaluation.py │ │ │ └── standard_fields.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ ├── grouped_batch_sampler.py │ │ └── iteration_based_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ ├── object_transforms.py │ │ └── video_transforms.py ├── engine │ ├── __init__.py │ ├── inference.py │ └── trainer.py ├── layers │ ├── __init__.py │ ├── batch_norm.py │ ├── roi_align_3d.py │ ├── roi_pool_3d.py │ ├── sigmoid_focal_loss.py │ └── softmax_focal_loss.py ├── modeling │ ├── __init__.py │ ├── backbone │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── i3d.py │ │ └── slowfast.py │ ├── common_blocks.py │ ├── detector │ │ ├── __init__.py │ │ └── action_detector.py │ ├── nonlocal_block.py │ ├── poolers.py │ ├── registry.py │ ├── roi_heads │ │ ├── __init__.py │ │ ├── action_head │ │ │ ├── IA_structure.py │ │ │ ├── __init__.py │ │ │ ├── action_head.py │ │ │ ├── inference.py │ │ │ ├── loss.py │ │ │ ├── metric.py │ │ │ ├── roi_action_feature_extractor.py │ │ │ └── roi_action_predictors.py │ │ └── roi_heads_3d.py │ └── utils.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── structures │ ├── __init__.py │ ├── bounding_box.py │ └── memory_pool.py └── utils │ ├── IA_helper.py │ ├── __init__.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── comm.py │ ├── logger.py │ ├── metric_logger.py │ ├── model_serialization.py │ ├── random_seed.py │ ├── registry.py │ └── video_decode.py ├── config_files ├── resnet101_8x8f_baseline.yaml ├── resnet101_8x8f_denseserial.yaml ├── resnet50_4x16f_baseline.yaml ├── resnet50_4x16f_denseserial.yaml ├── resnet50_4x16f_parallel.yaml └── resnet50_4x16f_serial.yaml ├── demo ├── README.md ├── Roboto-Bold.ttf ├── action_predictor.py ├── demo.py ├── video_detection_loader.py └── visualizer.py ├── detector ├── __init__.py ├── apis.py ├── nms │ ├── __init__.py │ ├── nms_wrapper.py │ └── src │ │ ├── nms_cpu.cpp │ │ ├── nms_cuda.cpp │ │ ├── nms_kernel.cu │ │ └── soft_nms_cpu.pyx ├── tracker │ ├── README.md │ ├── __init__.py │ ├── cfg │ │ ├── ccmcpe.json │ │ └── yolov3.cfg │ ├── models.py │ ├── preprocess.py │ ├── tracker │ │ ├── __init__.py │ │ ├── basetrack.py │ │ ├── matching.py │ │ └── multitracker.py │ └── utils │ │ ├── __init__.py │ │ ├── datasets.py │ │ ├── evaluation.py │ │ ├── io.py │ │ ├── kalman_filter.py │ │ ├── log.py │ │ ├── nms.py │ │ ├── parse_config.py │ │ ├── timer.py │ │ ├── utils.py │ │ └── visualization.py ├── tracker_api.py ├── 
tracker_cfg.py ├── yolo │ ├── README.md │ ├── __init__.py │ ├── bbox.py │ ├── cam_demo.py │ ├── cfg │ │ ├── tiny-yolo-voc.cfg │ │ ├── yolo-voc.cfg │ │ ├── yolo.cfg │ │ ├── yolov3-spp.cfg │ │ └── yolov3.cfg │ ├── darknet.py │ ├── detect.py │ ├── pallete │ ├── preprocess.py │ ├── util.py │ ├── video_demo.py │ └── video_demo_half.py ├── yolo_api.py └── yolo_cfg.py ├── setup.py ├── test_net.py ├── tools └── ava │ ├── csv2COCO.py │ └── process_ava_videos.py └── train_net.py /.flake8: -------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 3 | 4 | [flake8] 5 | ignore = E203, E266, E501, W503 6 | max-line-length = 80 7 | max-complexity = 18 8 | select = B,C,E,F,W,T4,B9 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # compilation and distribution 2 | __pycache__ 3 | _ext 4 | *.pyc 5 | *.so 6 | *.egg-info/ 7 | build/ 8 | dist/ 9 | 10 | # pytorch/python/numpy formats 11 | *.pth 12 | *.pkl 13 | *.npy 14 | *.pt 15 | 16 | # ipython/jupyter notebooks 17 | *.ipynb 18 | **/.ipynb_checkpoints/ 19 | 20 | # Editor temporaries 21 | *.swn 22 | *.swo 23 | *.swp 24 | *~ 25 | .DS_Store 26 | 27 | # Pycharm editor settings 28 | .idea 29 | 30 | # project dirs 31 | /datasets 32 | /models 33 | /data 34 | /detector/nms/src/soft_nms_cpu.cpp -------------------------------------------------------------------------------- /DATA.md: -------------------------------------------------------------------------------- 1 | ## Data Preparation 2 | 3 | ### Easy Version 4 | 5 | 1. Download the tar.gz file from [[here]](https://pan.baidu.com/s/1UrflK4IgiVbVBOP5fDHdKA) with code `q5v5`. 6 | 7 | 2. run following commands to unzip the file and create a 8 | symbolic link to the extracted files. 9 | 10 | ```bash 11 | tar zxvf AVA_compress.tar.gz -C /some/path/ 12 | cd /path/to/AlphAction/ 13 | mkdir data 14 | ln -s /some/path/AVA data/AVA 15 | ``` 16 | 17 | ### Step-by-step Version 18 | 19 | 1. **Download Annotations.** Donwload AVA Actions annotations from the 20 | [official dataset website](https://research.google.com/ava/download.html). 21 | Organize those annotations file as following structure: 22 | 23 | ``` 24 | AVA/ 25 | |_ annotations/ 26 | | |_ ava_action_list_v2.2.pbtxt 27 | | |_ ava_action_list_v2.2_for_activitynet_2019.pbtxt 28 | | |_ ava_include_timestamps_v2.2.txt 29 | | |_ ava_train_excluded_timestamps_v2.2.csv 30 | | |_ ava_val_excluded_timestamps_v2.2.csv 31 | | |_ ava_train_v2.2.csv 32 | | |_ ava_val_v2.2.csv 33 | ``` 34 | 35 | 2. **Download Videos.** Download the list of training/validation file names 36 | from [CVDF repository](https://github.com/cvdfoundation/ava-dataset) and 37 | download all videos following those links provided there. Place 38 | the list file and video files as follows: 39 | 40 | ``` 41 | AVA/ 42 | |_ annotations/ 43 | | |_ ava_file_names_trainval_v2.1.txt 44 | |_ movies/ 45 | | |_ trainval/ 46 | | | |_ .mp4 47 | | | |_ ... 48 | | | |_ .mp4 49 | ``` 50 | 51 | 3. **Create Symbolic Link.** Create a symbolic link that 52 | references the AVA dataset directory by running following 53 | commands. 54 | 55 | ```shell 56 | cd /path/to/AlphAction 57 | mkdir data 58 | ln -s /path/to/AVA data/AVA 59 | ``` 60 | 61 | 4. **Preprocess Videos.** Running following commands to 62 | process raw movies. 
63 | 64 | ```shell 65 | python tools/process_ava_videos.py \ 66 | --movie_root data/AVA/movies/trainval \ 67 | --clip_root data/AVA/clips/trainval \ 68 | --kframe_root data/AVA/keyframes/trainval \ 69 | --process_num $[`nproc`/2] 70 | ``` 71 | 72 | This script extracts video clips and key frames from 73 | those raw movies. Each video clip lasts exactly one 74 | second and ranges from second 895 to second 1805. 75 | All video clips are scaled such that the shortest side 76 | becomes no larger than 360 and transcoded to have fps 25. 77 | The first frame of each video clip is extracted as key 78 | frame, which follows the definition in AVA dataset. 79 | (Key frames are only used to detect persons and objects.) 80 | The output video clips and key frames will be saved as follows: 81 | 82 | ``` 83 | AVA/ 84 | |_ clips/ 85 | | |_ trainval/ 86 | | | |_ 87 | | | | |_ [895~1805].mp4 88 | | | |_ ... 89 | | | |_ 90 | | | | |_ [895~1805].mp4 91 | |_ keyframes/ 92 | | |_ trainval/ 93 | | | |_ 94 | | | | |_ [895~1805].jpg 95 | | | |_ ... 96 | | | |_ 97 | | | | |_ [895~1805].jpg 98 | ``` 99 | 100 | 5. **Convert Annotations.** Our codes use COCO-style anntations, 101 | so we have to convert official csv annotations into COCO json format 102 | by running following commands. 103 | 104 | ```shell 105 | python tools/csv2COCO.py \ 106 | --csv_path data/AVA/annotations/ava_train_v2.2.csv \ 107 | --movie_list data/AVA/annotations/ava_file_names_trainval_v2.1.txt \ 108 | --img_root data/AVA/keyframes/trainval 109 | python tools/csv2COCO.py \ 110 | --csv_path data/AVA/annotations/ava_val_v2.2.csv \ 111 | --movie_list data/AVA/annotations/ava_file_names_trainval_v2.1.txt \ 112 | --img_root data/AVA/keyframes/trainval 113 | ``` 114 | 115 | The converted json files will be stored in `AVA/annotations` directory 116 | as follows, `*_min.json` means that the json file has no space indent. 117 | 118 | Alternatively, you could just download our json files 119 | here([train](https://drive.google.com/file/d/1BLCMkcnWusaqhHNrDjxzTOMFNT_W-rbr/view?usp=sharing), 120 | [val](https://drive.google.com/file/d/1A9_ywPZA4kr3qM8e27yvxkAF5idcQRme/view?usp=sharing)). 121 | 122 | ``` 123 | AVA/ 124 | |_ annotations/ 125 | | |_ ava_train_v2.2.json 126 | | |_ ava_train_v2.2_min.json 127 | | |_ ava_val_v2.2.json 128 | | |_ ava_val_v2.2_min.json 129 | ``` 130 | 131 | 6. **Detect Persons and Objects.** The predicted person boxes 132 | for AVA validation set can be donwloaded [[here]](https://drive.google.com/file/d/1XnPoJqTVtBVF3XxpFtvDTZQ6EFW8b2S4/view?usp=sharing). 133 | Note that we only use ground truth person boxes for training. 134 | The object boxes files are also available for download([train](https://drive.google.com/file/d/17nH47vH4q9fCs-fs4lQ9QV1POGWzQloh/view?usp=sharing), 135 | [val](https://drive.google.com/file/d/1DcXdaSkwR5Ga50kowe1OEbSQy3AKp57L/view?usp=sharing)). 136 | These files should be placed at following locations. 137 | 138 | ``` 139 | AVA/ 140 | |_ boxes/ 141 | | |_ ava_val_det_person_bbox.json 142 | | |_ ava_train_det_object_bbox.json 143 | | |_ ava_val_det_object_bbox.json 144 | ``` 145 | 146 | For person detector, we first trained it on MSCOCO 147 | keypoint dataset and then fine-tuned it on AVA dataset. 148 | The final model weight is available [[here]](https://drive.google.com/file/d/1T6kx1AJe0IA-aqrpLyeRblq2uYlaopow/view?usp=sharing). 
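    As a quick, optional sanity check after downloading, the snippet below counts the boxes in each JSON file placed under `AVA/boxes/`. It assumes these files are flat COCO-style detection lists whose entries carry `image_id` and `bbox` fields; treat those key names as an illustrative assumption rather than the documented schema.

    ```python
    import json
    from collections import Counter

    # Hypothetical sanity check for the detection-box files placed above.
    # Assumption: each file is a flat COCO-style detection list, i.e. a JSON
    # array of dicts with at least "image_id" and "bbox" ([x, y, w, h]) keys.
    for path in [
        "data/AVA/boxes/ava_val_det_person_bbox.json",
        "data/AVA/boxes/ava_train_det_object_bbox.json",
        "data/AVA/boxes/ava_val_det_object_bbox.json",
    ]:
        with open(path) as f:
            dets = json.load(f)
        per_image = Counter(d["image_id"] for d in dets)
        print(f"{path}: {len(dets)} boxes across {len(per_image)} keyframes")
    ```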
149 | 150 | For object detector, we use the model provided in 151 | [maskrcnn-benchmark repository](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth), 152 | which is trained on MSCOCO dataset. Person boxes are removed 153 | from the predicted results. -------------------------------------------------------------------------------- /GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | ## Getting Started with AlphAction 2 | 3 | The hyper-parameters of each experiment are controlled by 4 | a .yaml config file, which is located in the directory 5 | `config_files`. All of these configuration files assume 6 | that we are running on 8 GPUs. We need to create a symbolic 7 | link to the directory `output`, where the output (logs and checkpoints) 8 | will be saved. Besides, we recommend to create a directory `models` to place 9 | model weights. These can be done with following commands. 10 | 11 | ```shell 12 | mkdir -p /path/to/output 13 | ln -s /path/to/output data/output 14 | mkdir -p /path/to/models 15 | ln -s /path/to/models data/models 16 | ``` 17 | 18 | ### Training 19 | 20 | Download pre-trained models from [MODEL_ZOO.md](MODEL_ZOO.md#pre-trained-models). 21 | Then place pre-trained models in `data/models` directory with following structure: 22 | 23 | ``` 24 | models/ 25 | |_ pretrained_models/ 26 | | |_ SlowFast-ResNet50-4x16.pth 27 | | |_ SlowFast-ResNet101-8x8.pth 28 | ``` 29 | 30 | To train on a single GPU, you only need to run following command. The 31 | argument `--use-tfboard` enables tensorboard to log training process. 32 | Because the config files assume that we are using 8 GPUs, the global 33 | batch size `SOLVER.VIDEOS_PER_BATCH` and `TEST.VIDEOS_PER_BATCH` can 34 | be too large for a single GPU. Therefore, in the following command, we 35 | modify the batch size and also adjust the learning rate and schedule 36 | length according to the linear scaling rule. 37 | 38 | ```shell 39 | python train_net.py --config-file "path/to/config/file.yaml" \ 40 | --transfer --no-head --use-tfboard \ 41 | SOLVER.BASE_LR 0.000125 \ 42 | SOLVER.STEPS '(560000, 720000)' \ 43 | SOLVER.MAX_ITER 880000 \ 44 | SOLVER.VIDEOS_PER_BATCH 2 \ 45 | TEST.VIDEOS_PER_BATCH 2 46 | ``` 47 | 48 | We use the launch utility `torch.distributed.launch` to launch multiple 49 | processes for distributed training on multiple gpus. `GPU_NUM` should be 50 | replaced by the number of gpus to use. Hyper-parameters in the config file 51 | can still be modified in the way used in single-GPU training. 52 | 53 | ```shell 54 | python -m torch.distributed.launch --nproc_per_node=GPU_NUM \ 55 | train_net.py --config-file "path/to/config/file.yaml" \ 56 | --transfer --no-head --use-tfboard 57 | ``` 58 | 59 | ### Inference 60 | 61 | To do inference on multiple GPUs, you should run the following command. Note that 62 | our code first trys to load the `last_checkpoint` in the `OUTPUT_DIR`. If there 63 | is no such file in `OUTPUT_DIR`, it will then load the model from the 64 | path specified in `MODEL.WEIGHT`. To use `MODEL.WEIGHT` to do the inference, 65 | you need to ensure that there is no `last_checkpoint` in `OUTPUT_DIR`. 66 | You can download the model weights from [MODEL_ZOO.md](MODEL_ZOO.md#ava-models). 
67 | 68 | ```shell 69 | python -m torch.distributed.launch --nproc_per_node=GPU_NUM \ 70 | test_net.py --config-file "path/to/config/file.yaml" \ 71 | MODEL.WEIGHT "path/to/model/weight" 72 | ``` -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | **Requirements** 4 | 5 | - Python >= 3.5 6 | - [Pytorch](https://pytorch.org/) == 1.4.0 (other versions are not tested) 7 | - [PyAV](https://github.com/mikeboers/PyAV) >= 6.2.0 8 | - [yacs](https://github.com/rbgirshick/yacs) 9 | - [OpenCV](https://opencv.org/) 10 | - [tensorboardX](https://github.com/lanpa/tensorboardX) 11 | - [tqdm](https://github.com/tqdm/tqdm) 12 | - [FFmpeg](https://www.ffmpeg.org/) 13 | - [Cython](https://cython.org/), [cython_bbox](https://github.com/samson-wang/cython_bbox), [SciPy](https://scipy.org/scipylib/), [matplotlib](https://matplotlib.org/), [easydict](https://github.com/makinacorpus/easydict) (for running demo) 14 | - Linux + Nvidia GPUs 15 | 16 | We recommend to setup the environment with Anaconda, 17 | the step-by-step installation script is shown below. 18 | 19 | ```bash 20 | conda create -n alphaction python=3.7 21 | conda activate alphaction 22 | 23 | # install pytorch with the same cuda version as in your environment 24 | cuda_version=$(nvcc --version | grep -oP '(?<=release )[\d\.]*?(?=,)') 25 | conda install pytorch=1.4.0 torchvision cudatoolkit=$cuda_version -c pytorch 26 | # you should check manually if you successfully install pytorch here, there may be no such package for some cuda versions. 27 | 28 | conda install av -c conda-forge 29 | conda install cython 30 | 31 | git clone https://github.com/MVIG-SJTU/AlphAction.git 32 | cd AlphAction 33 | pip install -e . # Other dependicies will be installed here 34 | 35 | ``` 36 | -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | ## AlphAction Model Zoo 2 | 3 | ### Pre-trained Models 4 | 5 | We provide backbone models pre-trained on Kinetics dataset, used for further 6 | fine-tuning on AVA dataset. The reported accuracy are obtained by 30-view testing. 
7 | 8 | | backbone | pre-train | frame length | sample rate | top-1 | top-5 | model | 9 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 10 | | SlowFast-R50 | Kinetics-700 | 4 | 16 | 66.34 | 86.66 | [[link]](https://drive.google.com/file/d/1hqFuhD1p0lMpl3Yi5paIGY-hlPTVYgyi/view?usp=sharing) | 11 | | SlowFast-R101 | Kinetics-700 | 8 | 8 | 69.32 | 88.84 | [[link]](https://drive.google.com/file/d/1JDQLyyL-GFd3qi0S31Mdt5oNmUXnyJza/view?usp=sharing) | 12 | 13 | ### AVA Models 14 | 15 | | config | backbone | IA structure | mAP | in paper | model | 16 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 17 | | [resnet50_4x16f_baseline](config_files/resnet50_4x16f_baseline.yaml) | SlowFast-R50-4x16 | w/o | 26.7 | 26.5 | [[link]](https://drive.google.com/file/d/1HmFVEe_wsOP9WUNdU_W7PkgWsHzZtgDf/view?usp=sharing) | 18 | | [resnet50_4x16f_parallel](config_files/resnet50_4x16f_parallel.yaml) | SlowFast-R50-4x16 | Parallel | 29.0 | 28.9 | [[link]](https://drive.google.com/file/d/1CdgwZk6HQGryBssVhE7E48HZ3CCubBg0/view?usp=sharing) | 19 | | [resnet50_4x16f_serial](config_files/resnet50_4x16f_serial.yaml) | SlowFast-R50-4x16 | Serial | 29.8 | 29.6 | [[link]](https://drive.google.com/file/d/1RTUi_ARCtar1r-u7UaTxCaLyJFnVe9Ro/view?usp=sharing) | 20 | | [resnet50_4x16f_denseserial](config_files/resnet50_4x16f_denseserial.yaml) | SlowFast-R50-4x16 | Dense Serial | 30.0 | 29.8 | [[link]](https://drive.google.com/file/d/1bYxGyf6kptfUBNAHtFcG7x4Ryp7mcWxH/view?usp=sharing) | 21 | | [resnet101_8x8f_baseline](config_files/resnet101_8x8f_baseline.yaml) | SlowFast-R101-8x8 | w/o | 29.3 | 29.3 | [[link]](https://drive.google.com/file/d/1oVGRV82iIaxm7XJqAXw7AoDTxFtdqvfv/view?usp=sharing) | 22 | | [resnet101_8x8f_denseserial](config_files/resnet101_8x8f_denseserial.yaml) | SlowFast-R101-8x8 | Dense Serial | 32.4 | 32.3 | [[link]](https://drive.google.com/file/d/1yqqc2_X6Ywi165PIuq68NdTs2WwMygHh/view?usp=sharing) | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AlphAction 2 | 3 | AlphAction aims to detect the actions of multiple persons in videos. It is 4 | **the first open-source project that achieves 30+ mAP (32.4 mAP) with single 5 | model on AVA dataset.** 6 | 7 | This project is the official implementation of paper 8 | [Asynchronous Interaction Aggregation for Action Detection](https://arxiv.org/abs/2004.07485) (**ECCV 2020**), authored 9 | by Jiajun Tang*, Jin Xia* (equal contribution), Xinzhi Mu, [Bo Pang](https://bopang1996.github.io/), 10 | [Cewu Lu](http://mvig.sjtu.edu.cn/) (corresponding author). 11 | 12 |
13 | *Demo GIFs: demo1, demo2, demo3.*
21 | 22 | ## Demo Video 23 | 24 | [![AlphAction demo video](https://user-images.githubusercontent.com/22748802/94115680-a83a1500-fe7c-11ea-878c-536db277fba7.jpg)](https://www.youtube.com/watch?v=TdGmbOJ9hoE "AlphAction demo video") 25 | [[YouTube]](https://www.youtube.com/watch?v=TdGmbOJ9hoE) [[BiliBili]](https://www.bilibili.com/video/BV14A411J7Xv) 26 | 27 | ## Installation 28 | 29 | You need first to install this project, please check [INSTALL.md](INSTALL.md) 30 | 31 | ## Data Preparation 32 | 33 | To do training or inference on AVA dataset, please check [DATA.md](DATA.md) 34 | for data preparation instructions. If you have difficulty accessing Google Drive, you can instead find most files (including models) on Baidu NetDisk([[link]](https://pan.baidu.com/s/1MmYiZ4Vyeznke5_3L4WjYw), code: `smti`). 35 | 36 | ## Model Zoo 37 | 38 | Please see [MODEL_ZOO.md](MODEL_ZOO.md) for downloading models. 39 | 40 | ## Training and Inference 41 | 42 | To do training or inference with AlphAction, please refer to [GETTING_STARTED.md](GETTING_STARTED.md). 43 | 44 | ## Demo Program 45 | 46 | To run the demo program on video or webcam, please check the folder [demo](demo). 47 | We select 15 common categories from the 80 action categories of AVA, and 48 | provide a practical model which achieves high accuracy (about 70 mAP) on these categories. 49 | 50 | ## Acknowledgement 51 | We thankfully acknowledge the computing resource support of Huawei Corporation 52 | for this project. 53 | 54 | ## Citation 55 | 56 | If this project helps you in your research or project, please cite 57 | this paper: 58 | 59 | ``` 60 | @inproceedings{tang2020asynchronous, 61 | title={Asynchronous Interaction Aggregation for Action Detection}, 62 | author={Tang, Jiajun and Xia, Jin and Mu, Xinzhi and Pang, Bo and Lu, Cewu}, 63 | booktitle={Proceedings of the European conference on computer vision (ECCV)}, 64 | year={2020} 65 | } 66 | ``` 67 | -------------------------------------------------------------------------------- /alphaction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/__init__.py -------------------------------------------------------------------------------- /alphaction/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .defaults import _C as cfg 2 | -------------------------------------------------------------------------------- /alphaction/config/paths_catalog.py: -------------------------------------------------------------------------------- 1 | """Centralized catalog of paths.""" 2 | 3 | import os 4 | 5 | 6 | class DatasetCatalog(object): 7 | DATA_DIR = "data" 8 | DATASETS = { 9 | "ava_video_train_v2.2": { 10 | "video_root": "AVA/clips/trainval", 11 | "ann_file": "AVA/annotations/ava_train_v2.2_min.json", 12 | "box_file": "", 13 | "eval_file_paths": { 14 | "csv_gt_file": "AVA/annotations/ava_train_v2.2.csv", 15 | "labelmap_file": "AVA/annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt", 16 | "exclusion_file": "AVA/annotations/ava_train_excluded_timestamps_v2.2.csv", 17 | }, 18 | "object_file": "AVA/boxes/ava_train_det_object_bbox.json", 19 | }, 20 | "ava_video_val_v2.2": { 21 | "video_root": "AVA/clips/trainval", 22 | "ann_file": "AVA/annotations/ava_val_v2.2_min.json", 23 | "box_file": "AVA/boxes/ava_val_det_person_bbox.json", 24 | "eval_file_paths": { 25 | "csv_gt_file": 
"AVA/annotations/ava_val_v2.2.csv", 26 | "labelmap_file": "AVA/annotations/ava_action_list_v2.2_for_activitynet_2019.pbtxt", 27 | "exclusion_file": "AVA/annotations/ava_val_excluded_timestamps_v2.2.csv", 28 | }, 29 | "object_file": "AVA/boxes/ava_val_det_object_bbox.json", 30 | }, 31 | } 32 | 33 | @staticmethod 34 | def get(name): 35 | if "ava_video" in name: 36 | data_dir = DatasetCatalog.DATA_DIR 37 | attrs = DatasetCatalog.DATASETS[name] 38 | if attrs["box_file"]=="": 39 | box_file = "" 40 | else: 41 | box_file = os.path.join(data_dir, attrs["box_file"]) 42 | args = dict( 43 | video_root=os.path.join(data_dir, attrs["video_root"]), 44 | ann_file=os.path.join(data_dir, attrs["ann_file"]), 45 | box_file=box_file, 46 | eval_file_paths={key: os.path.join(data_dir, attrs["eval_file_paths"][key]) for key in 47 | attrs["eval_file_paths"]}, 48 | object_file=os.path.join(data_dir, attrs["object_file"]), 49 | ) 50 | return dict( 51 | factory="AVAVideoDataset", 52 | args=args 53 | ) 54 | raise RuntimeError("Dataset not available: {}".format(name)) -------------------------------------------------------------------------------- /alphaction/csrc/ROIAlign3d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor ROIAlign3d_forward(const at::Tensor& input, 11 | const at::Tensor& rois, 12 | const float spatial_scale, 13 | const int pooled_height, 14 | const int pooled_width, 15 | const int sampling_ratio) { 16 | if (input.is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIAlign3d_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIAlign3d_backward(const at::Tensor& grad, 27 | const at::Tensor& rois, 28 | const float spatial_scale, 29 | const int pooled_height, 30 | const int pooled_width, 31 | const int batch_size, 32 | const int channels, 33 | const int length, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign3d_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, length, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /alphaction/csrc/ROIPool3d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | std::tuple ROIPool3d_forward(const at::Tensor& input, 11 | const at::Tensor& rois, 12 | const float spatial_scale, 13 | const int pooled_height, 14 | const int pooled_width) { 15 | if (input.is_cuda()) { 16 | #ifdef WITH_CUDA 17 | return ROIPool3d_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 18 | #else 19 | AT_ERROR("Not compiled with GPU support"); 20 | #endif 21 | } 22 | AT_ERROR("Not implemented on the CPU"); 23 | } 24 | 25 | at::Tensor ROIPool3d_backward(const at::Tensor& grad, 26 | const at::Tensor& input, 27 | const at::Tensor& rois, 28 | const at::Tensor& argmax, 29 | const float spatial_scale, 30 | const int 
pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int length, 35 | const int height, 36 | const int width) { 37 | if (grad.is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool3d_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, length, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } -------------------------------------------------------------------------------- /alphaction/csrc/SigmoidFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor SigmoidFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const float gamma, 14 | const float alpha) { 15 | if (logits.is_cuda()) { 16 | #ifdef WITH_CUDA 17 | return SigmoidFocalLoss_forward_cuda(logits, targets, gamma, alpha); 18 | #else 19 | AT_ERROR("Not compiled with GPU support"); 20 | #endif 21 | } 22 | AT_ERROR("Not implemented on the CPU"); 23 | } 24 | 25 | at::Tensor SigmoidFocalLoss_backward( 26 | const at::Tensor& logits, 27 | const at::Tensor& targets, 28 | const at::Tensor& d_losses, 29 | const float gamma, 30 | const float alpha) { 31 | if (logits.is_cuda()) { 32 | #ifdef WITH_CUDA 33 | return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, gamma, alpha); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } -------------------------------------------------------------------------------- /alphaction/csrc/SoftmaxFocalLoss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | std::tuple SoftmaxFocalLoss_forward( 11 | const at::Tensor& logits, 12 | const at::Tensor& targets, 13 | const float gamma, 14 | const float alpha) { 15 | if (logits.is_cuda()) { 16 | #ifdef WITH_CUDA 17 | return SoftmaxFocalLoss_forward_cuda(logits, targets, gamma, alpha); 18 | #else 19 | AT_ERROR("Not compiled with GPU support"); 20 | #endif 21 | } 22 | AT_ERROR("Not implemented on the CPU"); 23 | } 24 | 25 | at::Tensor SoftmaxFocalLoss_backward( 26 | const at::Tensor& logits, 27 | const at::Tensor& targets, 28 | const at::Tensor& P, 29 | const at::Tensor& d_losses, 30 | const float gamma, 31 | const float alpha) { 32 | if (logits.is_cuda()) { 33 | #ifdef WITH_CUDA 34 | return SoftmaxFocalLoss_backward_cuda(logits, targets, P, d_losses, gamma, alpha); 35 | #else 36 | AT_ERROR("Not compiled with GPU support"); 37 | #endif 38 | } 39 | AT_ERROR("Not implemented on the CPU"); 40 | } -------------------------------------------------------------------------------- /alphaction/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | -------------------------------------------------------------------------------- /alphaction/csrc/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor ROIAlign3d_forward_cuda(const at::Tensor& input, 5 | const at::Tensor& rois, 6 | const float spatial_scale, 7 | const int 
pooled_height, 8 | const int pooled_width, 9 | const int sampling_ratio); 10 | 11 | at::Tensor ROIAlign3d_backward_cuda(const at::Tensor& grad, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int batch_size, 17 | const int channels, 18 | const int length, 19 | const int height, 20 | const int width, 21 | const int sampling_ratio); 22 | 23 | std::tuple ROIPool3d_forward_cuda(const at::Tensor& input, 24 | const at::Tensor& rois, 25 | const float spatial_scale, 26 | const int pooled_height, 27 | const int pooled_width); 28 | 29 | at::Tensor ROIPool3d_backward_cuda(const at::Tensor& grad, 30 | const at::Tensor& input, 31 | const at::Tensor& rois, 32 | const at::Tensor& argmax, 33 | const float spatial_scale, 34 | const int pooled_height, 35 | const int pooled_width, 36 | const int batch_size, 37 | const int channels, 38 | const int length, 39 | const int height, 40 | const int width); 41 | 42 | 43 | at::Tensor SigmoidFocalLoss_forward_cuda( 44 | const at::Tensor& logits, 45 | const at::Tensor& targets, 46 | const float gamma, 47 | const float alpha); 48 | 49 | at::Tensor SigmoidFocalLoss_backward_cuda( 50 | const at::Tensor& logits, 51 | const at::Tensor& targets, 52 | const at::Tensor& d_losses, 53 | const float gamma, 54 | const float alpha); 55 | 56 | std::tuple SoftmaxFocalLoss_forward_cuda( 57 | const at::Tensor& logits, 58 | const at::Tensor& targets, 59 | const float gamma, 60 | const float alpha); 61 | 62 | at::Tensor SoftmaxFocalLoss_backward_cuda( 63 | const at::Tensor& logits, 64 | const at::Tensor& targets, 65 | const at::Tensor& P, 66 | const at::Tensor& d_losses, 67 | const float gamma, 68 | const float alpha); -------------------------------------------------------------------------------- /alphaction/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | #include "ROIAlign3d.h" 2 | #include "ROIPool3d.h" 3 | #include "SoftmaxFocalLoss.h" 4 | #include "SigmoidFocalLoss.h" 5 | 6 | 7 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 8 | m.def("roi_align_3d_forward",&ROIAlign3d_forward, "ROIAlign3d_forward"); 9 | m.def("roi_align_3d_backward",&ROIAlign3d_backward, "ROIAlign3d_backward"); 10 | m.def("roi_pool_3d_forward", &ROIPool3d_forward, "ROIPool3d_forward"); 11 | m.def("roi_pool_3d_backward", &ROIPool3d_backward, "ROIPool3d_backward"); 12 | m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); 13 | m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); 14 | m.def("softmax_focalloss_forward", &SoftmaxFocalLoss_forward, "SoftmaxFocalLoss_forward"); 15 | m.def("softmax_focalloss_backward", &SoftmaxFocalLoss_backward, "SoftmaxFocalLoss_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /alphaction/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_data_loader 2 | -------------------------------------------------------------------------------- /alphaction/dataset/collate_batch.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | def batch_different_videos(videos, size_divisible=0): 5 | ''' 6 | :param videos: a list of video tensors 7 | :param size_divisible: output_size(width and height) should be divisble by this param 8 | :return: batched videos as a single tensor 9 | ''' 10 | assert 
isinstance(videos, (tuple, list)) 11 | max_size = tuple(max(s) for s in zip(*[clip.shape for clip in videos])) 12 | 13 | if size_divisible > 0: 14 | stride = size_divisible 15 | max_size = list(max_size) 16 | max_size[2] = int(math.ceil(max_size[2] / stride) * stride) 17 | max_size[3] = int(math.ceil(max_size[3] / stride) * stride) 18 | max_size = tuple(max_size) 19 | 20 | batch_shape = (len(videos),) + max_size 21 | batched_clips = videos[0].new(*batch_shape).zero_() 22 | for clip, pad_clip in zip(videos, batched_clips): 23 | pad_clip[:clip.shape[0], :clip.shape[1], :clip.shape[2], :clip.shape[3]].copy_(clip) 24 | 25 | return batched_clips 26 | 27 | 28 | class BatchCollator(object): 29 | """ 30 | From a list of samples from the dataset, 31 | returns the batched objectimages and targets. 32 | This should be passed to the DataLoader 33 | """ 34 | 35 | def __init__(self, size_divisible=0): 36 | self.divisible = size_divisible 37 | self.size_divisible = self.divisible 38 | 39 | def __call__(self, batch): 40 | transposed_batch = list(zip(*batch)) 41 | slow_clips = batch_different_videos(transposed_batch[0], self.size_divisible) 42 | fast_clips = batch_different_videos(transposed_batch[1], self.size_divisible) 43 | boxes = transposed_batch[2] 44 | objects = transposed_batch[3] 45 | extras = transposed_batch[4] 46 | clip_ids = transposed_batch[5] 47 | return slow_clips, fast_clips, boxes, objects, extras, clip_ids 48 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .concat_dataset import ConcatDataset 2 | from .ava import AVAVideoDataset 3 | 4 | __all__ = ["ConcatDataset", "AVAVideoDataset"] -------------------------------------------------------------------------------- /alphaction/dataset/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | 3 | from torch.utils.data.dataset import ConcatDataset as _ConcatDataset 4 | 5 | 6 | class ConcatDataset(_ConcatDataset): 7 | """ 8 | Same as torch.utils.dataset.dataset.ConcatDataset, but exposes an extra 9 | method for querying the sizes of the image 10 | """ 11 | 12 | def get_idxs(self, idx): 13 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 14 | if dataset_idx == 0: 15 | sample_idx = idx 16 | else: 17 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 18 | return dataset_idx, sample_idx 19 | 20 | def get_video_info(self, idx): 21 | dataset_idx, sample_idx = self.get_idxs(idx) 22 | return self.datasets[dataset_idx].get_video_info(sample_idx) 23 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from alphaction.dataset import datasets 2 | 3 | from .ava import ava_evaluation 4 | 5 | 6 | def evaluate(dataset, predictions, output_folder, **kwargs): 7 | """evaluate dataset using different methods based on dataset type. 8 | Args: 9 | dataset: Dataset object 10 | predictions(list[BoxList]): each item in the list represents the 11 | prediction results for one image. 12 | output_folder: output folder, to save evaluation files or results. 13 | **kwargs: other args. 
14 | Returns: 15 | evaluation result 16 | """ 17 | args = dict( 18 | dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs 19 | ) 20 | if isinstance(dataset, datasets.AVAVideoDataset): 21 | return ava_evaluation(**args) 22 | else: 23 | dataset_name = dataset.__class__.__name__ 24 | raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) 25 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/README.md: -------------------------------------------------------------------------------- 1 | The evaluation code of AVA is modified from [https://github.com/activitynet/ActivityNet](https://github.com/activitynet/ActivityNet). -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .ava_eval import do_ava_evaluation 3 | 4 | 5 | def ava_evaluation(dataset, predictions, output_folder, **_): 6 | logger = logging.getLogger("alphaction.inference") 7 | logger.info("performing ava evaluation.") 8 | return do_ava_evaluation( 9 | dataset=dataset, 10 | predictions=predictions, 11 | output_folder=output_folder, 12 | logger=logger, 13 | ) 14 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/__init__.py -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | import numpy as np 19 | 20 | 21 | class BoxList(object): 22 | """Box collection. 23 | 24 | BoxList represents a list of bounding boxes as numpy array, where each 25 | bounding box is represented as a row of 4 numbers, 26 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 27 | given list correspond to a single image. 28 | 29 | Optionally, users can add additional related fields (such as 30 | objectness/classification scores). 31 | """ 32 | 33 | def __init__(self, data): 34 | """Constructs box collection. 
35 | 36 | Args: 37 | data: a numpy array of shape [N, 4] representing box coordinates 38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | """ 43 | if not isinstance(data, np.ndarray): 44 | raise ValueError('dataset must be a numpy array.') 45 | if len(data.shape) != 2 or data.shape[1] != 4: 46 | raise ValueError('Invalid dimensions for box dataset.') 47 | if data.dtype != np.float32 and data.dtype != np.float64: 48 | raise ValueError('Invalid dataset type for box dataset: float is required.') 49 | if not self._is_valid_boxes(data): 50 | raise ValueError('Invalid box dataset. dataset must be a numpy array of ' 51 | 'N*[y_min, x_min, y_max, x_max]') 52 | self.data = {'boxes': data} 53 | 54 | def num_boxes(self): 55 | """Return number of boxes held in collections.""" 56 | return self.data['boxes'].shape[0] 57 | 58 | def get_extra_fields(self): 59 | """Return all non-box fields.""" 60 | return [k for k in self.data.keys() if k != 'boxes'] 61 | 62 | def has_field(self, field): 63 | return field in self.data 64 | 65 | def add_field(self, field, field_data): 66 | """Add dataset to a specified field. 67 | 68 | Args: 69 | field: a string parameter used to speficy a related field to be accessed. 70 | field_data: a numpy array of [N, ...] representing the dataset associated 71 | with the field. 72 | Raises: 73 | ValueError: if the field is already exist or the dimension of the field 74 | dataset does not matches the number of boxes. 75 | """ 76 | if self.has_field(field): 77 | raise ValueError('Field ' + field + 'already exists') 78 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 79 | raise ValueError('Invalid dimensions for field dataset') 80 | self.data[field] = field_data 81 | 82 | def get(self): 83 | """Convenience function for accesssing box coordinates. 84 | 85 | Returns: 86 | a numpy array of shape [N, 4] representing box corners 87 | """ 88 | return self.get_field('boxes') 89 | 90 | def get_field(self, field): 91 | """Accesses dataset associated with the specified field in the box collection. 92 | 93 | Args: 94 | field: a string parameter used to speficy a related field to be accessed. 95 | 96 | Returns: 97 | a numpy 1-d array representing dataset of an associated field 98 | 99 | Raises: 100 | ValueError: if invalid field 101 | """ 102 | if not self.has_field(field): 103 | raise ValueError('field {} does not exist'.format(field)) 104 | return self.data[field] 105 | 106 | def get_coordinates(self): 107 | """Get corner coordinates of boxes. 108 | 109 | Returns: 110 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 111 | """ 112 | box_coordinates = self.get() 113 | y_min = box_coordinates[:, 0] 114 | x_min = box_coordinates[:, 1] 115 | y_max = box_coordinates[:, 2] 116 | x_max = box_coordinates[:, 3] 117 | return [y_min, x_min, y_max, x_max] 118 | 119 | def _is_valid_boxes(self, data): 120 | """Check whether dataset fullfills the format of N*[ymin, xmin, ymax, xmin]. 121 | 122 | Args: 123 | data: a numpy array of shape [N, 4] representing box coordinates 124 | 125 | Returns: 126 | a boolean indicating whether all ymax of boxes are equal or greater than 127 | ymin, and all xmax of boxes are equal or greater than xmin. 
128 | """ 129 | if data.shape[0] > 0: 130 | for i in range(data.shape[0]): 131 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 132 | return False 133 | return True 134 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | import numpy as np 19 | from . import np_box_list 20 | 21 | 22 | class BoxMaskList(np_box_list.BoxList): 23 | """Convenience wrapper for BoxList with masks. 24 | 25 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 26 | In particular, its constructor receives both boxes and masks. Note that the 27 | masks correspond to the full image. 28 | """ 29 | 30 | def __init__(self, box_data, mask_data): 31 | """Constructs box collection. 32 | 33 | Args: 34 | box_data: a numpy array of shape [N, 4] representing box coordinates 35 | mask_data: a numpy array of shape [N, height, width] representing masks 36 | with values are in {0,1}. The masks correspond to the full 37 | image. The height and the width will be equal to image height and width. 38 | 39 | Raises: 40 | ValueError: if bbox dataset is not a numpy array 41 | ValueError: if invalid dimensions for bbox dataset 42 | ValueError: if mask dataset is not a numpy array 43 | ValueError: if invalid dimension for mask dataset 44 | """ 45 | super(BoxMaskList, self).__init__(box_data) 46 | if not isinstance(mask_data, np.ndarray): 47 | raise ValueError('Mask dataset must be a numpy array.') 48 | if len(mask_data.shape) != 3: 49 | raise ValueError('Invalid dimensions for mask dataset.') 50 | if mask_data.dtype != np.uint8: 51 | raise ValueError('Invalid dataset type for mask dataset: uint8 is required.') 52 | if mask_data.shape[0] != box_data.shape[0]: 53 | raise ValueError('There should be the same number of boxes and masks.') 54 | self.data['masks'] = mask_data 55 | 56 | def get_masks(self): 57 | """Convenience function for accessing masks. 58 | 59 | Returns: 60 | a numpy array of shape [N, height, width] representing masks 61 | """ 62 | return self.get_field('masks') 63 | 64 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | 25 | def area(boxes): 26 | """Computes area of boxes. 27 | 28 | Args: 29 | boxes: Numpy array with shape [N, 4] holding N boxes 30 | 31 | Returns: 32 | a numpy array with shape [N*1] representing box areas 33 | """ 34 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 35 | 36 | 37 | def intersection(boxes1, boxes2): 38 | """Compute pairwise intersection areas between boxes. 39 | 40 | Args: 41 | boxes1: a numpy array with shape [N, 4] holding N boxes 42 | boxes2: a numpy array with shape [M, 4] holding M boxes 43 | 44 | Returns: 45 | a numpy array with shape [N*M] representing pairwise intersection area 46 | """ 47 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 48 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 49 | 50 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 51 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 52 | intersect_heights = np.maximum( 53 | np.zeros(all_pairs_max_ymin.shape), 54 | all_pairs_min_ymax - all_pairs_max_ymin) 55 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 56 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 57 | intersect_widths = np.maximum( 58 | np.zeros(all_pairs_max_xmin.shape), 59 | all_pairs_min_xmax - all_pairs_max_xmin) 60 | return intersect_heights * intersect_widths 61 | 62 | 63 | def iou(boxes1, boxes2): 64 | """Computes pairwise intersection-over-union between box collections. 65 | 66 | Args: 67 | boxes1: a numpy array with shape [N, 4] holding N boxes. 68 | boxes2: a numpy array with shape [M, 4] holding N boxes. 69 | 70 | Returns: 71 | a numpy array with shape [N, M] representing pairwise iou scores. 72 | """ 73 | intersect = intersection(boxes1, boxes2) 74 | area1 = area(boxes1) 75 | area2 = area(boxes2) 76 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 77 | area2, axis=0) - intersect 78 | return intersect / union 79 | 80 | 81 | def ioa(boxes1, boxes2): 82 | """Computes pairwise intersection-over-area between box collections. 83 | 84 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 85 | their intersection area over box2's area. Note that ioa is not symmetric, 86 | that is, IOA(box1, box2) != IOA(box2, box1). 87 | 88 | Args: 89 | boxes1: a numpy array with shape [N, 4] holding N boxes. 90 | boxes2: a numpy array with shape [M, 4] holding N boxes. 91 | 92 | Returns: 93 | a numpy array with shape [N, M] representing pairwise ioa scores. 
94 | """ 95 | intersect = intersection(boxes1, boxes2) 96 | areas = np.expand_dims(area(boxes2), axis=0) 97 | return intersect / areas 98 | -------------------------------------------------------------------------------- /alphaction/dataset/datasets/evaluation/ava/pascal_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | import numpy as np 23 | 24 | EPSILON = 1e-7 25 | 26 | 27 | def area(masks): 28 | """Computes area of masks. 29 | 30 | Args: 31 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 32 | values are of type np.uint8 and values are in {0,1}. 33 | 34 | Returns: 35 | a numpy array with shape [N*1] representing mask areas. 36 | 37 | Raises: 38 | ValueError: If masks.dtype is not np.uint8 39 | """ 40 | if masks.dtype != np.uint8: 41 | raise ValueError('Masks type should be np.uint8') 42 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 43 | 44 | 45 | def intersection(masks1, masks2): 46 | """Compute pairwise intersection areas between masks. 47 | 48 | Args: 49 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 50 | values are of type np.uint8 and values are in {0,1}. 51 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 52 | values are of type np.uint8 and values are in {0,1}. 53 | 54 | Returns: 55 | a numpy array with shape [N*M] representing pairwise intersection area. 56 | 57 | Raises: 58 | ValueError: If masks1 and masks2 are not of type np.uint8. 59 | """ 60 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 61 | raise ValueError('masks1 and masks2 should be of type np.uint8') 62 | n = masks1.shape[0] 63 | m = masks2.shape[0] 64 | answer = np.zeros([n, m], dtype=np.float32) 65 | for i in np.arange(n): 66 | for j in np.arange(m): 67 | answer[i, j] = np.sum(np.minimum(masks1[i], masks2[j]), dtype=np.float32) 68 | return answer 69 | 70 | 71 | def iou(masks1, masks2): 72 | """Computes pairwise intersection-over-union between mask collections. 73 | 74 | Args: 75 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 76 | values are of type np.uint8 and values are in {0,1}. 77 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 78 | values are of type np.uint8 and values are in {0,1}. 79 | 80 | Returns: 81 | a numpy array with shape [N, M] representing pairwise iou scores. 82 | 83 | Raises: 84 | ValueError: If masks1 and masks2 are not of type np.uint8. 
85 | """ 86 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 87 | raise ValueError('masks1 and masks2 should be of type np.uint8') 88 | intersect = intersection(masks1, masks2) 89 | area1 = area(masks1) 90 | area2 = area(masks2) 91 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 92 | area2, axis=0) - intersect 93 | return intersect / np.maximum(union, EPSILON) 94 | 95 | 96 | def ioa(masks1, masks2): 97 | """Computes pairwise intersection-over-area between box collections. 98 | 99 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 100 | their intersection area over mask2's area. Note that ioa is not symmetric, 101 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 102 | 103 | Args: 104 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 105 | values are of type np.uint8 and values are in {0,1}. 106 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 107 | values are of type np.uint8 and values are in {0,1}. 108 | 109 | Returns: 110 | a numpy array with shape [N, M] representing pairwise ioa scores. 111 | 112 | Raises: 113 | ValueError: If masks1 and masks2 are not of type np.uint8. 114 | """ 115 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 116 | raise ValueError('masks1 and masks2 should be of type np.uint8') 117 | intersect = intersection(masks1, masks2) 118 | areas = np.expand_dims(area(masks2), axis=0) 119 | return intersect / (areas + EPSILON) 120 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 4 | 5 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 6 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Code is copy-pasted exactly as in torch.utils.dataset.distributed. 2 | # FIXME remove this once c10d fixes the bug it has 3 | import math 4 | import torch 5 | import torch.distributed as dist 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class DistributedSampler(Sampler): 10 | """Sampler that restricts dataset loading to a subset of the dataset. 11 | It is especially useful in conjunction with 12 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 13 | process can pass a DistributedSampler instance as a DataLoader sampler, 14 | and load a subset of the original dataset that is exclusive to it. 15 | .. note:: 16 | Dataset is assumed to be of constant size. 17 | Arguments: 18 | dataset: Dataset used for sampling. 19 | num_replicas (optional): Number of processes participating in 20 | distributed training. 21 | rank (optional): Rank of the current process within num_replicas. 
22 | """ 23 | 24 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 25 | if num_replicas is None: 26 | if not dist.is_available(): 27 | raise RuntimeError("Requires distributed package to be available") 28 | num_replicas = dist.get_world_size() 29 | if rank is None: 30 | if not dist.is_available(): 31 | raise RuntimeError("Requires distributed package to be available") 32 | rank = dist.get_rank() 33 | self.dataset = dataset 34 | self.num_replicas = num_replicas 35 | self.rank = rank 36 | self.epoch = 0 37 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 38 | self.total_size = self.num_samples * self.num_replicas 39 | self.shuffle = shuffle 40 | 41 | def __iter__(self): 42 | if self.shuffle: 43 | # deterministically shuffle based on epoch 44 | g = torch.Generator() 45 | g.manual_seed(self.epoch) 46 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 47 | else: 48 | indices = torch.arange(len(self.dataset)).tolist() 49 | 50 | # add extra samples to make it evenly divisible 51 | indices += indices[: (self.total_size - len(indices))] 52 | assert len(indices) == self.total_size 53 | 54 | # subsample 55 | offset = self.num_samples * self.rank 56 | indices = indices[offset : offset + self.num_samples] 57 | assert len(indices) == self.num_samples 58 | 59 | return iter(indices) 60 | 61 | def __len__(self): 62 | return self.num_samples 63 | 64 | def set_epoch(self, epoch): 65 | self.epoch = epoch 66 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/grouped_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Modified based on https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py 2 | import itertools 3 | 4 | import torch 5 | from torch.utils.data.sampler import BatchSampler 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | class GroupedBatchSampler(BatchSampler): 10 | """ 11 | Wraps another sampler to yield a mini-batch of indices. 12 | It enforces that elements from the same group should appear in groups of batch_size. 13 | It also tries to provide mini-batches which follows an ordering which is 14 | as close as possible to the ordering from the original sampler. 15 | 16 | Arguments: 17 | sampler (Sampler): Base sampler. 18 | batch_size (int): Size of mini-batch. 19 | drop_uneven (bool): If ``True``, the sampler will drop the batches whose 20 | size is less than ``batch_size`` 21 | 22 | """ 23 | 24 | def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): 25 | if not isinstance(sampler, Sampler): 26 | raise ValueError( 27 | "sampler should be an instance of " 28 | "torch.utils.dataset.Sampler, but got sampler={}".format(sampler) 29 | ) 30 | self.sampler = sampler 31 | self.group_ids = torch.as_tensor(group_ids) 32 | assert self.group_ids.dim() == 1 33 | self.batch_size = batch_size 34 | self.drop_uneven = drop_uneven 35 | 36 | self.groups = torch.unique(self.group_ids).sort(0)[0] 37 | 38 | def _prepare_batches(self): 39 | dataset_size = len(self.group_ids) 40 | # get the sampled indices from the sampler 41 | sampled_ids = torch.as_tensor(list(self.sampler)) 42 | # potentially not all elements of the dataset were sampled 43 | # by the sampler (e.g., DistributedSampler). 
44 | # construct a tensor which contains -1 if the element was 45 | # not sampled, and a non-negative number indicating the 46 | # order where the element was sampled. 47 | # for example. if sampled_ids = [3, 1] and dataset_size = 5, 48 | # the order is [-1, 1, -1, 0, -1] 49 | order = torch.full((dataset_size,), -1, dtype=torch.int64) 50 | order[sampled_ids] = torch.arange(len(sampled_ids)) 51 | 52 | # get a mask with the elements that were sampled 53 | mask = order >= 0 54 | 55 | # find the elements that belong to each individual cluster 56 | clusters = [(self.group_ids == i) & mask for i in self.groups] 57 | # get relative order of the elements inside each cluster 58 | # that follows the order from the sampler 59 | relative_order = [order[cluster] for cluster in clusters] 60 | # with the relative order, find the absolute order in the 61 | # sampled space 62 | permutation_ids = [s[s.sort()[1]] for s in relative_order] 63 | # permute each cluster so that they follow the order from 64 | # the sampler 65 | permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] 66 | 67 | # splits each cluster in batch_size, and merge as a list of tensors 68 | splits = [c.split(self.batch_size) for c in permuted_clusters] 69 | merged = tuple(itertools.chain.from_iterable(splits)) 70 | 71 | # now each batch internally has the right order, but 72 | # they are grouped by clusters. Find the permutation between 73 | # different batches that brings them as close as possible to 74 | # the order that we have in the sampler. For that, we will consider the 75 | # ordering as coming from the first element of each batch, and sort 76 | # correspondingly 77 | first_element_of_batch = [t[0].item() for t in merged] 78 | # get and inverse mapping from sampled indices and the position where 79 | # they occur (as returned by the sampler) 80 | inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} 81 | # from the first element in each batch, get a relative ordering 82 | first_index_of_batch = torch.as_tensor( 83 | [inv_sampled_ids_map[s] for s in first_element_of_batch] 84 | ) 85 | 86 | # permute the batches so that they approximately follow the order 87 | # from the sampler 88 | permutation_order = first_index_of_batch.sort(0)[1].tolist() 89 | # finally, permute the batches 90 | batches = [merged[i].tolist() for i in permutation_order] 91 | 92 | if self.drop_uneven: 93 | kept = [] 94 | for batch in batches: 95 | if len(batch) == self.batch_size: 96 | kept.append(batch) 97 | batches = kept 98 | return batches 99 | 100 | def __iter__(self): 101 | batches = self._prepare_batches() 102 | self._batches = batches 103 | return iter(batches) 104 | 105 | def __len__(self): 106 | if not hasattr(self, "_batches"): 107 | self._batches = self._prepare_batches() 108 | return len(self._batches) 109 | -------------------------------------------------------------------------------- /alphaction/dataset/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py 2 | from torch.utils.data.sampler import BatchSampler 3 | 4 | 5 | class IterationBasedBatchSampler(BatchSampler): 6 | """ 7 | Wraps a BatchSampler, resampling from it until 8 | a specified number of iterations have been sampled 9 | """ 10 | 11 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 12 | self.batch_sampler = batch_sampler 
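A small sanity check of the `GroupedBatchSampler` completed above (again assuming the `alphaction` package is importable): every emitted batch stays within a single group, while the batch order still tracks the base sampler as closely as possible.

```python
from torch.utils.data.sampler import SequentialSampler
from alphaction.dataset.samplers import GroupedBatchSampler

group_ids = [0, 1, 0, 1, 0, 1, 0, 1]       # e.g. two aspect-ratio buckets
sampler = GroupedBatchSampler(SequentialSampler(range(8)), group_ids, batch_size=2)
print(list(sampler))
# -> [[0, 2], [1, 3], [4, 6], [5, 7]]
# each batch draws from a single group, and the batches are re-ordered so that
# their first elements follow the base sampler's order as closely as possible.
```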
13 | self.num_iterations = num_iterations 14 | self.start_iter = start_iter 15 | 16 | def __iter__(self): 17 | iteration = self.start_iter 18 | while iteration <= self.num_iterations: 19 | # if the underlying sampler has a set_epoch method, like 20 | # DistributedSampler, used for making each process see 21 | # a different split of the dataset, then set it 22 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 23 | self.batch_sampler.sampler.set_epoch(iteration) 24 | for batch in self.batch_sampler: 25 | iteration += 1 26 | if iteration > self.num_iterations: 27 | break 28 | yield batch 29 | 30 | def __len__(self): 31 | return self.num_iterations 32 | -------------------------------------------------------------------------------- /alphaction/dataset/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import build_transforms, build_object_transforms -------------------------------------------------------------------------------- /alphaction/dataset/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . import video_transforms as T 2 | from . import object_transforms as OT 3 | 4 | 5 | def build_transforms(cfg, is_train=True): 6 | # build transforms for training of testing 7 | if is_train: 8 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 9 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 10 | color_jitter = cfg.INPUT.COLOR_JITTER 11 | flip_prob = 0.5 12 | slow_jitter = cfg.INPUT.SLOW_JITTER 13 | else: 14 | min_size = cfg.INPUT.MIN_SIZE_TEST 15 | max_size = cfg.INPUT.MAX_SIZE_TEST 16 | color_jitter = False 17 | flip_prob = 0 18 | slow_jitter = False 19 | 20 | frame_num = cfg.INPUT.FRAME_NUM 21 | sample_rate = cfg.INPUT.FRAME_SAMPLE_RATE 22 | 23 | if color_jitter: 24 | color_transform = T.ColorJitter( 25 | cfg.INPUT.HUE_JITTER, cfg.INPUT.SAT_JITTER, cfg.INPUT.VAL_JITTER 26 | ) 27 | else: 28 | color_transform = T.Identity() 29 | 30 | to_bgr = cfg.INPUT.TO_BGR 31 | normalize_transform = T.Normalize( 32 | mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr=to_bgr 33 | ) 34 | 35 | tau = cfg.INPUT.TAU 36 | alpha = cfg.INPUT.ALPHA 37 | 38 | transform = T.Compose( 39 | [ 40 | T.TemporalCrop(frame_num, sample_rate), 41 | T.Resize(min_size, max_size), 42 | color_transform, 43 | T.RandomHorizontalFlip(flip_prob), 44 | T.ToTensor(), 45 | normalize_transform, 46 | T.SlowFastCrop(tau, alpha, slow_jitter), 47 | ] 48 | ) 49 | 50 | return transform 51 | 52 | 53 | def build_object_transforms(cfg, is_train=True): 54 | # build transforms for object boxes, should be kept consistent with video transforms. 
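Looking back at the `IterationBasedBatchSampler` completed above: it simply re-iterates the wrapped `BatchSampler` until the requested number of batches has been produced, so training length is controlled by an iteration count rather than by epochs. A minimal check, assuming the package is importable:

```python
from torch.utils.data.sampler import SequentialSampler, BatchSampler
from alphaction.dataset.samplers import IterationBasedBatchSampler

base = BatchSampler(SequentialSampler(range(6)), batch_size=2, drop_last=False)  # 3 batches per pass
it_sampler = IterationBasedBatchSampler(base, num_iterations=5)
print(list(it_sampler))
# -> [[0, 1], [2, 3], [4, 5], [0, 1], [2, 3]]
# the wrapped sampler is cycled until exactly num_iterations batches have been yielded.
```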
55 | if is_train: 56 | flip_prob = 0.5 57 | else: 58 | flip_prob = 0 59 | 60 | transform = OT.Compose([ 61 | OT.PickTop(cfg.MODEL.IA_STRUCTURE.MAX_OBJECT), 62 | OT.Resize(), 63 | OT.RandomHorizontalFlip(flip_prob) 64 | ]) 65 | return transform -------------------------------------------------------------------------------- /alphaction/dataset/transforms/object_transforms.py: -------------------------------------------------------------------------------- 1 | 2 | class Compose(object): 3 | # Class used to compose different kinds of object transforms 4 | def __init__(self, transforms): 5 | self.transforms = transforms 6 | 7 | def __call__(self, object_boxes, transform_randoms): 8 | #should reuse the random varaible in video transforms 9 | for t in self.transforms: 10 | object_boxes = t(object_boxes, transform_randoms) 11 | return object_boxes 12 | 13 | 14 | class PickTop(object): 15 | # pick top scored object boxes. 16 | def __init__(self, top_k): 17 | self.top_k = top_k 18 | 19 | def __call__(self, objects, _): 20 | objects = objects.top_k(self.top_k) 21 | return objects 22 | 23 | 24 | class Resize(object): 25 | def __call__(self, object_boxes, transform_randoms): 26 | # resize according to video transforms 27 | size = transform_randoms["Resize"] 28 | if object_boxes is not None: 29 | object_boxes = object_boxes.resize(size) 30 | return object_boxes 31 | 32 | 33 | class RandomHorizontalFlip(object): 34 | def __init__(self, prob=0.5): 35 | self.prob = prob 36 | 37 | def __call__(self, object_boxes, transform_randoms): 38 | # flip according to video transforms 39 | flip_random = transform_randoms["Flip"] 40 | if flip_random < self.prob: 41 | object_boxes.transpose(0) 42 | return object_boxes 43 | -------------------------------------------------------------------------------- /alphaction/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/engine/__init__.py -------------------------------------------------------------------------------- /alphaction/layers/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_align_3d import ROIAlign3d 4 | from .roi_align_3d import roi_align_3d 5 | from .roi_pool_3d import ROIPool3d 6 | from .roi_pool_3d import roi_pool_3d 7 | from .batch_norm import FrozenBatchNorm1d, FrozenBatchNorm2d, FrozenBatchNorm3d 8 | from .sigmoid_focal_loss import SigmoidFocalLoss 9 | from .softmax_focal_loss import SoftmaxFocalLoss 10 | 11 | __all__ = ["roi_align_3d", "ROIAlign3d", "roi_pool_3d", "ROIPool3d", 12 | "SigmoidFocalLoss", "SoftmaxFocalLoss", "FrozenBatchNorm1d", 13 | "FrozenBatchNorm2d", "FrozenBatchNorm3d", 14 | ] 15 | 16 | -------------------------------------------------------------------------------- /alphaction/layers/batch_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class _FrozenBatchNorm(nn.Module): 6 | def __init__(self, num_features, eps=1e-5, affine=True, track_running_stats=True): 7 | super(_FrozenBatchNorm, self).__init__() 8 | self.num_features = num_features 9 | self.eps = eps 10 | self.affine = affine 11 | self.track_running_stats = track_running_stats 12 | if self.affine: 13 | self.register_buffer("weight", torch.Tensor(num_features)) 14 | self.register_buffer("bias", torch.Tensor(num_features)) 15 | else: 16 | 
self.register_buffer("weight", None) 17 | self.register_buffer("bias", None) 18 | if self.track_running_stats: 19 | self.register_buffer('running_mean', torch.zeros(num_features)) 20 | self.register_buffer('running_var', torch.ones(num_features)) 21 | else: 22 | self.register_parameter('running_mean', None) 23 | self.register_parameter('running_var', None) 24 | self.reset_parameters() 25 | 26 | def reset_running_stats(self): 27 | if self.track_running_stats: 28 | self.running_mean.zero_() 29 | self.running_var.fill_(1) 30 | 31 | def reset_parameters(self): 32 | self.reset_running_stats() 33 | if self.affine: 34 | self.weight.data.uniform_() 35 | self.bias.data.zero_() 36 | 37 | def _check_input_dim(self, input): 38 | raise NotImplementedError 39 | 40 | def forward(self, input): 41 | self._check_input_dim(input) 42 | view_shape = (1, self.num_features) + (1,) * (input.dim() - 2) 43 | 44 | if self.track_running_stats: 45 | scale = self.weight / (self.running_var + self.eps).sqrt() 46 | bias = self.bias - self.running_mean * scale 47 | else: 48 | scale = self.weight 49 | bias = self.bias 50 | 51 | return scale.view(*view_shape) * input + bias.view(*view_shape) 52 | 53 | def extra_repr(self): 54 | return '{num_features}, eps={eps}, affine={affine}, ' \ 55 | 'track_running_stats={track_running_stats}'.format(**self.__dict__) 56 | 57 | def _load_from_state_dict(self, state_dict, prefix, metadata, strict, 58 | missing_keys, unexpected_keys, error_msgs): 59 | num_batches_tracked_key = prefix + 'num_batches_tracked' 60 | if num_batches_tracked_key in state_dict: 61 | del state_dict[num_batches_tracked_key] 62 | super(_FrozenBatchNorm, self)._load_from_state_dict( 63 | state_dict, prefix, metadata, strict, 64 | missing_keys, unexpected_keys, error_msgs) 65 | 66 | 67 | class FrozenBatchNorm1d(_FrozenBatchNorm): 68 | def _check_input_dim(self, input): 69 | if input.dim() != 2 and input.dim() != 3: 70 | raise ValueError('expected 2D or 3D input (got {}D input)' 71 | .format(input.dim())) 72 | 73 | 74 | class FrozenBatchNorm2d(_FrozenBatchNorm): 75 | def _check_input_dim(self, input): 76 | if input.dim() != 4: 77 | raise ValueError('expected 4D input (got {}D input)' 78 | .format(input.dim())) 79 | 80 | 81 | class FrozenBatchNorm3d(_FrozenBatchNorm): 82 | def _check_input_dim(self, input): 83 | if input.dim() != 5: 84 | raise ValueError('expected 5D input (got {}D input)' 85 | .format(input.dim())) 86 | -------------------------------------------------------------------------------- /alphaction/layers/roi_align_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | from torch.nn.modules.utils import _pair 6 | 7 | import alphaction._custom_cuda_ext as _C 8 | 9 | 10 | class _ROIAlign3d(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 13 | ctx.save_for_backward(roi) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.input_shape = input.size() 18 | output = _C.roi_align_3d_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 20 | ) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = 
ctx.sampling_ratio 30 | bs, ch, l, h, w = ctx.input_shape 31 | grad_input = _C.roi_align_3d_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | l, 40 | h, 41 | w, 42 | sampling_ratio, 43 | ) 44 | return grad_input, None, None, None, None 45 | 46 | 47 | roi_align_3d = _ROIAlign3d.apply 48 | 49 | 50 | class ROIAlign3d(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio): 52 | super(ROIAlign3d, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | self.sampling_ratio = sampling_ratio 56 | 57 | def forward(self, input, rois): 58 | return roi_align_3d( 59 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 60 | ) 61 | 62 | def __repr__(self): 63 | tmpstr = self.__class__.__name__ + "(" 64 | tmpstr += "output_size=" + str(self.output_size) 65 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 66 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 67 | tmpstr += ")" 68 | return tmpstr 69 | -------------------------------------------------------------------------------- /alphaction/layers/roi_pool_3d.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from torch.autograd import Function 3 | from torch.autograd.function import once_differentiable 4 | from torch.nn.modules.utils import _pair 5 | 6 | import alphaction._custom_cuda_ext as _C 7 | 8 | 9 | class _ROIPool3d(Function): 10 | @staticmethod 11 | def forward(ctx, input, roi, output_size, spatial_scale): 12 | ctx.output_size = _pair(output_size) 13 | ctx.spatial_scale = spatial_scale 14 | ctx.input_shape = input.size() 15 | output, argmax = _C.roi_pool_3d_forward( 16 | input, roi, spatial_scale, output_size[0], output_size[1] 17 | ) 18 | ctx.save_for_backward(input, roi, argmax) 19 | return output 20 | 21 | @staticmethod 22 | @once_differentiable 23 | def backward(ctx, grad_output): 24 | input, rois, argmax = ctx.saved_tensors 25 | output_size = ctx.output_size 26 | spatial_scale = ctx.spatial_scale 27 | bs, ch, l, h, w = ctx.input_shape 28 | grad_input = _C.roi_pool_3d_backward( 29 | grad_output, 30 | input, 31 | rois, 32 | argmax, 33 | spatial_scale, 34 | output_size[0], 35 | output_size[1], 36 | bs, 37 | ch, 38 | l, 39 | h, 40 | w, 41 | ) 42 | return grad_input, None, None, None 43 | 44 | 45 | roi_pool_3d = _ROIPool3d.apply 46 | 47 | 48 | class ROIPool3d(nn.Module): 49 | def __init__(self, output_size, spatial_scale): 50 | super(ROIPool3d, self).__init__() 51 | self.output_size = output_size 52 | self.spatial_scale = spatial_scale 53 | 54 | def forward(self, input, rois): 55 | return roi_pool_3d(input, rois, self.output_size, self.spatial_scale) 56 | 57 | def __repr__(self): 58 | tmpstr = self.__class__.__name__ + "(" 59 | tmpstr += "output_size=" + str(self.output_size) 60 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 61 | tmpstr += ")" 62 | return tmpstr -------------------------------------------------------------------------------- /alphaction/layers/sigmoid_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | 6 | import alphaction._custom_cuda_ext as _C 7 | 8 | 9 | class _SigmoidFocalLoss(Function): 10 | @staticmethod 11 | def forward(ctx, logits, targets, gamma, alpha): 12 | ctx.save_for_backward(logits, 
targets) 13 | ctx.gamma = gamma 14 | ctx.alpha = alpha 15 | 16 | losses = _C.sigmoid_focalloss_forward( 17 | logits, targets, gamma, alpha 18 | ) 19 | return losses 20 | 21 | @staticmethod 22 | @once_differentiable 23 | def backward(ctx, d_loss): 24 | logits, targets = ctx.saved_tensors 25 | gamma = ctx.gamma 26 | alpha = ctx.alpha 27 | d_logits = _C.sigmoid_focalloss_backward( 28 | logits, targets, d_loss, gamma, alpha 29 | ) 30 | return d_logits, None, None, None 31 | 32 | 33 | def sigmoid_focal_loss(logits, targets, gamma, alpha, reduction='mean'): 34 | assert reduction in ["none", "mean", "sum"], "Unsupported reduction type \"{}\"".format(reduction) 35 | logits = logits.float() 36 | targets = targets.float() 37 | 38 | ret = _SigmoidFocalLoss.apply(logits, targets, gamma, alpha) 39 | if reduction != "none": 40 | ret = torch.mean(ret) if reduction == "mean" else torch.sum(ret) 41 | 42 | return ret 43 | 44 | 45 | class SigmoidFocalLoss(nn.Module): 46 | def __init__(self, gamma, alpha, reduction="mean"): 47 | super(SigmoidFocalLoss, self).__init__() 48 | self.gamma = gamma 49 | self.alpha = alpha 50 | self.reduction = reduction 51 | 52 | def forward(self, logits, targets): 53 | loss = sigmoid_focal_loss(logits, targets, self.gamma, self.alpha, self.reduction) 54 | return loss 55 | 56 | def __repr__(self): 57 | tmpstr = self.__class__.__name__ + "(" 58 | tmpstr += "gamma=" + str(self.gamma) 59 | tmpstr += ", alpha=" + str(self.alpha) 60 | tmpstr += ")" 61 | return tmpstr 62 | -------------------------------------------------------------------------------- /alphaction/layers/softmax_focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | 6 | import alphaction._custom_cuda_ext as _C 7 | 8 | 9 | class _SoftmaxFocalLoss(Function): 10 | @staticmethod 11 | def forward(ctx, logits, targets, gamma, alpha): 12 | ctx.gamma = gamma 13 | ctx.alpha = alpha 14 | 15 | losses, P = _C.softmax_focalloss_forward( 16 | logits, targets, gamma, alpha 17 | ) 18 | ctx.save_for_backward(logits, targets, P) 19 | return losses 20 | 21 | @staticmethod 22 | @once_differentiable 23 | def backward(ctx, d_loss): 24 | logits, targets, P = ctx.saved_tensors 25 | gamma = ctx.gamma 26 | alpha = ctx.alpha 27 | d_logits = _C.softmax_focalloss_backward( 28 | logits, targets, P, d_loss, gamma, alpha 29 | ) 30 | return d_logits, None, None, None 31 | 32 | 33 | def softmax_focal_loss(logits, targets, gamma, alpha, reduction='mean'): 34 | assert reduction in ["none", "mean", "sum"], "Unsupported reduction type \"{}\"".format(reduction) 35 | logits = logits.float() 36 | targets = targets.int() 37 | 38 | ret = _SoftmaxFocalLoss.apply(logits, targets, gamma, alpha) 39 | if reduction != "none": 40 | ret = torch.mean(ret) if reduction == "mean" else torch.sum(ret) 41 | 42 | return ret 43 | 44 | 45 | class SoftmaxFocalLoss(nn.Module): 46 | def __init__(self, gamma, alpha, reduction="mean"): 47 | super(SoftmaxFocalLoss, self).__init__() 48 | self.gamma = gamma 49 | self.alpha = alpha 50 | self.reduction = reduction 51 | 52 | def forward(self, logits, targets): 53 | loss = softmax_focal_loss(logits, targets, self.gamma, self.alpha, self.reduction) 54 | return loss 55 | 56 | def __repr__(self): 57 | tmpstr = self.__class__.__name__ + "(" 58 | tmpstr += "gamma=" + str(self.gamma) 59 | tmpstr += ", alpha=" + str(self.alpha) 60 | tmpstr += ")" 61 | return tmpstr 
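Both focal-loss layers above dispatch to the compiled `alphaction._custom_cuda_ext` kernels, so they require the CUDA extension to be built. For reading or CPU-side debugging, the following is a plain-PyTorch sketch of the standard sigmoid focal loss formulation, FL = -alpha_t * (1 - p_t)^gamma * log(p_t); it is a textbook reference, not a bit-exact re-implementation of the kernel (whose exact alpha convention is not visible here).

```python
import torch
import torch.nn.functional as F

def sigmoid_focal_loss_reference(logits, targets, gamma=2.0, alpha=0.25, reduction="mean"):
    """Reference focal loss: FL = -alpha_t * (1 - p_t)**gamma * log(p_t)."""
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)              # probability of the true label
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    loss = alpha_t * (1 - p_t) ** gamma * ce
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss

logits = torch.randn(4, 80)
targets = torch.randint(0, 2, (4, 80)).float()
print(sigmoid_focal_loss_reference(logits, targets))
```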
62 | -------------------------------------------------------------------------------- /alphaction/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/modeling/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone -------------------------------------------------------------------------------- /alphaction/modeling/backbone/backbone.py: -------------------------------------------------------------------------------- 1 | from alphaction.modeling import registry 2 | from . import slowfast, i3d 3 | 4 | @registry.BACKBONES.register("Slowfast-Resnet50") 5 | @registry.BACKBONES.register("Slowfast-Resnet101") 6 | def build_slowfast_resnet_backbone(cfg): 7 | model = slowfast.SlowFast(cfg) 8 | return model 9 | 10 | @registry.BACKBONES.register("I3D-Resnet50") 11 | @registry.BACKBONES.register("I3D-Resnet101") 12 | @registry.BACKBONES.register("I3D-Resnet50-Sparse") 13 | @registry.BACKBONES.register("I3D-Resnet101-Sparse") 14 | def build_i3d_resnet_backbone(cfg): 15 | model = i3d.I3D(cfg) 16 | return model 17 | 18 | def build_backbone(cfg): 19 | assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ 20 | "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( 21 | cfg.MODEL.BACKBONE.CONV_BODY 22 | ) 23 | return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) -------------------------------------------------------------------------------- /alphaction/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | from .action_detector import build_detection_model -------------------------------------------------------------------------------- /alphaction/modeling/detector/action_detector.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from ..backbone import build_backbone 4 | from ..roi_heads.roi_heads_3d import build_3d_roi_heads 5 | 6 | 7 | class ActionDetector(nn.Module): 8 | def __init__(self, cfg): 9 | super(ActionDetector, self).__init__() 10 | self.backbone = build_backbone(cfg) 11 | self.roi_heads = build_3d_roi_heads(cfg, self.backbone.dim_out) 12 | 13 | def forward(self, slow_video, fast_video, boxes, objects=None, extras={}, part_forward=-1): 14 | # part_forward is used to split this model into two parts. 15 | # if part_forward<0, just use it as a single model 16 | # if part_forward=0, use this model to extract pooled feature(person and object, no memory features). 17 | # if part_forward=1, use the ia structure to aggregate interactions and give final result. 
18 | # implemented in roi_heads 19 | 20 | if part_forward==1: 21 | slow_features = fast_features = None 22 | else: 23 | slow_features, fast_features = self.backbone(slow_video, fast_video) 24 | 25 | result, detector_losses, loss_weight, detector_metrics = self.roi_heads(slow_features, fast_features, boxes, objects, extras, part_forward) 26 | 27 | if self.training: 28 | return detector_losses, loss_weight, detector_metrics, result 29 | 30 | return result 31 | 32 | def c2_weight_mapping(self): 33 | if not hasattr(self, "c2_mapping"): 34 | weight_map = {} 35 | for name, m_child in self.named_children(): 36 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 37 | child_map = m_child.c2_weight_mapping() 38 | for key, val in child_map.items(): 39 | new_key = name + '.' + key 40 | weight_map[new_key] = val 41 | self.c2_mapping = weight_map 42 | return self.c2_mapping 43 | 44 | def build_detection_model(cfg): 45 | return ActionDetector(cfg) -------------------------------------------------------------------------------- /alphaction/modeling/nonlocal_block.py: -------------------------------------------------------------------------------- 1 | from __future__ import (absolute_import, division, print_function, 2 | unicode_literals) 3 | 4 | import torch 5 | import torch.nn as nn 6 | from alphaction.layers import FrozenBatchNorm3d 7 | 8 | 9 | class NLBlock(nn.Module): 10 | def __init__(self, dim_in, dim_out, dim_inner, nl_cfg, group=False): 11 | super(NLBlock, self).__init__() 12 | 13 | self.nl_cfg = nl_cfg.clone() 14 | self.group = group 15 | self.group_size = 4 16 | 17 | init_std = nl_cfg.CONV_INIT_STD 18 | bias = not nl_cfg.NO_BIAS 19 | pool_stride = 2 20 | 21 | self.scale_value = dim_inner ** (-0.5) 22 | self.dim_inner = dim_inner 23 | 24 | self.theta = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 25 | nn.init.normal_(self.theta.weight, std=init_std) 26 | if bias: 27 | nn.init.constant_(self.theta.bias, 0) 28 | 29 | if nl_cfg.USE_MAXPOOL: 30 | self.maxpool = nn.MaxPool3d((1, pool_stride, pool_stride), 31 | stride=(1, pool_stride, pool_stride)) 32 | 33 | self.phi = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 34 | nn.init.normal_(self.phi.weight, std=init_std) 35 | if bias: 36 | nn.init.constant_(self.phi.bias, 0) 37 | 38 | self.g = nn.Conv3d(dim_in, dim_inner, 1, bias=bias) 39 | nn.init.normal_(self.g.weight, std=init_std) 40 | if bias: 41 | nn.init.constant_(self.g.bias, 0) 42 | 43 | if nl_cfg.USE_SOFTMAX: 44 | self.softmax = nn.Softmax(dim=2) 45 | 46 | self.out = nn.Conv3d(dim_inner, dim_out, 1, bias=bias) 47 | if nl_cfg.USE_ZERO_INIT_CONV: 48 | nn.init.constant_(self.out.weight, 0) 49 | else: 50 | nn.init.normal_(self.out.weight, std=init_std) 51 | if bias: 52 | nn.init.constant_(self.out.bias, 0) 53 | 54 | if nl_cfg.USE_BN: 55 | if nl_cfg.FROZEN_BN: 56 | self.bn = FrozenBatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON) 57 | else: 58 | self.bn = nn.BatchNorm3d(dim_out, eps=nl_cfg.BN_EPSILON, momentum=nl_cfg.BN_MOMENTUM) 59 | nn.init.constant_(self.bn.weight, nl_cfg.BN_INIT_GAMMA) 60 | 61 | def forward(self, x): 62 | if x.dim() != 5: 63 | raise ValueError('expected 4D or 5D input (got {}D input)' 64 | .format(x.dim())) 65 | 66 | if self.group: 67 | x = x.transpose(1, 2) 68 | sz_before_group = list(x.shape) 69 | sz_after_group = sz_before_group.copy() 70 | sz_after_group[0] = -1 71 | sz_after_group[1] = self.group_size 72 | x = x.contiguous().view(*sz_after_group) 73 | x = x.transpose(1, 2) 74 | 75 | batch_size = x.shape[0] 76 | 77 | theta = self.theta(x) 78 | 79 | if 
self.nl_cfg.USE_MAXPOOL: 80 | max_pool = self.maxpool(x) 81 | else: 82 | max_pool = x 83 | 84 | phi = self.phi(max_pool) 85 | 86 | g = self.g(max_pool) 87 | 88 | org_size = theta.size() 89 | mat_size = [batch_size, self.dim_inner, -1] 90 | theta = theta.view(*mat_size) 91 | phi = phi.view(*mat_size) 92 | g = g.view(*mat_size) 93 | 94 | theta_phi = torch.bmm(theta.transpose(1, 2), phi) 95 | 96 | if self.nl_cfg.USE_SOFTMAX: 97 | if self.nl_cfg.USE_SCALE: 98 | theta_phi_sc = theta_phi * self.scale_value 99 | else: 100 | theta_phi_sc = theta_phi 101 | p = self.softmax(theta_phi_sc) 102 | else: 103 | p = theta_phi / theta_phi.shape[-1] 104 | 105 | t = torch.bmm(g, p.transpose(1, 2)) 106 | 107 | t = t.view(org_size) 108 | 109 | out = self.out(t) 110 | 111 | if self.nl_cfg.USE_BN: 112 | out = self.bn(out) 113 | out = out + x 114 | 115 | if self.group: 116 | out = out.transpose(1, 2) 117 | out = out.contiguous().view(*sz_before_group) 118 | out = out.transpose(1, 2) 119 | 120 | return out 121 | 122 | def c2_weight_mapping(self): 123 | weight_map = {} 124 | for name, m_child in self.named_children(): 125 | if m_child.state_dict(): 126 | if isinstance(m_child, (nn.BatchNorm3d, FrozenBatchNorm3d)): 127 | weight_map[name + '.weight'] = '{}_s'.format(name) 128 | weight_map[name + '.running_mean'] = '{}_rm'.format(name) 129 | weight_map[name + '.running_var'] = '{}_riv'.format(name) 130 | elif isinstance(m_child, nn.GroupNorm): 131 | weight_map[name + '.weight'] = '{}_s'.format(name) 132 | else: 133 | weight_map[name + '.weight'] = '{}_w'.format(name) 134 | weight_map[name + '.bias'] = '{}_b'.format(name) 135 | return weight_map 136 | -------------------------------------------------------------------------------- /alphaction/modeling/poolers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from alphaction.layers import ROIAlign3d, ROIPool3d 5 | 6 | 7 | class Pooler3d(nn.Module): 8 | def __init__(self, output_size, scale, sampling_ratio=None, pooler_type='align3d'): 9 | super(Pooler3d, self).__init__() 10 | if pooler_type == 'align3d': 11 | assert sampling_ratio is not None, 'Sampling ratio should be specified for 3d roi align.' 
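Stripped of the optional max-pooling, scaling switches and batch norm, the `NLBlock` forward above is embedded-Gaussian self-attention over all space-time positions. A minimal standalone sketch of that core computation, with illustrative shapes rather than the repo's module:

```python
import torch

B, C, C_inner, T, H, W = 1, 64, 32, 4, 8, 8            # dim_in=64, dim_inner=32
x = torch.randn(B, C, T, H, W)
theta_conv = torch.nn.Conv3d(C, C_inner, 1)
phi_conv = torch.nn.Conv3d(C, C_inner, 1)
g_conv = torch.nn.Conv3d(C, C_inner, 1)
out_conv = torch.nn.Conv3d(C_inner, C, 1)

theta = theta_conv(x).view(B, C_inner, -1)              # [B, C_inner, THW]
phi = phi_conv(x).view(B, C_inner, -1)
g = g_conv(x).view(B, C_inner, -1)

attn = torch.softmax(theta.transpose(1, 2) @ phi * C_inner ** -0.5, dim=-1)  # [B, THW, THW]
y = (g @ attn.transpose(1, 2)).view(B, C_inner, T, H, W)                      # weighted sum of values
out = out_conv(y) + x                                    # 1x1x1 projection + residual, same shape as input
print(out.shape)
```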
12 | self.pooler = ROIAlign3d( 13 | output_size, spatial_scale=scale, sampling_ratio=sampling_ratio 14 | ) 15 | elif pooler_type == 'pooling3d': 16 | self.pooler = ROIPool3d( 17 | output_size, spatial_scale=scale 18 | ) 19 | self.output_size = output_size 20 | 21 | def convert_to_roi_format(self, boxes, dtype, device): 22 | bbox_list = list() 23 | ids_list = list() 24 | for i, b in enumerate(boxes): 25 | if not b: 26 | bbox_list.append(torch.zeros((0, 4), dtype=dtype, device=device)) 27 | ids_list.append(torch.zeros((0, 1), dtype=dtype, device=device)) 28 | else: 29 | bbox_list.append(b.bbox) 30 | ids_list.append(torch.full((len(b), 1), i, dtype=dtype, device=device)) 31 | concat_boxes = torch.cat(bbox_list, dim=0) 32 | ids = torch.cat(ids_list, dim=0) 33 | rois = torch.cat([ids, concat_boxes], dim=1) 34 | 35 | return rois 36 | 37 | def forward(self, x, boxes): 38 | rois = self.convert_to_roi_format(boxes, x.dtype, x.device) 39 | return self.pooler(x, rois) 40 | 41 | 42 | def make_3d_pooler(head_cfg): 43 | resolution = head_cfg.POOLER_RESOLUTION 44 | scale = head_cfg.POOLER_SCALE 45 | sampling_ratio = head_cfg.POOLER_SAMPLING_RATIO 46 | pooler_type = head_cfg.POOLER_TYPE 47 | pooler = Pooler3d( 48 | output_size=(resolution, resolution), 49 | scale=scale, 50 | sampling_ratio=sampling_ratio, 51 | pooler_type=pooler_type, 52 | ) 53 | return pooler -------------------------------------------------------------------------------- /alphaction/modeling/registry.py: -------------------------------------------------------------------------------- 1 | from alphaction.utils.registry import Registry 2 | 3 | BACKBONES = Registry() 4 | ROI_ACTION_FEATURE_EXTRACTORS = Registry() 5 | ROI_ACTION_PREDICTORS = Registry() 6 | INTERACTION_AGGREGATION_STRUCTURES = Registry() -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/modeling/roi_heads/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/modeling/roi_heads/action_head/__init__.py -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/action_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .roi_action_feature_extractor import make_roi_action_feature_extractor 4 | from .roi_action_predictors import make_roi_action_predictor 5 | from .inference import make_roi_action_post_processor 6 | from .loss import make_roi_action_loss_evaluator 7 | from .metric import make_roi_action_accuracy_evaluator 8 | from alphaction.modeling.utils import prepare_pooled_feature 9 | from alphaction.utils.comm import all_reduce 10 | 11 | 12 | class ROIActionHead(torch.nn.Module): 13 | """ 14 | Generic Action Head class. 
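The `convert_to_roi_format` method above flattens the per-clip `BoxList`s into the `[batch_index, x1, y1, x2, y2]` layout that `ROIAlign3d`/`ROIPool3d` consume. Since `BoxList` itself is defined elsewhere, here is a plain-tensor sketch of the same packing:

```python
import torch

# Detected boxes (xyxy) for a batch of two clips, mimicking what
# convert_to_roi_format extracts from the per-clip BoxList objects.
boxes_clip0 = torch.tensor([[10., 20., 60., 90.],
                            [30., 15., 80., 100.]])
boxes_clip1 = torch.tensor([[5., 5., 50., 70.]])

ids = torch.cat([torch.full((len(b), 1), i, dtype=torch.float32)
                 for i, b in enumerate([boxes_clip0, boxes_clip1])])
rois = torch.cat([ids, torch.cat([boxes_clip0, boxes_clip1])], dim=1)
print(rois)   # each row: [batch_index, x1, y1, x2, y2]
```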
15 | """ 16 | 17 | def __init__(self, cfg, dim_in): 18 | super(ROIActionHead, self).__init__() 19 | self.feature_extractor = make_roi_action_feature_extractor(cfg, dim_in) 20 | self.predictor = make_roi_action_predictor(cfg, self.feature_extractor.dim_out) 21 | self.post_processor = make_roi_action_post_processor(cfg) 22 | self.loss_evaluator = make_roi_action_loss_evaluator(cfg) 23 | self.accuracy_evaluator = make_roi_action_accuracy_evaluator(cfg) 24 | self.test_ext = cfg.TEST.EXTEND_SCALE 25 | 26 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 27 | # In training stage, boxes are from gt. 28 | # In testing stage, boxes are detected by human detector and proposals should be 29 | # enlarged boxes. 30 | assert not (self.training and part_forward >= 0) 31 | 32 | if part_forward == 1: 33 | boxes = extras["current_feat_p"] 34 | objects = extras["current_feat_o"] 35 | 36 | if self.training: 37 | proposals = self.loss_evaluator.sample_box(boxes) 38 | else: 39 | proposals = [box.extend(self.test_ext) for box in boxes] 40 | 41 | x, x_pooled, x_objects = self.feature_extractor(slow_features, fast_features, proposals, objects, extras, part_forward) 42 | 43 | if part_forward == 0: 44 | pooled_feature = prepare_pooled_feature(x_pooled, boxes) 45 | if x_objects is None: 46 | object_pooled_feature = None 47 | else: 48 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 49 | return [pooled_feature, object_pooled_feature], {}, {}, {} 50 | 51 | action_logits = self.predictor(x) 52 | 53 | if not self.training: 54 | result = self.post_processor((action_logits,), boxes) 55 | return result, {}, {}, {} 56 | 57 | box_num = action_logits.size(0) 58 | box_num = torch.as_tensor([box_num], dtype=torch.float32, device=action_logits.device) 59 | all_reduce(box_num, average=True) 60 | 61 | loss_dict, loss_weight = self.loss_evaluator( 62 | [action_logits], box_num.item(), 63 | ) 64 | 65 | metric_dict = self.accuracy_evaluator( 66 | [action_logits], proposals, box_num.item(), 67 | ) 68 | 69 | pooled_feature = prepare_pooled_feature(x_pooled, proposals) 70 | if x_objects is None: 71 | object_pooled_feature = [] 72 | else: 73 | object_pooled_feature = prepare_pooled_feature(x_objects, objects) 74 | 75 | return ( 76 | [pooled_feature, object_pooled_feature], 77 | loss_dict, 78 | loss_weight, 79 | metric_dict, 80 | ) 81 | 82 | def c2_weight_mapping(self): 83 | weight_map = {} 84 | for name, m_child in self.named_children(): 85 | if m_child.state_dict() and hasattr(m_child, "c2_weight_mapping"): 86 | child_map = m_child.c2_weight_mapping() 87 | for key, val in child_map.items(): 88 | new_key = name + '.' + key 89 | weight_map[new_key] = val 90 | return weight_map 91 | 92 | 93 | def build_roi_action_head(cfg, dim_in): 94 | return ROIActionHead(cfg, dim_in) 95 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from alphaction.structures.bounding_box import BoxList 6 | 7 | 8 | class PostProcessor(nn.Module): 9 | def __init__(self, pose_action_num): 10 | super(PostProcessor, self).__init__() 11 | self.pose_action_num = pose_action_num 12 | 13 | def forward(self, x, boxes): 14 | # boxes should be (#detections,4) 15 | # prob should be calculated in different way. 
16 | class_logits, = x 17 | pose_action_prob = F.softmax(class_logits[:,:self.pose_action_num],-1) 18 | interaction_action_prob = torch.sigmoid(class_logits[:,self.pose_action_num:]) 19 | 20 | action_prob = torch.cat((pose_action_prob,interaction_action_prob),1) 21 | 22 | image_shapes = [box.size for box in boxes] 23 | boxes_per_image = [len(box) for box in boxes] 24 | box_tensors = [a.bbox for a in boxes] 25 | 26 | action_prob = action_prob.split(boxes_per_image, dim=0) 27 | 28 | results = [] 29 | for prob, boxes_per_image, image_shape in zip( 30 | action_prob, box_tensors, image_shapes 31 | ): 32 | boxlist = self.prepare_boxlist(boxes_per_image, prob, image_shape) 33 | results.append(boxlist) 34 | return results 35 | 36 | def prepare_boxlist(self, boxes, scores, image_shape): 37 | boxlist = BoxList(boxes, image_shape, mode="xyxy") 38 | boxlist.add_field("scores", scores) 39 | return boxlist 40 | 41 | 42 | def make_roi_action_post_processor(cfg): 43 | softmax_num = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 44 | postprocessor = PostProcessor(softmax_num) 45 | return postprocessor 46 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.layers import SigmoidFocalLoss, SoftmaxFocalLoss 3 | from alphaction.modeling.utils import cat 4 | 5 | 6 | class ActionLossComputation(object): 7 | def __init__(self, cfg): 8 | self.proposal_per_clip = cfg.MODEL.ROI_ACTION_HEAD.PROPOSAL_PER_CLIP 9 | self.num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 10 | self.num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 11 | self.num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 12 | 13 | self.weight_dict = dict( 14 | loss_pose_action = cfg.MODEL.ROI_ACTION_HEAD.POSE_LOSS_WEIGHT, 15 | loss_object_interaction = cfg.MODEL.ROI_ACTION_HEAD.OBJECT_LOSS_WEIGHT, 16 | loss_person_interaction = cfg.MODEL.ROI_ACTION_HEAD.PERSON_LOSS_WEIGHT, 17 | ) 18 | 19 | gamma = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.GAMMA 20 | alpha = cfg.MODEL.ROI_ACTION_HEAD.FOCAL_LOSS.ALPHA 21 | self.sigmoid_focal_loss = SigmoidFocalLoss(gamma, alpha, reduction="none") 22 | self.softmax_focal_loss = SoftmaxFocalLoss(gamma, alpha, reduction="sum") 23 | 24 | def sample_box(self, boxes): 25 | proposals = [] 26 | num_proposals = self.proposal_per_clip 27 | for boxes_per_image in boxes: 28 | num_boxes = len(boxes_per_image) 29 | 30 | if num_boxes > num_proposals: 31 | choice_inds = torch.randperm(num_boxes)[:num_proposals] 32 | proposals_per_image = boxes_per_image[choice_inds] 33 | else: 34 | proposals_per_image = boxes_per_image 35 | proposals_per_image = proposals_per_image.random_aug(0.2, 0.1, 0.1, 0.05) 36 | proposals.append(proposals_per_image) 37 | self._proposals = proposals 38 | return proposals 39 | 40 | def __call__(self, class_logits, avg_box_num): 41 | class_logits = cat(class_logits, dim=0) 42 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 43 | "The shape of tensor class logits doesn't match total number of action classes." 
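The `PostProcessor` shown above treats the first `num_pose` logits as mutually exclusive pose actions (softmax) and the remaining logits as independent multi-label interactions (sigmoid). A standalone sketch with illustrative class counts (the real ones come from the `cfg.MODEL.ROI_ACTION_HEAD.*` settings):

```python
import torch
import torch.nn.functional as F

num_pose, num_interaction = 14, 66                            # illustrative class counts
class_logits = torch.randn(3, num_pose + num_interaction)     # 3 detected persons

pose_prob = F.softmax(class_logits[:, :num_pose], dim=-1)     # mutually exclusive pose actions
interaction_prob = torch.sigmoid(class_logits[:, num_pose:])  # independent multi-label actions
action_prob = torch.cat((pose_prob, interaction_prob), dim=1)
print(action_prob.shape, pose_prob.sum(dim=1))  # the pose block sums to 1 per box, the rest does not
```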
44 | 45 | if not hasattr(self, "_proposals"): 46 | raise RuntimeError("sample_box needs to be called before") 47 | 48 | proposals = self._proposals 49 | 50 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 51 | assert class_logits.shape[1] == labels.shape[1], \ 52 | "The shape of tensor class logits doesn't match the label tensor." 53 | 54 | loss_dict = {} 55 | 56 | if self.num_pose > 0: 57 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 58 | pose_logits = class_logits[:, :self.num_pose] 59 | pose_loss = self.softmax_focal_loss(pose_logits, pose_label) / avg_box_num 60 | loss_dict["loss_pose_action"] = pose_loss 61 | 62 | interaction_label = labels[:, self.num_pose:].to(dtype=torch.float32) 63 | object_label = interaction_label[:, :self.num_object] 64 | person_label = interaction_label[:, self.num_object:] 65 | 66 | interaction_logits = class_logits[:, self.num_pose:] 67 | object_logits = interaction_logits[:, :self.num_object] 68 | person_logits = interaction_logits[:, self.num_object:] 69 | 70 | if self.num_object > 0: 71 | object_loss = self.sigmoid_focal_loss(object_logits, object_label).mean(dim=1).sum() / avg_box_num 72 | loss_dict["loss_object_interaction"] = object_loss 73 | if self.num_person > 0: 74 | person_loss = self.sigmoid_focal_loss(person_logits, person_label).mean(dim=1).sum() / avg_box_num 75 | loss_dict["loss_person_interaction"] = person_loss 76 | 77 | return loss_dict, self.weight_dict 78 | 79 | 80 | def make_roi_action_loss_evaluator(cfg): 81 | loss_evaluator = ActionLossComputation(cfg) 82 | 83 | return loss_evaluator -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from alphaction.modeling.utils import cat 3 | 4 | 5 | class ActionAccuracyComputation(object): 6 | def __init__(self, num_pose, num_object, num_person): 7 | self.num_pose = num_pose 8 | self.num_object = num_object 9 | self.num_person = num_person 10 | 11 | def logic_iou(self, pred, label): 12 | device = pred.device 13 | 14 | version = torch.__version__ 15 | if eval('.'.join(version.split('.')[:2]))>=1.3: 16 | pred = pred.bool() 17 | label = label.bool() 18 | 19 | label_union = (pred | label).float().sum(dim=1) 20 | label_inter = (pred & label).float().sum(dim=1) 21 | replacer = torch.ones_like(label_union, device=device) 22 | zero_mask = label_union == 0 23 | label_inter = torch.where(zero_mask, replacer, label_inter) 24 | label_union = torch.where(zero_mask, replacer, label_union) 25 | return label_inter / label_union 26 | 27 | def __call__(self, class_logits, proposals, avg_box_num): 28 | class_logits = [logits.detach() for logits in class_logits] 29 | class_logits = cat(class_logits, dim=0) 30 | assert class_logits.shape[1] == (self.num_pose + self.num_object + self.num_person), \ 31 | "The shape of tensor class logits doesn't match total number of action classes." 
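`logic_iou` above scores multi-label predictions by the Jaccard overlap between the predicted and ground-truth label sets, counting an all-empty row as a perfect match. A small standalone reproduction of that rule:

```python
import torch

# Predicted and ground-truth multi-hot interaction labels for 3 boxes, 5 classes.
pred  = torch.tensor([[1, 0, 1, 0, 0],
                      [0, 0, 0, 0, 0],
                      [1, 1, 0, 0, 0]], dtype=torch.bool)
label = torch.tensor([[1, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0],
                      [1, 1, 0, 0, 0]], dtype=torch.bool)

inter = (pred & label).float().sum(dim=1)
union = (pred | label).float().sum(dim=1)
ones = torch.ones_like(union)
zero_mask = union == 0
iou = torch.where(zero_mask, ones, inter) / torch.where(zero_mask, ones, union)
print(iou)  # tensor([0.5000, 1.0000, 1.0000]); an all-empty row counts as a perfect match
```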
32 | 33 | labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) 34 | 35 | metric_dict = {} 36 | if self.num_pose>0: 37 | pose_label = labels[:, :self.num_pose].argmax(dim=1) 38 | pose_pred = class_logits[:, :self.num_pose].argmax(dim=1) 39 | accuracy_pose_action = pose_label.eq(pose_pred).float().sum() 40 | metric_dict["accuracy_pose_action"] = accuracy_pose_action / avg_box_num 41 | 42 | interaction_label = labels[:, self.num_pose:] 43 | interaction_logits = class_logits[:, self.num_pose:] 44 | interaction_pred = interaction_logits.sigmoid() > 0.5 45 | 46 | if self.num_object>0: 47 | object_label = interaction_label[:, :self.num_object] 48 | object_pred = interaction_pred[:, :self.num_object] 49 | accuracy_object_interaction = self.logic_iou(object_pred, object_label) 50 | metric_dict["accuracy_object_interaction"] = accuracy_object_interaction.sum() / avg_box_num 51 | 52 | if self.num_person>0: 53 | person_label = interaction_label[:, self.num_object:] 54 | person_pred = interaction_pred[:, self.num_object:] 55 | accuracy_person_interaction = self.logic_iou(person_pred, person_label) 56 | metric_dict["accuracy_person_interaction"] = accuracy_person_interaction.sum() / avg_box_num 57 | 58 | return metric_dict 59 | 60 | 61 | def make_roi_action_accuracy_evaluator(cfg): 62 | num_pose = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_MOVEMENT_CLASSES 63 | num_object = cfg.MODEL.ROI_ACTION_HEAD.NUM_OBJECT_MANIPULATION_CLASSES 64 | num_person = cfg.MODEL.ROI_ACTION_HEAD.NUM_PERSON_INTERACTION_CLASSES 65 | return ActionAccuracyComputation(num_pose, num_object, num_person) -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/action_head/roi_action_predictors.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from alphaction.modeling import registry 3 | 4 | 5 | @registry.ROI_ACTION_PREDICTORS.register("FCPredictor") 6 | class FCPredictor(nn.Module): 7 | def __init__(self, config, dim_in): 8 | super(FCPredictor, self).__init__() 9 | 10 | num_classes = config.MODEL.ROI_ACTION_HEAD.NUM_CLASSES 11 | 12 | dropout_rate = config.MODEL.ROI_ACTION_HEAD.DROPOUT_RATE 13 | if dropout_rate > 0: 14 | self.dropout = nn.Dropout(p=dropout_rate, inplace=True) 15 | 16 | self.cls_score = nn.Linear(dim_in, num_classes) 17 | 18 | nn.init.normal_(self.cls_score.weight, std=0.01) 19 | nn.init.constant_(self.cls_score.bias, 0) 20 | 21 | def forward(self, x): 22 | x = x.view(x.size(0), -1) 23 | if hasattr(self, "dropout"): 24 | x = self.dropout(x) 25 | scores = self.cls_score(x) 26 | 27 | return scores 28 | 29 | def c2_weight_mapping(self): 30 | return {"cls_score.weight": "pred_w", 31 | "cls_score.bias": "pred_b"} 32 | 33 | 34 | def make_roi_action_predictor(cfg, dim_in): 35 | func = registry.ROI_ACTION_PREDICTORS[cfg.MODEL.ROI_ACTION_HEAD.PREDICTOR] 36 | return func(cfg, dim_in) 37 | -------------------------------------------------------------------------------- /alphaction/modeling/roi_heads/roi_heads_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .action_head.action_head import build_roi_action_head 4 | 5 | 6 | class Combined3dROIHeads(torch.nn.ModuleDict): 7 | def __init__(self, cfg, heads): 8 | super(Combined3dROIHeads, self).__init__(heads) 9 | self.cfg = cfg.clone() 10 | 11 | def forward(self, slow_features, fast_features, boxes, objects=None, extras={}, part_forward=-1): 12 | result, loss_action, loss_weight, 
accuracy_action = self.action(slow_features, fast_features, boxes, objects, extras, part_forward) 13 | 14 | return result, loss_action, loss_weight, accuracy_action 15 | 16 | def c2_weight_mapping(self): 17 | weight_map = {} 18 | for name, m_child in self.named_children(): 19 | if m_child.state_dict() and hasattr(m_child,"c2_weight_mapping"): 20 | child_map = m_child.c2_weight_mapping() 21 | for key, val in child_map.items(): 22 | new_key = name + '.' + key 23 | weight_map[new_key] = val 24 | return weight_map 25 | 26 | 27 | def build_3d_roi_heads(cfg, dim_in): 28 | roi_heads = [] 29 | roi_heads.append(("action", build_roi_action_head(cfg, dim_in))) 30 | 31 | if roi_heads: 32 | roi_heads = Combined3dROIHeads(cfg, roi_heads) 33 | 34 | return roi_heads 35 | -------------------------------------------------------------------------------- /alphaction/modeling/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utility functions 3 | """ 4 | 5 | import torch 6 | from alphaction.structures.bounding_box import BoxList 7 | 8 | 9 | def cat(tensors, dim=0): 10 | """ 11 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 12 | """ 13 | assert isinstance(tensors, (list, tuple)) 14 | if len(tensors) == 1: 15 | return tensors[0] 16 | return torch.cat(tensors, dim) 17 | 18 | def pad_sequence(sequence, targ_size, padding_value=0): 19 | tensor_size = sequence[0].size() 20 | trailing_dims = tensor_size[1:] 21 | out_dims = (len(sequence), targ_size) + trailing_dims 22 | 23 | out_tensor = sequence[0].new_full(out_dims, padding_value) 24 | for i, tensor in enumerate(sequence): 25 | length = tensor.size(0) 26 | out_tensor[i, :length, ...] = tensor 27 | 28 | return out_tensor 29 | 30 | def prepare_pooled_feature(x_pooled, boxes, detach=True): 31 | image_shapes = [box.size for box in boxes] 32 | boxes_per_image = [len(box) for box in boxes] 33 | box_tensors = [a.bbox for a in boxes] 34 | 35 | if detach: 36 | x_pooled = x_pooled.detach() 37 | pooled_feature = x_pooled.split(boxes_per_image, dim=0) 38 | 39 | boxes_result = [] 40 | for feature_per_image, boxes_per_image, image_shape in zip( 41 | pooled_feature, box_tensors, image_shapes 42 | ): 43 | boxlist = BoxList(boxes_per_image, image_shape, mode="xyxy") 44 | boxlist.add_field("pooled_feature", feature_per_image) 45 | boxes_result.append(boxlist) 46 | return boxes_result -------------------------------------------------------------------------------- /alphaction/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import make_optimizer 2 | from .build import make_lr_scheduler 3 | from .lr_scheduler import WarmupMultiStepLR 4 | -------------------------------------------------------------------------------- /alphaction/solver/build.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .lr_scheduler import WarmupMultiStepLR, HalfPeriodCosStepLR 4 | 5 | import torch.nn as nn 6 | from alphaction.modeling.roi_heads.action_head.IA_structure import IAStructure 7 | 8 | 9 | def make_optimizer(cfg, model): 10 | params = [] 11 | bn_param_set = set() 12 | transformer_param_set = set() 13 | for name, module in model.named_modules(): 14 | if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): 15 | bn_param_set.add(name + ".weight") 16 | bn_param_set.add(name + ".bias") 17 | elif isinstance(module, IAStructure): 18 | for 
param_name, _ in module.named_parameters(name): 19 | transformer_param_set.add(param_name) 20 | for key, value in model.named_parameters(): 21 | if not value.requires_grad: 22 | continue 23 | lr = cfg.SOLVER.BASE_LR 24 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 25 | if key in bn_param_set: 26 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BN 27 | elif "bias" in key: 28 | lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR 29 | weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS 30 | if key in transformer_param_set: 31 | lr = lr * cfg.SOLVER.IA_LR_FACTOR 32 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 33 | 34 | optimizer = torch.optim.SGD(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM) 35 | return optimizer 36 | 37 | 38 | def make_lr_scheduler(cfg, optimizer): 39 | scheduler = cfg.SOLVER.SCHEDULER 40 | if scheduler not in ("half_period_cosine", "warmup_multi_step"): 41 | raise ValueError('Scheduler not available') 42 | if scheduler == 'warmup_multi_step': 43 | return WarmupMultiStepLR( 44 | optimizer, 45 | cfg.SOLVER.STEPS, 46 | cfg.SOLVER.GAMMA, 47 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 48 | warmup_iters=cfg.SOLVER.WARMUP_ITERS if cfg.SOLVER.WARMUP_ON else 0, 49 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 50 | ) 51 | elif scheduler == 'half_period_cosine': 52 | return HalfPeriodCosStepLR( 53 | optimizer, 54 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR, 55 | warmup_iters=cfg.SOLVER.WARMUP_ITERS if cfg.SOLVER.WARMUP_ON else 0, 56 | max_iters=cfg.SOLVER.MAX_ITER, 57 | warmup_method=cfg.SOLVER.WARMUP_METHOD, 58 | ) 59 | -------------------------------------------------------------------------------- /alphaction/solver/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/solver/lr_scheduler.py 2 | from bisect import bisect_right 3 | 4 | import torch 5 | import math 6 | 7 | 8 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 9 | def __init__( 10 | self, 11 | optimizer, 12 | milestones, 13 | gamma=0.1, 14 | warmup_factor=1.0 / 3, 15 | warmup_iters=500, 16 | warmup_method="linear", 17 | last_epoch=-1, 18 | ): 19 | if not list(milestones) == sorted(milestones): 20 | raise ValueError( 21 | "Milestones should be a list of" " increasing integers. 
Got {}", 22 | milestones, 23 | ) 24 | 25 | if warmup_method not in ("constant", "linear"): 26 | raise ValueError( 27 | "Only 'constant' or 'linear' warmup_method accepted" 28 | "got {}".format(warmup_method) 29 | ) 30 | self.milestones = milestones 31 | self.gamma = gamma 32 | self.warmup_factor = warmup_factor 33 | self.warmup_iters = warmup_iters 34 | self.warmup_method = warmup_method 35 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 36 | 37 | def get_lr(self): 38 | warmup_factor = 1 39 | if self.last_epoch < self.warmup_iters: 40 | if self.warmup_method == "constant": 41 | warmup_factor = self.warmup_factor 42 | elif self.warmup_method == "linear": 43 | alpha = float(self.last_epoch) / self.warmup_iters 44 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 45 | return [ 46 | base_lr 47 | * warmup_factor 48 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 49 | for base_lr in self.base_lrs 50 | ] 51 | 52 | class HalfPeriodCosStepLR(torch.optim.lr_scheduler._LRScheduler): 53 | def __init__( 54 | self, 55 | optimizer, 56 | warmup_factor=1.0 / 3, 57 | warmup_iters=8000, 58 | max_iters=60000, 59 | warmup_method="linear", 60 | last_epoch=-1, 61 | ): 62 | if warmup_method not in ("constant", "linear"): 63 | raise ValueError( 64 | "Only 'constant' or 'linear' warmup_method accepted" 65 | "got {}".format(warmup_method) 66 | ) 67 | self.warmup_factor = warmup_factor 68 | self.warmup_iters = warmup_iters 69 | self.max_iters = max_iters 70 | self.warmup_method = warmup_method 71 | super(HalfPeriodCosStepLR, self).__init__(optimizer, last_epoch) 72 | 73 | def get_lr(self): 74 | warmup_factor = 1 75 | if self.last_epoch < self.warmup_iters: 76 | if self.warmup_method == "constant": 77 | warmup_factor = self.warmup_factor 78 | elif self.warmup_method == "linear": 79 | alpha = float(self.last_epoch) / self.warmup_iters 80 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 81 | else: 82 | warmup_factor = 0.5 * (math.cos(self.last_epoch / self.max_iters * math.pi) + 1) 83 | return [ 84 | base_lr 85 | * warmup_factor 86 | for base_lr in self.base_lrs 87 | ] -------------------------------------------------------------------------------- /alphaction/structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/structures/__init__.py -------------------------------------------------------------------------------- /alphaction/structures/memory_pool.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | class MemoryPool(object): 4 | def __init__(self): 5 | self.cache = defaultdict(dict) 6 | 7 | def update(self, update_info): 8 | for movie_id, feature_per_movie in update_info.items(): 9 | self.cache[movie_id].update(feature_per_movie) 10 | 11 | def update_list(self, update_info_list): 12 | for update_info in update_info_list: 13 | self.update(update_info) 14 | 15 | def __getitem__(self, item): 16 | if isinstance(item, tuple) and len(item)==2: 17 | return self.cache[item[0]][item[1]] 18 | return self.cache[item] 19 | 20 | def __setitem__(self, key, value): 21 | if isinstance(key, tuple) and len(key)==2: 22 | self.cache[key[0]][key[1]] = value 23 | else: 24 | self.cache[key] = value 25 | 26 | def __delitem__(self, item): 27 | if isinstance(item, tuple) and len(item)==2: 28 | del self.cache[item[0]][item[1]] 29 | else: 30 | del 
self.cache[item] 31 | 32 | def __contains__(self, item): 33 | if isinstance(item, tuple) and len(item)==2: 34 | return (item[0] in self.cache and item[1] in self.cache[item[0]]) 35 | return (item in self.cache) 36 | 37 | def items(self): 38 | return self.cache.items() -------------------------------------------------------------------------------- /alphaction/utils/IA_helper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | def _block_set(ia_blocks): 4 | if len(ia_blocks) > 0 and isinstance(ia_blocks[0], list): 5 | ia_blocks = list(itertools.chain.from_iterable(ia_blocks)) 6 | return ia_blocks 7 | 8 | def has_person(ia_config): 9 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 10 | return (ia_config.ACTIVE and 'P' in ia_blocks and ia_config.MAX_PERSON > 0) 11 | 12 | 13 | def has_object(ia_config): 14 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 15 | return (ia_config.ACTIVE and 'O' in ia_blocks and ia_config.MAX_OBJECT > 0) 16 | 17 | 18 | def has_memory(ia_config): 19 | ia_blocks = _block_set(ia_config.I_BLOCK_LIST) 20 | return (ia_config.ACTIVE and 'M' in ia_blocks and ia_config.MAX_PER_SEC > 0) 21 | -------------------------------------------------------------------------------- /alphaction/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/alphaction/utils/__init__.py -------------------------------------------------------------------------------- /alphaction/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import pickle 4 | from collections import OrderedDict 5 | 6 | 7 | def _rename_weights(weights, weight_map): 8 | logger = logging.getLogger(__name__) 9 | logger.info("Remapping C2 weights") 10 | max_c2_key_size = max([len(k) for k in weight_map.values()]) 11 | new_weights = OrderedDict() 12 | for k in weight_map: 13 | c2_name = weight_map[k] 14 | logger.info("C2 name: {: <{}} mapped name: {}".format(c2_name, max_c2_key_size, k)) 15 | if c2_name not in weights: 16 | logger.info("{} not found in C2 weights file, skipped.".format(c2_name)) 17 | continue 18 | v = weights[c2_name] 19 | w = torch.from_numpy(v) 20 | new_weights[k] = w 21 | return new_weights 22 | 23 | 24 | def _load_c2_pickled_weights(file_path): 25 | with open(file_path, "rb") as f: 26 | if torch._six.PY3: 27 | data = pickle.load(f, encoding="latin1") 28 | else: 29 | data = pickle.load(f) 30 | if "blobs" in data: 31 | weights = data["blobs"] 32 | else: 33 | weights = data 34 | return weights 35 | 36 | 37 | def load_c2_format(f, weight_map): 38 | # We also support load from caffe2 weights. 
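The `MemoryPool` above is a two-level cache: the outer key is a movie id and the inner key is presumably the clip timestamp, and both can be addressed at once with a tuple. A quick usage sketch, assuming the package is importable; string placeholders stand in for the pooled-feature `BoxList`s stored in real use.

```python
from alphaction.structures.memory_pool import MemoryPool

pool = MemoryPool()
pool["movie_0001", 902] = "pooled features @ t=902"   # tuple key -> nested dict entry
pool["movie_0001", 903] = "pooled features @ t=903"

print(("movie_0001", 902) in pool)          # True
print(pool["movie_0001"])                   # the whole per-movie cache
pool.update({"movie_0002": {900: "feat"}})  # merge another movie's entries
del pool["movie_0001", 902]
```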
39 | state_dict = _load_c2_pickled_weights(f) 40 | state_dict = _rename_weights(state_dict, weight_map) 41 | return dict(model=state_dict) 42 | -------------------------------------------------------------------------------- /alphaction/utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/checkpoint.py 2 | import logging 3 | import os 4 | 5 | import torch 6 | 7 | from alphaction.utils.model_serialization import load_state_dict 8 | from alphaction.utils.c2_model_loading import load_c2_format 9 | from alphaction.structures.memory_pool import MemoryPool 10 | 11 | 12 | class Checkpointer(object): 13 | def __init__( 14 | self, 15 | model, 16 | optimizer=None, 17 | scheduler=None, 18 | save_dir="", 19 | save_to_disk=None, 20 | logger=None, 21 | ): 22 | self.model = model 23 | self.optimizer = optimizer 24 | self.scheduler = scheduler 25 | self.save_dir = save_dir 26 | self.save_to_disk = save_to_disk 27 | if logger is None: 28 | logger = logging.getLogger(__name__) 29 | self.logger = logger 30 | 31 | def save(self, name, **kwargs): 32 | if not self.save_dir: 33 | return 34 | 35 | if not self.save_to_disk: 36 | return 37 | 38 | data = {} 39 | data["model"] = self.model.state_dict() 40 | if self.optimizer is not None: 41 | data["optimizer"] = self.optimizer.state_dict() 42 | if self.scheduler is not None: 43 | data["scheduler"] = self.scheduler.state_dict() 44 | data.update(kwargs) 45 | 46 | save_file = os.path.join(self.save_dir, "{}.pth".format(name)) 47 | self.logger.info("Saving checkpoint to {}".format(save_file)) 48 | torch.save(data, save_file) 49 | self.tag_last_checkpoint(save_file) 50 | 51 | def load(self, f=None, model_weight_only=False, adjust_scheduler=False, no_head=False): 52 | if self.has_checkpoint(): 53 | # override argument with existing checkpoint 54 | f = self.get_checkpoint_file() 55 | if not f: 56 | # no checkpoint could be found 57 | self.logger.info("No checkpoint found. 
Initializing model from scratch") 58 | return {} 59 | self.logger.info("Loading checkpoint from {}".format(f)) 60 | checkpoint = self._load_file(f) 61 | self._load_model(checkpoint, no_head) 62 | if "optimizer" in checkpoint and self.optimizer: 63 | if model_weight_only: 64 | del checkpoint['optimizer'] 65 | else: 66 | self.logger.info("Loading optimizer from {}".format(f)) 67 | self.optimizer.load_state_dict(checkpoint.pop("optimizer")) 68 | if "scheduler" in checkpoint and self.scheduler: 69 | if model_weight_only: 70 | del checkpoint['scheduler'] 71 | elif adjust_scheduler: 72 | last_epoch = checkpoint.pop("scheduler")['last_epoch'] 73 | self.logger.info("Adjust scheduler at iteration {}".format(last_epoch)) 74 | self.scheduler.step(last_epoch) 75 | else: 76 | self.logger.info("Loading scheduler from {}".format(f)) 77 | self.scheduler.load_state_dict(checkpoint.pop("scheduler")) 78 | 79 | if model_weight_only: 80 | checkpoint["iteration"] = 0 81 | checkpoint["person_pool"] = MemoryPool() 82 | # return any further checkpoint dataset 83 | return checkpoint 84 | 85 | def has_checkpoint(self): 86 | save_file = os.path.join(self.save_dir, "last_checkpoint") 87 | return os.path.exists(save_file) 88 | 89 | def get_checkpoint_file(self): 90 | save_file = os.path.join(self.save_dir, "last_checkpoint") 91 | try: 92 | with open(save_file, "r") as f: 93 | last_saved = f.read() 94 | last_saved = last_saved.strip() 95 | except IOError: 96 | # if file doesn't exist, maybe because it has just been 97 | # deleted by a separate process 98 | last_saved = "" 99 | return last_saved 100 | 101 | def tag_last_checkpoint(self, last_filename): 102 | save_file = os.path.join(self.save_dir, "last_checkpoint") 103 | with open(save_file, "w") as f: 104 | f.write(last_filename) 105 | 106 | def _load_file(self, f): 107 | return torch.load(f, map_location=torch.device("cpu")) 108 | 109 | def _load_model(self, checkpoint, no_head): 110 | load_state_dict(self.model, checkpoint.pop("model"), no_head) 111 | 112 | 113 | class ActionCheckpointer(Checkpointer): 114 | def __init__( 115 | self, 116 | cfg, 117 | model, 118 | optimizer=None, 119 | scheduler=None, 120 | save_dir="", 121 | save_to_disk=None, 122 | logger=None, 123 | ): 124 | super(ActionCheckpointer, self).__init__( 125 | model, optimizer, scheduler, save_dir, save_to_disk, logger 126 | ) 127 | self.cfg = cfg.clone() 128 | 129 | def _load_file(self, f): 130 | if f.endswith(".pkl"): 131 | return load_c2_format(f, self._get_c2_weight_map()) 132 | loaded = super(ActionCheckpointer, self)._load_file(f) 133 | if "model" not in loaded: 134 | loaded = dict(model=loaded) 135 | return loaded 136 | 137 | def _get_c2_weight_map(self): 138 | if hasattr(self.model, "c2_weight_mapping"): 139 | return self.model.c2_weight_mapping() 140 | elif hasattr(self.model, "module") and hasattr(self.model.module, "c2_weight_mapping"): 141 | return self.model.module.c2_weight_mapping() 142 | else: 143 | raise RuntimeError("Cannot get C2 weight mapping from current model definition.") -------------------------------------------------------------------------------- /alphaction/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/logger.py 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | 8 | def setup_logger(name, save_dir, distributed_rank, filename=None): 9 | logger = logging.getLogger(name) 10 | 
logger.setLevel(logging.DEBUG) 11 | logger.propagate = False 12 | # don't log results for the non-master process 13 | if distributed_rank > 0: 14 | return logger 15 | ch = logging.StreamHandler(stream=sys.stdout) 16 | ch.setLevel(logging.DEBUG) 17 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 18 | ch.setFormatter(formatter) 19 | logger.addHandler(ch) 20 | 21 | if save_dir: 22 | if filename is None: 23 | filename = time.strftime("%Y-%m-%d_%H.%M.%S", time.localtime()) + ".log" 24 | fh = logging.FileHandler(os.path.join(save_dir, filename)) 25 | fh.setLevel(logging.DEBUG) 26 | fh.setFormatter(formatter) 27 | logger.addHandler(fh) 28 | 29 | return logger 30 | 31 | def setup_tblogger(save_dir, distributed_rank): 32 | if distributed_rank>0: 33 | return None 34 | from tensorboardX import SummaryWriter 35 | tbdir = os.path.join(save_dir,'tb') 36 | os.makedirs(tbdir,exist_ok=True) 37 | tblogger = SummaryWriter(tbdir) 38 | return tblogger -------------------------------------------------------------------------------- /alphaction/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/metric_logger.py 2 | from collections import defaultdict 3 | from collections import deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue(object): 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global series average. 11 | """ 12 | 13 | def __init__(self, window_size=20): 14 | self.deque = deque(maxlen=window_size) 15 | self.series = [] 16 | self.total = 0.0 17 | self.count = 0 18 | 19 | def update(self, value): 20 | self.deque.append(value) 21 | self.series.append(value) 22 | self.count += 1 23 | self.total += value 24 | 25 | @property 26 | def median(self): 27 | d = torch.tensor(list(self.deque)) 28 | return d.median().item() 29 | 30 | @property 31 | def avg(self): 32 | d = torch.tensor(list(self.deque)) 33 | return d.mean().item() 34 | 35 | @property 36 | def global_avg(self): 37 | return self.total / self.count 38 | 39 | 40 | class MetricLogger(object): 41 | def __init__(self, delimiter="\t"): 42 | self.meters = defaultdict(SmoothedValue) 43 | self.delimiter = delimiter 44 | 45 | def update(self, **kwargs): 46 | for k, v in kwargs.items(): 47 | if isinstance(v, torch.Tensor): 48 | v = v.item() 49 | assert isinstance(v, (float, int)) 50 | self.meters[k].update(v) 51 | 52 | def __getattr__(self, attr): 53 | if attr in self.meters: 54 | return self.meters[attr] 55 | if attr in self.__dict__: 56 | return self.__dict__[attr] 57 | raise AttributeError("'{}' object has no attribute '{}'".format( 58 | type(self).__name__, attr)) 59 | 60 | def __str__(self): 61 | loss_str = [] 62 | for name, meter in self.meters.items(): 63 | loss_str.append( 64 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 65 | ) 66 | return self.delimiter.join(loss_str) 67 | -------------------------------------------------------------------------------- /alphaction/utils/model_serialization.py: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/model_serialization.py 2 | from collections import OrderedDict 3 | import logging 4 | 5 | import torch 6 | 7 | 8 | def align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head): 9 | """ 10 
| Strategy: suppose that the models that we will create will have prefixes appended 11 | to each of its keys, for example due to an extra level of nesting that the original 12 | pre-trained weights from ImageNet won't contain. For example, model.state_dict() 13 | might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains 14 | res2.conv1.weight. We thus want to match both parameters together. 15 | For that, we look for each model weight, look among all loaded keys if there is one 16 | that is a suffix of the current weight name, and use it if that's the case. 17 | If multiple matches exist, take the one with longest size 18 | of the corresponding name. For example, for the same model as before, the pretrained 19 | weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, 20 | we want to match backbone[0].body.conv1.weight to conv1.weight, and 21 | backbone[0].body.res2.conv1.weight to res2.conv1.weight. 22 | """ 23 | current_keys = sorted(list(model_state_dict.keys())) 24 | loaded_keys = sorted(list(loaded_state_dict.keys())) 25 | # get a matrix of string matches, where each (i, j) entry correspond to the size of the 26 | # loaded_key string, if it matches 27 | match_matrix = [ 28 | len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys 29 | ] 30 | match_matrix = torch.as_tensor(match_matrix).view( 31 | len(current_keys), len(loaded_keys) 32 | ) 33 | max_match_size, idxs = match_matrix.max(1) 34 | # remove indices that correspond to no-match 35 | idxs[max_match_size == 0] = -1 36 | 37 | # used for logging 38 | max_size = max([len(key) for key in current_keys]) if current_keys else 1 39 | max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 40 | log_str_template = "{: <{}} loaded from {: <{}} of shape {}" 41 | logger = logging.getLogger(__name__) 42 | for idx_new, idx_old in enumerate(idxs.tolist()): 43 | if idx_old == -1: 44 | continue 45 | key = current_keys[idx_new] 46 | key_old = loaded_keys[idx_old] 47 | 48 | if no_head and key_old.startswith("roi_heads."): 49 | logger.info("{} will not be loaded.".format(key)) 50 | continue 51 | 52 | model_state_dict[key] = loaded_state_dict[key_old] 53 | logger.info( 54 | log_str_template.format( 55 | key, 56 | max_size, 57 | key_old, 58 | max_size_loaded, 59 | tuple(loaded_state_dict[key_old].shape), 60 | ) 61 | ) 62 | 63 | 64 | def strip_prefix_if_present(state_dict, prefix): 65 | keys = sorted(state_dict.keys()) 66 | if not all(key.startswith(prefix) for key in keys): 67 | return state_dict 68 | stripped_state_dict = OrderedDict() 69 | for key, value in state_dict.items(): 70 | stripped_state_dict[key.replace(prefix, "")] = value 71 | return stripped_state_dict 72 | 73 | 74 | def load_state_dict(model, loaded_state_dict, no_head): 75 | model_state_dict = model.state_dict() 76 | # if the state_dict comes from a model that was wrapped in a 77 | # DataParallel or DistributedDataParallel during serialization, 78 | # remove the "module" prefix before performing the matching 79 | loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") 80 | align_and_update_state_dicts(model_state_dict, loaded_state_dict, no_head) 81 | 82 | # use strict loading 83 | model.load_state_dict(model_state_dict) 84 | -------------------------------------------------------------------------------- /alphaction/utils/random_seed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import 
numpy as np 4 | 5 | def set_seed(seed, rank, world_size): 6 | rng = random.Random(seed) 7 | seed_per_rank = [rng.randint(0, 2**32-1) for _ in range(world_size)] 8 | cur_seed = seed_per_rank[rank] 9 | random.seed(cur_seed) 10 | torch.manual_seed(cur_seed) 11 | torch.cuda.manual_seed(cur_seed) 12 | np.random.seed(cur_seed) -------------------------------------------------------------------------------- /alphaction/utils/registry.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/utils/registry.py 2 | 3 | def _register_generic(module_dict, module_name, module): 4 | assert module_name not in module_dict 5 | module_dict[module_name] = module 6 | 7 | 8 | class Registry(dict): 9 | ''' 10 | A helper class for managing registering modules, it extends a dictionary 11 | and provides a register functions. 12 | 13 | Eg. creeting a registry: 14 | some_registry = Registry({"default": default_module}) 15 | 16 | There're two ways of registering new modules: 17 | 1): normal way is just calling register function: 18 | def foo(): 19 | ... 20 | some_registry.register("foo_module", foo) 21 | 2): used as decorator when declaring the module: 22 | @some_registry.register("foo_module") 23 | @some_registry.register("foo_modeul_nickname") 24 | def foo(): 25 | ... 26 | 27 | Access of module is just like using a dictionary, eg: 28 | f = some_registry["foo_modeul"] 29 | ''' 30 | def __init__(self, *args, **kwargs): 31 | super(Registry, self).__init__(*args, **kwargs) 32 | 33 | def register(self, module_name, module=None): 34 | # used as function call 35 | if module is not None: 36 | _register_generic(self, module_name, module) 37 | return 38 | 39 | # used as decorator 40 | def register_fn(fn): 41 | _register_generic(self, module_name, fn) 42 | return fn 43 | 44 | return register_fn 45 | -------------------------------------------------------------------------------- /alphaction/utils/video_decode.py: -------------------------------------------------------------------------------- 1 | import av 2 | 3 | def av_decode_video(video_path): 4 | with av.open(video_path) as container: 5 | frames = [] 6 | for frame in container.decode(video=0): 7 | frames.append(frame.to_rgb().to_ndarray()) 8 | return frames -------------------------------------------------------------------------------- /config_files/resnet101_8x8f_baseline.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet101-8x8.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet101" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: False 33 | INPUT: 34 | FRAME_NUM: 64 35 | FRAME_SAMPLE_RATE: 1 36 | TAU: 8 37 | ALPHA: 4 38 | SLOW_JITTER: True 39 | COLOR_JITTER: True 40 | DATASETS: 41 | TRAIN: ("ava_video_train_v2.2",) 42 | TEST: ("ava_video_val_v2.2",) 43 | 
DATALOADER: 44 | NUM_WORKERS: 4 45 | SIZE_DIVISIBILITY: 16 46 | SOLVER: 47 | BASE_LR: 0.0004 48 | WARMUP_FACTOR: 0.25 49 | BIAS_LR_FACTOR: 2 50 | WEIGHT_DECAY: 1e-7 51 | STEPS: (50000, 70000) 52 | WARMUP_ITERS: 2000 53 | MAX_ITER: 90000 54 | CHECKPOINT_PERIOD: 10000 55 | EVAL_PERIOD: 10000 56 | VIDEOS_PER_BATCH: 16 57 | TEST: 58 | BOX_THRESH: 0.8 59 | ACTION_THRESH: 0. 60 | VIDEOS_PER_BATCH: 16 61 | OUTPUT_DIR: "data/output/resnet101_8x8f_baseline" -------------------------------------------------------------------------------- /config_files/resnet101_8x8f_denseserial.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet101-8x8.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet101" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | STRUCTURE: "dense" 34 | DROPOUT: 0.2 35 | FUSION: "add" 36 | TEMPORAL_POSITION: True 37 | USE_ZERO_INIT_CONV: True 38 | LAYER_NORM: True 39 | MAX_PERSON: 25 40 | MAX_OBJECT: 5 41 | MAX_PER_SEC: 5 42 | DIM_INNER: 1024 43 | DIM_OUT: 2304 44 | I_BLOCK_LIST: [ "P", "O", "M", "P", "O", "M" ] 45 | INPUT: 46 | FRAME_NUM: 64 47 | FRAME_SAMPLE_RATE: 1 48 | TAU: 8 49 | ALPHA: 4 50 | SLOW_JITTER: True 51 | COLOR_JITTER: True 52 | DATASETS: 53 | TRAIN: ("ava_video_train_v2.2",) 54 | TEST: ("ava_video_val_v2.2",) 55 | DATALOADER: 56 | NUM_WORKERS: 4 57 | SIZE_DIVISIBILITY: 16 58 | SOLVER: 59 | BASE_LR: 0.0004 60 | WARMUP_FACTOR: 0.25 61 | BIAS_LR_FACTOR: 2 62 | IA_LR_FACTOR: 10.0 63 | WEIGHT_DECAY: 1e-7 64 | STEPS: (70000, 90000) 65 | WARMUP_ITERS: 2000 66 | MAX_ITER: 110000 67 | CHECKPOINT_PERIOD: 10000 68 | EVAL_PERIOD: 10000 69 | VIDEOS_PER_BATCH: 16 70 | TEST: 71 | BOX_THRESH: 0.8 72 | ACTION_THRESH: 0. 
73 | VIDEOS_PER_BATCH: 16 74 | OUTPUT_DIR: "data/output/resnet101_8x8f_denseserial" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_baseline.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: False 33 | INPUT: 34 | FRAME_NUM: 64 35 | FRAME_SAMPLE_RATE: 1 36 | TAU: 16 37 | ALPHA: 8 38 | SLOW_JITTER: True 39 | COLOR_JITTER: True 40 | DATASETS: 41 | TRAIN: ("ava_video_train_v2.2",) 42 | TEST: ("ava_video_val_v2.2",) 43 | DATALOADER: 44 | NUM_WORKERS: 4 45 | SIZE_DIVISIBILITY: 16 46 | SOLVER: 47 | BASE_LR: 0.0004 48 | WARMUP_FACTOR: 0.25 49 | BIAS_LR_FACTOR: 2 50 | WEIGHT_DECAY: 1e-7 51 | STEPS: (50000, 70000) 52 | WARMUP_ITERS: 2000 53 | MAX_ITER: 90000 54 | CHECKPOINT_PERIOD: 10000 55 | EVAL_PERIOD: 10000 56 | VIDEOS_PER_BATCH: 16 57 | TEST: 58 | BOX_THRESH: 0.8 59 | ACTION_THRESH: 0. 60 | VIDEOS_PER_BATCH: 16 61 | OUTPUT_DIR: "data/output/resnet50_4x16f_baseline" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_denseserial.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | STRUCTURE: "dense" 34 | DROPOUT: 0.2 35 | FUSION: "add" 36 | TEMPORAL_POSITION: True 37 | USE_ZERO_INIT_CONV: True 38 | LAYER_NORM: True 39 | MAX_PERSON: 25 40 | MAX_OBJECT: 5 41 | MAX_PER_SEC: 5 42 | DIM_INNER: 1024 43 | DIM_OUT: 2304 44 | I_BLOCK_LIST: [ "P", "O", "M", "P", "O", "M" ] 45 | INPUT: 46 | FRAME_NUM: 64 47 | FRAME_SAMPLE_RATE: 1 48 | TAU: 16 49 | ALPHA: 8 50 | SLOW_JITTER: True 51 | COLOR_JITTER: True 52 | DATASETS: 53 | TRAIN: ("ava_video_train_v2.2",) 54 | TEST: ("ava_video_val_v2.2",) 55 | DATALOADER: 56 | NUM_WORKERS: 4 57 | SIZE_DIVISIBILITY: 16 58 | SOLVER: 59 | BASE_LR: 0.0004 60 | WARMUP_FACTOR: 0.25 61 | BIAS_LR_FACTOR: 2 62 | IA_LR_FACTOR: 10.0 63 | WEIGHT_DECAY: 1e-7 64 | STEPS: (70000, 90000) 65 | WARMUP_ITERS: 2000 66 | MAX_ITER: 110000 67 | CHECKPOINT_PERIOD: 10000 68 | EVAL_PERIOD: 
10000 69 | VIDEOS_PER_BATCH: 16 70 | TEST: 71 | BOX_THRESH: 0.8 72 | ACTION_THRESH: 0. 73 | VIDEOS_PER_BATCH: 16 74 | OUTPUT_DIR: "data/output/resnet50_4x16f_denseserial" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_parallel.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | STRUCTURE: "parallel" 34 | DROPOUT: 0.2 35 | FUSION: "add" 36 | TEMPORAL_POSITION: True 37 | USE_ZERO_INIT_CONV: True 38 | LAYER_NORM: True 39 | MAX_PERSON: 25 40 | MAX_OBJECT: 5 41 | MAX_PER_SEC: 5 42 | DIM_INNER: 1024 43 | DIM_OUT: 2304 44 | I_BLOCK_LIST: [ [ "P", "O", "M" ], [ "P", "O", "M" ] ] 45 | INPUT: 46 | FRAME_NUM: 64 47 | FRAME_SAMPLE_RATE: 1 48 | TAU: 16 49 | ALPHA: 8 50 | SLOW_JITTER: True 51 | COLOR_JITTER: True 52 | DATASETS: 53 | TRAIN: ("ava_video_train_v2.2",) 54 | TEST: ("ava_video_val_v2.2",) 55 | DATALOADER: 56 | NUM_WORKERS: 4 57 | SIZE_DIVISIBILITY: 16 58 | SOLVER: 59 | BASE_LR: 0.0004 60 | WARMUP_FACTOR: 0.25 61 | BIAS_LR_FACTOR: 2 62 | IA_LR_FACTOR: 10.0 63 | WEIGHT_DECAY: 1e-7 64 | STEPS: (70000, 90000) 65 | WARMUP_ITERS: 2000 66 | MAX_ITER: 110000 67 | CHECKPOINT_PERIOD: 10000 68 | EVAL_PERIOD: 10000 69 | VIDEOS_PER_BATCH: 16 70 | TEST: 71 | BOX_THRESH: 0.8 72 | ACTION_THRESH: 0. 
73 | VIDEOS_PER_BATCH: 16 74 | OUTPUT_DIR: "data/output/resnet50_4x16f_parallel" -------------------------------------------------------------------------------- /config_files/resnet50_4x16f_serial.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHT: "data/models/pretrained_models/SlowFast-ResNet50-4x16.pth" 3 | BACKBONE: 4 | CONV_BODY: "Slowfast-Resnet50" 5 | FROZEN_BN: True 6 | SLOWFAST: 7 | BETA: 0.125 8 | LATERAL: "tconv" 9 | SLOW: 10 | ACTIVE: True 11 | CONV3_NONLOCAL: False 12 | CONV4_NONLOCAL: False 13 | FAST: 14 | ACTIVE: True 15 | CONV3_NONLOCAL: False 16 | CONV4_NONLOCAL: False 17 | NONLOCAL: 18 | USE_ZERO_INIT_CONV: False 19 | BN_INIT_GAMMA: 0.0 20 | FROZEN_BN: True 21 | ROI_ACTION_HEAD: 22 | FEATURE_EXTRACTOR: "2MLPFeatureExtractor" 23 | POOLER_TYPE: "align3d" 24 | MEAN_BEFORE_POOLER: True 25 | POOLER_RESOLUTION: 7 26 | POOLER_SCALE: 0.0625 27 | POOLER_SAMPLING_RATIO: 0 28 | NUM_CLASSES: 80 29 | PROPOSAL_PER_CLIP: 10 30 | DROPOUT_RATE: 0.2 31 | IA_STRUCTURE: 32 | ACTIVE: True 33 | DROPOUT: 0.2 34 | FUSION: "add" 35 | TEMPORAL_POSITION: True 36 | USE_ZERO_INIT_CONV: True 37 | LAYER_NORM: True 38 | MAX_PERSON: 25 39 | MAX_OBJECT: 5 40 | MAX_PER_SEC: 5 41 | DIM_OUT: 2304 42 | DIM_INNER: 1024 43 | I_BLOCK_LIST: [ "P", "O", "M", "P", "O", "M" ] 44 | INPUT: 45 | FRAME_NUM: 64 46 | FRAME_SAMPLE_RATE: 1 47 | TAU: 16 48 | ALPHA: 8 49 | SLOW_JITTER: True 50 | COLOR_JITTER: True 51 | DATASETS: 52 | TRAIN: ("ava_video_train_v2.2",) 53 | TEST: ("ava_video_val_v2.2",) 54 | DATALOADER: 55 | NUM_WORKERS: 4 56 | SIZE_DIVISIBILITY: 16 57 | SOLVER: 58 | BASE_LR: 0.0004 59 | WARMUP_FACTOR: 0.25 60 | BIAS_LR_FACTOR: 2 61 | IA_LR_FACTOR: 10.0 62 | WEIGHT_DECAY: 1e-7 63 | STEPS: (70000, 90000) 64 | WARMUP_ITERS: 2000 65 | MAX_ITER: 110000 66 | CHECKPOINT_PERIOD: 10000 67 | EVAL_PERIOD: 10000 68 | VIDEOS_PER_BATCH: 16 69 | TEST: 70 | BOX_THRESH: 0.8 71 | ACTION_THRESH: 0. 72 | VIDEOS_PER_BATCH: 16 73 | OUTPUT_DIR: "data/output/resnet50_4x16f_serial" -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Demo 2 | 3 | ### Installation 4 | 5 | To run this demo, make sure that you install all requirements following [INSTALL.md](../INSTALL.md). 6 | 7 | ### Preparation 8 | 9 | 1. Download the object detection model manually: **yolov3-spp.weights** ([Google Drive](https://drive.google.com/file/d/1260DRQM5XtSF7W213AWxk6RX2zfa3Zq6/view?usp=sharing)). Place it into `data/models/detector_models`. 10 | 2. Download the person tracking model manually: **jde.uncertainty.pt** ([Google Drive](https://drive.google.com/file/d/1nuCX5bR-1-HGZ0_WoH4xZzPiV_jgBphC/view?usp=sharing)). Place it into `data/models/detector_models`. 11 | 3. Please download our action models. Place them into ```data/models/aia_models```. All models are available in [MODEL_ZOO.md](../MODEL_ZOO.md). 12 | 4. We also provide a practical model ([Google Drive](https://drive.google.com/file/d/1gi6oKLj3wBGCOwwIiI9L4mS8pznFj7L1/view?usp=sharing)) trained on 15 common action categories in AVA. This 13 | model achieves about 70 mAP on these categories. Please use [`resnet101_8x8f_denseserial.yaml`](../config_files/resnet101_8x8f_denseserial.yaml) 14 | and eable `--common-cate` to apply this model. 15 | 16 | ### Usage 17 | 18 | 1. 
Video Input 19 | 20 | ``` 21 | cd demo 22 | python demo.py --video-path path/to/your/video --output-path path/to/the/output \ 23 | --cfg-path path/to/cfg/file --weight-path path/to/the/weight [--common-cate] 24 | ``` 25 | 26 | 2. Webcam Input 27 | 28 | ``` 29 | cd demo 30 | python demo.py --webcam --output-path path/to/the/output \ 31 | --cfg-path path/to/cfg/file --weight-path path/to/the/weight [--common-cate] 32 | ``` 33 | -------------------------------------------------------------------------------- /demo/Roboto-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/demo/Roboto-Bold.ttf -------------------------------------------------------------------------------- /detector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/__init__.py -------------------------------------------------------------------------------- /detector/apis.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------- 2 | # Copyright (c) Shanghai Jiao Tong University. All rights reserved. 3 | # Written by Chao Xu (xuchao.19962007@sjtu.edu.cn) 4 | # ----------------------------------------------------- 5 | 6 | """API of detector""" 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | def get_detector(opt=None): 11 | if opt.detector == 'yolo': 12 | from detector.yolo_api import YOLODetector 13 | from detector.yolo_cfg import cfg 14 | return YOLODetector(cfg, opt) 15 | elif opt.detector == 'tracker': 16 | from detector.tracker_api import Tracker 17 | from detector.tracker_cfg import cfg 18 | return Tracker(cfg, opt) 19 | else: 20 | raise NotImplementedError 21 | 22 | 23 | class BaseDetector(ABC): 24 | def __init__(self): 25 | pass 26 | 27 | @abstractmethod 28 | def image_preprocess(self, img_name): 29 | pass 30 | 31 | @abstractmethod 32 | def images_detection(self, imgs, orig_dim_list): 33 | pass 34 | 35 | @abstractmethod 36 | def detect_one_img(self, img_name): 37 | pass 38 | -------------------------------------------------------------------------------- /detector/nms/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms_wrapper import nms, soft_nms 2 | 3 | __all__ = ['nms', 'soft_nms'] 4 | -------------------------------------------------------------------------------- /detector/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from . import nms_cpu, nms_cuda 5 | from .soft_nms_cpu import soft_nms_cpu 6 | 7 | 8 | def nms(dets, iou_thr, device_id=None): 9 | """Dispatch to either CPU or GPU NMS implementations. 10 | 11 | The input can be either a torch tensor or numpy array. GPU NMS will be used 12 | if the input is a gpu tensor or device_id is specified, otherwise CPU NMS 13 | will be used. The returned type will always be the same as inputs. 14 | 15 | Arguments: 16 | dets (torch.Tensor or np.ndarray): bboxes with scores. 17 | iou_thr (float): IoU threshold for NMS. 18 | device_id (int, optional): when `dets` is a numpy array, if `device_id` 19 | is None, then cpu nms is used, otherwise gpu_nms will be used. 
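    Example (shapes and values are illustrative only):
        >>> import numpy as np
        >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9],
        ...                  [49.3, 32.9, 51.0, 35.3, 0.6]], dtype=np.float32)
        >>> kept, inds = nms(dets, iou_thr=0.7)
        >>> # `kept` is dets[inds, :]; both outputs stay numpy arrays for numpy input.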
20 | 21 | Returns: 22 | tuple: kept bboxes and indice, which is always the same data type as 23 | the input. 24 | """ 25 | # convert dets (tensor or numpy array) to tensor 26 | if isinstance(dets, torch.Tensor): 27 | is_numpy = False 28 | dets_th = dets.to('cpu') 29 | elif isinstance(dets, np.ndarray): 30 | is_numpy = True 31 | device = 'cpu' if device_id is None else 'cuda:{}'.format(device_id) 32 | dets_th = torch.from_numpy(dets).to(device) 33 | else: 34 | raise TypeError( 35 | 'dets must be either a Tensor or numpy array, but got {}'.format( 36 | type(dets))) 37 | 38 | # execute cpu or cuda nms 39 | if dets_th.shape[0] == 0: 40 | inds = dets_th.new_zeros(0, dtype=torch.long) 41 | else: 42 | if dets_th.is_cuda: 43 | inds = nms_cuda.nms(dets_th, iou_thr) 44 | else: 45 | inds = nms_cpu.nms(dets_th, iou_thr) 46 | 47 | if is_numpy: 48 | inds = inds.cpu().numpy() 49 | return dets[inds, :], inds 50 | 51 | 52 | def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3): 53 | if isinstance(dets, torch.Tensor): 54 | is_tensor = True 55 | dets_np = dets.detach().cpu().numpy() 56 | elif isinstance(dets, np.ndarray): 57 | is_tensor = False 58 | dets_np = dets 59 | else: 60 | raise TypeError( 61 | 'dets must be either a Tensor or numpy array, but got {}'.format( 62 | type(dets))) 63 | 64 | method_codes = {'linear': 1, 'gaussian': 2} 65 | if method not in method_codes: 66 | raise ValueError('Invalid method for SoftNMS: {}'.format(method)) 67 | new_dets, inds = soft_nms_cpu( 68 | dets_np, 69 | iou_thr, 70 | method=method_codes[method], 71 | sigma=sigma, 72 | min_score=min_score) 73 | 74 | if is_tensor: 75 | return dets.new_tensor(new_dets), dets.new_tensor( 76 | inds, dtype=torch.long) 77 | else: 78 | return new_dets.astype(np.float32), inds.astype(np.int64) 79 | -------------------------------------------------------------------------------- /detector/nms/src/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
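// Greedy CPU NMS: boxes are visited in descending score order, and every later box
// whose IoU with the current kept box reaches `threshold` is flagged in `suppressed_t`.
// Areas and overlaps use the legacy integer-pixel convention (x2 - x1 + 1).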
2 | #include 3 | 4 | template 5 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) { 6 | AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor"); 7 | 8 | if (dets.numel() == 0) { 9 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 10 | } 11 | 12 | auto x1_t = dets.select(1, 0).contiguous(); 13 | auto y1_t = dets.select(1, 1).contiguous(); 14 | auto x2_t = dets.select(1, 2).contiguous(); 15 | auto y2_t = dets.select(1, 3).contiguous(); 16 | auto scores = dets.select(1, 4).contiguous(); 17 | 18 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 19 | 20 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 21 | 22 | auto ndets = dets.size(0); 23 | at::Tensor suppressed_t = 24 | at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 25 | 26 | auto suppressed = suppressed_t.data_ptr(); 27 | auto order = order_t.data_ptr(); 28 | auto x1 = x1_t.data_ptr(); 29 | auto y1 = y1_t.data_ptr(); 30 | auto x2 = x2_t.data_ptr(); 31 | auto y2 = y2_t.data_ptr(); 32 | auto areas = areas_t.data_ptr(); 33 | 34 | for (int64_t _i = 0; _i < ndets; _i++) { 35 | auto i = order[_i]; 36 | if (suppressed[i] == 1) continue; 37 | auto ix1 = x1[i]; 38 | auto iy1 = y1[i]; 39 | auto ix2 = x2[i]; 40 | auto iy2 = y2[i]; 41 | auto iarea = areas[i]; 42 | 43 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 44 | auto j = order[_j]; 45 | if (suppressed[j] == 1) continue; 46 | auto xx1 = std::max(ix1, x1[j]); 47 | auto yy1 = std::max(iy1, y1[j]); 48 | auto xx2 = std::min(ix2, x2[j]); 49 | auto yy2 = std::min(iy2, y2[j]); 50 | 51 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 52 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 53 | auto inter = w * h; 54 | auto ovr = inter / (iarea + areas[j] - inter); 55 | if (ovr >= threshold) suppressed[j] = 1; 56 | } 57 | } 58 | return at::nonzero(suppressed_t == 0).squeeze(1); 59 | } 60 | 61 | at::Tensor nms(const at::Tensor& dets, const float threshold) { 62 | at::Tensor result; 63 | AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { 64 | result = nms_cpu_kernel(dets, threshold); 65 | }); 66 | return result; 67 | } 68 | 69 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 70 | m.def("nms", &nms, "non-maximum suppression"); 71 | } -------------------------------------------------------------------------------- /detector/nms/src/nms_cuda.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ") 5 | 6 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); 7 | 8 | at::Tensor nms(const at::Tensor& dets, const float threshold) { 9 | CHECK_CUDA(dets); 10 | if (dets.numel() == 0) 11 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 12 | return nms_cuda(dets, threshold); 13 | } 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def("nms", &nms, "non-maximum suppression"); 17 | } -------------------------------------------------------------------------------- /detector/nms/src/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
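// GPU NMS: boxes are first sorted by score, then tiled into groups of
// threadsPerBlock = 64 (one bit per box in an unsigned long long). Each CUDA block
// compares a row tile against a column tile and writes a 64-bit suppression mask;
// the host walks these masks once to assemble the final keep list, so the result
// matches the greedy CPU implementation above.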
2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 12 | 13 | __device__ inline float devIoU(float const * const a, float const * const b) { 14 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 15 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 16 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 17 | float interS = width * height; 18 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 19 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 20 | return interS / (Sa + Sb - interS); 21 | } 22 | 23 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 24 | const float *dev_boxes, unsigned long long *dev_mask) { 25 | const int row_start = blockIdx.y; 26 | const int col_start = blockIdx.x; 27 | 28 | // if (row_start > col_start) return; 29 | 30 | const int row_size = 31 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 32 | const int col_size = 33 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 34 | 35 | __shared__ float block_boxes[threadsPerBlock * 5]; 36 | if (threadIdx.x < col_size) { 37 | block_boxes[threadIdx.x * 5 + 0] = 38 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 39 | block_boxes[threadIdx.x * 5 + 1] = 40 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 41 | block_boxes[threadIdx.x * 5 + 2] = 42 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 43 | block_boxes[threadIdx.x * 5 + 3] = 44 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 45 | block_boxes[threadIdx.x * 5 + 4] = 46 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 47 | } 48 | __syncthreads(); 49 | 50 | if (threadIdx.x < row_size) { 51 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 52 | const float *cur_box = dev_boxes + cur_box_idx * 5; 53 | int i = 0; 54 | unsigned long long t = 0; 55 | int start = 0; 56 | if (row_start == col_start) { 57 | start = threadIdx.x + 1; 58 | } 59 | for (i = start; i < col_size; i++) { 60 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 61 | t |= 1ULL << i; 62 | } 63 | } 64 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 65 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 66 | } 67 | } 68 | 69 | // boxes is a N x 5 tensor 70 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { 71 | using scalar_t = float; 72 | AT_ASSERTM(boxes.is_cuda(), "boxes must be a CUDA tensor"); 73 | auto scores = boxes.select(1, 4); 74 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 75 | auto boxes_sorted = boxes.index_select(0, order_t); 76 | 77 | int boxes_num = boxes.size(0); 78 | 79 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 80 | 81 | scalar_t* boxes_dev = boxes_sorted.data_ptr(); 82 | 83 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 84 | 85 | unsigned long long* mask_dev = NULL; 86 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 87 | // boxes_num * col_blocks * sizeof(unsigned long long))); 88 | 89 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 90 | 91 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 92 | THCCeilDiv(boxes_num, threadsPerBlock)); 93 | dim3 threads(threadsPerBlock); 94 | nms_kernel<<>>(boxes_num, 95 | nms_overlap_thresh, 96 | boxes_dev, 97 | mask_dev); 98 | 99 | std::vector 
mask_host(boxes_num * col_blocks); 100 | THCudaCheck(cudaMemcpy(&mask_host[0], 101 | mask_dev, 102 | sizeof(unsigned long long) * boxes_num * col_blocks, 103 | cudaMemcpyDeviceToHost)); 104 | 105 | std::vector remv(col_blocks); 106 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 107 | 108 | at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); 109 | int64_t* keep_out = keep.data_ptr(); 110 | 111 | int num_to_keep = 0; 112 | for (int i = 0; i < boxes_num; i++) { 113 | int nblock = i / threadsPerBlock; 114 | int inblock = i % threadsPerBlock; 115 | 116 | if (!(remv[nblock] & (1ULL << inblock))) { 117 | keep_out[num_to_keep++] = i; 118 | unsigned long long *p = &mask_host[0] + i * col_blocks; 119 | for (int j = nblock; j < col_blocks; j++) { 120 | remv[j] |= p[j]; 121 | } 122 | } 123 | } 124 | 125 | THCudaFree(state, mask_dev); 126 | // TODO improve this part 127 | return std::get<0>(order_t.index({ 128 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 129 | order_t.device(), keep.scalar_type()) 130 | }).sort(0, false)); 131 | } -------------------------------------------------------------------------------- /detector/nms/src/soft_nms_cpu.pyx: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------- 2 | # Soft-NMS: Improving Object Detection With One Line of Code 3 | # Copyright (c) University of Maryland, College Park 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Navaneeth Bodla and Bharat Singh 6 | # Modified by Kai Chen 7 | # ---------------------------------------------------------- 8 | 9 | # cython: language_level=3, boundscheck=False 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | 15 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 16 | return a if a >= b else b 17 | 18 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 19 | return a if a <= b else b 20 | 21 | 22 | def soft_nms_cpu( 23 | np.ndarray[float, ndim=2] boxes_in, 24 | float iou_thr, 25 | unsigned int method=1, 26 | float sigma=0.5, 27 | float min_score=0.001, 28 | ): 29 | boxes = boxes_in.copy() 30 | cdef int N = boxes.shape[0] 31 | cdef float iw, ih, box_area 32 | cdef float ua 33 | cdef int pos = 0 34 | cdef float maxscore = 0 35 | cdef int maxpos = 0 36 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 37 | inds = np.arange(N) 38 | 39 | for i in range(N): 40 | maxscore = boxes[i, 4] 41 | maxpos = i 42 | 43 | tx1 = boxes[i, 0] 44 | ty1 = boxes[i, 1] 45 | tx2 = boxes[i, 2] 46 | ty2 = boxes[i, 3] 47 | ts = boxes[i, 4] 48 | ti = inds[i] 49 | 50 | pos = i + 1 51 | # get max box 52 | while pos < N: 53 | if maxscore < boxes[pos, 4]: 54 | maxscore = boxes[pos, 4] 55 | maxpos = pos 56 | pos = pos + 1 57 | 58 | # add max box as a detection 59 | boxes[i, 0] = boxes[maxpos, 0] 60 | boxes[i, 1] = boxes[maxpos, 1] 61 | boxes[i, 2] = boxes[maxpos, 2] 62 | boxes[i, 3] = boxes[maxpos, 3] 63 | boxes[i, 4] = boxes[maxpos, 4] 64 | inds[i] = inds[maxpos] 65 | 66 | # swap ith box with position of max box 67 | boxes[maxpos, 0] = tx1 68 | boxes[maxpos, 1] = ty1 69 | boxes[maxpos, 2] = tx2 70 | boxes[maxpos, 3] = ty2 71 | boxes[maxpos, 4] = ts 72 | inds[maxpos] = ti 73 | 74 | tx1 = boxes[i, 0] 75 | ty1 = boxes[i, 1] 76 | tx2 = boxes[i, 2] 77 | ty2 = boxes[i, 3] 78 | ts = boxes[i, 4] 79 | 80 | pos = i + 1 81 | # NMS iterations, note that N changes if detection boxes fall below 82 | # threshold 83 
| while pos < N: 84 | x1 = boxes[pos, 0] 85 | y1 = boxes[pos, 1] 86 | x2 = boxes[pos, 2] 87 | y2 = boxes[pos, 3] 88 | s = boxes[pos, 4] 89 | 90 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 91 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 92 | if iw > 0: 93 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 94 | if ih > 0: 95 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 96 | ov = iw * ih / ua # iou between max box and detection box 97 | 98 | if method == 1: # linear 99 | if ov > iou_thr: 100 | weight = 1 - ov 101 | else: 102 | weight = 1 103 | elif method == 2: # gaussian 104 | weight = np.exp(-(ov * ov) / sigma) 105 | else: # original NMS 106 | if ov > iou_thr: 107 | weight = 0 108 | else: 109 | weight = 1 110 | 111 | boxes[pos, 4] = weight * boxes[pos, 4] 112 | 113 | # if box score falls below threshold, discard the box by 114 | # swapping with last box update N 115 | if boxes[pos, 4] < min_score: 116 | boxes[pos, 0] = boxes[N-1, 0] 117 | boxes[pos, 1] = boxes[N-1, 1] 118 | boxes[pos, 2] = boxes[N-1, 2] 119 | boxes[pos, 3] = boxes[N-1, 3] 120 | boxes[pos, 4] = boxes[N-1, 4] 121 | inds[pos] = inds[N - 1] 122 | N = N - 1 123 | pos = pos - 1 124 | 125 | pos = pos + 1 126 | 127 | return boxes[:N], inds[:N] 128 | -------------------------------------------------------------------------------- /detector/tracker/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | MOT Tracker adapted from [Towards-Realtime-MOT](https://github.com/Zhongdao/Towards-Realtime-MOT), many thanks to their wonderful work! 3 | -------------------------------------------------------------------------------- /detector/tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/tracker/__init__.py -------------------------------------------------------------------------------- /detector/tracker/cfg/ccmcpe.json: -------------------------------------------------------------------------------- 1 | { 2 | "root":"/home/wangzd/datasets/MOT", 3 | "train": 4 | { 5 | "mot17":"./data/mot17.train", 6 | "caltech":"./data/caltech.train", 7 | "citypersons":"./data/citypersons.train", 8 | "cuhksysu":"./data/cuhksysu.train", 9 | "prw":"./data/prw.train", 10 | "eth":"./data/eth.train" 11 | }, 12 | "test_emb": 13 | { 14 | "caltech":"./data/caltech.10k.val", 15 | "cuhksysu":"./data/cuhksysu.val", 16 | "prw":"./data/prw.val" 17 | }, 18 | "test": 19 | { 20 | "mot19":"./data/mot19.train", 21 | "caltech":"./data/caltech.val", 22 | "citypersons":"./data/citypersons.val" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /detector/tracker/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import cv2 5 | 6 | try: 7 | from util import count_parameters as count 8 | from util import convert2cpu as cpu 9 | except ImportError: 10 | from yolo.util import count_parameters as count 11 | from yolo.util import convert2cpu as cpu 12 | from PIL import Image, ImageDraw 13 | 14 | 15 | def letterbox_image(img, img_size=(1088, 608), color=(127.5, 127.5, 127.5)): 16 | # resize a rectangular image to a padded rectangular 17 | height=img_size[1] 18 | width=img_size[0] 19 | shape = img.shape[:2] # shape = [height, width] 20 | ratio = min(float(height)/shape[0], float(width)/shape[1]) 21 | new_shape = 
(round(shape[1] * ratio), round(shape[0] * ratio)) # new_shape = [width, height] 22 | dw = (width - new_shape[0]) / 2 # width padding 23 | dh = (height - new_shape[1]) / 2 # height padding 24 | top, bottom = round(dh - 0.1), round(dh + 0.1) 25 | left, right = round(dw - 0.1), round(dw + 0.1) 26 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 27 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular 28 | return img 29 | 30 | 31 | def prep_image(img, img_size=(1088, 608)): 32 | """ 33 | Prepare image for inputting to the neural network. 34 | 35 | Returns a Variable 36 | """ 37 | 38 | orig_im = cv2.imread(img) 39 | dim = orig_im.shape[1], orig_im.shape[0] 40 | img = (letterbox_image(orig_im, img_size)) 41 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 43 | return img_, orig_im, dim 44 | 45 | 46 | def prep_frame(img, img_size=(1088, 608)): 47 | """ 48 | Prepare image for inputting to the neural network. 49 | 50 | Returns a Variable 51 | """ 52 | 53 | orig_im = img 54 | dim = orig_im.shape[1], orig_im.shape[0] 55 | img = (letterbox_image(orig_im, img_size)) 56 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 57 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 58 | return img_, orig_im, dim 59 | 60 | -------------------------------------------------------------------------------- /detector/tracker/tracker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/tracker/tracker/__init__.py -------------------------------------------------------------------------------- /detector/tracker/tracker/basetrack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | 4 | 5 | class TrackState(object): 6 | New = 0 7 | Tracked = 1 8 | Lost = 2 9 | Removed = 3 10 | 11 | 12 | class BaseTrack(object): 13 | _count = 0 14 | 15 | track_id = 0 16 | is_activated = False 17 | state = TrackState.New 18 | 19 | history = OrderedDict() 20 | features = [] 21 | curr_feature = None 22 | score = 0 23 | start_frame = 0 24 | frame_id = 0 25 | time_since_update = 0 26 | 27 | # multi-camera 28 | location = (np.inf, np.inf) 29 | 30 | @property 31 | def end_frame(self): 32 | return self.frame_id 33 | 34 | @staticmethod 35 | def next_id(): 36 | BaseTrack._count += 1 37 | return BaseTrack._count 38 | 39 | def activate(self, *args): 40 | raise NotImplementedError 41 | 42 | def predict(self): 43 | raise NotImplementedError 44 | 45 | def update(self, *args, **kwargs): 46 | raise NotImplementedError 47 | 48 | def mark_lost(self): 49 | self.state = TrackState.Lost 50 | 51 | def mark_removed(self): 52 | self.state = TrackState.Removed 53 | 54 | -------------------------------------------------------------------------------- /detector/tracker/tracker/matching.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import scipy 4 | from scipy.spatial.distance import cdist 5 | from scipy.optimize import linear_sum_assignment 6 | 7 | from cython_bbox import bbox_overlaps as bbox_ious 8 | from tracker.utils import kalman_filter 9 | import time 10 | 11 | def merge_matches(m1, m2, shape): 12 | O,P,Q = shape 13 | m1 = np.asarray(m1) 14 | m2 = np.asarray(m2) 
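    # m1 holds (i, j) pairs of an O x P matching and m2 holds (j, k) pairs of a P x Q
    # matching; the sparse product M1 * M2 below is nonzero exactly where a chained
    # match i -> j -> k exists, which is how the two partial matchings are merged.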
15 | 16 | M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) 17 | M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) 18 | 19 | mask = M1*M2 20 | match = mask.nonzero() 21 | match = list(zip(match[0], match[1])) 22 | unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) 23 | unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) 24 | 25 | return match, unmatched_O, unmatched_Q 26 | 27 | 28 | def _indices_to_matches(cost_matrix, indices, thresh): 29 | matched_cost = cost_matrix[tuple(zip(*indices))] 30 | matched_mask = (matched_cost <= thresh) 31 | 32 | matches = indices[matched_mask] 33 | unmatched_a = tuple(set(range(cost_matrix.shape[0])) - set(matches[:, 0])) 34 | unmatched_b = tuple(set(range(cost_matrix.shape[1])) - set(matches[:, 1])) 35 | 36 | return matches, unmatched_a, unmatched_b 37 | 38 | 39 | def linear_assignment(cost_matrix, thresh): 40 | """ 41 | Simple linear assignment 42 | :type cost_matrix: np.ndarray 43 | :type thresh: float 44 | :return: matches, unmatched_a, unmatched_b 45 | """ 46 | if cost_matrix.size == 0: 47 | return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) 48 | 49 | cost_matrix[cost_matrix > thresh] = thresh + 1e-4 50 | row_ind, col_ind = linear_sum_assignment(cost_matrix) 51 | indices = np.column_stack((row_ind, col_ind)) 52 | 53 | return _indices_to_matches(cost_matrix, indices, thresh) 54 | 55 | 56 | def ious(atlbrs, btlbrs): 57 | """ 58 | Compute cost based on IoU 59 | :type atlbrs: list[tlbr] | np.ndarray 60 | :type atlbrs: list[tlbr] | np.ndarray 61 | 62 | :rtype ious np.ndarray 63 | """ 64 | ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) 65 | if ious.size == 0: 66 | return ious 67 | 68 | ious = bbox_ious( 69 | np.ascontiguousarray(atlbrs, dtype=np.float), 70 | np.ascontiguousarray(btlbrs, dtype=np.float) 71 | ) 72 | 73 | return ious 74 | 75 | 76 | def iou_distance(atracks, btracks): 77 | """ 78 | Compute cost based on IoU 79 | :type atracks: list[STrack] 80 | :type btracks: list[STrack] 81 | 82 | :rtype cost_matrix np.ndarray 83 | """ 84 | 85 | if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): 86 | atlbrs = atracks 87 | btlbrs = btracks 88 | else: 89 | atlbrs = [track.tlbr for track in atracks] 90 | btlbrs = [track.tlbr for track in btracks] 91 | _ious = ious(atlbrs, btlbrs) 92 | cost_matrix = 1 - _ious 93 | 94 | return cost_matrix 95 | 96 | def embedding_distance(tracks, detections, metric='cosine'): 97 | """ 98 | :param tracks: list[STrack] 99 | :param detections: list[BaseTrack] 100 | :param metric: 101 | :return: cost_matrix np.ndarray 102 | """ 103 | 104 | cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) 105 | if cost_matrix.size == 0: 106 | return cost_matrix 107 | det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) 108 | for i, track in enumerate(tracks): 109 | cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1,-1), det_features, metric)) 110 | return cost_matrix 111 | 112 | 113 | def gate_cost_matrix(kf, cost_matrix, tracks, detections, only_position=False): 114 | if cost_matrix.size == 0: 115 | return cost_matrix 116 | gating_dim = 2 if only_position else 4 117 | gating_threshold = kalman_filter.chi2inv95[gating_dim] 118 | measurements = np.asarray([det.to_xyah() for det in detections]) 119 | for row, track in enumerate(tracks): 120 | 
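        # Squared Mahalanobis distance from the track's Kalman-filter state to each detection;
        # candidates beyond the chi-square 95% gate (kalman_filter.chi2inv95) get an infinite
        # assignment cost so the linear assignment can never pick them.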
gating_distance = kf.gating_distance( 121 | track.mean, track.covariance, measurements, only_position) 122 | cost_matrix[row, gating_distance > gating_threshold] = np.inf 123 | return cost_matrix 124 | -------------------------------------------------------------------------------- /detector/tracker/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/tracker/utils/__init__.py -------------------------------------------------------------------------------- /detector/tracker/utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import copy 4 | import motmetrics as mm 5 | 6 | from utils.io import read_results, unzip_objs 7 | 8 | 9 | class Evaluator(object): 10 | 11 | def __init__(self, data_root, seq_name, data_type): 12 | self.data_root = data_root 13 | self.seq_name = seq_name 14 | self.data_type = data_type 15 | 16 | self.load_annotations() 17 | self.reset_accumulator() 18 | 19 | def load_annotations(self): 20 | assert self.data_type == 'mot' 21 | 22 | gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') 23 | self.gt_frame_dict = read_results(gt_filename, self.data_type, is_gt=True) 24 | self.gt_ignore_frame_dict = read_results(gt_filename, self.data_type, is_ignore=True) 25 | 26 | def reset_accumulator(self): 27 | self.acc = mm.MOTAccumulator(auto_id=True) 28 | 29 | def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): 30 | # results 31 | trk_tlwhs = np.copy(trk_tlwhs) 32 | trk_ids = np.copy(trk_ids) 33 | 34 | # gts 35 | gt_objs = self.gt_frame_dict.get(frame_id, []) 36 | gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] 37 | 38 | # ignore boxes 39 | ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) 40 | ignore_tlwhs = unzip_objs(ignore_objs)[0] 41 | 42 | # remove ignored results 43 | keep = np.ones(len(trk_tlwhs), dtype=bool) 44 | iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) 45 | match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) 46 | match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) 47 | match_ious = iou_distance[match_is, match_js] 48 | 49 | match_js = np.asarray(match_js, dtype=int) 50 | match_js = match_js[np.logical_not(np.isnan(match_ious))] 51 | keep[match_js] = False 52 | trk_tlwhs = trk_tlwhs[keep] 53 | trk_ids = trk_ids[keep] 54 | 55 | # get distance matrix 56 | iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) 57 | 58 | # acc 59 | self.acc.update(gt_ids, trk_ids, iou_distance) 60 | 61 | if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): 62 | events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics 63 | else: 64 | events = None 65 | return events 66 | 67 | def eval_file(self, filename): 68 | self.reset_accumulator() 69 | 70 | result_frame_dict = read_results(filename, self.data_type, is_gt=False) 71 | frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys()))) 72 | for frame_id in frames: 73 | trk_objs = result_frame_dict.get(frame_id, []) 74 | trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] 75 | self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) 76 | 77 | return self.acc 78 | 79 | @staticmethod 80 | def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 
'recall')): 81 | names = copy.deepcopy(names) 82 | if metrics is None: 83 | metrics = mm.metrics.motchallenge_metrics 84 | metrics = copy.deepcopy(metrics) 85 | 86 | mh = mm.metrics.create() 87 | summary = mh.compute_many( 88 | accs, 89 | metrics=metrics, 90 | names=names, 91 | generate_overall=True 92 | ) 93 | 94 | return summary 95 | 96 | @staticmethod 97 | def save_summary(summary, filename): 98 | import pandas as pd 99 | writer = pd.ExcelWriter(filename) 100 | summary.to_excel(writer) 101 | writer.save() 102 | -------------------------------------------------------------------------------- /detector/tracker/utils/io.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | import numpy as np 4 | 5 | from utils.log import logger 6 | 7 | 8 | def write_results(filename, results_dict: Dict, data_type: str): 9 | if not filename: 10 | return 11 | path = os.path.dirname(filename) 12 | if not os.path.exists(path): 13 | os.makedirs(path) 14 | 15 | if data_type in ('mot', 'mcmot', 'lab'): 16 | save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' 17 | elif data_type == 'kitti': 18 | save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' 19 | else: 20 | raise ValueError(data_type) 21 | 22 | with open(filename, 'w') as f: 23 | for frame_id, frame_data in results_dict.items(): 24 | if data_type == 'kitti': 25 | frame_id -= 1 26 | for tlwh, track_id in frame_data: 27 | if track_id < 0: 28 | continue 29 | x1, y1, w, h = tlwh 30 | x2, y2 = x1 + w, y1 + h 31 | line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) 32 | f.write(line) 33 | logger.info('Save results to {}'.format(filename)) 34 | 35 | 36 | def read_results(filename, data_type: str, is_gt=False, is_ignore=False): 37 | if data_type in ('mot', 'lab'): 38 | read_fun = read_mot_results 39 | else: 40 | raise ValueError('Unknown data type: {}'.format(data_type)) 41 | 42 | return read_fun(filename, is_gt, is_ignore) 43 | 44 | 45 | """ 46 | labels={'ped', ... % 1 47 | 'person_on_vhcl', ... % 2 48 | 'car', ... % 3 49 | 'bicycle', ... % 4 50 | 'mbike', ... % 5 51 | 'non_mot_vhcl', ... % 6 52 | 'static_person', ... % 7 53 | 'distractor', ... % 8 54 | 'occluder', ... % 9 55 | 'occluder_on_grnd', ... %10 56 | 'occluder_full', ... % 11 57 | 'reflection', ... % 12 58 | 'crowd' ... 
% 13 59 | }; 60 | """ 61 | 62 | 63 | def read_mot_results(filename, is_gt, is_ignore): 64 | valid_labels = {1} 65 | ignore_labels = {2, 7, 8, 12} 66 | results_dict = dict() 67 | if os.path.isfile(filename): 68 | with open(filename, 'r') as f: 69 | for line in f.readlines(): 70 | linelist = line.split(',') 71 | if len(linelist) < 7: 72 | continue 73 | fid = int(linelist[0]) 74 | if fid < 1: 75 | continue 76 | results_dict.setdefault(fid, list()) 77 | 78 | if is_gt: 79 | if 'MOT16-' in filename or 'MOT17-' in filename: 80 | label = int(float(linelist[7])) 81 | mark = int(float(linelist[6])) 82 | if mark == 0 or label not in valid_labels: 83 | continue 84 | score = 1 85 | elif is_ignore: 86 | if 'MOT16-' in filename or 'MOT17-' in filename: 87 | label = int(float(linelist[7])) 88 | vis_ratio = float(linelist[8]) 89 | if label not in ignore_labels and vis_ratio >= 0: 90 | continue 91 | else: 92 | continue 93 | score = 1 94 | else: 95 | score = float(linelist[6]) 96 | 97 | tlwh = tuple(map(float, linelist[2:6])) 98 | target_id = int(linelist[1]) 99 | 100 | results_dict[fid].append((tlwh, target_id, score)) 101 | 102 | return results_dict 103 | 104 | 105 | def unzip_objs(objs): 106 | if len(objs) > 0: 107 | tlwhs, ids, scores = zip(*objs) 108 | else: 109 | tlwhs, ids, scores = [], [], [] 110 | tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) 111 | 112 | return tlwhs, ids, scores -------------------------------------------------------------------------------- /detector/tracker/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(name='root'): 5 | formatter = logging.Formatter( 6 | # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') 7 | fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') 8 | 9 | handler = logging.StreamHandler() 10 | handler.setFormatter(formatter) 11 | 12 | logger = logging.getLogger(name) 13 | logger.setLevel(logging.DEBUG) 14 | logger.addHandler(handler) 15 | return logger 16 | 17 | 18 | logger = get_logger('root') 19 | -------------------------------------------------------------------------------- /detector/tracker/utils/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
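# NOTE: `_C` below must be a compiled extension module exposing an `nms`
# function; this file only re-exports it. In this repository the NMS
# extensions are built by setup.py under `detector.nms` (soft_nms_cpu,
# nms_cpu, nms_cuda), so `from utils import _C` assumes a matching `utils._C`
# build is importable at runtime.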
2 | # from ._utils import _C 3 | from utils import _C 4 | 5 | nms = _C.nms 6 | # nms.__doc__ = """ 7 | # This function performs Non-maximum suppresion""" 8 | -------------------------------------------------------------------------------- /detector/tracker/utils/parse_config.py: -------------------------------------------------------------------------------- 1 | def parse_model_cfg(path): 2 | """Parses the yolo-v3 layer configuration file and returns module definitions""" 3 | file = open(path, 'r') 4 | lines = file.read().split('\n') 5 | lines = [x for x in lines if x and not x.startswith('#')] 6 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces 7 | module_defs = [] 8 | for line in lines: 9 | if line.startswith('['): # This marks the start of a new block 10 | module_defs.append({}) 11 | module_defs[-1]['type'] = line[1:-1].rstrip() 12 | if module_defs[-1]['type'] == 'convolutional': 13 | module_defs[-1]['batch_normalize'] = 0 14 | else: 15 | key, value = line.split("=") 16 | value = value.strip() 17 | module_defs[-1][key.rstrip()] = value.strip() 18 | 19 | return module_defs 20 | 21 | 22 | def parse_data_cfg(path): 23 | """Parses the data configuration file""" 24 | options = dict() 25 | options['gpus'] = '0' 26 | options['num_workers'] = '10' 27 | with open(path, 'r') as fp: 28 | lines = fp.readlines() 29 | for line in lines: 30 | line = line.strip() 31 | if line == '' or line.startswith('#'): 32 | continue 33 | key, value = line.split('=') 34 | options[key.strip()] = value.strip() 35 | return options 36 | -------------------------------------------------------------------------------- /detector/tracker/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | self.duration = 0. 21 | 22 | def tic(self): 23 | # using time.time instead of time.clock because time time.clock 24 | # does not normalize for multithreading 25 | self.start_time = time.time() 26 | 27 | def toc(self, average=True): 28 | self.diff = time.time() - self.start_time 29 | self.total_time += self.diff 30 | self.calls += 1 31 | self.average_time = self.total_time / self.calls 32 | if average: 33 | self.duration = self.average_time 34 | else: 35 | self.duration = self.diff 36 | return self.duration 37 | 38 | def clear(self): 39 | self.total_time = 0. 40 | self.calls = 0 41 | self.start_time = 0. 42 | self.diff = 0. 43 | self.average_time = 0. 44 | self.duration = 0. 
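    # Usage sketch (assumes a hypothetical `run_step` workload): wrap a timed
    # region with tic()/toc(). toc(average=True) returns the running average
    # over all calls so far, while toc(average=False) returns the duration of
    # the most recent tic()/toc() pair.
    #
    #     timer = Timer()
    #     for _ in range(5):
    #         timer.tic()
    #         run_step()                      # hypothetical workload
    #         last = timer.toc(average=False)
    #     print('average per call:', timer.average_time)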
45 | 46 | -------------------------------------------------------------------------------- /detector/tracker/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | def tlwhs_to_tlbrs(tlwhs): 6 | tlbrs = np.copy(tlwhs) 7 | if len(tlbrs) == 0: 8 | return tlbrs 9 | tlbrs[:, 2] += tlwhs[:, 0] 10 | tlbrs[:, 3] += tlwhs[:, 1] 11 | return tlbrs 12 | 13 | 14 | def get_color(idx): 15 | idx = idx * 3 16 | color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) 17 | 18 | return color 19 | 20 | 21 | def resize_image(image, max_size=800): 22 | if max(image.shape[:2]) > max_size: 23 | scale = float(max_size) / max(image.shape[:2]) 24 | image = cv2.resize(image, None, fx=scale, fy=scale) 25 | return image 26 | 27 | 28 | def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None): 29 | im = np.ascontiguousarray(np.copy(image)) 30 | im_h, im_w = im.shape[:2] 31 | 32 | top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 33 | 34 | text_scale = max(1, image.shape[1] / 1600.) 35 | text_thickness = 1 if text_scale > 1.1 else 1 36 | line_thickness = max(1, int(image.shape[1] / 500.)) 37 | 38 | radius = max(5, int(im_w/140.)) 39 | cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), 40 | (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) 41 | 42 | for i, tlwh in enumerate(tlwhs): 43 | x1, y1, w, h = tlwh 44 | intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) 45 | obj_id = int(obj_ids[i]) 46 | id_text = '{}'.format(int(obj_id)) 47 | if ids2 is not None: 48 | id_text = id_text + ', {}'.format(int(ids2[i])) 49 | _line_thickness = 1 if obj_id <= 0 else line_thickness 50 | color = get_color(abs(obj_id)) 51 | cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) 52 | cv2.putText(im, id_text, (intbox[0], intbox[1] + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), 53 | thickness=text_thickness) 54 | return im 55 | 56 | 57 | def plot_trajectory(image, tlwhs, track_ids): 58 | image = image.copy() 59 | for one_tlwhs, track_id in zip(tlwhs, track_ids): 60 | color = get_color(int(track_id)) 61 | for tlwh in one_tlwhs: 62 | x1, y1, w, h = tuple(map(int, tlwh)) 63 | cv2.circle(image, (int(x1 + 0.5 * w), int(y1 + h)), 2, color, thickness=2) 64 | 65 | return image 66 | 67 | 68 | def plot_detections(image, tlbrs, scores=None, color=(255, 0, 0), ids=None): 69 | im = np.copy(image) 70 | text_scale = max(1, image.shape[1] / 800.) 
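    # Each row of `tlbrs` is (x1, y1, x2, y2, ...); when a detection row has 7+
    # entries, column 5 (> 0 or not) distinguishes 'det' from 'trk' boxes and
    # column 6 carries the confidence score rendered next to the box.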
71 | thickness = 2 if text_scale > 1.3 else 1 72 | for i, det in enumerate(tlbrs): 73 | x1, y1, x2, y2 = np.asarray(det[:4], dtype=np.int) 74 | if len(det) >= 7: 75 | label = 'det' if det[5] > 0 else 'trk' 76 | if ids is not None: 77 | text = '{}# {:.2f}: {:d}'.format(label, det[6], ids[i]) 78 | cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), 79 | thickness=thickness) 80 | else: 81 | text = '{}# {:.2f}'.format(label, det[6]) 82 | 83 | if scores is not None: 84 | text = '{:.2f}'.format(scores[i]) 85 | cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), 86 | thickness=thickness) 87 | 88 | cv2.rectangle(im, (x1, y1), (x2, y2), color, 2) 89 | 90 | return im 91 | -------------------------------------------------------------------------------- /detector/tracker_cfg.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | cfg = edict() 4 | cfg.CONFIG = '../detector/tracker/cfg/yolov3.cfg' 5 | cfg.WEIGHTS = '../data/models/detector_models/jde.uncertainty.pt' 6 | cfg.IMG_SIZE = (1088, 608) 7 | cfg.NMS_THRES = 0.4 8 | cfg.CONFIDENCE = 0.2 9 | cfg.BUFFER_SIZE = 30 # frame buffer 10 | -------------------------------------------------------------------------------- /detector/yolo/README.md: -------------------------------------------------------------------------------- 1 | # A PyTorch implementation of a YOLO v3 Object Detector 2 | 3 | Forked from https://github.com/ayooshkathuria/pytorch-yolo-v3 4 | -------------------------------------------------------------------------------- /detector/yolo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/yolo/__init__.py -------------------------------------------------------------------------------- /detector/yolo/bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = (box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2, args=None): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 
| #get the corrdinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | if not args: 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) 70 | else: 71 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).to(args.device))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).to(args.device)) 72 | #Union Area 73 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 74 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 75 | 76 | iou = inter_area / (b1_area + b2_area - inter_area) 77 | 78 | return iou 79 | 80 | 81 | def pred_corner_coord(prediction): 82 | #Get indices of non-zero confidence bboxes 83 | ind_nz = torch.nonzero(prediction[:,:,4], as_tuple=False).transpose(0,1).contiguous() 84 | 85 | box = prediction[ind_nz[0], ind_nz[1]] 86 | 87 | 88 | box_a = box.new(box.shape) 89 | box_a[:,0] = (box[:,0] - box[:,2]/2) 90 | box_a[:,1] = (box[:,1] - box[:,3]/2) 91 | box_a[:,2] = (box[:,0] + box[:,2]/2) 92 | box_a[:,3] = (box[:,1] + box[:,3]/2) 93 | box[:,:4] = box_a[:,:4] 94 | 95 | prediction[ind_nz[0], ind_nz[1]] = box 96 | 97 | return prediction 98 | 99 | 100 | 101 | 102 | def write(x, batches, results, colors, classes): 103 | c1 = tuple(x[1:3].int()) 104 | c2 = tuple(x[3:5].int()) 105 | img = results[int(x[0])] 106 | cls = int(x[-1]) 107 | label = "{0}".format(classes[cls]) 108 | color = random.choice(colors) 109 | cv2.rectangle(img, c1, c2,color, 1) 110 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 111 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 112 | cv2.rectangle(img, c1, c2,color, -1) 113 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 114 | return img 115 | -------------------------------------------------------------------------------- /detector/yolo/cam_demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | from darknet import Darknet 10 | from preprocess import prep_image, inp_to_image 11 | import pandas as pd 12 | import random 13 | import argparse 14 | import pickle as pkl 15 | 16 | def get_test_input(input_dim, CUDA): 17 | img = cv2.imread("imgs/messi.jpg") 18 | img = cv2.resize(img, (input_dim, input_dim)) 19 | img_ = img[:,:,::-1].transpose((2,0,1)) 20 | img_ = img_[np.newaxis,:,:,:]/255.0 21 | img_ = torch.from_numpy(img_).float() 22 | img_ = Variable(img_) 23 | 24 | if CUDA: 25 | img_ = img_.cuda() 26 | 27 | return img_ 28 | 29 | def prep_image(img, inp_dim): 30 | """ 31 | Prepare image for inputting to the neural network. 
32 | 33 | Returns a Variable 34 | """ 35 | 36 | orig_im = img 37 | dim = orig_im.shape[1], orig_im.shape[0] 38 | img = cv2.resize(orig_im, (inp_dim, inp_dim)) 39 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 40 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 41 | return img_, orig_im, dim 42 | 43 | def write(x, img): 44 | c1 = tuple(x[1:3].int()) 45 | c2 = tuple(x[3:5].int()) 46 | cls = int(x[-1]) 47 | label = "{0}".format(classes[cls]) 48 | color = random.choice(colors) 49 | cv2.rectangle(img, c1, c2,color, 1) 50 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 51 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 52 | cv2.rectangle(img, c1, c2,color, -1) 53 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 54 | return img 55 | 56 | def arg_parse(): 57 | """ 58 | Parse arguements to the detect module 59 | 60 | """ 61 | 62 | 63 | parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') 64 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.25) 65 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 66 | parser.add_argument("--reso", dest = 'reso', help = 67 | "Input resolution of the network. Increase to increase accuracy. Decrease to increase speed", 68 | default = "160", type = str) 69 | return parser.parse_args() 70 | 71 | 72 | 73 | if __name__ == '__main__': 74 | cfgfile = "cfg/yolov3-spp.cfg" 75 | weightsfile = "yolov3-spp.weights" 76 | num_classes = 80 77 | 78 | args = arg_parse() 79 | confidence = float(args.confidence) 80 | nms_thesh = float(args.nms_thresh) 81 | start = 0 82 | CUDA = torch.cuda.is_available() 83 | 84 | 85 | 86 | 87 | num_classes = 80 88 | bbox_attrs = 5 + num_classes 89 | 90 | model = Darknet(cfgfile) 91 | model.load_weights(weightsfile) 92 | 93 | model.net_info["height"] = args.reso 94 | inp_dim = int(model.net_info["height"]) 95 | 96 | assert inp_dim % 32 == 0 97 | assert inp_dim > 32 98 | 99 | if CUDA: 100 | model.cuda() 101 | 102 | model.eval() 103 | 104 | videofile = 'video.avi' 105 | 106 | cap = cv2.VideoCapture(0) 107 | 108 | assert cap.isOpened(), 'Cannot capture source' 109 | 110 | frames = 0 111 | start = time.time() 112 | while cap.isOpened(): 113 | 114 | ret, frame = cap.read() 115 | if ret: 116 | 117 | img, orig_im, dim = prep_image(frame, inp_dim) 118 | 119 | # im_dim = torch.FloatTensor(dim).repeat(1,2) 120 | 121 | 122 | if CUDA: 123 | im_dim = im_dim.cuda() 124 | img = img.cuda() 125 | 126 | 127 | output = model(Variable(img), CUDA) 128 | output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh) 129 | 130 | if type(output) == int: 131 | frames += 1 132 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 133 | cv2.imshow("frame", orig_im) 134 | key = cv2.waitKey(1) 135 | if key & 0xFF == ord('q'): 136 | break 137 | continue 138 | 139 | 140 | 141 | output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(inp_dim))/inp_dim 142 | 143 | # im_dim = im_dim.repeat(output.size(0), 1) 144 | output[:,[1,3]] *= frame.shape[1] 145 | output[:,[2,4]] *= frame.shape[0] 146 | 147 | 148 | classes = load_classes('data/coco.names') 149 | colors = pkl.load(open("pallete", "rb")) 150 | 151 | list(map(lambda x: write(x, orig_im), output)) 152 | 153 | 154 | cv2.imshow("frame", orig_im) 155 | key = cv2.waitKey(1) 156 | if key & 0xFF == ord('q'): 157 | break 158 | frames += 1 159 | print("FPS of the video 
is {:5.2f}".format( frames / (time.time() - start))) 160 | 161 | 162 | else: 163 | break 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /detector/yolo/cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /detector/yolo/cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | 
stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /detector/yolo/cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 
12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | 
activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /detector/yolo/detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | import argparse 10 | import os 11 | import os.path as osp 12 | from darknet import Darknet 13 | from preprocess import prep_image, inp_to_image 14 | import pandas as pd 15 | import random 16 | import pickle as pkl 17 | import itertools 18 | 19 | 20 | if __name__ == '__main__': 21 | 22 | scales = "1,2,3" 23 | images = "imgs/messi.jpg" 24 | batch_size = 1 25 | confidence = 0.5 26 | nms_thesh = 0.4 27 | 28 | CUDA = torch.cuda.is_available() 29 | 30 | num_classes = 80 31 | classes = load_classes('data/coco.names') 32 | 33 | #Set up the neural network 34 | print("Loading network.....") 35 | model = Darknet("cfg/yolov3-spp.cfg") 36 | model.load_weights("yolov3-spp.weights") 37 | print("Network successfully loaded") 38 | 39 | model.net_info["height"] = "608" 40 | inp_dim = int(model.net_info["height"]) 41 | assert inp_dim % 32 == 0 42 | assert inp_dim > 32 43 | 44 | #If there's a GPU availible, put the model on GPU 45 | if CUDA: 46 | model.cuda() 47 | 48 | #Set the model in evaluation mode 49 | model.eval() 50 | 51 | #Detection phase 52 | try: 53 | imlist = [] 54 | imlist.append(osp.join(osp.realpath('.'), images)) 55 | except FileNotFoundError: 56 | print ("No file or directory with the name {}".format(images)) 57 | exit() 58 | 59 | batches = list(map(prep_image, imlist, [inp_dim for x in range(len(imlist))])) 60 | im_batches = [x[0] for x in batches] 61 | orig_ims = [x[1] for x in batches] 62 | im_dim_list = [x[2] for x in batches] 63 | im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2) 64 | 65 | if CUDA: 66 | im_dim_list = im_dim_list.cuda() 67 | 68 | 69 | for batch in im_batches: 70 | #load the image 71 | if CUDA: 72 | batch = batch.cuda() 73 | with torch.no_grad(): 74 | prediction = model(Variable(batch), CUDA) 75 | 76 | prediction = write_results(prediction, confidence, num_classes, nms=True, nms_conf=nms_thesh) 77 | output = prediction 78 | 79 | if CUDA: 80 | torch.cuda.synchronize() 81 | 82 | try: 83 | output 84 | except NameError: 85 | print("No detections were made") 86 | exit() 87 | print(im_dim_list.shape) 88 | im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long()) 89 | 90 | scaling_factor = torch.min(inp_dim/im_dim_list,1)[0].view(-1,1) 91 | 92 | 93 | output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2 94 | output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2 95 | 96 | output[:,1:5] /= scaling_factor 97 | 98 | for i in range(output.shape[0]): 99 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0]) 100 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1]) 101 | 102 | print(output) 103 | print(output.shape) 104 | -------------------------------------------------------------------------------- 
/detector/yolo/pallete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MVIG-SJTU/AlphAction/4ea202d7491e184f2cae3984aadd8adab62a368d/detector/yolo/pallete -------------------------------------------------------------------------------- /detector/yolo/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import numpy as np 5 | import cv2 6 | 7 | try: 8 | from util import count_parameters as count 9 | from util import convert2cpu as cpu 10 | except ImportError: 11 | from yolo.util import count_parameters as count 12 | from yolo.util import convert2cpu as cpu 13 | from PIL import Image, ImageDraw 14 | 15 | 16 | def letterbox_image(img, inp_dim): 17 | '''resize image with unchanged aspect ratio using padding''' 18 | img_w, img_h = img.shape[1], img.shape[0] 19 | w, h = inp_dim 20 | new_w = int(img_w * min(w / img_w, h / img_h)) 21 | new_h = int(img_h * min(w / img_w, h / img_h)) 22 | resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC) 23 | 24 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 25 | 26 | canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image 27 | 28 | return canvas 29 | 30 | 31 | def prep_image(img, inp_dim): 32 | """ 33 | Prepare image for inputting to the neural network. 34 | 35 | Returns a Variable 36 | """ 37 | 38 | orig_im = cv2.imread(img) 39 | dim = orig_im.shape[1], orig_im.shape[0] 40 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 41 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 43 | return img_, orig_im, dim 44 | 45 | 46 | def prep_frame(img, inp_dim): 47 | """ 48 | Prepare image for inputting to the neural network. 
49 | 50 | Returns a Variable 51 | """ 52 | 53 | orig_im = img 54 | dim = orig_im.shape[1], orig_im.shape[0] 55 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 56 | img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy() 57 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 58 | return img_, orig_im, dim 59 | 60 | 61 | def prep_image_pil(img, network_dim): 62 | orig_im = Image.open(img) 63 | img = orig_im.convert('RGB') 64 | dim = img.size 65 | img = img.resize(network_dim) 66 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) 67 | img = img.view(*network_dim, 3).transpose(0, 1).transpose(0, 2).contiguous() 68 | img = img.view(1, 3, *network_dim) 69 | img = img.float().div(255.0) 70 | return (img, orig_im, dim) 71 | 72 | 73 | def inp_to_image(inp): 74 | inp = inp.cpu().squeeze() 75 | inp = inp * 255 76 | try: 77 | inp = inp.data.numpy() 78 | except RuntimeError: 79 | inp = inp.numpy() 80 | inp = inp.transpose(1, 2, 0) 81 | 82 | inp = inp[:, :, ::-1] 83 | return inp 84 | -------------------------------------------------------------------------------- /detector/yolo_cfg.py: -------------------------------------------------------------------------------- 1 | from easydict import EasyDict as edict 2 | 3 | cfg = edict() 4 | cfg.CONFIG = '../detector/yolo/cfg/yolov3-spp.cfg' 5 | cfg.WEIGHTS = '../data/models/detector_models/yolov3-spp.weights' 6 | cfg.INP_DIM = 608 7 | cfg.NMS_THRES = 0.6 8 | cfg.CONFIDENCE = 0.01 9 | cfg.NUM_CLASSES = 80 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | 6 | import numpy as np 7 | import torch 8 | from setuptools import setup, Extension, find_packages 9 | from torch.utils.cpp_extension import CUDA_HOME 10 | from torch.utils.cpp_extension import CppExtension 11 | from torch.utils.cpp_extension import CUDAExtension 12 | from Cython.Build import cythonize 13 | import platform 14 | 15 | def make_cython_ext(name, module, sources): 16 | extra_compile_args = None 17 | if platform.system() != 'Windows': 18 | extra_compile_args = { 19 | 'cxx': ['-Wno-unused-function', '-Wno-write-strings'] 20 | } 21 | 22 | extension = Extension( 23 | '{}.{}'.format(module, name), 24 | [os.path.join(*module.split('.'), p) for p in sources], 25 | include_dirs=[np.get_include()], 26 | language='c++', 27 | extra_compile_args=extra_compile_args) 28 | extension, = cythonize(extension) 29 | return extension 30 | 31 | 32 | def make_cuda_ext(name, module, sources): 33 | 34 | return CUDAExtension( 35 | name='{}.{}'.format(module, name), 36 | sources=[os.path.join(*module.split('.'), p) for p in sources], 37 | extra_compile_args={ 38 | 'cxx': [], 39 | 'nvcc': [ 40 | '-D__CUDA_NO_HALF_OPERATORS__', 41 | '-D__CUDA_NO_HALF_CONVERSIONS__', 42 | '-D__CUDA_NO_HALF2_OPERATORS__', 43 | ] 44 | }) 45 | 46 | 47 | def get_extensions(): 48 | this_dir = os.path.dirname(os.path.abspath(__file__)) 49 | extensions_dir = os.path.join(this_dir, "alphaction/csrc") 50 | 51 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 52 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 53 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 54 | 55 | sources = main_file + source_cpu 56 | extension = CppExtension 57 | 58 | extra_compile_args = {"cxx": []} 59 | define_macros = [] 60 | 61 | if (torch.cuda.is_available() and CUDA_HOME is not None) or 
os.getenv("FORCE_CUDA", "0") == "1": 62 | extension = CUDAExtension 63 | sources += source_cuda 64 | define_macros += [("WITH_CUDA", None)] 65 | extra_compile_args["nvcc"] = [ 66 | "-O3", 67 | "-DCUDA_HAS_FP16=1", 68 | "-D__CUDA_NO_HALF_OPERATORS__", 69 | "-D__CUDA_NO_HALF_CONVERSIONS__", 70 | "-D__CUDA_NO_HALF2_OPERATORS__", 71 | ] 72 | 73 | sources = [os.path.join(extensions_dir, s) for s in sources] 74 | 75 | include_dirs = [extensions_dir] 76 | 77 | ext_modules = [ 78 | extension( 79 | "alphaction._custom_cuda_ext", 80 | sources, 81 | include_dirs=include_dirs, 82 | define_macros=define_macros, 83 | extra_compile_args=extra_compile_args, 84 | ), 85 | make_cython_ext( 86 | name='soft_nms_cpu', 87 | module='detector.nms', 88 | sources=['src/soft_nms_cpu.pyx']), 89 | make_cuda_ext( 90 | name='nms_cpu', 91 | module='detector.nms', 92 | sources=['src/nms_cpu.cpp']), 93 | make_cuda_ext( 94 | name='nms_cuda', 95 | module='detector.nms', 96 | sources=['src/nms_cuda.cpp', 'src/nms_kernel.cu']), 97 | 98 | ] 99 | 100 | return ext_modules 101 | 102 | 103 | setup( 104 | name="alphaction", 105 | author="yelantf", 106 | url="https://github.com/MVIG-SJTU/AlphAction", 107 | ext_modules=get_extensions(), 108 | packages=find_packages(".", exclude=[ 109 | "config_files", "demo", "gifs", "tools", "data", 110 | ]), 111 | install_requires=[ 112 | "tqdm", 113 | "yacs", 114 | "opencv-python", 115 | "tensorboardX", 116 | "SciPy", 117 | "matplotlib", 118 | "cython-bbox", 119 | "easydict", 120 | ], 121 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 122 | ) 123 | -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from alphaction.config import cfg 6 | from alphaction.dataset import make_data_loader 7 | from alphaction.engine.inference import inference 8 | from alphaction.modeling.detector import build_detection_model 9 | from alphaction.utils.checkpoint import ActionCheckpointer 10 | from torch.utils.collect_env import get_pretty_env_info 11 | from alphaction.utils.comm import synchronize, get_rank 12 | from alphaction.utils.IA_helper import has_memory 13 | from alphaction.utils.logger import setup_logger 14 | #pytorch issuse #973 15 | import resource 16 | 17 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 18 | resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1], rlimit[1])) 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") 23 | parser.add_argument( 24 | "--config-file", 25 | default="", 26 | metavar="FILE", 27 | help="path to config file", 28 | ) 29 | parser.add_argument("--local_rank", type=int, default=0) 30 | parser.add_argument( 31 | "opts", 32 | help="Modify config options using the command-line", 33 | default=None, 34 | nargs=argparse.REMAINDER, 35 | ) 36 | 37 | args = parser.parse_args() 38 | 39 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 40 | distributed = num_gpus > 1 41 | 42 | if distributed: 43 | torch.cuda.set_device(args.local_rank) 44 | torch.distributed.init_process_group( 45 | backend="nccl", init_method="env://" 46 | ) 47 | 48 | # Merge config file. 49 | cfg.merge_from_file(args.config_file) 50 | cfg.merge_from_list(args.opts) 51 | cfg.freeze() 52 | 53 | # Print experimental infos. 
54 | save_dir = "" 55 | logger = setup_logger("alphaction", save_dir, get_rank()) 56 | logger.info("Using {} GPUs".format(num_gpus)) 57 | logger.info(cfg) 58 | 59 | logger.info("Collecting env info (might take some time)") 60 | logger.info("\n" + get_pretty_env_info()) 61 | 62 | # Build the model. 63 | model = build_detection_model(cfg) 64 | model.to("cuda") 65 | 66 | # load weight. 67 | output_dir = cfg.OUTPUT_DIR 68 | checkpointer = ActionCheckpointer(cfg, model, save_dir=output_dir) 69 | checkpointer.load(cfg.MODEL.WEIGHT) 70 | 71 | output_folders = [None] * len(cfg.DATASETS.TEST) 72 | dataset_names = cfg.DATASETS.TEST 73 | mem_active = has_memory(cfg.MODEL.IA_STRUCTURE) 74 | if cfg.OUTPUT_DIR: 75 | for idx, dataset_name in enumerate(dataset_names): 76 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) 77 | os.makedirs(output_folder, exist_ok=True) 78 | output_folders[idx] = output_folder 79 | 80 | # Do inference. 81 | data_loaders_test = make_data_loader(cfg, is_train=False, is_distributed=distributed) 82 | for output_folder, dataset_name, data_loader_test in zip(output_folders, dataset_names, data_loaders_test): 83 | inference( 84 | model, 85 | data_loader_test, 86 | dataset_name, 87 | mem_active=mem_active, 88 | output_folder=output_folder, 89 | ) 90 | synchronize() 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | --------------------------------------------------------------------------------
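Usage note for the config plumbing in test_net.py above — a minimal sketch, assuming
only the yacs-style `cfg` exposed by `alphaction.config` and using a placeholder
checkpoint path: the script first merges a YAML file from `config_files/` and then
applies the trailing `KEY VALUE` pairs collected in `args.opts`, which is equivalent to

    from alphaction.config import cfg

    cfg.merge_from_file("config_files/resnet101_8x8f_denseserial.yaml")
    cfg.merge_from_list(["MODEL.WEIGHT", "path/to/checkpoint.pth"])  # placeholder path
    cfg.freeze()

When executed as a script, test_net.py is typically driven by the PyTorch distributed
launcher, e.g. `python -m torch.distributed.launch --nproc_per_node=<num_gpus>
test_net.py --config-file <yaml> [KEY VALUE ...]`, which supplies the `--local_rank`
argument and the `WORLD_SIZE` environment variable that the code above reads.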