├── .gitignore ├── LICENSE ├── README.md ├── data └── ILSVRC2015 │ └── ImageSets │ ├── DET_train_30classes.txt │ ├── VID_train_15frames.txt │ ├── VID_train_every10frames.txt │ └── VID_val_videos.txt ├── experiments ├── fgfa_rfcn │ ├── cfgs │ │ ├── fgfa_rfcn_vid_demo.yaml │ │ └── resnet_v1_101_flownet_imagenet_vid_rfcn_end2end_ohem.yaml │ ├── fgfa_rfcn_end2end_train_test.py │ └── fgfa_rfcn_test.py └── manet_rfcn │ ├── cfgs │ ├── phase-1.yaml │ ├── phase-2.yaml │ └── phase-3.yaml │ └── manet_rfcn_end2end_train_test.py ├── images ├── table2.png ├── table3.png └── table4.png ├── init.bat ├── init.sh ├── lib ├── Makefile ├── __init__.py ├── bbox │ ├── .gitignore │ ├── __init__.py │ ├── bbox.pyx │ ├── bbox_regression.py │ ├── bbox_transform.py │ ├── setup_linux.py │ └── setup_windows.py ├── dataset │ ├── .ropeproject │ │ ├── config.py │ │ ├── globalnames │ │ ├── history │ │ └── objectdb │ ├── __init__.py │ ├── ds_utils.py │ ├── imagenet_vid.py │ ├── imagenet_vid_eval.py │ ├── imagenet_vid_eval_motion.py │ ├── imagenet_vid_groundtruth_motion_iou.mat │ ├── imdb.py │ └── log_test.py ├── nms │ ├── __init__.py │ ├── cpu_nms.pyx │ ├── gpu_nms.cu │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms.py │ ├── nms_kernel.cu │ ├── seq_nms.py │ ├── setup_linux.py │ ├── setup_windows.py │ └── setup_windows_cuda.py ├── rpn │ ├── __init__.py │ ├── generate_anchor.py │ └── rpn.py └── utils │ ├── PrefetchingIter.py │ ├── __init__.py │ ├── combine_model.py │ ├── create_logger.py │ ├── image.py │ ├── image_processing.py │ ├── load_data.py │ ├── load_model.py │ ├── lr_scheduler.py │ ├── roidb.py │ ├── save_model.py │ ├── show_boxes.py │ ├── symbol.py │ └── tictoc.py ├── manet_rfcn ├── __init__.py ├── _init_paths.py ├── config │ ├── __init__.py │ └── config.py ├── core │ ├── DataParallelExecutorGroup.py │ ├── __init__.py │ ├── callback.py │ ├── loader.py │ ├── metric.py │ ├── module.py │ ├── rcnn.py │ └── tester.py ├── demo.py ├── function │ ├── __init__.py │ ├── test_rcnn.py │ ├── test_rpn.py │ ├── train_rcnn.py │ └── train_rpn.py ├── operator_cxx │ ├── psroi_pooling-inl.h │ ├── psroi_pooling.cc │ └── psroi_pooling.cu ├── operator_py │ ├── __init__.py │ ├── box_annotator_ohem.py │ ├── proposal.py │ ├── proposal_target.py │ ├── rpn_inv_normalize.py │ └── tile_as.py ├── symbols │ ├── __init__.py │ └── resnet_v1_101_manet_rfcn.py ├── test.py └── train_end2end.py └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ project files 2 | .idea 3 | *.iml 4 | out 5 | gen 6 | 7 | ### Vim template 8 | [._]*.s[a-w][a-z] 9 | [._]s[a-w][a-z] 10 | *.un~ 11 | Session.vim 12 | .netrwhist 13 | *~ 14 | 15 | ### IPythonNotebook template 16 | # Temporary data 17 | .ipynb_checkpoints/ 18 | 19 | ### Python template 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | */*/*.pyc 24 | */*.pyc 25 | *.pyc 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | env/ 34 | build/ 35 | develop-eggs/ 36 | dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | #lib/ 41 | #lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *,cover 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | *.ipynb 83 | *.params 84 | *.json 85 | .vscode/ 86 | 87 | lib/dataset/pycocotools/*.c 88 | lib/dataset/pycocotools/*.cpp 89 | lib/nms/*.c 90 | lib/nms/*.cpp 91 | 92 | external 93 | output 94 | model 95 | data 96 | demo 97 | 98 | .db 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fully Motion-Aware Network for Video Object Detection 2 | 3 | 4 | This implementation is a fork of [FGFA](https://github.com/msracver/Flow-Guided-Feature-Aggregation), extended by [Shiyao Wang](https://github.com/wangshy31) with instance-level aggregation and motion pattern reasoning. 5 | 6 | 7 | 8 | ## Introduction 9 | 10 | **Fully Motion-Aware Network for Video Object Detection (MANet)** was initially described in an [ECCV 2018 paper](https://wangshy31.github.io/papers/2-MANet.pdf). It proposes an end-to-end model, the fully motion-aware network (MANet), which jointly calibrates object features at both the pixel level and the instance level in a unified framework. 12 | The contributions of this paper include: 13 | 14 | * Propose an instance-level feature calibration method by learning instance movements through time. The instance-level calibration is more robust to occlusions and outperforms pixel-level feature calibration. 15 | * Develop a motion pattern reasoning module to dynamically combine pixel-level and instance-level calibration according to the motion. 16 | * Demonstrate the MANet on the large-scale [ImageNet VID dataset](http://image-net.org/challenges/LSVRC/) with state-of-the-art performance. 17 | 18 | 19 | 20 | ## Installation 21 | 22 | 1. Clone the repo; we refer to the directory that you cloned as ${MANet_ROOT}. 23 | ``` 24 | git clone https://github.com/wangshy31/MANet_for_Video_Object_Detection.git 25 | ``` 26 | 2. The following Python packages may be missing: cython, opencv-python >= 3.2.0, easydict. If `pip` is set up on your system, they can be fetched and installed by running 27 | ``` 28 | pip install Cython 29 | pip install opencv-python==3.2.0.6 30 | pip install easydict==1.6 31 | ``` 32 | 3. Run `sh ./init.sh` to automatically build the cython modules and create some folders. 33 | 34 | 4. 
Install MXNet as in [FGFA](https://github.com/msracver/Flow-Guided-Feature-Aggregation): 35 | 36 | 4.1 Clone MXNet and check out [MXNet@(v0.10.0)](https://github.com/apache/incubator-mxnet/tree/v0.10.0) by 37 | 38 | ``` 39 | git clone --recursive https://github.com/apache/incubator-mxnet.git 40 | cd incubator-mxnet 41 | git checkout v0.10.0 42 | git submodule update 43 | ``` 44 | 45 | 4.2 Copy the operators in `$(MANet_ROOT)/manet_rfcn/operator_cxx` to `$(YOUR_MXNET_FOLDER)/src/operator/contrib` by 46 | 47 | ```cp -r $(MANet_ROOT)/manet_rfcn/operator_cxx/* $(MXNET_ROOT)/src/operator/contrib/``` 48 | 49 | 4.3 Compile MXNet 50 | 51 | ``` 52 | cd ${MXNET_ROOT} 53 | make -j4 54 | ``` 55 | 4.4 Install the MXNet Python binding by 56 | ``` 57 | cd python 58 | sudo python setup.py install 59 | ``` 60 | 61 | 62 | 63 | ## Preparation for Training & Testing 64 | 65 | **For data processing**: 66 | 67 | 1. Please download the ILSVRC2015 DET and ILSVRC2015 VID datasets, and make sure the directory structure looks like this: 68 | 69 | ``` 70 | ./data/ILSVRC2015/ 71 | ./data/ILSVRC2015/Annotations/DET 72 | ./data/ILSVRC2015/Annotations/VID 73 | ./data/ILSVRC2015/Data/DET 74 | ./data/ILSVRC2015/Data/VID 75 | ./data/ILSVRC2015/ImageSets 76 | ``` 77 | 78 | 2. Please download the ImageNet pre-trained ResNet-v1-101 model and the Flying-Chairs pre-trained FlowNet model manually from [OneDrive](https://1drv.ms/u/s!Am-5JzdW2XHzhqMOBdCBiNaKbcjPrA), and put them under the folder `./model`. Make sure it looks like this: 79 | 80 | ``` 81 | ./model/pretrained_model/resnet_v1_101-0000.params 82 | ./model/pretrained_model/flownet-0000.params 83 | ``` 84 | 85 | **For training & testing**: 86 | 87 | 1. Three-phase training is performed on the mixture of ImageNet DET+VID, which is helpful for the final performance. 88 | 89 | **Phase 1**: Fix the weights of ResNet and combine the pixel-level and instance-level aggregated features by averaging. See `experiments/manet_rfcn/cfgs/phase-1.yaml`; 90 | 91 | **Phase 2**: Similar to phase 1, but ResNet is trained jointly. See `experiments/manet_rfcn/cfgs/phase-2.yaml`; 92 | 93 | **Phase 3**: Fix the weights of ResNet, replace the averaging with learnable weights and sample more VID data. See `experiments/manet_rfcn/cfgs/phase-3.yaml`; 94 | 95 | We use 4 GPUs to train models on ImageNet VID. Any NVIDIA GPU with at least 8GB of memory should be OK. 96 | 97 | 2. To perform experiments, run the python script with the corresponding config file as input. For example, to train and test MANet with R-FCN, use the following command: 98 | 99 | ``` 100 | ./run.sh 101 | ``` 102 | 103 | A cache folder will be created automatically to save the model and the log under 104 | 105 | `imagenet_vid/`. 106 | 107 | 3. Please find more details in the config files and in our code. 108 | 109 | ## Main Results 110 | 111 | 1. We conduct an ablation study to validate the effectiveness of the proposed network. 112 | 113 | ![ablation study](images/table2.png) 114 | 115 | **Table 1**. Accuracy of different methods on the ImageNet VID validation set, using ResNet-101 feature extraction networks. Detection accuracy is reported for slow (motion IoU > 0.9), medium (0.7 ≤ motion IoU ≤ 0.9), and fast (motion IoU < 0.7) moving object instances. 116 | 117 | 2. We take a deeper look at the detection results and show that the two calibrated features have complementary strengths. 118 | 119 | ![visualization](images/table3.png) 120 | 121 | **Figure 1**. Visualization of two typical examples: occluded and non-rigid objects. They show the respective strengths of the two calibration methods.
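The slow / medium / fast breakdown used in Tables 1 and 2 groups ground-truth instances by their motion IoU, i.e. (roughly) the overlap of an object's box with its boxes in nearby frames. The precomputed scores ship with the repo as `lib/dataset/imagenet_vid_groundtruth_motion_iou.mat` (see the `motion_iou_path` entry in the configs) and are consumed by `lib/dataset/imagenet_vid_eval_motion.py`. Below is a minimal sketch of that binning, using the thresholds from the Table 1 caption; it is illustrative only and not the repository's actual evaluation code.

```
import numpy as np

def motion_speed_masks(motion_ious):
    """Split instances into the motion categories used in Table 1.

    motion_ious: array-like of per-instance motion IoU scores in [0, 1].
    Returns boolean masks for the slow / medium / fast groups.
    """
    iou = np.asarray(motion_ious, dtype=np.float64)
    return {
        'slow': iou > 0.9,                        # object barely moves
        'medium': (iou >= 0.7) & (iou <= 0.9),
        'fast': iou < 0.7,                        # large displacement between frames
    }

# Example: three instances with decreasing motion IoU (increasing speed).
masks = motion_speed_masks([0.95, 0.80, 0.30])
print(masks['slow'])    # [ True False False]
print(masks['fast'])    # [False False  True]
```

Per-category accuracy can then be obtained by restricting the evaluation to the instances each mask selects.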
122 | 123 | ![statisticalanalysis](images/table4.png) 124 | 125 | **Table 2**. Statistical analysis on different validation sets. The instance-level calibration is better when objects are occluded or move more regularly while the pixel-level calibration performs well on non-rigid motion. Combination of these two module can achieve best performance. 126 | 127 | 128 | ## Download Trained Models 129 | You can download the trained MANet from [drive](https://drive.google.com/file/d/1tKFfOKaFUeZanKTCCwVw-xaKu0wAw71t/view?usp=sharing). It can achieve 78.03% mAP without sequence-level post-processing (e.g., SeqNMS). 130 | 131 | 132 | 133 | ## Citing MANet 134 | 135 | If you find Fully Motion-Aware Network for Video Object Detection useful in your research, please consider citing: 136 | ``` 137 | @inproceedings{wang2018fully, 138 | Author = {Wang, Shiyao and Zhou, Yucong and Yan, Junjie and Deng, Zhidong}, 139 | Title = {Fully Motion-Aware Network for Video Object Detection}, 140 | booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, 141 | pages={542--557}, 142 | Year = {2018} 143 | } 144 | 145 | ``` 146 | 147 | 148 | -------------------------------------------------------------------------------- /experiments/fgfa_rfcn/cfgs/fgfa_rfcn_vid_demo.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | MXNET_VERSION: "" 3 | output_path: "./output/fgfa_rfcn/imagenet_vid" 4 | gpus: '0' 5 | CLASS_AGNOSTIC: true 6 | SCALES: 7 | - 600 8 | - 1000 9 | default: 10 | frequent: 100 11 | kvstore: device 12 | network: 13 | PIXEL_MEANS: 14 | - 103.06 15 | - 115.90 16 | - 123.15 17 | IMAGE_STRIDE: 0 18 | RCNN_FEAT_STRIDE: 16 19 | RPN_FEAT_STRIDE: 16 20 | FIXED_PARAMS: 21 | - conv1 22 | - res2 23 | - bn 24 | ANCHOR_RATIOS: 25 | - 0.5 26 | - 1 27 | - 2 28 | ANCHOR_SCALES: 29 | - 8 30 | - 16 31 | - 32 32 | ANCHOR_MEANS: 33 | - 0.0 34 | - 0.0 35 | - 0.0 36 | - 0.0 37 | ANCHOR_STDS: 38 | - 0.1 39 | - 0.1 40 | - 0.4 41 | - 0.4 42 | NORMALIZE_RPN: TRUE 43 | NUM_ANCHORS: 9 44 | dataset: 45 | NUM_CLASSES: 31 46 | dataset: ImageNetVID 47 | dataset_path: "./data/ILSVRC2015" 48 | image_set: DET_train_30classes+VID_train_15frames 49 | root_path: "./data" 50 | test_image_set: VID_val_videos 51 | proposal: rpn 52 | TRAIN: 53 | lr: 0.00025 54 | lr_step: '1.333' 55 | warmup: false 56 | begin_epoch: 0 57 | end_epoch: 2 58 | model_prefix: 'fgfa_rfcn_vid' 59 | # whether resume training 60 | RESUME: false 61 | # whether flip image 62 | FLIP: true 63 | # whether shuffle image 64 | SHUFFLE: true 65 | # whether use OHEM 66 | ENABLE_OHEM: true 67 | # size of images for each device, 1 for e2e 68 | BATCH_IMAGES: 1 69 | # e2e changes behavior of anchor loader and metric 70 | END2END: true 71 | # group images with similar aspect ratio 72 | ASPECT_GROUPING: true 73 | # R-CNN 74 | # rcnn rois batch size 75 | BATCH_ROIS: -1 76 | BATCH_ROIS_OHEM: 128 77 | # rcnn rois sampling params 78 | FG_FRACTION: 0.25 79 | FG_THRESH: 0.5 80 | BG_THRESH_HI: 0.5 81 | BG_THRESH_LO: 0.0 82 | # rcnn bounding box regression params 83 | BBOX_REGRESSION_THRESH: 0.5 84 | BBOX_WEIGHTS: 85 | - 1.0 86 | - 1.0 87 | - 1.0 88 | - 1.0 89 | 90 | # RPN anchor loader 91 | # rpn anchors batch size 92 | RPN_BATCH_SIZE: 256 93 | # rpn anchors sampling params 94 | RPN_FG_FRACTION: 0.5 95 | RPN_POSITIVE_OVERLAP: 0.7 96 | RPN_NEGATIVE_OVERLAP: 0.3 97 | RPN_CLOBBER_POSITIVES: false 98 | # rpn bounding box regression params 99 | RPN_BBOX_WEIGHTS: 100 | - 1.0 101 | - 1.0 102 | - 1.0 103 | - 1.0 104 | RPN_POSITIVE_WEIGHT: 
-1.0 105 | # used for end2end training 106 | # RPN proposal 107 | CXX_PROPOSAL: true 108 | RPN_NMS_THRESH: 0.7 109 | RPN_PRE_NMS_TOP_N: 6000 110 | RPN_POST_NMS_TOP_N: 300 111 | RPN_MIN_SIZE: 0 112 | # approximate bounding box regression 113 | BBOX_NORMALIZATION_PRECOMPUTED: true 114 | BBOX_MEANS: 115 | - 0.0 116 | - 0.0 117 | - 0.0 118 | - 0.0 119 | BBOX_STDS: 120 | - 0.1 121 | - 0.1 122 | - 0.2 123 | - 0.2 124 | TEST: 125 | # use rpn to generate proposal 126 | HAS_RPN: true 127 | # size of images for each device 128 | BATCH_IMAGES: 1 129 | SEQ_NMS: false 130 | 131 | # RPN proposal 132 | CXX_PROPOSAL: true 133 | RPN_NMS_THRESH: 0.7 134 | RPN_PRE_NMS_TOP_N: 6000 135 | RPN_POST_NMS_TOP_N: 300 136 | RPN_MIN_SIZE: 0 137 | # RCNN nms 138 | NMS: 0.3 139 | test_epoch: 2 140 | -------------------------------------------------------------------------------- /experiments/fgfa_rfcn/cfgs/resnet_v1_101_flownet_imagenet_vid_rfcn_end2end_ohem.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | MXNET_VERSION: "" 3 | output_path: "./output/fgfa_rfcn/imagenet_vid" 4 | symbol: resnet_v1_101_flownet_rfcn 5 | gpus: '0,1,2,3' 6 | CLASS_AGNOSTIC: true 7 | SCALES: 8 | - 600 9 | - 1000 10 | default: 11 | frequent: 100 12 | kvstore: device 13 | network: 14 | pretrained: "./model/pretrained_model/resnet_v1_101" 15 | pretrained_flow: "./model/pretrained_model/flownet" 16 | pretrained_epoch: 0 17 | PIXEL_MEANS: 18 | - 103.06 19 | - 115.90 20 | - 123.15 21 | IMAGE_STRIDE: 0 22 | RCNN_FEAT_STRIDE: 16 23 | RPN_FEAT_STRIDE: 16 24 | FIXED_PARAMS: 25 | - conv1 26 | - res2 27 | - bn 28 | ANCHOR_RATIOS: 29 | - 0.5 30 | - 1 31 | - 2 32 | ANCHOR_SCALES: 33 | - 8 34 | - 16 35 | - 32 36 | ANCHOR_MEANS: 37 | - 0.0 38 | - 0.0 39 | - 0.0 40 | - 0.0 41 | ANCHOR_STDS: 42 | - 0.1 43 | - 0.1 44 | - 0.4 45 | - 0.4 46 | NORMALIZE_RPN: TRUE 47 | NUM_ANCHORS: 9 48 | dataset: 49 | NUM_CLASSES: 31 50 | dataset: ImageNetVID 51 | dataset_path: "./data/ILSVRC2015" 52 | image_set: DET_train_30classes+VID_train_15frames 53 | root_path: "./data" 54 | test_image_set: VID_val_videos 55 | proposal: rpn 56 | motion_iou_path: './lib/dataset/imagenet_vid_groundtruth_motion_iou.mat' 57 | enable_detailed_eval: true 58 | TRAIN: 59 | lr: 0.00025 60 | lr_step: '1.333' 61 | warmup: false 62 | begin_epoch: 1 63 | end_epoch: 3 64 | model_prefix: 'fgfa_rfcn_vid' 65 | # whether resume training 66 | RESUME: true 67 | # whether flip image 68 | FLIP: true 69 | # whether shuffle image 70 | SHUFFLE: true 71 | # whether use OHEM 72 | ENABLE_OHEM: true 73 | # size of images for each device, 1 for e2e 74 | BATCH_IMAGES: 1 75 | # e2e changes behavior of anchor loader and metric 76 | END2END: true 77 | # group images with similar aspect ratio 78 | ASPECT_GROUPING: true 79 | # R-CNN 80 | # rcnn rois batch size 81 | BATCH_ROIS: -1 82 | BATCH_ROIS_OHEM: 128 83 | # rcnn rois sampling params 84 | FG_FRACTION: 0.25 85 | FG_THRESH: 0.5 86 | BG_THRESH_HI: 0.5 87 | BG_THRESH_LO: 0.0 88 | # rcnn bounding box regression params 89 | BBOX_REGRESSION_THRESH: 0.5 90 | BBOX_WEIGHTS: 91 | - 1.0 92 | - 1.0 93 | - 1.0 94 | - 1.0 95 | 96 | # RPN anchor loader 97 | # rpn anchors batch size 98 | RPN_BATCH_SIZE: 256 99 | # rpn anchors sampling params 100 | RPN_FG_FRACTION: 0.5 101 | RPN_POSITIVE_OVERLAP: 0.7 102 | RPN_NEGATIVE_OVERLAP: 0.3 103 | RPN_CLOBBER_POSITIVES: false 104 | # rpn bounding box regression params 105 | RPN_BBOX_WEIGHTS: 106 | - 1.0 107 | - 1.0 108 | - 1.0 109 | - 1.0 110 | RPN_POSITIVE_WEIGHT: -1.0 111 | # used for end2end 
training 112 | # RPN proposal 113 | CXX_PROPOSAL: true 114 | RPN_NMS_THRESH: 0.7 115 | RPN_PRE_NMS_TOP_N: 6000 116 | RPN_POST_NMS_TOP_N: 300 117 | RPN_MIN_SIZE: 0 118 | # approximate bounding box regression 119 | BBOX_NORMALIZATION_PRECOMPUTED: true 120 | BBOX_MEANS: 121 | - 0.0 122 | - 0.0 123 | - 0.0 124 | - 0.0 125 | BBOX_STDS: 126 | - 0.1 127 | - 0.1 128 | - 0.2 129 | - 0.2 130 | TEST: 131 | # use rpn to generate proposal 132 | HAS_RPN: true 133 | # size of images for each device 134 | BATCH_IMAGES: 1 135 | SEQ_NMS: false 136 | 137 | # RPN proposal 138 | CXX_PROPOSAL: true 139 | RPN_NMS_THRESH: 0.7 140 | RPN_PRE_NMS_TOP_N: 6000 141 | RPN_POST_NMS_TOP_N: 300 142 | RPN_MIN_SIZE: 0 143 | # RCNN nms 144 | NMS: 0.3 145 | test_epoch: 3 146 | -------------------------------------------------------------------------------- /experiments/fgfa_rfcn/fgfa_rfcn_end2end_train_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | import os 8 | import sys 9 | os.environ['PYTHONUNBUFFERED'] = '1' 10 | os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0' 11 | os.environ['MXNET_ENABLE_GPU_P2P'] = '0' 12 | this_dir = os.path.dirname(__file__) 13 | sys.path.insert(0, os.path.join(this_dir, '..', '..', 'fgfa_rfcn')) 14 | 15 | import train_end2end 16 | import test 17 | 18 | if __name__ == "__main__": 19 | #train_end2end.main() 20 | test.main() 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /experiments/fgfa_rfcn/fgfa_rfcn_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | import sys 10 | os.environ['PYTHONUNBUFFERED'] = '1' 11 | os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0' 12 | os.environ['MXNET_ENABLE_GPU_P2P'] = '0' 13 | this_dir = os.path.dirname(__file__) 14 | sys.path.insert(0, os.path.join(this_dir, '..', '..', 'fgfa_rfcn')) 15 | 16 | import test 17 | 18 | if __name__ == "__main__": 19 | test.main() 20 | -------------------------------------------------------------------------------- /experiments/manet_rfcn/cfgs/phase-1.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | MXNET_VERSION: "" 3 | output_path: "./imagenet_vid" 4 | symbol: resnet_v1_101_manet_rfcn 5 | gpus: '0,1,2,3' 6 | CLASS_AGNOSTIC: true 7 | SCALES: 8 | - 600 9 | - 1000 10 | default: 11 | frequent: 100 12 | kvstore: device 13 | network: 14 | pretrained: "./model/pretrained_model/resnet_v1_101" 15 | pretrained_flow: "./model/pretrained_model/flownet" 16 | pretrained_epoch: 0 17 | PIXEL_MEANS: 18 | - 103.06 19 | - 115.90 20 | - 123.15 21 | IMAGE_STRIDE: 0 22 | RCNN_FEAT_STRIDE: 16 23 | RPN_FEAT_STRIDE: 16 24 | FIXED_PARAMS: 25 | - conv1 26 | - res2 27 | - bn 28 | - flow 29 | - conv 30 | - res 31 | - Convolution 32 | - deconv 33 | 34 | ANCHOR_RATIOS: 35 | - 0.5 36 | - 1 37 | - 2 38 | ANCHOR_SCALES: 39 | - 8 40 | - 16 41 | - 32 42 | ANCHOR_MEANS: 43 | 
- 0.0 44 | - 0.0 45 | - 0.0 46 | - 0.0 47 | ANCHOR_STDS: 48 | - 0.1 49 | - 0.1 50 | - 0.4 51 | - 0.4 52 | NORMALIZE_RPN: TRUE 53 | NUM_ANCHORS: 9 54 | dataset: 55 | NUM_CLASSES: 31 56 | dataset: ImageNetVID 57 | dataset_path: "./data/ILSVRC2015" 58 | image_set: DET_train_30classes+VID_train_15frames 59 | #image_set: DET_train_30classes+VID_pretrain_data 60 | root_path: "./data" 61 | test_image_set: VID_val_videos 62 | proposal: rpn 63 | motion_iou_path: '../lib/dataset/imagenet_vid_groundtruth_motion_iou.mat' 64 | enable_detailed_eval: true 65 | TRAIN: 66 | lr: 0.00025 67 | lr_step: '2.333' 68 | warmup: false 69 | begin_epoch: 0 70 | end_epoch: 3 71 | model_prefix: 'manet_rfcn_vid' 72 | # whether predict occlusion 73 | USE_OCCLUSION: False 74 | # whether resume training 75 | RESUME: false 76 | # whether flip image 77 | FLIP: true 78 | # whether shuffle image 79 | SHUFFLE: true 80 | # whether use OHEM 81 | ENABLE_OHEM: true 82 | # size of images for each device, 1 for e2e 83 | BATCH_IMAGES: 1 84 | # e2e changes behavior of anchor loader and metric 85 | END2END: true 86 | # group images with similar aspect ratio 87 | ASPECT_GROUPING: true 88 | # R-CNN 89 | # rcnn rois batch size 90 | BATCH_ROIS: -1 91 | BATCH_ROIS_OHEM: 128 92 | # rcnn rois sampling params 93 | FG_FRACTION: 0.25 94 | FG_THRESH: 0.5 95 | BG_THRESH_HI: 0.5 96 | BG_THRESH_LO: 0.0 97 | # rcnn bounding box regression params 98 | BBOX_REGRESSION_THRESH: 0.5 99 | BBOX_WEIGHTS: 100 | - 1.0 101 | - 1.0 102 | - 1.0 103 | - 1.0 104 | 105 | # RPN anchor loader 106 | # rpn anchors batch size 107 | RPN_BATCH_SIZE: 256 108 | # rpn anchors sampling params 109 | RPN_FG_FRACTION: 0.5 110 | RPN_POSITIVE_OVERLAP: 0.7 111 | RPN_NEGATIVE_OVERLAP: 0.3 112 | RPN_CLOBBER_POSITIVES: false 113 | # rpn bounding box regression params 114 | RPN_BBOX_WEIGHTS: 115 | - 1.0 116 | - 1.0 117 | - 1.0 118 | - 1.0 119 | RPN_POSITIVE_WEIGHT: -1.0 120 | # used for end2end training 121 | # RPN proposal 122 | CXX_PROPOSAL: true 123 | RPN_NMS_THRESH: 0.7 124 | RPN_PRE_NMS_TOP_N: 6000 125 | RPN_POST_NMS_TOP_N: 300 126 | RPN_MIN_SIZE: 0 127 | # approximate bounding box regression 128 | BBOX_NORMALIZATION_PRECOMPUTED: true 129 | BBOX_MEANS: 130 | - 0.0 131 | - 0.0 132 | - 0.0 133 | - 0.0 134 | BBOX_STDS: 135 | - 0.1 136 | - 0.1 137 | - 0.2 138 | - 0.2 139 | TEST: 140 | # use rpn to generate proposal 141 | HAS_RPN: true 142 | # size of images for each device 143 | BATCH_IMAGES: 1 144 | SEQ_NMS: false 145 | 146 | # RPN proposal 147 | CXX_PROPOSAL: true 148 | RPN_NMS_THRESH: 0.7 149 | RPN_PRE_NMS_TOP_N: 6000 150 | RPN_POST_NMS_TOP_N: 300 151 | RPN_MIN_SIZE: 0 152 | #KEY_FRAME_INTERVAL: 6 153 | # RCNN nms 154 | NMS: 0.44 155 | test_epoch: 3 156 | -------------------------------------------------------------------------------- /experiments/manet_rfcn/cfgs/phase-2.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | MXNET_VERSION: "" 3 | output_path: "./imagenet_vid" 4 | symbol: resnet_v1_101_manet_rfcn 5 | gpus: '0,1,2,3' 6 | CLASS_AGNOSTIC: true 7 | SCALES: 8 | - 600 9 | - 1000 10 | default: 11 | frequent: 100 12 | kvstore: device 13 | network: 14 | pretrained: "./model/pretrained_model/resnet_v1_101" 15 | pretrained_flow: "./model/pretrained_model/flownet" 16 | pretrained_epoch: 0 17 | PIXEL_MEANS: 18 | - 103.06 19 | - 115.90 20 | - 123.15 21 | IMAGE_STRIDE: 0 22 | RCNN_FEAT_STRIDE: 16 23 | RPN_FEAT_STRIDE: 16 24 | FIXED_PARAMS: 25 | - conv1 26 | - res2 27 | - bn 28 | #- flow 29 | #- conv 30 | #- res 31 | #- Convolution 32 | #- 
deconv 33 | 34 | ANCHOR_RATIOS: 35 | - 0.5 36 | - 1 37 | - 2 38 | ANCHOR_SCALES: 39 | - 8 40 | - 16 41 | - 32 42 | ANCHOR_MEANS: 43 | - 0.0 44 | - 0.0 45 | - 0.0 46 | - 0.0 47 | ANCHOR_STDS: 48 | - 0.1 49 | - 0.1 50 | - 0.4 51 | - 0.4 52 | NORMALIZE_RPN: TRUE 53 | NUM_ANCHORS: 9 54 | dataset: 55 | NUM_CLASSES: 31 56 | dataset: ImageNetVID 57 | dataset_path: "./data/ILSVRC2015" 58 | image_set: DET_train_30classes+VID_train_15frames 59 | #image_set: DET_train_30classes+VID_pretrain_data 60 | root_path: "./data" 61 | test_image_set: VID_val_videos 62 | proposal: rpn 63 | motion_iou_path: '../lib/dataset/imagenet_vid_groundtruth_motion_iou.mat' 64 | enable_detailed_eval: true 65 | TRAIN: 66 | lr: 0.00025 67 | lr_step: '2.333' 68 | warmup: false 69 | begin_epoch: 1 70 | end_epoch: 3 71 | model_prefix: 'manet_rfcn_vid' 72 | # whether predict occlusion 73 | USE_OCCLUSION: False 74 | # whether resume training 75 | RESUME: true 76 | # whether flip image 77 | FLIP: true 78 | # whether shuffle image 79 | SHUFFLE: true 80 | # whether use OHEM 81 | ENABLE_OHEM: true 82 | # size of images for each device, 1 for e2e 83 | BATCH_IMAGES: 1 84 | # e2e changes behavior of anchor loader and metric 85 | END2END: true 86 | # group images with similar aspect ratio 87 | ASPECT_GROUPING: true 88 | # R-CNN 89 | # rcnn rois batch size 90 | BATCH_ROIS: -1 91 | BATCH_ROIS_OHEM: 128 92 | # rcnn rois sampling params 93 | FG_FRACTION: 0.25 94 | FG_THRESH: 0.5 95 | BG_THRESH_HI: 0.5 96 | BG_THRESH_LO: 0.0 97 | # rcnn bounding box regression params 98 | BBOX_REGRESSION_THRESH: 0.5 99 | BBOX_WEIGHTS: 100 | - 1.0 101 | - 1.0 102 | - 1.0 103 | - 1.0 104 | 105 | # RPN anchor loader 106 | # rpn anchors batch size 107 | RPN_BATCH_SIZE: 256 108 | # rpn anchors sampling params 109 | RPN_FG_FRACTION: 0.5 110 | RPN_POSITIVE_OVERLAP: 0.7 111 | RPN_NEGATIVE_OVERLAP: 0.3 112 | RPN_CLOBBER_POSITIVES: false 113 | # rpn bounding box regression params 114 | RPN_BBOX_WEIGHTS: 115 | - 1.0 116 | - 1.0 117 | - 1.0 118 | - 1.0 119 | RPN_POSITIVE_WEIGHT: -1.0 120 | # used for end2end training 121 | # RPN proposal 122 | CXX_PROPOSAL: true 123 | RPN_NMS_THRESH: 0.7 124 | RPN_PRE_NMS_TOP_N: 6000 125 | RPN_POST_NMS_TOP_N: 300 126 | RPN_MIN_SIZE: 0 127 | # approximate bounding box regression 128 | BBOX_NORMALIZATION_PRECOMPUTED: true 129 | BBOX_MEANS: 130 | - 0.0 131 | - 0.0 132 | - 0.0 133 | - 0.0 134 | BBOX_STDS: 135 | - 0.1 136 | - 0.1 137 | - 0.2 138 | - 0.2 139 | TEST: 140 | # use rpn to generate proposal 141 | HAS_RPN: true 142 | # size of images for each device 143 | BATCH_IMAGES: 1 144 | SEQ_NMS: false 145 | 146 | # RPN proposal 147 | CXX_PROPOSAL: true 148 | RPN_NMS_THRESH: 0.7 149 | RPN_PRE_NMS_TOP_N: 6000 150 | RPN_POST_NMS_TOP_N: 300 151 | RPN_MIN_SIZE: 0 152 | #KEY_FRAME_INTERVAL: 6 153 | # RCNN nms 154 | NMS: 0.44 155 | test_epoch: 3 156 | -------------------------------------------------------------------------------- /experiments/manet_rfcn/cfgs/phase-3.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | MXNET_VERSION: "" 3 | output_path: "./imagenet_vid" 4 | symbol: resnet_v1_101_manet_rfcn 5 | gpus: '0,1,2,3' 6 | CLASS_AGNOSTIC: true 7 | SCALES: 8 | - 600 9 | - 1000 10 | default: 11 | frequent: 100 12 | kvstore: device 13 | network: 14 | pretrained: "./imagenet_vid/phase-2/DET_train_30classes_VID_train_15frames/manet_rfcn_vid" 15 | #pretrained_flow: "./model/pretrained_model/flownet" 16 | pretrained_epoch: 3 17 | PIXEL_MEANS: 18 | - 103.06 19 | - 115.90 20 | - 123.15 21 | IMAGE_STRIDE: 0 
22 | RCNN_FEAT_STRIDE: 16 23 | RPN_FEAT_STRIDE: 16 24 | FIXED_PARAMS: 25 | - conv1 26 | - res2 27 | - bn 28 | - flow 29 | - conv 30 | - res 31 | - Convolution 32 | - deconv 33 | 34 | ANCHOR_RATIOS: 35 | - 0.5 36 | - 1 37 | - 2 38 | ANCHOR_SCALES: 39 | - 8 40 | - 16 41 | - 32 42 | ANCHOR_MEANS: 43 | - 0.0 44 | - 0.0 45 | - 0.0 46 | - 0.0 47 | ANCHOR_STDS: 48 | - 0.1 49 | - 0.1 50 | - 0.4 51 | - 0.4 52 | NORMALIZE_RPN: TRUE 53 | NUM_ANCHORS: 9 54 | dataset: 55 | NUM_CLASSES: 31 56 | dataset: ImageNetVID 57 | dataset_path: "./data/ILSVRC2015" 58 | #image_set: DET_train_30classes+VID_train_15frames 59 | image_set: DET_train_30classes+VID_train_every10frames 60 | root_path: "./data" 61 | test_image_set: VID_val_videos 62 | proposal: rpn 63 | motion_iou_path: '../lib/dataset/imagenet_vid_groundtruth_motion_iou.mat' 64 | enable_detailed_eval: true 65 | TRAIN: 66 | lr: 0.00025 67 | lr_step: '0.666' 68 | warmup: false 69 | begin_epoch: 0 70 | end_epoch: 1 71 | model_prefix: 'manet_rfcn_vid' 72 | # whether predict occlusion 73 | USE_OCCLUSION: True 74 | # whether resume training 75 | RESUME: false 76 | # whether flip image 77 | FLIP: true 78 | # whether shuffle image 79 | SHUFFLE: true 80 | # whether use OHEM 81 | ENABLE_OHEM: true 82 | # size of images for each device, 1 for e2e 83 | BATCH_IMAGES: 1 84 | # e2e changes behavior of anchor loader and metric 85 | END2END: true 86 | # group images with similar aspect ratio 87 | ASPECT_GROUPING: true 88 | # R-CNN 89 | # rcnn rois batch size 90 | BATCH_ROIS: -1 91 | BATCH_ROIS_OHEM: 128 92 | # rcnn rois sampling params 93 | FG_FRACTION: 0.25 94 | FG_THRESH: 0.5 95 | BG_THRESH_HI: 0.5 96 | BG_THRESH_LO: 0.0 97 | # rcnn bounding box regression params 98 | BBOX_REGRESSION_THRESH: 0.5 99 | BBOX_WEIGHTS: 100 | - 1.0 101 | - 1.0 102 | - 1.0 103 | - 1.0 104 | 105 | # RPN anchor loader 106 | # rpn anchors batch size 107 | RPN_BATCH_SIZE: 256 108 | # rpn anchors sampling params 109 | RPN_FG_FRACTION: 0.5 110 | RPN_POSITIVE_OVERLAP: 0.7 111 | RPN_NEGATIVE_OVERLAP: 0.3 112 | RPN_CLOBBER_POSITIVES: false 113 | # rpn bounding box regression params 114 | RPN_BBOX_WEIGHTS: 115 | - 1.0 116 | - 1.0 117 | - 1.0 118 | - 1.0 119 | RPN_POSITIVE_WEIGHT: -1.0 120 | # used for end2end training 121 | # RPN proposal 122 | CXX_PROPOSAL: true 123 | RPN_NMS_THRESH: 0.7 124 | RPN_PRE_NMS_TOP_N: 6000 125 | RPN_POST_NMS_TOP_N: 300 126 | RPN_MIN_SIZE: 0 127 | # approximate bounding box regression 128 | BBOX_NORMALIZATION_PRECOMPUTED: true 129 | BBOX_MEANS: 130 | - 0.0 131 | - 0.0 132 | - 0.0 133 | - 0.0 134 | BBOX_STDS: 135 | - 0.1 136 | - 0.1 137 | - 0.2 138 | - 0.2 139 | TEST: 140 | # use rpn to generate proposal 141 | HAS_RPN: true 142 | # size of images for each device 143 | BATCH_IMAGES: 1 144 | SEQ_NMS: false 145 | 146 | # RPN proposal 147 | CXX_PROPOSAL: true 148 | RPN_NMS_THRESH: 0.7 149 | RPN_PRE_NMS_TOP_N: 6000 150 | RPN_POST_NMS_TOP_N: 300 151 | RPN_MIN_SIZE: 0 152 | #KEY_FRAME_INTERVAL: 6 153 | # RCNN nms 154 | NMS: 0.44 155 | test_epoch: 1 156 | -------------------------------------------------------------------------------- /experiments/manet_rfcn/manet_rfcn_end2end_train_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fully Motion-Aware Network for Video Object Detection 3 | # Extend FGFA by adding instance-level aggregation and motion pattern reasoning 4 | # Modified by Shiyao Wang 5 | # -------------------------------------------------------- 6 | 7 | import os 8 
| import sys 9 | os.environ['PYTHONUNBUFFERED'] = '1' 10 | os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0' 11 | os.environ['MXNET_ENABLE_GPU_P2P'] = '0' 12 | this_dir = os.path.dirname(__file__) 13 | sys.path.insert(0, os.path.join(this_dir, '..', '..', 'manet_rfcn')) 14 | 15 | import train_end2end 16 | import test 17 | 18 | if __name__ == "__main__": 19 | train_end2end.main() 20 | #test.main() 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /images/table2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/images/table2.png -------------------------------------------------------------------------------- /images/table3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/images/table3.png -------------------------------------------------------------------------------- /images/table4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/images/table4.png -------------------------------------------------------------------------------- /init.bat: -------------------------------------------------------------------------------- 1 | cd /d %~dp0 2 | mkdir .\output 3 | mkdir .\external\mxnet 4 | mkdir .\model\pretrained_model 5 | pause 6 | cd lib\bbox 7 | python setup_windows.py build_ext --inplace 8 | cd ..\nms 9 | python setup_windows.py build_ext --inplace 10 | python setup_windows_cuda.py build_ext --inplace 11 | cd ..\.. 12 | pause 13 | -------------------------------------------------------------------------------- /init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p ./output 4 | mkdir -p ./external/mxnet 5 | mkdir -p ./model/pretrained_model 6 | 7 | cd lib/bbox 8 | python setup_linux.py build_ext --inplace 9 | cd ../nms 10 | python setup_linux.py build_ext --inplace 11 | cd ../.. 
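After `init.sh` (or `init.bat` on Windows) finishes, a quick way to confirm that the Cython extensions were built is to import them directly. A minimal sanity check, assuming it is run from `${MANet_ROOT}/lib/bbox` so that the freshly built `bbox` module is importable:

```
# Quick check that the bbox Cython extension built by init.sh is importable.
# Assumes the current working directory is ${MANet_ROOT}/lib/bbox.
import numpy as np
from bbox import bbox_overlaps_cython  # compiled from bbox.pyx

boxes = np.array([[0.0, 0.0, 9.0, 9.0]])      # one 10x10 box
query = np.array([[5.0, 5.0, 14.0, 14.0]])    # a shifted, overlapping 10x10 box
print(bbox_overlaps_cython(boxes, query))     # ~0.1429 (25 px intersection / 175 px union)
```

The `cpu_nms`/`gpu_nms` extensions under `lib/nms` can be checked in the same way, provided they compiled successfully on your setup.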
12 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | cd nms/; python setup.py build_ext --inplace; rm -rf build; cd ../../ 3 | cd bbox/; python setup.py build_ext --inplace; rm -rf build; cd ../../ 4 | cd dataset/pycocotools/; python setup.py build_ext --inplace; rm -rf build; cd ../../ 5 | clean: 6 | cd nms/; rm *.so *.c *.cpp; cd ../../ 7 | cd bbox/; rm *.so *.c *.cpp; cd ../../ 8 | cd dataset/pycocotools/; rm *.so; cd ../../ 9 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/__init__.py -------------------------------------------------------------------------------- /lib/bbox/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp -------------------------------------------------------------------------------- /lib/bbox/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/bbox/__init__.py -------------------------------------------------------------------------------- /lib/bbox/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Written by Sergey Karayev 7 | # Modified by Yuwen Xiong, from from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 8 | # -------------------------------------------------------- 9 | 10 | cimport cython 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | DTYPE = np.float 15 | ctypedef np.float_t DTYPE_t 16 | 17 | def bbox_overlaps_cython( 18 | np.ndarray[DTYPE_t, ndim=2] boxes, 19 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 20 | """ 21 | Parameters 22 | ---------- 23 | boxes: (N, 4) ndarray of float 24 | query_boxes: (K, 4) ndarray of float 25 | Returns 26 | ------- 27 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 28 | """ 29 | cdef unsigned int N = boxes.shape[0] 30 | cdef unsigned int K = query_boxes.shape[0] 31 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 32 | cdef DTYPE_t iw, ih, box_area 33 | cdef DTYPE_t ua 34 | cdef unsigned int k, n 35 | for k in range(K): 36 | box_area = ( 37 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 38 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 39 | ) 40 | for n in range(N): 41 | iw = ( 42 | min(boxes[n, 2], query_boxes[k, 2]) - 43 | max(boxes[n, 0], query_boxes[k, 0]) + 1 44 | ) 45 | if iw > 0: 46 | ih = ( 47 | min(boxes[n, 3], query_boxes[k, 3]) - 48 | max(boxes[n, 1], query_boxes[k, 1]) + 1 49 | ) 50 | if ih > 0: 51 | ua = float( 52 | (boxes[n, 2] - boxes[n, 0] + 1) * 53 | (boxes[n, 3] - boxes[n, 1] + 1) + 54 | box_area - iw * ih 55 | ) 56 | overlaps[n, k] = iw * ih / ua 57 | return overlaps 58 | -------------------------------------------------------------------------------- /lib/bbox/bbox_regression.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuwen Xiong, from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 7 | # -------------------------------------------------------- 8 | 9 | 10 | """ 11 | This file has functions about generating bounding box regression targets 12 | """ 13 | 14 | import numpy as np 15 | 16 | from bbox_transform import bbox_overlaps, bbox_transform 17 | 18 | 19 | def compute_bbox_regression_targets(rois, overlaps, labels, cfg): 20 | """ 21 | given rois, overlaps, gt labels, compute bounding box regression targets 22 | :param rois: roidb[i]['boxes'] k * 4 23 | :param overlaps: roidb[i]['max_overlaps'] k * 1 24 | :param labels: roidb[i]['max_classes'] k * 1 25 | :return: targets[i][class, dx, dy, dw, dh] k * 5 26 | """ 27 | # Ensure ROIs are floats 28 | rois = rois.astype(np.float, copy=False) 29 | 30 | # Sanity check 31 | if len(rois) != len(overlaps): 32 | print 'bbox regression: this should not happen' 33 | 34 | # Indices of ground-truth ROIs 35 | gt_inds = np.where(overlaps == 1)[0] 36 | if len(gt_inds) == 0: 37 | print 'something wrong : zero ground truth rois' 38 | # Indices of examples for which we try to make predictions 39 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_REGRESSION_THRESH)[0] 40 | 41 | # Get IoU overlap between each ex ROI and gt ROI 42 | ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :]) 43 | 44 | # Find which gt ROI each ex ROI has max overlap with: 45 | # this will be the ex ROI's gt target 46 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 47 | gt_rois = rois[gt_inds[gt_assignment], :] 48 | ex_rois = rois[ex_inds, :] 49 | 50 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 51 | targets[ex_inds, 0] = labels[ex_inds] 52 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) 53 | return targets 54 | 55 | 56 | def add_bbox_regression_targets(roidb, cfg): 57 | """ 58 | given roidb, add ['bbox_targets'] and normalize bounding box regression targets 59 | :param roidb: roidb to be processed. 
must have gone through imdb.prepare_roidb 60 | :return: means, std variances of targets 61 | """ 62 | print 'add bounding box regression targets' 63 | assert len(roidb) > 0 64 | assert 'max_classes' in roidb[0] 65 | 66 | num_images = len(roidb) 67 | num_classes = 2 if cfg.CLASS_AGNOSTIC else roidb[0]['gt_overlaps'].shape[1] 68 | 69 | for im_i in range(num_images): 70 | rois = roidb[im_i]['boxes'] 71 | max_overlaps = roidb[im_i]['max_overlaps'] 72 | max_classes = roidb[im_i]['max_classes'] 73 | roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(rois, max_overlaps, max_classes, cfg) 74 | 75 | if cfg.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: 76 | # use fixed / precomputed means and stds instead of empirical values 77 | means = np.tile(np.array(cfg.TRAIN.BBOX_MEANS), (num_classes, 1)) 78 | stds = np.tile(np.array(cfg.TRAIN.BBOX_STDS), (num_classes, 1)) 79 | else: 80 | # compute mean, std values 81 | class_counts = np.zeros((num_classes, 1)) + 1e-14 82 | sums = np.zeros((num_classes, 4)) 83 | squared_sums = np.zeros((num_classes, 4)) 84 | for im_i in range(num_images): 85 | targets = roidb[im_i]['bbox_targets'] 86 | for cls in range(1, num_classes): 87 | cls_indexes = np.where(targets[:, 0] > 0)[0] if cfg.CLASS_AGNOSTIC else np.where(targets[:, 0] == cls)[0] 88 | if cls_indexes.size > 0: 89 | class_counts[cls] += cls_indexes.size 90 | sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0) 91 | squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0) 92 | 93 | means = sums / class_counts 94 | # var(x) = E(x^2) - E(x)^2 95 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 96 | 97 | print 'bbox target means:' 98 | print means 99 | print means[1:, :].mean(axis=0) # ignore bg class 100 | print 'bbox target stdevs:' 101 | print stds 102 | print stds[1:, :].mean(axis=0) # ignore bg class 103 | 104 | 105 | # normalized targets 106 | for im_i in range(num_images): 107 | targets = roidb[im_i]['bbox_targets'] 108 | for cls in range(1, num_classes): 109 | cls_indexes = np.where(targets[:, 0] > 0) if cfg.CLASS_AGNOSTIC else np.where(targets[:, 0] == cls)[0] 110 | roidb[im_i]['bbox_targets'][cls_indexes, 1:] -= means[cls, :] 111 | roidb[im_i]['bbox_targets'][cls_indexes, 1:] /= stds[cls, :] 112 | 113 | return means.ravel(), stds.ravel() 114 | 115 | 116 | def expand_bbox_regression_targets(bbox_targets_data, num_classes, cfg): 117 | """ 118 | expand from 5 to 4 * num_classes; only the right class has non-zero bbox regression targets 119 | :param bbox_targets_data: [k * 5] 120 | :param num_classes: number of classes 121 | :return: bbox target processed [k * 4 num_classes] 122 | bbox_weights ! only foreground boxes have bbox regression computation! 
123 | """ 124 | classes = bbox_targets_data[:, 0] 125 | if cfg.CLASS_AGNOSTIC: 126 | num_classes = 2 127 | bbox_targets = np.zeros((classes.size, 4 * num_classes), dtype=np.float32) 128 | bbox_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 129 | delta_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 130 | indexes = np.where(classes > 0)[0] 131 | for index in indexes: 132 | cls = classes[index] 133 | start = int(4 * 1 if cls > 0 else 0) if cfg.CLASS_AGNOSTIC else int(4 * cls) 134 | end = start + 4 135 | bbox_targets[index, start:end] = bbox_targets_data[index, 1:] 136 | bbox_weights[index, start:end] = cfg.TRAIN.BBOX_WEIGHTS 137 | delta_weights[index, start:end] = cfg.TRAIN.BBOX_WEIGHTS 138 | return bbox_targets, bbox_weights, delta_weights 139 | 140 | -------------------------------------------------------------------------------- /lib/bbox/bbox_transform.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from bbox import bbox_overlaps_cython 3 | 4 | 5 | def bbox_overlaps(boxes, query_boxes): 6 | return bbox_overlaps_cython(boxes, query_boxes) 7 | 8 | 9 | def bbox_overlaps_py(boxes, query_boxes): 10 | """ 11 | determine overlaps between boxes and query_boxes 12 | :param boxes: n * 4 bounding boxes 13 | :param query_boxes: k * 4 bounding boxes 14 | :return: overlaps: n * k overlaps 15 | """ 16 | n_ = boxes.shape[0] 17 | k_ = query_boxes.shape[0] 18 | overlaps = np.zeros((n_, k_), dtype=np.float) 19 | for k in range(k_): 20 | query_box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1) 21 | for n in range(n_): 22 | iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + 1 23 | if iw > 0: 24 | ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + 1 25 | if ih > 0: 26 | box_area = (boxes[n, 2] - boxes[n, 0] + 1) * (boxes[n, 3] - boxes[n, 1] + 1) 27 | all_area = float(box_area + query_box_area - iw * ih) 28 | overlaps[n, k] = iw * ih / all_area 29 | return overlaps 30 | 31 | 32 | def clip_boxes(boxes, im_shape): 33 | """ 34 | Clip boxes to image boundaries. 35 | :param boxes: [N, 4* num_classes] 36 | :param im_shape: tuple of 2 37 | :return: [N, 4* num_classes] 38 | """ 39 | # x1 >= 0 40 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 41 | # y1 >= 0 42 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 43 | # x2 < im_shape[1] 44 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 45 | # y2 < im_shape[0] 46 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 47 | return boxes 48 | 49 | def filter_boxes(boxes, min_size): 50 | """ 51 | filter small boxes. 
52 | :param boxes: [N, 4* num_classes] 53 | :param min_size: 54 | :return: keep: 55 | """ 56 | ws = boxes[:, 2] - boxes[:, 0] + 1 57 | hs = boxes[:, 3] - boxes[:, 1] + 1 58 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 59 | return keep 60 | 61 | def nonlinear_transform(ex_rois, gt_rois): 62 | """ 63 | compute bounding box regression targets from ex_rois to gt_rois 64 | :param ex_rois: [N, 4] 65 | :param gt_rois: [N, 4] 66 | :return: [N, 4] 67 | """ 68 | assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' 69 | 70 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 71 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 72 | ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0) 73 | ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0) 74 | 75 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 76 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 77 | gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0) 78 | gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0) 79 | 80 | targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14) 81 | targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14) 82 | targets_dw = np.log(gt_widths / ex_widths) 83 | targets_dh = np.log(gt_heights / ex_heights) 84 | 85 | targets = np.vstack( 86 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 87 | return targets 88 | 89 | 90 | def nonlinear_pred(boxes, box_deltas): 91 | """ 92 | Transform the set of class-agnostic boxes into class-specific boxes 93 | by applying the predicted offsets (box_deltas) 94 | :param boxes: !important [N 4] 95 | :param box_deltas: [N, 4 * num_classes] 96 | :return: [N 4 * num_classes] 97 | """ 98 | if boxes.shape[0] == 0: 99 | return np.zeros((0, box_deltas.shape[1])) 100 | 101 | boxes = boxes.astype(np.float, copy=False) 102 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 103 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 104 | ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) 105 | ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) 106 | 107 | dx = box_deltas[:, 0::4] 108 | dy = box_deltas[:, 1::4] 109 | dw = box_deltas[:, 2::4] 110 | dh = box_deltas[:, 3::4] 111 | 112 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 113 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 114 | pred_w = np.exp(dw) * widths[:, np.newaxis] 115 | pred_h = np.exp(dh) * heights[:, np.newaxis] 116 | 117 | pred_boxes = np.zeros(box_deltas.shape) 118 | # x1 119 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0) 120 | # y1 121 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0) 122 | # x2 123 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0) 124 | # y2 125 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0) 126 | 127 | return pred_boxes 128 | 129 | 130 | def iou_transform(ex_rois, gt_rois): 131 | """ return bbox targets, IoU loss uses gt_rois as gt """ 132 | assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' 133 | return gt_rois 134 | 135 | 136 | def iou_pred(boxes, box_deltas): 137 | """ 138 | Transform the set of class-agnostic boxes into class-specific boxes 139 | by applying the predicted offsets (box_deltas) 140 | :param boxes: !important [N 4] 141 | :param box_deltas: [N, 4 * num_classes] 142 | :return: [N 4 * num_classes] 143 | """ 144 | if boxes.shape[0] == 0: 145 | return np.zeros((0, box_deltas.shape[1])) 146 | 147 | boxes = boxes.astype(np.float, copy=False) 148 | x1 = boxes[:, 0] 149 | y1 = boxes[:, 1] 150 | x2 = boxes[:, 2] 151 | y2 = boxes[:, 3] 152 | 153 | dx1 = box_deltas[:, 0::4] 154 | dy1 = box_deltas[:, 1::4] 
155 | dx2 = box_deltas[:, 2::4] 156 | dy2 = box_deltas[:, 3::4] 157 | 158 | pred_boxes = np.zeros(box_deltas.shape) 159 | # x1 160 | pred_boxes[:, 0::4] = dx1 + x1[:, np.newaxis] 161 | # y1 162 | pred_boxes[:, 1::4] = dy1 + y1[:, np.newaxis] 163 | # x2 164 | pred_boxes[:, 2::4] = dx2 + x2[:, np.newaxis] 165 | # y2 166 | pred_boxes[:, 3::4] = dy2 + y2[:, np.newaxis] 167 | 168 | return pred_boxes 169 | 170 | 171 | # define bbox_transform and bbox_pred 172 | bbox_transform = nonlinear_transform 173 | bbox_pred = nonlinear_pred 174 | -------------------------------------------------------------------------------- /lib/bbox/setup_linux.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 7 | # -------------------------------------------------------- 8 | 9 | 10 | import os 11 | from os.path import join as pjoin 12 | from setuptools import setup 13 | from distutils.extension import Extension 14 | from Cython.Distutils import build_ext 15 | import numpy as np 16 | 17 | # Obtain the numpy include directory. This logic works across numpy versions. 18 | try: 19 | numpy_include = np.get_include() 20 | except AttributeError: 21 | numpy_include = np.get_numpy_include() 22 | 23 | 24 | def customize_compiler_for_nvcc(self): 25 | """inject deep into distutils to customize how the dispatch 26 | to gcc/nvcc works. 27 | If you subclass UnixCCompiler, it's not trivial to get your subclass 28 | injected in, and still have the right customizations (i.e. 29 | distutils.sysconfig.customize_compiler) run on it. So instead of going 30 | the OO route, I have this. Note, it's kindof like a wierd functional 31 | subclassing going on.""" 32 | 33 | # tell the compiler it can processes .cu 34 | self.src_extensions.append('.cu') 35 | 36 | # save references to the default compiler_so and _comple methods 37 | default_compiler_so = self.compiler_so 38 | super = self._compile 39 | 40 | # now redefine the _compile method. This gets executed for each 41 | # object but distutils doesn't have the ability to change compilers 42 | # based on source extension: we add it. 
43 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 44 | if os.path.splitext(src)[1] == '.cu': 45 | # use the cuda for .cu files 46 | self.set_executable('compiler_so', CUDA['nvcc']) 47 | # use only a subset of the extra_postargs, which are 1-1 translated 48 | # from the extra_compile_args in the Extension class 49 | postargs = extra_postargs['nvcc'] 50 | else: 51 | postargs = extra_postargs['gcc'] 52 | 53 | super(obj, src, ext, cc_args, postargs, pp_opts) 54 | # reset the default compiler_so, which we might have changed for cuda 55 | self.compiler_so = default_compiler_so 56 | 57 | # inject our redefined _compile method into the class 58 | self._compile = _compile 59 | 60 | 61 | # run the customize_compiler 62 | class custom_build_ext(build_ext): 63 | def build_extensions(self): 64 | customize_compiler_for_nvcc(self.compiler) 65 | build_ext.build_extensions(self) 66 | 67 | 68 | ext_modules = [ 69 | Extension( 70 | "bbox", 71 | ["bbox.pyx"], 72 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 73 | include_dirs=[numpy_include] 74 | ), 75 | ] 76 | 77 | setup( 78 | name='bbox_cython', 79 | ext_modules=ext_modules, 80 | # inject our custom trigger 81 | cmdclass={'build_ext': custom_build_ext}, 82 | ) 83 | -------------------------------------------------------------------------------- /lib/bbox/setup_windows.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 7 | # -------------------------------------------------------- 8 | 9 | import numpy as np 10 | import os 11 | from os.path import join as pjoin 12 | #from distutils.core import setup 13 | from setuptools import setup 14 | from distutils.extension import Extension 15 | from Cython.Distutils import build_ext 16 | import subprocess 17 | 18 | #change for windows, by MrX 19 | nvcc_bin = 'nvcc.exe' 20 | lib_dir = 'lib/x64' 21 | 22 | import distutils.msvc9compiler 23 | distutils.msvc9compiler.VERSION = 14.0 24 | 25 | # Obtain the numpy include directory. This logic works across numpy versions. 26 | try: 27 | numpy_include = np.get_include() 28 | except AttributeError: 29 | numpy_include = np.get_numpy_include() 30 | 31 | ext_modules = [ 32 | # unix _compile: obj, src, ext, cc_args, extra_postargs, pp_opts 33 | Extension( 34 | "bbox", 35 | sources=["bbox.pyx"], 36 | extra_compile_args={}, 37 | include_dirs = [numpy_include] 38 | ), 39 | ] 40 | 41 | setup( 42 | name='fast_rcnn', 43 | ext_modules=ext_modules, 44 | # inject our custom trigger 45 | cmdclass={'build_ext': build_ext}, 46 | ) 47 | -------------------------------------------------------------------------------- /lib/dataset/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | 3 | 4 | def set_prefs(prefs): 5 | """This function is called before opening the project""" 6 | 7 | # Specify which files and folders to ignore in the project. 8 | # Changes to ignored resources are not added to the history and 9 | # VCSs. Also they are not returned in `Project.get_files()`. 10 | # Note that ``?`` and ``*`` match all characters but slashes. 
11 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 12 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 13 | # '.svn': matches 'pkg/.svn' and all of its children 14 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 15 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 16 | prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', 17 | '.hg', '.svn', '_svn', '.git'] 18 | 19 | # Specifies which files should be considered python files. It is 20 | # useful when you have scripts inside your project. Only files 21 | # ending with ``.py`` are considered to be python files by 22 | # default. 23 | #prefs['python_files'] = ['*.py'] 24 | 25 | # Custom source folders: By default rope searches the project 26 | # for finding source folders (folders that should be searched 27 | # for finding modules). You can add paths to that list. Note 28 | # that rope guesses project source folders correctly most of the 29 | # time; use this if you have any problems. 30 | # The folders should be relative to project root and use '/' for 31 | # separating folders regardless of the platform rope is running on. 32 | # 'src/my_source_folder' for instance. 33 | #prefs.add('source_folders', 'src') 34 | 35 | # You can extend python path for looking up modules 36 | #prefs.add('python_path', '~/python/') 37 | 38 | # Should rope save object information or not. 39 | prefs['save_objectdb'] = True 40 | prefs['compress_objectdb'] = False 41 | 42 | # If `True`, rope analyzes each module when it is being saved. 43 | prefs['automatic_soa'] = True 44 | # The depth of calls to follow in static object analysis 45 | prefs['soa_followed_calls'] = 0 46 | 47 | # If `False` when running modules or unit tests "dynamic object 48 | # analysis" is turned off. This makes them much faster. 49 | prefs['perform_doa'] = True 50 | 51 | # Rope can check the validity of its object DB when running. 52 | prefs['validate_objectdb'] = True 53 | 54 | # How many undos to hold? 55 | prefs['max_history_items'] = 32 56 | 57 | # Shows whether to save history across sessions. 58 | prefs['save_history'] = True 59 | prefs['compress_history'] = False 60 | 61 | # Set the number spaces used for indenting. According to 62 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 63 | # unit-tests use 4 spaces it is more reliable, too. 64 | prefs['indent_size'] = 4 65 | 66 | # Builtin and c-extension modules that are allowed to be imported 67 | # and inspected by rope. 68 | prefs['extension_modules'] = [] 69 | 70 | # Add all standard c-extensions to extension_modules list. 71 | prefs['import_dynload_stdmods'] = True 72 | 73 | # If `True` modules with syntax errors are considered to be empty. 74 | # The default value is `False`; When `False` syntax errors raise 75 | # `rope.base.exceptions.ModuleSyntaxError` exception. 76 | prefs['ignore_syntax_errors'] = False 77 | 78 | # If `True`, rope ignores unresolvable imports. Otherwise, they 79 | # appear in the importing namespace. 80 | prefs['ignore_bad_imports'] = False 81 | 82 | 83 | def project_opened(project): 84 | """This function is called after opening the project""" 85 | # Do whatever you like here! 
86 | -------------------------------------------------------------------------------- /lib/dataset/.ropeproject/globalnames: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/dataset/.ropeproject/globalnames -------------------------------------------------------------------------------- /lib/dataset/.ropeproject/history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/dataset/.ropeproject/history -------------------------------------------------------------------------------- /lib/dataset/.ropeproject/objectdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/dataset/.ropeproject/objectdb -------------------------------------------------------------------------------- /lib/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from imdb import IMDB 2 | from imagenet_vid import ImageNetVID 3 | -------------------------------------------------------------------------------- /lib/dataset/ds_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def unique_boxes(boxes, scale=1.0): 5 | """ return indices of unique boxes """ 6 | v = np.array([1, 1e3, 1e6, 1e9]) 7 | hashes = np.round(boxes * scale).dot(v) 8 | _, index = np.unique(hashes, return_index=True) 9 | return np.sort(index) 10 | 11 | 12 | def filter_small_boxes(boxes, min_size): 13 | w = boxes[:, 2] - boxes[:, 0] 14 | h = boxes[:, 3] - boxes[:, 1] 15 | keep = np.where((w >= min_size) & (h > min_size))[0] 16 | return keep -------------------------------------------------------------------------------- /lib/dataset/imagenet_vid_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Xizhou Zhu 6 | # -------------------------------------------------------- 7 | 8 | """ 9 | given a imagenet vid imdb, compute mAP 10 | """ 11 | 12 | import numpy as np 13 | import os 14 | import cPickle 15 | 16 | 17 | def parse_vid_rec(filename, classhash, img_ids, defaultIOUthr=0.5, pixelTolerance=10): 18 | """ 19 | parse imagenet vid record into a dictionary 20 | :param filename: xml file path 21 | :return: list of dict 22 | """ 23 | import xml.etree.ElementTree as ET 24 | tree = ET.parse(filename) 25 | objects = [] 26 | for obj in tree.findall('object'): 27 | obj_dict = dict() 28 | obj_dict['label'] = classhash[obj.find('name').text] 29 | bbox = obj.find('bndbox') 30 | obj_dict['bbox'] = [float(bbox.find('xmin').text), 31 | float(bbox.find('ymin').text), 32 | float(bbox.find('xmax').text), 33 | float(bbox.find('ymax').text)] 34 | gt_w = obj_dict['bbox'][2] - obj_dict['bbox'][0] + 1 35 | gt_h = obj_dict['bbox'][3] - obj_dict['bbox'][1] + 1 36 | thr = (gt_w*gt_h)/((gt_w+pixelTolerance)*(gt_h+pixelTolerance)) 37 | obj_dict['thr'] = np.min([thr, defaultIOUthr]) 38 | objects.append(obj_dict) 39 | return {'bbox' : np.array([x['bbox'] for x 
in objects]), 40 | 'label': np.array([x['label'] for x in objects]), 41 | 'thr' : np.array([x['thr'] for x in objects]), 42 | 'img_ids': img_ids} 43 | 44 | 45 | def vid_ap(rec, prec): 46 | """ 47 | average precision calculations 48 | [precision integrated to recall] 49 | :param rec: recall 50 | :param prec: precision 51 | :return: average precision 52 | """ 53 | 54 | # append sentinel values at both ends 55 | mrec = np.concatenate(([0.], rec, [1.])) 56 | mpre = np.concatenate(([0.], prec, [0.])) 57 | 58 | # compute precision integration ladder 59 | for i in range(mpre.size - 1, 0, -1): 60 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 61 | 62 | # look for recall value changes 63 | i = np.where(mrec[1:] != mrec[:-1])[0] 64 | 65 | # sum (\delta recall) * prec 66 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 67 | return ap 68 | 69 | 70 | def vid_eval(multifiles, detpath, annopath, imageset_file, classname_map, annocache, ovthresh=0.5): 71 | """ 72 | imagenet vid evaluation 73 | :param detpath: detection results detpath.format(classname) 74 | :param annopath: annotations annopath.format(classname) 75 | :param imageset_file: text file containing list of images 76 | :param annocache: caching annotations 77 | :param ovthresh: overlap threshold 78 | :return: rec, prec, ap 79 | """ 80 | with open(imageset_file, 'r') as f: 81 | lines = [x.strip().split(' ') for x in f.readlines()] 82 | img_basenames = [x[0] for x in lines] 83 | gt_img_ids = [int(x[1]) for x in lines] 84 | classhash = dict(zip(classname_map, range(0,len(classname_map)))) 85 | 86 | # load annotations from cache 87 | if not os.path.isfile(annocache): 88 | recs = [] 89 | for ind, image_filename in enumerate(img_basenames): 90 | recs.append(parse_vid_rec(annopath.format('VID/' + image_filename), classhash, gt_img_ids[ind])) 91 | if ind % 100 == 0: 92 | print 'reading annotations for {:d}/{:d}'.format(ind + 1, len(img_basenames)) 93 | print 'saving annotations cache to {:s}'.format(annocache) 94 | with open(annocache, 'wb') as f: 95 | cPickle.dump(recs, f, protocol=cPickle.HIGHEST_PROTOCOL) 96 | else: 97 | with open(annocache, 'rb') as f: 98 | recs = cPickle.load(f) 99 | 100 | # extract objects in :param classname: 101 | npos = np.zeros(len(classname_map)) 102 | for rec in recs: 103 | rec_labels = rec['label'] 104 | for x in rec_labels: 105 | npos[x] += 1 106 | 107 | # read detections 108 | splitlines = [] 109 | if (multifiles == False): 110 | with open(detpath, 'r') as f: 111 | lines = f.readlines() 112 | splitlines = [x.strip().split(' ') for x in lines] 113 | else: 114 | for det in detpath: 115 | with open(det, 'r') as f: 116 | lines = f.readlines() 117 | splitlines += [x.strip().split(' ') for x in lines] 118 | 119 | img_ids = np.array([int(x[0]) for x in splitlines]) 120 | obj_labels = np.array([int(x[1]) for x in splitlines]) 121 | obj_confs = np.array([float(x[2]) for x in splitlines]) 122 | obj_bboxes = np.array([[float(z) for z in x[3:]] for x in splitlines]) 123 | 124 | # sort by confidence 125 | if obj_bboxes.shape[0] > 0: 126 | sorted_inds = np.argsort(img_ids) 127 | img_ids = img_ids[sorted_inds] 128 | obj_labels = obj_labels[sorted_inds] 129 | obj_confs = obj_confs[sorted_inds] 130 | obj_bboxes = obj_bboxes[sorted_inds, :] 131 | 132 | num_imgs = max(max(gt_img_ids),max(img_ids)) + 1 133 | obj_labels_cell = [None] * num_imgs 134 | obj_confs_cell = [None] * num_imgs 135 | obj_bboxes_cell = [None] * num_imgs 136 | start_i = 0 137 | id = img_ids[0] 138 | for i in range(0, len(img_ids)): 139 | if i == len(img_ids)-1 or 
img_ids[i+1] != id: 140 | conf = obj_confs[start_i:i+1] 141 | label = obj_labels[start_i:i+1] 142 | bbox = obj_bboxes[start_i:i+1, :] 143 | sorted_inds = np.argsort(-conf) 144 | 145 | obj_labels_cell[id] = label[sorted_inds] 146 | obj_confs_cell[id] = conf[sorted_inds] 147 | obj_bboxes_cell[id] = bbox[sorted_inds, :] 148 | if i < len(img_ids)-1: 149 | id = img_ids[i+1] 150 | start_i = i+1 151 | 152 | 153 | # go down detections and mark true positives and false positives 154 | tp_cell = [None] * num_imgs 155 | fp_cell = [None] * num_imgs 156 | 157 | for rec in recs: 158 | id = rec['img_ids'] 159 | gt_labels = rec['label'] 160 | gt_bboxes = rec['bbox'] 161 | gt_thr = rec['thr'] 162 | num_gt_obj = len(gt_labels) 163 | gt_detected = np.zeros(num_gt_obj) 164 | 165 | labels = obj_labels_cell[id] 166 | bboxes = obj_bboxes_cell[id] 167 | 168 | num_obj = 0 if labels is None else len(labels) 169 | tp = np.zeros(num_obj) 170 | fp = np.zeros(num_obj) 171 | 172 | for j in range(0,num_obj): 173 | bb = bboxes[j, :] 174 | ovmax = -1 175 | kmax = -1 176 | for k in range(0,num_gt_obj): 177 | if labels[j] != gt_labels[k]: 178 | continue 179 | if gt_detected[k] > 0: 180 | continue 181 | bbgt = gt_bboxes[k, :] 182 | bi=[np.max((bb[0],bbgt[0])), np.max((bb[1],bbgt[1])), np.min((bb[2],bbgt[2])), np.min((bb[3],bbgt[3]))] 183 | iw=bi[2]-bi[0]+1 184 | ih=bi[3]-bi[1]+1 185 | if iw>0 and ih>0: 186 | # compute overlap as area of intersection / area of union 187 | ua = (bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + \ 188 | (bbgt[2] - bbgt[0] + 1.) * \ 189 | (bbgt[3] - bbgt[1] + 1.) - iw*ih 190 | ov=iw*ih/ua 191 | # makes sure that this object is detected according 192 | # to its individual threshold 193 | if ov >= gt_thr[k] and ov > ovmax: 194 | ovmax=ov 195 | kmax=k 196 | if kmax >= 0: 197 | tp[j] = 1 198 | gt_detected[kmax] = 1 199 | else: 200 | fp[j] = 1 201 | 202 | tp_cell[id] = tp 203 | fp_cell[id] = fp 204 | 205 | tp_all = np.concatenate([x for x in np.array(tp_cell)[gt_img_ids] if x is not None]) 206 | fp_all = np.concatenate([x for x in np.array(fp_cell)[gt_img_ids] if x is not None]) 207 | obj_labels = np.concatenate([x for x in np.array(obj_labels_cell)[gt_img_ids] if x is not None]) 208 | confs = np.concatenate([x for x in np.array(obj_confs_cell)[gt_img_ids] if x is not None]) 209 | 210 | sorted_inds = np.argsort(-confs) 211 | tp_all = tp_all[sorted_inds] 212 | fp_all = fp_all[sorted_inds] 213 | obj_labels = obj_labels[sorted_inds] 214 | 215 | ap = np.zeros(len(classname_map)) 216 | for c in range(1, len(classname_map)): 217 | # compute precision recall 218 | fp = np.cumsum(fp_all[obj_labels == c]) 219 | tp = np.cumsum(tp_all[obj_labels == c]) 220 | rec = tp / float(npos[c]) 221 | # avoid division by zero in case first detection matches a difficult ground ruth 222 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 223 | ap[c] = vid_ap(rec, prec) 224 | ap = ap[1:] 225 | return ap 226 | -------------------------------------------------------------------------------- /lib/dataset/imagenet_vid_groundtruth_motion_iou.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/dataset/imagenet_vid_groundtruth_motion_iou.mat -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1].astype('i') 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert 
sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int32_t, ndim=1] \ 26 | order = scores.argsort()[::-1].astype(np.int32) 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /lib/nms/nms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cpu_nms import cpu_nms 4 | from gpu_nms import gpu_nms 5 | 6 | def py_nms_wrapper(thresh): 7 | def _nms(dets): 8 | return nms(dets, thresh) 9 | return _nms 10 | 11 | 12 | def cpu_nms_wrapper(thresh): 13 | def _nms(dets): 14 | return cpu_nms(dets, thresh) 15 | return _nms 16 | 17 | 18 | def gpu_nms_wrapper(thresh, device_id): 19 | def _nms(dets): 20 | return gpu_nms(dets, thresh, device_id) 21 | return _nms 22 | 23 | 24 | def nms(dets, thresh): 25 | """ 26 | greedily select boxes with high confidence and overlap with current maximum <= thresh 27 | rule out overlap >= thresh 28 | :param dets: [[x1, y1, x2, y2 score]] 29 | :param thresh: retain overlap < thresh 30 | :return: indexes to keep 31 | """ 32 | if dets.shape[0] == 0: 33 | return [] 34 | 35 | x1 = dets[:, 0] 36 | y1 = dets[:, 1] 37 | x2 = dets[:, 2] 38 | y2 = dets[:, 3] 39 | scores = dets[:, 4] 40 | 41 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 42 | order = scores.argsort()[::-1] 43 | 44 | keep = [] 45 | while order.size > 0: 46 | i = order[0] 47 | keep.append(i) 48 | xx1 = np.maximum(x1[i], x1[order[1:]]) 49 | yy1 = np.maximum(y1[i], y1[order[1:]]) 50 | xx2 = np.minimum(x2[i], x2[order[1:]]) 51 | yy2 = np.minimum(y2[i], y2[order[1:]]) 52 | 53 | w = np.maximum(0.0, xx2 - xx1 + 1) 54 | h = np.maximum(0.0, yy2 - yy1 + 1) 55 | inter = w * h 56 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 57 | 58 | inds = np.where(ovr <= thresh)[0] 59 | order = order[inds + 1] 60 | 61 | return keep 62 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Deformable Convolutional Networks 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License 5 | // Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn) 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline 
float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
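  // --- annotation (not part of the original source) -------------------------
  // The early return above means cudaSetDevice is only issued when the caller
  // asks for a GPU other than the one already current, so repeated NMS calls
  // on the same device avoid redundant device switches.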
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/nms/seq_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuqing Zhu, Xizhou Zhu 7 | # -------------------------------------------------------- 8 | 9 | 10 | import numpy as np 11 | 12 | import profile 13 | import cv2 14 | import time 15 | import copy 16 | import cPickle as pickle 17 | import os 18 | 19 | CLASSES = ('__background__', 20 | 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 21 | 'car', 'cattle', 'dog', 'domestic cat', 'elephant', 'fox', 22 | 'giant panda', 'hamster', 'horse', 'lion', 'lizard', 'monkey', 23 | 'motorcycle', 'rabbit', 'red panda', 'sheep', 'snake', 'squirrel', 24 | 'tiger', 'train', 'turtle', 'watercraft', 'whale', 'zebra') 25 | 26 | 27 | NMS_THRESH = 0.3 28 | IOU_THRESH = 0.5 29 | MAX_THRESH=1e-2 30 | 31 | 32 | def createLinks(dets_all): 33 | links_all = [] 34 | 35 | frame_num = len(dets_all[0]) 36 | cls_num = len(CLASSES) - 1 37 | for cls_ind in range(cls_num): 38 | links_cls = [] 39 | for frame_ind in range(frame_num - 1): 40 | dets1 = dets_all[cls_ind][frame_ind] 41 | dets2 = dets_all[cls_ind][frame_ind + 1] 42 | box1_num = len(dets1) 43 | box2_num = len(dets2) 44 | 45 | if frame_ind == 0: 46 | areas1 = np.empty(box1_num) 47 | for box1_ind, box1 in enumerate(dets1): 48 | areas1[box1_ind] = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1) 49 | else: 50 | areas1 = areas2 51 | 52 | areas2 = np.empty(box2_num) 53 | 
for box2_ind, box2 in enumerate(dets2): 54 | areas2[box2_ind] = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1) 55 | 56 | links_frame = [] 57 | for box1_ind, box1 in enumerate(dets1): 58 | area1 = areas1[box1_ind] 59 | x1 = np.maximum(box1[0], dets2[:, 0]) 60 | y1 = np.maximum(box1[1], dets2[:, 1]) 61 | x2 = np.minimum(box1[2], dets2[:, 2]) 62 | y2 = np.minimum(box1[3], dets2[:, 3]) 63 | w = np.maximum(0.0, x2 - x1 + 1) 64 | h = np.maximum(0.0, y2 - y1 + 1) 65 | inter = w * h 66 | ovrs = inter / (area1 + areas2 - inter) 67 | links_box = [ovr_ind for ovr_ind, ovr in enumerate(ovrs) if 68 | ovr >= IOU_THRESH] 69 | links_frame.append(links_box) 70 | links_cls.append(links_frame) 71 | links_all.append(links_cls) 72 | return links_all 73 | 74 | 75 | def maxPath(dets_all, links_all): 76 | 77 | for cls_ind, links_cls in enumerate(links_all): 78 | 79 | max_begin = time.time() 80 | delete_sets=[[]for i in range(0,len(dets_all[0]))] 81 | delete_single_box=[] 82 | dets_cls = dets_all[cls_ind] 83 | 84 | num_path=0 85 | # compute the number of links 86 | sum_links=0 87 | for frame_ind, frame in enumerate(links_cls): 88 | for box_ind,box in enumerate(frame): 89 | sum_links+=len(box) 90 | 91 | while True: 92 | 93 | num_path+=1 94 | 95 | rootindex, maxpath, maxsum = findMaxPath(links_cls, dets_cls,delete_single_box) 96 | 97 | if (maxsuma[i,next_box_id]): 151 | a[i,next_box_id]=weight_new 152 | b[i,next_box_id]=box_id 153 | 154 | i,j=np.unravel_index(a.argmax(),a.shape) 155 | 156 | maxpath=[j] 157 | maxscore=a[i,j] 158 | while(b[i,j]!=-1): 159 | 160 | maxpath.append(b[i,j]) 161 | j=b[i,j] 162 | i=i-1 163 | 164 | 165 | rootindex=i 166 | maxpath.reverse() 167 | return rootindex, maxpath, maxscore 168 | 169 | 170 | def rescore(dets, rootindex, maxpath, maxsum): 171 | newscore = maxsum / len(maxpath) 172 | 173 | for i, box_ind in enumerate(maxpath): 174 | dets[rootindex + i][box_ind][4] = newscore 175 | 176 | 177 | def deleteLink(dets, links, rootindex, maxpath, thesh): 178 | 179 | delete_set=[] 180 | num_delete_links=0 181 | 182 | for i, box_ind in enumerate(maxpath): 183 | areas = [(box[2] - box[0] + 1) * (box[3] - box[1] + 1) for box in dets[rootindex + i]] 184 | area1 = areas[box_ind] 185 | box1 = dets[rootindex + i][box_ind] 186 | x1 = np.maximum(box1[0], dets[rootindex + i][:, 0]) 187 | y1 = np.maximum(box1[1], dets[rootindex + i][:, 1]) 188 | x2 = np.minimum(box1[2], dets[rootindex + i][:, 2]) 189 | y2 = np.minimum(box1[3], dets[rootindex + i][:, 3]) 190 | w = np.maximum(0.0, x2 - x1 + 1) 191 | h = np.maximum(0.0, y2 - y1 + 1) 192 | inter = w * h 193 | 194 | ovrs = inter / (area1 + areas - inter) 195 | #saving the box need to delete 196 | deletes = [ovr_ind for ovr_ind, ovr in enumerate(ovrs) if ovr >= 0.3] 197 | delete_set.append(deletes) 198 | 199 | #delete the links except for the last frame 200 | if rootindex + i < len(links): 201 | for delete_ind in deletes: 202 | num_delete_links+=len(links[rootindex+i][delete_ind]) 203 | links[rootindex + i][delete_ind] = [] 204 | 205 | if i > 0 or rootindex > 0: 206 | 207 | #delete the links which point to box_ind 208 | for priorbox in links[rootindex + i - 1]: 209 | for delete_ind in deletes: 210 | if delete_ind in priorbox: 211 | priorbox.remove(delete_ind) 212 | num_delete_links+=1 213 | 214 | return delete_set,num_delete_links 215 | 216 | def seq_nms(dets): 217 | links = createLinks(dets) 218 | dets=maxPath(dets, links) 219 | return dets 220 | 221 | -------------------------------------------------------------------------------- /lib/nms/setup_linux.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | from setuptools import setup 11 | from distutils.extension import Extension 12 | from Cython.Distutils import build_ext 13 | import numpy as np 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # Adapted fom 19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 20 | for dir in path.split(os.pathsep): 21 | binpath = pjoin(dir, name) 22 | if os.path.exists(binpath): 23 | return os.path.abspath(binpath) 24 | return None 25 | 26 | 27 | def locate_cuda(): 28 | """Locate the CUDA environment on the system 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | Starts by looking for the CUDAHOME env variable. If not found, everything 32 | is based on finding 'nvcc' in the PATH. 33 | """ 34 | 35 | # first check if the CUDAHOME env variable is in use 36 | if 'CUDAHOME' in os.environ: 37 | home = os.environ['CUDAHOME'] 38 | nvcc = pjoin(home, 'bin', 'nvcc') 39 | else: 40 | # otherwise, search the PATH for NVCC 41 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 42 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 43 | if nvcc is None: 44 | raise EnvironmentError('The nvcc binary could not be ' 45 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 46 | home = os.path.dirname(os.path.dirname(nvcc)) 47 | 48 | cudaconfig = {'home':home, 'nvcc':nvcc, 49 | 'include': pjoin(home, 'include'), 50 | 'lib64': pjoin(home, 'lib64')} 51 | for k, v in cudaconfig.iteritems(): 52 | if not os.path.exists(v): 53 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 54 | 55 | return cudaconfig 56 | CUDA = locate_cuda() 57 | 58 | 59 | # Obtain the numpy include directory. This logic works across numpy versions. 60 | try: 61 | numpy_include = np.get_include() 62 | except AttributeError: 63 | numpy_include = np.get_numpy_include() 64 | 65 | 66 | def customize_compiler_for_nvcc(self): 67 | """inject deep into distutils to customize how the dispatch 68 | to gcc/nvcc works. 69 | If you subclass UnixCCompiler, it's not trivial to get your subclass 70 | injected in, and still have the right customizations (i.e. 71 | distutils.sysconfig.customize_compiler) run on it. So instead of going 72 | the OO route, I have this. Note, it's kindof like a wierd functional 73 | subclassing going on.""" 74 | 75 | # tell the compiler it can processes .cu 76 | self.src_extensions.append('.cu') 77 | 78 | # save references to the default compiler_so and _comple methods 79 | default_compiler_so = self.compiler_so 80 | super = self._compile 81 | 82 | # now redefine the _compile method. This gets executed for each 83 | # object but distutils doesn't have the ability to change compilers 84 | # based on source extension: we add it. 
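    # --- annotation (not part of the original source) ------------------------
    # The replacement _compile below dispatches per source file: '.cu' sources
    # are handed to nvcc together with the 'nvcc' entry of extra_compile_args,
    # while every other source keeps the default compiler and the 'gcc' entry.
    # This is why the Extension definitions further down pass extra_compile_args
    # as a dict with 'gcc'/'nvcc' keys rather than the usual flat list.
    # These setup scripts appear to be driven by lib/Makefile via init.sh; a
    # manual build would look roughly like `python setup_linux.py build_ext
    # --inplace` (command shown as an assumption, not taken from this repo).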
85 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 86 | if os.path.splitext(src)[1] == '.cu': 87 | # use the cuda for .cu files 88 | self.set_executable('compiler_so', CUDA['nvcc']) 89 | # use only a subset of the extra_postargs, which are 1-1 translated 90 | # from the extra_compile_args in the Extension class 91 | postargs = extra_postargs['nvcc'] 92 | else: 93 | postargs = extra_postargs['gcc'] 94 | 95 | super(obj, src, ext, cc_args, postargs, pp_opts) 96 | # reset the default compiler_so, which we might have changed for cuda 97 | self.compiler_so = default_compiler_so 98 | 99 | # inject our redefined _compile method into the class 100 | self._compile = _compile 101 | 102 | 103 | # run the customize_compiler 104 | class custom_build_ext(build_ext): 105 | def build_extensions(self): 106 | customize_compiler_for_nvcc(self.compiler) 107 | build_ext.build_extensions(self) 108 | 109 | 110 | ext_modules = [ 111 | Extension( 112 | "cpu_nms", 113 | ["cpu_nms.pyx"], 114 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 115 | include_dirs = [numpy_include] 116 | ), 117 | Extension('gpu_nms', 118 | ['nms_kernel.cu', 'gpu_nms.pyx'], 119 | library_dirs=[CUDA['lib64']], 120 | libraries=['cudart'], 121 | language='c++', 122 | runtime_library_dirs=[CUDA['lib64']], 123 | # this syntax is specific to this build system 124 | # we're only going to use certain compiler args with nvcc and not with 125 | # gcc the implementation of this trick is in customize_compiler() below 126 | extra_compile_args={'gcc': ["-Wno-unused-function"], 127 | 'nvcc': ['-arch=sm_35', 128 | '--ptxas-options=-v', 129 | '-c', 130 | '--compiler-options', 131 | "'-fPIC'"]}, 132 | include_dirs = [numpy_include, CUDA['include']] 133 | ), 134 | ] 135 | 136 | setup( 137 | name='nms', 138 | ext_modules=ext_modules, 139 | # inject our custom trigger 140 | cmdclass={'build_ext': custom_build_ext}, 141 | ) 142 | -------------------------------------------------------------------------------- /lib/nms/setup_windows.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import os 10 | from os.path import join as pjoin 11 | #from distutils.core import setup 12 | from setuptools import setup 13 | from distutils.extension import Extension 14 | from Cython.Distutils import build_ext 15 | import subprocess 16 | 17 | #change for windows, by MrX 18 | nvcc_bin = 'nvcc.exe' 19 | lib_dir = 'lib/x64' 20 | 21 | import distutils.msvc9compiler 22 | distutils.msvc9compiler.VERSION = 14.0 23 | 24 | 25 | def find_in_path(name, path): 26 | "Find a file in a search path" 27 | # Adapted fom 28 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 29 | for dir in path.split(os.pathsep): 30 | binpath = pjoin(dir, name) 31 | if os.path.exists(binpath): 32 | return os.path.abspath(binpath) 33 | return None 34 | 35 | 36 | def locate_cuda(): 37 | """Locate the CUDA environment on the system 38 | 39 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 40 | and values giving the absolute path to each directory. 41 | 42 | Starts by looking for the CUDAHOME env variable. 
If not found, everything 43 | is based on finding 'nvcc' in the PATH. 44 | """ 45 | 46 | # first check if the CUDAHOME env variable is in use 47 | if 'CUDA_PATH' in os.environ: 48 | home = os.environ['CUDA_PATH'] 49 | print("home = %s\n" % home) 50 | nvcc = pjoin(home, 'bin', nvcc_bin) 51 | else: 52 | # otherwise, search the PATH for NVCC 53 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 54 | nvcc = find_in_path(nvcc_bin, os.environ['PATH'] + os.pathsep + default_path) 55 | if nvcc is None: 56 | raise EnvironmentError('The nvcc binary could not be ' 57 | 'located in your $PATH. Either add it to your path, or set $CUDA_PATH') 58 | home = os.path.dirname(os.path.dirname(nvcc)) 59 | print("home = %s, nvcc = %s\n" % (home, nvcc)) 60 | 61 | 62 | cudaconfig = {'home':home, 'nvcc':nvcc, 63 | 'include': pjoin(home, 'include'), 64 | 'lib64': pjoin(home, lib_dir)} 65 | for k, v in cudaconfig.iteritems(): 66 | if not os.path.exists(v): 67 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 68 | 69 | return cudaconfig 70 | CUDA = locate_cuda() 71 | 72 | 73 | # Obtain the numpy include directory. This logic works across numpy versions. 74 | try: 75 | numpy_include = np.get_include() 76 | except AttributeError: 77 | numpy_include = np.get_numpy_include() 78 | 79 | 80 | def customize_compiler_for_nvcc(self): 81 | """inject deep into distutils to customize how the dispatch 82 | to gcc/nvcc works. 83 | 84 | If you subclass UnixCCompiler, it's not trivial to get your subclass 85 | injected in, and still have the right customizations (i.e. 86 | distutils.sysconfig.customize_compiler) run on it. So instead of going 87 | the OO route, I have this. Note, it's kindof like a wierd functional 88 | subclassing going on.""" 89 | 90 | # tell the compiler it can processes .cu 91 | #self.src_extensions.append('.cu') 92 | 93 | 94 | # save references to the default compiler_so and _comple methods 95 | #default_compiler_so = self.spawn 96 | #default_compiler_so = self.rc 97 | super = self.compile 98 | 99 | # now redefine the _compile method. This gets executed for each 100 | # object but distutils doesn't have the ability to change compilers 101 | # based on source extension: we add it. 
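    # --- annotation (not part of the original source) ------------------------
    # Unlike the Linux variant, this Windows script wraps the compiler's
    # compile() method rather than the per-file _compile hook, and the nvcc
    # dispatch is effectively disabled (the set_executable call is commented
    # out). Only the CPU extension (cpu_nms) is declared in ext_modules below;
    # the GPU extension is built separately by setup_windows_cuda.py.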
102 | def compile(sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None): 103 | postfix=os.path.splitext(sources[0])[1] 104 | 105 | if postfix == '.cu': 106 | # use the cuda for .cu files 107 | #self.set_executable('compiler_so', CUDA['nvcc']) 108 | # use only a subset of the extra_postargs, which are 1-1 translated 109 | # from the extra_compile_args in the Extension class 110 | postargs = extra_postargs['nvcc'] 111 | else: 112 | postargs = extra_postargs['gcc'] 113 | 114 | 115 | return super(sources, output_dir, macros, include_dirs, debug, extra_preargs, postargs, depends) 116 | # reset the default compiler_so, which we might have changed for cuda 117 | #self.rc = default_compiler_so 118 | 119 | # inject our redefined _compile method into the class 120 | self.compile = compile 121 | 122 | 123 | # run the customize_compiler 124 | class custom_build_ext(build_ext): 125 | def build_extensions(self): 126 | customize_compiler_for_nvcc(self.compiler) 127 | build_ext.build_extensions(self) 128 | 129 | 130 | ext_modules = [ 131 | # unix _compile: obj, src, ext, cc_args, extra_postargs, pp_opts 132 | Extension( 133 | "cpu_nms", 134 | sources=["cpu_nms.pyx"], 135 | extra_compile_args={'gcc': []}, 136 | include_dirs = [numpy_include], 137 | ), 138 | ] 139 | 140 | setup( 141 | name='fast_rcnn', 142 | ext_modules=ext_modules, 143 | # inject our custom trigger 144 | cmdclass={'build_ext': custom_build_ext}, 145 | ) 146 | -------------------------------------------------------------------------------- /lib/nms/setup_windows_cuda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import os 5 | # on Windows, we need the original PATH without Anaconda's compiler in it: 6 | PATH = os.environ.get('PATH') + ';C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin' 7 | from distutils.spawn import spawn, find_executable 8 | from setuptools import setup, find_packages, Extension 9 | from setuptools.command.build_ext import build_ext 10 | import sys 11 | 12 | # CUDA specific config 13 | # nvcc is assumed to be in user's PATH 14 | nvcc_compile_args = ['-O', '--ptxas-options=-v', '-arch=compute_35', '-code=sm_35,sm_52,sm_61', '-c', '--compiler-options=-fPIC'] 15 | nvcc_compile_args = os.environ.get('NVCCFLAGS', '').split() + nvcc_compile_args 16 | cuda_libs = ['cublas'] 17 | nvcc_bin = 'nvcc.exe' 18 | lib_dir = 'lib/x64' 19 | 20 | 21 | import distutils.msvc9compiler 22 | distutils.msvc9compiler.VERSION = 14.0 23 | 24 | # Obtain the numpy include directory. This logic works across numpy versions. 25 | try: 26 | numpy_include = np.get_include() 27 | except AttributeError: 28 | numpy_include = np.get_numpy_include() 29 | 30 | 31 | cudamat_ext = Extension('gpu_nms', 32 | sources=[ 33 | 'gpu_nms.cu' 34 | ], 35 | language='c++', 36 | libraries=cuda_libs, 37 | extra_compile_args=nvcc_compile_args, 38 | include_dirs = [numpy_include, 'C:\\Programming\\CUDA\\v8.0\\include']) 39 | 40 | 41 | class CUDA_build_ext(build_ext): 42 | """ 43 | Custom build_ext command that compiles CUDA files. 44 | Note that all extension source files will be processed with this compiler. 
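    (Annotation, not part of the original file: build_extensions below teaches
    the MSVC compiler to accept '.cu' sources and swaps cl.exe/link.exe for
    nvcc, while spawn() rewrites MSVC-style flags such as /Fo, /Tc and
    /LIBPATH: into their nvcc equivalents before launching the command.)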
45 | """ 46 | def build_extensions(self): 47 | self.compiler.src_extensions.append('.cu') 48 | self.compiler.set_executable('compiler_so', 'nvcc') 49 | self.compiler.set_executable('linker_so', 'nvcc --shared') 50 | if hasattr(self.compiler, '_c_extensions'): 51 | self.compiler._c_extensions.append('.cu') # needed for Windows 52 | self.compiler.spawn = self.spawn 53 | build_ext.build_extensions(self) 54 | 55 | def spawn(self, cmd, search_path=1, verbose=0, dry_run=0): 56 | """ 57 | Perform any CUDA specific customizations before actually launching 58 | compile/link etc. commands. 59 | """ 60 | if (sys.platform == 'darwin' and len(cmd) >= 2 and cmd[0] == 'nvcc' and 61 | cmd[1] == '--shared' and cmd.count('-arch') > 0): 62 | # Versions of distutils on OSX earlier than 2.7.9 inject 63 | # '-arch x86_64' which we need to strip while using nvcc for 64 | # linking 65 | while True: 66 | try: 67 | index = cmd.index('-arch') 68 | del cmd[index:index+2] 69 | except ValueError: 70 | break 71 | elif self.compiler.compiler_type == 'msvc': 72 | # There are several things we need to do to change the commands 73 | # issued by MSVCCompiler into one that works with nvcc. In the end, 74 | # it might have been easier to write our own CCompiler class for 75 | # nvcc, as we're only interested in creating a shared library to 76 | # load with ctypes, not in creating an importable Python extension. 77 | # - First, we replace the cl.exe or link.exe call with an nvcc 78 | # call. In case we're running Anaconda, we search cl.exe in the 79 | # original search path we captured further above -- Anaconda 80 | # inserts a MSVC version into PATH that is too old for nvcc. 81 | cmd[:1] = ['nvcc', '--compiler-bindir', 82 | os.path.dirname(find_executable("cl.exe", PATH)) 83 | or cmd[0]] 84 | # - Secondly, we fix a bunch of command line arguments. 85 | for idx, c in enumerate(cmd): 86 | # create .dll instead of .pyd files 87 | #if '.pyd' in c: cmd[idx] = c = c.replace('.pyd', '.dll') #20160601, by MrX 88 | # replace /c by -c 89 | if c == '/c': cmd[idx] = '-c' 90 | # replace /DLL by --shared 91 | elif c == '/DLL': cmd[idx] = '--shared' 92 | # remove --compiler-options=-fPIC 93 | elif '-fPIC' in c: del cmd[idx] 94 | # replace /Tc... by ... 95 | elif c.startswith('/Tc'): cmd[idx] = c[3:] 96 | # replace /Fo... by -o ... 97 | elif c.startswith('/Fo'): cmd[idx:idx+1] = ['-o', c[3:]] 98 | # replace /LIBPATH:... by -L... 99 | elif c.startswith('/LIBPATH:'): cmd[idx] = '-L' + c[9:] 100 | # replace /OUT:... by -o ... 
101 | elif c.startswith('/OUT:'): cmd[idx:idx+1] = ['-o', c[5:]] 102 | # remove /EXPORT:initlibcudamat or /EXPORT:initlibcudalearn 103 | elif c.startswith('/EXPORT:'): del cmd[idx] 104 | # replace cublas.lib by -lcublas 105 | elif c == 'cublas.lib': cmd[idx] = '-lcublas' 106 | # - Finally, we pass on all arguments starting with a '/' to the 107 | # compiler or linker, and have nvcc handle all other arguments 108 | if '--shared' in cmd: 109 | pass_on = '--linker-options=' 110 | # we only need MSVCRT for a .dll, remove CMT if it sneaks in: 111 | cmd.append('/NODEFAULTLIB:libcmt.lib') 112 | else: 113 | pass_on = '--compiler-options=' 114 | cmd = ([c for c in cmd if c[0] != '/'] + 115 | [pass_on + ','.join(c for c in cmd if c[0] == '/')]) 116 | # For the future: Apart from the wrongly set PATH by Anaconda, it 117 | # would suffice to run the following for compilation on Windows: 118 | # nvcc -c -O -o .obj .cu 119 | # And the following for linking: 120 | # nvcc --shared -o .dll .obj .obj -lcublas 121 | # This could be done by a NVCCCompiler class for all platforms. 122 | spawn(cmd, search_path, verbose, dry_run) 123 | 124 | setup(name="py_fast_rcnn_gpu", 125 | description="Performs linear algebra computation on the GPU via CUDA", 126 | ext_modules=[cudamat_ext], 127 | cmdclass={'build_ext': CUDA_build_ext}, 128 | ) 129 | -------------------------------------------------------------------------------- /lib/rpn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/rpn/__init__.py -------------------------------------------------------------------------------- /lib/rpn/generate_anchor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate base anchors on index 0 3 | """ 4 | 5 | import numpy as np 6 | 7 | 8 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 9 | scales=2 ** np.arange(3, 6)): 10 | """ 11 | Generate anchor (reference) windows by enumerating aspect ratios X 12 | scales wrt a reference (0, 0, 15, 15) window. 13 | """ 14 | 15 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 16 | ratio_anchors = _ratio_enum(base_anchor, ratios) 17 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 18 | for i in xrange(ratio_anchors.shape[0])]) 19 | return anchors 20 | 21 | 22 | def _whctrs(anchor): 23 | """ 24 | Return width, height, x center, and y center for an anchor (window). 25 | """ 26 | 27 | w = anchor[2] - anchor[0] + 1 28 | h = anchor[3] - anchor[1] + 1 29 | x_ctr = anchor[0] + 0.5 * (w - 1) 30 | y_ctr = anchor[1] + 0.5 * (h - 1) 31 | return w, h, x_ctr, y_ctr 32 | 33 | 34 | def _mkanchors(ws, hs, x_ctr, y_ctr): 35 | """ 36 | Given a vector of widths (ws) and heights (hs) around a center 37 | (x_ctr, y_ctr), output a set of anchors (windows). 38 | """ 39 | 40 | ws = ws[:, np.newaxis] 41 | hs = hs[:, np.newaxis] 42 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 43 | y_ctr - 0.5 * (hs - 1), 44 | x_ctr + 0.5 * (ws - 1), 45 | y_ctr + 0.5 * (hs - 1))) 46 | return anchors 47 | 48 | 49 | def _ratio_enum(anchor, ratios): 50 | """ 51 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 
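    (Annotation, not part of the original file: with the default 16x16 base
    anchor and ratios [0.5, 1, 2] used by generate_anchors above, this step
    yields windows of roughly 23x12, 16x16 and 11x22 pixels, all sharing the
    base anchor's centre; the three scales 8, 16, 32 are applied afterwards by
    _scale_enum. Numbers are illustrative, computed from the code above.)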
52 | """ 53 | 54 | w, h, x_ctr, y_ctr = _whctrs(anchor) 55 | size = w * h 56 | size_ratios = size / ratios 57 | ws = np.round(np.sqrt(size_ratios)) 58 | hs = np.round(ws * ratios) 59 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 60 | return anchors 61 | 62 | 63 | def _scale_enum(anchor, scales): 64 | """ 65 | Enumerate a set of anchors for each scale wrt an anchor. 66 | """ 67 | 68 | w, h, x_ctr, y_ctr = _whctrs(anchor) 69 | ws = w * scales 70 | hs = h * scales 71 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 72 | return anchors 73 | -------------------------------------------------------------------------------- /lib/utils/PrefetchingIter.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | 10 | import mxnet as mx 11 | from mxnet.io import DataDesc, DataBatch 12 | import threading 13 | 14 | 15 | class PrefetchingIter(mx.io.DataIter): 16 | """Base class for prefetching iterators. Takes one or more DataIters ( 17 | or any class with "reset" and "next" methods) and combine them with 18 | prefetching. For example: 19 | 20 | Parameters 21 | ---------- 22 | iters : DataIter or list of DataIter 23 | one or more DataIters (or any class with "reset" and "next" methods) 24 | rename_data : None or list of dict 25 | i-th element is a renaming map for i-th iter, in the form of 26 | {'original_name' : 'new_name'}. Should have one entry for each entry 27 | in iter[i].provide_data 28 | rename_label : None or list of dict 29 | Similar to rename_data 30 | 31 | Examples 32 | -------- 33 | iter = PrefetchingIter([NDArrayIter({'data': X1}), NDArrayIter({'data': X2})], 34 | rename_data=[{'data': 'data1'}, {'data': 'data2'}]) 35 | """ 36 | def __init__(self, iters, rename_data=None, rename_label=None): 37 | super(PrefetchingIter, self).__init__() 38 | if not isinstance(iters, list): 39 | iters = [iters] 40 | self.n_iter = len(iters) 41 | assert self.n_iter ==1, "Our prefetching iter only support 1 DataIter" 42 | self.iters = iters 43 | self.rename_data = rename_data 44 | self.rename_label = rename_label 45 | self.batch_size = len(self.provide_data) * self.provide_data[0][0][1][0] 46 | self.data_ready = [threading.Event() for i in range(self.n_iter)] 47 | self.data_taken = [threading.Event() for i in range(self.n_iter)] 48 | for e in self.data_taken: 49 | e.set() 50 | self.started = True 51 | self.current_batch = [None for _ in range(self.n_iter)] 52 | self.next_batch = [None for _ in range(self.n_iter)] 53 | def prefetch_func(self, i): 54 | """Thread entry""" 55 | while True: 56 | self.data_taken[i].wait() 57 | if not self.started: 58 | break 59 | try: 60 | self.next_batch[i] = self.iters[i].next() 61 | except StopIteration: 62 | self.next_batch[i] = None 63 | self.data_taken[i].clear() 64 | self.data_ready[i].set() 65 | self.prefetch_threads = [threading.Thread(target=prefetch_func, args=[self, i]) \ 66 | for i in range(self.n_iter)] 67 | for thread in self.prefetch_threads: 68 | thread.setDaemon(True) 69 | thread.start() 70 | 71 | def __del__(self): 72 | self.started = False 73 | for e in self.data_taken: 74 | e.set() 75 | for thread in self.prefetch_threads: 76 | thread.join() 77 | 78 | @property 79 | def provide_data(self): 80 | """The 
name and shape of data provided by this iterator""" 81 | if self.rename_data is None: 82 | return sum([i.provide_data for i in self.iters], []) 83 | else: 84 | return sum([[ 85 | DataDesc(r[x.name], x.shape, x.dtype) 86 | if isinstance(x, DataDesc) else DataDesc(*x) 87 | for x in i.provide_data 88 | ] for r, i in zip(self.rename_data, self.iters)], []) 89 | 90 | @property 91 | def provide_label(self): 92 | """The name and shape of label provided by this iterator""" 93 | if self.rename_label is None: 94 | return sum([i.provide_label for i in self.iters], []) 95 | else: 96 | return sum([[ 97 | DataDesc(r[x.name], x.shape, x.dtype) 98 | if isinstance(x, DataDesc) else DataDesc(*x) 99 | for x in i.provide_label 100 | ] for r, i in zip(self.rename_label, self.iters)], []) 101 | 102 | def reset(self): 103 | for e in self.data_ready: 104 | e.wait() 105 | for i in self.iters: 106 | i.reset() 107 | for e in self.data_ready: 108 | e.clear() 109 | for e in self.data_taken: 110 | e.set() 111 | 112 | def iter_next(self): 113 | for e in self.data_ready: 114 | e.wait() 115 | if self.next_batch[0] is None: 116 | return False 117 | else: 118 | self.current_batch = self.next_batch[0] 119 | for e in self.data_ready: 120 | e.clear() 121 | for e in self.data_taken: 122 | e.set() 123 | return True 124 | 125 | def next(self): 126 | if self.iter_next(): 127 | return self.current_batch 128 | else: 129 | raise StopIteration 130 | 131 | def getdata(self): 132 | return self.current_batch.data 133 | 134 | def getlabel(self): 135 | return self.current_batch.label 136 | 137 | def getindex(self): 138 | return self.current_batch.index 139 | 140 | def getpad(self): 141 | return self.current_batch.pad 142 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/lib/utils/__init__.py -------------------------------------------------------------------------------- /lib/utils/combine_model.py: -------------------------------------------------------------------------------- 1 | from load_model import load_checkpoint 2 | from save_model import save_checkpoint 3 | 4 | 5 | def combine_model(prefix1, epoch1, prefix2, epoch2, prefix_out, epoch_out): 6 | args1, auxs1 = load_checkpoint(prefix1, epoch1) 7 | args2, auxs2 = load_checkpoint(prefix2, epoch2) 8 | arg_names = args1.keys() + args2.keys() 9 | aux_names = auxs1.keys() + auxs2.keys() 10 | args = dict() 11 | for arg in arg_names: 12 | if arg in args1: 13 | args[arg] = args1[arg] 14 | if arg in args2: 15 | args[arg] = args2[arg] 16 | auxs = dict() 17 | for aux in aux_names: 18 | if aux in auxs1: 19 | auxs[aux] = auxs1[aux] 20 | if aux in auxs2: 21 | auxs[aux] = auxs2[aux] 22 | save_checkpoint(prefix_out, epoch_out, args, auxs) 23 | -------------------------------------------------------------------------------- /lib/utils/create_logger.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # low-Guided-Feature-Aggregation 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Bin Xiao 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | import logging 10 | import time 11 | 12 | def create_logger(root_output_path, cfg, image_set): 13 | # set up 
logger 14 | if not os.path.exists(root_output_path): 15 | os.makedirs(root_output_path) 16 | assert os.path.exists(root_output_path), '{} does not exist'.format(root_output_path) 17 | 18 | cfg_name = os.path.basename(cfg).split('.')[0] 19 | config_output_path = os.path.join(root_output_path, '{}'.format(cfg_name)) 20 | if not os.path.exists(config_output_path): 21 | os.makedirs(config_output_path) 22 | 23 | image_sets = [iset for iset in image_set.split('+')] 24 | final_output_path = os.path.join(config_output_path, '{}'.format('_'.join(image_sets))) 25 | if not os.path.exists(final_output_path): 26 | os.makedirs(final_output_path) 27 | 28 | log_file = '{}_{}.log'.format(cfg_name, time.strftime('%Y-%m-%d-%H-%M')) 29 | head = '%(asctime)-15s %(message)s' 30 | logging.basicConfig(filename=os.path.join(final_output_path, log_file), format=head) 31 | logger = logging.getLogger() 32 | logger.setLevel(logging.INFO) 33 | 34 | return logger, final_output_path 35 | 36 | -------------------------------------------------------------------------------- /lib/utils/image_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | def resize(im, target_size, max_size): 6 | """ 7 | only resize input image to target size and return scale 8 | :param im: BGR image input by opencv 9 | :param target_size: one dimensional size (the short side) 10 | :param max_size: one dimensional max size (the long side) 11 | :return: 12 | """ 13 | im_shape = im.shape 14 | im_size_min = np.min(im_shape[0:2]) 15 | im_size_max = np.max(im_shape[0:2]) 16 | im_scale = float(target_size) / float(im_size_min) 17 | # prevent bigger axis from being more than max_size: 18 | if np.round(im_scale * im_size_max) > max_size: 19 | im_scale = float(max_size) / float(im_size_max) 20 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) 21 | return im, im_scale 22 | 23 | 24 | def transform(im, pixel_means, need_mean=False): 25 | """ 26 | transform into mxnet tensor 27 | subtract pixel size and transform to correct format 28 | :param im: [height, width, channel] in BGR 29 | :param pixel_means: [[[R, G, B pixel means]]] 30 | :return: [batch, channel, height, width] 31 | """ 32 | assert False, "shouldn't reach here." 
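    # --- annotation (not part of the original source) ------------------------
    # The assert above makes the remainder of this function unreachable, so this
    # legacy transform() is effectively disabled; image loading and mean
    # subtraction appear to be handled by lib/utils/image.py instead (stated as
    # an observation from the repository layout, not verified per call site).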
33 | im = im.copy() 34 | im[:, :, (0, 1, 2)] = im[:, :, (2, 1, 0)] 35 | im = im.astype(float) 36 | if need_mean: 37 | im -= pixel_means 38 | im_tensor = im[np.newaxis, :] 39 | # put channel first 40 | channel_swap = (0, 3, 1, 2) 41 | im_tensor = im_tensor.transpose(channel_swap) 42 | return im_tensor 43 | 44 | 45 | def transform_inverse(im_tensor, pixel_means): 46 | """ 47 | transform from mxnet im_tensor to ordinary RGB image 48 | im_tensor is limited to one image 49 | :param im_tensor: [batch, channel, height, width] 50 | :param pixel_means: [[[R, G, B pixel means]]] 51 | :return: im [height, width, channel(RGB)] 52 | """ 53 | assert im_tensor.shape[0] == 1 54 | im_tensor = im_tensor.copy() 55 | # put channel back 56 | channel_swap = (0, 2, 3, 1) 57 | im_tensor = im_tensor.transpose(channel_swap) 58 | im = im_tensor[0] 59 | assert im.shape[2] == 3 60 | im += pixel_means 61 | im = im.astype(np.uint8) 62 | return im 63 | 64 | 65 | def tensor_vstack(tensor_list, pad=0): 66 | """ 67 | vertically stack tensors 68 | :param tensor_list: list of tensor to be stacked vertically 69 | :param pad: label to pad with 70 | :return: tensor with max shape 71 | """ 72 | ndim = len(tensor_list[0].shape) 73 | if ndim == 1: 74 | return np.hstack(tensor_list) 75 | dimensions = [0] 76 | for dim in range(1, ndim): 77 | dimensions.append(max([tensor.shape[dim] for tensor in tensor_list])) 78 | for ind, tensor in enumerate(tensor_list): 79 | pad_shape = [(0, 0)] 80 | for dim in range(1, ndim): 81 | pad_shape.append((0, dimensions[dim] - tensor.shape[dim])) 82 | tensor_list[ind] = np.lib.pad(tensor, pad_shape, 'constant', constant_values=pad) 83 | all_tensor = np.vstack(tensor_list) 84 | return all_tensor 85 | -------------------------------------------------------------------------------- /lib/utils/load_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dataset import * 3 | 4 | 5 | def load_gt_roidb(dataset_name, image_set_name, root_path, dataset_path, result_path=None, 6 | flip=False): 7 | """ load ground truth roidb """ 8 | imdb = eval(dataset_name)(image_set_name, root_path, dataset_path, result_path) 9 | roidb = imdb.gt_roidb() 10 | if flip: 11 | roidb = imdb.append_flipped_images(roidb) 12 | return roidb 13 | 14 | 15 | def load_proposal_roidb(dataset_name, image_set_name, root_path, dataset_path, result_path=None, 16 | proposal='rpn', append_gt=True, flip=False): 17 | """ load proposal roidb (append_gt when training) """ 18 | imdb = eval(dataset_name)(image_set_name, root_path, dataset_path, result_path) 19 | 20 | gt_roidb = imdb.gt_roidb() 21 | roidb = eval('imdb.' 
+ proposal + '_roidb')(gt_roidb, append_gt) 22 | if flip: 23 | roidb = imdb.append_flipped_images(roidb) 24 | return roidb 25 | 26 | 27 | def merge_roidb(roidbs): 28 | """ roidb are list, concat them together """ 29 | roidb = roidbs[0] 30 | for r in roidbs[1:]: 31 | roidb.extend(r) 32 | return roidb 33 | 34 | 35 | def filter_roidb(roidb, config): 36 | """ remove roidb entries without usable rois """ 37 | 38 | def is_valid(entry): 39 | """ valid images have at least 1 fg or bg roi """ 40 | overlaps = entry['max_overlaps'] 41 | fg_inds = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] 42 | bg_inds = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] 43 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 44 | return valid 45 | 46 | num = len(roidb) 47 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 48 | num_after = len(filtered_roidb) 49 | print 'filtered %d roidb entries: %d -> %d' % (num - num_after, num, num_after) 50 | 51 | return filtered_roidb 52 | 53 | 54 | def load_gt_segdb(dataset_name, image_set_name, root_path, dataset_path, result_path=None, 55 | flip=False): 56 | """ load ground truth segdb """ 57 | imdb = eval(dataset_name)(image_set_name, root_path, dataset_path, result_path) 58 | segdb = imdb.gt_segdb() 59 | if flip: 60 | segdb = imdb.append_flipped_images_for_segmentation(segdb) 61 | return segdb 62 | 63 | 64 | def merge_segdb(segdbs): 65 | """ segdb are list, concat them together """ 66 | segdb = segdbs[0] 67 | for r in segdbs[1:]: 68 | segdb.extend(r) 69 | return segdb 70 | -------------------------------------------------------------------------------- /lib/utils/load_model.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | 3 | 4 | def load_checkpoint(prefix, epoch): 5 | """ 6 | Load model checkpoint from file. 7 | :param prefix: Prefix of model name. 8 | :param epoch: Epoch number of model we would like to load. 9 | :return: (arg_params, aux_params) 10 | arg_params : dict of str to NDArray 11 | Model parameter, dict of name to NDArray of net's weights. 12 | aux_params : dict of str to NDArray 13 | Model parameter, dict of name to NDArray of net's auxiliary states. 14 | """ 15 | save_dict = mx.nd.load('%s-%04d.params' % (prefix, epoch)) 16 | print 'load %s-%04d.params!!!' % (prefix, epoch) 17 | arg_params = {} 18 | aux_params = {} 19 | for k, v in save_dict.items(): 20 | tp, name = k.split(':', 1) 21 | if tp == 'arg': 22 | arg_params[name] = v 23 | if tp == 'aux': 24 | aux_params[name] = v 25 | return arg_params, aux_params 26 | 27 | 28 | def convert_context(params, ctx): 29 | """ 30 | :param params: dict of str to NDArray 31 | :param ctx: the context to convert to 32 | :return: dict of str of NDArray with context ctx 33 | """ 34 | new_params = dict() 35 | for k, v in params.items(): 36 | new_params[k] = v.as_in_context(ctx) 37 | return new_params 38 | 39 | 40 | def load_param(prefix, epoch, convert=False, ctx=None, process=False): 41 | """ 42 | wrapper for load checkpoint 43 | :param prefix: Prefix of model name. 44 | :param epoch: Epoch number of model we would like to load. 45 | :param convert: reference model should be converted to GPU NDArray first 46 | :param ctx: if convert then ctx must be designated. 
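The load_data.py helpers above are normally chained: one roidb is loaded per image set, the lists are concatenated, and entries without usable RoIs are dropped. A minimal sketch, using the dataset and image-set defaults from manet_rfcn/config/config.py; the `config.config` import path is an assumption, while the `utils.load_data` import mirrors the one used by the training scripts later in this repo.

```
from utils.load_data import load_gt_roidb, merge_roidb, filter_roidb
from config.config import config as cfg    # assumed import; any object exposing cfg.TRAIN.FG_THRESH etc. works

image_sets = 'DET_train_30classes+VID_train_15frames'.split('+')
roidbs = [load_gt_roidb('ImageNetVID', iset, './data', './data/ILSVRC2015', flip=True)
          for iset in image_sets]
roidb = merge_roidb(roidbs)        # concatenate the per-set lists
roidb = filter_roidb(roidb, cfg)   # keep only entries with at least one fg or bg RoI
```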
47 | :param process: model should drop any test 48 | :return: (arg_params, aux_params) 49 | """ 50 | arg_params, aux_params = load_checkpoint(prefix, epoch) 51 | if convert: 52 | if ctx is None: 53 | ctx = mx.cpu() 54 | arg_params = convert_context(arg_params, ctx) 55 | aux_params = convert_context(aux_params, ctx) 56 | if process: 57 | tests = [k for k in arg_params.keys() if '_test' in k] 58 | for test in tests: 59 | arg_params[test.replace('_test', '')] = arg_params.pop(test) 60 | return arg_params, aux_params 61 | -------------------------------------------------------------------------------- /lib/utils/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deep Feature Flow 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | 8 | 9 | import logging 10 | from mxnet.lr_scheduler import LRScheduler 11 | 12 | class WarmupMultiFactorScheduler(LRScheduler): 13 | """Reduce learning rate in factor at steps specified in a list 14 | 15 | Assume the weight has been updated by n times, then the learning rate will 16 | be 17 | 18 | base_lr * factor^(sum((step/n)<=1)) # step is an array 19 | 20 | Parameters 21 | ---------- 22 | step: list of int 23 | schedule learning rate after n updates 24 | factor: float 25 | the factor for reducing the learning rate 26 | """ 27 | def __init__(self, step, factor=1, warmup=False, warmup_lr=0, warmup_step=0): 28 | super(WarmupMultiFactorScheduler, self).__init__() 29 | assert isinstance(step, list) and len(step) >= 1 30 | for i, _step in enumerate(step): 31 | if i != 0 and step[i] <= step[i-1]: 32 | raise ValueError("Schedule step must be an increasing integer list") 33 | if _step < 1: 34 | raise ValueError("Schedule step must be greater or equal than 1 round") 35 | if factor > 1.0: 36 | raise ValueError("Factor must be no more than 1 to make lr reduce") 37 | self.step = step 38 | self.cur_step_ind = 0 39 | self.factor = factor 40 | self.count = 0 41 | self.warmup = warmup 42 | self.warmup_lr = warmup_lr 43 | self.warmup_step = warmup_step 44 | 45 | def __call__(self, num_update): 46 | """ 47 | Call to schedule current learning rate 48 | 49 | Parameters 50 | ---------- 51 | num_update: int 52 | the maximal number of updates applied to a weight. 
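To illustrate `load_param` above: it reads `<prefix>-<epoch:04d>.params`, optionally copies the arrays to a given context, and with `process=True` renames the `*_test` parameters written by the checkpoint callback back to their plain names. The prefix and epoch below are placeholders.

```
import mxnet as mx
from utils.load_model import load_param    # assumes lib/ is on sys.path

# Loads './output/manet_rfcn-0002.params' (hypothetical prefix/epoch).
arg_params, aux_params = load_param('./output/manet_rfcn', 2,
                                    convert=True, ctx=mx.gpu(0), process=True)
print(len(arg_params), len(aux_params))
```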
53 | """ 54 | 55 | # NOTE: use while rather than if (for continuing training via load_epoch) 56 | if self.warmup and num_update < self.warmup_step: 57 | return self.warmup_lr 58 | while self.cur_step_ind <= len(self.step)-1: 59 | if num_update > self.step[self.cur_step_ind]: 60 | self.count = self.step[self.cur_step_ind] 61 | self.cur_step_ind += 1 62 | self.base_lr *= self.factor 63 | logging.info("Update[%d]: Change learning rate to %0.5e", 64 | num_update, self.base_lr) 65 | else: 66 | return self.base_lr 67 | return self.base_lr 68 | -------------------------------------------------------------------------------- /lib/utils/roidb.py: -------------------------------------------------------------------------------- 1 | """ 2 | roidb 3 | basic format [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] 4 | extended ['image', 'max_classes', 'max_overlaps', 'bbox_targets'] 5 | """ 6 | 7 | import cv2 8 | import numpy as np 9 | 10 | from bbox.bbox_regression import compute_bbox_regression_targets 11 | 12 | 13 | def prepare_roidb(imdb, roidb, cfg): 14 | """ 15 | add image path, max_classes, max_overlaps to roidb 16 | :param imdb: image database, provide path 17 | :param roidb: roidb 18 | :return: None 19 | """ 20 | print 'prepare roidb' 21 | for i in range(len(roidb)): # image_index 22 | roidb[i]['image'] = imdb.image_path_from_index(imdb.image_set_index[i]) 23 | if cfg.TRAIN.ASPECT_GROUPING: 24 | size = cv2.imread(roidb[i]['image']).shape 25 | roidb[i]['height'] = size[0] 26 | roidb[i]['width'] = size[1] 27 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 28 | max_overlaps = gt_overlaps.max(axis=1) 29 | max_classes = gt_overlaps.argmax(axis=1) 30 | roidb[i]['max_overlaps'] = max_overlaps 31 | roidb[i]['max_classes'] = max_classes 32 | 33 | # background roi => background class 34 | zero_indexes = np.where(max_overlaps == 0)[0] 35 | assert all(max_classes[zero_indexes] == 0) 36 | # foreground roi => foreground class 37 | nonzero_indexes = np.where(max_overlaps > 0)[0] 38 | assert all(max_classes[nonzero_indexes] != 0) 39 | -------------------------------------------------------------------------------- /lib/utils/save_model.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | 3 | 4 | def save_checkpoint(prefix, epoch, arg_params, aux_params): 5 | """Checkpoint the model data into file. 6 | :param prefix: Prefix of model name. 7 | :param epoch: The epoch number of the model. 8 | :param arg_params: dict of str to NDArray 9 | Model parameter, dict of name to NDArray of net's weights. 10 | :param aux_params: dict of str to NDArray 11 | Model parameter, dict of name to NDArray of net's auxiliary states. 12 | :return: None 13 | prefix-epoch.params will be saved for parameters. 
14 | """ 15 | save_dict = {('arg:%s' % k) : v for k, v in arg_params.items()} 16 | save_dict.update({('aux:%s' % k) : v for k, v in aux_params.items()}) 17 | param_name = '%s-%04d.params' % (prefix, epoch) 18 | mx.nd.save(param_name, save_dict) 19 | -------------------------------------------------------------------------------- /lib/utils/show_boxes.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # #Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Yi Li, Haocheng Zhang, Xizhou Zhu 6 | # -------------------------------------------------------- 7 | 8 | import matplotlib.pyplot as plt 9 | import cv2 10 | import random 11 | 12 | def show_boxes(im, dets, classes, scale = 1.0): 13 | plt.cla() 14 | plt.axis("off") 15 | plt.imshow(im) 16 | for cls_idx, cls_name in enumerate(classes): 17 | cls_dets = dets[cls_idx] 18 | for det in cls_dets: 19 | bbox = det[:4] * scale 20 | color = (random.random(), random.random(), random.random()) 21 | rect = plt.Rectangle((bbox[0], bbox[1]), 22 | bbox[2] - bbox[0], 23 | bbox[3] - bbox[1], fill=False, 24 | edgecolor=color, linewidth=2.5) 25 | plt.gca().add_patch(rect) 26 | 27 | if cls_dets.shape[1] == 5: 28 | score = det[-1] 29 | plt.gca().text(bbox[0], bbox[1], 30 | '{:s} {:.3f}'.format(cls_name, score), 31 | bbox=dict(facecolor=color, alpha=0.5), fontsize=9, color='white') 32 | plt.show() 33 | return im 34 | 35 | 36 | def draw_boxes(im, dets, classes, scale = 1.0): 37 | color_white = (255, 255, 255) 38 | for cls_idx, cls_name in enumerate(classes): 39 | cls_dets = dets[cls_idx] 40 | for det in cls_dets: 41 | bbox = det[:4] * scale 42 | bbox = map(int, bbox) 43 | color = (random.randint(0, 256), random.randint(0, 256), random.randint(0, 256)) 44 | cv2.rectangle(im, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=color, thickness=3) 45 | 46 | if cls_dets.shape[1] == 5: 47 | score = det[-1] 48 | cv2.putText(im, '%s %.3f' % (cls_name, score), (bbox[0], bbox[1]+10), 49 | color=color_white, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=1, thickness=2) 50 | return im 51 | -------------------------------------------------------------------------------- /lib/utils/symbol.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # #Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | class Symbol: 10 | def __init__(self): 11 | self.arg_shape_dict = None 12 | self.out_shape_dict = None 13 | self.aux_shape_dict = None 14 | self.sym = None 15 | 16 | @property 17 | def symbol(self): 18 | return self.sym 19 | 20 | def get_symbol(self, cfg, is_train=True): 21 | """ 22 | return a generated symbol, it also need to be assigned to self.sym 23 | """ 24 | raise NotImplementedError() 25 | 26 | def init_weights(self, cfg, arg_params, aux_params): 27 | raise NotImplementedError() 28 | 29 | def get_msra_std(self, shape): 30 | fan_in = float(shape[1]) 31 | if len(shape) > 2: 32 | fan_in *= np.prod(shape[2:]) 33 | return np.sqrt(2 / fan_in) 34 | 35 | def infer_shape(self, data_shape_dict): 36 | # infer shape 37 | arg_shape, out_shape, aux_shape = self.sym.infer_shape(**data_shape_dict) 38 | 
self.arg_shape_dict = dict(zip(self.sym.list_arguments(), arg_shape)) 39 | self.out_shape_dict = dict(zip(self.sym.list_outputs(), out_shape)) 40 | self.aux_shape_dict = dict(zip(self.sym.list_auxiliary_states(), aux_shape)) 41 | 42 | def check_parameter_shapes(self, arg_params, aux_params, data_shape_dict, is_train=True): 43 | for k in self.sym.list_arguments(): 44 | if k in data_shape_dict or (False if is_train else 'label' in k): 45 | continue 46 | assert k in arg_params, k + ' not initialized' 47 | assert arg_params[k].shape == self.arg_shape_dict[k], \ 48 | 'shape inconsistent for ' + k + ' inferred ' + str(self.arg_shape_dict[k]) + ' provided ' + str( 49 | arg_params[k].shape) 50 | for k in self.sym.list_auxiliary_states(): 51 | assert k in aux_params, k + ' not initialized' 52 | assert aux_params[k].shape == self.aux_shape_dict[k], \ 53 | 'shape inconsistent for ' + k + ' inferred ' + str(self.aux_shape_dict[k]) + ' provided ' + str( 54 | aux_params[k].shape) 55 | -------------------------------------------------------------------------------- /lib/utils/tictoc.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | def tic(): 4 | import time 5 | global startTime_for_tictoc 6 | startTime_for_tictoc = time.time() 7 | return startTime_for_tictoc 8 | 9 | def toc(): 10 | if 'startTime_for_tictoc' in globals(): 11 | endTime = time.time() 12 | return endTime - startTime_for_tictoc 13 | else: 14 | return None 15 | -------------------------------------------------------------------------------- /manet_rfcn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/manet_rfcn/__init__.py -------------------------------------------------------------------------------- /manet_rfcn/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | 4 | def add_path(path): 5 | if path not in sys.path: 6 | sys.path.insert(0, path) 7 | 8 | this_dir = osp.dirname(__file__) 9 | 10 | lib_path = osp.join(this_dir, '..', 'lib') 11 | add_path(lib_path) 12 | -------------------------------------------------------------------------------- /manet_rfcn/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/manet_rfcn/config/__init__.py -------------------------------------------------------------------------------- /manet_rfcn/config/config.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuqing Zhu, Shuhao Fu, Xizhou Zhu, Yuwen Xiong, Bin Xiao 7 | # -------------------------------------------------------- 8 | 9 | import yaml 10 | import numpy as np 11 | from easydict import EasyDict as edict 12 | 13 | config = edict() 14 | 15 | config.MXNET_VERSION = '' 16 | config.output_path = '' 17 | config.symbol = '' 18 | config.gpus = '' 19 | config.CLASS_AGNOSTIC = True 20 | config.SCALES = [(600, 1000)] # first is scale (the shorter side); second is max size 21 | 22 | # default 
training 23 | config.default = edict() 24 | config.default.frequent = 20 25 | config.default.kvstore = 'device' 26 | 27 | # network related params 28 | config.network = edict() 29 | config.network.pretrained = '' 30 | config.network.pretrained_flow = '' 31 | config.network.pretrained_epoch = 0 32 | config.network.PIXEL_MEANS = np.array([0, 0, 0]) 33 | config.network.IMAGE_STRIDE = 0 34 | config.network.RPN_FEAT_STRIDE = 16 35 | config.network.RCNN_FEAT_STRIDE = 16 36 | config.network.FIXED_PARAMS = ['gamma', 'beta'] 37 | config.network.ANCHOR_SCALES = (8, 16, 32) 38 | config.network.ANCHOR_RATIOS = (0.5, 1, 2) 39 | config.network.NORMALIZE_RPN = True 40 | config.network.ANCHOR_MEANS = (0.0, 0.0, 0.0, 0.0) 41 | config.network.ANCHOR_STDS = (0.1, 0.1, 0.4, 0.4) 42 | config.network.NUM_ANCHORS = len(config.network.ANCHOR_SCALES) * len(config.network.ANCHOR_RATIOS) 43 | config.network.FGFA_FEAT_DIM = 1024 #+ 2048 # 1024 for feature network, 2048 for embedding network 44 | 45 | # dataset related params 46 | config.dataset = edict() 47 | config.dataset.dataset = 'ImageNetVID' 48 | config.dataset.image_set = 'DET_train_30classes+VID_train_15frames' 49 | config.dataset.test_image_set = 'VID_val_videos' 50 | config.dataset.root_path = './data' 51 | config.dataset.dataset_path = './data/ILSVRC2015' 52 | config.dataset.motion_iou_path = './lib/dataset/imagenet_vid_groundtruth_motion_iou.mat' 53 | config.dataset.enable_detailed_eval = True 54 | config.dataset.NUM_CLASSES = 31 55 | 56 | 57 | config.TRAIN = edict() 58 | 59 | config.TRAIN.lr = 0 60 | config.TRAIN.lr_step = '' 61 | config.TRAIN.lr_factor = 0.1 62 | config.TRAIN.warmup = False 63 | config.TRAIN.warmup_lr = 0 64 | config.TRAIN.warmup_step = 0 65 | config.TRAIN.momentum = 0.9 66 | config.TRAIN.wd = 0.0005 67 | config.TRAIN.begin_epoch = 0 68 | config.TRAIN.end_epoch = 0 69 | config.TRAIN.model_prefix = '' 70 | 71 | # whether predict occlusion 72 | config.TRAIN.USE_OCCLUSION = False 73 | # whether resume training 74 | config.TRAIN.RESUME = False 75 | # whether flip image 76 | config.TRAIN.FLIP = True 77 | # whether shuffle image 78 | config.TRAIN.SHUFFLE = True 79 | # whether use OHEM 80 | config.TRAIN.ENABLE_OHEM = False 81 | # size of images for each device, 2 for rcnn, 1 for rpn and e2e 82 | config.TRAIN.BATCH_IMAGES = 2 83 | # e2e changes behavior of anchor loader and metric 84 | config.TRAIN.END2END = False 85 | # group images with similar aspect ratio 86 | config.TRAIN.ASPECT_GROUPING = True 87 | 88 | # R-CNN 89 | # rcnn rois batch size 90 | config.TRAIN.BATCH_ROIS = 128 91 | config.TRAIN.BATCH_ROIS_OHEM = 128 92 | # rcnn rois sampling params 93 | config.TRAIN.FG_FRACTION = 0.25 94 | config.TRAIN.FG_THRESH = 0.5 95 | config.TRAIN.BG_THRESH_HI = 0.5 96 | config.TRAIN.BG_THRESH_LO = 0.0 97 | # rcnn bounding box regression params 98 | config.TRAIN.BBOX_REGRESSION_THRESH = 0.5 99 | config.TRAIN.BBOX_WEIGHTS = np.array([1.0, 1.0, 1.0, 1.0]) 100 | 101 | # RPN anchor loader 102 | # rpn anchors batch size 103 | config.TRAIN.RPN_BATCH_SIZE = 256 104 | # rpn anchors sampling params 105 | config.TRAIN.RPN_FG_FRACTION = 0.5 106 | config.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 107 | config.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 108 | config.TRAIN.RPN_CLOBBER_POSITIVES = False 109 | # rpn bounding box regression params 110 | config.TRAIN.RPN_BBOX_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 111 | config.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 112 | 113 | # used for end2end training 114 | # RPN proposal 115 | config.TRAIN.CXX_PROPOSAL = True 116 | config.TRAIN.RPN_NMS_THRESH = 0.7 117 
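Note that these defaults are merged with an experiment YAML by `update_config`, defined at the end of this file: unknown top-level keys raise a ValueError, and `PIXEL_MEANS`, `BBOX_WEIGHTS` and `SCALES` are coerced back to their expected types. A minimal usage sketch; the import line is an assumption and the YAML path is one of the files under experiments/manet_rfcn/cfgs/.

```
from config.config import config, update_config   # assumed import, mirroring the train/test scripts

# Merge an experiment YAML into the defaults above; every key must already exist in config.
update_config('./experiments/manet_rfcn/cfgs/phase-1.yaml')
print(config.TRAIN.lr, config.TRAIN.lr_step, config.gpus)
```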
| config.TRAIN.RPN_PRE_NMS_TOP_N = 12000 118 | config.TRAIN.RPN_POST_NMS_TOP_N = 2000 119 | config.TRAIN.RPN_MIN_SIZE = config.network.RPN_FEAT_STRIDE 120 | # approximate bounding box regression 121 | config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = True 122 | config.TRAIN.BBOX_MEANS = (0.0, 0.0, 0.0, 0.0) 123 | config.TRAIN.BBOX_STDS = (0.1, 0.1, 0.2, 0.2) 124 | 125 | # FGFA, trained image sampled from [min_offset, max_offset] 126 | config.TRAIN.MIN_OFFSET = -9 127 | config.TRAIN.MAX_OFFSET = 9 128 | 129 | config.TEST = edict() 130 | 131 | # R-CNN testing 132 | # use rpn to generate proposal 133 | config.TEST.HAS_RPN = False 134 | # size of images for each device 135 | config.TEST.BATCH_IMAGES = 1 136 | 137 | # RPN proposal 138 | config.TEST.CXX_PROPOSAL = True 139 | config.TEST.RPN_NMS_THRESH = 0.7 140 | config.TEST.RPN_PRE_NMS_TOP_N = 6000 141 | config.TEST.RPN_POST_NMS_TOP_N = 300 142 | config.TEST.RPN_MIN_SIZE = config.network.RPN_FEAT_STRIDE 143 | 144 | # RCNN nms 145 | config.TEST.NMS = 0.3 146 | config.TEST.max_per_image = 300 147 | 148 | # 149 | config.TEST.KEY_FRAME_INTERVAL = 9 150 | config.TEST.SEQ_NMS = False 151 | 152 | 153 | # Test Model Epoch 154 | config.TEST.test_epoch = 0 155 | 156 | 157 | def update_config(config_file): 158 | exp_config = None 159 | with open(config_file) as f: 160 | exp_config = edict(yaml.load(f)) 161 | for k, v in exp_config.items(): 162 | if k in config: 163 | if isinstance(v, dict): 164 | if k == 'TRAIN': 165 | if 'BBOX_WEIGHTS' in v: 166 | v['BBOX_WEIGHTS'] = np.array(v['BBOX_WEIGHTS']) 167 | elif k == 'network': 168 | if 'PIXEL_MEANS' in v: 169 | v['PIXEL_MEANS'] = np.array(v['PIXEL_MEANS']) 170 | for vk, vv in v.items(): 171 | config[k][vk] = vv 172 | else: 173 | if k == 'SCALES': 174 | config[k][0] = (tuple(v)) 175 | else: 176 | config[k] = v 177 | else: 178 | raise ValueError("key must exist in config.py") 179 | -------------------------------------------------------------------------------- /manet_rfcn/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/manet_rfcn/core/__init__.py -------------------------------------------------------------------------------- /manet_rfcn/core/callback.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | import time 10 | import logging 11 | import mxnet as mx 12 | 13 | 14 | class Speedometer(object): 15 | def __init__(self, batch_size, frequent=50): 16 | self.batch_size = batch_size 17 | self.frequent = frequent 18 | self.init = False 19 | self.tic = 0 20 | self.last_count = 0 21 | 22 | def __call__(self, param): 23 | """Callback to Show speed.""" 24 | count = param.nbatch 25 | if self.last_count > count: 26 | self.init = False 27 | self.last_count = count 28 | 29 | if self.init: 30 | if count % self.frequent == 0: 31 | speed = self.frequent * self.batch_size / (time.time() - self.tic) 32 | s = '' 33 | if param.eval_metric is not None: 34 | name, value = param.eval_metric.get() 35 | s = "Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-" % (param.epoch, count, speed) 36 | 
for n, v in zip(name, value): 37 | s += "%s=%f,\t" % (n, v) 38 | else: 39 | s = "Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec" % (param.epoch, count, speed) 40 | 41 | logging.info(s) 42 | print(s) 43 | self.tic = time.time() 44 | else: 45 | self.init = True 46 | self.tic = time.time() 47 | 48 | 49 | def do_checkpoint(prefix, means, stds): 50 | def _callback(iter_no, sym, arg, aux): 51 | weight = arg['rfcn_bbox_weight'] 52 | bias = arg['rfcn_bbox_bias'] 53 | repeat = bias.shape[0] / means.shape[0] 54 | 55 | arg['rfcn_bbox_weight_test'] = weight * mx.nd.repeat(mx.nd.array(stds), repeats=repeat).reshape((bias.shape[0], 1, 1, 1)) 56 | arg['rfcn_bbox_bias_test'] = arg['rfcn_bbox_bias'] * mx.nd.repeat(mx.nd.array(stds), repeats=repeat) + mx.nd.repeat(mx.nd.array(means), repeats=repeat) 57 | mx.model.save_checkpoint(prefix, iter_no + 1, sym, arg, aux) 58 | arg.pop('rfcn_bbox_weight_test') 59 | arg.pop('rfcn_bbox_bias_test') 60 | return _callback -------------------------------------------------------------------------------- /manet_rfcn/core/metric.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fully Motion-Aware Network for Video Object Detection 3 | # Licensed under The Apache-2.0 License [see LICENSE for details] 4 | # Extend FGFA by adding instance-level aggregation and motion pattern reasoning 5 | # Modified by Shiyao Wang 6 | # -------------------------------------------------------- 7 | 8 | 9 | import mxnet as mx 10 | import numpy as np 11 | 12 | 13 | def get_rpn_names(): 14 | pred = ['rpn_cls_prob', 'rpn_bbox_loss'] 15 | label = ['rpn_label', 'rpn_bbox_target', 'rpn_bbox_weight'] 16 | return pred, label 17 | 18 | 19 | def get_rcnn_names(cfg): 20 | if cfg.TRAIN.USE_OCCLUSION: 21 | pred = ['rcnn_cls_prob', 'rcnn_bbox_loss', 'delta_loss', 'cls_occluded_prob'] 22 | else: 23 | pred = ['rcnn_cls_prob', 'rcnn_bbox_loss', 'delta_loss'] 24 | label = ['rcnn_label', 'rcnn_bbox_target', 'rcnn_bbox_weight'] 25 | if cfg.TRAIN.ENABLE_OHEM or cfg.TRAIN.END2END: 26 | pred.append('rcnn_label') 27 | pred.append('delta_label') 28 | if cfg.TRAIN.USE_OCCLUSION: 29 | pred.append('occluded_label') 30 | if cfg.TRAIN.END2END: 31 | rpn_pred, rpn_label = get_rpn_names() 32 | pred = rpn_pred + pred 33 | label = rpn_label 34 | return pred, label 35 | 36 | 37 | class RPNAccMetric(mx.metric.EvalMetric): 38 | def __init__(self): 39 | super(RPNAccMetric, self).__init__('RPNAcc') 40 | self.pred, self.label = get_rpn_names() 41 | 42 | def update(self, labels, preds): 43 | pred = preds[self.pred.index('rpn_cls_prob')] 44 | label = labels[self.label.index('rpn_label')] 45 | 46 | # pred (b, c, p) or (b, c, h, w) 47 | pred_label = mx.ndarray.argmax_channel(pred).asnumpy().astype('int32') 48 | pred_label = pred_label.reshape((pred_label.shape[0], -1)) 49 | # label (b, p) 50 | label = label.asnumpy().astype('int32') 51 | 52 | # filter with keep_inds 53 | keep_inds = np.where(label != -1) 54 | pred_label = pred_label[keep_inds] 55 | label = label[keep_inds] 56 | 57 | self.sum_metric += np.sum(pred_label.flat == label.flat) 58 | self.num_inst += len(pred_label.flat) 59 | 60 | 61 | class RCNNAccMetric(mx.metric.EvalMetric): 62 | def __init__(self, cfg): 63 | super(RCNNAccMetric, self).__init__('RCNNAcc') 64 | self.e2e = cfg.TRAIN.END2END 65 | self.ohem = cfg.TRAIN.ENABLE_OHEM 66 | self.pred, self.label = get_rcnn_names(cfg) 67 | 68 | def update(self, labels, preds): 69 | pred = preds[self.pred.index('rcnn_cls_prob')] 70 | if 
self.ohem or self.e2e: 71 | label = preds[self.pred.index('rcnn_label')] 72 | else: 73 | label = labels[self.label.index('rcnn_label')] 74 | 75 | last_dim = pred.shape[-1] 76 | pred_label = pred.asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') 77 | label = label.asnumpy().reshape(-1,).astype('int32') 78 | 79 | # filter with keep_inds 80 | keep_inds = np.where(label != -1) 81 | pred_label = pred_label[keep_inds] 82 | label = label[keep_inds] 83 | 84 | self.sum_metric += np.sum(pred_label.flat == label.flat) 85 | self.num_inst += len(pred_label.flat) 86 | 87 | 88 | class RPNLogLossMetric(mx.metric.EvalMetric): 89 | def __init__(self): 90 | super(RPNLogLossMetric, self).__init__('RPNLogLoss') 91 | self.pred, self.label = get_rpn_names() 92 | 93 | def update(self, labels, preds): 94 | pred = preds[self.pred.index('rpn_cls_prob')] 95 | label = labels[self.label.index('rpn_label')] 96 | 97 | # label (b, p) 98 | label = label.asnumpy().astype('int32').reshape((-1)) 99 | # pred (b, c, p) or (b, c, h, w) --> (b, p, c) --> (b*p, c) 100 | pred = pred.asnumpy().reshape((pred.shape[0], pred.shape[1], -1)).transpose((0, 2, 1)) 101 | pred = pred.reshape((label.shape[0], -1)) 102 | 103 | # filter with keep_inds 104 | keep_inds = np.where(label != -1)[0] 105 | label = label[keep_inds] 106 | cls = pred[keep_inds, label] 107 | 108 | cls += 1e-14 109 | cls_loss = -1 * np.log(cls) 110 | cls_loss = np.sum(cls_loss) 111 | self.sum_metric += cls_loss 112 | self.num_inst += label.shape[0] 113 | 114 | 115 | class RCNNLogLossMetric(mx.metric.EvalMetric): 116 | def __init__(self, cfg): 117 | super(RCNNLogLossMetric, self).__init__('RCNNLogLoss') 118 | self.e2e = cfg.TRAIN.END2END 119 | self.ohem = cfg.TRAIN.ENABLE_OHEM 120 | self.pred, self.label = get_rcnn_names(cfg) 121 | 122 | def update(self, labels, preds): 123 | pred = preds[self.pred.index('rcnn_cls_prob')] 124 | if self.ohem or self.e2e: 125 | label = preds[self.pred.index('rcnn_label')] 126 | else: 127 | label = labels[self.label.index('rcnn_label')] 128 | 129 | last_dim = pred.shape[-1] 130 | pred = pred.asnumpy().reshape(-1, last_dim) 131 | label = label.asnumpy().reshape(-1,).astype('int32') 132 | 133 | # filter with keep_inds 134 | keep_inds = np.where(label != -1)[0] 135 | label = label[keep_inds] 136 | cls = pred[keep_inds, label] 137 | 138 | cls += 1e-14 139 | cls_loss = -1 * np.log(cls) 140 | cls_loss = np.sum(cls_loss) 141 | self.sum_metric += cls_loss 142 | self.num_inst += label.shape[0] 143 | 144 | class RCNNOccludedLossMetric(mx.metric.EvalMetric): 145 | def __init__(self, cfg): 146 | super(RCNNOccludedLossMetric, self).__init__('RCNNOccludedLoss') 147 | self.e2e = cfg.TRAIN.END2END 148 | self.ohem = cfg.TRAIN.ENABLE_OHEM 149 | self.pred, self.label = get_rcnn_names(cfg) 150 | 151 | def update(self, labels, preds): 152 | pred = preds[self.pred.index('cls_occluded_prob')] 153 | label = preds[self.pred.index('occluded_label')] 154 | 155 | last_dim = pred.shape[-1] 156 | pred = pred.asnumpy().reshape(-1, last_dim) 157 | label = label.asnumpy().reshape(-1,).astype('int32') 158 | 159 | # filter with keep_inds 160 | keep_inds = np.where(label != -1)[0] 161 | label = label[keep_inds] 162 | cls = pred[keep_inds, label] 163 | 164 | cls += 1e-14 165 | cls_loss = -1 * np.log(cls) 166 | cls_loss = np.sum(cls_loss) 167 | self.sum_metric += cls_loss 168 | self.num_inst += label.shape[0] 169 | 170 | class RCNNOccludedAccMetric(mx.metric.EvalMetric): 171 | def __init__(self, cfg): 172 | super(RCNNOccludedAccMetric, 
self).__init__('RCNNOccludedAcc') 173 | self.e2e = cfg.TRAIN.END2END 174 | self.ohem = cfg.TRAIN.ENABLE_OHEM 175 | self.pred, self.label = get_rcnn_names(cfg) 176 | 177 | def update(self, labels, preds): 178 | pred = preds[self.pred.index('cls_occluded_prob')] 179 | label = preds[self.pred.index('occluded_label')] 180 | 181 | last_dim = pred.shape[-1] 182 | pred_label = pred.asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') 183 | label = label.asnumpy().reshape(-1,).astype('int32') 184 | 185 | # filter with keep_inds 186 | keep_inds = np.where(label != -1) 187 | pred_label = pred_label[keep_inds] 188 | label = label[keep_inds] 189 | 190 | self.sum_metric += np.sum(pred_label.flat == label.flat) 191 | self.num_inst += len(pred_label.flat) 192 | 193 | class RPNL1LossMetric(mx.metric.EvalMetric): 194 | def __init__(self): 195 | super(RPNL1LossMetric, self).__init__('RPNL1Loss') 196 | self.pred, self.label = get_rpn_names() 197 | 198 | def update(self, labels, preds): 199 | bbox_loss = preds[self.pred.index('rpn_bbox_loss')].asnumpy() 200 | 201 | # calculate num_inst (average on those kept anchors) 202 | label = labels[self.label.index('rpn_label')].asnumpy() 203 | num_inst = np.sum(label != -1) 204 | 205 | self.sum_metric += np.sum(bbox_loss) 206 | self.num_inst += num_inst 207 | 208 | 209 | class RCNNL1LossMetric(mx.metric.EvalMetric): 210 | def __init__(self, cfg): 211 | super(RCNNL1LossMetric, self).__init__('RCNNL1Loss') 212 | self.e2e = cfg.TRAIN.END2END 213 | self.ohem = cfg.TRAIN.ENABLE_OHEM 214 | self.pred, self.label = get_rcnn_names(cfg) 215 | 216 | def update(self, labels, preds): 217 | bbox_loss = preds[self.pred.index('rcnn_bbox_loss')].asnumpy() 218 | if self.ohem: 219 | label = preds[self.pred.index('rcnn_label')].asnumpy() 220 | else: 221 | if self.e2e: 222 | label = preds[self.pred.index('rcnn_label')].asnumpy() 223 | else: 224 | label = labels[self.label.index('rcnn_label')].asnumpy() 225 | 226 | # calculate num_inst (average on those kept anchors) 227 | num_inst = np.sum(label != -1) 228 | 229 | self.sum_metric += np.sum(bbox_loss) 230 | self.num_inst += num_inst 231 | 232 | class DELTAL1LossMetric(mx.metric.EvalMetric): 233 | def __init__(self, cfg): 234 | super(DELTAL1LossMetric, self).__init__('DELTAL1Loss') 235 | self.e2e = cfg.TRAIN.END2END 236 | self.ohem = cfg.TRAIN.ENABLE_OHEM 237 | self.pred, self.label = get_rcnn_names(cfg) 238 | 239 | def update(self, labels, preds): 240 | delta_loss = preds[self.pred.index('delta_loss')].asnumpy() 241 | label = preds[self.pred.index('delta_label')].asnumpy() 242 | 243 | # calculate num_inst (average on those kept anchors) 244 | num_inst = np.sum(label != -1) 245 | 246 | self.sum_metric += np.sum(delta_loss) 247 | self.num_inst += num_inst 248 | -------------------------------------------------------------------------------- /manet_rfcn/core/rcnn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fully Motion-Aware Network for Video Object Detection 3 | # Licensed under The Apache-2.0 License [see LICENSE for details] 4 | # Extend FGFA by adding instance-level aggregation and motion pattern reasoning 5 | # Modified by Shiyao Wang 6 | # -------------------------------------------------------- 7 | """ 8 | Fast R-CNN: 9 | data = 10 | {'data': [num_images, c, h, w], 11 | 'rois': [num_rois, 5]} 12 | label = 13 | {'label': [num_rois], 14 | 'bbox_target': [num_rois, 4 * num_classes], 15 | 'bbox_weight': [num_rois, 4 * 
num_classes]} 16 | roidb extended format [image_index] 17 | ['image', 'height', 'width', 'flipped', 18 | 'boxes', 'gt_classes', 'gt_overlaps', 'max_classes', 'max_overlaps', 'bbox_targets'] 19 | """ 20 | 21 | import numpy as np 22 | import numpy.random as npr 23 | 24 | from utils.image import get_image, tensor_vstack 25 | from bbox.bbox_transform import bbox_overlaps, bbox_transform 26 | from bbox.bbox_regression import expand_bbox_regression_targets 27 | 28 | 29 | def get_rcnn_testbatch(roidb, cfg): 30 | """ 31 | return a dict of testbatch 32 | :param roidb: ['image', 'flipped'] + ['boxes'] 33 | :return: data, label, im_info 34 | """ 35 | # assert len(roidb) == 1, 'Single batch only' 36 | imgs, roidb = get_image(roidb, cfg) 37 | im_array = imgs 38 | im_info = [np.array([roidb[i]['im_info']], dtype=np.float32) for i in range(len(roidb))] 39 | 40 | im_rois = [roidb[i]['boxes'] for i in range(len(roidb))] 41 | rois = im_rois 42 | rois_array = [np.hstack((0 * np.ones((rois[i].shape[0], 1)), rois[i])) for i in range(len(rois))] 43 | 44 | data = [{'data': im_array[i], 45 | 'rois': rois_array[i]} for i in range(len(roidb))] 46 | label = {} 47 | 48 | return data, label, im_info 49 | 50 | 51 | def get_rcnn_batch(roidb, cfg): 52 | """ 53 | return a dict of multiple images 54 | :param roidb: a list of dict, whose length controls batch size 55 | ['images', 'flipped'] + ['gt_boxes', 'boxes', 'gt_overlap'] => ['bbox_targets'] 56 | :return: data, label 57 | """ 58 | num_images = len(roidb) 59 | imgs, roidb = get_image(roidb, cfg) 60 | im_array = tensor_vstack(imgs) 61 | 62 | assert cfg.TRAIN.BATCH_ROIS == -1 or cfg.TRAIN.BATCH_ROIS % cfg.TRAIN.BATCH_IMAGES == 0, \ 63 | 'BATCHIMAGES {} must divide BATCH_ROIS {}'.format(cfg.TRAIN.BATCH_IMAGES, cfg.TRAIN.BATCH_ROIS) 64 | 65 | if cfg.TRAIN.BATCH_ROIS == -1: 66 | rois_per_image = np.sum([iroidb['boxes'].shape[0] for iroidb in roidb]) 67 | fg_rois_per_image = rois_per_image 68 | else: 69 | rois_per_image = cfg.TRAIN.BATCH_ROIS / cfg.TRAIN.BATCH_IMAGES 70 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image).astype(int) 71 | 72 | rois_array = list() 73 | labels_array = list() 74 | bbox_targets_array = list() 75 | bbox_weights_array = list() 76 | 77 | for im_i in range(num_images): 78 | roi_rec = roidb[im_i] 79 | 80 | # infer num_classes from gt_overlaps 81 | num_classes = roi_rec['gt_overlaps'].shape[1] 82 | 83 | # label = class RoI has max overlap with 84 | rois = roi_rec['boxes'] 85 | labels = roi_rec['max_classes'] 86 | overlaps = roi_rec['max_overlaps'] 87 | bbox_targets = roi_rec['bbox_targets'] 88 | 89 | im_rois, labels, bbox_targets, bbox_weights = \ 90 | sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes, cfg, 91 | labels, overlaps, bbox_targets) 92 | 93 | # project im_rois 94 | # do not round roi 95 | rois = im_rois 96 | batch_index = im_i * np.ones((rois.shape[0], 1)) 97 | rois_array_this_image = np.hstack((batch_index, rois)) 98 | rois_array.append(rois_array_this_image) 99 | 100 | # add labels 101 | labels_array.append(labels) 102 | bbox_targets_array.append(bbox_targets) 103 | bbox_weights_array.append(bbox_weights) 104 | 105 | rois_array = np.array(rois_array) 106 | labels_array = np.array(labels_array) 107 | bbox_targets_array = np.array(bbox_targets_array) 108 | bbox_weights_array = np.array(bbox_weights_array) 109 | 110 | data = {'data': im_array, 111 | 'rois': rois_array} 112 | label = {'label': labels_array, 113 | 'bbox_target': bbox_targets_array, 114 | 'bbox_weight': bbox_weights_array} 115 | 116 | return 
data, label 117 | 118 | 119 | def sample_rois(rois, delta_list, fg_rois_per_image, rois_per_image, num_classes, cfg, 120 | labels=None, overlaps=None, bbox_targets=None, gt_boxes=None, occluded=None): 121 | """ 122 | generate random sample of ROIs comprising foreground and background examples 123 | :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index 124 | :param fg_rois_per_image: foreground roi number 125 | :param rois_per_image: total roi number 126 | :param num_classes: number of classes 127 | :param labels: maybe precomputed 128 | :param overlaps: maybe precomputed (max_overlaps) 129 | :param bbox_targets: maybe precomputed 130 | :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls) 131 | :return: (labels, rois, bbox_targets, bbox_weights) 132 | """ 133 | #print 'rois shape is : ', rois.shape 134 | #print 'delta_list shape is : ', delta_list.shape 135 | #print 'gt_boxes shape is : ', gt_boxes.shape 136 | if labels is None: 137 | overlaps = bbox_overlaps(rois[:, 1:].astype(np.float), gt_boxes[:, :4].astype(np.float)) 138 | gt_assignment = overlaps.argmax(axis=1) 139 | overlaps = overlaps.max(axis=1) 140 | labels = gt_boxes[gt_assignment, 4] 141 | occluded_label = occluded[gt_assignment] 142 | delta_list_shape = delta_list.shape 143 | delta_bef = delta_list[0:delta_list_shape[0]/2] 144 | delta_aft = delta_list[delta_list_shape[0]/2: delta_list_shape[0]] 145 | bef_label = delta_bef[gt_assignment,:] 146 | aft_label = delta_aft[gt_assignment,:] 147 | 148 | # foreground RoI with FG_THRESH overlap 149 | fg_indexes = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 150 | # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs 151 | fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size) 152 | # Sample foreground regions without replacement 153 | if len(fg_indexes) > fg_rois_per_this_image: 154 | fg_indexes = npr.choice(fg_indexes, size=fg_rois_per_this_image, replace=False) 155 | 156 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 157 | bg_indexes = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 158 | # Compute number of background RoIs to take from this image (guarding against there being fewer than desired) 159 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 160 | bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_indexes.size) 161 | # Sample foreground regions without replacement 162 | if len(bg_indexes) > bg_rois_per_this_image: 163 | bg_indexes = npr.choice(bg_indexes, size=bg_rois_per_this_image, replace=False) 164 | 165 | # indexes selected 166 | keep_indexes = np.append(fg_indexes, bg_indexes) 167 | 168 | # pad more to ensure a fixed minibatch size 169 | while keep_indexes.shape[0] < rois_per_image: 170 | gap = np.minimum(len(rois), rois_per_image - keep_indexes.shape[0]) 171 | gap_indexes = npr.choice(range(len(rois)), size=gap, replace=False) 172 | keep_indexes = np.append(keep_indexes, gap_indexes) 173 | 174 | # select labels 175 | labels = labels[keep_indexes] 176 | occluded_label = occluded_label[keep_indexes] 177 | bef_label = bef_label[keep_indexes] 178 | aft_label = aft_label[keep_indexes] 179 | #print 'bef_label: ', bef_label[:3] 180 | #print 'aft_label: ', aft_label[:3] 181 | # set labels of bg_rois to be 0 182 | labels[fg_rois_per_this_image:] = 0 183 | occluded_label[fg_rois_per_this_image:] = -1 184 | rois = rois[keep_indexes] 185 | 186 | delta_label = np.append(bef_label, aft_label, axis=0) 187 | 188 | # load or 
compute bbox_target 189 | if bbox_targets is not None: 190 | bbox_target_data = bbox_targets[keep_indexes, :] 191 | else: 192 | targets = bbox_transform(rois[:, 1:], gt_boxes[gt_assignment[keep_indexes], :4]) 193 | if cfg.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: 194 | targets = ((targets - np.array(cfg.TRAIN.BBOX_MEANS)) 195 | / np.array(cfg.TRAIN.BBOX_STDS)) 196 | bbox_target_data = np.hstack((labels[:, np.newaxis], targets)) 197 | 198 | bbox_targets, bbox_weights, delta_weights = \ 199 | expand_bbox_regression_targets(bbox_target_data, num_classes, cfg) 200 | 201 | delta_weights = np.tile(delta_weights, reps=(2,1)) 202 | count = 0 203 | for item in delta_label: 204 | if (item==0).all(): 205 | delta_weights[count,:] = 0 206 | count+=1 207 | 208 | return rois, labels, bbox_targets, bbox_weights, delta_label, delta_weights, occluded_label 209 | 210 | -------------------------------------------------------------------------------- /manet_rfcn/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/manet_rfcn/function/__init__.py -------------------------------------------------------------------------------- /manet_rfcn/function/test_rcnn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuqing Zhu, Shuhao Fu, Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | import argparse 10 | import pprint 11 | import logging 12 | import time 13 | import os 14 | import numpy as np 15 | import mxnet as mx 16 | 17 | from symbols import * 18 | from dataset import * 19 | from core.loader import TestLoader 20 | from core.tester import Predictor, pred_eval, pred_eval_multiprocess 21 | from utils.load_model import load_param 22 | 23 | def get_predictor(sym, sym_instance, cfg, arg_params, aux_params, test_data, ctx): 24 | # infer shape 25 | data_shape_dict = dict(test_data.provide_data_single) 26 | sym_instance.infer_shape(data_shape_dict) 27 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict, is_train=False) 28 | 29 | # decide maximum shape 30 | data_names = [k[0] for k in test_data.provide_data_single] 31 | label_names = None 32 | max_data_shape = [[('data', (1, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES]))), 33 | ('data_cache', (19, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES]))), 34 | ]] 35 | 36 | # create predictor 37 | predictor = Predictor(sym, data_names, label_names, 38 | context=ctx, max_data_shapes=max_data_shape, 39 | provide_data=test_data.provide_data, provide_label=test_data.provide_label, 40 | arg_params=arg_params, aux_params=aux_params) 41 | return predictor 42 | 43 | def test_rcnn(cfg, dataset, image_set, root_path, dataset_path, motion_iou_path, 44 | ctx, prefix, epoch, 45 | vis, ignore_cache, shuffle, has_rpn, proposal, thresh, logger=None, output_path=None, enable_detailed_eval=True): 46 | if not logger: 47 | assert False, 'require a logger' 48 | 49 | # print cfg 50 | pprint.pprint(cfg) 51 | logger.info('testing cfg:{}\n'.format(pprint.pformat(cfg))) 52 | 53 | # load symbol and testing data 54 | 55 | feat_sym_instance = 
eval(cfg.symbol + '.' + cfg.symbol)() 56 | aggr_sym_instance = eval(cfg.symbol + '.' + cfg.symbol)() 57 | 58 | feat_sym = feat_sym_instance.get_feat_symbol(cfg) 59 | aggr_sym = aggr_sym_instance.get_aggregation_symbol(cfg) 60 | 61 | imdb = eval(dataset)(image_set, root_path, dataset_path, motion_iou_path, result_path=output_path, enable_detailed_eval=enable_detailed_eval) 62 | roidb = imdb.gt_roidb() 63 | 64 | # get test data iter 65 | # split roidbs 66 | gpu_num = len(ctx) 67 | roidbs = [[] for x in range(gpu_num)] 68 | roidbs_seg_lens = np.zeros(gpu_num, dtype=np.int) 69 | for x in roidb: 70 | gpu_id = np.argmin(roidbs_seg_lens) 71 | roidbs[gpu_id].append(x) 72 | roidbs_seg_lens[gpu_id] += x['frame_seg_len'] 73 | 74 | # get test data iter 75 | test_datas = [TestLoader(x, cfg, batch_size=1, shuffle=shuffle, has_rpn=has_rpn) for x in roidbs] 76 | 77 | # load model 78 | arg_params, aux_params = load_param(prefix, epoch, process=True) 79 | 80 | # create predictor 81 | feat_predictors = [get_predictor(feat_sym, feat_sym_instance, cfg, arg_params, aux_params, test_datas[i], [ctx[i]]) for i in range(gpu_num)] 82 | aggr_predictors = [get_predictor(aggr_sym, aggr_sym_instance, cfg, arg_params, aux_params, test_datas[i], [ctx[i]]) for i in range(gpu_num)] 83 | 84 | # start detection 85 | pred_eval_multiprocess(gpu_num, feat_predictors, aggr_predictors, test_datas, imdb, cfg, vis=vis, ignore_cache=ignore_cache, thresh=thresh, logger=logger) 86 | -------------------------------------------------------------------------------- /manet_rfcn/function/test_rpn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # #Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | import argparse 10 | import pprint 11 | import logging 12 | import mxnet as mx 13 | 14 | from symbols import * 15 | from dataset import * 16 | from core.loader import TestLoader 17 | from core.tester import Predictor, generate_proposals 18 | from utils.load_model import load_param 19 | 20 | 21 | def test_rpn(cfg, dataset, image_set, root_path, dataset_path, 22 | ctx, prefix, epoch, 23 | vis, shuffle, thresh, logger=None, output_path=None): 24 | # set up logger 25 | if not logger: 26 | logging.basicConfig() 27 | logger = logging.getLogger() 28 | logger.setLevel(logging.INFO) 29 | 30 | # rpn generate proposal cfg 31 | cfg.TEST.HAS_RPN = True 32 | 33 | # print cfg 34 | pprint.pprint(cfg) 35 | logger.info('testing rpn cfg:{}\n'.format(pprint.pformat(cfg))) 36 | 37 | # load symbol 38 | sym_instance = eval(cfg.symbol + '.' 
+ cfg.symbol)() 39 | sym = sym_instance.get_symbol_rpn(cfg, is_train=False) 40 | 41 | # load dataset and prepare imdb for training 42 | imdb = eval(dataset)(image_set, root_path, dataset_path, result_path=output_path) 43 | roidb = imdb.gt_roidb() 44 | test_data = TestLoader(roidb, cfg, batch_size=len(ctx), shuffle=shuffle, has_rpn=True) 45 | 46 | # load model 47 | arg_params, aux_params = load_param(prefix, epoch) 48 | 49 | # infer shape 50 | data_shape_dict = dict(test_data.provide_data_single) 51 | sym_instance.infer_shape(data_shape_dict) 52 | 53 | # check parameters 54 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict, is_train=False) 55 | 56 | # decide maximum shape 57 | data_names = [k[0] for k in test_data.provide_data[0]] 58 | label_names = None if test_data.provide_label[0] is None else [k[0] for k in test_data.provide_label[0]] 59 | max_data_shape = [[('data', (1, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES])))]] 60 | 61 | # create predictor 62 | predictor = Predictor(sym, data_names, label_names, 63 | context=ctx, max_data_shapes=max_data_shape, 64 | provide_data=test_data.provide_data, provide_label=test_data.provide_label, 65 | arg_params=arg_params, aux_params=aux_params) 66 | 67 | # start testing 68 | imdb_boxes = generate_proposals(predictor, test_data, imdb, cfg, vis=vis, thresh=thresh) 69 | 70 | all_log_info = imdb.evaluate_recall(roidb, candidate_boxes=imdb_boxes) 71 | logger.info(all_log_info) 72 | -------------------------------------------------------------------------------- /manet_rfcn/function/train_rcnn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # #Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | import argparse 10 | import logging 11 | import pprint 12 | import os 13 | import mxnet as mx 14 | 15 | from symbols import * 16 | from core import callback, metric 17 | from core.loader import ROIIter 18 | from core.module import MutableModule 19 | from bbox.bbox_regression import add_bbox_regression_targets 20 | from utils.load_data import load_proposal_roidb, merge_roidb, filter_roidb 21 | from utils.load_model import load_param 22 | from utils.PrefetchingIter import PrefetchingIter 23 | from utils.lr_scheduler import WarmupMultiFactorScheduler 24 | 25 | 26 | def train_rcnn(cfg, dataset, image_set, root_path, dataset_path, 27 | frequent, kvstore, flip, shuffle, resume, 28 | ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, 29 | train_shared, lr, lr_step, proposal, logger=None, output_path=None): 30 | # set up logger 31 | if not logger: 32 | logging.basicConfig() 33 | logger = logging.getLogger() 34 | logger.setLevel(logging.INFO) 35 | 36 | # load symbol 37 | sym_instance = eval(cfg.symbol + '.' 
+ cfg.symbol)() 38 | sym = sym_instance.get_symbol_rfcn(cfg, is_train=True) 39 | 40 | # setup multi-gpu 41 | batch_size = len(ctx) 42 | input_batch_size = cfg.TRAIN.BATCH_IMAGES * batch_size 43 | 44 | # print cfg 45 | pprint.pprint(cfg) 46 | logger.info('training rcnn cfg:{}\n'.format(pprint.pformat(cfg))) 47 | 48 | # load dataset and prepare imdb for training 49 | image_sets = [iset for iset in image_set.split('+')] 50 | roidbs = [load_proposal_roidb(dataset, image_set, root_path, dataset_path, 51 | proposal=proposal, append_gt=True, flip=flip, result_path=output_path) 52 | for image_set in image_sets] 53 | roidb = merge_roidb(roidbs) 54 | roidb = filter_roidb(roidb, cfg) 55 | means, stds = add_bbox_regression_targets(roidb, cfg) 56 | 57 | # load training data 58 | train_data = ROIIter(roidb, cfg, batch_size=input_batch_size, shuffle=shuffle, 59 | ctx=ctx, aspect_grouping=cfg.TRAIN.ASPECT_GROUPING) 60 | 61 | # infer max shape 62 | max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES])))] 63 | 64 | # infer shape 65 | data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) 66 | sym_instance.infer_shape(data_shape_dict) 67 | 68 | # load and initialize params 69 | if resume: 70 | print('continue training from ', begin_epoch) 71 | arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) 72 | else: 73 | arg_params, aux_params = load_param(pretrained, epoch, convert=True) 74 | sym_instance.init_weight_rfcn(cfg, arg_params, aux_params) 75 | 76 | # check parameter shapes 77 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) 78 | 79 | # prepare training 80 | # create solver 81 | data_names = [k[0] for k in train_data.provide_data_single] 82 | label_names = [k[0] for k in train_data.provide_label_single] 83 | if train_shared: 84 | fixed_param_prefix = cfg.network.FIXED_PARAMS_SHARED 85 | else: 86 | fixed_param_prefix = cfg.network.FIXED_PARAMS 87 | mod = MutableModule(sym, data_names=data_names, label_names=label_names, 88 | logger=logger, context=ctx, 89 | max_data_shapes=[max_data_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) 90 | 91 | if cfg.TRAIN.RESUME: 92 | mod._preload_opt_states = '%s-%04d.states'%(prefix, begin_epoch) 93 | 94 | 95 | # decide training params 96 | # metric 97 | eval_metric = metric.RCNNAccMetric(cfg) 98 | cls_metric = metric.RCNNLogLossMetric(cfg) 99 | bbox_metric = metric.RCNNL1LossMetric(cfg) 100 | eval_metrics = mx.metric.CompositeEvalMetric() 101 | for child_metric in [eval_metric, cls_metric, bbox_metric]: 102 | eval_metrics.add(child_metric) 103 | # callback 104 | batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=frequent) 105 | epoch_end_callback = [mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), 106 | callback.do_checkpoint(prefix, means, stds)] 107 | # decide learning rate 108 | base_lr = lr 109 | lr_factor = cfg.TRAIN.lr_factor 110 | lr_epoch = [float(epoch) for epoch in lr_step.split(',')] 111 | lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] 112 | lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) 113 | lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] 114 | print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) 115 | lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, cfg.TRAIN.warmup, cfg.TRAIN.warmup_lr, cfg.TRAIN.warmup_step) 116 | # 
optimizer 117 | optimizer_params = {'momentum': cfg.TRAIN.momentum, 118 | 'wd': cfg.TRAIN.wd, 119 | 'learning_rate': lr, 120 | 'lr_scheduler': lr_scheduler, 121 | 'rescale_grad': 1.0, 122 | 'clip_gradient': None} 123 | 124 | # train 125 | 126 | if not isinstance(train_data, PrefetchingIter): 127 | train_data = PrefetchingIter(train_data) 128 | 129 | mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, 130 | batch_end_callback=batch_end_callback, kvstore=kvstore, 131 | optimizer='sgd', optimizer_params=optimizer_params, 132 | arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) 133 | 134 | -------------------------------------------------------------------------------- /manet_rfcn/function/train_rpn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # #Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | import argparse 10 | import logging 11 | import pprint 12 | import mxnet as mx 13 | 14 | from symbols import * 15 | from core import callback, metric 16 | from core.loader import AnchorLoader 17 | from core.module import MutableModule 18 | from utils.load_data import load_gt_roidb, merge_roidb, filter_roidb 19 | from utils.load_model import load_param 20 | from utils.PrefetchingIter import PrefetchingIter 21 | from utils.lr_scheduler import WarmupMultiFactorScheduler 22 | 23 | 24 | def train_rpn(cfg, dataset, image_set, root_path, dataset_path, 25 | frequent, kvstore, flip, shuffle, resume, 26 | ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, 27 | train_shared, lr, lr_step, logger=None, output_path=None): 28 | # set up logger 29 | if not logger: 30 | logging.basicConfig() 31 | logger = logging.getLogger() 32 | logger.setLevel(logging.INFO) 33 | 34 | # set up config 35 | cfg.TRAIN.BATCH_IMAGES = cfg.TRAIN.ALTERNATE.RPN_BATCH_IMAGES 36 | 37 | # load symbol 38 | sym_instance = eval(cfg.symbol + '.' 
+ cfg.symbol)() 39 | sym = sym_instance.get_symbol_rpn(cfg, is_train=True) 40 | feat_sym = sym.get_internals()['rpn_cls_score_output'] 41 | 42 | # setup multi-gpu 43 | batch_size = len(ctx) 44 | input_batch_size = cfg.TRAIN.BATCH_IMAGES * batch_size 45 | 46 | # print cfg 47 | pprint.pprint(cfg) 48 | logger.info('training rpn cfg:{}\n'.format(pprint.pformat(cfg))) 49 | 50 | # load dataset and prepare imdb for training 51 | image_sets = [iset for iset in image_set.split('+')] 52 | roidbs = [load_gt_roidb(dataset, image_set, root_path, dataset_path, result_path=output_path, 53 | flip=flip) 54 | for image_set in image_sets] 55 | roidb = merge_roidb(roidbs) 56 | roidb = filter_roidb(roidb, cfg) 57 | 58 | # load training data 59 | train_data = AnchorLoader(feat_sym, roidb, cfg, batch_size=input_batch_size, shuffle=shuffle, 60 | ctx=ctx, feat_stride=cfg.network.RPN_FEAT_STRIDE, anchor_scales=cfg.network.ANCHOR_SCALES, 61 | anchor_ratios=cfg.network.ANCHOR_RATIOS, aspect_grouping=cfg.TRAIN.ASPECT_GROUPING) 62 | 63 | # infer max shape 64 | max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES])))] 65 | max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) 66 | print('providing maximum shape', max_data_shape, max_label_shape) 67 | 68 | # infer shape 69 | data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) 70 | sym_instance.infer_shape(data_shape_dict) 71 | 72 | # load and initialize params 73 | if resume: 74 | print('continue training from ', begin_epoch) 75 | arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) 76 | else: 77 | arg_params, aux_params = load_param(pretrained, epoch, convert=True) 78 | sym_instance.init_weight_rpn(cfg, arg_params, aux_params) 79 | 80 | # check parameter shapes 81 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) 82 | 83 | # create solver 84 | data_names = [k[0] for k in train_data.provide_data_single] 85 | label_names = [k[0] for k in train_data.provide_label_single] 86 | if train_shared: 87 | fixed_param_prefix = cfg.network.FIXED_PARAMS_SHARED 88 | else: 89 | fixed_param_prefix = cfg.network.FIXED_PARAMS 90 | mod = MutableModule(sym, data_names=data_names, label_names=label_names, 91 | logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in xrange(batch_size)], 92 | max_label_shapes=[max_label_shape for _ in xrange(batch_size)], fixed_param_prefix=fixed_param_prefix) 93 | 94 | # decide training params 95 | # metric 96 | eval_metric = metric.RPNAccMetric() 97 | cls_metric = metric.RPNLogLossMetric() 98 | bbox_metric = metric.RPNL1LossMetric() 99 | eval_metrics = mx.metric.CompositeEvalMetric() 100 | for child_metric in [eval_metric, cls_metric, bbox_metric]: 101 | eval_metrics.add(child_metric) 102 | # callback 103 | batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=frequent) 104 | # epoch_end_callback = mx.callback.do_checkpoint(prefix) 105 | epoch_end_callback = mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True) 106 | # decide learning rate 107 | base_lr = lr 108 | lr_factor = cfg.TRAIN.lr_factor 109 | lr_epoch = [int(epoch) for epoch in lr_step.split(',')] 110 | lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] 111 | lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) 112 | lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] 113 | print('lr', lr, 'lr_epoch_diff', 
lr_epoch_diff, 'lr_iters', lr_iters) 114 | lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, cfg.TRAIN.warmup, cfg.TRAIN.warmup_lr, cfg.TRAIN.warmup_step) 115 | # optimizer 116 | optimizer_params = {'momentum': cfg.TRAIN.momentum, 117 | 'wd': cfg.TRAIN.wd, 118 | 'learning_rate': lr, 119 | 'lr_scheduler': lr_scheduler, 120 | 'rescale_grad': 1.0, 121 | 'clip_gradient': None} 122 | 123 | if not isinstance(train_data, PrefetchingIter): 124 | train_data = PrefetchingIter(train_data) 125 | 126 | # train 127 | mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, 128 | batch_end_callback=batch_end_callback, kvstore=kvstore, 129 | optimizer='sgd', optimizer_params=optimizer_params, 130 | arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) 131 | 132 | -------------------------------------------------------------------------------- /manet_rfcn/operator_cxx/psroi_pooling-inl.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 by Contributors 3 | * Copyright (c) 2017 Microsoft 4 | * Licensed under The Apache-2.0 License [see LICENSE for details] 5 | * \file psroi_pooling-inl.h 6 | * \brief psroi pooling operator and symbol 7 | * \author Yi Li, Tairui Chen, Guodong Zhang, Jifeng Dai 8 | */ 9 | #ifndef MXNET_OPERATOR_PSROI_POOLING_INL_H_ 10 | #define MXNET_OPERATOR_PSROI_POOLING_INL_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "../mshadow_op.h" 20 | #include "../operator_common.h" 21 | 22 | 23 | namespace mxnet { 24 | namespace op { 25 | 26 | // Declare enumeration of input order to make code more intuitive. 27 | // These enums are only visible within this header 28 | namespace psroipool { 29 | enum PSROIPoolingOpInputs {kData, kBox}; 30 | enum PSROIPoolingOpOutputs {kOut, kMappingChannel}; 31 | } // psroipool 32 | 33 | struct PSROIPoolingParam : public dmlc::Parameter { 34 | // TShape pooled_size; 35 | float spatial_scale; 36 | int output_dim; 37 | int pooled_size; 38 | int group_size; 39 | DMLC_DECLARE_PARAMETER(PSROIPoolingParam) { 40 | DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) 41 | .describe("Ratio of input feature map height (or w) to raw image height (or w). 
" 42 | "Equals the reciprocal of total stride in convolutional layers"); 43 | DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); 44 | DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); 45 | DMLC_DECLARE_FIELD(group_size).set_default(0).describe("fix group size"); 46 | } 47 | }; 48 | 49 | template 50 | class PSROIPoolingOp : public Operator { 51 | public: 52 | explicit PSROIPoolingOp(PSROIPoolingParam p) { 53 | this->param_ = p; 54 | } 55 | 56 | virtual void Forward(const OpContext &ctx, 57 | const std::vector &in_data, 58 | const std::vector &req, 59 | const std::vector &out_data, 60 | const std::vector &aux_args) { 61 | using namespace mshadow; 62 | size_t expected = 2; 63 | CHECK_EQ(in_data.size(), expected); 64 | CHECK_EQ(out_data.size(), expected); 65 | CHECK_EQ(out_data[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); 66 | CHECK_EQ(out_data[psroipool::kMappingChannel].shape_[0], in_data[psroipool::kBox].shape_[0]); 67 | Stream *s = ctx.get_stream(); 68 | 69 | Tensor data = in_data[psroipool::kData].get(s); 70 | Tensor bbox = in_data[psroipool::kBox].get(s); 71 | Tensor out = out_data[psroipool::kOut].get(s); 72 | Tensor mapping_channel = out_data[psroipool::kMappingChannel].get(s); 73 | CHECK_EQ(data.CheckContiguous(), true); 74 | CHECK_EQ(bbox.CheckContiguous(), true); 75 | CHECK_EQ(out.CheckContiguous(), true); 76 | CHECK_EQ(mapping_channel.CheckContiguous(), true); 77 | out = -FLT_MAX; 78 | mapping_channel = -1.0f; 79 | PSROIPoolForward(out, data, bbox, mapping_channel, param_.spatial_scale, param_.output_dim, param_.group_size); 80 | } 81 | 82 | virtual void Backward(const OpContext &ctx, 83 | const std::vector &out_grad, 84 | const std::vector &in_data, 85 | const std::vector &out_data, 86 | const std::vector &req, 87 | const std::vector &in_grad, 88 | const std::vector &aux_args) { 89 | using namespace mshadow; 90 | size_t expected = 2; 91 | CHECK_EQ(in_data.size(), expected); 92 | CHECK_EQ(out_data.size(), expected); 93 | CHECK_EQ(out_grad[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); 94 | CHECK_EQ(out_data[psroipool::kMappingChannel].shape_[0], in_data[psroipool::kBox].shape_[0]); 95 | CHECK_NE(req[psroipool::kData], kWriteInplace) << 96 | "ROIPooling: Backward doesn't support kWriteInplace."; 97 | CHECK_NE(req[psroipool::kBox], kWriteInplace) << 98 | "ROIPooling: Backward doesn't support kWriteInplace."; 99 | Stream *s = ctx.get_stream(); 100 | 101 | Tensor grad_out = out_grad[psroipool::kOut].get(s); 102 | Tensor bbox = in_data[psroipool::kBox].get(s); 103 | Tensor mapping_channel = out_data[psroipool::kMappingChannel].get(s); 104 | Tensor grad_in = in_grad[psroipool::kData].get(s); 105 | Tensor grad_roi = in_grad[psroipool::kBox].get(s); 106 | 107 | CHECK_EQ(grad_out.CheckContiguous(), true); 108 | CHECK_EQ(bbox.CheckContiguous(), true); 109 | CHECK_EQ(mapping_channel.CheckContiguous(), true); 110 | CHECK_EQ(grad_in.CheckContiguous(), true); 111 | 112 | if (kAddTo == req[psroipool::kData] || kWriteTo == req[psroipool::kData]) { 113 | if (kWriteTo == req[psroipool::kData]) { 114 | grad_in = 0.0f; 115 | } 116 | PSROIPoolBackwardAcc(grad_in, grad_out, bbox, mapping_channel, param_.spatial_scale, param_.output_dim); 117 | } 118 | if (kWriteTo == req[psroipool::kBox]) { 119 | grad_roi = 0.0f; 120 | } 121 | 122 | } 123 | 124 | private: 125 | PSROIPoolingParam param_; 126 | }; // class PSROIPoolingOp 127 | 128 | // Decalre Factory function, used for dispatch specialization 129 | template 130 | Operator* 
CreateOp(PSROIPoolingParam param, int dtype); 131 | 132 | #if DMLC_USE_CXX11 133 | class PSROIPoolingProp : public OperatorProperty { 134 | public: 135 | std::vector ListArguments() const override { 136 | return {"data", "rois"}; 137 | } 138 | 139 | std::vector ListOutputs() const override { 140 | return {"output", "maxidx"}; 141 | } 142 | 143 | int NumOutputs() const override { 144 | return 2; 145 | } 146 | 147 | int NumVisibleOutputs() const override { 148 | return 1; 149 | } 150 | 151 | void Init(const std::vector >& kwargs) override { 152 | param_.Init(kwargs); 153 | if (param_.group_size == 0) { 154 | param_.group_size = param_.pooled_size; 155 | } 156 | } 157 | 158 | std::map GetParams() const override { 159 | return param_.__DICT__(); 160 | } 161 | 162 | bool InferShape(std::vector *in_shape, 163 | std::vector *out_shape, 164 | std::vector *aux_shape) const override { 165 | using namespace mshadow; 166 | CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; 167 | 168 | // data: [batch_size, c, h, w] 169 | TShape dshape = in_shape->at(psroipool::kData); 170 | CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; 171 | 172 | // bbox: [num_rois, 5] 173 | TShape bshape = in_shape->at(psroipool::kBox); 174 | CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; 175 | CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; 176 | 177 | // out: [num_rois, c, pooled_h, pooled_w] 178 | // mapping_channel: [num_rois, c, pooled_h, pooled_w] 179 | out_shape->clear(); 180 | out_shape->push_back( 181 | Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); 182 | out_shape->push_back( 183 | Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); 184 | return true; 185 | } 186 | 187 | bool InferType(std::vector *in_type, 188 | std::vector *out_type, 189 | std::vector *aux_type) const override { 190 | CHECK_EQ(in_type->size(), 2); 191 | int dtype = (*in_type)[0]; 192 | CHECK_EQ(dtype, (*in_type)[1]); 193 | CHECK_NE(dtype, -1) << "Input must have specified type"; 194 | 195 | out_type->clear(); 196 | out_type->push_back(dtype); 197 | out_type->push_back(dtype); 198 | return true; 199 | } 200 | 201 | OperatorProperty* Copy() const override { 202 | PSROIPoolingProp* psroi_pooling_sym = new PSROIPoolingProp(); 203 | psroi_pooling_sym->param_ = this->param_; 204 | return psroi_pooling_sym; 205 | } 206 | 207 | std::string TypeString() const override { 208 | return "_contrib_PSROIPooling"; 209 | } 210 | 211 | // decalre dependency and inplace optimization options 212 | std::vector DeclareBackwardDependency( 213 | const std::vector &out_grad, 214 | const std::vector &in_data, 215 | const std::vector &out_data) const override { 216 | return {out_grad[psroipool::kOut], in_data[psroipool::kBox], out_data[psroipool::kMappingChannel]}; 217 | } 218 | 219 | 220 | Operator* CreateOperator(Context ctx) const override { 221 | LOG(FATAL) << "Not Implemented."; 222 | return NULL; 223 | } 224 | 225 | Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, 226 | std::vector *in_type) const override; 227 | 228 | 229 | private: 230 | PSROIPoolingParam param_; 231 | }; // class PSROIPoolingProp 232 | #endif 233 | } // namespace op 234 | } // namespace mxnet 235 | #endif // MXNET_OPERATOR_PSROI_POOLING_INL_H_ -------------------------------------------------------------------------------- /manet_rfcn/operator_cxx/psroi_pooling.cc: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Copyright (c) 2017 by Contributors 3 | * Copyright (c) 2017 Microsoft 4 | * Licensed under The Apache-2.0 License [see LICENSE for details] 5 | * \file psroi_pooling.cc 6 | * \brief psroi pooling operator 7 | * \author Yi Li, Tairui Chen, Guodong Zhang, Jifeng Dai 8 | */ 9 | #include "./psroi_pooling-inl.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | using std::max; 17 | using std::min; 18 | using std::floor; 19 | using std::ceil; 20 | 21 | namespace mshadow { 22 | template 23 | inline void PSROIPoolForward(const Tensor &out, 24 | const Tensor &data, 25 | const Tensor &bbox, 26 | const Tensor &mapping_channel, 27 | const float spatial_scale_, 28 | const int output_dim_, 29 | const int group_size_) { 30 | // NOT_IMPLEMENTED; 31 | return; 32 | } 33 | 34 | template 35 | inline void PSROIPoolBackwardAcc(const Tensor &in_grad, 36 | const Tensor &out_grad, 37 | const Tensor &bbox, 38 | const Tensor &mapping_channel, 39 | const float spatial_scale_, 40 | const int output_dim_) { 41 | // NOT_IMPLEMENTED; 42 | return; 43 | } 44 | } // namespace mshadow 45 | 46 | namespace mxnet { 47 | namespace op { 48 | 49 | template<> 50 | Operator *CreateOp(PSROIPoolingParam param, int dtype) { 51 | Operator* op = NULL; 52 | MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { 53 | op = new PSROIPoolingOp(param); 54 | }); 55 | return op; 56 | } 57 | 58 | Operator *PSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, 59 | std::vector *in_type) const { 60 | std::vector out_shape, aux_shape; 61 | std::vector out_type, aux_type; 62 | CHECK(InferType(in_type, &out_type, &aux_type)); 63 | CHECK(InferShape(in_shape, &out_shape, &aux_shape)); 64 | DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); 65 | } 66 | 67 | DMLC_REGISTER_PARAMETER(PSROIPoolingParam); 68 | 69 | MXNET_REGISTER_OP_PROPERTY(_contrib_PSROIPooling, PSROIPoolingProp) 70 | .describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " 71 | "spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " 72 | "by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " 73 | "the number of region bounding boxes after PSROIPooling") 74 | .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") 75 | .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " 76 | "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " 77 | "of designated region of interest. 
batch_index indicates the index of corresponding image " 78 | "in the input data") 79 | .add_arguments(PSROIPoolingParam::__FIELDS__()); 80 | } // namespace op 81 | } // namespace mxnet -------------------------------------------------------------------------------- /manet_rfcn/operator_py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangshy31/MANet_for_Video_Object_Detection/b65c2a452be6b2331e17f99117b83f803cecedf0/manet_rfcn/operator_py/__init__.py -------------------------------------------------------------------------------- /manet_rfcn/operator_py/box_annotator_ohem.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # #Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | 8 | """ 9 | Proposal Target Operator selects foreground and background roi and assigns label, bbox_transform to them. 10 | """ 11 | 12 | import mxnet as mx 13 | import numpy as np 14 | from distutils.util import strtobool 15 | 16 | 17 | 18 | 19 | class BoxAnnotatorOHEMOperator(mx.operator.CustomOp): 20 | def __init__(self, num_classes, num_reg_classes, roi_per_img): 21 | super(BoxAnnotatorOHEMOperator, self).__init__() 22 | self._num_classes = num_classes 23 | self._num_reg_classes = num_reg_classes 24 | self._roi_per_img = roi_per_img 25 | 26 | def forward(self, is_train, req, in_data, out_data, aux): 27 | 28 | cls_score = in_data[0] 29 | bbox_pred = in_data[1] 30 | labels = in_data[2].asnumpy() 31 | bbox_targets = in_data[3] 32 | bbox_weights = in_data[4] 33 | 34 | per_roi_loss_cls = mx.nd.SoftmaxActivation(cls_score) + 1e-14 35 | per_roi_loss_cls = per_roi_loss_cls.asnumpy() 36 | per_roi_loss_cls = per_roi_loss_cls[np.arange(per_roi_loss_cls.shape[0], dtype='int'), labels.astype('int')] 37 | per_roi_loss_cls = -1 * np.log(per_roi_loss_cls) 38 | per_roi_loss_cls = np.reshape(per_roi_loss_cls, newshape=(-1,)) 39 | 40 | per_roi_loss_bbox = bbox_weights * mx.nd.smooth_l1((bbox_pred - bbox_targets), scalar=1.0) 41 | per_roi_loss_bbox = mx.nd.sum(per_roi_loss_bbox, axis=1).asnumpy() 42 | 43 | top_k_per_roi_loss = np.argsort(per_roi_loss_cls + per_roi_loss_bbox) 44 | labels_ohem = labels 45 | labels_ohem[top_k_per_roi_loss[::-1][self._roi_per_img:]] = -1 46 | bbox_weights_ohem = bbox_weights.asnumpy() 47 | bbox_weights_ohem[top_k_per_roi_loss[::-1][self._roi_per_img:]] = 0 48 | 49 | labels_ohem = mx.nd.array(labels_ohem) 50 | bbox_weights_ohem = mx.nd.array(bbox_weights_ohem) 51 | 52 | for ind, val in enumerate([labels_ohem, bbox_weights_ohem]): 53 | self.assign(out_data[ind], req[ind], val) 54 | 55 | 56 | def backward(self, req, out_grad, in_data, out_data, in_grad, aux): 57 | for i in range(len(in_grad)): 58 | self.assign(in_grad[i], req[i], 0) 59 | 60 | 61 | @mx.operator.register('BoxAnnotatorOHEM') 62 | class BoxAnnotatorOHEMProp(mx.operator.CustomOpProp): 63 | def __init__(self, num_classes, num_reg_classes, roi_per_img): 64 | super(BoxAnnotatorOHEMProp, self).__init__(need_top_grad=False) 65 | self._num_classes = int(num_classes) 66 | self._num_reg_classes = int(num_reg_classes) 67 | self._roi_per_img = int(roi_per_img) 68 | 69 | def list_arguments(self): 70 | return ['cls_score', 'bbox_pred', 'labels', 'bbox_targets', 'bbox_weights'] 71 | 72 | def 
list_outputs(self): 73 | return ['labels_ohem', 'bbox_weights_ohem'] 74 | 75 | def infer_shape(self, in_shape): 76 | labels_shape = in_shape[2] 77 | bbox_weights_shape = in_shape[4] 78 | 79 | return in_shape, \ 80 | [labels_shape, bbox_weights_shape] 81 | 82 | def create_operator(self, ctx, shapes, dtypes): 83 | return BoxAnnotatorOHEMOperator(self._num_classes, self._num_reg_classes, self._roi_per_img) 84 | 85 | def declare_backward_dependency(self, out_grad, in_data, out_data): 86 | return [] 87 | -------------------------------------------------------------------------------- /manet_rfcn/operator_py/proposal_target.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fully Motion-Aware Network for Video Object Detection 3 | # Licensed under The Apache-2.0 License [see LICENSE for details] 4 | # Extend FGFA by adding instance-level aggregation and motion pattern reasoning 5 | # Modified by Shiyao Wang 6 | # -------------------------------------------------------- 7 | 8 | 9 | 10 | """ 11 | Proposal Target Operator selects foreground and background roi and assigns label, bbox_transform to them. 12 | """ 13 | 14 | import mxnet as mx 15 | import numpy as np 16 | from distutils.util import strtobool 17 | from easydict import EasyDict as edict 18 | import cPickle 19 | 20 | 21 | from core.rcnn import sample_rois 22 | 23 | DEBUG = False 24 | 25 | 26 | class ProposalTargetOperator(mx.operator.CustomOp): 27 | def __init__(self, num_classes, batch_images, batch_rois, cfg, fg_fraction): 28 | super(ProposalTargetOperator, self).__init__() 29 | self._num_classes = num_classes 30 | self._batch_images = batch_images 31 | self._batch_rois = batch_rois 32 | self._cfg = cfg 33 | self._fg_fraction = fg_fraction 34 | 35 | if DEBUG: 36 | self._count = 0 37 | self._fg_num = 0 38 | self._bg_num = 0 39 | 40 | def forward(self, is_train, req, in_data, out_data, aux): 41 | assert self._batch_rois == -1 or self._batch_rois % self._batch_images == 0, \ 42 | 'batchimages {} must devide batch_rois {}'.format(self._batch_images, self._batch_rois) 43 | all_rois = in_data[0].asnumpy() 44 | gt_boxes = in_data[1].asnumpy() 45 | delta_list = in_data[2].asnumpy() 46 | occluded = in_data[3].asnumpy() 47 | 48 | if self._batch_rois == -1: 49 | rois_per_image = all_rois.shape[0] + gt_boxes.shape[0] 50 | fg_rois_per_image = rois_per_image 51 | else: 52 | rois_per_image = self._batch_rois / self._batch_images 53 | fg_rois_per_image = np.round(self._fg_fraction * rois_per_image).astype(int) 54 | 55 | 56 | # Include ground-truth boxes in the set of candidate rois 57 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 58 | all_rois = np.vstack((all_rois, np.hstack((zeros, gt_boxes[:, :-1])))) 59 | # Sanity check: single batch only 60 | assert np.all(all_rois[:, 0] == 0), 'Only single item batches are supported' 61 | 62 | rois, labels, bbox_targets, bbox_weights, delta_label, delta_weights, occluded_label = \ 63 | sample_rois(all_rois, delta_list, fg_rois_per_image, rois_per_image, self._num_classes, self._cfg, gt_boxes=gt_boxes, occluded=occluded) 64 | 65 | if DEBUG: 66 | print "labels=", labels 67 | print 'num fg: {}'.format((labels > 0).sum()) 68 | print 'num bg: {}'.format((labels == 0).sum()) 69 | self._count += 1 70 | self._fg_num += (labels > 0).sum() 71 | self._bg_num += (labels == 0).sum() 72 | print "self._count=", self._count 73 | print 'num fg avg: {}'.format(self._fg_num / self._count) 74 | print 'num bg avg: 
{}'.format(self._bg_num / self._count) 75 | print 'ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num)) 76 | 77 | for ind, val in enumerate([rois, labels, bbox_targets, bbox_weights, delta_label, delta_weights, occluded_label]): 78 | self.assign(out_data[ind], req[ind], val) 79 | 80 | def backward(self, req, out_grad, in_data, out_data, in_grad, aux): 81 | self.assign(in_grad[0], req[0], 0) 82 | self.assign(in_grad[1], req[1], 0) 83 | self.assign(in_grad[2], req[2], 0) 84 | self.assign(in_grad[3], req[3], 0) 85 | 86 | 87 | @mx.operator.register('proposal_target') 88 | class ProposalTargetProp(mx.operator.CustomOpProp): 89 | def __init__(self, num_classes, batch_images, batch_rois, cfg, fg_fraction='0.25'): 90 | super(ProposalTargetProp, self).__init__(need_top_grad=False) 91 | self._num_classes = int(num_classes) 92 | self._batch_images = int(batch_images) 93 | self._batch_rois = int(batch_rois) 94 | self._cfg = cPickle.loads(cfg) 95 | self._fg_fraction = float(fg_fraction) 96 | 97 | def list_arguments(self): 98 | return ['rois', 'gt_boxes', 'delta_list', 'occluded'] 99 | 100 | def list_outputs(self): 101 | return ['rois_output', 'label', 'bbox_target', 'bbox_weight', 'delta_label', 'delta_weight', 'occluded_label'] 102 | 103 | def infer_shape(self, in_shape): 104 | rpn_rois_shape = in_shape[0] 105 | gt_boxes_shape = in_shape[1] 106 | delta_list_shape = in_shape[2] 107 | occluded_shape = in_shape[3] 108 | 109 | rois = rpn_rois_shape[0] + gt_boxes_shape[0] if self._batch_rois == -1 else self._batch_rois 110 | 111 | output_rois_shape = (rois, 5) 112 | label_shape = (rois, ) 113 | occluded_label_shape = (rois, ) 114 | bbox_target_shape = (rois, self._num_classes * 4) 115 | bbox_weight_shape = (rois, self._num_classes * 4) 116 | delta_label_shape = (rois*2, 4) 117 | delta_weight_shape = (rois*2, 8) 118 | 119 | return [rpn_rois_shape, gt_boxes_shape, delta_list_shape, occluded_shape], \ 120 | [output_rois_shape, label_shape, bbox_target_shape, bbox_weight_shape, delta_label_shape, delta_weight_shape, occluded_label_shape] 121 | 122 | def create_operator(self, ctx, shapes, dtypes): 123 | return ProposalTargetOperator(self._num_classes, self._batch_images, self._batch_rois, self._cfg, self._fg_fraction) 124 | 125 | def declare_backward_dependency(self, out_grad, in_data, out_data): 126 | return [] 127 | -------------------------------------------------------------------------------- /manet_rfcn/operator_py/rpn_inv_normalize.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Xizhou Zhu 6 | # -------------------------------------------------------- 7 | 8 | import mxnet as mx 9 | import numpy as np 10 | from distutils.util import strtobool 11 | 12 | class RPNInvNormalizeOperator(mx.operator.CustomOp): 13 | def __init__(self, num_anchors, bbox_mean, bbox_std): 14 | super(RPNInvNormalizeOperator, self).__init__() 15 | self._num_anchors = num_anchors 16 | self._bbox_mean = mx.ndarray.Reshape(mx.nd.array(bbox_mean), shape=(1,4,1,1)) 17 | self._bbox_std = mx.ndarray.Reshape(mx.nd.array(bbox_std), shape=(1,4,1,1)) 18 | 19 | def forward(self, is_train, req, in_data, out_data, aux): 20 | bbox_pred = in_data[0] 21 | tile_shape = (bbox_pred.shape[0], self._num_anchors, bbox_pred.shape[2], bbox_pred.shape[3]) 22 | bbox_mean = 
mx.ndarray.tile(self._bbox_mean.as_in_context(bbox_pred.context), reps=tile_shape) 23 | bbox_std = mx.ndarray.tile(self._bbox_std.as_in_context(bbox_pred.context), reps=tile_shape) 24 | bbox_pred = bbox_pred * bbox_std + bbox_mean 25 | 26 | self.assign(out_data[0], req[0], bbox_pred) 27 | 28 | def backward(self, req, out_grad, in_data, out_data, in_grad, aux): 29 | self.assign(in_grad[0], req[0], 0) 30 | 31 | @mx.operator.register('rpn_inv_normalize') 32 | class RPNInvNormalizeProp(mx.operator.CustomOpProp): 33 | def __init__(self, num_anchors, bbox_mean='(0.0, 0.0, 0.0, 0.0)', bbox_std='0.1, 0.1, 0.2, 0.2'): 34 | super(RPNInvNormalizeProp, self).__init__(need_top_grad=False) 35 | self._num_anchors = int(num_anchors) 36 | self._bbox_mean = np.fromstring(bbox_mean[1:-1], dtype=float, sep=',') 37 | self._bbox_std = np.fromstring(bbox_std[1:-1], dtype=float, sep=',') 38 | 39 | def list_arguments(self): 40 | return ['bbox_pred'] 41 | 42 | def list_outputs(self): 43 | return ['out_bbox_pred'] 44 | 45 | def infer_shape(self, in_shape): 46 | 47 | return [in_shape[0]], \ 48 | [in_shape[0]] 49 | 50 | def create_operator(self, ctx, shapes, dtypes): 51 | return RPNInvNormalizeOperator(self._num_anchors, self._bbox_mean, self._bbox_std) 52 | 53 | def declare_backward_dependency(self, out_grad, in_data, out_data): 54 | return [] 55 | -------------------------------------------------------------------------------- /manet_rfcn/operator_py/tile_as.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | #Flow-Guided-Feature-Aggregation 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Xizhou Zhu 6 | # -------------------------------------------------------- 7 | 8 | import mxnet as mx 9 | import numpy as np 10 | from distutils.util import strtobool 11 | 12 | class TileAsOperator(mx.operator.CustomOp): 13 | def __init__(self): 14 | super(TileAsOperator, self).__init__() 15 | 16 | def forward(self, is_train, req, in_data, out_data, aux): 17 | data_content = in_data[0] 18 | data_tiled = mx.ndarray.tile(data_content, reps=(in_data[1].shape[0], 1, 1, 1)) 19 | self.assign(out_data[0], req[0], data_tiled) 20 | 21 | def backward(self, req, out_grad, in_data, out_data, in_grad, aux): 22 | self.assign(in_grad[0], req[0], 0) 23 | self.assign(in_grad[1], req[1], 0) 24 | 25 | 26 | @mx.operator.register('tile_as') 27 | class TileAsProp(mx.operator.CustomOpProp): 28 | def __init__(self): 29 | super(TileAsProp, self).__init__(need_top_grad=False) 30 | 31 | def list_arguments(self): 32 | return ['data_content', 'data_shape'] 33 | 34 | def list_outputs(self): 35 | return ['data_tiled'] 36 | 37 | def infer_shape(self, in_shape): 38 | data_content_shape = in_shape[0] 39 | data_shape_shape = in_shape[1] 40 | 41 | tiled_data_shape = (data_shape_shape[0], data_content_shape[1], data_content_shape[2], data_content_shape[3]) 42 | 43 | return [data_content_shape, data_shape_shape], \ 44 | [tiled_data_shape] 45 | 46 | def create_operator(self, ctx, shapes, dtypes): 47 | return TileAsOperator() 48 | 49 | def declare_backward_dependency(self, out_grad, in_data, out_data): 50 | return out_grad 51 | -------------------------------------------------------------------------------- /manet_rfcn/symbols/__init__.py: -------------------------------------------------------------------------------- 1 | import resnet_v1_101_manet_rfcn 2 | 
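3 | # Note: network symbols are resolved by name. train_end2end.py does
4 | # `from symbols import *` and then `eval(config.symbol + '.' + config.symbol)()`,
5 | # so any new symbol definition file must be imported in this package to be found.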
-------------------------------------------------------------------------------- /manet_rfcn/test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Flow-Guided Feature Aggregation 3 | # Copyright (c) 2016 by Contributors 4 | # Copyright (c) 2017 Microsoft 5 | # Licensed under The Apache-2.0 License [see LICENSE for details] 6 | # Modified by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | import _init_paths 10 | 11 | import cv2 12 | import argparse 13 | import os 14 | import sys 15 | import time 16 | import logging 17 | from config.config import config, update_config 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser(description='Test a R-FCN network') 21 | # general 22 | parser.add_argument('--cfg', help='experiment configure file name', required=True, type=str) 23 | 24 | args, rest = parser.parse_known_args() 25 | update_config(args.cfg) 26 | 27 | # rcnn 28 | parser.add_argument('--vis', help='turn on visualization', action='store_true') 29 | parser.add_argument('--ignore_cache', help='ignore cached results boxes', action='store_true') 30 | parser.add_argument('--thresh', help='valid detection threshold', default=1e-3, type=float) 31 | parser.add_argument('--shuffle', help='shuffle data on visualization', action='store_true') 32 | args = parser.parse_args() 33 | return args 34 | 35 | args = parse_args() 36 | curr_path = os.path.abspath(os.path.dirname(__file__)) 37 | sys.path.insert(0, os.path.join(curr_path, '../external/mxnet', config.MXNET_VERSION)) 38 | 39 | import mxnet as mx 40 | from function.test_rcnn import test_rcnn 41 | from utils.create_logger import create_logger 42 | 43 | 44 | def main(): 45 | ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')] 46 | print args 47 | 48 | logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.test_image_set) 49 | 50 | test_rcnn(config, config.dataset.dataset, config.dataset.test_image_set, config.dataset.root_path, config.dataset.dataset_path, config.dataset.motion_iou_path, 51 | ctx, os.path.join(final_output_path, '..', '_'.join([iset for iset in config.dataset.image_set.split('+')]), config.TRAIN.model_prefix), config.TEST.test_epoch, 52 | args.vis, args.ignore_cache, args.shuffle, config.TEST.HAS_RPN, config.dataset.proposal, args.thresh, logger=logger, output_path=final_output_path, 53 | enable_detailed_eval=config.dataset.enable_detailed_eval) 54 | 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /manet_rfcn/train_end2end.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fully Motion-Aware Network for Video Object Detection 3 | # Licensed under The Apache-2.0 License [see LICENSE for details] 4 | # Extend FGFA by adding instance-level aggregation and motion pattern reasoning 5 | # Modified by Shiyao Wang 6 | # -------------------------------------------------------- 7 | 8 | import _init_paths 9 | 10 | import cv2 11 | import time 12 | import argparse 13 | import logging 14 | import pprint 15 | import os 16 | import sys 17 | from config.config import config, update_config 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser(description='Train R-FCN network') 21 | # general 22 | parser.add_argument('--cfg', help='experiment configure file name', required=True, 
type=str) 23 | 24 | args, rest = parser.parse_known_args() 25 | # update config 26 | update_config(args.cfg) 27 | 28 | # training 29 | parser.add_argument('--frequent', help='frequency of logging', default=config.default.frequent, type=int) 30 | args = parser.parse_args() 31 | return args 32 | 33 | args = parse_args() 34 | curr_path = os.path.abspath(os.path.dirname(__file__)) 35 | sys.path.insert(0, os.path.join(curr_path, '../external/mxnet', config.MXNET_VERSION)) 36 | 37 | import shutil 38 | import numpy as np 39 | import mxnet as mx 40 | 41 | from symbols import * 42 | from core import callback, metric 43 | from core.loader import AnchorLoader 44 | from core.module import MutableModule 45 | from utils.create_logger import create_logger 46 | from utils.load_data import load_gt_roidb, merge_roidb, filter_roidb 47 | from utils.load_model import load_param 48 | from utils.PrefetchingIter import PrefetchingIter 49 | from utils.lr_scheduler import WarmupMultiFactorScheduler 50 | 51 | def train_net(args, ctx, pretrained, pretrained_flow, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): 52 | logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) 53 | prefix = os.path.join(final_output_path, prefix) 54 | 55 | # load symbol 56 | shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) 57 | sym_instance = eval(config.symbol + '.' + config.symbol)() 58 | sym = sym_instance.get_train_symbol(config) 59 | feat_sym = sym.get_internals()['rpn_cls_score_output'] 60 | 61 | # setup multi-gpu 62 | batch_size = len(ctx) 63 | input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size 64 | 65 | # print config 66 | pprint.pprint(config) 67 | logger.info('training config:{}\n'.format(pprint.pformat(config))) 68 | 69 | # load dataset and prepare imdb for training 70 | image_sets = [iset for iset in config.dataset.image_set.split('+')] 71 | roidbs = [load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, 72 | flip=config.TRAIN.FLIP) 73 | for image_set in image_sets] 74 | roidb = merge_roidb(roidbs) 75 | roidb = filter_roidb(roidb, config) 76 | # load training data 77 | train_data = AnchorLoader(feat_sym, roidb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, 78 | feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, 79 | anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING, 80 | normalize_target=config.network.NORMALIZE_RPN, bbox_mean=config.network.ANCHOR_MEANS, 81 | bbox_std=config.network.ANCHOR_STDS) 82 | 83 | # infer max shape 84 | max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES]))), 85 | ('data_bef', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES]))), 86 | ('data_aft', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] 87 | max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) 88 | max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) 89 | max_data_shape.append(('delta_bef_gt', (config.TRAIN.BATCH_IMAGES, 100, 4))) 90 | max_data_shape.append(('delta_aft_gt', (config.TRAIN.BATCH_IMAGES, 100, 4))) 91 | max_data_shape.append(('occluded', (config.TRAIN.BATCH_IMAGES, 100, 1))) 92 | print 'providing maximum shape', max_data_shape, max_label_shape 93 | 94 | 
data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) 95 | pprint.pprint(data_shape_dict) 96 | sym_instance.infer_shape(data_shape_dict) 97 | 98 | # load and initialize params 99 | if config.TRAIN.RESUME: 100 | print('continue training from ', begin_epoch) 101 | arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) 102 | else: 103 | print config.TRAIN.USE_OCCLUSION 104 | if config.TRAIN.USE_OCCLUSION: 105 | arg_params, aux_params = load_param(pretrained, epoch, convert=True) 106 | sym_instance.init_occluded_weight(config, arg_params, aux_params) 107 | else: 108 | arg_params, aux_params = load_param(pretrained, epoch, convert=True) 109 | arg_params_flow, aux_params_flow = load_param(pretrained_flow, epoch, convert=True) 110 | arg_params.update(arg_params_flow) 111 | aux_params.update(aux_params_flow) 112 | sym_instance.init_weight(config, arg_params, aux_params) 113 | 114 | # check parameter shapes 115 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) 116 | 117 | # create solver 118 | fixed_param_prefix = config.network.FIXED_PARAMS 119 | data_names = [k[0] for k in train_data.provide_data_single] 120 | label_names = [k[0] for k in train_data.provide_label_single] 121 | 122 | mod = MutableModule(sym, data_names=data_names, label_names=label_names, 123 | logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], 124 | max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) 125 | 126 | if config.TRAIN.RESUME: 127 | mod._preload_opt_states = '%s-%04d.states'%(prefix, begin_epoch) 128 | 129 | # decide training params 130 | # metric 131 | rpn_eval_metric = metric.RPNAccMetric() 132 | rpn_cls_metric = metric.RPNLogLossMetric() 133 | rpn_bbox_metric = metric.RPNL1LossMetric() 134 | delta_metric = metric.DELTAL1LossMetric(config) 135 | eval_metric = metric.RCNNAccMetric(config) 136 | cls_metric = metric.RCNNLogLossMetric(config) 137 | bbox_metric = metric.RCNNL1LossMetric(config) 138 | if config.TRAIN.USE_OCCLUSION: 139 | occluded_metric = metric.RCNNOccludedLossMetric(config) 140 | occluded_eval_metric = metric.RCNNOccludedAccMetric(config) 141 | eval_metrics = mx.metric.CompositeEvalMetric() 142 | # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric 143 | #for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, delta_metric, eval_metric, cls_metric, bbox_metric]: 144 | if config.TRAIN.USE_OCCLUSION: 145 | for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, delta_metric, eval_metric, cls_metric, bbox_metric, occluded_metric, occluded_eval_metric]: 146 | eval_metrics.add(child_metric) 147 | else: 148 | for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, delta_metric, eval_metric, cls_metric, bbox_metric]: 149 | eval_metrics.add(child_metric) 150 | # callback 151 | batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=args.frequent) 152 | means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) 153 | stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) 154 | epoch_end_callback = [mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), callback.do_checkpoint(prefix, means, stds)] 155 | # decide learning rate 156 | base_lr = lr 157 | lr_factor = config.TRAIN.lr_factor 158 | lr_epoch = [float(epoch) for 
epoch in lr_step.split(',')] 159 | lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] 160 | lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) 161 | lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] 162 | print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) 163 | lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step) 164 | # optimizer 165 | optimizer_params = {'momentum': config.TRAIN.momentum, 166 | 'wd': config.TRAIN.wd, 167 | 'learning_rate': lr, 168 | 'lr_scheduler': lr_scheduler, 169 | 'rescale_grad': 1.0, 170 | 'clip_gradient': None} 171 | 172 | if not isinstance(train_data, PrefetchingIter): 173 | train_data = PrefetchingIter(train_data) 174 | 175 | # train 176 | mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, 177 | batch_end_callback=batch_end_callback, kvstore=config.default.kvstore, 178 | optimizer='sgd', optimizer_params=optimizer_params, 179 | arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) 180 | 181 | 182 | def main(): 183 | print('Called with argument:', args) 184 | ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')] 185 | train_net(args, ctx, config.network.pretrained, config.network.pretrained_flow, config.network.pretrained_epoch, config.TRAIN.model_prefix, 186 | config.TRAIN.begin_epoch, config.TRAIN.end_epoch, config.TRAIN.lr, config.TRAIN.lr_step) 187 | 188 | if __name__ == '__main__': 189 | main() 190 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | python -u experiments/manet_rfcn/manet_rfcn_end2end_train_test.py --cfg experiments/manet_rfcn/cfgs/phase-1.yaml 2 | --------------------------------------------------------------------------------
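`run.sh` above only launches phase-1. The config directory also ships `phase-2.yaml` and `phase-3.yaml`, so a plausible way to run the full three-phase training is to invoke the same entry script once per config. This is a sketch under the assumption that each `phase-N.yaml` already points at the checkpoints produced by the previous phase; verify those paths before use:

```bash
#!/usr/bin/env bash
# Sketch: run the three MANet training phases back to back.
# Assumes each phase-N.yaml encodes its own pretrained-model / begin-epoch settings.
set -e
for phase in phase-1 phase-2 phase-3; do
    python -u experiments/manet_rfcn/manet_rfcn_end2end_train_test.py \
        --cfg experiments/manet_rfcn/cfgs/${phase}.yaml
done
```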