├── .gitignore ├── LICENSE ├── README.md ├── assets └── teaser.png ├── configs ├── BaseRetina.yaml ├── coco │ ├── querydet_test.yaml │ ├── querydet_train.yaml │ ├── retinanet_test.yaml │ └── retinanet_train.yaml ├── custom_config.py └── visdrone │ ├── querydet_test.yaml │ ├── querydet_train.yaml │ ├── retinanet_test.yaml │ └── retinanet_train.yaml ├── eval_visdrone.sh ├── infer_coco.py ├── infer_visdrone.py ├── models ├── querydet │ ├── __pycache__ │ │ ├── det_head.cpython-36.pyc │ │ ├── det_head.cpython-37.pyc │ │ ├── detector.cpython-36.pyc │ │ ├── detector.cpython-37.pyc │ │ ├── qinfer.cpython-36.pyc │ │ └── qinfer.cpython-37.pyc │ ├── det_head.py │ ├── detector.py │ └── qinfer.py └── retinanet │ ├── __pycache__ │ ├── retinanet.cpython-36.pyc │ └── retinanet.cpython-37.pyc │ └── retinanet.py ├── train_coco.py ├── train_tools ├── coco_infer.py ├── coco_train.py ├── visdrone_infer.py └── visdrone_train.py ├── train_visdrone.py ├── utils ├── anchor_gen.py ├── coco_eval_fpn.py ├── gradient_checkpoint.py ├── json_evaluator.py ├── loop_matcher.py ├── merged_sync_bn.py ├── soft_nms.py ├── time_evaluator.py ├── utils.py └── val_mapper_with_ann.py ├── visdrone ├── data_prepare.py ├── dataloader.py ├── json_to_txt.py ├── mapper.py └── utils.py └── visdrone_eval ├── LICENSE ├── README.md ├── evaluate.py ├── requirements.txt ├── setup.py └── viseval ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc ├── bbox_overlaps.cpython-37.pyc ├── calc_accuracy.cpython-37.pyc ├── drop_objects_in_igr.cpython-37.pyc └── eval_det.cpython-37.pyc ├── bbox_overlaps.py ├── calc_accuracy.py ├── drop_objects_in_igr.py └── eval_det.py /.gitignore: -------------------------------------------------------------------------------- 1 | work_dirs 2 | work_dirs/* 3 | data 4 | data/* 5 | */__pycache__ 6 | */__pycache__/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Chenhongyi Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QueryDet-PyTorch 2 | 3 |

4 | 5 |

6 | 7 | This repository is the official implementation of our paper: [QueryDet: Cascaded Sparse Query for Accelerating High-Resolution Small Object Detection, *Chenhongyi Yang*, *Zehao Huang*, *Naiyan Wang*. CVPR 2022](https://arxiv.org/abs/2103.09136) 8 | 9 | 10 | 11 | ## IMPORTANT UPDATE !!! 12 | 13 | We have updated the QueryDet repository to make it easier to use. Specifically: 14 | 15 | - QueryDet now supports newer versions of PyTorch and Detectron2. 16 | - You do not need APEX anymore. FP16 training is currently achieved through PyTorch AMP. 17 | - QueryDet now supports Spconv 2.1, which can be directly installed using pip. 18 | - We have improved the support for the VisDrone dataset. 19 | - We have re-organized the model configs to make them easier to use. 20 | 21 | 22 | 23 | ## Setting: 24 | 25 | ### Environment setting: 26 | 27 | We tested the new QueryDet with CUDA 10.2 using NVIDIA 2080Ti GPUs. We provide a sample setup script as follows: 28 | 29 | ```shell 30 | conda create -n querydet python=3.7 -y 31 | source activate querydet 32 | pip install torch==1.8.1+cu102 torchvision==0.9.1+cu102 -f https://download.pytorch.org/whl/torch_stable.html 33 | python -m pip install detectron2==0.4 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.8/index.html 34 | pip install spconv-cu102==2.1.25 35 | 36 | # Clone our repository and have fun with it! 37 | git clone https://github.com/ChenhongyiYang/QueryDet-PyTorch.git 38 | 39 | # OPTIONAL: Install the Python evaluation tool for VisDrone 40 | # Reference: https://github.com/tjiiv-cprg/visdrone-det-toolkit-python 41 | cd visdrone_eval 42 | pip install -e . 43 | 44 | # OPTIONAL: Install detectron2_backbone if you want to use backbone networks like MobileNet 45 | # Reference: https://github.com/sxhxliang/detectron2_backbone 46 | git clone https://github.com/sxhxliang/detectron2_backbone.git 47 | cd detectron2_backbone 48 | python setup.py build develop 49 | ``` 50 | 51 | ### COCO setting: 52 | 53 | You need to set up COCO following the [official tutorial](https://detectron2.readthedocs.io/en/latest/tutorials/builtin_datasets.html) of Detectron2. 54 | 55 | ### VisDrone setting: 56 | 57 | We provide full support for the VisDrone dataset. 58 | 59 | - You need to download the VisDrone dataset from its [official website](http://aiskyeye.com/). 60 | - Unzip and place the downloaded dataset as follows: 61 | 62 | ``` 63 | QueryDet-PyTorch 64 | |-- data 65 | |-- visdrone 66 | |-- VisDrone2019-DET-train 67 | | |-- images 68 | | | |-- ...jpg # 6471 .jpg files 69 | | |-- annotations 70 | | |-- ...txt # 6471 .txt files 71 | |-- VisDrone2019-DET-val 72 | |-- images 73 | | |-- ...jpg # 548 .jpg files 74 | |-- annotations 75 | |-- ...txt # 548 .txt files 76 | ``` 77 | 78 | - Pre-process the dataset by running: `python visdrone/data_prepare.py --visdrone-root data/visdrone`. 
79 | - The resulting file structure will be as follows: 80 | 81 | ``` 82 | QueryDet-PyTorch 83 | |-- data 84 | |-- visdrone 85 | |-- VisDrone2019-DET-train 86 | | |-- images 87 | | | |-- ...jpg # 6471 .jpg files 88 | | |-- annotations 89 | | |-- ...txt # 6471 .txt files 90 | |-- VisDrone2019-DET-val 91 | | |-- images 92 | | | |-- ...jpg # 548 .jpg files 93 | | |-- annotations 94 | | |-- ...txt # 548 .txt files 95 | |-- coco_format 96 | |-- train_images 97 | | |-- ...jpg # 25884 .jpg files 98 | |-- val_images 99 | | |-- ...jpg # 548 .jpg files 100 | |-- annotations 101 | |-- train_label.json 102 | |-- val_label.json 103 | ``` 104 | 105 | - After model training, you can evaluate your model by running `bash eval_visdrone.sh /path/to/visdrone_infer.json`. 106 | 107 | 108 | 109 | ## Usage 110 | 111 | Before training, we recommend creating a `work_dirs` directory under `QueryDet-PyTorch` to store all training results, as follows: 112 | 113 | ``` 114 | QueryDet-PyTorch 115 | |-- work_dirs 116 | |-- ... # other stuff 117 | ``` 118 | 119 | If you want to store your training results in another place, you can run `ln -s /path/to/your/storage work_dirs` to create a symbolic link. 120 | 121 | In the following, we will assume you have created such a directory and introduce the training, testing, and evaluation commands. 122 | 123 | ### Training 124 | 125 | ```shell 126 | # train COCO RetinaNet baseline 127 | python train_coco.py --config-file configs/coco/retinanet_train.yaml --num-gpu 8 OUTPUT_DIR work_dirs/coco_retinanet 128 | 129 | # train COCO QueryDet 130 | python train_coco.py --config-file configs/coco/querydet_train.yaml --num-gpu 8 OUTPUT_DIR work_dirs/coco_querydet 131 | 132 | # train VisDrone RetinaNet baseline 133 | python train_visdrone.py --config-file configs/visdrone/retinanet_train.yaml --num-gpu 8 OUTPUT_DIR work_dirs/visdrone_retinanet 134 | 135 | # train VisDrone QueryDet 136 | python train_visdrone.py --config-file configs/visdrone/querydet_train.yaml --num-gpu 8 OUTPUT_DIR work_dirs/visdrone_querydet 137 | ``` 138 | 139 | ### Testing 140 | 141 | ```shell 142 | # test COCO RetinaNet baseline 143 | python infer_coco.py --config-file configs/coco/retinanet_test.yaml --num-gpu 8 --eval-only MODEL.WEIGHTS work_dirs/coco_retinanet/model_final.pth OUTPUT_DIR work_dirs/model_test 144 | 145 | # test COCO QueryDet with Dense Inference 146 | python infer_coco.py --config-file configs/coco/querydet_test.yaml --num-gpu 8 --eval-only MODEL.WEIGHTS work_dirs/coco_querydet/model_final.pth OUTPUT_DIR work_dirs/model_test 147 | 148 | # test COCO QueryDet with CSQ 149 | export SPCONV_FILTER_HWIO="1"; python infer_coco.py --config-file configs/coco/querydet_test.yaml --num-gpu 8 --eval-only MODEL.WEIGHTS work_dirs/coco_querydet/model_final.pth OUTPUT_DIR work_dirs/model_test MODEL.QUERY.QUERY_INFER True 150 | 151 | # test VisDrone RetinaNet baseline 152 | python infer_visdrone.py --config-file configs/visdrone/retinanet_test.yaml --num-gpu 8 --eval-only MODEL.WEIGHTS work_dirs/visdrone_retinanet/model_final.pth OUTPUT_DIR work_dirs/model_test 153 | 154 | # test VisDrone QueryDet with Dense Inference 155 | python infer_visdrone.py --config-file configs/visdrone/querydet_test.yaml --num-gpu 8 --eval-only MODEL.WEIGHTS work_dirs/visdrone_querydet/model_final.pth OUTPUT_DIR work_dirs/model_test 156 | 157 | # test VisDrone QueryDet with CSQ 158 | export SPCONV_FILTER_HWIO="1"; python infer_visdrone.py --config-file configs/visdrone/querydet_test.yaml --num-gpu 8 --eval-only MODEL.WEIGHTS 
work_dirs/visdrone_querydet/model_final.pth OUTPUT_DIR work_dirs/model_test MODEL.QUERY.QUERY_INFER True 159 | ``` 160 | 161 | ### Evaluation 162 | 163 | - For COCO, Detectron2 will automatically evaluate the result when you run the inference command so you do not need to run any extra command. 164 | - For VisDrone, after running an inference command, you will get a result file named `visdrone_infer.json` in your resulting directory (e.g., `work_dirs/model_test` in the above commands). Then you have two options to evaluate the result: 165 | - If you have installed the Python evaluation tool, then you can evaluate your result by running `bash eval_visdrone.sh work_dirs/model_test/visdrone_infer.json` 166 | - If you want to use the official Matlab evaluation tool, you can run `python visdrone/json_to_txt.py --out /path/to/result --gt-json data/visdrone/coco_format/annotations/val_label.json --det-json work_dirs/model_test/visdrone_infer.json` to convert the result to .txt files for Matlab evaluation. 167 | 168 | 169 | 170 | ## Citation 171 | ``` 172 | @InProceedings{Yang_2022_CVPR_QueryDet, 173 | author = {{Yang, Chenhongyi and Huang, Zehao and Wang, Naiyan}}, 174 | title = {{QueryDet: Cascaded Sparse Query for Accelerating High-Resolution Small Object Detection}, 175 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, 176 | year = {2022} 177 | } 178 | ``` 179 | -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/assets/teaser.png -------------------------------------------------------------------------------- /configs/BaseRetina.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | NAME: "build_retinanet_resnet_fpn_backbone" 4 | RESNETS: 5 | OUT_FEATURES: ["res3", "res4", "res5"] 6 | ANCHOR_GENERATOR: 7 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"] 8 | FPN: 9 | IN_FEATURES: ["res3", "res4", "res5"] 10 | RETINANET: 11 | IOU_THRESHOLDS: [0.4, 0.5] 12 | IOU_LABELS: [0, -1, 1] 13 | DATASETS: 14 | TRAIN: ("coco_2017_train",) 15 | TEST: ("coco_2017_val",) 16 | SOLVER: 17 | IMS_PER_BATCH: 16 18 | BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate 19 | STEPS: (60000, 80000) 20 | MAX_ITER: 90000 21 | INPUT: 22 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 23 | VERSION: 2 24 | -------------------------------------------------------------------------------- /configs/coco/querydet_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: "work_dirs/model_test" 3 | 4 | MODEL: 5 | META_ARCHITECTURE: "RetinaNetQueryDet" 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | 8 | RESNETS: 9 | DEPTH: 50 10 | 11 | ANCHOR_GENERATOR: 12 | NAME: "AnchorGeneratorWithCenter" 13 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [16, 32, 64, 128, 256, 512]]"] 14 | 15 | RETINANET: 16 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6", "p7"] 17 | SCORE_THRESH_TEST: 0.0001 18 | 19 | RESNETS: 20 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 21 | 22 | FPN: 23 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 24 | 25 | QUERY: 26 | FEATURES_WHOLE_TEST: [2, 3, 4, 5] 
27 | FEATURES_VALUE_TEST: [0, 1] 28 | Q_FEATURE_TRAIN: [1, 2] 29 | Q_FEATURE_TEST: [1, 2] 30 | THRESHOLD: 0.12 31 | QUERY_INFER: False 32 | 33 | ENCODE_CENTER_DIS_COEFF: [1., 1.] 34 | ENCODE_SMALL_OBJ_SCALE: [[0, 32], [0, 64]] 35 | 36 | CUSTOM: 37 | USE_SOFT_NMS: False 38 | SOFT_NMS_METHOD: 'gaussian' 39 | SOFT_NMS_SIGMA: 0.7 40 | SOFT_NMS_THRESHOLD: 0.4 41 | SOFT_NMS_PRUND: 0.0001 42 | 43 | TEST: 44 | DETECTIONS_PER_IMAGE: 200 45 | 46 | META_INFO: 47 | EVAL_SMALL_CLS: False 48 | EVAL_GPU_TIME: True 49 | 50 | # DATASETS: 51 | # TEST: ("coco_2017_test-dev",) 52 | -------------------------------------------------------------------------------- /configs/coco/querydet_train.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: "work_dirs/coco_querydet" 3 | 4 | MODEL: 5 | META_ARCHITECTURE: "RetinaNetQueryDet" 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | 8 | ANCHOR_GENERATOR: 9 | NAME: "AnchorGeneratorWithCenter" 10 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [16, 32, 64, 128, 256, 512]]"] 11 | 12 | RETINANET: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6", "p7"] 14 | 15 | RESNETS: 16 | DEPTH: 50 17 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 18 | 19 | FPN: 20 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 21 | 22 | QUERY: 23 | Q_FEATURE_TRAIN: [1, 2] 24 | FEATURES_WHOLE_TEST: [2, 3, 4, 5] 25 | FEATURES_VALUE_TEST: [0, 1] 26 | Q_FEATURE_TEST: [1, 2] 27 | 28 | QUERY_LOSS_WEIGHT: [10., 10.] 29 | QUERY_LOSS_GAMMA: [1.2, 1.2] 30 | 31 | ENCODE_CENTER_DIS_COEFF: [1., 1.] 32 | ENCODE_SMALL_OBJ_SCALE: [[0, 32], [0, 64]] 33 | 34 | QUERY_INFER: False 35 | 36 | CUSTOM: 37 | CLEAR_CUDA_CACHE: True 38 | USE_LOOP_MATCHER: True 39 | FOCAL_LOSS_ALPHAS: [0.25, 0.25, 0.25, 0.25, 0.25, 0.25] 40 | FOCAL_LOSS_GAMMAS: [2.0, 2.0, 2.0, 2.0, 2.0, 2.0] 41 | CLS_WEIGHTS: [1.0, 1.4, 2.1, 2.5, 2.9, 3.2] 42 | REG_WEIGHTS: [1.0, 1.4, 2.1, 2.5, 2.9, 3.2] 43 | 44 | SOLVER: 45 | # 3x 46 | # STEPS: (210000, 250000) 47 | # MAX_ITER: 270000 48 | 49 | # 1x 50 | BASE_LR: 0.01 51 | STEPS: (60000, 80000) 52 | MAX_ITER: 90000 53 | IMS_PER_BATCH: 16 54 | AMP: 55 | ENABLED: True 56 | 57 | 58 | TEST: 59 | EVAL_PERIOD: 0 60 | DETECTIONS_PER_IMAGE: 200 61 | 62 | META_INFO: 63 | EVAL_GPU_TIME: False 64 | EVAL_AP: True 65 | 66 | VIS_PERIOD: 0 -------------------------------------------------------------------------------- /configs/coco/retinanet_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: "work_dirs/model_test" 3 | MODEL: 4 | META_ARCHITECTURE: "RetinaNet_D2" 5 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 6 | RESNETS: 7 | DEPTH: 50 8 | 9 | ANCHOR_GENERATOR: 10 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [32, 64, 128, 256, 512]]"] 11 | 12 | RETINANET: 13 | IN_FEATURES: ["p3", "p4", "p5", "p6", "p7"] 14 | 15 | SOLVER: 16 | # 3x 17 | # STEPS: (210000, 250000) 18 | # MAX_ITER: 270000 19 | 20 | # 1x 21 | STEPS: (60000, 80000) 22 | MAX_ITER: 90000 23 | CLIP_GRADIENTS: 24 | ENABLED: False 25 | 26 | META_INFO: 27 | EVAL_GPU_TIME: True 28 | 29 | 30 | TEST: 31 | EVAL_PERIOD: 5000 32 | DETECTIONS_PER_IMAGE: 200 33 | -------------------------------------------------------------------------------- /configs/coco/retinanet_train.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: 
""work_dirs/coco_retinanet" 3 | MODEL: 4 | META_ARCHITECTURE: "RetinaNet_D2" 5 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 6 | RESNETS: 7 | DEPTH: 50 8 | 9 | ANCHOR_GENERATOR: 10 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [32, 64, 128, 256, 512]]"] 11 | 12 | RETINANET: 13 | IN_FEATURES: ["p3", "p4", "p5", "p6", "p7"] 14 | 15 | CUSTOM: 16 | CLS_WEIGHTS: [1., 1., 1., 1., 1.] 17 | REG_WEIGHTS: [1., 1., 1., 1., 1.] 18 | FOCAL_LOSS_ALPHAS: [0.25, 0.25, 0.25, 0.25, 0.25, 0.25] 19 | FOCAL_LOSS_GAMMAS: [2.0, 2.0, 2.0, 2.0, 2.0, 2.0] 20 | 21 | SOLVER: 22 | # 3x 23 | # STEPS: (210000, 250000) 24 | # MAX_ITER: 270000 25 | 26 | # 1x 27 | STEPS: (60000, 80000) 28 | MAX_ITER: 90000 29 | IMS_PER_BATCH: 16 30 | BASE_LR: 0.02 31 | AMP: 32 | ENABLED: True 33 | 34 | TEST: 35 | EVAL_PERIOD: 0 36 | DETECTIONS_PER_IMAGE: 200 37 | 38 | VIS_PERIOD: 0 -------------------------------------------------------------------------------- /configs/custom_config.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import CfgNode as CN 2 | 3 | INF = 1e8 4 | 5 | def add_custom_config(cfg): 6 | cfg.MODEL.FPN.TOP_LEVELS = 2 7 | 8 | #---------------------------------------------------------------------------------------------- 9 | # CUSTOM 10 | #---------------------------------------------------------------------------------------------- 11 | cfg.MODEL.CUSTOM = CN() 12 | 13 | cfg.MODEL.CUSTOM.FOCAL_LOSS_GAMMAS = [] 14 | cfg.MODEL.CUSTOM.FOCAL_LOSS_ALPHAS = [] 15 | 16 | cfg.MODEL.CUSTOM.CLS_WEIGHTS = [] 17 | cfg.MODEL.CUSTOM.REG_WEIGHTS = [] 18 | 19 | cfg.MODEL.CUSTOM.USE_LOOP_MATCHER = False 20 | cfg.MODEL.CUSTOM.GRADIENT_CHECKPOINT = False 21 | cfg.MODEL.CUSTOM.CLEAR_CUDA_CACHE = False 22 | 23 | # soft nms 24 | cfg.MODEL.CUSTOM.USE_SOFT_NMS = False 25 | cfg.MODEL.CUSTOM.GIOU_LOSS = False 26 | cfg.MODEL.CUSTOM.SOFT_NMS_METHOD = 'linear' # gaussian 27 | cfg.MODEL.CUSTOM.SOFT_NMS_SIGMA = 0.5 28 | cfg.MODEL.CUSTOM.SOFT_NMS_THRESHOLD = 0.5 29 | cfg.MODEL.CUSTOM.SOFT_NMS_PRUND = 0.001 30 | 31 | cfg.MODEL.CUSTOM.HEAD_BN = False 32 | 33 | #---------------------------------------------------------------------------------------------- 34 | # QUERY 35 | #---------------------------------------------------------------------------------------------- 36 | cfg.MODEL.QUERY = CN() 37 | 38 | cfg.MODEL.QUERY.FEATURES_WHOLE_TRAIN = [2, 3, 4, 5] 39 | cfg.MODEL.QUERY.FEATURES_VALUE_TRAIN = [0, 1] 40 | cfg.MODEL.QUERY.Q_FEATURE_TRAIN = [2] 41 | 42 | cfg.MODEL.QUERY.FEATURES_WHOLE_TEST = [2, 3, 4, 5] 43 | cfg.MODEL.QUERY.FEATURES_VALUE_TEST = [0, 1] 44 | cfg.MODEL.QUERY.Q_FEATURE_TEST = [2] 45 | 46 | cfg.MODEL.QUERY.QUERY_LOSS_WEIGHT = [] 47 | cfg.MODEL.QUERY.QUERY_LOSS_GAMMA = [] 48 | 49 | cfg.MODEL.QUERY.ENCODE_CENTER_DIS_COEFF = [1.] 
50 | cfg.MODEL.QUERY.ENCODE_SMALL_OBJ_SCALE = [] 51 | 52 | cfg.MODEL.QUERY.THRESHOLD = 0.12 53 | cfg.MODEL.QUERY.CONTEXT = 2 54 | 55 | cfg.MODEL.QUERY.QUERY_INFER = False 56 | 57 | 58 | #---------------------------------------------------------------------------------------------- 59 | # Meta Info 60 | #---------------------------------------------------------------------------------------------- 61 | cfg.META_INFO = CN() 62 | 63 | cfg.META_INFO.VIS_ROOT = '' 64 | cfg.META_INFO.EVAL_GPU_TIME = False 65 | cfg.META_INFO.EVAL_AP = True 66 | 67 | #---------------------------------------------------------------------------------------------- 68 | # VisDrone2018 69 | #---------------------------------------------------------------------------------------------- 70 | cfg.VISDRONE = CN() 71 | 72 | cfg.VISDRONE.TRAIN_JSON = 'data/visdrone/coco_format/annotations/train_label.json' 73 | cfg.VISDRONE.TRING_IMG_ROOT = 'data//visdrone/coco_format/train_images' 74 | 75 | cfg.VISDRONE.TEST_JSON = 'data/visdrone/coco_format/annotations/val_label.json' 76 | cfg.VISDRONE.TEST_IMG_ROOT = 'data/visdrone/coco_format/val_images' 77 | 78 | cfg.VISDRONE.SHORT_LENGTH = [1200] 79 | cfg.VISDRONE.MAX_LENGTH = 1999 80 | 81 | cfg.VISDRONE.TEST_LENGTH = 3999 82 | 83 | -------------------------------------------------------------------------------- /configs/visdrone/querydet_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: "work_dirs/model_test" 3 | 4 | MODEL: 5 | META_ARCHITECTURE: "RetinaNetQueryDet" 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | 8 | RESNETS: 9 | DEPTH: 50 10 | 11 | ANCHOR_GENERATOR: 12 | NAME: "AnchorGeneratorWithCenter" 13 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [16, 32, 64, 128, 256, 512]]"] 14 | 15 | RETINANET: 16 | IOU_THRESHOLDS: [0.4, 0.5] 17 | IOU_LABELS: [0, -1, 1] 18 | NUM_CLASSES: 10 19 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6", "p7"] 20 | SCORE_THRESH_TEST: 0.0001 21 | 22 | RESNETS: 23 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 24 | 25 | FPN: 26 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 27 | 28 | QUERY: 29 | FEATURES_WHOLE_TEST: [2, 3, 4, 5] 30 | FEATURES_VALUE_TEST: [0, 1] 31 | Q_FEATURE_TRAIN: [1, 2] 32 | Q_FEATURE_TEST: [1, 2] 33 | 34 | ENCODE_CENTER_DIS_COEFF: [1., 1.] 
35 | ENCODE_SMALL_OBJ_SCALE: [[0, 32], [0, 64]] 36 | 37 | THRESHOLD: 0.12 38 | QUERY_INFER: False 39 | 40 | CUSTOM: 41 | USE_SOFT_NMS: False 42 | SOFT_NMS_METHOD: 'gaussian' 43 | SOFT_NMS_SIGMA: 0.6 44 | SOFT_NMS_THRESHOLD: 0.4 45 | SOFT_NMS_PRUND: 0.0001 46 | 47 | VISDRONE: 48 | TEST_LENGTH: 3999 49 | 50 | TEST: 51 | DETECTIONS_PER_IMAGE: 500 52 | 53 | META_INFO: 54 | EVAL_GPU_TIME: True 55 | -------------------------------------------------------------------------------- /configs/visdrone/querydet_train.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: "work_dirs/visdrone_querydet" 3 | 4 | MODEL: 5 | META_ARCHITECTURE: "RetinaNetQueryDet" 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | 8 | RESNETS: 9 | DEPTH: 50 10 | 11 | ANCHOR_GENERATOR: 12 | NAME: "AnchorGeneratorWithCenter" 13 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [16, 32, 64, 128, 256, 512]]"] 14 | 15 | RETINANET: 16 | IOU_THRESHOLDS: [0.4, 0.5] 17 | IOU_LABELS: [0, -1, 1] 18 | NUM_CLASSES: 10 19 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6", "p7"] 20 | 21 | RESNETS: 22 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 23 | 24 | FPN: 25 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 26 | 27 | QUERY: 28 | Q_FEATURE_TRAIN: [1, 2] 29 | FEATURES_WHOLE_TEST: [2, 3, 4, 5] 30 | FEATURES_VALUE_TEST: [0, 1] 31 | Q_FEATURE_TEST: [1, 2] 32 | 33 | QUERY_LOSS_WEIGHT: [10., 10.] 34 | QUERY_LOSS_GAMMA: [1.3, 1.3] 35 | 36 | ENCODE_CENTER_DIS_COEFF: [1., 1.] 37 | ENCODE_SMALL_OBJ_SCALE: [[0, 32], [0, 64]] 38 | 39 | QUERY_INFER: False 40 | 41 | CUSTOM: 42 | GRADIENT_CHECKPOINT: False 43 | USE_LOOP_MATCHER: True 44 | FOCAL_LOSS_ALPHAS: [0.25, 0.25, 0.25, 0.25, 0.25, 0.25] 45 | FOCAL_LOSS_GAMMAS: [2.0, 2.0, 2.0, 2.0, 2.0, 2.0] 46 | CLS_WEIGHTS: [1.0, 1.4, 1.8, 2.2, 2.6, 2.6] 47 | REG_WEIGHTS: [1.0, 1.4, 1.8, 2.2, 2.6, 2.6] 48 | 49 | 50 | SOLVER: 51 | BASE_LR: 0.01 52 | STEPS: (30000, 40000) 53 | MAX_ITER: 50000 54 | IMS_PER_BATCH: 8 55 | AMP: 56 | ENABLED: True 57 | CLIP_GRADIENTS: 58 | ENABLED: True 59 | CLIP_TYPE: value 60 | CLIP_VALUE: 35.0 61 | NORM_TYPE: 2.0 62 | 63 | VISDRONE: 64 | SHORT_LENGTH: [1200] 65 | MAX_LENGTH: 1999 66 | 67 | TEST: 68 | EVAL_PERIOD: 0 69 | DETECTIONS_PER_IMAGE: 500 70 | 71 | META_INFO: 72 | EVAL_GPU_TIME: True 73 | 74 | VIS_PERIOD: 0 -------------------------------------------------------------------------------- /configs/visdrone/retinanet_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: "work_dirs/model_test" 3 | 4 | MODEL: 5 | META_ARCHITECTURE: "RetinaNet_D2" 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | RESNETS: 8 | DEPTH: 50 9 | 10 | ANCHOR_GENERATOR: 11 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [32, 64, 128, 256, 512]]"] 12 | 13 | RESNETS: 14 | OUT_FEATURES: ["res3", "res4", "res5"] 15 | 16 | FPN: 17 | IN_FEATURES: ["res3", "res4", "res5"] 18 | 19 | RETINANET: 20 | IOU_THRESHOLDS: [0.4, 0.5] 21 | IOU_LABELS: [0, -1, 1] 22 | NUM_CLASSES: 10 23 | IN_FEATURES: ["p3", "p4", "p5", "p6", "p7"] 24 | SCORE_THRESH_TEST: 0.005 25 | 26 | META_INFO: 27 | EVAL_GPU_TIME: True 28 | 29 | 30 | TEST: 31 | DETECTIONS_PER_IMAGE: 500 32 | -------------------------------------------------------------------------------- /configs/visdrone/retinanet_train.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE_: "../BaseRetina.yaml" 2 | OUTPUT_DIR: "work_dirs/visdrone_retinanet" 3 | 4 | MODEL: 5 | META_ARCHITECTURE: "RetinaNet_D2" 6 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 7 | RESNETS: 8 | DEPTH: 50 9 | 10 | ANCHOR_GENERATOR: 11 | SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3)] for x in [32, 64, 128, 256, 512]]"] 12 | 13 | RETINANET: 14 | IOU_THRESHOLDS: [0.4, 0.5] 15 | IOU_LABELS: [0, -1, 1] 16 | NUM_CLASSES: 10 17 | IN_FEATURES: ["p3", "p4", "p5", "p6", "p7"] 18 | 19 | CUSTOM: 20 | FOCAL_LOSS_ALPHAS: [0.25, 0.25, 0.25, 0.25, 0.25] 21 | FOCAL_LOSS_GAMMAS: [2.0, 2.0, 2.0, 2.0, 2.0] 22 | CLS_WEIGHTS: [1., 1., 1., 1., 1.] 23 | REG_WEIGHTS: [1., 1., 1., 1., 1.] 24 | 25 | SOLVER: 26 | BASE_LR: 0.01 27 | STEPS: (30000, 40000) 28 | MAX_ITER: 50000 29 | IMS_PER_BATCH: 8 30 | AMP: 31 | ENABLED: True 32 | 33 | TEST: 34 | EVAL_PERIOD: 0 35 | DETECTIONS_PER_IMAGE: 500 36 | 37 | VIS_PERIOD: 0 -------------------------------------------------------------------------------- /eval_visdrone.sh: -------------------------------------------------------------------------------- 1 | DetJSON=$1 2 | 3 | python visdrone/json_to_txt.py --out .visdrone_det_txt --gt-json data/visdrone/coco_format/annotations/val_label.json --det-json $DetJSON 4 | python visdrone_eval/evaluate.py --dataset-dir data/visdrone/VisDrone2019-DET-val --res-dir .visdrone_det_txt 5 | rm -rf .visdrone_det_txt -------------------------------------------------------------------------------- /infer_coco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from detectron2.engine import launch 4 | from train_tools.coco_infer import default_argument_parser, start_train 5 | 6 | from models.retinanet.retinanet import RetinaNet_D2 7 | from models.querydet.detector import RetinaNetQueryDet 8 | 9 | 10 | 11 | if __name__ == '__main__': 12 | args = default_argument_parser().parse_args() 13 | print("Command Line Args:", args) 14 | launch( 15 | start_train, 16 | args.num_gpus, 17 | num_machines=args.num_machines, 18 | machine_rank=args.machine_rank, 19 | dist_url=args.dist_url, 20 | args=(args,), 21 | ) 22 | -------------------------------------------------------------------------------- /infer_visdrone.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from detectron2.engine import launch 4 | from train_tools.visdrone_infer import default_argument_parser, start_train 5 | 6 | import logging 7 | 8 | from models.retinanet.retinanet import RetinaNet_D2 9 | from models.querydet.detector import RetinaNetQueryDet 10 | 11 | 12 | 13 | if __name__ == '__main__': 14 | args = default_argument_parser().parse_args() 15 | print("Command Line Args:", args) 16 | launch( 17 | start_train, 18 | args.num_gpus, 19 | num_machines=args.num_machines, 20 | machine_rank=args.machine_rank, 21 | dist_url=args.dist_url, 22 | args=(args,), 23 | ) -------------------------------------------------------------------------------- /models/querydet/__pycache__/det_head.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/querydet/__pycache__/det_head.cpython-36.pyc -------------------------------------------------------------------------------- /models/querydet/__pycache__/det_head.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/querydet/__pycache__/det_head.cpython-37.pyc -------------------------------------------------------------------------------- /models/querydet/__pycache__/detector.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/querydet/__pycache__/detector.cpython-36.pyc -------------------------------------------------------------------------------- /models/querydet/__pycache__/detector.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/querydet/__pycache__/detector.cpython-37.pyc -------------------------------------------------------------------------------- /models/querydet/__pycache__/qinfer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/querydet/__pycache__/qinfer.cpython-36.pyc -------------------------------------------------------------------------------- /models/querydet/__pycache__/qinfer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/querydet/__pycache__/qinfer.cpython-37.pyc -------------------------------------------------------------------------------- /models/querydet/det_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import logging 3 | import math 4 | import numpy as np 5 | from typing import List 6 | import torch 7 | from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss 8 | from torch import nn 9 | import torch.nn.functional as F 10 | 11 | from detectron2.layers import ShapeSpec, batched_nms, cat, Conv2d, get_norm 12 | from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou 13 | from detectron2.utils.events import get_event_storage 14 | from detectron2.utils.logger import log_first_n 15 | 16 | from detectron2.modeling.anchor_generator import build_anchor_generator 17 | from detectron2.modeling.backbone import build_backbone 18 | from detectron2.modeling.box_regression import Box2BoxTransform 19 | from detectron2.modeling.matcher import Matcher 20 | from detectron2.modeling.postprocessing import detector_postprocess 21 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 22 | 23 | from detectron2.modeling.roi_heads.roi_heads import ROIHeads 24 | from detectron2.modeling.poolers import ROIPooler 25 | 26 | 27 | class RetinaNetHead_3x3(nn.Module): 28 | def __init__(self, cfg, in_channels, conv_channels, num_convs, num_anchors): 29 | super().__init__() 30 | # fmt: off 31 | num_classes = cfg.MODEL.RETINANET.NUM_CLASSES 32 | prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB 33 | self.num_convs = num_convs 34 | # fmt: on 35 | 36 | self.cls_subnet = [] 37 | self.bbox_subnet = [] 38 | channels = in_channels 39 | for i in range(self.num_convs): 40 | cls_layer = nn.Conv2d(channels, conv_channels, kernel_size=3, stride=1, padding=1) 41 | bbox_layer = nn.Conv2d(channels, conv_channels, kernel_size=3, stride=1, padding=1) 42 | 43 | torch.nn.init.normal_(cls_layer.weight, mean=0, std=0.01) 44 | torch.nn.init.normal_(bbox_layer.weight, mean=0, std=0.01) 45 | 46 | torch.nn.init.constant_(cls_layer.bias, 0) 47 | torch.nn.init.constant_(bbox_layer.bias, 0) 48 | 49 | self.add_module('cls_layer_{}'.format(i), cls_layer) 50 | self.add_module('bbox_layer_{}'.format(i), bbox_layer) 51 | 52 | self.cls_subnet.append(cls_layer) 53 | self.bbox_subnet.append(bbox_layer) 54 | 55 | channels = conv_channels 56 | 57 | self.cls_score = nn.Conv2d(channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1) 58 | self.bbox_pred = nn.Conv2d(channels, num_anchors * 4, kernel_size=3, stride=1, padding=1) 59 | 60 | torch.nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 61 | torch.nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.01) 62 | 63 | bias_value = -(math.log((1 - prior_prob) / prior_prob)) 64 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 65 | 66 | def forward(self, features): 67 | logits = [] 68 | bbox_reg = [] 69 | 70 | for feature in features: 71 | cls_f = feature 72 | bbox_f = feature 73 | for i in range(self.num_convs): 74 | cls_f = F.relu(self.cls_subnet[i](cls_f)) 75 | bbox_f = F.relu(self.bbox_subnet[i](bbox_f)) 76 | 77 | logits.append(self.cls_score(cls_f)) 78 | bbox_reg.append(self.bbox_pred(bbox_f)) 79 | 80 | return logits, bbox_reg 81 | 82 | def get_params(self): 83 | cls_weights = [x.weight for x in self.cls_subnet] + [self.cls_score.weight.data] 84 | cls_biases = [x.bias for x in self.cls_subnet] + [self.cls_score.bias.data] 85 | 86 | bbox_weights = [x.weight for x in self.bbox_subnet] + [self.bbox_pred.weight.data] 87 | bbox_biases = [x.bias for x in self.bbox_subnet] + [self.bbox_pred.bias.data] 88 | return cls_weights, cls_biases, bbox_weights, bbox_biases 89 | 90 | 91 | class Head_3x3(nn.Module): 92 | def __init__(self, 
in_channels, conv_channels, num_convs, pred_channels, pred_prior=None): 93 | super().__init__() 94 | self.num_convs = num_convs 95 | 96 | self.subnet = [] 97 | channels = in_channels 98 | for i in range(self.num_convs): 99 | layer = nn.Conv2d(channels, conv_channels, kernel_size=3, stride=1, padding=1) 100 | torch.nn.init.xavier_normal_(layer.weight) 101 | torch.nn.init.constant_(layer.bias, 0) 102 | self.add_module('layer_{}'.format(i), layer) 103 | self.subnet.append(layer) 104 | channels = conv_channels 105 | 106 | self.pred_net = nn.Conv2d(channels, pred_channels, kernel_size=3, stride=1, padding=1) 107 | 108 | torch.nn.init.xavier_normal_(self.pred_net.weight) 109 | if pred_prior is not None: 110 | bias_value = -(math.log((1 - prior_prob) / prior_prob)) 111 | torch.nn.init.constant_(self.pred_net.bias, bias_value) 112 | else: 113 | torch.nn.init.constant_(self.pred_net.bias, 0) 114 | 115 | def forward(self, features): 116 | preds = [] 117 | for feature in features: 118 | x = feature 119 | for i in range(self.num_convs): 120 | x = F.relu(self.subnet[i](x)) 121 | preds.append(self.pred_net(x)) 122 | return preds 123 | 124 | def get_params(self): 125 | weights = [x.weight for x in self.subnet] + [self.pred_net.weight] 126 | biases = [x.bias for x in self.subnet] + [self.pred_net.bias] 127 | return weights, biases 128 | 129 | 130 | from utils.merged_sync_bn import MergedSyncBatchNorm 131 | 132 | class RetinaNetHead_3x3_MergeBN(nn.Module): 133 | def __init__(self, cfg, in_channels, conv_channels, num_convs, num_anchors): 134 | super().__init__() 135 | # fmt: off 136 | num_classes = cfg.MODEL.RETINANET.NUM_CLASSES 137 | prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB 138 | num_anchors = 1 139 | self.num_convs = num_convs 140 | self.bn_converted = False 141 | # fmt: on 142 | 143 | self.cls_subnet = [] 144 | self.bbox_subnet = [] 145 | self.cls_bns = [] 146 | self.bbox_bns = [] 147 | 148 | channels = in_channels 149 | for i in range(self.num_convs): 150 | cls_layer = Conv2d(channels, conv_channels, kernel_size=3, stride=1, padding=1, bias=False, activation=None, norm=None) 151 | bbox_layer = Conv2d(channels, conv_channels, kernel_size=3, stride=1, padding=1, bias=False, activation=None, norm=None) 152 | torch.nn.init.normal_(cls_layer.weight, mean=0, std=0.01) 153 | torch.nn.init.normal_(bbox_layer.weight, mean=0, std=0.01) 154 | 155 | cls_bn = MergedSyncBatchNorm(conv_channels) 156 | bbox_bn = MergedSyncBatchNorm(conv_channels) 157 | 158 | self.add_module('cls_layer_{}'.format(i), cls_layer) 159 | self.add_module('bbox_layer_{}'.format(i), bbox_layer) 160 | self.add_module('cls_bn_{}'.format(i), cls_bn) 161 | self.add_module('bbox_bn_{}'.format(i), bbox_bn) 162 | 163 | self.cls_subnet.append(cls_layer) 164 | self.bbox_subnet.append(bbox_layer) 165 | self.cls_bns.append(cls_bn) 166 | self.bbox_bns.append(bbox_bn) 167 | 168 | channels = conv_channels 169 | 170 | self.cls_score = nn.Conv2d(channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1) 171 | self.bbox_pred = nn.Conv2d(channels, num_anchors * 4, kernel_size=3, stride=1, padding=1) 172 | 173 | torch.nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) 174 | torch.nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.01) 175 | 176 | bias_value = -(math.log((1 - prior_prob) / prior_prob)) 177 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 178 | 179 | 180 | def forward(self, features, lvl_start): 181 | if self.training: 182 | return self._forward_train(features, lvl_start) 183 | else: 184 | return 
self._forward_eval(features, lvl_start) 185 | 186 | def _forward_train(self, features, lvl_start): 187 | cls_features = features 188 | bbox_features = features 189 | len_feats = len(features) 190 | 191 | for i in range(self.num_convs): 192 | cls_features = [self.cls_subnet[i](x) for x in cls_features] 193 | bbox_features = [self.bbox_subnet[i](x) for x in bbox_features] 194 | 195 | cls_features = self.cls_bns[i](cls_features) 196 | bbox_features = self.bbox_bns[i](bbox_features) 197 | 198 | cls_features = [F.relu(x) for x in cls_features] 199 | bbox_features = [F.relu(x) for x in bbox_features] 200 | 201 | logits = [self.cls_score(x) for x in cls_features] 202 | bbox_pred = [self.bbox_pred(x) for x in bbox_features] 203 | return logits, bbox_pred 204 | 205 | 206 | def _forward_eval(self, features, lvl_start): 207 | if not self.bn_converted: 208 | self._bn_convert() 209 | 210 | cls_features = features 211 | bbox_features = features 212 | len_feats = len(features) 213 | 214 | for i in range(self.num_convs): 215 | cls_features = [F.relu(self.cls_subnet[i](x)) for x in cls_features] 216 | bbox_features = [F.relu(self.bbox_subnet[i](x)) for x in bbox_features] 217 | 218 | logits = [self.cls_score(x) for x in cls_features] 219 | bbox_pred = [self.bbox_pred(x) for x in bbox_features] 220 | 221 | return logits, bbox_pred, centerness 222 | 223 | def _bn_convert(self): 224 | # merge BN into head weights 225 | assert not self.training 226 | if self.bn_converted: 227 | return 228 | 229 | for i in range(self.num_convs): 230 | cls_running_mean = self.cls_bns[i].running_mean.data 231 | cls_running_var = self.cls_bns[i].running_var.data 232 | cls_gamma = self.cls_bns[i].weight.data 233 | cls_beta = self.cls_bns[i].bias.data 234 | 235 | bbox_running_mean = self.bbox_bns[i].running_mean.data 236 | bbox_running_var = self.bbox_bns[i].running_var.data 237 | bbox_gamma = self.bbox_bns[i].weight.data 238 | bbox_beta = self.bbox_bns[i].bias.data 239 | 240 | cls_bn_scale = cls_gamma * torch.rsqrt(cls_running_var + 1e-10) 241 | cls_bn_bias = cls_beta - cls_bn_scale * cls_running_mean 242 | 243 | bbox_bn_scale = bbox_gamma * torch.rsqrt(bbox_running_var + 1e-10) 244 | bbox_bn_bias = bbox_beta - bbox_bn_scale * bbox_running_mean 245 | 246 | self.cls_subnet[i].weight.data = self.cls_subnet[i].weight.data * cls_bn_scale.view(-1, 1, 1, 1) 247 | self.cls_subnet[i].bias = torch.nn.Parameter(cls_bn_bias) 248 | self.bbox_subnet[i].weight.data = self.bbox_subnet[i].weight.data * bbox_bn_scale.view(-1, 1, 1, 1) 249 | self.bbox_subnet[i].bias = torch.nn.Parameter(bbox_bn_bias) 250 | 251 | self.bn_converted = True 252 | 253 | def get_params(self): 254 | if not self.bn_converted: 255 | self._bn_convert() 256 | 257 | cls_ws = [x.weight.data for x in self.cls_subnet] + [self.cls_score.weight.data] 258 | bbox_ws = [x.weight.data for x in self.bbox_subnet] + [self.bbox_pred.weight.data] 259 | 260 | cls_bs = [x.bias.data for x in self.cls_subnet] + [self.bbox_pred.weight.data] 261 | bbox_bs = [x.bias.data for x in self.bbox_subnet] + [self.bbox_pred.bias.data] 262 | 263 | return cls_ws, cls_bs, bbox_ws, bbox_bs 264 | 265 | 266 | class Head_3x3_MergeBN(nn.Module): 267 | def __init__(self, in_channels, conv_channels, num_convs, pred_channels, pred_prior=None): 268 | super().__init__() 269 | self.num_convs = num_convs 270 | self.bn_converted = False 271 | 272 | self.subnet = [] 273 | self.bns = [] 274 | 275 | channels = in_channels 276 | for i in range(self.num_convs): 277 | layer = Conv2d(channels, conv_channels, kernel_size=3, 
stride=1, padding=1, bias=False, activation=None, norm=None) 278 | torch.nn.init.normal_(layer.weight, mean=0, std=0.01) 279 | bn = MergedSyncBatchNorm(conv_channels) 280 | 281 | self.add_module('layer_{}'.format(i), layer) 282 | self.add_module('bn_{}'.format(i), bn) 283 | 284 | self.subnet.append(layer) 285 | self.bns.append(bn) 286 | 287 | channels = conv_channels 288 | 289 | self.pred_net = nn.Conv2d(channels, pred_channels, kernel_size=3, stride=1, padding=1) 290 | 291 | torch.nn.init.normal_(self.pred_net.weight, mean=0, std=0.01) 292 | if pred_prior is not None: 293 | bias_value = -(math.log((1 - prior_prob) / prior_prob)) 294 | torch.nn.init.constant_(self.pred_net.bias, bias_value) 295 | else: 296 | torch.nn.init.constant_(self.pred_net.bias, 0) 297 | 298 | def forward(self, features): 299 | if self.training: 300 | return self._forward_train(features) 301 | else: 302 | return self._forward_eval(features) 303 | 304 | def _forward_train(self, features): 305 | for i in range(self.num_convs): 306 | features = [self.subnet[i](x) for x in features] 307 | features = self.bns[i](features) 308 | features = [F.relu(x) for x in features] 309 | preds = [self.pred_net(x) for x in features] 310 | return preds 311 | 312 | def _forward_eval(self, features): 313 | if not self.bn_converted: 314 | self._bn_convert() 315 | 316 | for i in range(self.num_convs): 317 | features = [F.relu(self.subnet[i](x)) for x in features] 318 | 319 | preds = [self.pred_net(x) for x in features] 320 | return preds 321 | 322 | def _bn_convert(self): 323 | # merge BN into head weights 324 | assert not self.training 325 | if self.bn_converted: 326 | return 327 | for i in range(self.num_convs): 328 | running_mean = self.bns[i].running_mean.data 329 | running_var = self.bns[i].running_var.data 330 | gamma = self.bns[i].weight.data 331 | beta = self.bns[i].bias.data 332 | bn_scale = gamma * torch.rsqrt(running_var + 1e-10) 333 | bn_bias = beta - bn_scale * running_mean 334 | self.subnet[i].weight.data = self.subnet[i].weight.data * bn_scale.view(-1, 1, 1, 1) 335 | self.subnet[i].bias = torch.nn.Parameter(bn_bias) 336 | self.bn_converted = True 337 | 338 | def get_params(self): 339 | if not self.bn_converted: 340 | self._bn_convert() 341 | weights = [x.weight.data for x in self.subnet] + [self.pred_net.weight.data] 342 | biases = [x.bias.data for x in self.subnet] + [self.pred_net.bias.data] 343 | return weights, biases 344 | 345 | -------------------------------------------------------------------------------- /models/querydet/detector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import os 3 | import sys 4 | import time 5 | from pathlib import Path 6 | sys.path.append(os.path.abspath(Path(__file__).parent.parent)) 7 | 8 | import logging 9 | import math 10 | import numpy as np 11 | from typing import List 12 | import torch 13 | import torch.nn.functional as F 14 | from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss, sigmoid_focal_loss, giou_loss 15 | from torch import nn 16 | 17 | from detectron2.layers import ShapeSpec, batched_nms, cat 18 | from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou 19 | from detectron2.utils.events import get_event_storage 20 | from detectron2.utils.logger import log_first_n 21 | 22 | from detectron2.modeling.anchor_generator import build_anchor_generator 23 | from detectron2.modeling.backbone import build_backbone 24 | from detectron2.modeling.box_regression import Box2BoxTransform 25 | from detectron2.modeling.matcher import Matcher 26 | from detectron2.modeling.postprocessing import detector_postprocess 27 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 28 | 29 | 30 | from torch.cuda import Event 31 | ########################################################################################### 32 | from utils.utils import * 33 | from utils.loop_matcher import LoopMatcher 34 | from utils.soft_nms import SoftNMSer 35 | from utils.anchor_gen import AnchorGeneratorWithCenter 36 | from utils.gradient_checkpoint import checkpoint 37 | import models.querydet.det_head as dh 38 | import models.querydet.qinfer as qf 39 | 40 | from torch.cuda.amp import autocast 41 | 42 | __all__ = ["RetinaNetQueryDet"] 43 | 44 | 45 | def permute_to_N_HWA_K(tensor, K): 46 | assert tensor.dim() == 4, tensor.shape 47 | N, _, H, W = tensor.shape 48 | tensor = tensor.view(N, -1, K, H, W) 49 | tensor = tensor.permute(0, 3, 4, 1, 2) 50 | tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K) 51 | return tensor 52 | 53 | 54 | def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls, box_delta, num_classes=80): 55 | box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls] 56 | box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta] 57 | box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes) 58 | box_delta = cat(box_delta_flattened, dim=1).view(-1, 4) 59 | return box_cls, box_delta 60 | 61 | 62 | def permute_all_to_NHWA_K_not_concat(box_cls, box_delta, num_classes=80): 63 | box_cls_flattened = [permute_to_N_HWA_K(x, num_classes).reshape(-1, num_classes) for x in box_cls] 64 | box_delta_flattened = [permute_to_N_HWA_K(x, 4).reshape(-1, 4) for x in box_delta] 65 | return box_cls_flattened, box_delta_flattened 66 | 67 | 68 | @META_ARCH_REGISTRY.register() 69 | class RetinaNetQueryDet(nn.Module): 70 | """ 71 | Implement Our QueryDet 72 | """ 73 | def __init__(self, cfg): 74 | super().__init__() 75 | 76 | # fmt: off 77 | self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES 78 | self.in_features = cfg.MODEL.RETINANET.IN_FEATURES 79 | self.query_layer_train = cfg.MODEL.QUERY.Q_FEATURE_TRAIN 80 | self.layers_whole_test = cfg.MODEL.QUERY.FEATURES_WHOLE_TEST 81 | self.layers_value_test = cfg.MODEL.QUERY.FEATURES_VALUE_TEST 82 | self.query_layer_test = cfg.MODEL.QUERY.Q_FEATURE_TEST 83 | # Loss parameters: 84 | self.focal_loss_alpha = cfg.MODEL.CUSTOM.FOCAL_LOSS_ALPHAS 85 | self.focal_loss_gamma = cfg.MODEL.CUSTOM.FOCAL_LOSS_GAMMAS 86 | self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA 87 | self.use_giou_loss = cfg.MODEL.CUSTOM.GIOU_LOSS 88 | self.cls_weights 
= cfg.MODEL.CUSTOM.CLS_WEIGHTS 89 | self.reg_weights = cfg.MODEL.CUSTOM.REG_WEIGHTS 90 | # training query head 91 | self.small_obj_scale = cfg.MODEL.QUERY.ENCODE_SMALL_OBJ_SCALE 92 | self.query_loss_weights = cfg.MODEL.QUERY.QUERY_LOSS_WEIGHT 93 | self.query_loss_gammas = cfg.MODEL.QUERY.QUERY_LOSS_GAMMA 94 | self.small_center_dis_coeff = cfg.MODEL.QUERY.ENCODE_CENTER_DIS_COEFF 95 | # Inference parameters: 96 | self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST 97 | self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST 98 | self.use_soft_nms = cfg.MODEL.CUSTOM.USE_SOFT_NMS 99 | self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST 100 | self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE 101 | # query inference 102 | self.query_infer = cfg.MODEL.QUERY.QUERY_INFER 103 | self.query_threshold = cfg.MODEL.QUERY.THRESHOLD 104 | self.query_context = cfg.MODEL.QUERY.CONTEXT 105 | # other settings 106 | self.clear_cuda_cache = cfg.MODEL.CUSTOM.CLEAR_CUDA_CACHE 107 | self.anchor_num = len(cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS[0]) * \ 108 | len(cfg.MODEL.ANCHOR_GENERATOR.SIZES[0]) 109 | self.with_cp = cfg.MODEL.CUSTOM.GRADIENT_CHECKPOINT 110 | # fmt: on 111 | assert 'p2' in self.in_features 112 | 113 | self.backbone = build_backbone(cfg) 114 | if cfg.MODEL.CUSTOM.HEAD_BN: 115 | self.det_head = dh.RetinaNetHead_3x3_MergeBN(cfg, 256, 256, 4, self.anchor_num) 116 | self.query_head = dh.Head_3x3_MergeBN(256, 256, 4, 1) 117 | else: 118 | self.det_head = dh.RetinaNetHead_3x3(cfg, 256, 256, 4, self.anchor_num) 119 | self.query_head = dh.Head_3x3(256, 256, 4, 1) 120 | 121 | self.qInfer = qf.QueryInfer(9, self.num_classes, self.query_threshold, self.query_context) 122 | 123 | backbone_shape = self.backbone.output_shape() 124 | all_det_feature_shapes = [backbone_shape[f] for f in self.in_features] 125 | 126 | self.anchor_generator = build_anchor_generator(cfg, all_det_feature_shapes) 127 | self.query_anchor_generator = AnchorGeneratorWithCenter(sizes=[128], aspect_ratios=[1.0], 128 | strides=[2**(x+2) for x in self.query_layer_train], offset=0.5) 129 | # Matching and loss 130 | self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS) 131 | 132 | self.soft_nmser = SoftNMSer( 133 | cfg.MODEL.CUSTOM.SOFT_NMS_METHOD, 134 | cfg.MODEL.CUSTOM.SOFT_NMS_SIGMA, 135 | cfg.MODEL.CUSTOM.SOFT_NMS_THRESHOLD, 136 | cfg.MODEL.CUSTOM.SOFT_NMS_PRUND 137 | ) 138 | 139 | if cfg.MODEL.CUSTOM.USE_LOOP_MATCHER: 140 | self.matcher = LoopMatcher( 141 | cfg.MODEL.RETINANET.IOU_THRESHOLDS, 142 | cfg.MODEL.RETINANET.IOU_LABELS, 143 | allow_low_quality_matches=True, 144 | ) 145 | else: 146 | self.matcher = Matcher( 147 | cfg.MODEL.RETINANET.IOU_THRESHOLDS, 148 | cfg.MODEL.RETINANET.IOU_LABELS, 149 | allow_low_quality_matches=True, 150 | ) 151 | 152 | self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) 153 | self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) 154 | 155 | # initialize with any reasonable #fg that's not too small 156 | self.loss_normalizer = 100 157 | self.loss_normalizer_momentum = 0.9 158 | 159 | @property 160 | def device(self): 161 | return self.pixel_mean.device 162 | 163 | def forward(self, batched_inputs, just_forward=False): 164 | if self.training: 165 | return self.train_forward(batched_inputs, just_forward) 166 | else: 167 | return self.test(batched_inputs) 168 | 169 | def train_forward(self, batched_inputs, just_forward=False): 170 | if self.clear_cuda_cache: 171 | torch.cuda.empty_cache() 
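        # Remaining steps of train_forward, summarized for readability:
        # 1) read the ground-truth Instances from the batched inputs (with a fallback for the legacy "targets" key);
        # 2) normalize the images and run the backbone to get the per-level features listed in self.in_features;
        # 3) generate anchors/centers for all detection levels and centers for the query layers;
        # 4) run the detection head on all levels and the query head on the query-layer features;
        # 5) encode detection targets, compute the classification/regression losses, and add the auxiliary query loss.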
172 | 173 | if "instances" in batched_inputs[0]: 174 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 175 | elif "targets" in batched_inputs[0]: 176 | log_first_n( 177 | logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 178 | ) 179 | gt_instances = [x["targets"].to(self.device) for x in batched_inputs] 180 | else: 181 | gt_instances = None 182 | 183 | images = self.preprocess_image(batched_inputs) 184 | features = self.backbone(images.tensor) 185 | all_features = [features[f] for f in self.in_features] 186 | all_anchors, all_centers = self.anchor_generator(all_features) 187 | 188 | query_feature = [all_features[x] for x in self.query_layer_train] 189 | _, query_centers = self.query_anchor_generator(query_feature) 190 | 191 | # make prediction 192 | det_cls, det_delta = self.det_head(all_features) 193 | query_logits = self.query_head(query_feature) 194 | 195 | if just_forward: 196 | return None 197 | 198 | gt_classes, gt_reg_targets = self.get_det_gt(all_anchors, gt_instances) 199 | losses = self.det_loss(gt_classes, gt_reg_targets, det_cls, det_delta, all_anchors) 200 | 201 | # query loss 202 | gt_query = self.get_query_gt(query_centers, gt_instances) 203 | query_forgrounds = [gt.sum().item() for gt in gt_query] 204 | _query_loss = self.query_loss(gt_query, query_logits, self.query_loss_gammas, self.query_loss_weights) 205 | losses.update(_query_loss) 206 | return losses 207 | 208 | def test(self, batched_inputs): 209 | images = self.preprocess_image(batched_inputs) 210 | results, total_time = self.test_forward(images) # normal test 211 | processed_results = [] 212 | for results_per_image, input_per_image, image_size in zip( 213 | results, batched_inputs, images.image_sizes 214 | ): 215 | height = input_per_image.get("height", image_size[0]) 216 | width = input_per_image.get("width", image_size[1]) 217 | r = detector_postprocess(results_per_image, height, width) 218 | processed_results.append({"instances": r, 'time':total_time}) 219 | return processed_results 220 | 221 | def test_forward(self, images): 222 | start_event = Event(enable_timing=True) 223 | end_event = Event(enable_timing=True) 224 | 225 | start_event.record() 226 | features = self.backbone(images.tensor[:, :, :]) 227 | 228 | all_features = [features[f] for f in self.in_features] 229 | 230 | all_anchors, all_centers = self.anchor_generator(all_features) 231 | 232 | features_whole = [all_features[x] for x in self.layers_whole_test] 233 | features_value = [all_features[x] for x in self.layers_value_test] 234 | features_key = [all_features[x] for x in self.query_layer_test] 235 | 236 | anchors_whole = [all_anchors[x] for x in self.layers_whole_test] 237 | anchors_value = [all_anchors[x] for x in self.layers_value_test] 238 | 239 | det_cls_whole, det_delta_whole = self.det_head(features_whole) 240 | 241 | 242 | if not self.query_infer: 243 | det_cls_query, det_bbox_query = self.det_head(features_value) 244 | det_cls_query = [permute_to_N_HWA_K(x, self.num_classes) for x in det_cls_query] 245 | det_bbox_query = [permute_to_N_HWA_K(x, 4) for x in det_bbox_query] 246 | query_anchors = anchors_value 247 | else: 248 | if not self.qInfer.initialized: 249 | cls_weights, cls_biases, bbox_weights, bbox_biases = self.det_head.get_params() 250 | qcls_weights, qcls_bias = self.query_head.get_params() 251 | params = [cls_weights, cls_biases, bbox_weights, bbox_biases, qcls_weights, qcls_bias] 252 | else: 253 | params = None 254 | 255 | det_cls_query, det_bbox_query, query_anchors = 
self.qInfer.run_qinfer(params, features_key, features_value, anchors_value) 256 | 257 | results = self.inference(det_cls_whole, det_delta_whole, anchors_whole, 258 | det_cls_query, det_bbox_query, query_anchors, 259 | images.image_sizes) 260 | 261 | end_event.record() 262 | torch.cuda.synchronize() 263 | total_time = start_event.elapsed_time(end_event) 264 | return results, total_time 265 | 266 | # @float_function 267 | def _giou_loss(self, pred_deltas, anchors, gt_boxes): 268 | with autocast(False): 269 | pred_boxes = self.box2box_transform.apply_deltas(pred_deltas, anchors) 270 | loss = giou_loss(pred_boxes, gt_boxes, reduction='sum') 271 | return loss 272 | 273 | 274 | def det_loss(self, gt_classes, gt_anchors_targets, pred_logits, pred_deltas, all_anchors): 275 | def convert_gt_cls(logits, gt_class, f_idxs): 276 | gt_classes_target = torch.zeros_like(logits) 277 | gt_classes_target[f_idxs, gt_class[f_idxs]] = 1 278 | return gt_classes_target 279 | 280 | alphas = self.focal_loss_alpha 281 | gammas = self.focal_loss_gamma 282 | cls_weights = self.cls_weights 283 | reg_weights = self.reg_weights 284 | 285 | assert len(cls_weights) == len(pred_logits) 286 | assert len(cls_weights) == len(reg_weights) 287 | 288 | batch_size = pred_logits[0].size(0) 289 | pred_logits, pred_deltas = permute_all_to_NHWA_K_not_concat(pred_logits, pred_deltas, self.num_classes) 290 | 291 | lengths = [x.shape[0] for x in pred_logits] 292 | start_inds = [0] + [sum(lengths[:i]) for i in range(1, len(lengths))] 293 | end_inds = [sum(lengths[:i+1]) for i in range(len(lengths))] 294 | 295 | gt_classes = gt_classes.flatten() 296 | gt_anchors_targets = gt_anchors_targets.view(-1, 4) 297 | 298 | valid_idxs = gt_classes >= 0 299 | foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes) 300 | num_foreground = foreground_idxs.sum().item() 301 | get_event_storage().put_scalar("num_foreground", num_foreground) 302 | self.loss_normalizer = ( 303 | self.loss_normalizer_momentum * self.loss_normalizer 304 | + (1 - self.loss_normalizer_momentum) * num_foreground 305 | ) 306 | all_anchor_lists = [torch.cat([x.tensor.reshape(-1, 4) for _ in range(batch_size)]) for x in all_anchors] 307 | gt_clsses_list = [gt_classes[s:e] for s, e in zip(start_inds, end_inds)] 308 | gt_anchors_targets_list = [gt_anchors_targets[s:e] for s, e in zip(start_inds, end_inds)] 309 | valid_idxs_list = [valid_idxs[s:e] for s, e in zip(start_inds, end_inds)] 310 | foreground_idxs_list = [foreground_idxs[s:e] for s, e in zip(start_inds, end_inds)] 311 | 312 | loss_cls = [ 313 | w * sigmoid_focal_loss_jit( 314 | x[v], 315 | convert_gt_cls(x, g, f)[v].detach(), 316 | alpha=alpha, 317 | gamma=gamma, 318 | reduction="sum" 319 | ) 320 | for w, x, g, v, f, alpha, gamma in zip(cls_weights, pred_logits, gt_clsses_list, valid_idxs_list, foreground_idxs_list, alphas, gammas) 321 | ] 322 | 323 | if self.use_giou_loss: 324 | loss_box_reg = [ 325 | w * self._giou_loss( 326 | x[f], 327 | a[f].detach(), 328 | g[f].detach(), 329 | ) 330 | for w, x, a, g, f in zip(reg_weights, pred_deltas, all_anchor_lists, gt_anchors_targets_list, foreground_idxs_list) 331 | ] 332 | else: 333 | loss_box_reg = [ 334 | w * smooth_l1_loss( 335 | x[f], 336 | g[f].detach(), 337 | beta=self.smooth_l1_loss_beta, 338 | reduction="sum" 339 | ) 340 | for w, x, g, f in zip(reg_weights, pred_deltas, gt_anchors_targets_list, foreground_idxs_list) 341 | ] 342 | 343 | loss_cls = sum(loss_cls) / max(1., self.loss_normalizer) 344 | loss_box_reg = sum(loss_box_reg) / max(1., 
self.loss_normalizer) 345 | return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg} 346 | 347 | def query_loss(self, gt_small_obj, pred_small_obj, gammas, weights): 348 | pred_logits = [permute_to_N_HWA_K(x, 1).flatten() for x in pred_small_obj] 349 | gts = [x.flatten() for x in gt_small_obj] 350 | loss = sum([sigmoid_focal_loss_jit(x, y, alpha=0.25, gamma=g, reduction="mean") * w for (x, y, g, w) in zip(pred_logits, gts, gammas, weights)]) 351 | return {'loss_query': loss} 352 | 353 | @torch.no_grad() 354 | def get_det_gt(self, anchors, targets): 355 | gt_classes = [] 356 | gt_anchors_targets = [] 357 | anchor_layers = len(anchors) 358 | anchor_lens = [len(x) for x in anchors] 359 | start_inds = [0] + [sum(anchor_lens[:i]) for i in range(1, len(anchor_lens))] 360 | end_inds = [sum(anchor_lens[:i+1]) for i in range(len(anchor_lens))] 361 | all_anchors = Boxes.cat(anchors) # Rx4 362 | 363 | for targets_per_image in targets: 364 | 365 | if type(self.matcher) == Matcher: 366 | match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, all_anchors) 367 | gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix) 368 | del(match_quality_matrix) 369 | elif type(self.matcher) == LoopMatcher: # for encoding images with lots of gts 370 | gt_matched_idxs, anchor_labels = self.matcher(targets_per_image.gt_boxes, all_anchors) 371 | else: 372 | raise NotImplementedError 373 | 374 | has_gt = len(targets_per_image) > 0 375 | if has_gt: 376 | # ground truth box regression 377 | matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs] 378 | 379 | if not self.use_giou_loss: 380 | gt_anchors_reg_targets_i = self.box2box_transform.get_deltas( 381 | all_anchors.tensor, matched_gt_boxes.tensor 382 | ) 383 | else: 384 | gt_anchors_reg_targets_i = matched_gt_boxes.tensor 385 | 386 | gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] 387 | # Anchors with label 0 are treated as background. 388 | gt_classes_i[anchor_labels == 0] = self.num_classes 389 | # Anchors with label -1 are ignored. 
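# A short sketch of the label convention used here (standard Detectron2 Matcher
# semantics; the numbers below are illustrative, not taken from a real run):
#
#   anchor_labels:  1 -> foreground, keep the matched gt class
#                   0 -> background, encoded as class id == self.num_classes
#                  -1 -> ignored, encoded as -1 and later dropped by
#                        `valid_idxs = gt_classes >= 0` in det_loss()
#
#   e.g. with self.num_classes == 10:
#       anchor_labels = tensor([ 1,  0, -1])
#       gt_classes_i  = tensor([ 3, 10, -1])   # 3 = class of the matched gt box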
390 | gt_classes_i[anchor_labels == -1] = -1 391 | 392 | else: 393 | gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes 394 | gt_anchors_reg_targets_i = torch.zeros_like(all_anchors.tensor) 395 | 396 | gt_classes.append([gt_classes_i[s:e] for s, e in zip(start_inds, end_inds)]) 397 | gt_anchors_targets.append([gt_anchors_reg_targets_i[s:e] for s, e in zip(start_inds, end_inds)]) 398 | 399 | gt_classes = [torch.stack([x[i] for x in gt_classes]) for i in range(anchor_layers)] 400 | gt_anchors_targets = [torch.stack([x[i] for x in gt_anchors_targets]) for i in range(anchor_layers)] 401 | 402 | gt_classes = torch.cat([x.flatten() for x in gt_classes]) 403 | gt_anchors_targets = torch.cat([x.reshape(-1, 4) for x in gt_anchors_targets]) 404 | 405 | return gt_classes, gt_anchors_targets 406 | 407 | 408 | @torch.no_grad() 409 | def get_query_gt(self, small_anchor_centers, targets): 410 | small_gt_cls = [] 411 | for lind, anchor_center in enumerate(small_anchor_centers): 412 | per_layer_small_gt = [] 413 | for target_per_image in targets: 414 | target_box_scales = get_box_scales(target_per_image.gt_boxes) 415 | 416 | small_inds = (target_box_scales < self.small_obj_scale[lind][1]) & (target_box_scales >= self.small_obj_scale[lind][0]) 417 | small_boxes = target_per_image[small_inds] 418 | center_dis, minarg = get_anchor_center_min_dis(small_boxes.gt_boxes.get_centers(), anchor_center) 419 | small_obj_target = torch.zeros_like(center_dis) 420 | 421 | if len(small_boxes) != 0: 422 | min_small_target_scale = (target_box_scales[small_inds])[minarg] 423 | small_obj_target[center_dis < min_small_target_scale * self.small_center_dis_coeff[lind]] = 1 424 | 425 | per_layer_small_gt.append(small_obj_target) 426 | small_gt_cls.append(torch.stack(per_layer_small_gt)) 427 | 428 | return small_gt_cls 429 | 430 | 431 | def inference(self, 432 | retina_box_cls, retina_box_delta, retina_anchors, 433 | small_det_logits, small_det_delta, small_det_anchors, 434 | image_sizes 435 | ): 436 | results = [] 437 | 438 | N, _, _, _ = retina_box_cls[0].size() 439 | retina_box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in retina_box_cls] 440 | retina_box_delta = [permute_to_N_HWA_K(x, 4) for x in retina_box_delta] 441 | small_det_logits = [x.view(N, -1, self.num_classes) for x in small_det_logits] 442 | small_det_delta = [x.view(N, -1, 4) for x in small_det_delta] 443 | 444 | for img_idx, image_size in enumerate(image_sizes): 445 | 446 | retina_box_cls_per_image = [box_cls_per_level[img_idx] for box_cls_per_level in retina_box_cls] 447 | retina_box_reg_per_image = [box_reg_per_level[img_idx] for box_reg_per_level in retina_box_delta] 448 | small_det_logits_per_image = [small_det_cls_per_level[img_idx] for small_det_cls_per_level in small_det_logits] 449 | small_det_reg_per_image = [small_det_reg_per_level[img_idx] for small_det_reg_per_level in small_det_delta] 450 | 451 | if len(small_det_anchors) == 0 or type(small_det_anchors[0]) == torch.Tensor: 452 | small_det_anchor_per_image = [small_det_anchor_per_level[img_idx] for small_det_anchor_per_level in small_det_anchors] 453 | else: 454 | small_det_anchor_per_image = small_det_anchors 455 | 456 | results_per_img = self.inference_single_image( 457 | retina_box_cls_per_image, retina_box_reg_per_image, retina_anchors, 458 | small_det_logits_per_image, small_det_reg_per_image, small_det_anchor_per_image, 459 | tuple(image_size)) 460 | results.append(results_per_img) 461 | 462 | return results 463 | 464 | 465 | def inference_single_image(self, 466 | 
retina_box_cls, retina_box_delta, retina_anchors, 467 | small_det_logits, small_det_delta, small_det_anchors, 468 | image_size 469 | ): 470 | with autocast(False): 471 | # small pos cls inference 472 | all_cls = small_det_logits + retina_box_cls 473 | all_delta = small_det_delta + retina_box_delta 474 | all_anchors = small_det_anchors + retina_anchors 475 | 476 | boxes_all, scores_all, class_idxs_all = self.decode_dets(all_cls, all_delta, all_anchors) 477 | boxes_all, scores_all, class_idxs_all = [cat(x) for x in [boxes_all, scores_all, class_idxs_all]] 478 | 479 | if self.use_soft_nms: 480 | keep, soft_nms_scores = self.soft_nmser(boxes_all, scores_all, class_idxs_all) 481 | else: 482 | keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold) 483 | result = Instances(image_size) 484 | 485 | keep = keep[: self.max_detections_per_image] 486 | result.pred_boxes = Boxes(boxes_all[keep]) 487 | result.scores = scores_all[keep] 488 | result.pred_classes = class_idxs_all[keep] 489 | return result 490 | 491 | 492 | def preprocess_image(self, batched_inputs): 493 | images = [x["image"].to(self.device) for x in batched_inputs] 494 | images = [(x - self.pixel_mean) / self.pixel_std for x in images] 495 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 496 | return images 497 | 498 | def decode_dets(self, cls_results, reg_results, anchors): 499 | boxes_all = [] 500 | scores_all = [] 501 | class_idxs_all = [] 502 | 503 | for cls_i, reg_i, anchors_i in zip(cls_results, reg_results, anchors): 504 | cls_i = cls_i.view(-1, self.num_classes) 505 | reg_i = reg_i.view(-1, 4) 506 | 507 | cls_i = cls_i.flatten().sigmoid_() # (HxWxAxK,) 508 | num_topk = min(self.topk_candidates, reg_i.size(0)) 509 | 510 | predicted_prob, topk_idxs = cls_i.sort(descending=True) 511 | predicted_prob = predicted_prob[:num_topk] 512 | topk_idxs = topk_idxs[:num_topk] 513 | 514 | # filter out the proposals with low confidence score 515 | keep_idxs = predicted_prob > self.score_threshold 516 | predicted_prob = predicted_prob[keep_idxs] 517 | topk_idxs = topk_idxs[keep_idxs] 518 | 519 | anchor_idxs = topk_idxs // self.num_classes 520 | classes_idxs = topk_idxs % self.num_classes 521 | predicted_class = classes_idxs 522 | 523 | reg_i = reg_i[anchor_idxs] 524 | anchors_i = anchors_i[anchor_idxs] 525 | 526 | if type(anchors_i) != torch.Tensor: 527 | anchors_i = anchors_i.tensor 528 | 529 | predicted_boxes = self.box2box_transform.apply_deltas(reg_i, anchors_i) 530 | 531 | boxes_all.append(predicted_boxes) 532 | scores_all.append(predicted_prob) 533 | class_idxs_all.append(predicted_class) 534 | 535 | return boxes_all, scores_all, class_idxs_all 536 | 537 | 538 | -------------------------------------------------------------------------------- /models/querydet/qinfer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import torch 3 | import torch.nn.functional as F 4 | import spconv.pytorch as spconv 5 | 6 | 7 | def permute_to_N_HWA_K(tensor, K): 8 | assert tensor.dim() == 4, tensor.shape 9 | N, _, H, W = tensor.shape 10 | tensor = tensor.view(N, -1, K, H, W) 11 | tensor = tensor.permute(0, 3, 4, 1, 2) 12 | tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K) 13 | return tensor 14 | 15 | def run_conv2d(x, weights, bias): 16 | n_conv = len(weights) 17 | for i in range(n_conv): 18 | x = F.conv2d(x, weights[i], bias[i]) 19 | if i != n_conv - 1: 20 | x = F.relu(x) 21 | return x 22 | 23 | 24 | class QueryInfer(object): 25 | def 
__init__(self, anchor_num, num_classes, score_th=0.12, context=2): 26 | 27 | self.anchor_num = anchor_num 28 | self.num_classes = num_classes 29 | self.score_th = score_th 30 | self.context = context 31 | 32 | self.initialized = False 33 | self.cls_spconv = None 34 | self.bbox_spconv = None 35 | self.qcls_spconv = None 36 | self.qcls_conv = None 37 | self.n_conv = None 38 | 39 | 40 | def _make_sparse_tensor(self, query_logits, last_ys, last_xs, anchors, feature_value): 41 | if last_ys is None: 42 | N, _, qh, qw = query_logits.size() 43 | assert N == 1 44 | prob = torch.sigmoid_(query_logits).view(-1) 45 | pidxs = torch.where(prob > self.score_th)[0]# .float() 46 | y = torch.div(pidxs, qw).int() 47 | x = torch.remainder(pidxs, qw).int() 48 | else: 49 | prob = torch.sigmoid_(query_logits).view(-1) 50 | pidxs = prob > self.score_th 51 | y = last_ys[pidxs] 52 | x = last_xs[pidxs] 53 | 54 | if y.size(0) == 0: 55 | return None, None, None, None, None, None 56 | 57 | _, fc, fh, fw = feature_value.shape 58 | 59 | ys, xs = [], [] 60 | for i in range(2): 61 | for j in range(2): 62 | ys.append(y * 2 + i) 63 | xs.append(x * 2 + j) 64 | 65 | ys = torch.cat(ys, dim=0) 66 | xs = torch.cat(xs, dim=0) 67 | inds = (ys * fw + xs).long() 68 | 69 | sparse_ys = [] 70 | sparse_xs = [] 71 | 72 | for i in range(-1*self.context, self.context+1): 73 | for j in range(-1*self.context, self.context+1): 74 | sparse_ys.append(ys+i) 75 | sparse_xs.append(xs+j) 76 | 77 | sparse_ys = torch.cat(sparse_ys, dim=0) 78 | sparse_xs = torch.cat(sparse_xs, dim=0) 79 | 80 | 81 | good_idx = (sparse_ys >= 0) & (sparse_ys < fh) & (sparse_xs >= 0) & (sparse_xs < fw) 82 | sparse_ys = sparse_ys[good_idx] 83 | sparse_xs = sparse_xs[good_idx] 84 | 85 | sparse_yx = torch.stack((sparse_ys, sparse_xs), dim=0).t() 86 | sparse_yx = torch.unique(sparse_yx, sorted=False, dim=0) 87 | 88 | sparse_ys = sparse_yx[:, 0] 89 | sparse_xs = sparse_yx[:, 1] 90 | 91 | sparse_inds = (sparse_ys * fw + sparse_xs).long() 92 | 93 | sparse_features = feature_value.view(fc, -1).transpose(0, 1)[sparse_inds].view(-1, fc) 94 | sparse_indices = torch.stack((torch.zeros_like(sparse_ys), sparse_ys, sparse_xs), dim=-1) 95 | sparse_tensor = spconv.SparseConvTensor(sparse_features, sparse_indices.int(), (fh, fw), 1) 96 | 97 | anchors = anchors.tensor.view(-1, self.anchor_num, 4) 98 | selected_anchors = anchors[inds].view(1, -1, 4) 99 | return sparse_tensor, ys, xs, inds, selected_anchors, sparse_indices.size(0) 100 | 101 | def _make_spconv(self, weights, biases): 102 | nets = [] 103 | for i in range(len(weights)): 104 | in_channel = weights[i].shape[1] 105 | out_channel = weights[i].shape[0] 106 | k_size = weights[i].shape[2] 107 | filter = spconv.SubMConv2d(in_channel, out_channel, k_size, 1, padding=k_size//2, indice_key="asd", algo=spconv.ConvAlgo.Native).to(device=weights[i].device) 108 | filter.weight.data[:] = weights[i].permute(2,3,1,0).contiguous()[:] # transpose(1,2).transpose(0,1).transpose(2,3).transpose(1,2).transpose(2,3) 109 | filter.bias.data = biases[i] 110 | nets.append(filter) 111 | if i != len(weights) - 1: 112 | nets.append(torch.nn.ReLU(inplace=True)) 113 | return spconv.SparseSequential(*nets) 114 | 115 | def _make_conv(self, weights, biases): 116 | nets = [] 117 | for i in range(len(weights)): 118 | in_channel = weights[i].shape[0] 119 | out_channel = weights[i].shape[1] 120 | k_size = weights[i].shape[2] 121 | filter = torch.nn.Conv2d(in_channel, out_channel, k_size, 1, padding=k_size//2) 122 | filter.weight.data = weights[i] 123 | filter.bias.data = 
biases[i] 124 | nets.append(filter) 125 | if i != len(weights) - 1: 126 | nets.append(torch.nn.ReLU()) 127 | return torch.nn.Sequential(*nets) 128 | 129 | def _run_spconvs(self, x, filters): 130 | y = filters(x) 131 | return y.dense(channels_first=False) 132 | 133 | def _run_convs(self, x, filters): 134 | return filters(x) 135 | 136 | def run_qinfer(self, model_params, features_key, features_value, anchors_value): 137 | 138 | if not self.initialized: 139 | cls_weights, cls_biases, bbox_weights, bbox_biases, qcls_weights, qcls_biases = model_params 140 | assert len(cls_weights) == len(qcls_weights) 141 | self.n_conv = len(cls_weights) 142 | self.cls_spconv = self._make_spconv(cls_weights, cls_biases) 143 | self.bbox_spconv = self._make_spconv(bbox_weights, bbox_biases) 144 | self.qcls_spconv = self._make_spconv(qcls_weights, qcls_biases) 145 | self.qcls_conv = self._make_conv(qcls_weights, qcls_biases) 146 | self.initialized = True 147 | 148 | last_ys, last_xs = None, None 149 | query_logits = self._run_convs(features_key[-1], self.qcls_conv) 150 | det_cls_query, det_bbox_query, query_anchors = [], [], [] 151 | 152 | n_inds_all = [] 153 | 154 | for i in range(len(features_value)-1, -1, -1): 155 | x, last_ys, last_xs, inds, selected_anchors, n_inds = self._make_sparse_tensor(query_logits, last_ys, last_xs, anchors_value[i], features_value[i]) 156 | n_inds_all.append(n_inds) 157 | if x == None: 158 | break 159 | cls_result = self._run_spconvs(x, self.cls_spconv).view(-1, self.anchor_num*self.num_classes)[inds] 160 | bbox_result = self._run_spconvs(x, self.bbox_spconv).view(-1, self.anchor_num*4)[inds] 161 | query_logits = self._run_spconvs(x, self.qcls_spconv).view(-1)[inds] 162 | 163 | query_anchors.append(selected_anchors) 164 | det_cls_query.append(torch.unsqueeze(cls_result, 0)) 165 | det_bbox_query.append(torch.unsqueeze(bbox_result, 0)) 166 | 167 | return det_cls_query, det_bbox_query, query_anchors 168 | 169 | 170 | -------------------------------------------------------------------------------- /models/retinanet/__pycache__/retinanet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/retinanet/__pycache__/retinanet.cpython-36.pyc -------------------------------------------------------------------------------- /models/retinanet/__pycache__/retinanet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/models/retinanet/__pycache__/retinanet.cpython-37.pyc -------------------------------------------------------------------------------- /models/retinanet/retinanet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import logging 3 | import math 4 | import time 5 | import numpy as np 6 | from typing import List 7 | import torch 8 | from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss 9 | from torch import nn 10 | import torch.nn.functional as F 11 | 12 | from detectron2.layers import ShapeSpec, batched_nms, cat, get_norm, Conv2d 13 | from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou 14 | from detectron2.utils.events import get_event_storage 15 | from detectron2.utils.logger import log_first_n 16 | import detectron2.utils.comm as comm 17 | 18 | from detectron2.modeling.anchor_generator import build_anchor_generator 19 | from detectron2.modeling.backbone import build_backbone 20 | from detectron2.modeling.box_regression import Box2BoxTransform 21 | from detectron2.modeling.matcher import Matcher 22 | from detectron2.modeling.postprocessing import detector_postprocess 23 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 24 | 25 | from torch.cuda import Event 26 | from utils.loop_matcher import LoopMatcher 27 | 28 | 29 | __all__ = ["RetinaNet_D2"] 30 | 31 | 32 | def permute_to_N_HWA_K(tensor, K): 33 | """ 34 | Transpose/reshape a tensor from (N, (A x K), H, W) to (N, (HxWxA), K) 35 | """ 36 | assert tensor.dim() == 4, tensor.shape 37 | N, _, H, W = tensor.shape 38 | tensor = tensor.view(N, -1, K, H, W) 39 | tensor = tensor.permute(0, 3, 4, 1, 2) 40 | tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K) 41 | return tensor 42 | 43 | 44 | def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls, box_delta, num_classes=80): 45 | """ 46 | Rearrange the tensor layout from the network output, i.e.: 47 | list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi) 48 | to per-image predictions, i.e.: 49 | Tensor: of shape (N x sum(Hi x Wi x A), K) 50 | """ 51 | # for each feature level, permute the outputs to make them be in the 52 | # same format as the labels. Note that the labels are computed for 53 | # all feature levels concatenated, so we keep the same representation 54 | # for the objectness and the box_delta 55 | box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls] 56 | box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta] 57 | # concatenate on the first dimension (representing the feature levels), to 58 | # take into account the way the labels were generated (with all feature maps 59 | # being concatenated as well) 60 | box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes) 61 | box_delta = cat(box_delta_flattened, dim=1).view(-1, 4) 62 | return box_cls, box_delta 63 | 64 | 65 | def permute_all_to_NHWA_K_not_concat(box_cls, box_delta, num_classes=80): 66 | box_cls_flattened = [permute_to_N_HWA_K(x, num_classes).view(-1, num_classes) for x in box_cls] 67 | box_delta_flattened = [permute_to_N_HWA_K(x, 4).view(-1, 4) for x in box_delta] 68 | return box_cls_flattened, box_delta_flattened 69 | 70 | @META_ARCH_REGISTRY.register() 71 | class RetinaNet_D2(nn.Module): 72 | """ 73 | Implement RetinaNet in :paper:`RetinaNet`. 
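    A minimal usage sketch (assuming the standard Detectron2 config flow that the
    training scripts in this repository also rely on):

        from detectron2.modeling import build_model
        cfg.MODEL.META_ARCHITECTURE = "RetinaNet_D2"
        model = build_model(cfg)          # looks this class up in META_ARCH_REGISTRY
        losses = model(batched_inputs)    # in training mode, returns a dict of losses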
74 | """ 75 | 76 | def __init__(self, cfg): 77 | super().__init__() 78 | 79 | # fmt: off 80 | self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES 81 | self.in_features = cfg.MODEL.RETINANET.IN_FEATURES 82 | # Loss parameters: 83 | self.focal_loss_alpha = cfg.MODEL.CUSTOM.FOCAL_LOSS_ALPHAS 84 | self.focal_loss_gamma = cfg.MODEL.CUSTOM.FOCAL_LOSS_GAMMAS 85 | self.cls_weights = cfg.MODEL.CUSTOM.CLS_WEIGHTS 86 | self.reg_weights = cfg.MODEL.CUSTOM.REG_WEIGHTS 87 | self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA 88 | # Inference parameters: 89 | self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST 90 | self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST 91 | self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST 92 | self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE 93 | # Vis parameters 94 | self.vis_period = cfg.VIS_PERIOD 95 | self.input_format = cfg.INPUT.FORMAT 96 | self.scale_factor = 1 97 | # fmt: on 98 | 99 | self.backbone = build_backbone(cfg) 100 | 101 | backbone_shape = self.backbone.output_shape() 102 | feature_shapes = [backbone_shape[f] for f in self.in_features] 103 | self.head = RetinaNetHead(cfg, feature_shapes) 104 | self.anchor_generator = build_anchor_generator(cfg, feature_shapes) 105 | 106 | # Matching and loss 107 | self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS) 108 | if cfg.MODEL.CUSTOM.USE_LOOP_MATCHER: 109 | self.matcher = LoopMatcher( 110 | cfg.MODEL.RETINANET.IOU_THRESHOLDS, 111 | cfg.MODEL.RETINANET.IOU_LABELS, 112 | allow_low_quality_matches=True, 113 | ) 114 | else: 115 | self.matcher = Matcher( 116 | cfg.MODEL.RETINANET.IOU_THRESHOLDS, 117 | cfg.MODEL.RETINANET.IOU_LABELS, 118 | allow_low_quality_matches=True, 119 | ) 120 | 121 | self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) 122 | self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) 123 | 124 | """ 125 | In Detectron1, loss is normalized by number of foreground samples in the batch. 126 | When batch size is 1 per GPU, #foreground has a large variance and 127 | using it lead to lower performance. Here we maintain an EMA of #foreground to 128 | stabilize the normalizer. 129 | """ 130 | self.loss_normalizer = 100 # initialize with any reasonable #fg that's not too small 131 | self.loss_normalizer_momentum = 0.9 132 | 133 | self.iter = 0 134 | self.class_stat = [0 for _ in range(10)] 135 | 136 | @property 137 | def device(self): 138 | return self.pixel_mean.device 139 | 140 | 141 | def visualize_training(self, batched_inputs, results): 142 | from detectron2.utils.visualizer import Visualizer 143 | 144 | assert len(batched_inputs) == len( 145 | results 146 | ), "Cannot visualize inputs and results of different sizes" 147 | storage = get_event_storage() 148 | max_boxes = 20 149 | 150 | image_index = 0 # only visualize a single image 151 | img = batched_inputs[image_index]["image"].cpu().numpy() 152 | assert img.shape[0] == 3, "Images should have 3 channels." 
153 | if self.input_format == "BGR": 154 | img = img[::-1, :, :] 155 | img = img.transpose(1, 2, 0) 156 | v_gt = Visualizer(img, None) 157 | v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes) 158 | anno_img = v_gt.get_image() 159 | processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1]) 160 | predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy() 161 | 162 | v_pred = Visualizer(img, None) 163 | v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes]) 164 | prop_img = v_pred.get_image() 165 | vis_img = np.vstack((anno_img, prop_img)) 166 | vis_img = vis_img.transpose(2, 0, 1) 167 | vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results" 168 | storage.put_image(vis_name, vis_img) 169 | 170 | 171 | def forward(self, batched_inputs): 172 | start_event = Event(enable_timing=True) 173 | end_event = Event(enable_timing=True) 174 | 175 | images = self.preprocess_image(batched_inputs) 176 | if "instances" in batched_inputs[0]: 177 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 178 | elif "targets" in batched_inputs[0]: 179 | log_first_n( 180 | logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 181 | ) 182 | gt_instances = [x["targets"].to(self.device) for x in batched_inputs] 183 | else: 184 | gt_instances = None 185 | 186 | start_event.record() 187 | 188 | features = self.backbone(images.tensor) 189 | features = [features[f] for f in self.in_features] 190 | box_cls, box_delta = self.head(features) 191 | anchors = self.anchor_generator(features) 192 | 193 | if self.training: 194 | # torch.cuda.empty_cache() 195 | # gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(anchors, gt_instances) 196 | # losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta) 197 | 198 | gt_classes, gt_deltas = self.get_det_gt(anchors, gt_instances) 199 | losses = self.det_loss(gt_classes, gt_deltas, box_cls, box_delta, self.focal_loss_alpha, self.focal_loss_gamma, self.cls_weights, self.reg_weights) 200 | 201 | 202 | if self.vis_period > 0: 203 | storage = get_event_storage() 204 | if storage.iter % self.vis_period == 0: 205 | results = self.inference(box_cls, box_delta, anchors, images.image_sizes) 206 | self.visualize_training(batched_inputs, results) 207 | 208 | return losses 209 | else: 210 | results = self.inference(box_cls, box_delta, anchors, images.image_sizes) 211 | end_event.record() 212 | torch.cuda.synchronize() 213 | total_time = start_event.elapsed_time(end_event) 214 | processed_results = [] 215 | for results_per_image, input_per_image, image_size in zip( 216 | results, batched_inputs, images.image_sizes 217 | ): 218 | height = input_per_image.get("height", image_size[0]) 219 | width = input_per_image.get("width", image_size[1]) 220 | r = detector_postprocess(results_per_image, height, width) 221 | processed_results.append({"instances": r, 'time':total_time}) 222 | return processed_results 223 | 224 | 225 | @torch.no_grad() 226 | def get_det_gt(self, anchors, targets): 227 | gt_classes = [] 228 | gt_anchors_deltas = [] 229 | anchor_layers = len(anchors) 230 | anchor_lens = [len(x) for x in anchors] 231 | start_inds = [0] + [sum(anchor_lens[:i]) for i in range(1, len(anchor_lens))] 232 | end_inds = [sum(anchor_lens[:i+1]) for i in range(len(anchor_lens))] 233 | anchors = Boxes.cat(anchors) # Rx4 234 | 235 | for targets_per_image in targets: 236 | if type(self.matcher) == Matcher: 237 | 
match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors) 238 | gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix) 239 | del(match_quality_matrix) 240 | else: 241 | gt_matched_idxs, anchor_labels = self.matcher(targets_per_image.gt_boxes, anchors) 242 | 243 | has_gt = len(targets_per_image) > 0 244 | if has_gt: 245 | # ground truth box regression 246 | matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs] 247 | gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas( 248 | anchors.tensor, matched_gt_boxes.tensor 249 | ) 250 | 251 | gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] 252 | # Anchors with label 0 are treated as background. 253 | gt_classes_i[anchor_labels == 0] = self.num_classes 254 | # Anchors with label -1 are ignored. 255 | gt_classes_i[anchor_labels == -1] = -1 256 | 257 | else: 258 | gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes 259 | gt_anchors_reg_deltas_i = torch.zeros_like(anchors.tensor) 260 | 261 | gt_classes.append([gt_classes_i[s:e] for s, e in zip(start_inds, end_inds)]) 262 | gt_anchors_deltas.append([gt_anchors_reg_deltas_i[s:e] for s, e in zip(start_inds, end_inds)]) 263 | 264 | gt_classes = [torch.stack([x[i] for x in gt_classes]) for i in range(anchor_layers)] 265 | gt_anchors_deltas = [torch.stack([x[i] for x in gt_anchors_deltas]) for i in range(anchor_layers)] 266 | 267 | gt_classes = torch.cat([x.flatten() for x in gt_classes]) 268 | gt_anchors_deltas = torch.cat([x.reshape(-1, 4) for x in gt_anchors_deltas]) 269 | 270 | return gt_classes, gt_anchors_deltas 271 | 272 | 273 | def det_loss(self, gt_classes, gt_anchors_deltas, pred_logits, pred_deltas, alphas, gammas, cls_weights, reg_weights): 274 | def convert_gt_cls(logits, gt_class, f_idxs): 275 | gt_classes_target = torch.zeros_like(logits) 276 | gt_classes_target[f_idxs, gt_class[f_idxs]] = 1 277 | return gt_classes_target 278 | 279 | assert len(cls_weights) == len(pred_logits) 280 | assert len(cls_weights) == len(reg_weights) 281 | 282 | pred_logits, pred_deltas = permute_all_to_NHWA_K_not_concat(pred_logits, pred_deltas, self.num_classes) 283 | 284 | lengths = [x.shape[0] for x in pred_logits] 285 | start_inds = [0] + [sum(lengths[:i]) for i in range(1, len(lengths))] 286 | end_inds = [sum(lengths[:i+1]) for i in range(len(lengths))] 287 | 288 | gt_classes = gt_classes.flatten() 289 | gt_anchors_deltas = gt_anchors_deltas.view(-1, 4) 290 | 291 | valid_idxs = gt_classes >= 0 292 | foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes) 293 | num_foreground = foreground_idxs.sum().item() 294 | get_event_storage().put_scalar("num_foreground", num_foreground) 295 | self.loss_normalizer = ( 296 | self.loss_normalizer_momentum * self.loss_normalizer 297 | + (1 - self.loss_normalizer_momentum) * num_foreground 298 | ) 299 | gt_clsses_list = [gt_classes[s:e] for s, e in zip(start_inds, end_inds)] 300 | gt_anchors_deltas_list = [gt_anchors_deltas[s:e] for s, e in zip(start_inds, end_inds)] 301 | valid_idxs_list = [valid_idxs[s:e] for s, e in zip(start_inds, end_inds)] 302 | foreground_idxs_list = [foreground_idxs[s:e] for s, e in zip(start_inds, end_inds)] 303 | 304 | loss_cls = [ 305 | w * sigmoid_focal_loss_jit( 306 | x[v], 307 | convert_gt_cls(x, g, f)[v].detach(), 308 | alpha=alpha, 309 | gamma=gamma, 310 | reduction="sum" 311 | ) 312 | for w, x, g, v, f, alpha, gamma in zip(cls_weights, pred_logits, gt_clsses_list, valid_idxs_list, foreground_idxs_list, alphas, gammas) 313 | ] 314 | 315 | loss_box_reg = [ 316 | 
w * smooth_l1_loss( 317 | x[f], 318 | g[f].detach(), 319 | beta=self.smooth_l1_loss_beta, 320 | reduction="sum" 321 | ) 322 | for w, x, g, f in zip(reg_weights, pred_deltas, gt_anchors_deltas_list, foreground_idxs_list) 323 | ] 324 | 325 | loss_cls = sum(loss_cls) / max(1., self.loss_normalizer) 326 | loss_box_reg = sum(loss_box_reg) / max(1., self.loss_normalizer) 327 | return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg} 328 | 329 | 330 | def inference(self, box_cls, box_delta, anchors, image_sizes): 331 | """ 332 | Arguments: 333 | box_cls, box_delta: Same as the output of :meth:`RetinaNetHead.forward` 334 | anchors (list[Boxes]): A list of #feature level Boxes. 335 | The Boxes contain anchors of this image on the specific feature level. 336 | image_sizes (List[torch.Size]): the input image sizes 337 | 338 | Returns: 339 | results (List[Instances]): a list of #images elements. 340 | """ 341 | results = [] 342 | times = [] 343 | 344 | box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls] 345 | box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta] 346 | 347 | for img_idx, image_size in enumerate(image_sizes): 348 | box_cls_per_image = [box_cls_per_level[img_idx] for box_cls_per_level in box_cls] 349 | box_reg_per_image = [box_reg_per_level[img_idx] for box_reg_per_level in box_delta] 350 | results_per_image = self.inference_single_image( 351 | box_cls_per_image, box_reg_per_image, anchors, (image_size[0]*self.scale_factor, image_size[1]*self.scale_factor) 352 | ) 353 | results.append(results_per_image) 354 | return results 355 | 356 | 357 | def inference_single_image(self, box_cls, box_delta, anchors, image_size): 358 | """ 359 | Single-image inference. Return bounding-box detection results by thresholding 360 | on scores and applying non-maximum suppression (NMS). 361 | 362 | Arguments: 363 | box_cls (list[Tensor]): list of #feature levels. Each entry contains 364 | tensor of size (H x W x A, K) 365 | box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. 366 | anchors (list[Boxes]): list of #feature levels. Each entry contains 367 | a Boxes object, which contains all the anchors for that 368 | image in that feature level. 369 | image_size (tuple(H, W)): a tuple of the image height and width. 370 | 371 | Returns: 372 | Same as `inference`, but for only one image. 373 | """ 374 | boxes_all = [] 375 | scores_all = [] 376 | class_idxs_all = [] 377 | 378 | # Iterate over every feature level 379 | for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors): 380 | # (HxWxAxK,) 381 | box_cls_i = box_cls_i.flatten().sigmoid_() 382 | 383 | # Keep top k top scoring indices only. 
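            # box_cls_i was flattened from (H*W*A, K) to (H*W*A*K,), so a single
            # top-k index j encodes both the anchor and the class; the decoding a
            # few lines below is
            #
            #   anchor_idx = j // K        # which anchor at which location
            #   class_idx  = j %  K        # which of the K classes
            #
            # e.g. (made-up numbers) with K = 80, j = 803 -> anchor 10, class 3.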
384 | num_topk = min(self.topk_candidates, box_reg_i.size(0)) 385 | # torch.sort is actually faster than .topk (at least on GPUs) 386 | predicted_prob, topk_idxs = box_cls_i.sort(descending=True) 387 | predicted_prob = predicted_prob[:num_topk] 388 | topk_idxs = topk_idxs[:num_topk] 389 | 390 | # filter out the proposals with low confidence score 391 | keep_idxs = predicted_prob > self.score_threshold 392 | predicted_prob = predicted_prob[keep_idxs] 393 | topk_idxs = topk_idxs[keep_idxs] 394 | 395 | anchor_idxs = topk_idxs // self.num_classes 396 | classes_idxs = topk_idxs % self.num_classes 397 | 398 | box_reg_i = box_reg_i[anchor_idxs] 399 | anchors_i = anchors_i[anchor_idxs] 400 | # predict boxes 401 | predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor) 402 | 403 | boxes_all.append(predicted_boxes) 404 | scores_all.append(predicted_prob) 405 | class_idxs_all.append(classes_idxs) 406 | 407 | boxes_all, scores_all, class_idxs_all = [ 408 | cat(x) for x in [boxes_all, scores_all, class_idxs_all] 409 | ] 410 | 411 | keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold) 412 | 413 | keep = keep[: self.max_detections_per_image] 414 | 415 | result = Instances(image_size) 416 | result.pred_boxes = Boxes(boxes_all[keep]) 417 | result.scores = scores_all[keep] 418 | result.pred_classes = class_idxs_all[keep] 419 | return result 420 | 421 | 422 | def preprocess_image(self, batched_inputs): 423 | """ 424 | Normalize, pad and batch the input images. 425 | """ 426 | images = [x["image"].to(self.device) for x in batched_inputs] 427 | images = [(x - self.pixel_mean) / self.pixel_std for x in images] 428 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 429 | return images 430 | 431 | 432 | class RetinaNetHead(nn.Module): 433 | """ 434 | The head used in RetinaNet for object classification and box regression. 435 | It has two subnets for the two tasks, with a common structure but separate parameters. 436 | """ 437 | 438 | def __init__(self, cfg, input_shape: List[ShapeSpec]): 439 | super().__init__() 440 | # fmt: off 441 | in_channels = input_shape[0].channels 442 | num_classes = cfg.MODEL.RETINANET.NUM_CLASSES 443 | num_convs = cfg.MODEL.RETINANET.NUM_CONVS 444 | prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB 445 | num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors 446 | # fmt: on 447 | assert ( 448 | len(set(num_anchors)) == 1 449 | ), "Using different number of anchors between levels is not currently supported!" 
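        # num_anchors (A) is the number of anchors per spatial location, identical
        # across levels as asserted above.  With Detectron2's default RetinaNet
        # anchor generator (3 sizes x 3 aspect ratios) A = 9; the exact value here
        # depends on the anchor settings in this repo's yaml configs.  The two
        # prediction branches built below therefore end with
        #
        #   cls_score: A * num_classes output channels
        #   bbox_pred: A * 4           output channels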
450 | num_anchors = num_anchors[0] 451 | 452 | cls_subnet = [] 453 | bbox_subnet = [] 454 | for _ in range(num_convs): 455 | cls_subnet.append( 456 | nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) 457 | ) 458 | cls_subnet.append(nn.ReLU()) 459 | bbox_subnet.append( 460 | nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) 461 | ) 462 | bbox_subnet.append(nn.ReLU()) 463 | 464 | self.cls_subnet = nn.Sequential(*cls_subnet) 465 | self.bbox_subnet = nn.Sequential(*bbox_subnet) 466 | self.cls_score = nn.Conv2d( 467 | in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1 468 | ) 469 | self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1) 470 | 471 | # Initialization 472 | for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]: 473 | for layer in modules.modules(): 474 | if isinstance(layer, nn.Conv2d): 475 | #torch.nn.init.xavier_normal_(layer.weight) 476 | torch.nn.init.normal_(layer.weight, mean=0, std=0.01) 477 | torch.nn.init.constant_(layer.bias, 0) 478 | 479 | # Use prior in model initialization to improve stability 480 | bias_value = -(math.log((1 - prior_prob) / prior_prob)) 481 | torch.nn.init.constant_(self.cls_score.bias, bias_value) 482 | 483 | def forward(self, features): 484 | """ 485 | Arguments: 486 | features (list[Tensor]): FPN feature map tensors in high to low resolution. 487 | Each tensor in the list correspond to different feature levels. 488 | 489 | Returns: 490 | logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi). 491 | The tensor predicts the classification probability 492 | at each spatial position for each of the A anchors and K object 493 | classes. 494 | bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi). 495 | The tensor predicts 4-vector (dx,dy,dw,dh) box 496 | regression values for every anchor. These values are the 497 | relative offset between the anchor and the ground truth box. 498 | """ 499 | logits = [] 500 | bbox_reg = [] 501 | for feature in features: 502 | logits.append(self.cls_score(self.cls_subnet(feature))) 503 | bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature))) 504 | return logits, bbox_reg 505 | -------------------------------------------------------------------------------- /train_coco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from detectron2.engine import launch 4 | from train_tools.coco_train import default_argument_parser, start_train 5 | 6 | from models.retinanet.retinanet import RetinaNet_D2 7 | from models.querydet.detector import RetinaNetQueryDet 8 | 9 | 10 | if __name__ == '__main__': 11 | args = default_argument_parser().parse_args() 12 | print("Command Line Args:", args) 13 | launch( 14 | start_train, 15 | args.num_gpus, 16 | num_machines=args.num_machines, 17 | machine_rank=args.machine_rank, 18 | dist_url=args.dist_url, 19 | args=(args,), 20 | ) -------------------------------------------------------------------------------- /train_tools/coco_infer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | Detection Training Script. 5 | 6 | This scripts reads a given config file and runs the training or evaluation. 7 | It is an entry point that is made to train standard models in detectron2. 
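A rough evaluation command (a sketch: it assumes infer_coco.py at the repository
root forwards its arguments into this script, mirroring how train_coco.py wraps
coco_train.py; the checkpoint path is a placeholder):

    python infer_coco.py --config-file configs/coco/querydet_test.yaml \
        --num-gpus 1 --eval-only MODEL.WEIGHTS <path-to-checkpoint>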
8 | 9 | In order to let one script support training of many models, 10 | this script contains logic that are specific to these built-in models and therefore 11 | may not be suitable for your own project. 12 | For example, your research project perhaps only needs a single "evaluator". 13 | 14 | Therefore, we recommend you to use detectron2 as an library and take 15 | this file as an example of how to use the library. 16 | You may want to write your own script with your datasets and other customizations. 17 | """ 18 | 19 | import logging 20 | import sys 21 | import os 22 | from collections import OrderedDict 23 | import torch 24 | import argparse 25 | from torch.nn.parallel import DistributedDataParallel 26 | 27 | import detectron2.utils.comm as comm 28 | from detectron2.checkpoint import DetectionCheckpointer 29 | from detectron2.config import get_cfg 30 | from detectron2.data import MetadataCatalog, build_detection_test_loader 31 | from detectron2.engine import DefaultTrainer, default_setup, hooks, launch 32 | from detectron2.evaluation import ( 33 | CityscapesInstanceEvaluator, 34 | CityscapesSemSegEvaluator, 35 | COCOEvaluator, 36 | COCOPanopticEvaluator, 37 | DatasetEvaluators, 38 | LVISEvaluator, 39 | PascalVOCDetectionEvaluator, 40 | SemSegEvaluator, 41 | verify_results, 42 | ) 43 | from detectron2.modeling import GeneralizedRCNNWithTTA 44 | from detectron2.checkpoint import DetectionCheckpointer 45 | from detectron2_backbone.config import add_backbone_config 46 | from detectron2_backbone import mobilenet 47 | 48 | from utils.val_mapper_with_ann import ValMapper 49 | from utils.time_evaluator import GPUTimeEvaluator 50 | from utils.coco_eval_fpn import COCOEvaluatorFPN 51 | from utils.anchor_gen import AnchorGeneratorWithCenter 52 | from configs.custom_config import add_custom_config 53 | 54 | 55 | 56 | class Trainer(DefaultTrainer): 57 | @classmethod 58 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 59 | if output_folder is None: 60 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 61 | evaluator_list = [] 62 | if cfg.META_INFO.EVAL_AP: 63 | evaluator_list.append(COCOEvaluatorFPN(dataset_name, cfg, True, os.path.join(cfg.OUTPUT_DIR))) 64 | if cfg.META_INFO.EVAL_GPU_TIME: 65 | evaluator_list.append(GPUTimeEvaluator(True, 'minisecond')) 66 | return DatasetEvaluators(evaluator_list) 67 | 68 | def default_argument_parser(epilog=None): 69 | """ 70 | Create a parser with some common arguments used by detectron2 users. 71 | 72 | Args: 73 | epilog (str): epilog passed to ArgumentParser describing the usage. 
74 | 75 | Returns: 76 | argparse.ArgumentParser: 77 | """ 78 | parser = argparse.ArgumentParser( 79 | epilog=epilog 80 | or f""" 81 | Examples: 82 | 83 | Run on single machine: 84 | $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth 85 | 86 | Run on multiple machines: 87 | (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url [--other-flags] 88 | (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url [--other-flags] 89 | """, 90 | formatter_class=argparse.RawDescriptionHelpFormatter, 91 | ) 92 | parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") 93 | parser.add_argument( 94 | "--resume", 95 | action="store_true", 96 | help="whether to attempt to resume from the checkpoint directory", 97 | ) 98 | parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") 99 | parser.add_argument("--no-pretrain", action="store_true", help="whether to load pretrained model") 100 | parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") 101 | parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") 102 | parser.add_argument( 103 | "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" 104 | ) 105 | 106 | 107 | # PyTorch still may leave orphan processes in multi-gpu training. 108 | # Therefore we use a deterministic way to obtain port, 109 | # so that users are aware of orphan processes by seeing the port occupied. 110 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 111 | parser.add_argument( 112 | "--dist-url", 113 | default="tcp://127.0.0.1:{}".format(port), 114 | help="initialization URL for pytorch distributed backend. See " 115 | "https://pytorch.org/docs/stable/distributed.html for details.", 116 | ) 117 | parser.add_argument( 118 | "opts", 119 | help="Modify config options using the command-line", 120 | default=None, 121 | nargs=argparse.REMAINDER, 122 | ) 123 | return parser 124 | 125 | def setup(args): 126 | """ 127 | Create configs and perform basic setups. 128 | """ 129 | cfg = get_cfg() 130 | add_custom_config(cfg) 131 | add_backbone_config(cfg) 132 | cfg.merge_from_file(args.config_file) 133 | cfg.merge_from_list(args.opts) 134 | cfg.freeze() 135 | default_setup(cfg, args) 136 | return cfg 137 | 138 | 139 | def start_train(args): 140 | cfg = setup(args) 141 | 142 | if args.eval_only: 143 | model = Trainer.build_model(cfg) 144 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 145 | cfg.MODEL.WEIGHTS, resume=args.resume 146 | ) 147 | res = Trainer.test(cfg, model) 148 | if comm.is_main_process(): 149 | verify_results(cfg, res) 150 | return res 151 | 152 | """ 153 | If you'd like to do anything fancier than the standard training logic, 154 | consider writing your own training loop (see plain_train_net.py) or 155 | subclassing the trainer. 156 | """ 157 | trainer = Trainer(cfg) 158 | if not args.no_pretrain: 159 | trainer.resume_or_load(resume=args.resume) 160 | return trainer.train() 161 | 162 | 163 | -------------------------------------------------------------------------------- /train_tools/coco_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | Detection Training Script. 
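A typical single-machine launch (a sketch; train_coco.py at the repository root
parses the arguments and hands start_train below to detectron2's launch()):

    python train_coco.py --config-file configs/coco/querydet_train.yaml --num-gpus 8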
5 | 6 | This scripts reads a given config file and runs the training or evaluation. 7 | It is an entry point that is made to train standard models in detectron2. 8 | 9 | In order to let one script support training of many models, 10 | this script contains logic that are specific to these built-in models and therefore 11 | may not be suitable for your own project. 12 | For example, your research project perhaps only needs a single "evaluator". 13 | 14 | Therefore, we recommend you to use detectron2 as an library and take 15 | this file as an example of how to use the library. 16 | You may want to write your own script with your datasets and other customizations. 17 | """ 18 | 19 | import logging 20 | import sys 21 | import os 22 | from collections import OrderedDict 23 | import torch 24 | import argparse 25 | from torch.nn.parallel import DistributedDataParallel 26 | 27 | import detectron2.utils.comm as comm 28 | from detectron2.checkpoint import DetectionCheckpointer 29 | from detectron2.config import get_cfg 30 | from detectron2.data import MetadataCatalog, build_detection_test_loader 31 | from detectron2.engine import DefaultTrainer, default_setup, hooks, launch 32 | from detectron2.evaluation import ( 33 | CityscapesInstanceEvaluator, 34 | CityscapesSemSegEvaluator, 35 | COCOEvaluator, 36 | COCOPanopticEvaluator, 37 | DatasetEvaluators, 38 | LVISEvaluator, 39 | PascalVOCDetectionEvaluator, 40 | SemSegEvaluator, 41 | verify_results, 42 | ) 43 | from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter 44 | from detectron2.modeling import GeneralizedRCNNWithTTA 45 | from detectron2.checkpoint import DetectionCheckpointer 46 | from detectron2.engine.train_loop import AMPTrainer, SimpleTrainer 47 | from detectron2.engine.defaults import DefaultTrainer 48 | 49 | from utils.val_mapper_with_ann import ValMapper 50 | from utils.anchor_gen import AnchorGeneratorWithCenter 51 | from utils.coco_eval_fpn import COCOEvaluatorFPN 52 | 53 | from configs.custom_config import add_custom_config 54 | 55 | # from detectron2_backbone.config import add_backbone_config 56 | # import detectron2_backbone.backbone.mobilenet 57 | 58 | 59 | class Trainer(DefaultTrainer): 60 | def __init__(self, cfg, resume=False, reuse_ckpt=False): 61 | """ 62 | Args: 63 | cfg (CfgNode): 64 | """ 65 | super(DefaultTrainer, self).__init__() 66 | 67 | logger = logging.getLogger("detectron2") 68 | if not logger.isEnabledFor(logging.INFO): # setup_logger is not called for d2 69 | setup_logger() 70 | cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size()) 71 | 72 | # Assume these objects must be constructed in this order. 73 | model = self.build_model(cfg) 74 | 75 | ckpt = DetectionCheckpointer(model) 76 | self.start_iter = 0 77 | self.start_iter = ckpt.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 78 | self.iter =self.start_iter 79 | 80 | optimizer = self.build_optimizer(cfg, model) 81 | data_loader = self.build_train_loader(cfg) 82 | 83 | # For training, wrap with DDP. But don't need this for inference. 
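        # The trainer wrapper below is picked from cfg.SOLVER.AMP.ENABLED:
        # AMPTrainer runs forward/backward under torch.cuda.amp autocast with a
        # GradScaler, while SimpleTrainer stays in full precision.  Mixed-precision
        # training is therefore switched on purely from the yaml config, e.g.
        # (standard Detectron2 key):
        #
        #   SOLVER:
        #     AMP:
        #       ENABLED: True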
84 | if comm.get_world_size() > 1: 85 | model = DistributedDataParallel( 86 | model, device_ids=[comm.get_local_rank()], broadcast_buffers=False 87 | ) 88 | self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)( 89 | model, data_loader, optimizer 90 | ) 91 | 92 | self.scheduler = self.build_lr_scheduler(cfg, optimizer) 93 | self.checkpointer = DetectionCheckpointer( 94 | model, 95 | cfg.OUTPUT_DIR, 96 | optimizer=optimizer, 97 | scheduler=self.scheduler, 98 | ) 99 | self.start_iter = 0 100 | self.max_iter = cfg.SOLVER.MAX_ITER 101 | self.cfg = cfg 102 | self.register_hooks(self.build_hooks()) 103 | 104 | @classmethod 105 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 106 | if output_folder is None: 107 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 108 | evaluator_list = [] 109 | if cfg.META_INFO.EVAL_AP: 110 | evaluator_list.append(COCOEvaluatorFPN(dataset_name, cfg, True, output_folder)) 111 | return DatasetEvaluators(evaluator_list) 112 | 113 | @classmethod 114 | def build_test_loader(cls, cfg, dataset_name): 115 | return build_detection_test_loader(cfg, dataset_name, ValMapper(cfg)) 116 | 117 | 118 | def default_argument_parser(epilog=None): 119 | """ 120 | Create a parser with some common arguments used by detectron2 users. 121 | 122 | Args: 123 | epilog (str): epilog passed to ArgumentParser describing the usage. 124 | 125 | Returns: 126 | argparse.ArgumentParser: 127 | """ 128 | parser = argparse.ArgumentParser( 129 | epilog=epilog 130 | or f""" 131 | Examples: 132 | 133 | Run on single machine: 134 | $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth 135 | 136 | Run on multiple machines: 137 | (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url [--other-flags] 138 | (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url [--other-flags] 139 | """, 140 | formatter_class=argparse.RawDescriptionHelpFormatter, 141 | ) 142 | parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") 143 | parser.add_argument( 144 | "--resume", 145 | action="store_true", 146 | help="whether to attempt to resume from the checkpoint directory", 147 | ) 148 | parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") 149 | parser.add_argument("--no-pretrain", action="store_true", help="whether to load pretrained model") 150 | parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") 151 | parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") 152 | parser.add_argument( 153 | "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" 154 | ) 155 | 156 | # PyTorch still may leave orphan processes in multi-gpu training. 157 | # Therefore we use a deterministic way to obtain port, 158 | # so that users are aware of orphan processes by seeing the port occupied. 159 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 160 | parser.add_argument( 161 | "--dist-url", 162 | default="tcp://127.0.0.1:{}".format(port), 163 | help="initialization URL for pytorch distributed backend. 
See " 164 | "https://pytorch.org/docs/stable/distributed.html for details.", 165 | ) 166 | parser.add_argument( 167 | "opts", 168 | help="Modify config options using the command-line", 169 | default=None, 170 | nargs=argparse.REMAINDER, 171 | ) 172 | return parser 173 | 174 | 175 | def setup(args): 176 | """ 177 | Create configs and perform basic setups. 178 | """ 179 | cfg = get_cfg() 180 | add_custom_config(cfg) 181 | # add_backbone_config(cfg) 182 | cfg.merge_from_file(args.config_file) 183 | cfg.merge_from_list(args.opts) 184 | cfg.freeze() 185 | default_setup(cfg, args) 186 | return cfg 187 | 188 | 189 | def start_train(args): 190 | cfg = setup(args) 191 | if args.eval_only: 192 | model = Trainer.build_model(cfg) 193 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 194 | cfg.MODEL.WEIGHTS, resume=args.resume 195 | ) 196 | res = Trainer.test(cfg, model) 197 | if cfg.TEST.AUG.ENABLED: 198 | res.update(Trainer.test_with_TTA(cfg, model)) 199 | if comm.is_main_process(): 200 | verify_results(cfg, res) 201 | return res 202 | trainer = Trainer(cfg, resume=args.resume, reuse_ckpt=args.no_pretrain) 203 | return trainer.train() 204 | -------------------------------------------------------------------------------- /train_tools/visdrone_infer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import os 4 | from collections import OrderedDict 5 | import torch 6 | import argparse 7 | 8 | import detectron2.utils.comm as comm 9 | from detectron2.checkpoint import DetectionCheckpointer 10 | from detectron2.config import get_cfg 11 | from detectron2.data import MetadataCatalog, build_detection_test_loader 12 | from detectron2.engine import DefaultTrainer, default_setup, hooks, launch 13 | from detectron2.evaluation import ( 14 | CityscapesInstanceEvaluator, 15 | CityscapesSemSegEvaluator, 16 | COCOEvaluator, 17 | COCOPanopticEvaluator, 18 | DatasetEvaluators, 19 | LVISEvaluator, 20 | PascalVOCDetectionEvaluator, 21 | SemSegEvaluator, 22 | verify_results, 23 | ) 24 | from detectron2.evaluation import ( 25 | DatasetEvaluator, 26 | inference_on_dataset, 27 | print_csv_format, 28 | verify_results, 29 | ) 30 | from detectron2.modeling import GeneralizedRCNNWithTTA 31 | from detectron2.checkpoint import DetectionCheckpointer 32 | from detectron2.evaluation.evaluator import inference_on_dataset 33 | 34 | 35 | from utils.val_mapper_with_ann import ValMapper 36 | from utils.anchor_gen import AnchorGeneratorWithCenter 37 | from utils.coco_eval_fpn import COCOEvaluatorFPN 38 | from utils.json_evaluator import JsonEvaluator 39 | from utils.time_evaluator import GPUTimeEvaluator 40 | 41 | from visdrone.dataloader import build_train_loader, build_test_loader 42 | 43 | # from models.backbone import build 44 | from configs.custom_config import add_custom_config 45 | 46 | from models.retinanet.retinanet import RetinaNet_D2 47 | from models.querydet.detector import RetinaNetQueryDet 48 | 49 | 50 | 51 | class Trainer(DefaultTrainer): 52 | @classmethod 53 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 54 | if output_folder is None: 55 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 56 | evaluator_list = [] 57 | evaluator_list.append(JsonEvaluator(os.path.join(cfg.OUTPUT_DIR, 'visdrone_infer.json'), class_add_1=True)) 58 | if cfg.META_INFO.EVAL_GPU_TIME: 59 | evaluator_list.append(GPUTimeEvaluator(True, 'minisecond')) 60 | return DatasetEvaluators(evaluator_list) 61 | 62 | @classmethod 63 | def 
build_train_loader(cls, cfg): 64 | return build_train_loader(cfg) 65 | 66 | @classmethod 67 | def build_test_loader(cls, cfg, dataset_name): 68 | return build_test_loader(cfg) 69 | 70 | @classmethod 71 | def test(cls, cfg, model, evaluators=None): 72 | logger = logging.getLogger(__name__) 73 | dataset_name = 'VisDrone2018' 74 | 75 | data_loader = cls.build_test_loader(cfg, dataset_name) 76 | evaluator = cls.build_evaluator(cfg, dataset_name) 77 | result = inference_on_dataset(model, data_loader, evaluator) 78 | if comm.is_main_process(): 79 | assert isinstance( 80 | result, dict 81 | ), "Evaluator must return a dict on the main process. Got {} instead.".format( 82 | result 83 | ) 84 | logger.info("Evaluation results for {} in csv format:".format(dataset_name)) 85 | print_csv_format(result) 86 | 87 | if len(result) == 1: 88 | result = list(result.values())[0] 89 | return result 90 | 91 | 92 | def default_argument_parser(epilog=None): 93 | """ 94 | Create a parser with some common arguments used by detectron2 users. 95 | 96 | Args: 97 | epilog (str): epilog passed to ArgumentParser describing the usage. 98 | 99 | Returns: 100 | argparse.ArgumentParser: 101 | """ 102 | parser = argparse.ArgumentParser( 103 | epilog=epilog 104 | or f""" 105 | Examples: 106 | 107 | Run on single machine: 108 | $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth 109 | 110 | Run on multiple machines: 111 | (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url [--other-flags] 112 | (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url [--other-flags] 113 | """, 114 | formatter_class=argparse.RawDescriptionHelpFormatter, 115 | ) 116 | parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") 117 | parser.add_argument( 118 | "--resume", 119 | action="store_true", 120 | help="whether to attempt to resume from the checkpoint directory", 121 | ) 122 | parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") 123 | parser.add_argument("--no-pretrain", action="store_true", help="whether to load pretrained model") 124 | parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") 125 | parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") 126 | parser.add_argument( 127 | "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" 128 | ) 129 | 130 | # PyTorch still may leave orphan processes in multi-gpu training. 131 | # Therefore we use a deterministic way to obtain port, 132 | # so that users are aware of orphan processes by seeing the port occupied. 133 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 134 | parser.add_argument( 135 | "--dist-url", 136 | default="tcp://127.0.0.1:{}".format(port), 137 | help="initialization URL for pytorch distributed backend. See " 138 | "https://pytorch.org/docs/stable/distributed.html for details.", 139 | ) 140 | parser.add_argument( 141 | "opts", 142 | help="Modify config options using the command-line", 143 | default=None, 144 | nargs=argparse.REMAINDER, 145 | ) 146 | return parser 147 | 148 | 149 | def setup(args): 150 | """ 151 | Create configs and perform basic setups. 
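    The merge order below is: Detectron2 defaults -> the extra keys from
    add_custom_config -> the yaml file given via --config-file -> any trailing
    KEY VALUE pairs from the command line (args.opts), after which the config is
    frozen.  A single run can therefore override config values without editing
    the yaml, e.g. (a sketch; the checkpoint path is a placeholder):

        ... --config-file configs/visdrone/querydet_test.yaml MODEL.WEIGHTS <path-to-checkpoint>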
152 | """ 153 | cfg = get_cfg() 154 | add_custom_config(cfg) 155 | cfg.merge_from_file(args.config_file) 156 | cfg.merge_from_list(args.opts) 157 | cfg.freeze() 158 | default_setup(cfg, args) 159 | return cfg 160 | 161 | 162 | def start_train(args): 163 | cfg = setup(args) 164 | 165 | if args.eval_only: 166 | model = Trainer.build_model(cfg) 167 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 168 | cfg.MODEL.WEIGHTS, resume=args.resume 169 | ) 170 | res = Trainer.test(cfg, model) 171 | if comm.is_main_process(): 172 | verify_results(cfg, res) 173 | return res 174 | 175 | trainer = Trainer(cfg) 176 | if not args.no_pretrain: 177 | trainer.resume_or_load(resume=args.resume) 178 | return trainer.train() 179 | -------------------------------------------------------------------------------- /train_tools/visdrone_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | Detection Training Script. 5 | 6 | This scripts reads a given config file and runs the training or evaluation. 7 | It is an entry point that is made to train standard models in detectron2. 8 | 9 | In order to let one script support training of many models, 10 | this script contains logic that are specific to these built-in models and therefore 11 | may not be suitable for your own project. 12 | For example, your research project perhaps only needs a single "evaluator". 13 | 14 | Therefore, we recommend you to use detectron2 as an library and take 15 | this file as an example of how to use the library. 16 | You may want to write your own script with your datasets and other customizations. 17 | """ 18 | 19 | import logging 20 | import sys 21 | import os 22 | from collections import OrderedDict 23 | import torch 24 | import time 25 | import argparse 26 | 27 | from torch.nn.parallel import DistributedDataParallel 28 | 29 | import detectron2.utils.comm as comm 30 | from detectron2.checkpoint import DetectionCheckpointer 31 | from detectron2.config import get_cfg 32 | from detectron2.data import MetadataCatalog, build_detection_test_loader 33 | from detectron2.engine import DefaultTrainer, default_setup, hooks, launch 34 | from detectron2.evaluation import ( 35 | CityscapesInstanceEvaluator, 36 | CityscapesSemSegEvaluator, 37 | COCOEvaluator, 38 | COCOPanopticEvaluator, 39 | DatasetEvaluators, 40 | LVISEvaluator, 41 | PascalVOCDetectionEvaluator, 42 | SemSegEvaluator, 43 | verify_results, 44 | ) 45 | from detectron2.modeling import GeneralizedRCNNWithTTA 46 | from detectron2.checkpoint import DetectionCheckpointer 47 | from detectron2.evaluation.evaluator import inference_on_dataset 48 | from detectron2.utils.events import JSONWriter, TensorboardXWriter 49 | from detectron2.engine.train_loop import AMPTrainer, SimpleTrainer 50 | from detectron2.engine.defaults import DefaultTrainer 51 | 52 | from utils.val_mapper_with_ann import ValMapper 53 | from utils.anchor_gen import AnchorGeneratorWithCenter 54 | from utils.coco_eval_fpn import COCOEvaluatorFPN 55 | from utils.json_evaluator import JsonEvaluator 56 | from utils.time_evaluator import GPUTimeEvaluator 57 | 58 | from visdrone.dataloader import build_train_loader, build_test_loader 59 | 60 | from configs.custom_config import add_custom_config 61 | 62 | 63 | class Trainer(DefaultTrainer): 64 | def __init__(self, cfg, resume=False, reuse_ckpt=False): 65 | """ 66 | Args: 67 | cfg (CfgNode): 68 | """ 69 | 
super(DefaultTrainer, self).__init__()
70 | 
71 |         logger = logging.getLogger("detectron2")
72 |         if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
73 |             from detectron2.utils.logger import setup_logger
                setup_logger()
74 |         cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
75 | 
76 |         # Assume these objects must be constructed in this order.
77 |         model = self.build_model(cfg)
78 | 
79 |         ckpt = DetectionCheckpointer(model)
80 |         self.start_iter = 0
81 |         self.start_iter = ckpt.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
82 |         self.iter = self.start_iter
83 | 
84 |         optimizer = self.build_optimizer(cfg, model)
85 |         data_loader = self.build_train_loader(cfg)
86 | 
87 |         # For training, wrap with DDP. But don't need this for inference.
88 |         if comm.get_world_size() > 1:
89 |             model = DistributedDataParallel(
90 |                 model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
91 |             )
92 |         self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(
93 |             model, data_loader, optimizer
94 |         )
95 | 
96 |         self.scheduler = self.build_lr_scheduler(cfg, optimizer)
97 |         self.checkpointer = DetectionCheckpointer(
98 |             model,
99 |             cfg.OUTPUT_DIR,
100 |             optimizer=optimizer,
101 |             scheduler=self.scheduler,
102 |         )
103 |         self.start_iter = 0
104 |         self.max_iter = cfg.SOLVER.MAX_ITER
105 |         self.cfg = cfg
106 | 
107 |         self.register_hooks(self.build_hooks())
108 | 
109 |     def resume_or_load(self, resume=True):
110 |         """
111 |         If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by
112 |         a `last_checkpoint` file), resume from the file. Resuming means loading all
113 |         available states (e.g. optimizer and scheduler) and updating the iteration counter
114 |         from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used.
115 |         Otherwise, this is considered an independent training run. The method will load model
116 |         weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start
117 |         from iteration 0.
118 |         Args:
119 |             resume (bool): whether to do resume or not
120 |         """
121 |         checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
122 | 
123 | 
124 |         if resume and self.checkpointer.has_checkpoint():
125 |             self.start_iter = checkpoint.get("iteration", -1) + 1
126 |             # The checkpoint stores the training iteration that just finished, thus we start
127 |             # at the next iteration (or iter zero if there's no checkpoint). 
128 | if isinstance(self.model, DistributedDataParallel): 129 | # broadcast loaded data/model from the first rank, because other 130 | # machines may not have access to the checkpoint file 131 | if TORCH_VERSION >= (1, 7): 132 | self.model._sync_params_and_buffers() 133 | self.start_iter = comm.all_gather(self.start_iter)[0] 134 | 135 | @classmethod 136 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 137 | if output_folder is None: 138 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 139 | evaluator_list = [] 140 | evaluator_list.append(JsonEvaluator(os.path.join(cfg.OUTPUT_DIR, 'visdrone_infer.json'))) 141 | if cfg.META_INFO.EVAL_GPU_TIME: 142 | evaluator_list.append(GPUTimeEvaluator(True, 'minisecond')) 143 | return DatasetEvaluators(evaluator_list) 144 | 145 | @classmethod 146 | def build_train_loader(cls, cfg): 147 | return build_train_loader(cfg) 148 | 149 | @classmethod 150 | def build_test_loader(cls, cfg, dataset_name): 151 | return build_test_loader(cfg) 152 | 153 | @classmethod 154 | def test(cls, cfg, model, evaluators=None): 155 | logger = logging.getLogger(__name__) 156 | dataset_name = 'VisDrone2018' 157 | 158 | data_loader = cls.build_test_loader(cfg, dataset_name) 159 | evaluator = cls.build_evaluator(cfg, dataset_name) 160 | result = inference_on_dataset(model, data_loader, evaluator) 161 | return [] 162 | 163 | 164 | 165 | 166 | def default_argument_parser(epilog=None): 167 | """ 168 | Create a parser with some common arguments used by detectron2 users. 169 | 170 | Args: 171 | epilog (str): epilog passed to ArgumentParser describing the usage. 172 | 173 | Returns: 174 | argparse.ArgumentParser: 175 | """ 176 | parser = argparse.ArgumentParser( 177 | epilog=epilog 178 | or f""" 179 | Examples: 180 | 181 | Run on single machine: 182 | $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth 183 | 184 | Run on multiple machines: 185 | (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url [--other-flags] 186 | (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url [--other-flags] 187 | """, 188 | formatter_class=argparse.RawDescriptionHelpFormatter, 189 | ) 190 | parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") 191 | parser.add_argument( 192 | "--resume", 193 | action="store_true", 194 | help="whether to attempt to resume from the checkpoint directory", 195 | ) 196 | parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") 197 | parser.add_argument("--no-pretrain", action="store_true", help="whether to load pretrained model") 198 | parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") 199 | parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") 200 | parser.add_argument( 201 | "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" 202 | ) 203 | 204 | # PyTorch still may leave orphan processes in multi-gpu training. 205 | # Therefore we use a deterministic way to obtain port, 206 | # so that users are aware of orphan processes by seeing the port occupied. 207 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 208 | parser.add_argument( 209 | "--dist-url", 210 | default="tcp://127.0.0.1:{}".format(port), 211 | help="initialization URL for pytorch distributed backend. 
See " 212 | "https://pytorch.org/docs/stable/distributed.html for details.", 213 | ) 214 | parser.add_argument( 215 | "opts", 216 | help="Modify config options using the command-line", 217 | default=None, 218 | nargs=argparse.REMAINDER, 219 | ) 220 | return parser 221 | 222 | 223 | def setup(args): 224 | """ 225 | Create configs and perform basic setups. 226 | """ 227 | cfg = get_cfg() 228 | add_custom_config(cfg) 229 | cfg.merge_from_file(args.config_file) 230 | cfg.merge_from_list(args.opts) 231 | cfg.freeze() 232 | default_setup(cfg, args) 233 | return cfg 234 | 235 | 236 | def start_train(args): 237 | cfg = setup(args) 238 | 239 | if args.eval_only: 240 | model = Trainer.build_model(cfg) 241 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 242 | cfg.MODEL.WEIGHTS, resume=args.resume 243 | ) 244 | res = Trainer.test(cfg, model) 245 | if comm.is_main_process(): 246 | verify_results(cfg, res) 247 | return res 248 | 249 | trainer = Trainer(cfg, resume=args.resume, reuse_ckpt=args.no_pretrain) 250 | return trainer.train() 251 | -------------------------------------------------------------------------------- /train_visdrone.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from detectron2.engine import launch 4 | from train_tools.visdrone_train import default_argument_parser, start_train 5 | 6 | from models.retinanet.retinanet import RetinaNet_D2 7 | from models.querydet.detector import RetinaNetQueryDet 8 | 9 | if __name__ == '__main__': 10 | args = default_argument_parser().parse_args() 11 | print("Command Line Args:", args) 12 | launch( 13 | start_train, 14 | args.num_gpus, 15 | num_machines=args.num_machines, 16 | machine_rank=args.machine_rank, 17 | dist_url=args.dist_url, 18 | args=(args,), 19 | ) -------------------------------------------------------------------------------- /utils/anchor_gen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator, _create_grid_offsets 4 | from detectron2.modeling import ANCHOR_GENERATOR_REGISTRY 5 | from detectron2.structures import Boxes 6 | import math 7 | import detectron2.utils.comm as comm 8 | 9 | 10 | @ANCHOR_GENERATOR_REGISTRY.register() 11 | class AnchorGeneratorWithCenter(DefaultAnchorGenerator): 12 | 13 | def _grid_anchors(self, grid_sizes): 14 | anchors = [] 15 | centers = [] 16 | for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): 17 | shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors.device) 18 | shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) 19 | center = torch.stack((shift_x, shift_y), dim=1) 20 | 21 | anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)) 22 | centers.append(center.view(-1, 2)) 23 | return anchors, centers 24 | 25 | def forward(self, features): 26 | grid_sizes = [feature_map.shape[-2:] for feature_map in features] 27 | anchors_over_all_feature_maps, centers_over_all_feature_maps = self._grid_anchors(grid_sizes) 28 | anchor_boxes = [Boxes(x) for x in anchors_over_all_feature_maps] 29 | 30 | return anchor_boxes, centers_over_all_feature_maps -------------------------------------------------------------------------------- /utils/coco_eval_fpn.py: -------------------------------------------------------------------------------- 1 | from detectron2.evaluation import COCOEvaluator 2 | from detectron2.structures 
import Boxes, BoxMode, pairwise_iou 3 | 4 | def _instances_to_coco_json(instances, img_id): 5 | """ 6 | Dump an "Instances" object to a COCO-format json that's used for evaluation. 7 | 8 | Args: 9 | instances (Instances): 10 | img_id (int): the image id 11 | 12 | Returns: 13 | list[dict]: list of json annotations in COCO format. 14 | """ 15 | num_instance = len(instances) 16 | if num_instance == 0: 17 | return [] 18 | 19 | boxes = instances.pred_boxes.tensor.numpy() 20 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 21 | boxes = boxes.tolist() 22 | scores = instances.scores.tolist() 23 | classes = instances.pred_classes.tolist() 24 | 25 | has_fpn_layer = instances.has("fpn_layers") 26 | if has_fpn_layer: 27 | fpn_layers = instances.fpn_layers.tolist() 28 | 29 | has_mask = instances.has("pred_masks") 30 | if has_mask: 31 | rles = [ 32 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 33 | for mask in instances.pred_masks 34 | ] 35 | for rle in rles: 36 | rle["counts"] = rle["counts"].decode("utf-8") 37 | 38 | has_keypoints = instances.has("pred_keypoints") 39 | if has_keypoints: 40 | keypoints = instances.pred_keypoints 41 | 42 | results = [] 43 | for k in range(num_instance): 44 | result = { 45 | "image_id": img_id, 46 | "category_id": classes[k], 47 | "bbox": boxes[k], 48 | "score": scores[k], 49 | } 50 | if has_fpn_layer: 51 | result["fpn_layer"] = fpn_layers[k] 52 | if has_mask: 53 | result["segmentation"] = rles[k] 54 | if has_keypoints: 55 | keypoints[k][:, :2] -= 0.5 56 | result["keypoints"] = keypoints[k].flatten().tolist() 57 | results.append(result) 58 | return results 59 | 60 | 61 | 62 | class COCOEvaluatorFPN(COCOEvaluator): 63 | 64 | def process(self, inputs, outputs): 65 | """ 66 | Args: 67 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 68 | It is a list of dict. Each dict corresponds to an image and 69 | contains keys like "height", "width", "file_name", "image_id". 70 | outputs: the outputs of a COCO model. It is a list of dicts with key 71 | "instances" that contains :class:`Instances`. 72 | """ 73 | for input, output in zip(inputs, outputs): 74 | prediction = {"image_id": input["image_id"]} 75 | 76 | # TODO this is ugly 77 | if "instances" in output: 78 | instances = output["instances"].to(self._cpu_device) 79 | prediction["instances"] = _instances_to_coco_json(instances, input["image_id"]) 80 | if "proposals" in output: 81 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 82 | self._predictions.append(prediction) -------------------------------------------------------------------------------- /utils/gradient_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warnings 3 | from typing import Any, Iterable, List, Tuple 4 | 5 | 6 | def detach_variable(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]: 7 | if isinstance(inputs, tuple): 8 | out = [] 9 | for inp in inputs: 10 | if not isinstance(inp, torch.Tensor): 11 | out.append(inp) 12 | continue 13 | 14 | x = inp.detach() 15 | x.requires_grad = inp.requires_grad 16 | out.append(x) 17 | return tuple(out) 18 | else: 19 | raise RuntimeError( 20 | "Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) 21 | 22 | 23 | def check_backward_validity(inputs: Iterable[Any]) -> None: 24 | if not any(inp.requires_grad for inp in inputs if isinstance(inp, torch.Tensor)): 25 | warnings.warn("None of the inputs have requires_grad=True. 
Gradients will be None") 26 | 27 | 28 | # We can't know if the run_fn will internally move some args to different devices, 29 | # which would require logic to preserve rng states for those devices as well. 30 | # We could paranoically stash and restore ALL the rng states for all visible devices, 31 | # but that seems very wasteful for most cases. Compromise: Stash the RNG state for 32 | # the device of all Tensor args. 33 | # 34 | # To consider: maybe get_device_states and set_device_states should reside in torch/random.py? 35 | def get_device_states(*args) -> Tuple[List[int], List[torch.Tensor]]: 36 | # This will not error out if "arg" is a CPU tensor or a non-tensor type because 37 | # the conditionals short-circuit. 38 | fwd_gpu_devices = list(set(arg.get_device() for arg in args 39 | if isinstance(arg, torch.Tensor) and arg.is_cuda)) 40 | 41 | fwd_gpu_states = [] 42 | for device in fwd_gpu_devices: 43 | with torch.cuda.device(device): 44 | fwd_gpu_states.append(torch.cuda.get_rng_state()) 45 | 46 | return fwd_gpu_devices, fwd_gpu_states 47 | 48 | 49 | def set_device_states(devices, states) -> None: 50 | for device, state in zip(devices, states): 51 | with torch.cuda.device(device): 52 | torch.cuda.set_rng_state(state) 53 | 54 | 55 | class CheckpointFunction(torch.autograd.Function): 56 | 57 | @staticmethod 58 | def forward(ctx, run_function, preserve_rng_state, *args): 59 | check_backward_validity(args) 60 | ctx.run_function = run_function 61 | ctx.preserve_rng_state = preserve_rng_state 62 | ctx.had_autocast_in_fwd = torch.is_autocast_enabled() 63 | if preserve_rng_state: 64 | ctx.fwd_cpu_state = torch.get_rng_state() 65 | # Don't eagerly initialize the cuda context by accident. 66 | # (If the user intends that the context is initialized later, within their 67 | # run_function, we SHOULD actually stash the cuda state here. Unfortunately, 68 | # we have no way to anticipate this will happen before we run the function.) 69 | ctx.had_cuda_in_fwd = False 70 | if torch.cuda._initialized: 71 | ctx.had_cuda_in_fwd = True 72 | ctx.fwd_gpu_devices, ctx.fwd_gpu_states = get_device_states(*args) 73 | ctx.save_for_backward(*args) 74 | with torch.no_grad(): 75 | outputs = run_function(*args) 76 | return outputs 77 | 78 | @staticmethod 79 | def backward(ctx, *args): 80 | if not torch.autograd._is_checkpoint_valid(): 81 | raise RuntimeError("Checkpointing is not compatible with .grad(), please use .backward() if possible") 82 | inputs = ctx.saved_tensors 83 | # Stash the surrounding rng state, and mimic the state that was 84 | # present at this time during forward. Restore the surrounding state 85 | # when we're done. 
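        # torch.random.fork_rng (below) keeps the restored RNG state local to
        # this block, so the surrounding training loop's RNG stream is not
        # perturbed by the recomputation.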
86 | rng_devices = [] 87 | if ctx.preserve_rng_state and ctx.had_cuda_in_fwd: 88 | rng_devices = ctx.fwd_gpu_devices 89 | with torch.random.fork_rng(devices=rng_devices, enabled=ctx.preserve_rng_state): 90 | if ctx.preserve_rng_state: 91 | torch.set_rng_state(ctx.fwd_cpu_state) 92 | if ctx.had_cuda_in_fwd: 93 | set_device_states(ctx.fwd_gpu_devices, ctx.fwd_gpu_states) 94 | detached_inputs = detach_variable(inputs) 95 | with torch.enable_grad(), torch.cuda.amp.autocast(ctx.had_autocast_in_fwd): 96 | outputs = ctx.run_function(*detached_inputs) 97 | 98 | if isinstance(outputs, torch.Tensor): 99 | outputs = (outputs,) 100 | torch.autograd.backward(outputs, args) 101 | grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp 102 | for inp in detached_inputs) 103 | return (None, None) + grads 104 | 105 | 106 | def checkpoint(function, *args, **kwargs): 107 | r"""Checkpoint a model or part of the model 108 | 109 | Checkpointing works by trading compute for memory. Rather than storing all 110 | intermediate activations of the entire computation graph for computing 111 | backward, the checkpointed part does **not** save intermediate activations, 112 | and instead recomputes them in backward pass. It can be applied on any part 113 | of a model. 114 | 115 | Specifically, in the forward pass, :attr:`function` will run in 116 | :func:`torch.no_grad` manner, i.e., not storing the intermediate 117 | activations. Instead, the forward pass saves the inputs tuple and the 118 | :attr:`function` parameter. In the backwards pass, the saved inputs and 119 | :attr:`function` is retrieved, and the forward pass is computed on 120 | :attr:`function` again, now tracking the intermediate activations, and then 121 | the gradients are calculated using these activation values. 122 | 123 | .. warning:: 124 | Checkpointing doesn't work with :func:`torch.autograd.grad`, but only 125 | with :func:`torch.autograd.backward`. 126 | 127 | .. warning:: 128 | If :attr:`function` invocation during backward does anything different 129 | than the one during forward, e.g., due to some global variable, the 130 | checkpointed version won't be equivalent, and unfortunately it can't be 131 | detected. 132 | 133 | .. warning:: 134 | If checkpointed segment contains tensors detached from the computational 135 | graph by `detach()` or `torch.no_grad()`, the backward pass will raise an 136 | error. This is because `checkpoint` makes all the outputs require 137 | gradients which causes issues when a tensor is defined to have no 138 | gradient in the model. To circumvent this, detach the tensors outside of 139 | the `checkpoint` function. 140 | 141 | .. warning: 142 | At least one of the inputs needs to have :code:`requires_grad=True` if 143 | grads are needed for model inputs, otherwise the checkpointed part of the 144 | model won't have gradients. 145 | 146 | Args: 147 | function: describes what to run in the forward pass of the model or 148 | part of the model. It should also know how to handle the inputs 149 | passed as the tuple. For example, in LSTM, if user passes 150 | ``(activation, hidden)``, :attr:`function` should correctly use the 151 | first input as ``activation`` and the second input as ``hidden`` 152 | preserve_rng_state(bool, optional, default=True): Omit stashing and restoring 153 | the RNG state during each checkpoint. 
154 | args: tuple containing inputs to the :attr:`function` 155 | 156 | Returns: 157 | Output of running :attr:`function` on :attr:`*args` 158 | """ 159 | # Hack to mix *args with **kwargs in a python 2.7-compliant way 160 | preserve = kwargs.pop('preserve_rng_state', True) 161 | if kwargs: 162 | raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs)) 163 | 164 | return CheckpointFunction.apply(function, preserve, *args) 165 | 166 | 167 | def checkpoint_sequential(functions, segments, input, **kwargs): 168 | r"""A helper function for checkpointing sequential models. 169 | 170 | Sequential models execute a list of modules/functions in order 171 | (sequentially). Therefore, we can divide such a model in various segments 172 | and checkpoint each segment. All segments except the last will run in 173 | :func:`torch.no_grad` manner, i.e., not storing the intermediate 174 | activations. The inputs of each checkpointed segment will be saved for 175 | re-running the segment in the backward pass. 176 | 177 | See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works. 178 | 179 | .. warning:: 180 | Checkpointing doesn't work with :func:`torch.autograd.grad`, but only 181 | with :func:`torch.autograd.backward`. 182 | 183 | .. warning: 184 | At least one of the inputs needs to have :code:`requires_grad=True` if 185 | grads are needed for model inputs, otherwise the checkpointed part of the 186 | model won't have gradients. 187 | 188 | .. warning: 189 | Since PyTorch 1.4, it allows only one Tensor as the input and 190 | intermediate outputs, just like :class:`torch.nn.Sequential`. 191 | 192 | Args: 193 | functions: A :class:`torch.nn.Sequential` or the list of modules or 194 | functions (comprising the model) to run sequentially. 195 | segments: Number of chunks to create in the model 196 | input: A Tensor that is input to :attr:`functions` 197 | preserve_rng_state(bool, optional, default=True): Omit stashing and restoring 198 | the RNG state during each checkpoint. 199 | 200 | Returns: 201 | Output of running :attr:`functions` sequentially on :attr:`*inputs` 202 | 203 | Example: 204 | >>> model = nn.Sequential(...) 
205 | >>> input_var = checkpoint_sequential(model, chunks, input_var) 206 | """ 207 | # Hack for keyword-only parameter in a python 2.7-compliant way 208 | preserve = kwargs.pop('preserve_rng_state', True) 209 | if kwargs: 210 | raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs)) 211 | 212 | def run_function(start, end, functions): 213 | def forward(input): 214 | for j in range(start, end + 1): 215 | input = functions[j](input) 216 | return input 217 | return forward 218 | 219 | if isinstance(functions, torch.nn.Sequential): 220 | functions = list(functions.children()) 221 | 222 | segment_size = len(functions) // segments 223 | # the last chunk has to be non-volatile 224 | end = -1 225 | for start in range(0, segment_size * (segments - 1), segment_size): 226 | end = start + segment_size - 1 227 | input = checkpoint(run_function(start, end, functions), input, 228 | preserve_rng_state=preserve) 229 | return run_function(end + 1, len(functions) - 1, functions)(input) 230 | -------------------------------------------------------------------------------- /utils/json_evaluator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import json 4 | import torch 5 | import logging 6 | import itertools 7 | import numpy as np 8 | 9 | from detectron2.evaluation.evaluator import DatasetEvaluator 10 | import detectron2.utils.comm as comm 11 | import itertools 12 | from collections import OrderedDict 13 | from detectron2.evaluation.coco_evaluation import instances_to_coco_json 14 | 15 | import numpy as np 16 | 17 | class JsonEvaluator(DatasetEvaluator): 18 | def __init__(self, out_json, distributed=True, class_add_1=True): 19 | self._out_json = out_json 20 | self.class_add_1 = class_add_1 21 | 22 | self._distributed = distributed 23 | self._cpu_device = torch.device("cpu") 24 | self._logger = logging.getLogger(__name__) 25 | self._predictions = [] 26 | 27 | self.reset() 28 | 29 | 30 | def reset(self): 31 | self._predictions = [] 32 | 33 | 34 | def process(self, inputs, outputs): 35 | for input, output in zip(inputs, outputs): 36 | img_name = os.path.split(input['file_name'])[-1].split('.')[0] 37 | if "instances" in output: 38 | prediction = {"img_name": img_name} 39 | instances = output["instances"].to(self._cpu_device) 40 | if self.class_add_1: 41 | instances.pred_classes += 1 42 | prediction["instances"] = instances_to_coco_json(instances, input['image_id']) 43 | self._predictions.append(prediction) 44 | 45 | def evaluate(self): 46 | if self._distributed: 47 | comm.synchronize() 48 | predictions = comm.gather(self._predictions, dst=0) 49 | predictions = list(itertools.chain(*predictions)) 50 | if not comm.is_main_process(): 51 | return {} 52 | else: 53 | predictions = self._predictions 54 | 55 | if len(predictions) == 0: 56 | return {} 57 | 58 | det_preds = [] 59 | for pred in predictions: 60 | det_preds = det_preds + pred['instances'] 61 | 62 | with open(self._out_json, "w") as f: 63 | f.write(json.dumps(det_preds)) 64 | f.flush() 65 | 66 | return {} 67 | 68 | 69 | -------------------------------------------------------------------------------- /utils/loop_matcher.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | from typing import List 3 | import torch 4 | 5 | # useful when there are huge number of gt boxes 6 | class LoopMatcher(object): 7 | def __init__( 8 | self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False 9 | ): 10 | thresholds = thresholds[:] 11 | assert thresholds[0] > 0 12 | thresholds.insert(0, -float("inf")) 13 | thresholds.append(float("inf")) 14 | assert all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])) 15 | assert all(l in [-1, 0, 1] for l in labels) 16 | assert len(labels) == len(thresholds) - 1 17 | 18 | self.low_quality_thrshold = 0.3 19 | self.thresholds = thresholds 20 | self.labels = labels 21 | self.allow_low_quality_matches = allow_low_quality_matches 22 | 23 | 24 | def _iou(self, boxes, box): 25 | iw = torch.clamp(boxes[:, 2], max=box[2]) - torch.clamp(boxes[:, 0], min=box[0]) 26 | ih = torch.clamp(boxes[:, 3], max=box[3]) - torch.clamp(boxes[:, 1], min=box[1]) 27 | 28 | inter = torch.clamp(iw, min=0) * torch.clamp(ih, min=0) 29 | 30 | areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 31 | area = (box[2] - box[0]) * (box[3] - box[1]) 32 | 33 | iou = inter / (areas + area - inter) 34 | return iou 35 | 36 | def __call__(self, gt_boxes, anchors): 37 | if len(gt_boxes) == 0: 38 | default_matches = torch.zeros((len(anchors)), dtype=torch.int64).to(anchors.tensor.device) 39 | default_match_labels = torch.zeros((len(anchors)), dtype=torch.int8).to(anchors.tensor.device) + self.labels[0] 40 | return default_matches, default_match_labels 41 | 42 | gt_boxes_tensor = gt_boxes.tensor 43 | anchors_tensor = anchors.tensor 44 | 45 | max_ious = torch.zeros((len(anchors))).to(anchors_tensor.device) 46 | matched_inds = torch.zeros((len(anchors)), dtype=torch.long).to(anchors_tensor.device) 47 | gt_ious = torch.zeros((len(gt_boxes))).to(anchors_tensor.device) 48 | 49 | for i in range(len(gt_boxes)): 50 | ious = self._iou(anchors_tensor, gt_boxes_tensor[i]) 51 | gt_ious[i] = ious.max() 52 | matched_inds = torch.where(ious > max_ious, torch.zeros(1, dtype=torch.long, device=matched_inds.device)+i, matched_inds) 53 | max_ious = torch.max(ious, max_ious) 54 | del(ious) 55 | 56 | matched_vals = max_ious 57 | matches = matched_inds 58 | 59 | match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8) 60 | 61 | for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]): 62 | low_high = (matched_vals >= low) & (matched_vals < high) 63 | match_labels[low_high] = l 64 | 65 | if self.allow_low_quality_matches: 66 | self.set_low_quality_matches_(match_labels, matched_vals, matches, gt_ious) 67 | 68 | return matches, match_labels 69 | 70 | def set_low_quality_matches_(self, match_labels, matched_vals, matches, gt_ious): 71 | for i in range(len(gt_ious)): 72 | match_labels[(matched_vals==gt_ious[i]) & (matches==i)] = 1 73 | 74 | -------------------------------------------------------------------------------- /utils/merged_sync_bn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import logging 3 | import torch 4 | import torch.distributed as dist 5 | from torch import nn 6 | from torch.autograd.function import Function 7 | from torch.nn import functional as F 8 | from torch.cuda.amp import autocast 9 | 10 | from detectron2.utils import comm, env 11 | from detectron2.layers.wrappers import BatchNorm2d 12 | 13 | class AllReduce(Function): 14 | @staticmethod 15 | def forward(ctx, input): 16 | input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())] 17 | # Use allgather instead of allreduce since I don't trust in-place operations .. 18 | dist.all_gather(input_list, input, async_op=False) 19 | inputs = torch.stack(input_list, dim=0) 20 | return torch.sum(inputs, dim=0) 21 | 22 | @staticmethod 23 | def backward(ctx, grad_output): 24 | dist.all_reduce(grad_output, async_op=False) 25 | return grad_output 26 | 27 | class MergedSyncBatchNorm(BatchNorm2d): 28 | """ 29 | In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient 30 | when the batch size on each worker is different. 31 | (e.g., when scale augmentation is used, or when it is applied to mask head). 32 | 33 | This is a slower but correct alternative to `nn.SyncBatchNorm`. 34 | 35 | Note: 36 | There isn't a single definition of Sync BatchNorm. 37 | 38 | When ``stats_mode==""``, this module computes overall statistics by using 39 | statistics of each worker with equal weight. The result is true statistics 40 | of all samples (as if they are all on one worker) only when all workers 41 | have the same (N, H, W). This mode does not support inputs with zero batch size. 42 | 43 | When ``stats_mode=="N"``, this module computes overall statistics by weighting 44 | the statistics of each worker by their ``N``. The result is true statistics 45 | of all samples (as if they are all on one worker) only when all workers 46 | have the same (H, W). It is slower than ``stats_mode==""``. 47 | 48 | Even though the result of this module may not be the true statistics of all samples, 49 | it may still be reasonable because it might be preferrable to assign equal weights 50 | to all workers, regardless of their (H, W) dimension, instead of putting larger weight 51 | on larger images. From preliminary experiments, little difference is found between such 52 | a simplified implementation and an accurate computation of overall mean & variance. 53 | """ 54 | 55 | def __init__(self, *args, stats_mode="", **kwargs): 56 | super().__init__(*args, **kwargs) 57 | assert stats_mode in ["", "N"] 58 | self._stats_mode = stats_mode 59 | self._batch_mean = None # for precise BN 60 | self._batch_meansqr = None # for precise BN 61 | 62 | def _eval_forward(self, inputs): 63 | scale = self.weight * torch.rsqrt(self.running_var + self.eps) 64 | bias = self.bias - self.running_mean * scale 65 | scale = scale.view(1, -1, 1, 1) 66 | bias = bias.view(1, -1, 1, 1) 67 | return [(x * scale + bias) for x in inputs] 68 | 69 | 70 | # @float_function 71 | def forward(self, inputs): 72 | with autocast(False): 73 | if comm.get_world_size() == 1 or not self.training: 74 | return self._eval_forward(inputs) 75 | 76 | B, C = inputs[0].shape[0], inputs[0].shape[1] 77 | 78 | mean = sum([torch.mean(input, dim=[0, 2, 3]) for input in inputs]) / len(inputs) 79 | meansqr = sum([torch.mean(input * input, dim=[0, 2, 3]) for input in inputs]) / len(inputs) 80 | 81 | if self._stats_mode == "": 82 | assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.' 
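                # Equal-weight sync: all-reduce the concatenated per-GPU
                # [mean, meansqr] vector and divide by the world size, giving
                # global per-channel statistics (exact only when every worker
                # sees the same (N, H, W)).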
83 | vec = torch.cat([mean, meansqr], dim=0) 84 | vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) 85 | mean, meansqr = torch.split(vec, C) 86 | momentum = self.momentum 87 | else: 88 | if B == 0: 89 | vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype) 90 | vec = vec + _input.sum() # make sure there is gradient w.r.t input 91 | else: 92 | vec = torch.cat( 93 | [mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0 94 | ) 95 | vec = AllReduce.apply(vec * B) 96 | 97 | total_batch = vec[-1].detach() 98 | momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0 99 | total_batch = torch.max(total_batch, torch.ones_like(total_batch)) # avoid div-by-zero 100 | mean, meansqr, _ = torch.split(vec / total_batch, C) 101 | 102 | var = meansqr - mean * mean 103 | invstd = torch.rsqrt(var + self.eps) 104 | scale = self.weight * invstd 105 | bias = self.bias - mean * scale 106 | scale = scale.reshape(1, -1, 1, 1) 107 | bias = bias.reshape(1, -1, 1, 1) 108 | 109 | self.running_mean += momentum * (mean.detach() - self.running_mean) 110 | self.running_var += momentum * (var.detach() - self.running_var) 111 | 112 | self._batch_mean = mean 113 | self._batch_meansqr = meansqr 114 | 115 | outputs = [(input * scale + bias) for input in inputs] 116 | return outputs 117 | -------------------------------------------------------------------------------- /utils/soft_nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from detectron2.structures import Boxes, RotatedBoxes, pairwise_iou, pairwise_iou_rotated 4 | 5 | 6 | def soft_nms(boxes, scores, method, gaussian_sigma, linear_threshold, prune_threshold): 7 | """ 8 | Performs soft non-maximum suppression algorithm on axis aligned boxes 9 | 10 | Args: 11 | boxes (Tensor[N, 5]): 12 | boxes where NMS will be performed. They 13 | are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format 14 | scores (Tensor[N]): 15 | scores for each one of the boxes 16 | method (str): 17 | one of ['gaussian', 'linear', 'hard'] 18 | see paper for details. users encouraged not to use "hard", as this is the 19 | same nms available elsewhere in detectron2 20 | gaussian_sigma (float): 21 | parameter for Gaussian penalty function 22 | linear_threshold (float): 23 | iou threshold for applying linear decay. Nt from the paper 24 | re-used as threshold for standard "hard" nms 25 | prune_threshold (float): 26 | boxes with scores below this threshold are pruned at each iteration. 27 | Dramatically reduces computation time. Authors use values in [10e-4, 10e-2] 28 | 29 | Returns: 30 | tuple(Tensor, Tensor): 31 | [0]: int64 tensor with the indices of the elements that have been kept 32 | by Soft NMS, sorted in decreasing order of scores 33 | [1]: float tensor with the re-scored scores of the elements that were kept 34 | """ 35 | return _soft_nms( 36 | Boxes, 37 | pairwise_iou, 38 | boxes, 39 | scores, 40 | method, 41 | gaussian_sigma, 42 | linear_threshold, 43 | prune_threshold, 44 | ) 45 | 46 | 47 | def soft_nms_rotated(boxes, scores, method, gaussian_sigma, linear_threshold, prune_threshold): 48 | """ 49 | Performs soft non-maximum suppression algorithm on rotated boxes 50 | 51 | Args: 52 | boxes (Tensor[N, 5]): 53 | boxes where NMS will be performed. 
They 54 | are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format 55 | scores (Tensor[N]): 56 | scores for each one of the boxes 57 | method (str): 58 | one of ['gaussian', 'linear', 'hard'] 59 | see paper for details. users encouraged not to use "hard", as this is the 60 | same nms available elsewhere in detectron2 61 | gaussian_sigma (float): 62 | parameter for Gaussian penalty function 63 | linear_threshold (float): 64 | iou threshold for applying linear decay. Nt from the paper 65 | re-used as threshold for standard "hard" nms 66 | prune_threshold (float): 67 | boxes with scores below this threshold are pruned at each iteration. 68 | Dramatically reduces computation time. Authors use values in [10e-4, 10e-2] 69 | 70 | Returns: 71 | tuple(Tensor, Tensor): 72 | [0]: int64 tensor with the indices of the elements that have been kept 73 | by Soft NMS, sorted in decreasing order of scores 74 | [1]: float tensor with the re-scored scores of the elements that were kept """ 75 | return _soft_nms( 76 | RotatedBoxes, 77 | pairwise_iou_rotated, 78 | boxes, 79 | scores, 80 | method, 81 | gaussian_sigma, 82 | linear_threshold, 83 | prune_threshold, 84 | ) 85 | 86 | 87 | def batched_soft_nms( 88 | boxes, scores, idxs, method, gaussian_sigma, linear_threshold, prune_threshold 89 | ): 90 | """ 91 | Performs soft non-maximum suppression in a batched fashion. 92 | 93 | Each index value correspond to a category, and NMS 94 | will not be applied between elements of different categories. 95 | 96 | Args: 97 | boxes (Tensor[N, 4]): 98 | boxes where NMS will be performed. They 99 | are expected to be in (x1, y1, x2, y2) format 100 | scores (Tensor[N]): 101 | scores for each one of the boxes 102 | idxs (Tensor[N]): 103 | indices of the categories for each one of the boxes. 104 | method (str): 105 | one of ['gaussian', 'linear', 'hard'] 106 | see paper for details. users encouraged not to use "hard", as this is the 107 | same nms available elsewhere in detectron2 108 | gaussian_sigma (float): 109 | parameter for Gaussian penalty function 110 | linear_threshold (float): 111 | iou threshold for applying linear decay. Nt from the paper 112 | re-used as threshold for standard "hard" nms 113 | prune_threshold (float): 114 | boxes with scores below this threshold are pruned at each iteration. 115 | Dramatically reduces computation time. Authors use values in [10e-4, 10e-2] 116 | Returns: 117 | tuple(Tensor, Tensor): 118 | [0]: int64 tensor with the indices of the elements that have been kept 119 | by Soft NMS, sorted in decreasing order of scores 120 | [1]: float tensor with the re-scored scores of the elements that were kept 121 | """ 122 | if boxes.numel() == 0: 123 | return ( 124 | torch.empty((0,), dtype=torch.int64, device=boxes.device), 125 | torch.empty((0,), dtype=torch.float32, device=scores.device), 126 | ) 127 | # strategy: in order to perform NMS independently per class. 128 | # we add an offset to all the boxes. 
The offset is dependent 129 | # only on the class idx, and is large enough so that boxes 130 | # from different classes do not overlap 131 | max_coordinate = boxes.max() 132 | offsets = idxs.to(boxes) * (max_coordinate + 1) 133 | boxes_for_nms = boxes + offsets[:, None] 134 | return soft_nms( 135 | boxes_for_nms, scores, method, gaussian_sigma, linear_threshold, prune_threshold 136 | ) 137 | 138 | 139 | class SoftNMSer(object): 140 | def __init__(self, method, gaussian_sigma, linear_threshold, prune_threshold): 141 | self.method = method 142 | self.gaussian_sigma = gaussian_sigma 143 | self.linear_threshold = linear_threshold 144 | self.prune_threshold = prune_threshold 145 | 146 | def __call__(self, boxes, scores, class_idxs): 147 | return batched_soft_nms(boxes, scores, class_idxs, self.method, self.gaussian_sigma, self.linear_threshold, self.prune_threshold) 148 | 149 | 150 | def batched_soft_nms_rotated( 151 | boxes, scores, idxs, method, gaussian_sigma, linear_threshold, prune_threshold 152 | ): 153 | """ 154 | Performs soft non-maximum suppression in a batched fashion on rotated bounding boxes. 155 | 156 | Each index value correspond to a category, and NMS 157 | will not be applied between elements of different categories. 158 | 159 | Args: 160 | boxes (Tensor[N, 5]): 161 | boxes where NMS will be performed. They 162 | are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format 163 | scores (Tensor[N]): 164 | scores for each one of the boxes 165 | idxs (Tensor[N]): 166 | indices of the categories for each one of the boxes. 167 | method (str): 168 | one of ['gaussian', 'linear', 'hard'] 169 | see paper for details. users encouraged not to use "hard", as this is the 170 | same nms available elsewhere in detectron2 171 | gaussian_sigma (float): 172 | parameter for Gaussian penalty function 173 | linear_threshold (float): 174 | iou threshold for applying linear decay. Nt from the paper 175 | re-used as threshold for standard "hard" nms 176 | prune_threshold (float): 177 | boxes with scores below this threshold are pruned at each iteration. 178 | Dramatically reduces computation time. Authors use values in [10e-4, 10e-2] 179 | Returns: 180 | tuple(Tensor, Tensor): 181 | [0]: int64 tensor with the indices of the elements that have been kept 182 | by Soft NMS, sorted in decreasing order of scores 183 | [1]: float tensor with the re-scored scores of the elements that were kept 184 | """ 185 | if boxes.numel() == 0: 186 | return ( 187 | torch.empty((0,), dtype=torch.int64, device=boxes.device), 188 | torch.empty((0,), dtype=torch.float32, device=scores.device), 189 | ) 190 | # strategy: in order to perform NMS independently per class. 191 | # we add an offset to all the boxes. The offset is dependent 192 | # only on the class idx, and is large enough so that boxes 193 | # from different classes do not overlap 194 | max_coordinate = boxes[:, :2].max() + torch.norm(boxes[:, 2:4], 2, dim=1).max() 195 | offsets = idxs.to(boxes) * (max_coordinate + 1) 196 | boxes_for_nms = boxes.clone() 197 | boxes_for_nms[:, :2] += offsets[:, None] 198 | return soft_nms_rotated( 199 | boxes_for_nms, scores, method, gaussian_sigma, linear_threshold, prune_threshold 200 | ) 201 | 202 | 203 | def _soft_nms( 204 | box_class, 205 | pairwise_iou_func, 206 | boxes, 207 | scores, 208 | method, 209 | gaussian_sigma, 210 | linear_threshold, 211 | prune_threshold, 212 | ): 213 | """ 214 | Soft non-max suppression algorithm. 
215 | 216 | Implementation of [Soft-NMS -- Improving Object Detection With One Line of Codec] 217 | (https://arxiv.org/abs/1704.04503) 218 | 219 | Args: 220 | box_class (cls): one of Box, RotatedBoxes 221 | pairwise_iou_func (func): one of pairwise_iou, pairwise_iou_rotated 222 | boxes (Tensor[N, ?]): 223 | boxes where NMS will be performed 224 | if Boxes, in (x1, y1, x2, y2) format 225 | if RotatedBoxes, in (x_ctr, y_ctr, width, height, angle_degrees) format 226 | scores (Tensor[N]): 227 | scores for each one of the boxes 228 | method (str): 229 | one of ['gaussian', 'linear', 'hard'] 230 | see paper for details. users encouraged not to use "hard", as this is the 231 | same nms available elsewhere in detectron2 232 | gaussian_sigma (float): 233 | parameter for Gaussian penalty function 234 | linear_threshold (float): 235 | iou threshold for applying linear decay. Nt from the paper 236 | re-used as threshold for standard "hard" nms 237 | prune_threshold (float): 238 | boxes with scores below this threshold are pruned at each iteration. 239 | Dramatically reduces computation time. Authors use values in [10e-4, 10e-2] 240 | 241 | Returns: 242 | tuple(Tensor, Tensor): 243 | [0]: int64 tensor with the indices of the elements that have been kept 244 | by Soft NMS, sorted in decreasing order of scores 245 | [1]: float tensor with the re-scored scores of the elements that were kept 246 | """ 247 | boxes = boxes.clone() 248 | scores = scores.clone() 249 | idxs = torch.arange(scores.size()[0]) 250 | 251 | idxs_out = [] 252 | scores_out = [] 253 | 254 | while scores.numel() > 0: 255 | top_idx = torch.argmax(scores) 256 | idxs_out.append(idxs[top_idx].item()) 257 | scores_out.append(scores[top_idx].item()) 258 | 259 | top_box = boxes[top_idx] 260 | ious = pairwise_iou_func(box_class(top_box.unsqueeze(0)), box_class(boxes))[0] 261 | 262 | if method == "linear": 263 | decay = torch.ones_like(ious) 264 | decay_mask = ious > linear_threshold 265 | decay[decay_mask] = 1 - ious[decay_mask] 266 | elif method == "gaussian": 267 | decay = torch.exp(-torch.pow(ious, 2) / gaussian_sigma) 268 | elif method == "hard": # standard NMS 269 | decay = (ious < linear_threshold).float() 270 | else: 271 | raise NotImplementedError("{} soft nms method not implemented.".format(method)) 272 | 273 | scores *= decay 274 | keep = scores > prune_threshold 275 | keep[top_idx] = False 276 | 277 | boxes = boxes[keep] 278 | scores = scores[keep] 279 | idxs = idxs[keep] 280 | 281 | return torch.tensor(idxs_out).to(boxes.device), torch.tensor(scores_out).to(scores.device) -------------------------------------------------------------------------------- /utils/time_evaluator.py: -------------------------------------------------------------------------------- 1 | import time 2 | from detectron2.evaluation.evaluator import DatasetEvaluator 3 | import detectron2.utils.comm as comm 4 | import itertools 5 | from collections import OrderedDict 6 | 7 | import numpy as np 8 | 9 | class GPUTimeEvaluator(DatasetEvaluator): 10 | def __init__(self, distributed, unit, out_file=None): 11 | self.distributed = distributed 12 | self.all_time = [] 13 | self.unit = unit 14 | self.out_file = out_file 15 | if unit not in {'minisecond', 'second'}: 16 | raise NotImplementedError('Unsupported time unit %s'%unit) 17 | self.reset() 18 | 19 | def reset(self): 20 | self.all_time = [] 21 | 22 | def process(self, inputs, outputs): 23 | for output in outputs: 24 | if 'time' in output.keys(): 25 | self.all_time.append(output['time']) 26 | return 27 | 28 | def 
evaluate(self): 29 | if self.distributed: 30 | comm.synchronize() 31 | all_time = comm.gather(self.all_time, dst=0) 32 | all_time = list(itertools.chain(*all_time)) 33 | 34 | if not comm.is_main_process(): 35 | return {} 36 | else: 37 | all_time = self.all_time 38 | 39 | if len(all_time) == 0: 40 | return {'GPU_Speed': 0} 41 | 42 | all_time = np.array(all_time) 43 | 44 | speeds = 1. / all_time 45 | if self.unit == 'minisecond': 46 | speeds *= 1000 47 | 48 | mean_speed = speeds.mean() 49 | std_speed = speeds.std() 50 | max_speed = speeds.max() 51 | min_speed = speeds.min() 52 | mid_speed = np.median(speeds) 53 | 54 | if self.out_file is not None: 55 | f = open(self.out_file, 'a') 56 | curr_time = time.strftime('%Y/%m/%d,%H:%M:%S', time.localtime()) 57 | f.write('%s\t%.2f\n'%(curr_time, mean_speed)) 58 | f.close() 59 | 60 | ret_dict = {'Mean_FPS': mean_speed, 'Std_FPS': std_speed, 'Max_FPS': max_speed, 'Min_FPS': min_speed, 'Mid_FPS': mid_speed} 61 | 62 | return {'GPU_Speed': ret_dict} -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import cv2 4 | 5 | from detectron2.structures import Boxes 6 | 7 | 8 | def get_box_scales(boxes: Boxes): 9 | return torch.sqrt((boxes.tensor[:, 2] - boxes.tensor[:, 0]) * (boxes.tensor[:, 3] - boxes.tensor[:, 1])) 10 | 11 | def get_anchor_center_min_dis(box_centers: torch.Tensor, anchor_centers: torch.Tensor): 12 | """ 13 | Args: 14 | box_centers: [N, 2] 15 | anchor_centers: [M, 2] 16 | Returns: 17 | 18 | """ 19 | N, _ = box_centers.size() 20 | M, _ = anchor_centers.size() 21 | if N == 0: 22 | return torch.ones_like(anchor_centers)[:, 0] * 99999, (torch.zeros_like(anchor_centers)[:, 0]).long() 23 | acenters = anchor_centers.view(-1, 1, 2) 24 | acenters = acenters.repeat(1, N, 1) 25 | bcenters = box_centers.view(1, -1, 2) 26 | bcenters = bcenters.repeat(M, 1, 1) 27 | 28 | dis = torch.sqrt(torch.sum((acenters - bcenters)**2, dim=2)) 29 | 30 | mindis, minind = torch.min(input=dis, dim=1) 31 | 32 | return mindis, minind 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /utils/val_mapper_with_ann.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import copy 5 | import torch 6 | from fvcore.common.file_io import PathManager 7 | 8 | from detectron2.data import MetadataCatalog 9 | from detectron2.data import detection_utils as utils 10 | from detectron2.data import transforms as T 11 | 12 | 13 | 14 | class ValMapper(object): 15 | """ 16 | COCO validation mapper, with annotations 17 | """ 18 | 19 | def __init__(self, cfg): 20 | self.is_train = False 21 | 22 | self.tfm_gens = utils.build_transform_gen(cfg, self.is_train) 23 | 24 | self.img_format = cfg.INPUT.FORMAT 25 | assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet" 26 | 27 | 28 | 29 | def __call__(self, dataset_dict): 30 | """ 31 | Args: 32 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
33 | 34 | Returns: 35 | dict: a format that builtin models in detectron2 accept 36 | """ 37 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 38 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 39 | utils.check_image_size(dataset_dict, image) 40 | 41 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 42 | image_shape = image.shape[:2] # h, w 43 | dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32")) 44 | 45 | for anno in dataset_dict["annotations"]: 46 | anno.pop("segmentation", None) 47 | anno.pop("keypoints", None) 48 | 49 | annos = [ 50 | utils.transform_instance_annotations( 51 | obj, transforms, image_shape, keypoint_hflip_indices=None 52 | ) 53 | for obj in dataset_dict.pop("annotations") 54 | if obj.get("iscrowd", 0) == 0 55 | ] 56 | 57 | instances = utils.annotations_to_instances(annos, image_shape) 58 | 59 | dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()] 60 | return dataset_dict 61 | 62 | -------------------------------------------------------------------------------- /visdrone/data_prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | sys.path.append(os.path.abspath(Path(__file__).parent.parent)) 5 | 6 | import shutil 7 | import cv2 8 | import json 9 | from visdrone import utils 10 | from tqdm import tqdm 11 | 12 | import argparse 13 | 14 | 15 | def get_save_path(img_path, index): 16 | name = img_path.split('.')[0] 17 | return name + '_' + str(index) + '.jpg' 18 | 19 | def crop_and_save_image(img_root, img_path, new_img_root): 20 | img = cv2.imread(os.path.join(img_root,img_path)) 21 | h, w, c = img.shape 22 | 23 | _y = h // 2 24 | _x = w // 2 25 | 26 | img0 = img[:_y, :_x, :] 27 | img1 = img[:_y, _x:, :] 28 | img2 = img[_y:, :_x, :] 29 | img3 = img[_y:, _x:, :] 30 | 31 | cv2.imwrite(os.path.join(new_img_root, get_save_path(img_path, 0)), img0) 32 | cv2.imwrite(os.path.join(new_img_root, get_save_path(img_path, 1)), img1) 33 | cv2.imwrite(os.path.join(new_img_root, get_save_path(img_path, 2)), img2) 34 | cv2.imwrite(os.path.join(new_img_root, get_save_path(img_path, 3)), img3) 35 | 36 | return h, w, _y, _x 37 | 38 | 39 | def copy_image(img_root, img_path, new_img_root): 40 | img = cv2.imread(os.path.join(img_root,img_path)) 41 | h, w, c = img.shape 42 | cv2.imwrite(os.path.join(new_img_root, img_path), img) 43 | return h, w 44 | 45 | 46 | def get_new_label(label, img_path, cy, cx, id, img_id_base): 47 | if label['class'] == 0 or label['ignore']: 48 | return None 49 | 50 | x, y, w, h = label['bbox'] 51 | 52 | if x < cx and y < cy: 53 | nx = x 54 | ny = y 55 | nw = min(x+w, cx) - x 56 | nh = min(y+h, cy) - y 57 | img_id = img_id_base 58 | elif x < cx and y >= cy: 59 | nx = x 60 | ny = y - cy 61 | nw = min(x+w, cx) - x 62 | nh = h 63 | img_id = img_id_base + 2 64 | elif x >= cx and y < cy: 65 | nx = x - cx 66 | ny = y 67 | nw = w 68 | nh = min(y+h, cy) - y 69 | img_id = img_id_base + 1 70 | else: 71 | nx = x - cx 72 | ny = y - cy 73 | nw = w 74 | nh = h 75 | img_id = img_id_base + 3 76 | 77 | new_label = {'category_id': label['class'], 'id': id, 'iscrowd':0, 'image_id':img_id, 'area':nw*nh, 'segmentation':[], 'bbox':[nx,ny,nw,nh]} 78 | return new_label 79 | 80 | 81 | def label_to_coco(label, id, img_id): 82 | x, y, w, h = label['bbox'] 83 | new_label = {'category_id': label['class'], 'id': id, 'iscrowd':0, 'image_id':img_id, 'area':w*h, 'segmentation':[], 
'bbox':[x,y,w,h]} 84 | return new_label 85 | 86 | 87 | def make_json(images, annotations, new_label_json): 88 | ann_dict = {} 89 | ann_dict['categories'] = [ 90 | {'supercategory': 'things', 'id': 1, 'name': 'pedestrian'}, 91 | {'supercategory': 'things', 'id': 2, 'name': 'people'}, 92 | {'supercategory': 'things', 'id': 3, 'name': 'bicycle'}, 93 | {'supercategory': 'things', 'id': 4, 'name': 'car'}, 94 | {'supercategory': 'things', 'id': 5, 'name': 'van'}, 95 | {'supercategory': 'things', 'id': 6, 'name': 'truck'}, 96 | {'supercategory': 'things', 'id': 7, 'name': 'tricycle'}, 97 | {'supercategory': 'things', 'id': 8, 'name': 'awning-tricycle'}, 98 | {'supercategory': 'things', 'id': 9, 'name': 'bus'}, 99 | {'supercategory': 'things', 'id': 10, 'name': 'motor'} 100 | ] 101 | ann_dict['images'] = images 102 | ann_dict['annotations'] = annotations 103 | with open(new_label_json, 'w') as outfile: 104 | json.dump(ann_dict, outfile) 105 | 106 | 107 | def make_new_train_set(img_root, label_root, new_img_root, new_label_json): 108 | all_labels = utils.read_all_labels(label_root) 109 | 110 | annotations = [] 111 | images = [] 112 | ann_id = 0 113 | img_id = 0 114 | for filename, labels in tqdm(all_labels.items()): 115 | img_path = filename.replace('txt', 'jpg') 116 | h, w, cy, cx = crop_and_save_image(img_root, img_path, new_img_root) 117 | 118 | images.append({'file_name': get_save_path(img_path, 0), 'height': cy, 'width': cx, 'id': img_id}) 119 | images.append({'file_name': get_save_path(img_path, 1), 'height': cy, 'width': w-cx, 'id': img_id+1}) 120 | images.append({'file_name': get_save_path(img_path, 2), 'height': h-cy, 'width': cx, 'id':img_id+2}) 121 | images.append({'file_name': get_save_path(img_path, 3), 'height': h-cy, 'width': w-cx, 'id':img_id+3}) 122 | 123 | for label in labels: 124 | new_label = get_new_label(label, img_path, cy, cx, ann_id, img_id) 125 | if new_label != None: 126 | ann_id += 1 127 | annotations.append(new_label) 128 | img_id += 4 129 | make_json(images, annotations, new_label_json) 130 | 131 | 132 | def make_new_test_set(img_root, label_root, new_img_root, new_label_json): 133 | all_labels = utils.read_all_labels(label_root) 134 | annotations = [] 135 | images = [] 136 | ann_id = 0 137 | img_id = 0 138 | 139 | for filename, labels in tqdm(all_labels.items()): 140 | img_path = filename.replace('txt', 'jpg') 141 | h, w = copy_image(img_root, img_path, new_img_root) 142 | images.append({'file_name': img_path, 'height': h, 'width': w, 'id': img_id}) 143 | for label in labels: 144 | coco_label = label_to_coco(label, ann_id, img_id) 145 | if coco_label != None: 146 | ann_id += 1 147 | annotations.append(coco_label) 148 | img_id += 1 149 | 150 | make_json(images, annotations, new_label_json) 151 | 152 | 153 | 154 | if __name__ == '__main__': 155 | 156 | parser = argparse.ArgumentParser(description='Data Prepare Arguments') 157 | parser.add_argument('--visdrone-root', required=True, type=str, help='VisDrone dataset root') 158 | args = parser.parse_args() 159 | 160 | if not os.path.isdir(os.path.join(args.visdrone_root, 'coco_format')): 161 | os.mkdir(os.path.join(args.visdrone_root, 'coco_format')) 162 | os.mkdir(os.path.join(args.visdrone_root, 'coco_format/train_images')) 163 | os.mkdir(os.path.join(args.visdrone_root, 'coco_format/val_images')) 164 | os.mkdir(os.path.join(args.visdrone_root, 'coco_format/annotations')) 165 | 166 | 167 | ''' 168 | Training 169 | ''' 170 | train_img_root = os.path.join(args.visdrone_root, 'VisDrone2019-DET-train/images') 171 | 
train_label_root = os.path.join(args.visdrone_root, 'VisDrone2019-DET-train/annotations') 172 | train_new_img_root = os.path.join(args.visdrone_root, 'coco_format/train_images') 173 | train_new_label_json = os.path.join(args.visdrone_root, 'coco_format/annotations/train_label.json') 174 | make_new_train_set(train_img_root, train_label_root, train_new_img_root, train_new_label_json) 175 | 176 | ''' 177 | Validation 178 | ''' 179 | val_img_root = os.path.join(args.visdrone_root, 'VisDrone2019-DET-val/images') 180 | val_label_root = os.path.join(args.visdrone_root, 'VisDrone2019-DET-val/annotations') 181 | val_new_img_root = os.path.join(args.visdrone_root, 'coco_format/val_images') 182 | val_new_label_json = os.path.join(args.visdrone_root, 'coco_format/annotations/val_label.json') 183 | make_new_test_set(val_img_root, val_label_root, val_new_img_root, val_new_label_json) 184 | 185 | ''' 186 | Test set, not needed here. You can convert by yourself in the same way as validation set if you want to. 187 | ''' 188 | # img_root = '/path/to/test/images' 189 | # label_root = '/path/to/test/annotations' 190 | # new_img_root = '/path/to/test/images' 191 | # new_label_json = '/path/to/test/label.json' 192 | # make_new_test_set(img_root, label_root, new_img_root, new_label_json) 193 | 194 | 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /visdrone/dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import bisect 4 | import copy 5 | import itertools 6 | import logging 7 | import numpy as np 8 | import operator 9 | import pickle 10 | import torch.utils.data 11 | from fvcore.common.file_io import PathManager 12 | from tabulate import tabulate 13 | from termcolor import colored 14 | 15 | from detectron2.structures import BoxMode 16 | from detectron2.utils.comm import get_world_size 17 | from detectron2.utils.env import seed_all_rng 18 | from detectron2.utils.logger import log_first_n 19 | 20 | from detectron2.structures.boxes import BoxMode 21 | from detectron2.data import samplers 22 | from detectron2.data.catalog import DatasetCatalog, MetadataCatalog 23 | from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, MapDataset 24 | from detectron2.data.dataset_mapper import DatasetMapper 25 | from detectron2.data.detection_utils import check_metadata_consistency 26 | 27 | 28 | from visdrone.mapper import Mapper 29 | 30 | 31 | def get_train_data_dicts(json_file, img_root, filter_empty=False): 32 | data = json.load(open(json_file)) 33 | 34 | images = {x['id']: {'file': x['file_name'], 'height':x['height'], 'width':x['width']} for x in data['images']} 35 | 36 | annotations = {} 37 | for ann in data['annotations']: 38 | img_id = ann['image_id'] 39 | if img_id not in annotations.keys(): 40 | annotations[img_id] = [] 41 | annotations[img_id].append({'bbox': ann['bbox'], 'category_id': ann['category_id']-1, 'iscrowd': ann['iscrowd'], 'area': ann['area']}) 42 | 43 | for img_id in images.keys(): 44 | if img_id not in annotations.keys(): 45 | annotations[img_id] = [] 46 | 47 | data_dicts = [] 48 | for img_id in images.keys(): 49 | if filter_empty and len(annotations[img_id]) == 0: 50 | continue 51 | data_dict = {} 52 | data_dict['file_name'] = str(os.path.join(img_root, images[img_id]['file'])) 53 | data_dict['height'] = images[img_id]['height'] 54 | data_dict['width'] = images[img_id]['width'] 55 | data_dict['image_id'] = img_id 56 | data_dict['annotations'] = [] 
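        # category_id was already shifted to 0-based above (the VisDrone
        # COCO-format json produced by visdrone/data_prepare.py uses 1-based
        # ids); boxes stay in XYWH_ABS.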
57 | for ann in annotations[img_id]: 58 | data_dict['annotations'].append({'bbox': ann['bbox'], 'iscrowd': ann['iscrowd'], 'category_id': ann['category_id'], 'bbox_mode': BoxMode.XYWH_ABS}) 59 | data_dicts.append(data_dict) 60 | return data_dicts 61 | 62 | 63 | def get_test_data_dicts(json_file, img_root): 64 | data = json.load(open(json_file)) 65 | images = {x['id']: {'file': x['file_name'], 'height':x['height'], 'width':x['width']} for x in data['images']} 66 | 67 | data_dicts = [] 68 | for img_id in images.keys(): 69 | data_dict = {} 70 | data_dict['file_name'] = str(os.path.join(img_root, images[img_id]['file'])) 71 | data_dict['height'] = images[img_id]['height'] 72 | data_dict['width'] = images[img_id]['width'] 73 | data_dict['image_id'] = img_id 74 | data_dict['annotations'] = [] 75 | data_dicts.append(data_dict) 76 | return data_dicts 77 | 78 | 79 | def build_train_loader(cfg): 80 | num_workers = get_world_size() 81 | images_per_batch = cfg.SOLVER.IMS_PER_BATCH 82 | 83 | assert ( 84 | images_per_batch % num_workers == 0 85 | ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( 86 | images_per_batch, num_workers 87 | ) 88 | assert ( 89 | images_per_batch >= num_workers 90 | ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( 91 | images_per_batch, num_workers 92 | ) 93 | images_per_worker = images_per_batch // num_workers 94 | 95 | dataset_dicts = get_train_data_dicts(cfg.VISDRONE.TRAIN_JSON, cfg.VISDRONE.TRING_IMG_ROOT) 96 | dataset = DatasetFromList(dataset_dicts, copy=False) 97 | mapper = Mapper(cfg, True) 98 | dataset = MapDataset(dataset, mapper) 99 | 100 | sampler_name = cfg.DATALOADER.SAMPLER_TRAIN 101 | logger = logging.getLogger(__name__) 102 | logger.info("Using training sampler {}".format(sampler_name)) 103 | 104 | if sampler_name == "TrainingSampler": 105 | sampler = samplers.TrainingSampler(len(dataset)) 106 | elif sampler_name == "RepeatFactorTrainingSampler": 107 | sampler = samplers.RepeatFactorTrainingSampler( 108 | dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD 109 | ) 110 | else: 111 | raise ValueError("Unknown training sampler: {}".format(sampler_name)) 112 | 113 | batch_sampler = torch.utils.data.sampler.BatchSampler( 114 | sampler, images_per_worker, drop_last=True 115 | ) 116 | # drop_last so the batch always have the same size 117 | data_loader = torch.utils.data.DataLoader( 118 | dataset, 119 | num_workers=cfg.DATALOADER.NUM_WORKERS, 120 | batch_sampler=batch_sampler, 121 | collate_fn=trivial_batch_collator, 122 | worker_init_fn=worker_init_reset_seed, 123 | ) 124 | return data_loader 125 | 126 | 127 | def build_test_loader(cfg): 128 | 129 | dataset_dicts = get_test_data_dicts(cfg.VISDRONE.TEST_JSON, cfg.VISDRONE.TEST_IMG_ROOT) 130 | 131 | dataset = DatasetFromList(dataset_dicts) 132 | mapper = Mapper(cfg, False) 133 | dataset = MapDataset(dataset, mapper) 134 | 135 | sampler = samplers.InferenceSampler(len(dataset)) 136 | batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) 137 | 138 | data_loader = torch.utils.data.DataLoader( 139 | dataset, 140 | num_workers=cfg.DATALOADER.NUM_WORKERS, 141 | batch_sampler=batch_sampler, 142 | collate_fn=trivial_batch_collator, 143 | ) 144 | return data_loader 145 | 146 | 147 | def worker_init_reset_seed(worker_id): 148 | seed_all_rng(np.random.randint(2 ** 31) + worker_id) 149 | 150 | 151 | def trivial_batch_collator(batch): 152 | return batch 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 
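
The loaders above are driven entirely by the COCO-style JSON and image root produced by `visdrone/data_prepare.py` (exposed through the `cfg.VISDRONE.*` keys). Below is a minimal sketch of inspecting the converted training set directly; the paths assume the default `data/visdrone` layout and are placeholders otherwise:

```python
# Sketch: load the converted VisDrone training dicts and peek at one record.
# Assumes visdrone/data_prepare.py has already written the coco_format/ folder.
from visdrone.dataloader import get_train_data_dicts

dicts = get_train_data_dicts(
    'data/visdrone/coco_format/annotations/train_label.json',  # placeholder: cfg.VISDRONE.TRAIN_JSON
    'data/visdrone/coco_format/train_images',                  # placeholder: training image root
    filter_empty=True,                                          # skip images without annotations
)
print(len(dicts))
sample = dicts[0]
print(sample['file_name'], sample['height'], sample['width'])
print(sample['annotations'][0])  # bbox in XYWH_ABS, category_id already shifted to start at 0
```

Each record follows Detectron2's standard dataset-dict format, so it can also be registered with `DatasetCatalog` or fed to Detectron2's visualization utilities for a quick sanity check.
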
| -------------------------------------------------------------------------------- /visdrone/json_to_txt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tqdm 3 | import json 4 | 5 | 6 | import argparse 7 | 8 | 9 | class Json2Txt(object): 10 | 11 | def __init__(self, gt_json, det_json, out_dir): 12 | gt_data = json.load(open(gt_json)) 13 | self.images = {x['id']: {'file': x['file_name'], 'height':x['height'], 'width':x['width']} for x in gt_data['images']} 14 | 15 | det_data = json.load(open(det_json)) 16 | 17 | self.results = {} 18 | for result in det_data: 19 | if result['image_id'] not in self.results.keys(): 20 | self.results[result['image_id']] = [] 21 | self.results[result['image_id']].append({'box': result['bbox'], 'category': result['category_id'], 'score': result['score']}) 22 | 23 | self.out_dir = out_dir 24 | 25 | def to_txt(self): 26 | for img_id in tqdm.tqdm(self.images.keys()): 27 | file_name = self.images[img_id]['file'].replace('jpg', 'txt') 28 | with open(os.path.join(self.out_dir, file_name), 'w') as fw: 29 | for pred in self.results[img_id]: 30 | row = '%.2f,%.2f,%.2f,%.2f,%.8f,%d,-1,-1'%(pred['box'][0],pred['box'][1],pred['box'][2],pred['box'][3],pred['score'],pred['category']) 31 | fw.write(row+'\n') 32 | 33 | if __name__ == '__main__': 34 | 35 | parser = argparse.ArgumentParser(description='Arguments') 36 | parser.add_argument('--out', required=True, type=str, help='output txt dir') 37 | parser.add_argument('--gt-json', required=False, type=str, default='visdrone_data/annotations/val_label', help='Grond Truth Info JSON') 38 | parser.add_argument('--det-json', required=True, type=str, help='COCO style result JSON') 39 | args = parser.parse_args() 40 | 41 | gt_json = args.gt_json 42 | det_json = args.det_json 43 | outdir = args.out 44 | 45 | if not os.path.isdir(outdir): 46 | os.mkdir(outdir) 47 | 48 | print('Json to txt:', outdir) 49 | tool = Json2Txt(gt_json, det_json, outdir) 50 | tool.to_txt() 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /visdrone/mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import copy 3 | import logging 4 | import numpy as np 5 | import torch 6 | from fvcore.common.file_io import PathManager 7 | from PIL import Image 8 | 9 | from detectron2.data import detection_utils as utils 10 | from detectron2.data import transforms as T 11 | 12 | """ 13 | This file contains the default mapping that's applied to "dataset dicts". 14 | """ 15 | 16 | __all__ = ["DatasetMapper"] 17 | 18 | 19 | class Mapper: 20 | """ 21 | A callable which takes a dataset dict in Detectron2 Dataset format, 22 | and map it into a format used by the model. 23 | 24 | This is the default callable to be used to map your dataset dict into training data. 25 | You may need to follow it to implement your own one for customized logic, 26 | such as a different way to read or transform images. 27 | See :doc:`/tutorials/data_loading` for details. 28 | 29 | The callable currently does the following: 30 | 31 | 1. Read the image from "file_name" 32 | 2. Applies cropping/geometric transforms to the image and annotations 33 | 3. 
Prepare data and annotations to Tensor and :class:`Instances` 34 | """ 35 | 36 | def __init__(self, cfg, is_train=True): 37 | 38 | self.tfm_gens = build_transform_gen(cfg, is_train) 39 | # fmt: off 40 | self.img_format = cfg.INPUT.FORMAT 41 | self.mask_on = False 42 | self.mask_format = cfg.INPUT.MASK_FORMAT 43 | self.keypoint_on = False 44 | self.load_proposals = False 45 | self.keypoint_hflip_indices = None 46 | # fmt: on 47 | 48 | self.is_train = is_train 49 | 50 | def __call__(self, dataset_dict): 51 | 52 | dataset_dict = copy.deepcopy(dataset_dict) 53 | image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 54 | utils.check_image_size(dataset_dict, image) 55 | 56 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 57 | image_shape = image.shape[:2] # h, w 58 | 59 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 60 | 61 | 62 | if not self.is_train: 63 | dataset_dict.pop("annotations", None) 64 | return dataset_dict 65 | 66 | if "annotations" in dataset_dict: 67 | # USER: Modify this if you want to keep them for some reason. 68 | for anno in dataset_dict["annotations"]: 69 | anno.pop("segmentation", None) 70 | anno.pop("keypoints", None) 71 | 72 | # USER: Implement additional transformations if you have other types of data 73 | annos = [ 74 | utils.transform_instance_annotations( 75 | obj, transforms, image_shape 76 | ) 77 | for obj in dataset_dict.pop("annotations") 78 | if obj.get("iscrowd", 0) == 0 79 | ] 80 | instances = utils.annotations_to_instances( 81 | annos, image_shape, mask_format=self.mask_format 82 | ) 83 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 84 | return dataset_dict 85 | 86 | 87 | def build_transform_gen(cfg, is_train): 88 | if is_train: 89 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 90 | else: 91 | sample_style = 'choice' 92 | 93 | logger = logging.getLogger(__name__) 94 | tfm_gens = [] 95 | if is_train: 96 | tfm_gens.append(T.RandomFlip(horizontal=True, vertical=False)) 97 | tfm_gens.append(T.ResizeShortestEdge(short_edge_length=cfg.VISDRONE.SHORT_LENGTH, max_size=cfg.VISDRONE.MAX_LENGTH, sample_style=sample_style)) 98 | else: 99 | tfm_gens.append(T.ResizeShortestEdge(short_edge_length=[cfg.VISDRONE.TEST_LENGTH], max_size=cfg.VISDRONE.TEST_LENGTH, sample_style=sample_style)) 100 | 101 | return tfm_gens 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /visdrone/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | 5 | 6 | 7 | def read_label_txt(txt_file): 8 | f = open(txt_file, 'r') 9 | lines = f.readlines() 10 | 11 | labels = [] 12 | for line in lines: 13 | line = line.strip().split(',') 14 | 15 | x, y, w, h, not_ignore, cate, trun, occ = line[:8] 16 | 17 | labels.append( 18 | {'bbox': (int(x),int(y),int(w),int(h)), 19 | 'ignore': 0 if int(not_ignore) else 1, 20 | 'class': int(cate), 21 | 'truncate': int(trun), 22 | 'occlusion': int(occ)} 23 | ) 24 | return labels 25 | 26 | 27 | def read_all_labels(ann_root): 28 | ann_list = os.listdir(ann_root) 29 | all_labels = {} 30 | for ann_file in ann_list: 31 | if not ann_file.endswith('txt'): 32 | continue 33 | ann_labels = read_label_txt(os.path.join(ann_root, ann_file)) 34 | all_labels[ann_file] = ann_labels 35 | return all_labels 36 | 37 
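
The helpers above read the raw VisDrone annotation txt files, where each row is `x, y, w, h, score/ignore-flag, category, truncation, occlusion` (only the first eight columns are used); the fifth column is a keep/ignore flag that `read_label_txt` flips into the `ignore` field. A small usage sketch (the paths and file name below are placeholders):

```python
# Sketch: parse raw VisDrone annotations with the helpers above (placeholder paths).
from visdrone.utils import read_label_txt, read_all_labels

labels = read_label_txt('data/visdrone/VisDrone2019-DET-val/annotations/some_image.txt')
for obj in labels[:3]:
    print(obj['bbox'], obj['class'], obj['ignore'], obj['truncate'], obj['occlusion'])

# Read every annotation file in a directory at once, keyed by txt file name:
all_labels = read_all_labels('data/visdrone/VisDrone2019-DET-val/annotations')
print(len(all_labels))
```
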
| 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /visdrone_eval/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 同济大学智能汽车研究所综合感知研究组 ( Comprehensive Perception Research Group under Institute of Intelligent Vehicles, School of Automotive Studies, Tongji University) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /visdrone_eval/README.md: -------------------------------------------------------------------------------- 1 | # visdrone-det-toolkit-python 2 | 3 | Python implementation of evaluation utilities of **[VisDrone2018-DET-toolkit](https://github.com/VisDrone/VisDrone2018-DET-toolkit)**. 4 | 5 | ### Run Evaluation 6 | 7 | Modify the dataset and result directories in evaluate.py and run: 8 | 9 | ```shell 10 | python evaluate.py 11 | ``` 12 | 13 | ### Installation and Usage 14 | 15 | Installation: 16 | 17 | ```bash 18 | pip install -e . 19 | ``` 20 | 21 | An example of using the function `eval_det` is given below: 22 | 23 | ```python 24 | from viseval import eval_det 25 | ... 26 | ap_all, ap_50, ap_75, ar_1, ar_10, ar_100, ar_500 = eval_det( 27 | annotations, results, heights, widths) 28 | ... 
29 | ``` 30 | 31 | Reference: https://github.com/tjiiv-cprg/visdrone-det-toolkit-python.git 32 | 33 | -------------------------------------------------------------------------------- /visdrone_eval/evaluate.py: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/tjiiv-cprg/visdrone-det-toolkit-python 2 | 3 | 4 | import os.path as osp 5 | import os 6 | import numpy as np 7 | import cv2 8 | from viseval.eval_det import eval_det 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser(description='Arguments') 12 | parser.add_argument('--dataset-dir', required=True, type=str, help='VisDrone dataset directory (containing the images/ and annotations/ folders)') 13 | parser.add_argument('--res-dir', required=True, type=str, help='directory with per-image detection result txt files') 14 | args = parser.parse_args() 15 | 16 | def open_label_file(path, dtype=np.float32): 17 | label = np.loadtxt(path, delimiter=',', dtype=dtype, 18 | ndmin=2, usecols=range(8)) 19 | if not len(label): 20 | label = label.reshape(0, 8) 21 | return label 22 | 23 | 24 | def main(): 25 | dataset_dir = args.dataset_dir 26 | res_dir = args.res_dir 27 | 28 | gt_dir = osp.join(dataset_dir, 'annotations') 29 | img_dir = osp.join(dataset_dir, 'images') 30 | 31 | all_gt = [] 32 | all_det = [] 33 | allheight = [] 34 | allwidth = [] 35 | 36 | data_list_path = os.listdir(img_dir) 37 | 38 | for filename in data_list_path: 39 | filename = filename.strip().split('.')[0] 40 | img_path = osp.join(img_dir, filename + '.jpg') 41 | img = cv2.imread(img_path) 42 | height, width = img.shape[:2] 43 | 44 | allheight.append(height) 45 | allwidth.append(width) 46 | 47 | label = open_label_file( 48 | osp.join(gt_dir, filename + '.txt'), dtype=np.int32) 49 | all_gt.append(label) 50 | 51 | det = open_label_file( 52 | osp.join(res_dir, filename + '.txt')) 53 | all_det.append(det) 54 | 55 | ap_all, ap_50, ap_75, ar_1, ar_10, ar_100, ar_500, ap_classwise = eval_det( 56 | all_gt, all_det, allheight, allwidth, per_class=True) 57 | 58 | print('Average Precision (AP) @[ IoU=0.50:0.95 | maxDets=500 ] = {}%.'.format(ap_all)) 59 | print('Average Precision (AP) @[ IoU=0.50 | maxDets=500 ] = {}%.'.format(ap_50)) 60 | print('Average Precision (AP) @[ IoU=0.75 | maxDets=500 ] = {}%.'.format(ap_75)) 61 | print('Average Recall (AR) @[ IoU=0.50:0.95 | maxDets= 1 ] = {}%.'.format(ar_1)) 62 | print('Average Recall (AR) @[ IoU=0.50:0.95 | maxDets= 10 ] = {}%.'.format(ar_10)) 63 | print('Average Recall (AR) @[ IoU=0.50:0.95 | maxDets=100 ] = {}%.'.format(ar_100)) 64 | print('Average Recall (AR) @[ IoU=0.50:0.95 | maxDets=500 ] = {}%.'.format(ar_500)) 65 | 66 | for i, ap in enumerate(ap_classwise): 67 | print('Class {} AP = {}%'.format(i, ap)) 68 | 69 | 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /visdrone_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy~=1.18.5 2 | setuptools~=46.4.0 3 | opencv-python~=4.2.0.34 -------------------------------------------------------------------------------- /visdrone_eval/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import find_packages, setup 3 | 4 | 5 | def parse_requirements(fname='requirements.txt', with_version=True): 6 | """ 7 | Parse the package dependencies listed in a requirements file but strips 8 | specific versioning information.
9 | 10 | Args: 11 | fname (str): path to requirements file 12 | with_version (bool, default=False): if True include version specs 13 | 14 | Returns: 15 | List[str]: list of requirements items 16 | 17 | CommandLine: 18 | python -c "import setup; print(setup.parse_requirements())" 19 | """ 20 | import sys 21 | from os.path import exists 22 | import re 23 | require_fpath = fname 24 | 25 | def parse_line(line): 26 | """ 27 | Parse information from a line in a requirements text file 28 | """ 29 | if line.startswith('-r '): 30 | # Allow specifying requirements in other files 31 | target = line.split(' ')[1] 32 | for info in parse_require_file(target): 33 | yield info 34 | else: 35 | info = {'line': line} 36 | if line.startswith('-e '): 37 | info['package'] = line.split('#egg=')[1] 38 | else: 39 | # Remove versioning from the package 40 | pat = '(' + '|'.join(['>=', '==', '>']) + ')' 41 | parts = re.split(pat, line, maxsplit=1) 42 | parts = [p.strip() for p in parts] 43 | 44 | info['package'] = parts[0] 45 | if len(parts) > 1: 46 | op, rest = parts[1:] 47 | if ';' in rest: 48 | # Handle platform specific dependencies 49 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies 50 | version, platform_deps = map(str.strip, 51 | rest.split(';')) 52 | info['platform_deps'] = platform_deps 53 | else: 54 | version = rest # NOQA 55 | info['version'] = (op, version) 56 | yield info 57 | 58 | def parse_require_file(fpath): 59 | with open(fpath, 'r') as f: 60 | for line in f.readlines(): 61 | line = line.strip() 62 | if line and not line.startswith('#'): 63 | for info in parse_line(line): 64 | yield info 65 | 66 | def gen_packages_items(): 67 | if exists(require_fpath): 68 | for info in parse_require_file(require_fpath): 69 | parts = [info['package']] 70 | if with_version and 'version' in info: 71 | parts.extend(info['version']) 72 | if not sys.version.startswith('3.4'): 73 | # apparently package_deps are broken in 3.4 74 | platform_deps = info.get('platform_deps') 75 | if platform_deps is not None: 76 | parts.append(';' + platform_deps) 77 | item = ''.join(parts) 78 | yield item 79 | 80 | packages = list(gen_packages_items()) 81 | return packages 82 | 83 | 84 | if __name__ == '__main__': 85 | setup( 86 | name='visdrone_eval', 87 | version='0.1', 88 | description='Python Implementation of VisDrone Detection Toolbox', 89 | packages=find_packages(exclude=('configs', 'tools', 'demo')), 90 | install_requires=parse_requirements('requirements.txt') 91 | ) 92 | -------------------------------------------------------------------------------- /visdrone_eval/viseval/__init__.py: -------------------------------------------------------------------------------- 1 | from .eval_det import eval_det 2 | 3 | 4 | __all__ = ['eval_det'] 5 | -------------------------------------------------------------------------------- /visdrone_eval/viseval/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/visdrone_eval/viseval/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /visdrone_eval/viseval/__pycache__/bbox_overlaps.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/visdrone_eval/viseval/__pycache__/bbox_overlaps.cpython-37.pyc -------------------------------------------------------------------------------- /visdrone_eval/viseval/__pycache__/calc_accuracy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/visdrone_eval/viseval/__pycache__/calc_accuracy.cpython-37.pyc -------------------------------------------------------------------------------- /visdrone_eval/viseval/__pycache__/drop_objects_in_igr.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/visdrone_eval/viseval/__pycache__/drop_objects_in_igr.cpython-37.pyc -------------------------------------------------------------------------------- /visdrone_eval/viseval/__pycache__/eval_det.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenhongyiYang/QueryDet-PyTorch/feebf218d53d59ba054132dfa6ef84159f793967/visdrone_eval/viseval/__pycache__/eval_det.cpython-37.pyc -------------------------------------------------------------------------------- /visdrone_eval/viseval/bbox_overlaps.py: -------------------------------------------------------------------------------- 1 | # from mmdetection 2 | 3 | import numpy as np 4 | 5 | 6 | def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6): 7 | """Calculate the ious between each bbox of bboxes1 and bboxes2. 8 | 9 | Args: 10 | bboxes1(ndarray): shape (n, 4) 11 | bboxes2(ndarray): shape (k, 4) 12 | mode(str): iou (intersection over union) or iof (intersection 13 | over foreground) 14 | eps(float): 15 | 16 | Returns: 17 | ious(ndarray): shape (n, k) 18 | """ 19 | 20 | assert mode in ['iou', 'iof'] 21 | 22 | bboxes1 = bboxes1.astype(np.float32) 23 | bboxes2 = bboxes2.astype(np.float32) 24 | rows = bboxes1.shape[0] 25 | cols = bboxes2.shape[0] 26 | ious = np.zeros((rows, cols), dtype=np.float32) 27 | if rows * cols == 0: 28 | return ious 29 | exchange = False 30 | if bboxes1.shape[0] > bboxes2.shape[0]: 31 | bboxes1, bboxes2 = bboxes2, bboxes1 32 | ious = np.zeros((cols, rows), dtype=np.float32) 33 | exchange = True 34 | area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) 35 | area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) 36 | for i in range(bboxes1.shape[0]): 37 | x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) 38 | y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) 39 | x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) 40 | y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) 41 | overlap = np.maximum(x_end - x_start, 0) * np.maximum( 42 | y_end - y_start, 0) 43 | if mode == 'iou': 44 | union = area1[i] + area2 - overlap 45 | else: 46 | union = area1[i] if not exchange else area2 47 | union = np.maximum(union, eps) 48 | ious[i, :] = overlap / union 49 | if exchange: 50 | ious = ious.T 51 | return ious 52 | -------------------------------------------------------------------------------- /visdrone_eval/viseval/calc_accuracy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .bbox_overlaps import bbox_overlaps 3 | 4 | 5 | def eval_res(gt0, dt0, thr): 6 | """ 7 | 
:param gt0: np.array[ng, 5], ground truth results [x, y, w, h, ignore] 8 | :param dt0: np.array[nd, 5], detection results [x, y, w, h, score] 9 | :param thr: float, IoU threshold 10 | :return gt1: np.array[ng, 5], gt match types 11 | dt1: np.array[nd, 6], dt match types 12 | """ 13 | nd = len(dt0) 14 | ng = len(gt0) 15 | 16 | # sort 17 | dt = dt0[dt0[:, 4].argsort()[::-1]] 18 | gt_ignore_mask = gt0[:, 4] == 1 19 | gt = gt0[np.logical_not(gt_ignore_mask)] 20 | ig = gt0[gt_ignore_mask] 21 | ig[:, 4] = -ig[:, 4] # -1 indicates ignore 22 | 23 | dt_format = dt[:, :4].copy() 24 | gt_format = gt[:, :4].copy() 25 | ig_format = ig[:, :4].copy() 26 | dt_format[:, 2:] += dt_format[:, :2] # [x2, y2] = [w, h] + [x1, y1] 27 | gt_format[:, 2:] += gt_format[:, :2] 28 | ig_format[:, 2:] += ig_format[:, :2] 29 | 30 | iou_dtgt = bbox_overlaps(dt_format, gt_format, mode='iou') 31 | iof_dtig = bbox_overlaps(dt_format, ig_format, mode='iof') 32 | oa = np.concatenate((iou_dtgt, iof_dtig), axis=1) 33 | 34 | # [nd, 6] 35 | dt1 = np.concatenate((dt, np.zeros((nd, 1), dtype=dt.dtype)), axis=1) 36 | # [ng, 5] 37 | gt1 = np.concatenate((gt, ig), axis=0) 38 | 39 | for d in range(nd): 40 | bst_oa = thr 41 | bstg = -1 # index of matched gt 42 | bstm = 0 # best match type 43 | for g in range(ng): 44 | m = gt1[g, 4] 45 | # if gt already matched, continue to next gt 46 | if m == 1: 47 | continue 48 | # if dt already matched, and on ignore gt, nothing more to do 49 | if bstm != 0 and m == -1: 50 | break 51 | # continue to next gt until better match is found 52 | if oa[d, g] < bst_oa: 53 | continue 54 | bst_oa = oa[d, g] 55 | bstg = g 56 | bstm = 1 if m == 0 else -1 # 1: matched to gt, -1: matched to ignore 57 | 58 | # store match type for dt 59 | dt1[d, 5] = bstm 60 | # store match flag for gt 61 | if bstm == 1: 62 | gt1[bstg, 4] = 1 63 | 64 | return gt1, dt1 65 | 66 | 67 | def voc_ap(rec, prec): 68 | mrec = np.concatenate(([0], rec, [1])) 69 | mpre = np.concatenate(([0], prec, [0])) 70 | for i in reversed(range(0, len(mpre)-1)): 71 | mpre[i] = max(mpre[i], mpre[i + 1]) 72 | i = np.flatnonzero(mrec[1:] != mrec[:-1]) + 1 73 | ap = np.sum((mrec[i] - mrec[i - 1]) * mpre[i]) 74 | return ap 75 | 76 | 77 | def calc_accuracy(num_imgs, all_gt, all_det, per_class=False): 78 | """ 79 | :param num_imgs: int 80 | :param all_gt: list of np.array[m, 8], [:, 4] == 1 indicates an ignored object; 81 | ignored regions should be dropped before calling this function 82 | :param all_det: list of np.array[m, 6], truncation and occlusion not necessary 83 | :param per_class: 84 | """ 85 | assert num_imgs == len(all_gt) == len(all_det) 86 | 87 | ap = np.zeros((10, 10), dtype=np.float32) 88 | ar = np.zeros((10, 10, 4), dtype=np.float32) 89 | eval_class = [] 90 | 91 | print('') 92 | for id_class in range(1, 11): 93 | print('evaluating object category {}/10...'.format(id_class)) 94 | 95 | for gt in all_gt: 96 | if np.any(gt[:, 5] == id_class): 97 | eval_class.append(id_class - 1) 98 | 99 | x = 0 100 | for thr in np.linspace(0.5, 0.95, num=10): 101 | y = 0 102 | for max_dets in (1, 10, 100, 500): 103 | gt_match = [] 104 | det_match = [] 105 | for gt, det in zip(all_gt, all_det): 106 | det_limited = det[:min(len(det), max_dets)] 107 | mask_gt_cur_class = gt[:, 5] == id_class 108 | mask_det_cur_class = det_limited[:, 5] == id_class 109 | gt0 = gt[mask_gt_cur_class, :5] 110 | dt0 = det_limited[mask_det_cur_class, :5] 111 | gt1, dt1 = eval_res(gt0, dt0, thr) 112 | # 1: matched, 0: unmatched, -1: ignore 113 | gt_match.append(gt1[:, 4]) 114 | # [score, match type] 115 |
# 1: matched to gt, 0: unmatched, -1: matched to ignore 116 | det_match.append(dt1[:, 4:6]) 117 | gt_match = np.concatenate(gt_match, axis=0) 118 | det_match = np.concatenate(det_match, axis=0) 119 | 120 | idrank = det_match[:, 0].argsort()[::-1] 121 | tp = np.cumsum(det_match[idrank, 1] == 1) 122 | rec = tp / max(1, len(gt_match)) # including ignore (already dropped) 123 | if len(rec): 124 | ar[id_class - 1, x, y] = np.max(rec) * 100 125 | 126 | y += 1 127 | 128 | fp = np.cumsum(det_match[idrank, 1] == 0) 129 | prec = tp / (fp + tp).clip(min=1) 130 | ap[id_class - 1, x] = voc_ap(rec, prec) * 100 131 | 132 | x += 1 133 | 134 | ap_all = np.mean(ap[eval_class, :]) 135 | ap_50 = np.mean(ap[eval_class, 0]) 136 | ap_75 = np.mean(ap[eval_class, 5]) 137 | ar_1 = np.mean(ar[eval_class, :, 0]) 138 | ar_10 = np.mean(ar[eval_class, :, 1]) 139 | ar_100 = np.mean(ar[eval_class, :, 2]) 140 | ar_500 = np.mean(ar[eval_class, :, 3]) 141 | 142 | results = (ap_all, ap_50, ap_75, ar_1, ar_10, ar_100, ar_500) 143 | 144 | if per_class: 145 | ap_classwise = np.mean(ap, axis=1) 146 | results += (ap_classwise,) 147 | 148 | print('Evaluation completed. The performance of the detector is presented as follows.') 149 | 150 | return results 151 | -------------------------------------------------------------------------------- /visdrone_eval/viseval/drop_objects_in_igr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def create_int_img(img): 5 | int_img = np.cumsum(img, axis=0) 6 | np.cumsum(int_img, axis=1, out=int_img) 7 | return int_img 8 | 9 | 10 | def drop_objects_in_igr(gt, det, img_height, img_width): 11 | gt_ignore_mask = gt[:, 5] == 0 12 | curgt = gt[np.logical_not(gt_ignore_mask)] 13 | igr_region = gt[gt_ignore_mask, :4].clip(min=1) 14 | if len(igr_region): 15 | igr_map = np.zeros((img_height, img_width), dtype=np.int) 16 | 17 | for igr in igr_region: 18 | x1 = igr[0] 19 | y1 = igr[1] 20 | x2 = min(x1 + igr[2], img_width) 21 | y2 = min(y1 + igr[3], img_height) 22 | igr_map[y1 - 1:y2, x1 - 1:x2] = 1 23 | int_igr_map = create_int_img(igr_map) 24 | idx_left_gt = [] 25 | 26 | for i, gtbox in enumerate(curgt): 27 | pos = np.round(gtbox[:4]).astype(np.int32).clip(min=1) 28 | x = max(1, min(img_width - 1, pos[0])) 29 | y = max(1, min(img_height - 1, pos[1])) 30 | w = pos[2] 31 | h = pos[3] 32 | tl = int_igr_map[y - 1, x - 1] 33 | tr = int_igr_map[y - 1, min(img_width, x + w) - 1] 34 | bl = int_igr_map[max(1, min(img_height, y + h)) - 1, x - 1] 35 | br = int_igr_map[max(1, min(img_height, y + h)) - 1, 36 | min(img_width, x + w) - 1] 37 | igr_val = tl + br - tr - bl 38 | if igr_val / (h * w) < 0.5: 39 | idx_left_gt.append(i) 40 | 41 | curgt = curgt[idx_left_gt] 42 | 43 | idx_left_det = [] 44 | for i, dtbox in enumerate(det): 45 | pos = np.round(dtbox[:4]).astype(np.int32).clip(min=1) 46 | x = max(1, min(img_width - 1, pos[0])) 47 | y = max(1, min(img_height - 1, pos[1])) 48 | w = pos[2] 49 | h = pos[3] 50 | tl = int_igr_map[y - 1, x - 1] 51 | tr = int_igr_map[y - 1, min(img_width, x + w) - 1] 52 | bl = int_igr_map[max(1, min(img_height, y + h)) - 1, x - 1] 53 | br = int_igr_map[max(1, min(img_height, y + h)) - 1, 54 | min(img_width, x + w) - 1] 55 | igr_val = tl + br - tr - bl 56 | if igr_val / (h * w) < 0.5: 57 | idx_left_det.append(i) 58 | 59 | det = det[idx_left_det] 60 | 61 | return curgt, det 62 | -------------------------------------------------------------------------------- /visdrone_eval/viseval/eval_det.py: 
-------------------------------------------------------------------------------- 1 | from .calc_accuracy import calc_accuracy 2 | from .drop_objects_in_igr import drop_objects_in_igr 3 | 4 | 5 | def eval_det(all_gt, all_det, allheight, allwidth, per_class=False): 6 | """ 7 | :param all_gt: list of np.array[m, 8], per-image ground truth in the VisDrone annotation format 8 | :param all_det: list of np.array[m, 6], per-image detections; truncation and occlusion not necessary 9 | :param allheight: list of image heights 10 | :param allwidth: list of image widths 11 | :param per_class: bool, if True also return the per-class AP 12 | """ 13 | all_gt_ = [] 14 | all_det_ = [] 15 | num_imgs = len(all_gt) 16 | for gt, det, height, width in zip(all_gt, all_det, allheight, allwidth): 17 | gt, det = drop_objects_in_igr(gt, det, height, width) 18 | gt[:, 4] = 1 - gt[:, 4] # flip the score flag (1 = considered) into an ignore flag (1 = ignore) 19 | all_gt_.append(gt) 20 | all_det_.append(det) 21 | return calc_accuracy(num_imgs, all_gt_, all_det_, per_class) 22 | --------------------------------------------------------------------------------
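
For reference, `eval_det` operates on plain NumPy arrays in the same column layout as the txt files above: a ground-truth row is `x, y, w, h, score flag, category, truncation, occlusion`, and a detection row is `x, y, w, h, confidence, category`. The following toy sketch is self-contained, and every number in it is made up purely for illustration:

```python
# Toy sketch: evaluate one synthetic image end-to-end (values are illustrative only).
import numpy as np
from viseval import eval_det

# Ground-truth rows: x, y, w, h, score flag (1 = considered), category, truncation, occlusion
gt = np.array([[100, 100, 50, 80, 1, 4, 0, 0],
               [300, 200, 40, 60, 1, 4, 0, 0]], dtype=np.int32)

# Detection rows: x, y, w, h, confidence, category
det = np.array([[102.0,  98.0, 49.0, 82.0, 0.90, 4],
                [500.0, 400.0, 30.0, 30.0, 0.30, 4]], dtype=np.float32)

# Lists are per image; here there is a single 1360x765 image (size is arbitrary).
ap_all, ap_50, ap_75, ar_1, ar_10, ar_100, ar_500 = eval_det(
    [gt], [det], [765], [1360])
print(ap_all, ap_50, ar_500)
```

In practice `evaluate.py` builds exactly these per-image lists from the annotation and result txt files; the sketch only spells out the expected shapes and column order.
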