├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── configs ├── config.py ├── drf_res101_voc.yaml ├── drf_vgg_voc.yaml ├── refine_drf_res101_voc.yaml ├── refine_drf_vgg_voc.yaml ├── refine_res101_voc.yaml ├── refine_vgg_voc.yaml ├── refine_vgg_voc_512.yaml ├── ssd_darknet19_voc.yaml ├── ssd_darknet53_voc.yaml ├── ssd_mobilenetv2_voc.yaml ├── ssd_res101_voc.yaml ├── ssd_res18_voc.yaml ├── ssd_res50_coco.yaml ├── ssd_res50_voc.yaml ├── ssd_vgg_voc.yaml ├── weave_vgg_voc.yaml └── weave_vgg_voc_512.yaml ├── data ├── __init__.py ├── coco.py ├── data_augment.py ├── drf_net.jpg ├── scripts │ ├── VOC2007.sh │ └── VOC2012.sh ├── voc0712.py └── voc_eval.py ├── demo.py ├── eval.py ├── images ├── dog.jpg ├── eagle.jpg └── person.jpg ├── layers ├── __init__.py ├── functions │ ├── __init__.py │ ├── detection.py │ ├── prior_box.py │ └── prior_layer.py └── modules │ ├── __init__.py │ ├── focal_loss_sigmoid.py │ ├── focal_loss_softmax.py │ ├── multibox_loss.py │ ├── refine_multibox_loss.py │ ├── weight_smooth_l1_loss.py │ └── weight_softmax_loss.py ├── make.sh ├── models ├── darknet.py ├── dense_conv.py ├── drf_res.py ├── drf_vgg.py ├── mobilenetv2.py ├── model_builder.py ├── model_helper.py ├── refine_dense_conv.py ├── refine_drf_res.py ├── refine_drf_vgg.py ├── refine_res.py ├── refine_vgg.py ├── resnet.py ├── vgg.py ├── weave_res.py └── weave_vgg.py ├── train.py └── utils ├── __init__.py ├── augmentations.py ├── averageMeter.py ├── box_utils.py ├── build.py ├── collections.py ├── convert_darknet.py ├── get_class_map.py ├── nms ├── __init__.py ├── cpu_nms.pyx ├── gpu_nms.hpp ├── gpu_nms.pyx ├── nms_kernel.cu └── py_cpu_nms.py ├── nms_wrapper.py └── timer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | .ipynb_checkpoints/* linguist-documentation 3 | dev.ipynb linguist-documentation 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # sftp 7 | sftp-config.json 8 | 9 | # coco 10 | ./utils/nms/*.so 11 | ./utils/build 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest
39 | *.spec
40 | 
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 | 
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | .hypothesis/
55 | 
56 | # Translations
57 | *.mo
58 | *.pot
59 | *.json
60 | 
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | 
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 | 
69 | # Scrapy stuff:
70 | .scrapy
71 | 
72 | # Sphinx documentation
73 | docs/_build/
74 | 
75 | # PyBuilder
76 | target/
77 | 
78 | # IPython Notebook
79 | .ipynb_checkpoints
80 | 
81 | # pyenv
82 | .python-version
83 | 
84 | # celery beat schedule file
85 | celerybeat-schedule
86 | 
87 | # dotenv
88 | .env
89 | 
90 | # virtualenv
91 | venv/
92 | ENV/
93 | 
94 | # Spyder project settings
95 | .spyderproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # atom remote-sync package
101 | .remote-sync.json
102 | 
103 | # weights
104 | weights/
105 | 
106 | #DS_Store
107 | .DS_Store
108 | 
109 | # dev stuff
110 | eval/
111 | eval.ipynb
112 | dev.ipynb
113 | .vscode/
114 | 
115 | # not ready
116 | videos/
117 | templates/
118 | data/ssd_dataloader.py
119 | data/datasets/
120 | doc/visualize.py
121 | read_results.py
122 | ssd300_120000/
123 | demos/live
124 | webdemo.py
125 | test_data_aug.py
126 | weights/
127 | 
128 | # attributes
129 | 
130 | # pycharm
131 | .idea/
132 | 
133 | # temp checkout soln
134 | data/datasets/
135 | data/ssd_dataloader.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Max deGroot, Ellis Brown
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SSD Pytorch
2 | A [PyTorch](http://pytorch.org/) implementation of SSDs (including the original SSD, DRFNet, and RefineDet)
3 | 
4 | 
5 | 
6 | ### Table of Contents
7 | - Installation
8 | - Datasets
9 | - Train
10 | - Evaluate
11 | - Performance
12 | - Reference
13 | 
14 |  
15 |  
16 |  
17 |  
18 | 
19 | ## Installation
20 | - Install [PyTorch-0.4.0](http://pytorch.org/) by selecting your environment on the website and running the appropriate command.
21 | - Clone this repository.
22 | * Note: We currently only support Python 3+.
23 | - Then download the dataset by following the [instructions](#download-voc2007-trainval--test) below.
24 | - Compile the NMS code and install the COCO tools:
25 | 
26 | ```shell
27 | cd SSD_Pytorch
28 | # if you use anaconda3, you may need https://github.com/rbgirshick/py-faster-rcnn/issues/706
29 | ./make.sh
30 | pip install pycocotools
31 | 
32 | ```
33 | 
34 | *Note*: Check your GPU architecture support in utils/build.py, line 131. The default is:
35 | 
36 | ```Shell
37 | 'nvcc': ['-arch=sm_52',
38 | 
39 | ```
40 | 
41 | ## Datasets
42 | To make things easy, we provide a simple VOC dataset loader that inherits `torch.utils.data.Dataset`, making it fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html).
43 | 
44 | ### VOC Dataset
45 | ##### Download VOC2007 trainval & test
46 | 
47 | ```Shell
48 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
49 | sh data/scripts/VOC2007.sh #
50 | ```
51 | 
52 | ##### Download VOC2012 trainval
53 | 
54 | ```Shell
55 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
56 | sh data/scripts/VOC2012.sh #
57 | ```
58 | 
59 | ##### Merge VOC2007 and VOC2012
60 | 
61 | ```Shell
62 | # move all images in VOC2007 and VOC2012 into VOCROOT/VOC0712/JPEGImages
63 | # move all annotations in VOC2007 and VOC2012 into VOCROOT/VOC0712/Annotations
64 | # rename and merge the VOC2007 and VOC2012 ImageSets/Main/*.txt files into VOCROOT/VOC0712/ImageSets/Main/*.txt
65 | # the merged txt files are:
66 | # 2012_test.txt, 2007_test.txt, 0712_trainval_test.txt, 2012_trainval.txt, 0712_trainval.txt
67 | 
68 | ```
69 | ### COCO Dataset
70 | Install the MS COCO dataset at /path/to/coco from the [official website](http://mscoco.org/); the default is ~/data/COCO. Follow the [instructions](https://github.com/rbgirshick/py-faster-rcnn/blob/77b773655505599b94fd8f3f9928dbf1a9a776c7/data/README.md) to prepare the *minival2014* and *valminusminival2014* annotations. All label files (.json) should be under the COCO/annotations/ folder. It should have this basic structure:
71 | ```Shell
72 | $COCO/
73 | $COCO/cache/
74 | $COCO/annotations/
75 | $COCO/images/
76 | $COCO/images/test2015/
77 | $COCO/images/train2014/
78 | $COCO/images/val2014/
79 | ```
80 | *UPDATE*: The current COCO dataset has released new *train2017* and *val2017* sets, which are just new splits of the same image sets.
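Since the VOC loader above inherits `torch.utils.data.Dataset`, it plugs straight into a standard PyTorch `DataLoader`. The sketch below is illustrative only: the `VOCDetection` constructor arguments are assumptions (see `data/voc0712.py` for the real signature), while the root path, image sets, and batch size mirror the values used in the configs.

```python
# Minimal sketch: wrap the repo's VOC loader in a torch DataLoader.
# NOTE: the VOCDetection(...) arguments are illustrative assumptions;
# check data/voc0712.py for the actual constructor signature.
import torch.utils.data as data

from data import VOCDetection, detection_collate

VOCROOT = 'data/datasets/VOCdevkit0712/'    # DATASETS.DATAROOT in the configs
train_sets = [['0712', '0712_trainval']]    # DATASETS.TRAIN_TYPE in the configs

dataset = VOCDetection(VOCROOT, train_sets)  # hypothetical arguments
loader = data.DataLoader(dataset,
                         batch_size=32,      # TRAIN.BATCH_SIZE in the configs
                         shuffle=True,
                         num_workers=4,
                         collate_fn=detection_collate)  # stacks images, keeps per-image box lists

for images, targets in loader:
    # images: batched image tensor; targets: list of per-image box/label tensors
    break
```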
81 | 
82 | 
83 | ## Training
84 | - First download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights at: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth
85 | - ResNet pre-trained basenet weight files are available at [ResNet50](https://download.pytorch.org/models/resnet50-19c8e357.pth), [ResNet101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth), and [ResNet152](https://download.pytorch.org/models/resnet152-b121ed2d.pth).
86 | - By default, we assume you have downloaded the files into the `SSD_Pytorch/weights/pretrained_models` dir:
87 | 
88 | ```Shell
89 | mkdir weights
90 | cd weights
91 | mkdir pretrained_models
92 | 
93 | wget https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth
94 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth
95 | wget https://download.pytorch.org/models/resnet101-5d3b4d8f.pth
96 | wget https://download.pytorch.org/models/resnet152-b121ed2d.pth
97 | mv *.pth pretrained_models/
98 | ```
99 | 
100 | - To train SSD_Pytorch using the train script, simply specify the parameters listed in `train.py` as flags or manually change them:
101 | 
102 | ```Shell
103 | python train.py --cfg ./configs/ssd_vgg_voc.yaml
104 | ```
105 | 
106 | - Note:
107 | All training configs are in `ssd_vgg_voc.yaml`; you can change them yourself.
108 | 
109 | - To evaluate a trained network:
110 | 
111 | ```Shell
112 | python eval.py --cfg ./configs/ssd_vgg_voc.yaml --weights ./eval_weights
113 | ```
114 | 
115 | - To detect images:
116 | 
117 | ```Shell
118 | # you need to put some images in ./images
119 | python demo.py --cfg ./configs/ssd_vgg_voc.yaml --images ./images --save_folder ./output
120 | 
121 | ```
122 | You can specify the parameters listed in `eval.py` or `demo.py` by passing them as flags or by manually changing them.
123 | 
124 | ## Performance
125 | 
126 | #### VOC2007 Test
127 | 
128 | ##### mAP
129 | 
130 | We retrained some models, so the results differ from the original papers.
131 | Input size = 300.
132 | 
133 | |ssd_vgg|ssd_res|ssd_darknet|drf_ssd_vgg|drf_ssd_res|refine_drf_vgg|refine_ssd_vgg|
134 | |:-:|:-:|:-:|:-:|:-:|:-:|:-:|
135 | | 77.5% | 77.0% | 77.6% | 79.6% | 79.0% | 80.2% | 80.4% |
136 | 
137 | 
138 | 
139 | 
140 | ## References
141 | - Wei Liu, et al. "SSD: Single Shot MultiBox Detector." [ECCV2016](http://arxiv.org/abs/1512.02325).
142 | - [Original Implementation (CAFFE)](https://github.com/weiliu89/caffe/tree/ssd)
143 | - A list of other great SSD ports that were sources of inspiration (especially the Chainer repo):
144 |   * [ssd.pytorch](https://github.com/amdegroot/ssd.pytorch),
145 |     [RFBNet](https://github.com/ruinmessi/RFBNet),
146 |     [Chainer](https://github.com/Hakuyume/chainer-ssd),
147 |     [torchcv](https://github.com/kuangliu/torchcv)
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
--------------------------------------------------------------------------------
/configs/drf_res101_voc.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 |   TYPE: drf_res101
3 |   SIZE: '300'
4 |   REFINE: False
5 |   CONV_BODY: drf_res.DRFSSDRes101
6 |   NUM_CLASSES: 21
7 |   LOAD_PRETRAINED_WEIGHTS: True
8 |   PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth'
9 | 
10 | TRAIN:
11 |   OVERLAP: 0.5
12 |   BGR_MEAN: [104, 117, 123]
13 |   BATCH_SIZE: 32
14 |   OHEM: True
15 |   NEG_RATIO: 3
16 |   WARMUP: True
17 |   WARMUP_EPOCH: 2
18 |   TRAIN_ON: True
19 | 
20 | SMALL:
21 |   FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5], [3, 3], [1, 1]]
22 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256]
23 |   NUM_ANCHORS: [6, 6, 6, 6, 4, 4]
24 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]]
25 |   MIN_SIZES: [30, 60, 111, 162, 213, 264]
26 |   MAX_SIZES: [60, 111, 162, 213, 264, 315]
27 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
28 |   VARIANCE : [0.1, 0.2]
29 |   CLIP: True
30 |   IMG_WH: [320, 320]
31 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
32 |   USE_MAX_SIZE: True
33 | 
34 | BIG:
35 |   FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]]
36 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
37 |   NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4]
38 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]]
39 |   MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8]
40 |   MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
41 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
42 | 
43 |   CLIP: True
44 |   IMG_WH: [512, 512]
45 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
46 | USE_MAX_SIZE: True 47 | 48 | SOLVER: 49 | WEIGHT_DECAY: 0.0005 50 | BASE_LR: 0.001 51 | GAMMA: 0.1 52 | MOMENTUM: 0.9 53 | EPOCH_STEPS: [0, 150, 250] 54 | END_EPOCH: 250 55 | START_EPOCH: 0 56 | 57 | DATASETS: 58 | TRAIN_TYPE: [['0712', '0712_trainval']] 59 | VAL_TYPE: [['0712', '2007_test']] 60 | DATAROOT: 'data/datasets/VOCdevkit0712/' 61 | DATA_TYPE: 'VOC' 62 | SETS: 63 | VOC: [['0712', '0712_trainval']] 64 | VOC0712PLUS: [['0712', '0712_trainval_test']] 65 | VOC0712: [['2012', '2012_trainval']] 66 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 67 | VOC2007: [['0712', "2007_test"]] 68 | COCOval: [['2014', 'minival']] 69 | VOCROOT: 'data/datasets/VOCdevkit0712/' 70 | COCOROOT: 'data/datasets/coco2015' 71 | 72 | TEST: 73 | INPUT_WH: [300, 300] 74 | CONFIDENCE_THRESH: 0.01 75 | NMS_OVERLAP: 0.45 76 | BATCH_SIZE: 16 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /configs/drf_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: drf_vgg 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: drf_vgg.DRFVgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 3 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [6, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [320, 320] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 |   USE_MAX_SIZE: True
48 | 
49 | SOLVER:
50 |   WEIGHT_DECAY: 0.0005
51 |   BASE_LR: 0.004
52 |   GAMMA: 0.1
53 |   MOMENTUM: 0.9
54 |   EPOCH_STEPS: [0, 150, 200]
55 |   END_EPOCH: 250
56 |   START_EPOCH: 0
57 | 
58 | DATASETS:
59 |   TRAIN_TYPE: [['0712', '0712_trainval']]
60 |   VAL_TYPE: [['0712', '2007_test']]
61 |   DATAROOT: 'data/datasets/VOCdevkit0712/'
62 |   DATA_TYPE: 'VOC'
63 |   SETS:
64 |     VOC: [['0712', '0712_trainval']]
65 |     VOC0712PLUS: [['0712', '0712_trainval_test']]
66 |     VOC0712: [['2012', '2012_trainval']]
67 |     COCO: [['2014', 'train'], ['2014', 'valminusminival']]
68 |     VOC2007: [['0712', "2007_test"]]
69 |     COCOval: [['2014', 'minival']]
70 |   VOCROOT: 'data/datasets/VOCdevkit0712/'
71 |   COCOROOT: 'data/datasets/coco2015'
72 | 
73 | TEST:
74 |   INPUT_WH: [320, 320]
75 |   CONFIDENCE_THRESH: 0.01
76 |   NMS_OVERLAP: 0.45
77 |   BATCH_SIZE: 16
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
--------------------------------------------------------------------------------
/configs/refine_drf_res101_voc.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 |   TYPE: refine_drf_res101
3 |   SIZE: '300'
4 |   REFINE: True
5 |   CONV_BODY: refine_drf_res.RefineDRFRes101
6 |   NUM_CLASSES: 21
7 |   LOAD_PRETRAINED_WEIGHTS: True
8 |   PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth'
9 | 
10 | TRAIN:
11 |   OVERLAP: 0.5
12 |   BGR_MEAN: [104, 117, 123]
13 |   BATCH_SIZE: 32
14 |   OHEM: True
15 |   NEG_RATIO: 3
16 |   WARMUP: True
17 |   WARMUP_EPOCH: 2
18 |   TRAIN_ON: True
19 | 
20 | SMALL:
21 |   FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]]
22 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256]
23 |   ODM_CHANNELS: [512, 1024, 512, 256, 256, 256]
24 |   NUM_ANCHORS: [6, 6, 6, 6, 4, 4]
25 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]]
26 |   MIN_SIZES: [30, 60, 111, 162, 213, 264]
27 |   MAX_SIZES: [60, 111, 162, 213, 264, 315]
28 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
29 |   VARIANCE : [0.1, 0.2]
30 |   CLIP: True
31 |   IMG_WH: [320, 320]
32 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
33 |   USE_MAX_SIZE: True
34 | 
35 | BIG:
36 |   FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]]
37 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
38 |   ODM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
39 |   NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4]
40 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]]
41 |   MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8]
42 |   MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
43 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
44 | 
45 |   CLIP: True
46 |   IMG_WH: [512, 512]
47 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
48 | USE_MAX_SIZE: True 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.001 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [300, 300] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/refine_drf_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_drf_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: refine_drf_vgg.RefineDRFVgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | ODM_CHANNELS: [512, 1024, 512, 256, 256, 256] 25 | NUM_ANCHORS: [6, 6, 6, 6, 4, 4] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 27 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 28 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 29 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: True 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 38 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 39 | ODM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 40 | NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 42 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 43 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 44 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 45 | 46 | CLIP: True 47 | IMG_WH: [512, 512] 48 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
49 | USE_MAX_SIZE: True 50 | 51 | SOLVER: 52 | WEIGHT_DECAY: 0.0005 53 | BASE_LR: 0.004 54 | GAMMA: 0.1 55 | MOMENTUM: 0.9 56 | EPOCH_STEPS: [0, 150, 200] 57 | END_EPOCH: 250 58 | START_EPOCH: 0 59 | 60 | DATASETS: 61 | TRAIN_TYPE: [['0712', '0712_trainval']] 62 | VAL_TYPE: [['0712', '2007_test']] 63 | DATAROOT: 'data/datasets/VOCdevkit0712/' 64 | DATA_TYPE: 'VOC' 65 | SETS: 66 | VOC: [['0712', '0712_trainval']] 67 | VOC0712PLUS: [['0712', '0712_trainval_test']] 68 | VOC0712: [['2012', '2012_trainval']] 69 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 70 | VOC2007: [['0712', "2007_test"]] 71 | COCOval: [['2014', 'minival']] 72 | VOCROOT: 'data/datasets/VOCdevkit0712/' 73 | COCOROOT: 'data/datasets/coco2015' 74 | 75 | TEST: 76 | INPUT_WH: [300, 300] 77 | CONFIDENCE_THRESH: 0.01 78 | NMS_OVERLAP: 0.45 79 | BATCH_SIZE: 16 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /configs/refine_res101_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_res 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: refine_res.RefineResnet101 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 512, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 512, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.001 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [320, 320] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/refine_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: refine_vgg.refine_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.002 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [320, 320] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/refine_vgg_voc_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '512' 4 | REFINE: True 5 | CONV_BODY: refine_vgg.refine_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.002 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [512, 512] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/ssd_darknet19_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_darknet19 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: darknet.SSDarknet19 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/convert_darknet19.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_darknet53_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_darknet53 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: darknet.SSDarknet53 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/convert_darknet53.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_mobilenetv2_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_mobilenetv2 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: mobilenetv2.SSDMobilenetv2 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/mobilenetv2_feature.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [32, 96, 1280, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [32, 96, 1280, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 |   USE_MAX_SIZE: True
48 | 
49 | SOLVER:
50 |   WEIGHT_DECAY: 0.0005
51 |   BASE_LR: 0.001
52 |   GAMMA: 0.1
53 |   MOMENTUM: 0.9
54 |   EPOCH_STEPS: [0, 150, 200]
55 |   END_EPOCH: 250
56 |   START_EPOCH: 0
57 | 
58 | DATASETS:
59 |   TRAIN_TYPE: [['0712', '0712_trainval']]
60 |   VAL_TYPE: [['0712', '2007_test']]
61 |   DATAROOT: 'data/datasets/VOCdevkit0712/'
62 |   DATA_TYPE: 'VOC'
63 |   SETS:
64 |     VOC: [['0712', '0712_trainval']]
65 |     VOC0712PLUS: [['0712', '0712_trainval_test']]
66 |     VOC0712: [['2012', '2012_trainval']]
67 |     COCO: [['2014', 'train'], ['2014', 'valminusminival']]
68 |     VOC2007: [['0712', "2007_test"]]
69 |     COCOval: [['2014', 'minival']]
70 |   VOCROOT: 'data/datasets/VOCdevkit0712/'
71 |   COCOROOT: 'data/datasets/coco2015'
72 | 
73 | TEST:
74 |   INPUT_WH: [300, 300]
75 |   CONFIDENCE_THRESH: 0.01
76 |   NMS_OVERLAP: 0.45
77 |   BATCH_SIZE: 16
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
--------------------------------------------------------------------------------
/configs/ssd_res101_voc.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 |   TYPE: ssd_res101
3 |   SIZE: '300'
4 |   REFINE: False
5 |   CONV_BODY: resnet.SSDResnet101
6 |   NUM_CLASSES: 21
7 |   LOAD_PRETRAINED_WEIGHTS: True
8 |   PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth'
9 | 
10 | TRAIN:
11 |   OVERLAP: 0.5
12 |   BGR_MEAN: [104, 117, 123]
13 |   BATCH_SIZE: 32
14 |   OHEM: True
15 |   NEG_RATIO: 3
16 |   WARMUP: True
17 |   WARMUP_EPOCH: 2
18 |   TRAIN_ON: True
19 | 
20 | 
21 | SMALL:
22 |   FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]]
23 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256]
24 |   NUM_ANCHORS: [4, 6, 6, 6, 4, 4]
25 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]]
26 |   MIN_SIZES: [30, 60, 111, 162, 213, 264]
27 |   MAX_SIZES: [60, 111, 162, 213, 264, 315]
28 |   ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
29 |   VARIANCE : [0.1, 0.2]
30 |   CLIP: True
31 |   IMG_WH: [300, 300]
32 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
33 |   USE_MAX_SIZE: True
34 | 
35 | BIG:
36 |   FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]]
37 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
38 |   NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4]
39 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]]
40 |   MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8]
41 |   MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
42 |   ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
43 | 
44 |   CLIP: True
45 |   IMG_WH: [512, 512]
46 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_res18_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_res18 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: resnet.SSDResnet18 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet18-5c106cde.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [128, 256, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [128, 256, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_res50_coco.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_res50 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: resnet.SSDResnet50 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet50-19c8e357.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 60 | VAL_TYPE: [['2014', 'minival']] 61 | DATAROOT: 'data/datasets/coco2015' 62 | DATA_TYPE: 'COCO' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_res50_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_res50 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: resnet.SSDResnet50 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet50-19c8e357.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_vgg 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: vgg.SSDVgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 32 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/weave_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: weave_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: weave_vgg.weave_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.004 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [320, 320] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/weave_vgg_voc_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: weave_512_vgg 3 | SIZE: '512' 4 | REFINE: True 5 | CONV_BODY: weave_vgg.weave_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.002 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [512, 512] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .voc0712 import VOCDetection, detection_collate 3 | from .coco import * 4 | from .data_augment import * 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | 7 | TODO: implement data_augment for training 8 | 9 | Ellis Brown, Max deGroot 10 | """ 11 | 12 | import torch 13 | from torchvision import transforms 14 | import cv2 15 | import numpy as np 16 | import random 17 | import math 18 | from utils.box_utils import matrix_iou 19 | 20 | 21 | def _crop(image, boxes, labels): 22 | height, width, _ = image.shape 23 | 24 | if len(boxes) == 0: 25 | return image, boxes, labels 26 | 27 | while True: 28 | mode = random.choice(( 29 | None, 30 | (0.1, None), 31 | (0.3, None), 32 | (0.5, None), 33 | (0.7, None), 34 | (0.9, None), 35 | (None, None), 36 | )) 37 | 38 | if mode is None: 39 | return image, boxes, labels 40 | 41 | min_iou, max_iou = mode 42 | if min_iou is None: 43 | min_iou = float('-inf') 44 | if max_iou is None: 45 | max_iou = float('inf') 46 | 47 | for _ in range(50): 48 | scale = random.uniform(0.3, 1.) 49 | min_ratio = max(0.5, scale * scale) 50 | max_ratio = min(2, 1. 
/ scale / scale) 51 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 52 | w = int(scale * ratio * width) 53 | h = int((scale / ratio) * height) 54 | 55 | l = random.randrange(width - w) 56 | t = random.randrange(height - h) 57 | roi = np.array((l, t, l + w, t + h)) 58 | 59 | iou = matrix_iou(boxes, roi[np.newaxis]) 60 | 61 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 62 | continue 63 | 64 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 65 | 66 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 67 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 68 | .all(axis=1) 69 | boxes_t = boxes[mask].copy() 70 | labels_t = labels[mask].copy() 71 | if len(boxes_t) == 0: 72 | continue 73 | 74 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 75 | boxes_t[:, :2] -= roi[:2] 76 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 77 | boxes_t[:, 2:] -= roi[:2] 78 | 79 | return image_t, boxes_t, labels_t 80 | 81 | 82 | def _distort(image): 83 | def _convert(image, alpha=1, beta=0): 84 | tmp = image.astype(float) * alpha + beta 85 | tmp[tmp < 0] = 0 86 | tmp[tmp > 255] = 255 87 | image[:] = tmp 88 | 89 | image = image.copy() 90 | 91 | if random.randrange(2): 92 | _convert(image, beta=random.uniform(-32, 32)) 93 | 94 | if random.randrange(2): 95 | _convert(image, alpha=random.uniform(0.5, 1.5)) 96 | 97 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 98 | 99 | if random.randrange(2): 100 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 101 | tmp %= 180 102 | image[:, :, 0] = tmp 103 | 104 | if random.randrange(2): 105 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 106 | 107 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 108 | 109 | return image 110 | 111 | 112 | def _expand(image, boxes, fill, p): 113 | if random.random() > p: 114 | return image, boxes 115 | 116 | height, width, depth = image.shape 117 | for _ in range(50): 118 | scale = random.uniform(1, 4) 119 | 120 | min_ratio = max(0.5, 1. 
/ scale / scale) 121 | max_ratio = min(2, scale * scale) 122 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 123 | ws = scale * ratio 124 | hs = scale / ratio 125 | if ws < 1 or hs < 1: 126 | continue 127 | w = int(ws * width) 128 | h = int(hs * height) 129 | 130 | left = random.randint(0, w - width) 131 | top = random.randint(0, h - height) 132 | 133 | boxes_t = boxes.copy() 134 | boxes_t[:, :2] += (left, top) 135 | boxes_t[:, 2:] += (left, top) 136 | 137 | expand_image = np.empty((h, w, depth), dtype=image.dtype) 138 | expand_image[:, :] = fill 139 | expand_image[top:top + height, left:left + width] = image 140 | image = expand_image 141 | 142 | return image, boxes_t 143 | 144 | 145 | def _mirror(image, boxes): 146 | _, width, _ = image.shape 147 | if random.randrange(2): 148 | image = image[:, ::-1] 149 | boxes = boxes.copy() 150 | boxes[:, 0::2] = width - boxes[:, 2::-2] 151 | return image, boxes 152 | 153 | 154 | def preproc_for_test(image, resize_wh, mean): 155 | interp_methods = [ 156 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, 157 | cv2.INTER_LANCZOS4 158 | ] 159 | interp_method = interp_methods[random.randrange(5)] 160 | # interp_method = interp_methods[0] 161 | image = cv2.resize( 162 | image, (resize_wh[0], resize_wh[1]), interpolation=interp_method) 163 | image = image.astype(np.float32) 164 | image -= mean 165 | # to rgb 166 | # image = image[:, :, (2, 1, 0)] 167 | return image.transpose(2, 0, 1) 168 | 169 | 170 | class preproc(object): 171 | def __init__(self, resize_wh, rgb_means, p): 172 | self.means = rgb_means 173 | self.resize_wh = resize_wh 174 | self.p = p 175 | 176 | def __call__(self, image, targets): 177 | boxes = targets[:, :-1].copy() 178 | labels = targets[:, -1].copy() 179 | if len(boxes) == 0: 180 | #boxes = np.empty((0, 4)) 181 | targets = np.zeros((1, 5)) 182 | image = preproc_for_test(image, self.resize_wh, self.means) 183 | return torch.from_numpy(image), targets 184 | 185 | image_o = image.copy() 186 | targets_o = targets.copy() 187 | height_o, width_o, _ = image_o.shape 188 | boxes_o = targets_o[:, :-1] 189 | labels_o = targets_o[:, -1] 190 | boxes_o[:, 0::2] /= width_o 191 | boxes_o[:, 1::2] /= height_o 192 | labels_o = np.expand_dims(labels_o, 1) 193 | targets_o = np.hstack((boxes_o, labels_o)) 194 | 195 | image_t, boxes, labels = _crop(image, boxes, labels) 196 | image_t = _distort(image_t) 197 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 198 | image_t, boxes = _mirror(image_t, boxes) 199 | #image_t, boxes = _mirror(image, boxes) 200 | 201 | height, width, _ = image_t.shape 202 | image_t = preproc_for_test(image_t, self.resize_wh, self.means) 203 | boxes = boxes.copy() 204 | boxes[:, 0::2] /= width 205 | boxes[:, 1::2] /= height 206 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 207 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 
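        # At this point boxes are normalized to [0, 1], so b_w and b_h are
        # relative side lengths; the mask below drops any box whose shorter
        # side is under 1% of the image, removing degenerate boxes left over
        # from cropping and resizing.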
208 | mask_b = np.minimum(b_w, b_h) > 0.01 209 | boxes_t = boxes[mask_b] 210 | labels_t = labels[mask_b].copy() 211 | 212 | if len(boxes_t) == 0: 213 | image = preproc_for_test(image_o, self.resize_wh, self.means) 214 | return torch.from_numpy(image), targets_o 215 | 216 | labels_t = np.expand_dims(labels_t, 1) 217 | targets_t = np.hstack((boxes_t, labels_t)) 218 | 219 | return torch.from_numpy(image_t), targets_t 220 | 221 | 222 | class BaseTransform(object): 223 | """Defines the transformations that should be applied to test PIL image 224 | for input into the network 225 | 226 | dimension -> tensorize -> color adj 227 | 228 | Arguments: 229 | resize (int): input dimension to SSD 230 | rgb_means ((int,int,int)): average RGB of the dataset 231 | (104,117,123) 232 | swap ((int,int,int)): final order of channels 233 | Returns: 234 | transform (transform) : callable transform to be applied to test/val 235 | data 236 | """ 237 | 238 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 239 | self.means = rgb_means 240 | self.resize_wh = resize_wh 241 | self.swap = swap 242 | 243 | # assume input is cv2 img for now 244 | def __call__(self, img, target=None): 245 | 246 | interp_methods = [ 247 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 248 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 249 | ] 250 | interp_method = interp_methods[0] 251 | img = cv2.resize( 252 | np.array(img), (self.resize_wh[0], self.resize_wh[1]), 253 | interpolation=interp_method).astype(np.float32) 254 | img -= self.means 255 | img = img.transpose(self.swap) 256 | return torch.from_numpy(img), target 257 | -------------------------------------------------------------------------------- /data/drf_net.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/data/drf_net.jpg -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 
36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | import matplotlib 13 | matplotlib.use('Agg') 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def parse_rec(filename): 18 | """ Parse a PASCAL VOC xml file """ 19 | tree = ET.parse(filename) 20 | objects = [] 21 | for obj in tree.findall('object'): 22 | obj_struct = {} 23 | obj_struct['name'] = obj.find('name').text 24 | obj_struct['pose'] = obj.find('pose').text 25 | obj_struct['truncated'] = int(obj.find('truncated').text) 26 | obj_struct['difficult'] = int(obj.find('difficult').text) 27 | bbox = obj.find('bndbox') 28 | obj_struct['bbox'] = [ 29 | int(bbox.find('xmin').text), 30 | int(bbox.find('ymin').text), 31 | int(bbox.find('xmax').text), 32 | int(bbox.find('ymax').text) 33 | ] 34 | objects.append(obj_struct) 35 | 36 | return objects 37 | 38 | 39 | def voc_ap(rec, prec, use_07_metric=False): 40 | """ ap = voc_ap(rec, prec, [use_07_metric]) 41 | Compute VOC AP given precision and recall. 42 | If use_07_metric is true, uses the 43 | VOC 07 11 point method (default:False). 44 | """ 45 | if use_07_metric: 46 | # 11 point metric 47 | ap = 0. 48 | for t in np.arange(0., 1.1, 0.1): 49 | if np.sum(rec >= t) == 0: 50 | p = 0 51 | else: 52 | p = np.max(prec[rec >= t]) 53 | ap = ap + p / 11. 
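        # The VOC07 metric is the mean of the best precision achievable at
        # each of the eleven recall thresholds 0.0, 0.1, ..., 1.0. For
        # example, a detector with precision 1.0 up to recall 0.5 and zero
        # beyond it scores (6 * 1.0 + 5 * 0.0) / 11, roughly 0.545.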
54 | else: 55 | # correct AP calculation 56 | # first append sentinel values at the end 57 | mrec = np.concatenate(([0.], rec, [1.])) 58 | mpre = np.concatenate(([0.], prec, [0.])) 59 | 60 | # compute the precision envelope 61 | for i in range(mpre.size - 1, 0, -1): 62 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 63 | 64 | # to calculate area under PR curve, look for points 65 | # where X axis (recall) changes value 66 | i = np.where(mrec[1:] != mrec[:-1])[0] 67 | 68 | # and sum (\Delta recall) * prec 69 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 70 | return ap 71 | 72 | 73 | def voc_eval(detpath, 74 | annopath, 75 | imagesetfile, 76 | classname, 77 | cachedir, 78 | ovthresh=0.5, 79 | use_07_metric=False): 80 | """rec, prec, ap = voc_eval(detpath, 81 | annopath, 82 | imagesetfile, 83 | classname, 84 | [ovthresh], 85 | [use_07_metric]) 86 | 87 | Top level function that does the PASCAL VOC evaluation. 88 | 89 | detpath: Path to detections 90 | detpath.format(classname) should produce the detection results file. 91 | annopath: Path to annotations 92 | annopath.format(imagename) should be the xml annotations file. 93 | imagesetfile: Text file containing the list of images, one image per line. 94 | classname: Category name (duh) 95 | cachedir: Directory for caching the annotations 96 | [ovthresh]: Overlap threshold (default = 0.5) 97 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 98 | (default False) 99 | """ 100 | # assumes detections are in detpath.format(classname) 101 | # assumes annotations are in annopath.format(imagename) 102 | # assumes imagesetfile is a text file with each line an image name 103 | # cachedir caches the annotations in a pickle file 104 | 105 | # first load gt 106 | if not os.path.isdir(cachedir): 107 | os.mkdir(cachedir) 108 | cachefile = os.path.join(cachedir, 'annots.pkl') 109 | # read list of images 110 | with open(imagesetfile, 'r') as f: 111 | lines = f.readlines() 112 | imagenames = [x.strip() for x in lines] 113 | 114 | if not os.path.isfile(cachefile): 115 | # load annots 116 | recs = {} 117 | for i, imagename in enumerate(imagenames): 118 | recs[imagename] = parse_rec(annopath.format(imagename)) 119 | if i % 100 == 0: 120 | print('Reading annotation for {:d}/{:d}'.format( 121 | i + 1, len(imagenames))) 122 | # save 123 | print('Saving cached annotations to {:s}'.format(cachefile)) 124 | with open(cachefile, 'wb') as f: 125 | pickle.dump(recs, f) 126 | else: 127 | # load 128 | with open(cachefile, 'rb') as f: 129 | recs = pickle.load(f) 130 | 131 | # extract gt objects for this class 132 | class_recs = {} 133 | npos = 0 134 | for imagename in imagenames: 135 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 136 | bbox = np.array([x['bbox'] for x in R]) 137 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 138 | det = [False] * len(R) 139 | npos = npos + sum(~difficult) 140 | class_recs[imagename] = { 141 | 'bbox': bbox, 142 | 'difficult': difficult, 143 | 'det': det 144 | } 145 | 146 | # read dets 147 | detfile = detpath.format(classname) 148 | with open(detfile, 'r') as f: 149 | lines = f.readlines() 150 | 151 | splitlines = [x.strip().split(' ') for x in lines] 152 | image_ids = [x[0] for x in splitlines] 153 | confidence = np.array([float(x[1]) for x in splitlines]) 154 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 155 | # sort by confidence 156 | sorted_ind = np.argsort(-confidence) 157 | sorted_scores = np.sort(-confidence) 158 | BB = BB[sorted_ind, :] 159 | image_ids = 
[image_ids[x] for x in sorted_ind] 160 | 161 | # go down dets and mark TPs and FPs 162 | nd = len(image_ids) 163 | tp = np.zeros(nd) 164 | fp = np.zeros(nd) 165 | for d in range(nd): 166 | R = class_recs[image_ids[d]] 167 | bb = BB[d, :].astype(float) 168 | ovmax = -np.inf 169 | BBGT = R['bbox'].astype(float) 170 | 171 | if BBGT.size > 0: 172 | # compute overlaps 173 | # intersection 174 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 175 | iymin = np.maximum(BBGT[:, 1], bb[1]) 176 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 177 | iymax = np.minimum(BBGT[:, 3], bb[3]) 178 | iw = np.maximum(ixmax - ixmin + 1., 0.) 179 | ih = np.maximum(iymax - iymin + 1., 0.) 180 | inters = iw * ih 181 | 182 | # union 183 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 184 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 185 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 186 | 187 | overlaps = inters / uni 188 | ovmax = np.max(overlaps) 189 | jmax = np.argmax(overlaps) 190 | 191 | if ovmax > ovthresh: 192 | if not R['difficult'][jmax]: 193 | if not R['det'][jmax]: 194 | tp[d] = 1. 195 | R['det'][jmax] = 1 196 | else: 197 | fp[d] = 1. 198 | else: 199 | fp[d] = 1. 200 | 201 | # compute precision recall 202 | fp = np.cumsum(fp) 203 | tp = np.cumsum(tp) 204 | rec = tp / float(npos) 205 | # avoid divide by zero in case the first detection matches a difficult 206 | # ground truth 207 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 208 | # if classname == 'person': 209 | final_rec = round(rec[-1], 4) 210 | final_prec = round(prec[-1], 4) 211 | plt_save_path = os.path.join(".", "eval", "pr") 212 | if not os.path.exists(plt_save_path): 213 | os.makedirs(plt_save_path) 214 | plt.plot(rec, prec, 'r') 215 | pr_curl = os.path.join( 216 | plt_save_path, '{}_{}_{}pr.jpg'.format(classname, str(final_prec), 217 | str(final_rec))) 218 | plt.savefig(pr_curl) 219 | plt.close() 220 | ap = voc_ap(rec, prec, use_07_metric) 221 | 222 | return rec, prec, ap 223 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1,0" 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, RefineMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file, VOC_CLASSES, COCO_CLASSES 16 | from utils.box_utils import draw_rects 17 | import numpy as np 18 | import time 19 | import os 20 | import sys 21 | import pickle 22 | import datetime 23 | from models.model_builder import SSD 24 | import yaml 25 | import cv2 26 | 27 | 28 | def arg_parse(): 29 | parser = argparse.ArgumentParser( 30 | description='Single Shot MultiBox Detection') 31 | parser.add_argument( 32 | "--images", 33 | dest='images', 34 | help="Image / Directory containing images to perform detection upon", 35 | default="images", 36 | type=str) 37 | parser.add_argument( 38 | '--weights', 39 | default='weights/ssd_darknet_300.pth', 40 | type=str, 41 | help='Trained state_dict file path to open') 42 | parser.add_argument( 43 | '--cfg', 44 | dest='cfg_file', 45 | required=True, 46 | help='Config file for training (and 
optionally testing)') 47 | parser.add_argument( 48 | '--save_folder', 49 | default='eval/', 50 | type=str, 51 | help='File path to save results') 52 | parser.add_argument( 53 | '--num_workers', 54 | default=8, 55 | type=int, 56 | help='Number of workers used in dataloading') 57 | parser.add_argument( 58 | '--retest', default=False, type=bool, help='test cache results') 59 | args = parser.parse_args() 60 | return args 61 | 62 | 63 | def im_detect(img, net, detector, transform, thresh=0.01): 64 | with torch.no_grad(): 65 | t0 = time.time() 66 | w, h = img.shape[1], img.shape[0] 67 | x = transform(img)[0].unsqueeze(0) 68 | x = x.cuda() 69 | t1 = time.time() 70 | output = net(x) 71 | boxes, scores = detector.forward(output) 72 | t2 = time.time() 73 | max_conf, max_id = scores[0].topk(1, 1, True, True) 74 | pos = max_id > 0 75 | if len(pos) == 0: 76 | return np.empty((0, 6)) 77 | boxes = boxes[0][pos.view(-1, 1).expand(len(pos), 4)].view(-1, 4) 78 | scores = max_conf[pos].view(-1, 1) 79 | max_id = max_id[pos].view(-1, 1) 80 | inds = scores > thresh 81 | if len(inds) == 0: 82 | return np.empty((0, 6)) 83 | boxes = boxes[inds.view(-1, 1).expand(len(inds), 4)].view(-1, 4) 84 | scores = scores[inds].view(-1, 1) 85 | max_id = max_id[inds].view(-1, 1) 86 | c_dets = torch.cat((boxes, scores, max_id.float()), 1).cpu().numpy() 87 | img_classes = np.unique(c_dets[:, -1]) 88 | output = None 89 | flag = False 90 | for cls in img_classes: 91 | cls_mask = np.where(c_dets[:, -1] == cls)[0] 92 | image_pred_class = c_dets[cls_mask, :] 93 | keep = nms(image_pred_class, cfg.TEST.NMS_OVERLAP, force_cpu=True) 94 | keep = keep[:50] 95 | image_pred_class = image_pred_class[keep, :] 96 | if not flag: 97 | output = image_pred_class 98 | flag = True 99 | else: 100 | output = np.concatenate((output, image_pred_class), axis=0) 101 | output[:, 0:2][output[:, 0:2] < 0] = 0 102 | output[:, 2:4][output[:, 2:4] > 1] = 1 103 | scale = np.array([w, h, w, h]) 104 | output[:, :4] = output[:, :4] * scale 105 | t3 = time.time() 106 | print("transform_t:", round(t1 - t0, 3), "detect_time:", 107 | round(t2 - t1, 3), "nms_time:", round(t3 - t2, 3)) 108 | return output 109 | 110 | 111 | def main(): 112 | global args 113 | args = arg_parse() 114 | cfg_from_file(args.cfg_file) 115 | bgr_means = cfg.TRAIN.BGR_MEAN 116 | dataset_name = cfg.DATASETS.DATA_TYPE 117 | batch_size = cfg.TEST.BATCH_SIZE 118 | num_workers = args.num_workers 119 | if cfg.DATASETS.DATA_TYPE == 'VOC': 120 | trainvalDataset = VOCDetection 121 | classes = VOC_CLASSES 122 | top_k = 200 123 | else: 124 | trainvalDataset = COCODetection 125 | classes = COCO_CLASSES 126 | top_k = 300 127 | valSet = cfg.DATASETS.VAL_TYPE 128 | num_classes = cfg.MODEL.NUM_CLASSES 129 | save_folder = args.save_folder 130 | if not os.path.exists(save_folder): 131 | os.mkdir(save_folder) 132 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 133 | cfg.TRAIN.TRAIN_ON = False 134 | net = SSD(cfg) 135 | 136 | checkpoint = torch.load(args.weights) 137 | state_dict = checkpoint['model'] 138 | from collections import OrderedDict 139 | new_state_dict = OrderedDict() 140 | for k, v in state_dict.items(): 141 | head = k[:7] 142 | if head == 'module.': 143 | name = k[7:] # remove `module.` 144 | else: 145 | name = k 146 | new_state_dict[name] = v 147 | net.load_state_dict(new_state_dict) 148 | 149 | detector = Detect(cfg) 150 | img_wh = cfg.TEST.INPUT_WH 151 | ValTransform = BaseTransform(img_wh, bgr_means, (2, 0, 1)) 152 | input_folder = args.images 153 | thresh = cfg.TEST.CONFIDENCE_THRESH 154 | 
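    # NOTE: the "[2:3]" slice below restricts the demo to a single image in
    # --images; remove the slice to run detection over the whole folder.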
for item in os.listdir(input_folder)[2:3]: 155 | img_path = os.path.join(input_folder, item) 156 | print(img_path) 157 | img = cv2.imread(img_path) 158 | dets = im_detect(img, net, detector, ValTransform, thresh) 159 | draw_img = draw_rects(img, dets, classes) 160 | out_img_name = "output_" + item 161 | save_path = os.path.join(save_folder, out_img_name) 162 | cv2.imwrite(save_path, img) 163 | 164 | 165 | if __name__ == '__main__': 166 | st = time.time() 167 | main() 168 | print("final time", time.time() - st) 169 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1,0" 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, RefineMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file 16 | import numpy as np 17 | import time 18 | import os 19 | import sys 20 | import pickle 21 | import datetime 22 | from models.model_builder import SSD 23 | import yaml 24 | 25 | 26 | def arg_parse(): 27 | parser = argparse.ArgumentParser( 28 | description='Single Shot MultiBox Detection') 29 | parser.add_argument( 30 | '--weights', 31 | default='weights/ssd_darknet_300.pth', 32 | type=str, 33 | help='Trained state_dict file path to open') 34 | parser.add_argument( 35 | '--cfg', 36 | dest='cfg_file', 37 | required=True, 38 | help='Config file for training (and optionally testing)') 39 | parser.add_argument( 40 | '--save_folder', 41 | default='eval/', 42 | type=str, 43 | help='File path to save results') 44 | parser.add_argument( 45 | '--num_workers', 46 | default=8, 47 | type=int, 48 | help='Number of workers used in dataloading') 49 | parser.add_argument( 50 | '--retest', default=False, type=bool, help='test cache results') 51 | args = parser.parse_args() 52 | return args 53 | 54 | 55 | def eval_net(val_dataset, 56 | val_loader, 57 | net, 58 | detector, 59 | cfg, 60 | transform, 61 | max_per_image=300, 62 | thresh=0.01, 63 | batch_size=1): 64 | net.eval() 65 | num_images = len(val_dataset) 66 | num_classes = cfg.MODEL.NUM_CLASSES 67 | eval_save_folder = "./eval/" 68 | if not os.path.exists(eval_save_folder): 69 | os.mkdir(eval_save_folder) 70 | all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] 71 | det_file = os.path.join(eval_save_folder, 'detections.pkl') 72 | 73 | if args.retest: 74 | f = open(det_file, 'rb') 75 | all_boxes = pickle.load(f) 76 | print('Evaluating detections') 77 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 78 | return 79 | 80 | for idx, (imgs, _, img_info) in enumerate(val_loader): 81 | with torch.no_grad(): 82 | t1 = time.time() 83 | x = imgs 84 | x = x.cuda() 85 | output = net(x) 86 | t4 = time.time() 87 | boxes, scores = detector.forward(output) 88 | t2 = time.time() 89 | for k in range(boxes.size(0)): 90 | i = idx * batch_size + k 91 | boxes_ = boxes[k] 92 | scores_ = scores[k] 93 | boxes_ = boxes_.cpu().numpy() 94 | scores_ = scores_.cpu().numpy() 95 | img_wh = img_info[k] 96 | scale = np.array([img_wh[0], img_wh[1], img_wh[0], img_wh[1]]) 97 | 
boxes_ *= scale 98 | for j in range(1, num_classes): 99 | inds = np.where(scores_[:, j] > thresh)[0] 100 | if len(inds) == 0: 101 | all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 102 | continue 103 | c_bboxes = boxes_[inds] 104 | c_scores = scores_[inds, j] 105 | c_dets = np.hstack((c_bboxes, 106 | c_scores[:, np.newaxis])).astype( 107 | np.float32, copy=False) 108 | keep = nms(c_dets, cfg.TEST.NMS_OVERLAP, force_cpu=True) 109 | keep = keep[:50] 110 | c_dets = c_dets[keep, :] 111 | all_boxes[j][i] = c_dets 112 | t3 = time.time() 113 | detect_time = t2 - t1 114 | nms_time = t3 - t2 115 | forward_time = t4 - t1 116 | if idx % 10 == 0: 117 | print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s {:.3f}s'.format( 118 | i + 1, num_images, forward_time, detect_time, nms_time)) 119 | 120 | with open(det_file, 'wb') as f: 121 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 122 | print('Evaluating detections') 123 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 124 | print("detect time: ", time.time() - st) 125 | 126 | 127 | def main(): 128 | global args 129 | args = arg_parse() 130 | cfg_from_file(args.cfg_file) 131 | bgr_means = cfg.TRAIN.BGR_MEAN 132 | dataset_name = cfg.DATASETS.DATA_TYPE 133 | batch_size = cfg.TEST.BATCH_SIZE 134 | num_workers = args.num_workers 135 | if cfg.DATASETS.DATA_TYPE == 'VOC': 136 | trainvalDataset = VOCDetection 137 | top_k = 200 138 | else: 139 | trainvalDataset = COCODetection 140 | top_k = 300 141 | dataroot = cfg.DATASETS.DATAROOT 142 | if cfg.MODEL.SIZE == '300': 143 | size_cfg = cfg.SMALL 144 | else: 145 | size_cfg = cfg.BIG 146 | valSet = cfg.DATASETS.VAL_TYPE 147 | num_classes = cfg.MODEL.NUM_CLASSES 148 | save_folder = args.save_folder 149 | if not os.path.exists(save_folder): 150 | os.mkdir(save_folder) 151 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 152 | cfg.TRAIN.TRAIN_ON = False 153 | net = SSD(cfg) 154 | 155 | checkpoint = torch.load(args.weights) 156 | state_dict = checkpoint['model'] 157 | from collections import OrderedDict 158 | new_state_dict = OrderedDict() 159 | for k, v in state_dict.items(): 160 | head = k[:7] 161 | if head == 'module.': 162 | name = k[7:] # remove `module.` 163 | else: 164 | name = k 165 | new_state_dict[name] = v 166 | net.load_state_dict(new_state_dict) 167 | detector = Detect(cfg) 168 | ValTransform = BaseTransform(size_cfg.IMG_WH, bgr_means, (2, 0, 1)) 169 | val_dataset = trainvalDataset(dataroot, valSet, ValTransform, "val") 170 | val_loader = data.DataLoader( 171 | val_dataset, 172 | batch_size, 173 | shuffle=False, 174 | num_workers=num_workers, 175 | collate_fn=detection_collate) 176 | top_k = 300 177 | thresh = cfg.TEST.CONFIDENCE_THRESH 178 | eval_net( 179 | val_dataset, 180 | val_loader, 181 | net, 182 | detector, 183 | cfg, 184 | ValTransform, 185 | top_k, 186 | thresh=thresh, 187 | batch_size=batch_size) 188 | 189 | 190 | if __name__ == '__main__': 191 | st = time.time() 192 | main() 193 | print("final time", time.time() - st) 194 | -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/images/dog.jpg -------------------------------------------------------------------------------- /images/eagle.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/images/eagle.jpg -------------------------------------------------------------------------------- /images/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/images/person.jpg -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | # from .refine_prior_box import RefinePriorBox 4 | 5 | 6 | __all__ = ['Detect', 'PriorBox'] 7 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from torch.autograd import Function 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from utils.box_utils import decode, center_size 8 | 9 | 10 | class Detect(Function): 11 | """At test time, Detect is the final layer of SSD. Decode location preds, 12 | apply non-maximum suppression to location predictions based on conf 13 | scores and threshold to a top_k number of output predictions for both 14 | confidence score and locations. 15 | """ 16 | 17 | def __init__(self, cfg): 18 | self.cfg = cfg 19 | self.num_classes = cfg.MODEL.NUM_CLASSES 20 | #self.thresh = thresh 21 | self.size = cfg.MODEL.SIZE 22 | if self.size == '300': 23 | size_cfg = cfg.SMALL 24 | else: 25 | size_cfg = cfg.BIG 26 | # Parameters used in nms. 
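        # VARIANCE feeds decode() in forward(): in the usual SSD encoding the
        # center offsets are scaled by variance[0] and the log-size offsets by
        # variance[1] before being applied to the priors.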
27 | self.variance = size_cfg.VARIANCE 28 | self.object_score = cfg.MODEL.OBJECT_SCORE 29 | 30 | def forward(self, predictions): 31 | """ 32 | Args: 33 | loc_data: (tensor) Loc preds from loc layers 34 | Shape: [batch,num_priors*4] 35 | conf_data: (tensor) Shape: Conf preds from conf layers 36 | Shape: [batch*num_priors,num_classes] 37 | prior_data: (tensor) Prior boxes and variances from priorbox layers 38 | Shape: [1,num_priors,4] 39 | """ 40 | # loc, conf, priors = predictions 41 | if self.cfg.MODEL.REFINE: 42 | arm_loc, arm_conf, loc, conf, priors = predictions 43 | arm_conf = F.softmax(arm_conf.view(-1, 2), 1) 44 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 45 | arm_loc_data = arm_loc.data 46 | arm_conf_data = arm_conf.data 47 | arm_object_conf = arm_conf_data[:, 1:] 48 | no_object_index = arm_object_conf <= self.object_score 49 | conf.data[no_object_index.expand_as(conf.data)] = 0 50 | else: 51 | loc, conf, priors = predictions 52 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 53 | loc_data = loc.data 54 | conf_data = conf.data 55 | # prior_data = priors.data 56 | prior_data = priors[:loc_data.size(1), :] 57 | 58 | num = loc_data.size(0) # batch size 59 | 60 | self.num_priors = prior_data.size(0) 61 | 62 | self.boxes = torch.zeros(num, self.num_priors, 4) 63 | self.scores = torch.zeros(num, self.num_priors, self.num_classes) 64 | conf_preds = conf_data.view(num, self.num_priors, self.num_classes) 65 | batch_prior = prior_data.view(-1, self.num_priors, 4).expand( 66 | (num, self.num_priors, 4)) 67 | batch_prior = batch_prior.contiguous().view(-1, 4) 68 | if self.cfg.MODEL.REFINE: 69 | default = decode( 70 | arm_loc_data.view(-1, 4), batch_prior, self.variance) 71 | default = center_size(default) 72 | decoded_boxes = decode( 73 | loc_data.view(-1, 4), default, self.variance) 74 | else: 75 | decoded_boxes = decode( 76 | loc_data.view(-1, 4), batch_prior, self.variance) 77 | 78 | self.scores = conf_preds.view(num, self.num_priors, self.num_classes) 79 | self.boxes = decoded_boxes.view(num, self.num_priors, 4) 80 | return self.boxes, self.scores -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | 5 | 6 | class PriorBox(object): 7 | """Compute priorbox coordinates in center-offset form for each source 8 | feature map. 9 | Note: 10 | This 'layer' has changed between versions of the original SSD 11 | paper, so we include both versions, but note v2 is the most tested and most 12 | recent version of the paper. 
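    Worked example: with IMG_WH [320, 320] and STEPS [[8, 8], ...], the first
    feature map is 40x40; the prior at cell (i, j) is centered at
    cx = (j + 0.5) / 40, cy = (i + 0.5) / 40, and its aspect-ratio-1 box has
    relative size MIN_SIZES[0] / 320 = 30 / 320 = 0.09375.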
13 | 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(PriorBox, self).__init__() 18 | self.size = cfg.MODEL.SIZE 19 | if self.size == '300': 20 | size_cfg = cfg.SMALL 21 | else: 22 | size_cfg = cfg.BIG 23 | self.img_wh = size_cfg.IMG_WH 24 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 25 | self.feature_maps = size_cfg.FEATURE_MAPS 26 | self.variance = size_cfg.VARIANCE or [0.1] 27 | self.min_sizes = size_cfg.MIN_SIZES 28 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 29 | if self.use_max_sizes: 30 | self.max_sizes = size_cfg.MAX_SIZES 31 | self.steps = size_cfg.STEPS 32 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 33 | self.clip = size_cfg.CLIP 34 | for v in self.variance: 35 | if v <= 0: 36 | raise ValueError('Variances must be greater than 0') 37 | 38 | def forward(self): 39 | mean = [] 40 | for k, f in enumerate(self.feature_maps): 41 | grid_h, grid_w = f[1], f[0] 42 | for i in range(grid_h): 43 | for j in range(grid_w): 44 | f_k_h = self.img_wh[1] / self.steps[k][1] 45 | f_k_w = self.img_wh[0] / self.steps[k][0] 46 | # unit center x,y 47 | cx = (j + 0.5) / f_k_w 48 | cy = (i + 0.5) / f_k_h 49 | 50 | # aspect_ratio: 1 51 | # rel size: min_size 52 | s_k_h = self.min_sizes[k] / self.img_wh[1] 53 | s_k_w = self.min_sizes[k] / self.img_wh[0] 54 | mean += [cx, cy, s_k_w, s_k_h] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | if self.use_max_sizes: 59 | s_k_prime_w = sqrt( 60 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 61 | s_k_prime_h = sqrt( 62 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 63 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 64 | 65 | for ar in self.aspect_ratios[k]: 66 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 67 | 68 | # back to torch land 69 | output = torch.Tensor(mean).view(-1, 4) 70 | if self.clip: 71 | output.clamp_(max=1, min=0) 72 | # print(output.size()) 73 | return output 74 | -------------------------------------------------------------------------------- /layers/functions/prior_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from math import ceil 4 | import torch.nn as nn 5 | from itertools import product as product 6 | 7 | 8 | class PriorLayer(nn.Module): 9 | def __init__(self, cfg): 10 | super(PriorLayer, self).__init__() 11 | self.size = cfg.MODEL.SIZE 12 | if self.size == '300': 13 | size_cfg = cfg.SMALL 14 | else: 15 | size_cfg = cfg.BIG 16 | self.img_wh = size_cfg.IMG_WH 17 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 18 | self.feature_maps = size_cfg.FEATURE_MAPS 19 | self.variance = size_cfg.VARIANCE or [0.1] 20 | self.min_sizes = size_cfg.MIN_SIZES 21 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 22 | if self.use_max_sizes: 23 | self.max_sizes = size_cfg.MAX_SIZES 24 | self.steps = size_cfg.STEPS 25 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 26 | self.clip = size_cfg.CLIP 27 | for v in self.variance: 28 | if v <= 0: 29 | raise ValueError('Variances must be greater than 0') 30 | 31 | def forward(self, img_wh, feature_maps_wh): 32 | self.img_wh = img_wh 33 | self.feature_maps_wh = feature_maps_wh 34 | mean = [] 35 | for k, f in enumerate(self.feature_maps_wh): 36 | grid_h, grid_w = f[1], f[0] 37 | for i in range(grid_h): 38 | for j in range(grid_w): 39 | f_k_h = self.img_wh[1] / self.steps[k][1] 40 | f_k_w = self.img_wh[0] / self.steps[k][0] 41 | # unit center x,y 42 | cx = (j + 0.5) / f_k_w 43 | cy = (i + 0.5) / f_k_h 44 | 45 | # aspect_ratio: 1 46 | # rel size: min_size 47 | s_k_h = self.min_sizes[k] / 
self.img_wh[1] 48 | s_k_w = self.min_sizes[k] / self.img_wh[0] 49 | mean += [cx, cy, s_k_w, s_k_h] 50 | 51 | # aspect_ratio: 1 52 | # rel size: sqrt(s_k * s_(k+1)) 53 | if self.use_max_sizes: 54 | s_k_prime_w = sqrt( 55 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 56 | s_k_prime_h = sqrt( 57 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 58 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 59 | 60 | for ar in self.aspect_ratios[k]: 61 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 62 | 63 | output = torch.Tensor(mean).view(-1, 4) 64 | if self.clip: 65 | output.clamp_(max=1, min=0) 66 | return output 67 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .weight_smooth_l1_loss import WeightSmoothL1Loss 2 | from .weight_softmax_loss import WeightSoftmaxLoss 3 | from .multibox_loss import MultiBoxLoss 4 | from .refine_multibox_loss import RefineMultiBoxLoss 5 | from .focal_loss_sigmoid import FocalLossSigmoid 6 | from .focal_loss_softmax import FocalLossSoftmax 7 | 8 | 9 | 10 | __all__ = ['MultiBoxLoss', 'WeightSoftmaxLoss', ] 11 | -------------------------------------------------------------------------------- /layers/modules/focal_loss_sigmoid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSigmoid(nn.Module): 11 | ''' 12 | sigmoid version focal loss 13 | ''' 14 | 15 | def __init__(self, alpha=0.25, gamma=2, size_average=False): 16 | super(FocalLossSigmoid, self).__init__() 17 | self.alpha = alpha 18 | self.gamma = gamma 19 | self.size_average = size_average 20 | 21 | def forward(self, inputs, targets): 22 | N = inputs.size(0) 23 | C = inputs.size(1) 24 | P = torch.sigmoid(inputs) 25 | alpha_mask = self.alpha * targets 26 | loss_pos = -1. * torch.pow( 27 | 1 - P, self.gamma) * torch.log(P) * targets * alpha_mask 28 | loss_neg = -1. 
* torch.pow(P, self.gamma) * torch.log(1 - P) * ( 29 | 1 - targets) * (1 - alpha_mask) 30 | batch_loss = loss_neg + loss_pos 31 | if self.size_average: 32 | loss = batch_loss.mean() 33 | else: 34 | loss = batch_loss.sum() 35 | return loss 36 | -------------------------------------------------------------------------------- /layers/modules/focal_loss_softmax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSoftmax(nn.Module): 11 | ''' 12 | softmax version focal loss 13 | ''' 14 | 15 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 16 | super(FocalLossSoftmax, self).__init__() 17 | if alpha is None: 18 | self.alpha = Variable(torch.ones(class_num, 1)) 19 | else: 20 | if isinstance(alpha, Variable): 21 | self.alpha = alpha 22 | else: 23 | self.alpha = Variable(alpha) 24 | self.gamma = gamma 25 | self.class_num = class_num 26 | self.size_average = size_average 27 | 28 | def forward(self, inputs, targets): 29 | N = inputs.size(0) 30 | C = inputs.size(1) 31 | P = F.softmax(inputs) 32 | 33 | class_mask = inputs.data.new(N, C).fill_(0) 34 | class_mask = Variable(class_mask) 35 | ids = targets.view(-1, 1) 36 | class_mask.scatter_(1, ids.data, 1.) 37 | 38 | if inputs.is_cuda and not self.alpha.is_cuda: 39 | self.alpha = self.alpha.cuda() 40 | alpha = self.alpha[ids.data.view(-1)] 41 | probs = (P * class_mask).sum(1).view(-1, 1) 42 | log_p = probs.log() 43 | batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p 44 | 45 | if self.size_average: 46 | loss = batch_loss.mean() 47 | else: 48 | loss = batch_loss.sum() 49 | return loss -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.autograd import Variable 6 | from utils.box_utils import match, log_sum_exp 7 | from .focal_loss_softmax import FocalLossSoftmax 8 | from .focal_loss_sigmoid import FocalLossSigmoid 9 | 10 | GPU = False 11 | if torch.cuda.is_available(): 12 | GPU = True 13 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 14 | 15 | 16 | class MultiBoxLoss(nn.Module): 17 | """SSD Weighted Loss Function 18 | Compute Targets: 19 | 1) Produce Confidence Target Indices by matching ground truth boxes 20 | with (default) 'priorboxes' that have jaccard index > threshold parameter 21 | (default threshold: 0.5). 22 | 2) Produce localization target by 'encoding' variance into offsets of ground 23 | truth boxes and their matched 'priorboxes'. 24 | 3) Hard negative mining to filter the excessive number of negative examples 25 | that comes with using a large number of default bounding boxes. 26 | (default negative:positive ratio 3:1) 27 | Objective Loss: 28 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 29 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 30 | weighted by α which is set to 1 by cross val. 31 | Args: 32 | c: class confidences, 33 | l: predicted boxes, 34 | g: ground truth boxes 35 | N: number of matched default boxes 36 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
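    For example, with the default NEG_RATIO of 3, an image whose priors match
    20 ground truth boxes keeps only the 60 highest-loss negative priors in
    the confidence loss and ignores the rest.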
37 | """ 38 | 39 | def __init__(self, cfg): 40 | super(MultiBoxLoss, self).__init__() 41 | self.cfg = cfg 42 | self.size = cfg.MODEL.SIZE 43 | if self.size == '300': 44 | size_cfg = cfg.SMALL 45 | else: 46 | size_cfg = cfg.BIG 47 | self.variance = size_cfg.VARIANCE 48 | self.num_classes = cfg.MODEL.NUM_CLASSES 49 | self.threshold = cfg.TRAIN.OVERLAP 50 | self.OHEM = cfg.TRAIN.OHEM 51 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 52 | self.variance = size_cfg.VARIANCE 53 | if cfg.TRAIN.FOCAL_LOSS: 54 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 55 | self.focaloss = FocalLossSoftmax( 56 | self.num_classes, gamma=2, size_average=False) 57 | else: 58 | self.focaloss = FocalLossSigmoid() 59 | 60 | def forward(self, predictions, targets): 61 | """Multibox Loss 62 | Args: 63 | predictions (tuple): A tuple containing loc preds, conf preds, 64 | and prior boxes from SSD net. 65 | conf shape: torch.size(batch_size,num_priors,num_classes) 66 | loc shape: torch.size(batch_size,num_priors,4) 67 | priors shape: torch.size(num_priors,4) 68 | 69 | ground_truth (tensor): Ground truth boxes and labels for a batch, 70 | shape: [batch_size,num_objs,5] (last idx is the label). 71 | """ 72 | loc_data, conf_data, priors = predictions 73 | num = loc_data.size(0) 74 | priors = priors[:loc_data.size(1), :] 75 | num_priors = (priors.size(0)) 76 | num_classes = self.num_classes 77 | loc_t = torch.Tensor(num, num_priors, 4) 78 | conf_t = torch.LongTensor(num, num_priors) 79 | for idx in range(num): 80 | truths = targets[idx][:, :-1].data 81 | labels = targets[idx][:, -1].data 82 | if self.num_classes == 2: 83 | labels = labels > 0 84 | defaults = priors.data 85 | match(self.threshold, truths, defaults, self.variance, labels, 86 | loc_t, conf_t, idx) 87 | loc_t = loc_t.cuda() 88 | conf_t = conf_t.cuda() 89 | 90 | pos = conf_t > 0 91 | num_pos = pos.sum(1, keepdim=True) 92 | 93 | if self.OHEM: 94 | # Compute max conf across batch for hard negative mining 95 | batch_conf = conf_data.view(-1, self.num_classes) 96 | 97 | loss_hard = log_sum_exp(batch_conf) - batch_conf.gather( 98 | 1, conf_t.view(-1, 1)) 99 | # Hard Negative Mining 100 | loss_hard[pos.view(-1, 1)] = 0 # filter out pos boxes for now 101 | loss_hard = loss_hard.view(num, -1) 102 | _, loss_idx = loss_hard.sort(1, descending=True) 103 | _, idx_rank = loss_idx.sort(1) 104 | num_pos = pos.long().sum(1, keepdim=True) 105 | if num_pos.data.sum() > 0: 106 | num_neg = torch.clamp( 107 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 108 | else: 109 | fake_num_pos = torch.ones(32, 1).long() * 15 110 | num_neg = torch.clamp( 111 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 112 | neg = idx_rank < num_neg.expand_as(idx_rank) 113 | 114 | # Confidence Loss Including Positive and Negative Examples 115 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 116 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 117 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 118 | -1, self.num_classes) 119 | targets_weighted = conf_t[(pos + neg).gt(0)] 120 | loss_c = F.cross_entropy( 121 | conf_p, targets_weighted, size_average=False) 122 | else: 123 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 124 | # Localization Loss (Smooth L1) 125 | # Shape: [batch,num_priors,4] 126 | if num_pos.data.sum() > 0: 127 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 128 | loc_p = loc_data[pos_idx].view(-1, 4) 129 | loc_t = loc_t[pos_idx].view(-1, 4) 130 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 131 | N = num_pos.data.sum() 132 | else: 133 | 
loss_l = torch.zeros(1) 134 | N = 1.0 135 | loss_l /= float(N) 136 | loss_c /= float(N) 137 | return loss_l, loss_c 138 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from torch.autograd import Variable 9 | from utils.box_utils import match, log_sum_exp, refine_match 10 | from layers.modules import WeightSoftmaxLoss, WeightSmoothL1Loss 11 | GPU = False 12 | if torch.cuda.is_available(): 13 | GPU = True 14 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 15 | 16 | 17 | class RefineMultiBoxLoss(nn.Module): 18 | """SSD Weighted Loss Function 19 | Compute Targets: 20 | 1) Produce Confidence Target Indices by matching ground truth boxes 21 | with (default) 'priorboxes' that have jaccard index > threshold parameter 22 | (default threshold: 0.5). 23 | 2) Produce localization target by 'encoding' variance into offsets of ground 24 | truth boxes and their matched 'priorboxes'. 25 | 3) Hard negative mining to filter the excessive number of negative examples 26 | that comes with using a large number of default bounding boxes. 27 | (default negative:positive ratio 3:1) 28 | Objective Loss: 29 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 30 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 31 | weighted by α which is set to 1 by cross val. 32 | Args: 33 | c: class confidences, 34 | l: predicted boxes, 35 | g: ground truth boxes 36 | N: number of matched default boxes 37 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 38 | """ 39 | 40 | def __init__(self, cfg, num_classes): 41 | super(RefineMultiBoxLoss, self).__init__() 42 | self.cfg = cfg 43 | self.size = cfg.MODEL.SIZE 44 | if self.size == '300': 45 | size_cfg = cfg.SMALL 46 | else: 47 | size_cfg = cfg.BIG 48 | self.variance = size_cfg.VARIANCE 49 | self.num_classes = num_classes 50 | self.threshold = cfg.TRAIN.OVERLAP 51 | self.OHEM = cfg.TRAIN.OHEM 52 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 53 | self.object_score = cfg.MODEL.OBJECT_SCORE 54 | self.variance = size_cfg.VARIANCE 55 | if cfg.TRAIN.FOCAL_LOSS: 56 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 57 | self.focaloss = FocalLossSoftmax( 58 | self.num_classes, gamma=2, size_average=False) 59 | else: 60 | self.focaloss = FocalLossSigmoid() 61 | 62 | def forward(self, 63 | predictions, 64 | targets, 65 | use_arm=False, 66 | filter_object=False, 67 | debug=False): 68 | """Multibox Loss 69 | Args: 70 | predictions (tuple): A tuple containing loc preds, conf preds, 71 | and prior boxes from SSD net. 72 | conf shape: torch.size(batch_size,num_priors,num_classes) 73 | loc shape: torch.size(batch_size,num_priors,4) 74 | priors shape: torch.size(num_priors,4) 75 | 76 | ground_truth (tensor): Ground truth boxes and labels for a batch, 77 | shape: [batch_size,num_objs,5] (last idx is the label). 
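        use_arm (bool): when True, priors are first refined by the ARM
            localization output before matching (refine_match), and
            filter_object additionally removes positives whose ARM
            objectness score is at most OBJECT_SCORE.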
78 | """ 79 | # arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 80 | if use_arm: 81 | arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 82 | else: 83 | loc_data, conf_data, _, _, priors = predictions 84 | num = loc_data.size(0) 85 | priors = priors[:loc_data.size(1), :] 86 | num_priors = (priors.size(0)) 87 | num_classes = self.num_classes 88 | 89 | # match priors (default boxes) and ground truth boxes 90 | loc_t = torch.Tensor(num, num_priors, 4) 91 | conf_t = torch.LongTensor(num, num_priors) 92 | defaults = priors.data 93 | for idx in range(num): 94 | truths = targets[idx][:, :-1].data 95 | labels = targets[idx][:, -1].data 96 | if self.num_classes == 2: 97 | labels = labels > 0 98 | if use_arm: 99 | bbox_weight = refine_match( 100 | self.threshold, 101 | truths, 102 | defaults, 103 | self.variance, 104 | labels, 105 | loc_t, 106 | conf_t, 107 | idx, 108 | arm_loc_data[idx].data, 109 | use_weight=False) 110 | else: 111 | match(self.threshold, truths, defaults, self.variance, labels, 112 | loc_t, conf_t, idx) 113 | 114 | loc_t = loc_t.cuda() 115 | conf_t = conf_t.cuda() 116 | # wrap targets 117 | loc_t = Variable(loc_t, requires_grad=False) 118 | conf_t = Variable(conf_t, requires_grad=False) 119 | 120 | if use_arm and filter_object: 121 | P = F.softmax(arm_conf_data, 2) 122 | arm_conf_data_temp = P[:, :, 1] 123 | object_score_index = arm_conf_data_temp <= self.object_score 124 | pos = conf_t > 0 125 | pos[object_score_index.detach()] = 0 126 | else: 127 | pos = conf_t > 0 128 | num_pos = pos.sum(1, keepdim=True) 129 | if debug: 130 | if use_arm: 131 | print("odm pos num: ", str(loc_t.size(0)), str(loc_t.size(1))) 132 | else: 133 | print("arm pos num", str(loc_t.size(0)), str(loc_t.size(1))) 134 | 135 | if self.OHEM: 136 | # Compute max conf across batch for hard negative mining 137 | batch_conf = conf_data.view(-1, self.num_classes) 138 | 139 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather( 140 | 1, conf_t.view(-1, 1)) 141 | 142 | # Hard Negative Mining 143 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 144 | loss_c = loss_c.view(num, -1) 145 | _, loss_idx = loss_c.sort(1, descending=True) 146 | _, idx_rank = loss_idx.sort(1) 147 | num_pos = pos.long().sum(1, keepdim=True) 148 | 149 | if num_pos.data.sum() > 0: 150 | num_neg = torch.clamp( 151 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 152 | else: 153 | fake_num_pos = torch.ones(32, 1).long() * 15 154 | num_neg = torch.clamp( 155 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 156 | neg = idx_rank < num_neg.expand_as(idx_rank) 157 | 158 | # Confidence Loss Including Positive and Negative Examples 159 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 160 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 161 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 162 | -1, self.num_classes) 163 | 164 | targets_weighted = conf_t[(pos + neg).gt(0)] 165 | loss_c = F.cross_entropy( 166 | conf_p, targets_weighted, size_average=False) 167 | else: 168 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 169 | 170 | # Localization Loss (Smooth L1) 171 | # Shape: [batch,num_priors,4] 172 | if num_pos.data.sum() > 0: 173 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 174 | loc_p = loc_data[pos_idx].view(-1, 4) 175 | loc_t = loc_t[pos_idx].view(-1, 4) 176 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 177 | N = num_pos.data.sum() 178 | else: 179 | loss_l = torch.zeros(1) 180 | N = 1.0 181 | 182 | loss_l /= float(N) 183 | loss_c /= 
float(N) 184 | return loss_l, loss_c 185 | -------------------------------------------------------------------------------- /layers/modules/weight_smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSmoothL1Loss(nn.Module): 11 | def __init__(self, class_num, size_average=False): 12 | super(WeightSmoothL1Loss, self).__init__() 13 | self.class_num = class_num 14 | self.size_average = size_average 15 | 16 | def forward(self, inputs, targets, weights): 17 | N = inputs.size(0) 18 | loc_num = inputs.size(1) 19 | abs_out = torch.abs(inputs - targets) 20 | 21 | if inputs.is_cuda and not weights.is_cuda: 22 | weights = weights.cuda() 23 | 24 | weights = weights.view(-1, 1) 25 | 26 | weights = torch.cat((weights, weights, weights, weights), 1) 27 | mask_big = abs_out >= 1. 28 | mask_small = abs_out < 1. 29 | loss_big = weights[mask_big] * (abs_out[mask_big] - 0.5) 30 | loss_small = weights[mask_small] * 0.5 * torch.pow( 31 | abs_out[mask_small], 2) 32 | loss_sum = loss_big.sum() + loss_small.sum() 33 | 34 | if self.size_average: 35 | loss = loss_sum / N * loc_num 36 | else: 37 | loss = loss_sum 38 | return loss 39 | -------------------------------------------------------------------------------- /layers/modules/weight_softmax_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSoftmaxLoss(nn.Module): 11 | def __init__(self, class_num, gamma=2, size_average=True): 12 | super(WeightSoftmaxLoss, self).__init__() 13 | # if isinstance(weights, Variable): 14 | # self.weights = weights 15 | # else: 16 | # self.weights = Variable(weights) 17 | 18 | self.class_num = class_num 19 | self.gamma = gamma 20 | self.size_average = size_average 21 | 22 | def forward(self, inputs, targets, weights): 23 | N = inputs.size(0) 24 | C = inputs.size(1) 25 | P = F.softmax(inputs) 26 | 27 | class_mask = inputs.data.new(N, C).fill_(0) 28 | class_mask = Variable(class_mask) 29 | ids = targets.view(-1, 1) 30 | class_mask.scatter_(1, ids.data, 1.) 31 | if inputs.is_cuda and not weights.is_cuda: 32 | weights = weights.cuda() 33 | probs = (P * class_mask).sum(1).view(-1, 1) 34 | 35 | log_p = probs.log() 36 | weights = weights.view(-1, 1) 37 | batch_loss = -weights * log_p 38 | 39 | if self.size_average: 40 | loss = batch_loss.mean() 41 | else: 42 | loss = batch_loss.sum() 43 | return loss -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | # if you use anaconda3 maybe you need add this 8 | # change code like https://github.com/rbgirshick/py-faster-rcnn/issues/706 9 | mv nms/cpu_nms.cpython-36m-x86_64-linux-gnu.so nms/cpu_nms.so 10 | mv nms/gpu_nms.cpython-36m-x86_64-linux-gnu.so nms/gpu_nms.so 11 | cd .. 
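make.sh compiles the Cython/CUDA NMS extensions that nms_wrapper.py dispatches to. For reference, below is a minimal pure-Python sketch of the same greedy suppression, along the lines of utils/nms/py_cpu_nms.py; it is not a drop-in replacement for the compiled kernels, just the algorithm, and it assumes each dets row is [x1, y1, x2, y2, score] as produced in eval.py.

```python
import numpy as np

def py_cpu_nms(dets, thresh):
    """Greedy non-maximum suppression on [x1, y1, x2, y2, score] rows."""
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]
    # box areas with the integer-pixel (+1) convention used in voc_eval.py
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # intersection of the current winner with every remaining box
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # drop boxes overlapping the winner by more than thresh
        order = order[np.where(iou <= thresh)[0] + 1]
    return keep
```

The compiled cpu_nms/gpu_nms kernels implement the same loop; callers such as eval.py pass thresh = cfg.TEST.NMS_OVERLAP (0.45 in the configs above).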
12 | -------------------------------------------------------------------------------- /models/darknet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | # 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | class ConvBN(nn.Module): 31 | def __init__(self, ch_in, ch_out, kernel_size=3, stride=1, padding=0): 32 | super().__init__() 33 | self.conv = nn.Conv2d( 34 | ch_in, 35 | ch_out, 36 | kernel_size=kernel_size, 37 | stride=stride, 38 | padding=padding, 39 | bias=False) 40 | self.bn = nn.BatchNorm2d(ch_out, momentum=0.01, eps=1e-05, affine=True) 41 | 42 | def forward(self, x): 43 | return F.leaky_relu( 44 | self.bn(self.conv(x)), negative_slope=0.1, inplace=True) 45 | 46 | 47 | class DarknetBlock(nn.Module): 48 | def __init__(self, ch_in): 49 | super().__init__() 50 | ch_hid = ch_in // 2 51 | self.conv1 = ConvBN(ch_in, ch_hid, kernel_size=1, stride=1, padding=0) 52 | self.conv2 = ConvBN(ch_hid, ch_in, kernel_size=3, stride=1, padding=1) 53 | 54 | def forward(self, x): 55 | out = self.conv1(x) 56 | out = self.conv2(out) 57 | return out + x 58 | 59 | 60 | class Darknet19(nn.Module): 61 | def __init__(self, size): 62 | super().__init__() 63 | self.conv = ConvBN(3, 32, kernel_size=3, stride=1, padding=1) 64 | self.layer1 = self._make_layer1() 65 | self.layer2 = self._make_layer2() 66 | self.layer3 = self._make_layer3() 67 | self.layer4 = self._make_layer4() 68 | self.layer5 = self._make_layer5() 69 | self.extras = nn.ModuleList(add_extras(str(size), 1024)) 70 | 71 | def _make_layer1(self): 72 | layers = [ 73 | nn.MaxPool2d(kernel_size=2, stride=2), 74 | ConvBN(32, 64, kernel_size=3, stride=1, padding=1) 75 | ] 76 | return nn.Sequential(*layers) 77 | 78 | def _make_layer2(self): 79 | layers = [ 80 | nn.MaxPool2d(kernel_size=2, stride=2), 81 | ConvBN(64, 128, kernel_size=3, stride=1, padding=1), 82 | ConvBN(128, 64, kernel_size=1, stride=1), 83 | ConvBN(64, 128, kernel_size=3, stride=1, padding=1) 84 | ] 85 | return nn.Sequential(*layers) 86 | 87 | def _make_layer3(self): 88 | layers = [ 89 | nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True), 90 | ConvBN(128, 256, kernel_size=3, stride=1, padding=1), 91 | ConvBN(256, 128, kernel_size=1, stride=1), 92 | ConvBN(128, 256, kernel_size=3, stride=1, padding=1) 93 | ] 94 | return nn.Sequential(*layers) 95 | 96 | def _make_layer4(self): 97 | layers = [ 98 | nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True), 99 | ConvBN(256, 512, kernel_size=3, stride=1, padding=1), 
100 | ConvBN(512, 256, kernel_size=1, stride=1), 101 | ConvBN(256, 512, kernel_size=3, stride=1, padding=1), 102 | ConvBN(512, 256, kernel_size=1, stride=1), 103 | ConvBN(256, 512, kernel_size=3, stride=1, padding=1) 104 | ] 105 | return nn.Sequential(*layers) 106 | 107 | def _make_layer5(self): 108 | layers = [ 109 | nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True), 110 | ConvBN(512, 1024, kernel_size=3, stride=1, padding=1), 111 | ConvBN(1024, 512, kernel_size=1, stride=1), 112 | ConvBN(512, 1024, kernel_size=3, stride=1, padding=1), 113 | ConvBN(1024, 512, kernel_size=1, stride=1), 114 | ConvBN(512, 1024, kernel_size=3, stride=1, padding=1) 115 | ] 116 | return nn.Sequential(*layers) 117 | 118 | def forward(self, x): 119 | out = self.conv(x) 120 | c1 = self.layer1(out) 121 | c2 = self.layer2(c1) 122 | c3 = self.layer3(c2) 123 | c4 = self.layer4(c3) 124 | c5 = self.layer5(c4) 125 | sources = [c3, c4, c5] 126 | x = c5 127 | for k, v in enumerate(self.extras): 128 | x = F.relu(v(x), inplace=True) 129 | if k % 2 == 1: 130 | sources.append(x) 131 | return sources 132 | 133 | 134 | class Darknet53(nn.Module): 135 | def __init__(self, num_blocks, size): 136 | super().__init__() 137 | self.conv = ConvBN(3, 32, kernel_size=3, stride=1, padding=1) 138 | self.layer1 = self._make_layer(32, num_blocks[0], stride=2) 139 | self.layer2 = self._make_layer(64, num_blocks[1], stride=2) 140 | self.layer3 = self._make_layer(128, num_blocks[2], stride=2) 141 | self.layer4 = self._make_layer(256, num_blocks[3], stride=2) 142 | self.layer5 = self._make_layer(512, num_blocks[4], stride=2) 143 | self.extras = nn.ModuleList(add_extras(str(size), 1024)) 144 | self._init_modules() 145 | 146 | def _make_layer(self, ch_in, num_blocks, stride=1): 147 | layers = [ConvBN(ch_in, ch_in * 2, stride=stride, padding=1)] 148 | for i in range(num_blocks): 149 | layers.append(DarknetBlock(ch_in * 2)) 150 | return nn.Sequential(*layers) 151 | 152 | def _init_modules(self): 153 | self.extras.apply(weights_init) 154 | 155 | def forward(self, x): 156 | out = self.conv(x) 157 | c1 = self.layer1(out) 158 | c2 = self.layer2(c1) 159 | c3 = self.layer3(c2) 160 | c4 = self.layer4(c3) 161 | c5 = self.layer5(c4) 162 | sources = [c3, c4, c5] 163 | x = c5 164 | for k, v in enumerate(self.extras): 165 | x = F.relu(v(x), inplace=True) 166 | if k % 2 == 1: 167 | sources.append(x) 168 | return sources 169 | 170 | 171 | def SSDarknet53(size, channel_size='48'): 172 | return Darknet53([1, 2, 8, 8, 4], size) 173 | 174 | 175 | def SSDarknet19(size, channel_size='48'): 176 | return Darknet19(size) 177 | 178 | 179 | if __name__ == "__main__": 180 | import os 181 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 182 | model3 = SSDarknet19(size=300) 183 | with torch.no_grad(): 184 | model3.eval() 185 | x = torch.randn(16, 3, 300, 300) 186 | model3.cuda() 187 | model3(x.cuda()) 188 | import time 189 | st = time.time() 190 | for i in range(100): 191 | model3(x.cuda()) 192 | print(time.time() - st) 193 | -------------------------------------------------------------------------------- /models/drf_res.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import models.dense_conv 8 | from torch.autograd import Variable 9 | from models.model_helper import weights_init 10 | 11 | 12 | def add_extras(size, in_channel, batch_norm=False): 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, 
kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | class Bottleneck(nn.Module): 31 | expansion = 4 32 | 33 | def __init__(self, in_planes, planes, stride=1): 34 | super(Bottleneck, self).__init__() 35 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm2d(planes) 37 | self.conv2 = nn.Conv2d( 38 | planes, 39 | planes, 40 | kernel_size=3, 41 | stride=stride, 42 | padding=1, 43 | bias=False) 44 | self.bn2 = nn.BatchNorm2d(planes) 45 | self.conv3 = nn.Conv2d( 46 | planes, self.expansion * planes, kernel_size=1, bias=False) 47 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 48 | 49 | self.downsample = nn.Sequential() 50 | if stride != 1 or in_planes != self.expansion * planes: 51 | self.downsample = nn.Sequential( 52 | nn.Conv2d( 53 | in_planes, 54 | self.expansion * planes, 55 | kernel_size=1, 56 | stride=stride, 57 | bias=False), nn.BatchNorm2d(self.expansion * planes)) 58 | 59 | def forward(self, x): 60 | out = F.relu(self.bn1(self.conv1(x))) 61 | out = F.relu(self.bn2(self.conv2(out))) 62 | out = self.bn3(self.conv3(out)) 63 | out += self.downsample(x) 64 | out = F.relu(out) 65 | return out 66 | 67 | 68 | class DenseSSDResnet(nn.Module): 69 | def __init__(self, block, num_blocks, size='300', channel_size='48'): 70 | super(DenseSSDResnet, self).__init__() 71 | self.in_planes = 64 72 | 73 | self.conv1 = nn.Conv2d( 74 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 75 | self.bn1 = nn.BatchNorm2d(64) 76 | 77 | # Bottom-up layers 78 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 79 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 80 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 81 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 82 | 83 | self.extras = nn.ModuleList(add_extras(str(size), 2048)) 84 | 85 | dense_list = models.dense_conv.dense_list_res(channel_size, size) 86 | self.dense_list0 = nn.ModuleList(dense_list[0]) 87 | self.dense_list1 = nn.ModuleList(dense_list[1]) 88 | self.dense_list2 = nn.ModuleList(dense_list[2]) 89 | self.dense_list3 = nn.ModuleList(dense_list[3]) 90 | self.dense_list4 = nn.ModuleList(dense_list[4]) 91 | self.dense_list5 = nn.ModuleList(dense_list[5]) 92 | self.smooth1 = nn.Conv2d(2048, 512, kernel_size=3, stride=1, padding=1) 93 | self._init_modules() 94 | 95 | def _make_layer(self, block, planes, num_blocks, stride): 96 | strides = [stride] + [1] * (num_blocks - 1) 97 | layers = [] 98 | for stride in strides: 99 | layers.append(block(self.in_planes, planes, stride)) 100 | self.in_planes = planes * block.expansion 101 | return nn.Sequential(*layers) 102 | 103 | def _init_modules(self): 104 | self.extras.apply(weights_init) 105 | self.dense_list0.apply(weights_init) 106 | self.dense_list1.apply(weights_init) 107 | self.dense_list2.apply(weights_init) 108 | 
self.dense_list3.apply(weights_init) 109 | self.dense_list4.apply(weights_init) 110 | self.dense_list5.apply(weights_init) 111 | self.smooth1.apply(weights_init) 112 | 113 | def forward(self, x): 114 | # Bottom-up 115 | c1 = F.relu(self.bn1(self.conv1(x))) 116 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 117 | 118 | c2 = self.layer1(c1) 119 | dense1_p1 = self.dense_list0[0](c2) 120 | dense1_p2 = self.dense_list0[1](dense1_p1) 121 | dense1_p3 = self.dense_list0[2](dense1_p2) 122 | dense1_p1_conv = self.dense_list0[3](dense1_p1) 123 | dense1_p2_conv = self.dense_list0[4](dense1_p2) 124 | dense1_p3_conv = self.dense_list0[5](dense1_p3) 125 | 126 | c3 = self.layer2(c2) 127 | dense2_p1 = self.dense_list1[0](c3) 128 | dense2_p2 = self.dense_list1[1](dense2_p1) 129 | dense2_p3 = self.dense_list1[2](dense2_p2) 130 | dense2_p1_conv = self.dense_list1[3](dense2_p1) 131 | dense2_p2_conv = self.dense_list1[4](dense2_p2) 132 | dense2_p3_conv = self.dense_list1[5](dense2_p3) 133 | 134 | c4 = self.layer3(c3) 135 | dense3_up_conv = self.dense_list2[0](c4) 136 | dense3_up = self.dense_list2[1](dense3_up_conv) 137 | dense3_p1 = self.dense_list2[2](c4) 138 | dense3_p2 = self.dense_list2[3](dense3_p1) 139 | dense3_p1_conv = self.dense_list2[4](dense3_p1) 140 | dense3_p2_conv = self.dense_list2[5](dense3_p2) 141 | 142 | c5 = self.layer4(c4) 143 | c5_ = self.smooth1(c5) 144 | dense4_up1_conv = self.dense_list3[0](c5) 145 | dense4_up2_conv = self.dense_list3[1](c5) 146 | dense4_up1 = self.dense_list3[2](dense4_up1_conv) 147 | dense4_up2 = self.dense_list3[3](dense4_up2_conv) 148 | dense4_p = self.dense_list3[4](c5) 149 | dense4_p_conv = self.dense_list3[5](dense4_p) 150 | 151 | p6 = F.relu(self.extras[0](c5), inplace=True) 152 | p6 = F.relu(self.extras[1](p6), inplace=True) 153 | 154 | x = p6 155 | 156 | dense5_up1_conv = self.dense_list4[0](p6) 157 | dense5_up2_conv = self.dense_list4[1](p6) 158 | dense5_up3_conv = self.dense_list4[2](p6) 159 | dense5_up1 = self.dense_list4[3](dense5_up1_conv) 160 | dense5_up2 = self.dense_list4[4](dense5_up2_conv) 161 | dense5_up3 = self.dense_list4[5](dense5_up3_conv) 162 | 163 | dense_out1 = torch.cat( 164 | (dense1_p1_conv, c3, dense3_up, dense4_up2, dense5_up3), 1) 165 | dense_out1 = F.relu(self.dense_list5[0](dense_out1)) 166 | 167 | dense_out2 = torch.cat( 168 | (dense1_p2_conv, dense2_p1_conv, c4, dense4_up1, dense5_up2), 1) 169 | dense_out2 = F.relu(self.dense_list5[1](dense_out2)) 170 | 171 | dense_out3 = torch.cat( 172 | (dense1_p3_conv, dense2_p2_conv, dense3_p1_conv, c5_, dense5_up1), 173 | 1) 174 | dense_out3 = F.relu(self.dense_list5[2](dense_out3)) 175 | 176 | dense_out4 = torch.cat( 177 | (dense2_p3_conv, dense3_p2_conv, dense4_p_conv, p6), 1) 178 | dense_out4 = F.relu(self.dense_list5[3](dense_out4)) 179 | 180 | sources = [dense_out1, dense_out2, dense_out3, dense_out4] 181 | # apply extra layers and cache source layer outputs 182 | for k, v in enumerate(self.extras): 183 | if k > 1: 184 | x = F.relu(v(x), inplace=True) 185 | if k % 2 == 1: 186 | sources.append(x) 187 | 188 | return sources 189 | 190 | 191 | def DRFSSDRes50(size, channel_size='48'): 192 | return DenseSSDResnet(Bottleneck, [3, 4, 6, 3], size, channel_size) 193 | 194 | 195 | def DRFSSDRes101(size, channel_size='48'): 196 | return DenseSSDResnet(Bottleneck, [3, 4, 23, 3], size, channel_size) 197 | 198 | 199 | def DRFSSDRes152(size, channel_size='48'): 200 | return DenseSSDResnet(Bottleneck, [3, 8, 36, 3], size, channel_size) 201 | 
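Unlike darknet.py above and resnet.py below, drf_res.py has no self-test block. A minimal sketch in the same style, assuming a CUDA device is available and that the dense blocks in models/dense_conv.py (not shown here) accept a 300x300 input; for the '300' configuration the forward pass should return six feature maps.

if __name__ == "__main__":
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    model = DRFSSDRes50(size='300')
    with torch.no_grad():
        model.eval()
        x = torch.randn(1, 3, 300, 300)
        model.cuda()
        sources = model(x.cuda())
        for s in sources:
            print(s.size())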
-------------------------------------------------------------------------------- /models/mobilenetv2.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn import init 7 | from models.model_helper import weights_init 8 | 9 | 10 | def add_extras(size, in_channel, batch_norm=False): 11 | # Extra layers added to resnet for feature scaling 12 | layers = [] 13 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 14 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 15 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 16 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 17 | if size == '300': 18 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 19 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 20 | else: 21 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 22 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 23 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 24 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 25 | 26 | return layers 27 | 28 | 29 | def _make_divisible(v, divisor, min_value=None): 30 | """ 31 | This function is taken from the original tf repo. 32 | It ensures that all layers have a channel number that is divisible by 8 33 | It can be seen here: 34 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 35 | :param v: 36 | :param divisor: 37 | :param min_value: 38 | :return: 39 | """ 40 | if min_value is None: 41 | min_value = divisor 42 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 43 | # Make sure that round down does not go down by more than 10%. 44 | if new_v < 0.9 * v: 45 | new_v += divisor 46 | return new_v 47 | 48 | 49 | class LinearBottleneck(nn.Module): 50 | def __init__(self, inplanes, outplanes, stride=1, t=6, 51 | activation=nn.ReLU6): 52 | super(LinearBottleneck, self).__init__() 53 | self.conv1 = nn.Conv2d( 54 | inplanes, inplanes * t, kernel_size=1, bias=False) 55 | self.bn1 = nn.BatchNorm2d(inplanes * t) 56 | self.conv2 = nn.Conv2d( 57 | inplanes * t, 58 | inplanes * t, 59 | kernel_size=3, 60 | stride=stride, 61 | padding=1, 62 | bias=False, 63 | groups=inplanes * t) 64 | self.bn2 = nn.BatchNorm2d(inplanes * t) 65 | self.conv3 = nn.Conv2d( 66 | inplanes * t, outplanes, kernel_size=1, bias=False) 67 | self.bn3 = nn.BatchNorm2d(outplanes) 68 | self.activation = activation(inplace=True) 69 | self.stride = stride 70 | self.t = t 71 | self.inplanes = inplanes 72 | self.outplanes = outplanes 73 | 74 | def forward(self, x): 75 | residual = x 76 | 77 | out = self.conv1(x) 78 | out = self.bn1(out) 79 | out = self.activation(out) 80 | 81 | out = self.conv2(out) 82 | out = self.bn2(out) 83 | out = self.activation(out) 84 | 85 | out = self.conv3(out) 86 | out = self.bn3(out) 87 | 88 | if self.stride == 1 and self.inplanes == self.outplanes: 89 | out += residual 90 | 91 | return out 92 | 93 | 94 | class MobileNet2(nn.Module): 95 | """MobileNet2 implementation. 96 | """ 97 | 98 | def __init__(self, 99 | scale=1.0, 100 | input_size=224, 101 | t=6, 102 | in_channels=3, 103 | size=300, 104 | activation=nn.ReLU6): 105 | """ 106 | MobileNet2 constructor. 107 | :param in_channels: (int, optional): number of channels in the input tensor. 108 | Default is 3 for RGB image inputs. 
109 | :param input_size: 110 | :param num_classes: number of classes to predict. Default 111 | is 1000 for ImageNet. 112 | :param scale: 113 | :param t: 114 | :param activation: 115 | """ 116 | 117 | super(MobileNet2, self).__init__() 118 | 119 | self.scale = scale 120 | self.t = t 121 | self.activation_type = activation 122 | self.activation = activation(inplace=True) 123 | self.size = size 124 | 125 | self.num_of_channels = [32, 16, 24, 32, 64, 96, 160, 320] 126 | # assert (input_size % 32 == 0) 127 | 128 | self.c = [ 129 | _make_divisible(ch * self.scale, 8) for ch in self.num_of_channels 130 | ] 131 | self.n = [1, 1, 2, 3, 4, 3, 3, 1] 132 | self.s = [2, 1, 2, 2, 2, 1, 2, 1] 133 | self.conv1 = nn.Conv2d( 134 | in_channels, 135 | self.c[0], 136 | kernel_size=3, 137 | bias=False, 138 | stride=self.s[0], 139 | padding=1) 140 | self.bn1 = nn.BatchNorm2d(self.c[0]) 141 | # self.bottlenecks = self._make_bottlenecks() 142 | self.bottlenecks = nn.ModuleList(self._make_bottlenecks()) 143 | 144 | # Last convolution has 1280 output channels for scale <= 1 145 | self.last_conv_out_ch = 1280 if self.scale <= 1 else _make_divisible( 146 | 1280 * self.scale, 8) 147 | self.conv_last = nn.Conv2d( 148 | self.c[-1], self.last_conv_out_ch, kernel_size=1, bias=False) 149 | self.bn_last = nn.BatchNorm2d(self.last_conv_out_ch) 150 | 151 | self.extras = nn.ModuleList( 152 | add_extras(str(self.size), self.last_conv_out_ch)) 153 | self._init_modules() 154 | 155 | def _init_modules(self): 156 | self.extras.apply(weights_init) 157 | 158 | def _make_stage(self, inplanes, outplanes, n, stride, t, stage): 159 | modules = OrderedDict() 160 | stage_name = "LinearBottleneck{}".format(stage) 161 | 162 | # First module is the only one utilizing stride 163 | first_module = LinearBottleneck( 164 | inplanes=inplanes, 165 | outplanes=outplanes, 166 | stride=stride, 167 | t=t, 168 | activation=self.activation_type) 169 | modules[stage_name + "_0"] = first_module 170 | 171 | # add more LinearBottleneck depending on number of repeats 172 | for i in range(n - 1): 173 | name = stage_name + "_{}".format(i + 1) 174 | module = LinearBottleneck( 175 | inplanes=outplanes, 176 | outplanes=outplanes, 177 | stride=1, 178 | t=6, 179 | activation=self.activation_type) 180 | modules[name] = module 181 | return nn.Sequential(modules) 182 | 183 | def _make_bottlenecks(self): 184 | modules = list() 185 | stage_name = "Bottlenecks" 186 | 187 | # First module is the only one with t=1 188 | bottleneck1 = self._make_stage( 189 | inplanes=self.c[0], 190 | outplanes=self.c[1], 191 | n=self.n[1], 192 | stride=self.s[1], 193 | t=1, 194 | stage=0) 195 | modules.append(bottleneck1) 196 | 197 | # add more LinearBottleneck depending on number of repeats 198 | for i in range(1, len(self.c) - 1): 199 | name = stage_name + "_{}".format(i) 200 | module = self._make_stage( 201 | inplanes=self.c[i], 202 | outplanes=self.c[i + 1], 203 | n=self.n[i + 1], 204 | stride=self.s[i + 1], 205 | t=self.t, 206 | stage=i) 207 | modules += module 208 | 209 | return modules 210 | 211 | def forward(self, x): 212 | x = self.conv1(x) 213 | x = self.bn1(x) 214 | x = self.activation(x) 215 | 216 | sources = list() 217 | for i in range(6): 218 | x = self.bottlenecks[i](x) 219 | sources.append(x) 220 | for i in range(6, 13): 221 | x = self.bottlenecks[i](x) 222 | sources.append(x) 223 | for i in range(13, len(self.bottlenecks)): 224 | x = self.bottlenecks[i](x) 225 | x = self.conv_last(x) 226 | x = self.bn_last(x) 227 | x = self.activation(x) 228 | sources.append(x) 229 | for k, v 
in enumerate(self.extras):
230 |             x = F.relu(v(x), inplace=True)
231 |             if k % 2 == 1:
232 |                 sources.append(x)
233 |         return sources
234 | 
235 | 
236 | def SSDMobilenetv2(size, channel_size='48'):
237 |     return MobileNet2(size=size)
238 | 
239 | 
240 | if __name__ == "__main__":
241 |     import os
242 |     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
243 |     model3 = MobileNet2(size=300)
244 |     with torch.no_grad():
245 |         model3.eval()
246 |         x = torch.randn(16, 3, 300, 300)
247 |         model3.cuda()
248 |         model3(x.cuda())
249 |         import time
250 |         st = time.time()
251 |         for i in range(100):
252 |             model3(x.cuda())
253 |         print(time.time() - st)
254 |         # print(model3(x))
255 | 
--------------------------------------------------------------------------------
/models/model_builder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Written by yq_yao
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | from layers import *
9 | import os
10 | from models.model_helper import weights_init
11 | import importlib
12 | from layers.functions.prior_layer import PriorLayer
13 | 
14 | 
15 | def get_func(func_name):
16 |     """Helper to return a function object by name. func_name must identify a
17 |     function in this module or the path to a function relative to the base
18 |     'models' package.
19 |     """
20 |     if func_name == '':
21 |         return None
22 |     try:
23 |         parts = func_name.split('.')
24 |         # Refers to a function in this module
25 |         if len(parts) == 1:
26 |             return globals()[parts[0]]
27 |         # Otherwise, assume we're referencing a module under models
28 |         module_name = 'models.' + '.'.join(parts[:-1])
29 |         module = importlib.import_module(module_name)
30 |         return getattr(module, parts[-1])
31 |     except Exception:
32 |         print('Failed to find function: %s' % func_name)
33 |         raise
34 | 
35 | 
36 | class SSD(nn.Module):
37 |     """Single Shot Multibox Architecture
38 |     The network is composed of a base VGG network followed by the
39 |     added multibox conv layers. Each multibox layer branches into
40 |         1) conv2d for class conf scores
41 |         2) conv2d for localization predictions
42 |         3) associated priorbox layer to produce default bounding
43 |            boxes specific to the layer's feature map size.
44 |     See: https://arxiv.org/pdf/1512.02325.pdf for more details.
45 | 46 | Args: 47 | phase: (string) Can be "test" or "train" 48 | base: VGG16 layers for input, size of either 300 or 500 49 | extras: extra layers that feed to multibox loc and conf layers 50 | head: "multibox head" consists of loc and conf conv layers 51 | """ 52 | 53 | def _init_modules(self): 54 | self.arm_loc.apply(weights_init) 55 | self.arm_conf.apply(weights_init) 56 | if self.cfg.MODEL.REFINE: 57 | self.odm_loc.apply(weights_init) 58 | self.odm_conf.apply(weights_init) 59 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 60 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 61 | print("load pretrain model {}".format( 62 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 63 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 64 | self.extractor.vgg.load_state_dict(weights) 65 | else: 66 | self.extractor.load_state_dict(weights, strict=False) 67 | 68 | def __init__(self, cfg): 69 | super(SSD, self).__init__() 70 | self.cfg = cfg 71 | self.size = cfg.MODEL.SIZE 72 | if self.size == '300': 73 | size_cfg = cfg.SMALL 74 | else: 75 | size_cfg = cfg.BIG 76 | self.num_classes = cfg.MODEL.NUM_CLASSES 77 | self.prior_layer = PriorLayer(cfg) 78 | self.priorbox = PriorBox(cfg) 79 | self.priors = self.priorbox.forward() 80 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 81 | cfg.TRAIN.CHANNEL_SIZE) 82 | if cfg.MODEL.REFINE: 83 | self.odm_channels = size_cfg.ODM_CHANNELS 84 | self.arm_num_classes = 2 85 | self.odm_loc = nn.ModuleList() 86 | self.odm_conf = nn.ModuleList() 87 | self.arm_loc = nn.ModuleList() 88 | self.arm_conf = nn.ModuleList() 89 | self.arm_channels = size_cfg.ARM_CHANNELS 90 | self.num_anchors = size_cfg.NUM_ANCHORS 91 | self.input_fixed = size_cfg.INPUT_FIXED 92 | self.arm_loc = nn.ModuleList() 93 | self.arm_conf = nn.ModuleList() 94 | for i in range(len(self.arm_channels)): 95 | if cfg.MODEL.REFINE: 96 | self.arm_loc += [ 97 | nn.Conv2d( 98 | self.arm_channels[i], 99 | self.num_anchors[i] * 4, 100 | kernel_size=3, 101 | padding=1) 102 | ] 103 | self.arm_conf += [ 104 | nn.Conv2d( 105 | self.arm_channels[i], 106 | self.num_anchors[i] * self.arm_num_classes, 107 | kernel_size=3, 108 | padding=1) 109 | ] 110 | self.odm_loc += [ 111 | nn.Conv2d( 112 | self.odm_channels[i], 113 | self.num_anchors[i] * 4, 114 | kernel_size=3, 115 | padding=1) 116 | ] 117 | self.odm_conf += [ 118 | nn.Conv2d( 119 | self.odm_channels[i], 120 | self.num_anchors[i] * self.num_classes, 121 | kernel_size=3, 122 | padding=1) 123 | ] 124 | else: 125 | self.arm_loc += [ 126 | nn.Conv2d( 127 | self.arm_channels[i], 128 | self.num_anchors[i] * 4, 129 | kernel_size=3, 130 | padding=1) 131 | ] 132 | self.arm_conf += [ 133 | nn.Conv2d( 134 | self.arm_channels[i], 135 | self.num_anchors[i] * self.num_classes, 136 | kernel_size=3, 137 | padding=1) 138 | ] 139 | if cfg.TRAIN.TRAIN_ON: 140 | self._init_modules() 141 | 142 | def forward(self, x): 143 | 144 | arm_loc = list() 145 | arm_conf = list() 146 | if self.cfg.MODEL.REFINE: 147 | odm_loc = list() 148 | odm_conf = list() 149 | arm_xs, odm_xs = self.extractor(x) 150 | for (x, l, c) in zip(odm_xs, self.odm_loc, self.odm_conf): 151 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 152 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 153 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 154 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 155 | else: 156 | arm_xs = self.extractor(x) 157 | img_wh = (x.size(3), x.size(2)) 158 | feature_maps_wh = [(t.size(3), t.size(2)) for t in arm_xs] 159 | for (x, l, c) in zip(arm_xs, 
self.arm_loc, self.arm_conf): 160 | arm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 161 | arm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 162 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 163 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 164 | if self.cfg.MODEL.REFINE: 165 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 166 | arm_conf.view( 167 | arm_conf.size(0), -1, self.arm_num_classes), 168 | odm_loc.view(odm_loc.size(0), -1, 4), 169 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 170 | self.priors if self.input_fixed else self.prior_layer( 171 | img_wh, feature_maps_wh)) 172 | else: 173 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 174 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 175 | self.priors if self.input_fixed else self.prior_layer( 176 | img_wh, feature_maps_wh)) 177 | return output 178 | -------------------------------------------------------------------------------- /models/refine_res.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import FpnAdapter, weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | return layers 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | "3x3 convolution with padding" 21 | return nn.Conv2d( 22 | in_planes, 23 | out_planes, 24 | kernel_size=3, 25 | stride=stride, 26 | padding=1, 27 | bias=False) 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm2d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm2d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | out += residual 56 | out = self.relu(out) 57 | 58 | return out 59 | 60 | 61 | class Bottleneck(nn.Module): 62 | expansion = 4 63 | 64 | def __init__(self, inplanes, planes, stride=1, downsample=None): 65 | super(Bottleneck, self).__init__() 66 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 67 | self.bn1 = nn.BatchNorm2d(planes) 68 | self.conv2 = nn.Conv2d( 69 | planes, 70 | planes, 71 | kernel_size=3, 72 | stride=stride, 73 | padding=1, 74 | bias=False) 75 | self.bn2 = nn.BatchNorm2d(planes) 76 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 77 | self.bn3 = nn.BatchNorm2d(planes * 4) 78 | self.relu = nn.ReLU(inplace=True) 79 | self.downsample = downsample 80 | self.stride = stride 81 | 82 | def forward(self, x): 83 | residual = x 84 | 85 | out = self.conv1(x) 86 | out = self.bn1(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv2(out) 90 | out = self.bn2(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv3(out) 94 | out 
= self.bn3(out) 95 | 96 | if self.downsample is not None: 97 | residual = self.downsample(x) 98 | 99 | out += residual 100 | out = self.relu(out) 101 | 102 | return out 103 | 104 | 105 | class RefineResnet(nn.Module): 106 | def __init__(self, block, num_blocks, size): 107 | super(RefineResnet, self).__init__() 108 | self.inplanes = 64 109 | 110 | self.conv1 = nn.Conv2d( 111 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 112 | self.bn1 = nn.BatchNorm2d(64) 113 | 114 | # Bottom-up layers 115 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 116 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 117 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 118 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 119 | self.inchannel = block.expansion * 512 120 | self.extras = nn.ModuleList(add_extras(str(size), self.inchannel)) 121 | self.smooth1 = nn.Conv2d( 122 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 123 | self.fpn = FpnAdapter([512, 1024, 512, 256], 4) 124 | self._init_modules() 125 | 126 | def _make_layer(self, block, planes, blocks, stride=1): 127 | downsample = None 128 | if stride != 1 or self.inplanes != planes * block.expansion: 129 | downsample = nn.Sequential( 130 | nn.Conv2d( 131 | self.inplanes, 132 | planes * block.expansion, 133 | kernel_size=1, 134 | stride=stride, 135 | bias=False), 136 | nn.BatchNorm2d(planes * block.expansion), 137 | ) 138 | 139 | layers = [] 140 | layers.append(block(self.inplanes, planes, stride, downsample)) 141 | self.inplanes = planes * block.expansion 142 | for i in range(1, blocks): 143 | layers.append(block(self.inplanes, planes)) 144 | 145 | return nn.Sequential(*layers) 146 | 147 | def _init_modules(self): 148 | self.extras.apply(weights_init) 149 | self.smooth1.apply(weights_init) 150 | 151 | def forward(self, x): 152 | # Bottom-up 153 | odm_sources = list() 154 | c1 = F.relu(self.bn1(self.conv1(x))) 155 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 156 | c2 = self.layer1(c1) 157 | c3 = self.layer2(c2) 158 | c4 = self.layer3(c3) 159 | c5 = self.layer4(c4) 160 | x = c5 161 | c5_ = self.smooth1(c5) 162 | arm_sources = [c3, c4, c5_] 163 | for k, v in enumerate(self.extras): 164 | x = F.relu(v(x), inplace=True) 165 | if k % 2 == 1: 166 | arm_sources.append(x) 167 | odm_sources = self.fpn(arm_sources) 168 | return arm_sources, odm_sources 169 | 170 | 171 | def RefineResnet50(size, channel_size='48'): 172 | return RefineResnet(Bottleneck, [3, 4, 6, 3], size) 173 | 174 | 175 | def RefineResnet101(size, channel_size='48'): 176 | return RefineResnet(Bottleneck, [3, 4, 23, 3], size) 177 | 178 | 179 | def RefineResnet152(size, channel_size='48'): 180 | return RefineResnet(Bottleneck, [3, 8, 36, 3], size) 181 | 182 | 183 | if __name__ == "__main__": 184 | import os 185 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 186 | model = RefineResnet50(size=300) 187 | print(model) 188 | with torch.no_grad(): 189 | model.eval() 190 | x = torch.randn(1, 3, 320, 320) 191 | model.cuda() 192 | model(x.cuda()) 193 | -------------------------------------------------------------------------------- /models/refine_vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import FpnAdapter, WeaveAdapter, 
weights_init 10 | 11 | 12 | class L2Norm(nn.Module): 13 | def __init__(self, n_channels, scale): 14 | super(L2Norm, self).__init__() 15 | self.n_channels = n_channels 16 | self.gamma = scale or None 17 | self.eps = 1e-10 18 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 19 | self.reset_parameters() 20 | 21 | def reset_parameters(self): 22 | init.constant_(self.weight, self.gamma) 23 | 24 | def forward(self, x): 25 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 26 | x = x / norm 27 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 28 | x) * x 29 | return out 30 | 31 | 32 | # This function is derived from torchvision VGG make_layers() 33 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 34 | 35 | 36 | def vgg(cfg, i, batch_norm=False): 37 | layers = [] 38 | in_channels = i 39 | for v in cfg: 40 | if v == 'M': 41 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 42 | elif v == 'C': 43 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 44 | else: 45 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 46 | if batch_norm: 47 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 48 | else: 49 | layers += [conv2d, nn.ReLU(inplace=True)] 50 | in_channels = v 51 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 52 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 53 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 54 | layers += [ 55 | pool5, conv6, 56 | nn.ReLU(inplace=True), conv7, 57 | nn.ReLU(inplace=True) 58 | ] 59 | return layers 60 | 61 | 62 | base = { 63 | '300': [ 64 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 65 | 512, 512, 512 66 | ], 67 | '512': [ 68 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 69 | 512, 512, 512 70 | ], 71 | } 72 | 73 | 74 | def add_extras(size): 75 | layers = [] 76 | layers += [nn.Conv2d(1024, 256, kernel_size=1, stride=1)] 77 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 78 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 79 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 80 | 81 | return layers 82 | 83 | 84 | # def last_layer_trans(): 85 | # return nn.Sequential(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 86 | # nn.ReLU(inplace=True), 87 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 88 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)) 89 | 90 | # def trans_layers(size): 91 | # layers = list() 92 | # layers += [nn.Sequential(nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 93 | # nn.ReLU(inplace=True), 94 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1))] 95 | # layers += [nn.Sequential(nn.Conv2d(1024, 256, kernel_size=3, stride=1, padding=1), 96 | # nn.ReLU(inplace=True), 97 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1))] 98 | # layers += [nn.Sequential(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 99 | # nn.ReLU(inplace=True), 100 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1))] 101 | 102 | # return layers 103 | 104 | # def latent_layers(size): 105 | # layers = [] 106 | # for i in range(3): 107 | # layers += [nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)] 108 | # return layers 109 | 110 | # def up_layers(size): 111 | # layers = [] 112 | # for i in range(3): 113 | # layers += [nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0)] 114 | # return layers 115 | 116 | 117 | class VGG16Extractor(nn.Module): 118 | def 
__init__(self, size, channel_size='48'): 119 | super(VGG16Extractor, self).__init__() 120 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 121 | self.extras = nn.ModuleList(add_extras(str(size))) 122 | self.L2Norm_4_3 = L2Norm(512, 10) 123 | self.L2Norm_5_3 = L2Norm(1024, 8) 124 | # self.last_layer_trans = last_layer_trans() 125 | # self.trans_layers = nn.ModuleList(trans_layers(str(size))) 126 | # self.latent_layers = nn.ModuleList(latent_layers((str(size)))) 127 | # self.up_layers = nn.ModuleList(up_layers(str(size))) 128 | self.fpn = FpnAdapter([512, 1024, 256, 256], 4) 129 | self._init_modules() 130 | 131 | def _init_modules(self): 132 | self.extras.apply(weights_init) 133 | # self.last_layer_trans.apply(weights_init) 134 | # self.trans_layers.apply(weights_init) 135 | # self.latent_layers.apply(weights_init) 136 | # self.up_layers.apply(weights_init) 137 | 138 | def forward(self, x): 139 | """Applies network layers and ops on input image(s) x. 140 | Args: 141 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 142 | Return: 143 | Depending on phase: 144 | test: 145 | Variable(tensor) of output class label predictions, 146 | confidence score, and corresponding location predictions for 147 | each object detected. Shape: [batch,topk,7] 148 | train: 149 | list of concat outputs from: 150 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 151 | 2: localization layers, Shape: [batch,num_priors*4] 152 | 3: priorbox layers, Shape: [2,num_priors*4] 153 | """ 154 | arm_sources = list() 155 | 156 | for i in range(23): 157 | x = self.vgg[i](x) 158 | #38x38 159 | c2 = x 160 | c2 = self.L2Norm_4_3(c2) 161 | arm_sources.append(c2) 162 | 163 | for k in range(23, len(self.vgg)): 164 | x = self.vgg[k](x) 165 | #19x19 166 | c3 = x 167 | c3 = self.L2Norm_5_3(c3) 168 | arm_sources.append(c3) 169 | 170 | # 10x10 171 | x = F.relu(self.extras[0](x), inplace=True) 172 | x = F.relu(self.extras[1](x), inplace=True) 173 | c4 = x 174 | arm_sources.append(c4) 175 | 176 | # 5x5 177 | x = F.relu(self.extras[2](x), inplace=True) 178 | x = F.relu(self.extras[3](x), inplace=True) 179 | c5 = x 180 | arm_sources.append(c5) 181 | 182 | if len(self.extras) > 4: 183 | x = F.relu(self.extras[4](x), inplace=True) 184 | x = F.relu(self.extras[5](x), inplace=True) 185 | c6 = x 186 | arm_sources.append(c6) 187 | 188 | # x = self.last_layer_trans(x) 189 | # odm_sources.append(x) 190 | 191 | # trans_layer_list = list() 192 | 193 | # for(p, t) in zip(arm_sources, self.trans_layers): 194 | # trans_layer_list.append(t(p)) 195 | 196 | # trans_layer_list.reverse() 197 | # for (t, u, l) in zip(trans_layer_list, self.up_layers, self.latent_layers): 198 | # x = F.relu(l(F.relu(u(x)+ t, inplace=True)), inplace=True) 199 | # odm_sources.append(x) 200 | 201 | # odm_sources.reverse() 202 | odm_sources = self.fpn(arm_sources) 203 | return arm_sources, odm_sources 204 | 205 | 206 | def refine_vgg(size, channel_size='48'): 207 | return VGG16Extractor(size) -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += 
[nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | def conv3x3(in_planes, out_planes, stride=1): 31 | "3x3 convolution with padding" 32 | return nn.Conv2d( 33 | in_planes, 34 | out_planes, 35 | kernel_size=3, 36 | stride=stride, 37 | padding=1, 38 | bias=False) 39 | 40 | 41 | class BasicBlock(nn.Module): 42 | expansion = 1 43 | 44 | def __init__(self, inplanes, planes, stride=1, downsample=None): 45 | super(BasicBlock, self).__init__() 46 | self.conv1 = conv3x3(inplanes, planes, stride) 47 | self.bn1 = nn.BatchNorm2d(planes) 48 | self.relu = nn.ReLU(inplace=True) 49 | self.conv2 = conv3x3(planes, planes) 50 | self.bn2 = nn.BatchNorm2d(planes) 51 | self.downsample = downsample 52 | self.stride = stride 53 | 54 | def forward(self, x): 55 | residual = x 56 | 57 | out = self.conv1(x) 58 | out = self.bn1(out) 59 | out = self.relu(out) 60 | 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | 64 | if self.downsample is not None: 65 | residual = self.downsample(x) 66 | out += residual 67 | out = self.relu(out) 68 | 69 | return out 70 | 71 | 72 | class Bottleneck(nn.Module): 73 | expansion = 4 74 | 75 | def __init__(self, inplanes, planes, stride=1, downsample=None): 76 | super(Bottleneck, self).__init__() 77 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 78 | self.bn1 = nn.BatchNorm2d(planes) 79 | self.conv2 = nn.Conv2d( 80 | planes, 81 | planes, 82 | kernel_size=3, 83 | stride=stride, 84 | padding=1, 85 | bias=False) 86 | self.bn2 = nn.BatchNorm2d(planes) 87 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 88 | self.bn3 = nn.BatchNorm2d(planes * 4) 89 | self.relu = nn.ReLU(inplace=True) 90 | self.downsample = downsample 91 | self.stride = stride 92 | 93 | def forward(self, x): 94 | residual = x 95 | 96 | out = self.conv1(x) 97 | out = self.bn1(out) 98 | out = self.relu(out) 99 | 100 | out = self.conv2(out) 101 | out = self.bn2(out) 102 | out = self.relu(out) 103 | 104 | out = self.conv3(out) 105 | out = self.bn3(out) 106 | 107 | if self.downsample is not None: 108 | residual = self.downsample(x) 109 | 110 | out += residual 111 | out = self.relu(out) 112 | 113 | return out 114 | 115 | 116 | class SSDResnet(nn.Module): 117 | def __init__(self, block, num_blocks, size): 118 | super(SSDResnet, self).__init__() 119 | self.inplanes = 64 120 | 121 | self.conv1 = nn.Conv2d( 122 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 123 | self.bn1 = nn.BatchNorm2d(64) 124 | 125 | # Bottom-up layers 126 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 127 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 128 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 129 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 130 | self.inchannel = block.expansion * 512 131 | self.extras = 
nn.ModuleList(add_extras(str(size), self.inchannel)) 132 | self.smooth1 = nn.Conv2d( 133 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 134 | self._init_modules() 135 | 136 | def _make_layer(self, block, planes, blocks, stride=1): 137 | downsample = None 138 | if stride != 1 or self.inplanes != planes * block.expansion: 139 | downsample = nn.Sequential( 140 | nn.Conv2d( 141 | self.inplanes, 142 | planes * block.expansion, 143 | kernel_size=1, 144 | stride=stride, 145 | bias=False), 146 | nn.BatchNorm2d(planes * block.expansion), 147 | ) 148 | 149 | layers = [] 150 | layers.append(block(self.inplanes, planes, stride, downsample)) 151 | self.inplanes = planes * block.expansion 152 | for i in range(1, blocks): 153 | layers.append(block(self.inplanes, planes)) 154 | 155 | return nn.Sequential(*layers) 156 | 157 | def _init_modules(self): 158 | self.extras.apply(weights_init) 159 | self.smooth1.apply(weights_init) 160 | 161 | def forward(self, x): 162 | # Bottom-up 163 | c1 = F.relu(self.bn1(self.conv1(x))) 164 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 165 | c2 = self.layer1(c1) 166 | c3 = self.layer2(c2) 167 | c4 = self.layer3(c3) 168 | c5 = self.layer4(c4) 169 | x = c5 170 | c5_ = self.smooth1(c5) 171 | sources = [c3, c4, c5_] 172 | for k, v in enumerate(self.extras): 173 | x = F.relu(v(x), inplace=True) 174 | if k % 2 == 1: 175 | sources.append(x) 176 | return sources 177 | 178 | 179 | def SSDResnet18(size, channel_size='48'): 180 | return SSDResnet(BasicBlock, [2, 2, 2, 2], size) 181 | 182 | 183 | def SSDResnet34(size, channel_size='48'): 184 | return SSDResnet(BasicBlock, [3, 4, 6, 3], size) 185 | 186 | 187 | def SSDResnet50(size, channel_size='48'): 188 | return SSDResnet(Bottleneck, [3, 4, 6, 3], size) 189 | 190 | 191 | def SSDResnet101(size, channel_size='48'): 192 | return SSDResnet(Bottleneck, [3, 4, 23, 3], size) 193 | 194 | 195 | def SSDResnet152(size, channel_size='48'): 196 | return SSDResnet(Bottleneck, [3, 8, 36, 3], size) 197 | 198 | 199 | if __name__ == "__main__": 200 | import os 201 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 202 | model3 = SSDResnet18(size=300) 203 | with torch.no_grad(): 204 | model3.eval() 205 | x = torch.randn(1, 3, 300, 300) 206 | model3.cuda() 207 | model3(x.cuda()) 208 | import time 209 | st = time.time() 210 | for i in range(1): 211 | model3(x.cuda()) 212 | print(time.time() - st) 213 | # print(model3(x)) 214 | -------------------------------------------------------------------------------- /models/vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import weights_init 10 | 11 | 12 | class L2Norm(nn.Module): 13 | def __init__(self, n_channels, scale): 14 | super(L2Norm, self).__init__() 15 | self.n_channels = n_channels 16 | self.gamma = scale or None 17 | self.eps = 1e-10 18 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 19 | self.reset_parameters() 20 | 21 | def reset_parameters(self): 22 | init.constant_(self.weight, self.gamma) 23 | 24 | def forward(self, x): 25 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 26 | x = x / norm 27 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 28 | x) * x 29 | return out 30 | 31 | 32 | # This function is derived from torchvision VGG make_layers() 33 | # 
https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 34 | 35 | 36 | def vgg(cfg, i, batch_norm=False): 37 | layers = [] 38 | in_channels = i 39 | for v in cfg: 40 | if v == 'M': 41 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 42 | elif v == 'C': 43 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 44 | else: 45 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 46 | if batch_norm: 47 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 48 | else: 49 | layers += [conv2d, nn.ReLU(inplace=True)] 50 | in_channels = v 51 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 52 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 53 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 54 | layers += [ 55 | pool5, conv6, 56 | nn.ReLU(inplace=True), conv7, 57 | nn.ReLU(inplace=True) 58 | ] 59 | return layers 60 | 61 | 62 | extras_cfg = { 63 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 64 | '512': [ 65 | 256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128, 'S', 66 | 256 67 | ], 68 | } 69 | 70 | base = { 71 | '300': [ 72 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 73 | 512, 512, 512 74 | ], 75 | '512': [ 76 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 77 | 512, 512, 512 78 | ], 79 | } 80 | 81 | 82 | def add_extras(cfg, i, batch_norm=False): 83 | # Extra layers added to VGG for feature scaling 84 | layers = [] 85 | in_channels = i 86 | flag = False 87 | for k, v in enumerate(cfg): 88 | if in_channels != 'S': 89 | if v == 'S': 90 | layers += [ 91 | nn.Conv2d( 92 | in_channels, 93 | cfg[k + 1], 94 | kernel_size=(1, 3)[flag], 95 | stride=2, 96 | padding=1) 97 | ] 98 | else: 99 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 100 | flag = not flag 101 | in_channels = v 102 | return layers 103 | 104 | 105 | class VGG16Extractor(nn.Module): 106 | def __init__(self, size): 107 | super(VGG16Extractor, self).__init__() 108 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 109 | self.L2Norm = L2Norm(512, 20) 110 | self.extras = nn.ModuleList(add_extras(extras_cfg[str(size)], 1024)) 111 | self._init_modules() 112 | 113 | def _init_modules(self): 114 | self.extras.apply(weights_init) 115 | self.vgg.apply(weights_init) 116 | 117 | def forward(self, x): 118 | """Applies network layers and ops on input image(s) x. 119 | 120 | Args: 121 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 122 | 123 | Return: 124 | Depending on phase: 125 | test: 126 | Variable(tensor) of output class label predictions, 127 | confidence score, and corresponding location predictions for 128 | each object detected. 
Shape: [batch,topk,7] 129 | 130 | train: 131 | list of concat outputs from: 132 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 133 | 2: localization layers, Shape: [batch,num_priors*4] 134 | 3: priorbox layers, Shape: [2,num_priors*4] 135 | """ 136 | sources = list() 137 | 138 | # apply vgg up to conv4_3 relu 139 | for k in range(23): 140 | x = self.vgg[k](x) 141 | 142 | s = self.L2Norm(x) 143 | sources.append(s) 144 | 145 | # apply vgg up to fc7 146 | for k in range(23, len(self.vgg)): 147 | x = self.vgg[k](x) 148 | sources.append(x) 149 | 150 | # apply extra layers and cache source layer outputs 151 | for k, v in enumerate(self.extras): 152 | x = F.relu(v(x), inplace=True) 153 | if k % 2 == 1: 154 | sources.append(x) 155 | return sources 156 | 157 | 158 | def SSDVgg(size, channel_size='48'): 159 | return VGG16Extractor(size) 160 | 161 | 162 | if __name__ == "__main__": 163 | import os 164 | os.environ["CUDA_VISIBLE_DEVICES"] = "3" 165 | with torch.no_grad(): 166 | model3 = VGG16Extractor(300) 167 | model3.eval() 168 | x = torch.randn(16, 3, 300, 300) 169 | model3.cuda() 170 | model3(x.cuda()) 171 | import time 172 | st = time.time() 173 | for i in range(1000): 174 | model3(x.cuda()) 175 | print(time.time() - st) 176 | -------------------------------------------------------------------------------- /models/weave_res.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import FpnAdapter, WeaveAdapter, weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | return layers 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | "3x3 convolution with padding" 21 | return nn.Conv2d( 22 | in_planes, 23 | out_planes, 24 | kernel_size=3, 25 | stride=stride, 26 | padding=1, 27 | bias=False) 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm2d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm2d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | out += residual 56 | out = self.relu(out) 57 | 58 | return out 59 | 60 | 61 | class Bottleneck(nn.Module): 62 | expansion = 4 63 | 64 | def __init__(self, inplanes, planes, stride=1, downsample=None): 65 | super(Bottleneck, self).__init__() 66 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 67 | self.bn1 = nn.BatchNorm2d(planes) 68 | self.conv2 = nn.Conv2d( 69 | planes, 70 | planes, 71 | kernel_size=3, 72 | stride=stride, 73 | padding=1, 74 | bias=False) 75 | self.bn2 = nn.BatchNorm2d(planes) 76 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 77 | self.bn3 = 
nn.BatchNorm2d(planes * 4) 78 | self.relu = nn.ReLU(inplace=True) 79 | self.downsample = downsample 80 | self.stride = stride 81 | 82 | def forward(self, x): 83 | residual = x 84 | 85 | out = self.conv1(x) 86 | out = self.bn1(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv2(out) 90 | out = self.bn2(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv3(out) 94 | out = self.bn3(out) 95 | 96 | if self.downsample is not None: 97 | residual = self.downsample(x) 98 | 99 | out += residual 100 | out = self.relu(out) 101 | 102 | return out 103 | 104 | 105 | class WeaveResnet(nn.Module): 106 | def __init__(self, block, num_blocks, size): 107 | super(WeaveResnet, self).__init__() 108 | self.inplanes = 64 109 | 110 | self.conv1 = nn.Conv2d( 111 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 112 | self.bn1 = nn.BatchNorm2d(64) 113 | 114 | # Bottom-up layers 115 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 116 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 117 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 118 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 119 | self.inchannel = block.expansion * 512 120 | self.extras = nn.ModuleList(add_extras(str(size), self.inchannel)) 121 | self.smooth1 = nn.Conv2d( 122 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 123 | self.weave = WeaveAdapter([512, 1024, 512, 256], 4) 124 | self._init_modules() 125 | 126 | def _make_layer(self, block, planes, blocks, stride=1): 127 | downsample = None 128 | if stride != 1 or self.inplanes != planes * block.expansion: 129 | downsample = nn.Sequential( 130 | nn.Conv2d( 131 | self.inplanes, 132 | planes * block.expansion, 133 | kernel_size=1, 134 | stride=stride, 135 | bias=False), 136 | nn.BatchNorm2d(planes * block.expansion), 137 | ) 138 | 139 | layers = [] 140 | layers.append(block(self.inplanes, planes, stride, downsample)) 141 | self.inplanes = planes * block.expansion 142 | for i in range(1, blocks): 143 | layers.append(block(self.inplanes, planes)) 144 | 145 | return nn.Sequential(*layers) 146 | 147 | def _init_modules(self): 148 | self.extras.apply(weights_init) 149 | self.smooth1.apply(weights_init) 150 | 151 | def forward(self, x): 152 | # Bottom-up 153 | odm_sources = list() 154 | c1 = F.relu(self.bn1(self.conv1(x))) 155 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 156 | c2 = self.layer1(c1) 157 | c3 = self.layer2(c2) 158 | c4 = self.layer3(c3) 159 | c5 = self.layer4(c4) 160 | x = c5 161 | c5_ = self.smooth1(c5) 162 | arm_sources = [c3, c4, c5_] 163 | for k, v in enumerate(self.extras): 164 | x = F.relu(v(x), inplace=True) 165 | if k % 2 == 1: 166 | arm_sources.append(x) 167 | odm_sources = self.weave(arm_sources) 168 | return arm_sources, odm_sources 169 | 170 | 171 | def WeaveResnet50(size, channel_size='48'): 172 | return WeaveResnet(Bottleneck, [3, 4, 6, 3], size) 173 | 174 | 175 | def WeaveResnet101(size, channel_size='48'): 176 | return WeaveResnet(Bottleneck, [3, 4, 23, 3], size) 177 | 178 | 179 | def WeaveResnet152(size, channel_size='48'): 180 | return WeaveResnet(Bottleneck, [3, 8, 36, 3], size) 181 | 182 | 183 | if __name__ == "__main__": 184 | import os 185 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 186 | model = WeaveResnet50(size=300) 187 | print(model) 188 | with torch.no_grad(): 189 | model.eval() 190 | x = torch.randn(1, 3, 320, 320) 191 | model.cuda() 192 | model(x.cuda()) 193 | -------------------------------------------------------------------------------- 
/models/weave_vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import FpnAdapter, WeaveAdapter, weights_init, WeaveAdapter2 10 | # from model_helper import FpnAdapter, WeaveAdapter, weights_init, WeaveAdapter2 11 | 12 | class L2Norm(nn.Module): 13 | def __init__(self, n_channels, scale): 14 | super(L2Norm, self).__init__() 15 | self.n_channels = n_channels 16 | self.gamma = scale or None 17 | self.eps = 1e-10 18 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 19 | self.reset_parameters() 20 | 21 | def reset_parameters(self): 22 | init.constant_(self.weight, self.gamma) 23 | 24 | def forward(self, x): 25 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 26 | x = x / norm 27 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 28 | x) * x 29 | return out 30 | 31 | 32 | # This function is derived from torchvision VGG make_layers() 33 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 34 | 35 | 36 | def vgg(cfg, i, batch_norm=False): 37 | layers = [] 38 | in_channels = i 39 | for v in cfg: 40 | if v == 'M': 41 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 42 | elif v == 'C': 43 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 44 | else: 45 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 46 | if batch_norm: 47 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 48 | else: 49 | layers += [conv2d, nn.ReLU(inplace=True)] 50 | in_channels = v 51 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 52 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 53 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 54 | layers += [ 55 | pool5, conv6, 56 | nn.ReLU(inplace=True), conv7, 57 | nn.ReLU(inplace=True) 58 | ] 59 | return layers 60 | 61 | 62 | base = { 63 | '300': [ 64 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 65 | 512, 512, 512 66 | ], 67 | '512': [ 68 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 69 | 512, 512, 512 70 | ], 71 | } 72 | 73 | 74 | def add_extras(size): 75 | layers = [] 76 | layers += [nn.Conv2d(1024, 256, kernel_size=1, stride=1)] 77 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 78 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 79 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 80 | 81 | return layers 82 | 83 | 84 | class VGG16Extractor(nn.Module): 85 | def __init__(self, size, channel_size='48'): 86 | super(VGG16Extractor, self).__init__() 87 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 88 | self.extras = nn.ModuleList(add_extras(str(size))) 89 | self.L2Norm_4_3 = L2Norm(512, 10) 90 | self.L2Norm_5_3 = L2Norm(1024, 8) 91 | self.raw_channels = [512, 1024, 256, 256] 92 | self.weave_add_channels = [(48, 48), (48, 48), (48, 48), (48, 48)] 93 | self.weave_channels = [256, 256, 256, 256] 94 | # self.weave = WeaveAdapter([512, 1024, 256, 256], 4) 95 | self.weave = WeaveAdapter2(self.raw_channels, self.weave_add_channels, self.weave_channels) 96 | self._init_modules() 97 | 98 | def _init_modules(self): 99 | self.extras.apply(weights_init) 100 | 101 | def forward(self, x): 102 | """Applies network layers and ops on input image(s) x. 
103 | Args: 104 | x: input image or batch of images. Shape: [batch,3,300,300]. 105 | Return: 106 | arm_sources: feature maps feeding the anchor refinement 107 | module (ARM): the L2-normalized conv4_3 (38x38) and fc7 108 | (19x19) outputs plus the extra-layer outputs (10x10, 5x5). 109 | odm_sources: feature maps produced by the weave adapter for 110 | the object detection module (ODM), one per ARM source, 111 | each with 256 channels. 112 | 113 | Unlike the original SSD forward, this extractor always returns 114 | the tuple (arm_sources, odm_sources); the detection heads are 115 | attached elsewhere (see models/model_builder.py). 116 | """ 117 | arm_sources = list() 118 | odm_sources = list() 119 | 120 | for i in range(23): 121 | x = self.vgg[i](x) 122 | #38x38 123 | c2 = x 124 | c2 = self.L2Norm_4_3(c2) 125 | arm_sources.append(c2) 126 | 127 | for k in range(23, len(self.vgg)): 128 | x = self.vgg[k](x) 129 | #19x19 130 | c3 = x 131 | c3 = self.L2Norm_5_3(c3) 132 | arm_sources.append(c3) 133 | 134 | # 10x10 135 | x = F.relu(self.extras[0](x), inplace=True) 136 | x = F.relu(self.extras[1](x), inplace=True) 137 | c4 = x 138 | arm_sources.append(c4) 139 | 140 | # 5x5 141 | x = F.relu(self.extras[2](x), inplace=True) 142 | x = F.relu(self.extras[3](x), inplace=True) 143 | c5 = x 144 | arm_sources.append(c5) 145 | 146 | if len(self.extras) > 4: 147 | x = F.relu(self.extras[4](x), inplace=True) 148 | x = F.relu(self.extras[5](x), inplace=True) 149 | c6 = x 150 | arm_sources.append(c6) 151 | odm_sources = self.weave(arm_sources) 152 | return arm_sources, odm_sources 153 | 154 | 155 | def weave_vgg(size, channel_size='48'): 156 | return VGG16Extractor(size) 157 | 158 | 159 | if __name__ == "__main__": 160 | import os 161 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 162 | model = weave_vgg(size=300) 163 | print(model) 164 | with torch.no_grad(): 165 | model.eval() 166 | x = torch.randn(1, 3, 320, 320) 167 | model.cuda() 168 | model(x.cuda()) -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/utils/__init__.py -------------------------------------------------------------------------------- /utils/averageMeter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | 4 | def __init__(self): 5 | self.reset() 6 | 7 | def reset(self): 8 | self.val = 0 9 | self.avg = 0 10 | self.sum = 0 11 | self.count = 0 12 | 13 | def update(self, val, n=1): 14 | self.val = val 15 | self.sum += val * n 16 | self.count += n 17 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', 44 | os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError( 47 | 'The nvcc binary could not be ' 48 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME' 49 | ) 50 | home = os.path.dirname(os.path.dirname(nvcc)) 51 | 52 | cudaconfig = { 53 | 'home': home, 54 | 'nvcc': nvcc, 55 | 'include': pjoin(home, 'include'), 56 | 'lib64': pjoin(home, 'lib64') 57 | } 58 | for k, v in cudaconfig.items(): 59 | if not os.path.exists(v): 60 | raise EnvironmentError( 61 | 'The CUDA %s path could not be located in %s' % (k, v)) 62 | 63 | return cudaconfig 64 | 65 | 66 | CUDA = locate_cuda() 67 | 68 | # Obtain the numpy include directory. This logic works across numpy versions. 69 | try: 70 | numpy_include = np.get_include() 71 | except AttributeError: 72 | numpy_include = np.get_numpy_include() 73 | 74 | 75 | def customize_compiler_for_nvcc(self): 76 | """inject deep into distutils to customize how the dispatch 77 | to gcc/nvcc works. 78 | 79 | If you subclass UnixCCompiler, it's not trivial to get your subclass 80 | injected in, and still have the right customizations (i.e. 81 | distutils.sysconfig.customize_compiler) run on it. So instead of going 82 | the OO route, I have this. Note, it's kind of like a weird functional 83 | subclassing going on.""" 84 | 85 | # tell the compiler it can process .cu 86 | self.src_extensions.append('.cu') 87 | 88 | # save references to the default compiler_so and _compile methods 89 | default_compiler_so = self.compiler_so 90 | super = self._compile 91 | 92 | # now redefine the _compile method. This gets executed for each 93 | # object but distutils doesn't have the ability to change compilers 94 | # based on source extension: we add it.
95 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 96 | print(extra_postargs) 97 | if os.path.splitext(src)[1] == '.cu': 98 | # use the cuda for .cu files 99 | self.set_executable('compiler_so', CUDA['nvcc']) 100 | # use only a subset of the extra_postargs, which are 1-1 translated 101 | # from the extra_compile_args in the Extension class 102 | postargs = extra_postargs['nvcc'] 103 | else: 104 | postargs = extra_postargs['gcc'] 105 | 106 | super(obj, src, ext, cc_args, postargs, pp_opts) 107 | # reset the default compiler_so, which we might have changed for cuda 108 | self.compiler_so = default_compiler_so 109 | 110 | # inject our redefined _compile method into the class 111 | self._compile = _compile 112 | 113 | 114 | # run the customize_compiler 115 | class custom_build_ext(build_ext): 116 | def build_extensions(self): 117 | customize_compiler_for_nvcc(self.compiler) 118 | build_ext.build_extensions(self) 119 | 120 | 121 | ext_modules = [ 122 | Extension( 123 | "nms.cpu_nms", ["nms/cpu_nms.pyx"], 124 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 125 | include_dirs=[numpy_include]), 126 | Extension( 127 | 'nms.gpu_nms', 128 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 129 | library_dirs=[CUDA['lib64']], 130 | libraries=['cudart'], 131 | language='c++', 132 | runtime_library_dirs=[CUDA['lib64']], 133 | # this syntax is specific to this build system 134 | # we're only going to use certain compiler args with nvcc and not with gcc 135 | # the implementation of this trick is in customize_compiler() below 136 | extra_compile_args={ 137 | 'gcc': ["-Wno-unused-function"], 138 | 'nvcc': [ 139 | '-arch=sm_61', '--ptxas-options=-v', '-c', 140 | '--compiler-options', "'-fPIC'" 141 | ] 142 | }, 143 | include_dirs=[numpy_include, CUDA['include']]) 144 | ] 145 | 146 | setup( 147 | name='mot_utils', 148 | ext_modules=ext_modules, 149 | # inject our custom trigger 150 | cmdclass={'build_ext': custom_build_ext}, 151 | ) 152 | -------------------------------------------------------------------------------- /utils/collections.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | ############################################################################## 15 | """A simple attribute dictionary used for representing configuration options.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | 22 | 23 | class AttrDict(dict): 24 | 25 | IMMUTABLE = '__immutable__' 26 | 27 | def __init__(self, *args, **kwargs): 28 | super(AttrDict, self).__init__(*args, **kwargs) 29 | self.__dict__[AttrDict.IMMUTABLE] = False 30 | 31 | def __getattr__(self, name): 32 | if name in self.__dict__: 33 | return self.__dict__[name] 34 | elif name in self: 35 | return self[name] 36 | else: 37 | raise AttributeError(name) 38 | 39 | def __setattr__(self, name, value): 40 | if not self.__dict__[AttrDict.IMMUTABLE]: 41 | if name in self.__dict__: 42 | self.__dict__[name] = value 43 | else: 44 | self[name] = value 45 | else: 46 | raise AttributeError( 47 | 'Attempted to set "{}" to "{}", but AttrDict is immutable'. 48 | format(name, value)) 49 | 50 | def immutable(self, is_immutable): 51 | """Set immutability to is_immutable and recursively apply the setting 52 | to all nested AttrDicts. 53 | """ 54 | self.__dict__[AttrDict.IMMUTABLE] = is_immutable 55 | # Recursively set immutable state 56 | for v in self.__dict__.values(): 57 | if isinstance(v, AttrDict): 58 | v.immutable(is_immutable) 59 | for v in self.values(): 60 | if isinstance(v, AttrDict): 61 | v.immutable(is_immutable) 62 | 63 | def is_immutable(self): 64 | return self.__dict__[AttrDict.IMMUTABLE] 65 | -------------------------------------------------------------------------------- /utils/get_class_map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | import os.path as osp 5 | 6 | 7 | def check_size(submit_file): 8 | max_size = 60 * 1024 * 1024 9 | if osp.getsize(submit_file) > max_size: 10 | raise IOError( 11 | "File size exceeds the specified maximum size," 12 | " which is 60M for the server."
13 | ) 14 | 15 | 16 | def parse_submission(submit_file): 17 | with open(submit_file, 'r') as f: 18 | lines = f.readlines() 19 | submit_dict = dict() 20 | final_dict = dict() 21 | splitlines = [x.strip().split(' ') for x in lines] 22 | for idx, val in enumerate(splitlines): 23 | cls = str(int(float(val[1]))) 24 | if cls not in submit_dict: 25 | submit_dict[cls] = list() 26 | final_dict[cls] = dict() 27 | submit_dict[cls].append( 28 | [val[0], val[2], val[3], val[4], val[5], val[6]]) 29 | for k, v in submit_dict.items(): 30 | image_ids = [x[0] for x in v] 31 | confidence = np.array([float(x[1]) for x in v]) 32 | BB = np.array([[float(z) for z in x[2:]] for x in v]) 33 | sorted_ind = np.argsort(-confidence) 34 | sorted_scores = np.sort(-confidence) 35 | BB = BB[sorted_ind, :] 36 | image_ids = [image_ids[x] for x in sorted_ind] 37 | final_dict[k]["image_ids"] = image_ids 38 | final_dict[k]["BB"] = np.array(BB) 39 | return final_dict 40 | 41 | 42 | def parse_gt_annotation(gt_file): 43 | with open(gt_file, 'r') as f: 44 | lines = f.readlines() 45 | info = [x.strip().split() for x in lines] 46 | gt = {} 47 | for item in info: 48 | img_id = item[0] 49 | obj_struct = {} 50 | obj_struct['class'] = item[1] 51 | obj_struct['bbox'] = [ 52 | int(item[2]), 53 | int(item[3]), 54 | int(item[4]), 55 | int(item[5]) 56 | ] 57 | if img_id not in gt: 58 | gt[img_id] = list() 59 | gt[img_id].append(obj_struct) 60 | return gt 61 | 62 | 63 | def get_class_recs(recs, classname): 64 | npos = 0 65 | class_recs = {} 66 | for key in recs.keys(): 67 | R = [obj for obj in recs[key] if obj['class'] == classname] 68 | bbox = np.array([x['bbox'] for x in R]) 69 | det = [False] * len(R) 70 | npos += len(R) 71 | class_recs[key] = {'bbox': bbox, 'det': det} 72 | return class_recs, npos 73 | 74 | 75 | def compute_ap(rec, prec): 76 | mrec = np.concatenate(([0.], rec, [1.])) 77 | mpre = np.concatenate(([0.], prec, [0.])) 78 | for i in range(mpre.size - 1, 0, -1): 79 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 80 | i = np.where(mrec[1:] != mrec[:-1])[0] 81 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 82 | return ap 83 | 84 | 85 | def eval(submit_file, gt_file, ovthresh, classname): 86 | recs = parse_gt_annotation(gt_file) 87 | submit_result = parse_submission(submit_file) 88 | # get one class result 89 | class_recs, npos = get_class_recs(recs, classname) 90 | image_ids = submit_result[classname]["image_ids"] 91 | BB = submit_result[classname]["BB"] 92 | nd = len(image_ids) 93 | tp = np.zeros(nd) 94 | fp = np.zeros(nd) 95 | for d in range(nd): 96 | if image_ids[d] not in recs.keys(): 97 | raise KeyError( 98 | "Can not find image {} in the groundtruth file, did you submit the result file for the right dataset?" 99 | .format(image_ids[d])) 100 | for d in range(nd): 101 | R = class_recs[image_ids[d]] 102 | bb = BB[d, :].astype(float) 103 | ovmax = -np.inf 104 | BBGT = R['bbox'].astype(float) 105 | if BBGT.size > 0: 106 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 107 | iymin = np.maximum(BBGT[:, 1], bb[1]) 108 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 109 | iymax = np.minimum(BBGT[:, 3], bb[3]) 110 | iw = np.maximum(ixmax - ixmin + 1., 0.) 111 | ih = np.maximum(iymax - iymin + 1., 0.) 112 | inters = iw * ih 113 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 114 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 115 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 116 | overlaps = inters / uni 117 | ovmax = np.max(overlaps) 118 | jmax = np.argmax(overlaps) 119 | if ovmax > ovthresh: 120 | if not R['det'][jmax]: 121 | tp[d] = 1. 
122 | R['det'][jmax] = 1 123 | else: 124 | fp[d] = 1. 125 | else: 126 | fp[d] = 1. 127 | fp = np.cumsum(fp) 128 | tp = np.cumsum(tp) 129 | rec = tp / float(npos) 130 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 131 | ap = compute_ap(rec, prec) 132 | return ap 133 | 134 | 135 | def result_eval(submit_file, gt, class_list): 136 | ove_aap = [] 137 | for ove in np.arange(0.5, 1.0, 0.05): 138 | cls_aap = [] 139 | for cls in class_list: 140 | ap = eval(submit_file, gt, ove, cls) 141 | cls_aap.append(ap) 142 | cls_mAP = np.average(cls_aap) 143 | print("thresh", round(ove, 3), "map", round(cls_mAP * 100, 3)) 144 | ove_aap.append(cls_mAP) 145 | mAP = np.average(ove_aap) * 100 146 | return round(mAP, 3) 147 | 148 | 149 | if __name__ == '__main__': 150 | ''' 151 | submit_file: image_id, class, score, xmin, ymin, xmax, ymax 152 | gt_file: image_id, class, xmin, ymin, xmax, ymax 153 | ''' 154 | class_list = [] 155 | for i in range(1, 61): 156 | class_list.append(str(i)) 157 | submit_file = "./results/fpn_dcn_result.csv" 158 | gt_file = "./results/val_label.txt" 159 | check_size(submit_file) 160 | mAP = result_eval(submit_file, gt_file, class_list) 161 | out = {'Average AP': str(round(mAP, 3))} 162 | print(out) -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/utils/nms/__init__.py -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | cdef inline np.float32_t abs(np.float32_t a, np.float32_t b): 18 | return a - b if a >= b else b - a 19 | 20 | def get_iou_weights(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold, float init_weight): 21 | 22 | cdef: 23 | int num = ious.shape[0] 24 | # np.ndarray[np.float32_t, ndim=1] out = np.zeros(num, dtype=np.float) 25 | int idx 26 | float iou 27 | float weight 28 | 29 | for idx, iou in enumerate(ious): 30 | weight = init_weight 31 | if iou > 0.0: 32 | if iou > threshold + 0.1: 33 | weight += 1.0 34 | elif iou < threshold - 0.1: 35 | weight += 1.0 36 | else: 37 | weight += 0.0 38 | ious[idx] = weight 39 | return ious 40 | 41 | def get_mask(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold): 42 | cdef: 43 | int num = ious.shape[0] 44 | int idx = 0 45 | float distance 46 | float iou 47 | np.ndarray[np.int64_t, ndim=1] out = np.zeros((num), dtype=np.int64) 48 | for idx, iou in enumerate(ious): 49 | # if iou >= threshold: 50 | # distance = iou - threshold 51 | # if distance < 0.1: 52 | # out[idx] = 0 53 | # elif distance < 0.2: 54 | # out[idx] = 1 55 | # else: 56 | # out[idx] = 2 57 | # else: 58 | # distance = threshold - iou 59 | # if distance < 0.1: 60 | # out[idx] = 2 61 | # elif distance < 0.2: 62 | # out[idx] = 1 63 | # else: 64 | # out[idx] = 0 65 | 
distance = abs(iou, threshold) 66 | if distance < 0.1: 67 | # out[:,2] = 1 68 | out[idx] = 2 69 | elif distance < 0.2: 70 | # out[:,1] = 1 71 | out[idx] = 1 72 | else: 73 | # out[:,0] = 0 74 | out[idx] = 0 75 | return out 76 | 77 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 78 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 79 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 80 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 81 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 82 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 83 | 84 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 85 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 86 | 87 | cdef int ndets = dets.shape[0] 88 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 89 | np.zeros((ndets), dtype=np.int) 90 | 91 | # nominal indices 92 | cdef int _i, _j 93 | # sorted indices 94 | cdef int i, j 95 | # temp variables for box i's (the box currently under consideration) 96 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 97 | # variables for computing overlap with box j (lower scoring box) 98 | cdef np.float32_t xx1, yy1, xx2, yy2 99 | cdef np.float32_t w, h 100 | cdef np.float32_t inter, ovr 101 | 102 | keep = [] 103 | for _i in range(ndets): 104 | i = order[_i] 105 | if suppressed[i] == 1: 106 | continue 107 | keep.append(i) 108 | ix1 = x1[i] 109 | iy1 = y1[i] 110 | ix2 = x2[i] 111 | iy2 = y2[i] 112 | iarea = areas[i] 113 | for _j in range(_i + 1, ndets): 114 | j = order[_j] 115 | if suppressed[j] == 1: 116 | continue 117 | xx1 = max(ix1, x1[j]) 118 | yy1 = max(iy1, y1[j]) 119 | xx2 = min(ix2, x2[j]) 120 | yy2 = min(iy2, y2[j]) 121 | w = max(0.0, xx2 - xx1 + 1) 122 | h = max(0.0, yy2 - yy1 + 1) 123 | inter = w * h 124 | ovr = inter / (iarea + areas[j] - inter) 125 | if ovr >= thresh: 126 | suppressed[j] = 1 127 | 128 | return keep 129 | 130 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 131 | cdef unsigned int N = boxes.shape[0] 132 | cdef float iw, ih, box_area 133 | cdef float ua 134 | cdef int pos = 0 135 | cdef float maxscore = 0 136 | cdef int maxpos = 0 137 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 138 | 139 | for i in range(N): 140 | maxscore = boxes[i, 4] 141 | maxpos = i 142 | 143 | tx1 = boxes[i,0] 144 | ty1 = boxes[i,1] 145 | tx2 = boxes[i,2] 146 | ty2 = boxes[i,3] 147 | ts = boxes[i,4] 148 | 149 | pos = i + 1 150 | # get max box 151 | while pos < N: 152 | if maxscore < boxes[pos, 4]: 153 | maxscore = boxes[pos, 4] 154 | maxpos = pos 155 | pos = pos + 1 156 | 157 | # add max box as a detection 158 | boxes[i,0] = boxes[maxpos,0] 159 | boxes[i,1] = boxes[maxpos,1] 160 | boxes[i,2] = boxes[maxpos,2] 161 | boxes[i,3] = boxes[maxpos,3] 162 | boxes[i,4] = boxes[maxpos,4] 163 | 164 | # swap ith box with position of max box 165 | boxes[maxpos,0] = tx1 166 | boxes[maxpos,1] = ty1 167 | boxes[maxpos,2] = tx2 168 | boxes[maxpos,3] = ty2 169 | boxes[maxpos,4] = ts 170 | 171 | tx1 = boxes[i,0] 172 | ty1 = boxes[i,1] 173 | tx2 = boxes[i,2] 174 | ty2 = boxes[i,3] 175 | ts = boxes[i,4] 176 | 177 | pos = i + 1 178 | # NMS iterations, note that N changes if detection boxes fall below threshold 179 | while pos < N: 180 | x1 = boxes[pos, 0] 181 | y1 = boxes[pos, 1] 182 | x2 = boxes[pos, 2] 183 | y2 = boxes[pos, 3] 184 | s = boxes[pos, 4] 185 | 186 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 187 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 188 | 
if iw > 0: 189 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 190 | if ih > 0: 191 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 192 | ov = iw * ih / ua #iou between max box and detection box 193 | 194 | if method == 1: # linear 195 | if ov > Nt: 196 | weight = 1 - ov 197 | else: 198 | weight = 1 199 | elif method == 2: # gaussian 200 | weight = np.exp(-(ov * ov)/sigma) 201 | else: # original NMS 202 | if ov > Nt: 203 | weight = 0 204 | else: 205 | weight = 1 206 | 207 | boxes[pos, 4] = weight*boxes[pos, 4] 208 | 209 | # if box score falls below threshold, discard the box by swapping with last box 210 | # update N 211 | if boxes[pos, 4] < threshold: 212 | boxes[pos,0] = boxes[N-1, 0] 213 | boxes[pos,1] = boxes[N-1, 1] 214 | boxes[pos,2] = boxes[N-1, 2] 215 | boxes[pos,3] = boxes[N-1, 3] 216 | boxes[pos,4] = boxes[N-1, 4] 217 | N = N - 1 218 | pos = pos - 1 219 | 220 | pos = pos + 1 221 | 222 | keep = [i for i in range(N)] 223 | return keep 224 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | # def nms(dets, thresh, force_cpu=False): 12 | # """Dispatch to either CPU or GPU NMS implementations.""" 13 | 14 | # if dets.shape[0] == 0: 15 | # return [] 16 | # if cfg.USE_GPU_NMS and not force_cpu: 17 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 18 | # else: 19 | # return cpu_nms(dets, thresh) 20 | 21 | 22 | def nms(dets, thresh, force_cpu=False): 23 | """Dispatch to either CPU or GPU NMS implementations.""" 24 | 25 | if dets.shape[0] == 0: 26 | return [] 27 | if force_cpu: 28 | #return cpu_soft_nms(dets, thresh, method = 0) 29 | return cpu_nms(dets, thresh) 30 | return gpu_nms(dets, thresh) 31 | 32 | 33 | def soft_nms(dets, Nt=0.3, sigma=0.5, thresh=0.001, method=1): 34 | """Dispatch to the CPU soft-NMS implementation.""" 35 | 36 | if dets.shape[0] == 0: 37 | return [] 38 | return cpu_soft_nms(dets, sigma, Nt, thresh, method) -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | 14 | def __init__(self): 15 | self.total_time = 0. 16 | self.calls = 0 17 | self.start_time = 0. 18 | self.diff = 0. 19 | self.average_time = 0. 20 | 21 | def tic(self): 22 | # using time.time instead of time.clock because time.clock 23 | # does not normalize for multithreading 24 | self.start_time = time.time() 25 | 26 | def toc(self, average=True): 27 | self.diff = time.time() - self.start_time 28 | self.total_time += self.diff 29 | self.calls += 1 30 | self.average_time = self.total_time / self.calls 31 | if average: 32 | return self.average_time 33 | else: 34 | return self.diff 35 | 36 | def clear(self): 37 | self.total_time = 0. 38 | self.calls = 0 39 | self.start_time = 0. 40 | self.diff = 0. 41 | self.average_time = 0. 42 | --------------------------------------------------------------------------------
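The AttrDict in utils/collections.py above is the Detectron-style config container: keys double as attributes, and immutable(True) recursively freezes the dict and every nested AttrDict. A minimal usage sketch, assuming the repository root is on PYTHONPATH:

from utils.collections import AttrDict

cfg = AttrDict()
cfg.TRAIN = AttrDict()
cfg.TRAIN.BATCH_SIZE = 32            # attribute writes fall through to dict keys
assert cfg['TRAIN']['BATCH_SIZE'] == 32

cfg.immutable(True)                  # freezes nested AttrDicts recursively
try:
    cfg.TRAIN.BATCH_SIZE = 64
except AttributeError as e:
    print(e)                         # Attempted to set "BATCH_SIZE" to "64", ...

--------------------------------------------------------------------------------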
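compute_ap() in utils/get_class_map.py implements VOC-style all-point interpolation: the precision curve is first made monotonically non-increasing (the backward-maximum loop), then AP is the area under the resulting step function over the distinct recall points. A small worked example, assuming the repository root is on PYTHONPATH; the numbers are made up for illustration:

import numpy as np
from utils.get_class_map import compute_ap

# Four detections sorted by confidence (TP, FP, TP, TP) against 5 ground
# truths: rec = cum_tp / 5, prec = cum_tp / (cum_tp + cum_fp).
rec = np.array([0.2, 0.2, 0.4, 0.6])
prec = np.array([1.0, 0.5, 2.0 / 3.0, 0.75])

# Interpolated precision is 1.0 on recall [0, 0.2], 0.75 on (0.2, 0.6],
# and 0.0 beyond, so AP = 0.2 * 1.0 + 0.4 * 0.75 = 0.5.
print(compute_ap(rec, prec))  # -> 0.5

--------------------------------------------------------------------------------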
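Finally, a dependency-free sanity check for the NMS stack: nms() in utils/nms_wrapper.py dispatches to the compiled Cython/CUDA extensions (built via make.sh and utils/build.py), while soft_nms() decays scores instead of dropping boxes (linear: w = 1 - IoU; Gaussian: w = exp(-IoU^2 / sigma)). The sketch below uses the pure-Python fallback from utils/nms/py_cpu_nms.py so it runs without compiling anything; the boxes and threshold are made up for illustration:

import numpy as np
from utils.nms.py_cpu_nms import py_cpu_nms

dets = np.array([
    [10, 10, 60, 60, 0.9],      # kept: highest score
    [12, 12, 62, 62, 0.8],      # suppressed: IoU with box 0 is ~0.86 > 0.5
    [100, 100, 150, 150, 0.7],  # kept: disjoint from the others
], dtype=np.float32)

print(py_cpu_nms(dets, 0.5))  # -> [0, 2]

--------------------------------------------------------------------------------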