├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── configs ├── config.py ├── drf_res101_voc.yaml ├── drf_vgg_voc.yaml ├── refine_drf_res101_voc.yaml ├── refine_drf_vgg_voc.yaml ├── refine_res101_voc.yaml ├── refine_vgg_voc.yaml ├── refine_vgg_voc_512.yaml ├── ssd_darknet19_voc.yaml ├── ssd_darknet53_voc.yaml ├── ssd_mobilenetv2_voc.yaml ├── ssd_res101_voc.yaml ├── ssd_res18_voc.yaml ├── ssd_res50_coco.yaml ├── ssd_res50_voc.yaml ├── ssd_vgg_voc.yaml ├── weave_vgg_voc.yaml └── weave_vgg_voc_512.yaml ├── data ├── __init__.py ├── coco.py ├── data_augment.py ├── drf_net.jpg ├── scripts │ ├── VOC2007.sh │ └── VOC2012.sh ├── voc0712.py └── voc_eval.py ├── demo.py ├── eval.py ├── images ├── dog.jpg ├── eagle.jpg └── person.jpg ├── layers ├── __init__.py ├── functions │ ├── __init__.py │ ├── detection.py │ ├── prior_box.py │ └── prior_layer.py └── modules │ ├── __init__.py │ ├── focal_loss_sigmoid.py │ ├── focal_loss_softmax.py │ ├── multibox_loss.py │ ├── refine_multibox_loss.py │ ├── weight_smooth_l1_loss.py │ └── weight_softmax_loss.py ├── make.sh ├── models ├── darknet.py ├── dense_conv.py ├── drf_res.py ├── drf_vgg.py ├── mobilenetv2.py ├── model_builder.py ├── model_helper.py ├── refine_dense_conv.py ├── refine_drf_res.py ├── refine_drf_vgg.py ├── refine_res.py ├── refine_vgg.py ├── resnet.py ├── vgg.py ├── weave_res.py └── weave_vgg.py ├── train.py └── utils ├── __init__.py ├── augmentations.py ├── averageMeter.py ├── box_utils.py ├── build.py ├── collections.py ├── convert_darknet.py ├── get_class_map.py ├── nms ├── __init__.py ├── cpu_nms.pyx ├── gpu_nms.hpp ├── gpu_nms.pyx ├── nms_kernel.cu └── py_cpu_nms.py ├── nms_wrapper.py └── timer.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | .ipynb_checkpoints/* linguist-documentation 3 | dev.ipynb linguist-documentation 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # sftp 7 | sftp-config.json 8 | 9 | # coco 10 | ./utils/nms/*.so 11 | ./utils/build 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest
39 | *.spec
40 | 
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 | 
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | .hypothesis/
55 | 
56 | # Translations
57 | *.mo
58 | *.pot
59 | *.json
60 | 
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | 
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 | 
69 | # Scrapy stuff:
70 | .scrapy
71 | 
72 | # Sphinx documentation
73 | docs/_build/
74 | 
75 | # PyBuilder
76 | target/
77 | 
78 | # IPython Notebook
79 | .ipynb_checkpoints
80 | 
81 | # pyenv
82 | .python-version
83 | 
84 | # celery beat schedule file
85 | celerybeat-schedule
86 | 
87 | # dotenv
88 | .env
89 | 
90 | # virtualenv
91 | venv/
92 | ENV/
93 | 
94 | # Spyder project settings
95 | .spyderproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # atom remote-sync package
101 | .remote-sync.json
102 | 
103 | # weights
104 | weights/
105 | 
106 | #DS_Store
107 | .DS_Store
108 | 
109 | # dev stuff
110 | eval/
111 | eval.ipynb
112 | dev.ipynb
113 | .vscode/
114 | 
115 | # not ready
116 | videos/
117 | templates/
118 | data/ssd_dataloader.py
119 | data/datasets/
120 | doc/visualize.py
121 | read_results.py
122 | ssd300_120000/
123 | demos/live
124 | webdemo.py
125 | test_data_aug.py
126 | weights/
127 | 
128 | # attributes
129 | 
130 | # pycharm
131 | .idea/
132 | 
133 | # temp checkout soln
134 | data/datasets/
135 | data/ssd_dataloader.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Max deGroot, Ellis Brown
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SSD Pytorch
2 | A [PyTorch](http://pytorch.org/) implementation of SSDs (including the original SSD, DRFNet, and RefineDet)
3 | 
4 | 
5 | 
6 | ### Table of Contents
7 | - Installation
8 | - Datasets
9 | - Train
10 | - Evaluate
11 | - Performance
12 | - Reference
13 | 
14 |  
15 |  
16 |  
17 |  
18 | 
19 | ## Installation
20 | - Install [PyTorch-0.4.0](http://pytorch.org/) by selecting your environment on the website and running the appropriate command.
21 | - Clone this repository.
22 | * Note: We currently only support Python 3+.
23 | - Then download the dataset by following the [instructions](#download-voc2007-trainval--test) below.
24 | - Compile the NMS code and install the COCO tools:
25 | 
26 | ```shell
27 | cd SSD_Pytorch
28 | # if you use anaconda3, you may need https://github.com/rbgirshick/py-faster-rcnn/issues/706
29 | ./make.sh
30 | pip install pycocotools
31 | 
32 | ```
33 | 
34 | *Note*: Check your GPU architecture support in utils/build.py, line 131. The default is:
35 | 
36 | ```Shell
37 | 'nvcc': ['-arch=sm_52',
38 | 
39 | ```
40 | 
41 | ## Datasets
42 | To make things easy, we provide a simple VOC dataset loader that inherits `torch.utils.data.Dataset`, making it fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html).
43 | 
44 | ### VOC Dataset
45 | ##### Download VOC2007 trainval & test
46 | 
47 | ```Shell
48 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
49 | sh data/scripts/VOC2007.sh #
50 | ```
51 | 
52 | ##### Download VOC2012 trainval
53 | 
54 | ```Shell
55 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
56 | sh data/scripts/VOC2012.sh #
57 | ```
58 | 
59 | ##### Merge VOC2007 and VOC2012
60 | 
61 | ```Shell
62 | # move all images in VOC2007 and VOC2012 into VOCROOT/VOC0712/JPEGImages
63 | # move all annotations in VOC2007 and VOC2012 into VOCROOT/VOC0712/Annotations
64 | # rename and merge the VOC2007 and VOC2012 ImageSets/Main/*.txt files into VOCROOT/VOC0712/ImageSets/Main/*.txt
65 | # the merged txt files are:
66 | # 2012_test.txt, 2007_test.txt, 0712_trainval_test.txt, 2012_trainval.txt, 0712_trainval.txt
67 | 
68 | ```
69 | ### COCO Dataset
70 | Install the MS COCO dataset at /path/to/coco from the [official website](http://mscoco.org/); the default is ~/data/COCO. Follow the [instructions](https://github.com/rbgirshick/py-faster-rcnn/blob/77b773655505599b94fd8f3f9928dbf1a9a776c7/data/README.md) to prepare the *minival2014* and *valminusminival2014* annotations. All label files (.json) should be under the COCO/annotations/ folder. It should have this basic structure:
71 | ```Shell
72 | $COCO/
73 | $COCO/cache/
74 | $COCO/annotations/
75 | $COCO/images/
76 | $COCO/images/test2015/
77 | $COCO/images/train2014/
78 | $COCO/images/val2014/
79 | ```
80 | *UPDATE*: The current COCO dataset has released new *train2017* and *val2017* sets, which are just new splits of the same image sets.
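Since the VOC loader above inherits `torch.utils.data.Dataset`, it plugs straight into a standard PyTorch `DataLoader`. The sketch below is illustrative only: the `VOCDetection` constructor arguments are assumptions (see `data/voc0712.py` for the real signature), while the root path, image sets, and batch size mirror the values used in the configs.

```python
# Minimal sketch: wrap the repo's VOC loader in a torch DataLoader.
# NOTE: the VOCDetection(...) arguments are illustrative assumptions;
# check data/voc0712.py for the actual constructor signature.
import torch.utils.data as data

from data import VOCDetection, detection_collate

VOCROOT = 'data/datasets/VOCdevkit0712/'    # DATASETS.DATAROOT in the configs
train_sets = [['0712', '0712_trainval']]    # DATASETS.TRAIN_TYPE in the configs

dataset = VOCDetection(VOCROOT, train_sets)  # hypothetical arguments
loader = data.DataLoader(dataset,
                         batch_size=32,      # TRAIN.BATCH_SIZE in the configs
                         shuffle=True,
                         num_workers=4,
                         collate_fn=detection_collate)  # stacks images, keeps per-image box lists

for images, targets in loader:
    # images: batched image tensor; targets: list of per-image box/label tensors
    break
```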
81 | 
82 | 
83 | ## Training
84 | - First download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights at: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth
85 | - ResNet pre-trained basenet weight files are available at [ResNet50](https://download.pytorch.org/models/resnet50-19c8e357.pth), [ResNet101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth), and [ResNet152](https://download.pytorch.org/models/resnet152-b121ed2d.pth).
86 | - By default, we assume you have downloaded the files into the `SSD_Pytorch/weights/pretrained_models` dir:
87 | 
88 | ```Shell
89 | mkdir weights
90 | cd weights
91 | mkdir pretrained_models
92 | 
93 | wget https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth
94 | wget https://download.pytorch.org/models/resnet50-19c8e357.pth
95 | wget https://download.pytorch.org/models/resnet101-5d3b4d8f.pth
96 | wget https://download.pytorch.org/models/resnet152-b121ed2d.pth
97 | mv *.pth pretrained_models/
98 | ```
99 | 
100 | - To train SSD_Pytorch using the train script, simply specify the parameters listed in `train.py` as flags or manually change them:
101 | 
102 | ```Shell
103 | python train.py --cfg ./configs/ssd_vgg_voc.yaml
104 | ```
105 | 
106 | - Note:
107 | All training configs are in `ssd_vgg_voc.yaml`; you can change them yourself.
108 | 
109 | - To evaluate a trained network:
110 | 
111 | ```Shell
112 | python eval.py --cfg ./configs/ssd_vgg_voc.yaml --weights ./eval_weights
113 | ```
114 | 
115 | - To detect images:
116 | 
117 | ```Shell
118 | # you need to put some images in ./images
119 | python demo.py --cfg ./configs/ssd_vgg_voc.yaml --images ./images --save_folder ./output
120 | 
121 | ```
122 | You can specify the parameters listed in `eval.py` or `demo.py` by passing them as flags or by manually changing them.
123 | 
124 | ## Performance
125 | 
126 | #### VOC2007 Test
127 | 
128 | ##### mAP
129 | 
130 | We retrained some models, so the results differ from the original papers.
131 | Input size = 300.
132 | 
133 | |ssd_vgg|ssd_res|ssd_darknet|drf_ssd_vgg|drf_ssd_res|refine_drf_vgg|refine_ssd_vgg|
134 | |:-:|:-:|:-:|:-:|:-:|:-:|:-:|
135 | | 77.5% | 77.0% | 77.6% | 79.6% | 79.0% | 80.2% | 80.4% |
136 | 
137 | 
138 | 
139 | 
140 | ## References
141 | - Wei Liu, et al. "SSD: Single Shot MultiBox Detector." [ECCV2016](http://arxiv.org/abs/1512.02325).
142 | - [Original Implementation (CAFFE)](https://github.com/weiliu89/caffe/tree/ssd)
143 | - A list of other great SSD ports that were sources of inspiration (especially the Chainer repo):
144 |   * [ssd.pytorch](https://github.com/amdegroot/ssd.pytorch),
145 |     [RFBNet](https://github.com/ruinmessi/RFBNet),
146 |     [Chainer](https://github.com/Hakuyume/chainer-ssd),
147 |     [torchcv](https://github.com/kuangliu/torchcv)
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
--------------------------------------------------------------------------------
/configs/drf_res101_voc.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 |   TYPE: drf_res101
3 |   SIZE: '300'
4 |   REFINE: False
5 |   CONV_BODY: drf_res.DRFSSDRes101
6 |   NUM_CLASSES: 21
7 |   LOAD_PRETRAINED_WEIGHTS: True
8 |   PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth'
9 | 
10 | TRAIN:
11 |   OVERLAP: 0.5
12 |   BGR_MEAN: [104, 117, 123]
13 |   BATCH_SIZE: 32
14 |   OHEM: True
15 |   NEG_RATIO: 3
16 |   WARMUP: True
17 |   WARMUP_EPOCH: 2
18 |   TRAIN_ON: True
19 | 
20 | SMALL:
21 |   FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5], [3, 3], [1, 1]]
22 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256]
23 |   NUM_ANCHORS: [6, 6, 6, 6, 4, 4]
24 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]]
25 |   MIN_SIZES: [30, 60, 111, 162, 213, 264]
26 |   MAX_SIZES: [60, 111, 162, 213, 264, 315]
27 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
28 |   VARIANCE : [0.1, 0.2]
29 |   CLIP: True
30 |   IMG_WH: [320, 320]
31 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
32 |   USE_MAX_SIZE: True
33 | 
34 | BIG:
35 |   FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]]
36 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
37 |   NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4]
38 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]]
39 |   MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8]
40 |   MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
41 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
42 | 
43 |   CLIP: True
44 |   IMG_WH: [512, 512]
45 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
46 | USE_MAX_SIZE: True 47 | 48 | SOLVER: 49 | WEIGHT_DECAY: 0.0005 50 | BASE_LR: 0.001 51 | GAMMA: 0.1 52 | MOMENTUM: 0.9 53 | EPOCH_STEPS: [0, 150, 250] 54 | END_EPOCH: 250 55 | START_EPOCH: 0 56 | 57 | DATASETS: 58 | TRAIN_TYPE: [['0712', '0712_trainval']] 59 | VAL_TYPE: [['0712', '2007_test']] 60 | DATAROOT: 'data/datasets/VOCdevkit0712/' 61 | DATA_TYPE: 'VOC' 62 | SETS: 63 | VOC: [['0712', '0712_trainval']] 64 | VOC0712PLUS: [['0712', '0712_trainval_test']] 65 | VOC0712: [['2012', '2012_trainval']] 66 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 67 | VOC2007: [['0712', "2007_test"]] 68 | COCOval: [['2014', 'minival']] 69 | VOCROOT: 'data/datasets/VOCdevkit0712/' 70 | COCOROOT: 'data/datasets/coco2015' 71 | 72 | TEST: 73 | INPUT_WH: [300, 300] 74 | CONFIDENCE_THRESH: 0.01 75 | NMS_OVERLAP: 0.45 76 | BATCH_SIZE: 16 77 | 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /configs/drf_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: drf_vgg 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: drf_vgg.DRFVgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 3 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [6, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [320, 320] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 |   USE_MAX_SIZE: True
48 | 
49 | SOLVER:
50 |   WEIGHT_DECAY: 0.0005
51 |   BASE_LR: 0.004
52 |   GAMMA: 0.1
53 |   MOMENTUM: 0.9
54 |   EPOCH_STEPS: [0, 150, 200]
55 |   END_EPOCH: 250
56 |   START_EPOCH: 0
57 | 
58 | DATASETS:
59 |   TRAIN_TYPE: [['0712', '0712_trainval']]
60 |   VAL_TYPE: [['0712', '2007_test']]
61 |   DATAROOT: 'data/datasets/VOCdevkit0712/'
62 |   DATA_TYPE: 'VOC'
63 |   SETS:
64 |     VOC: [['0712', '0712_trainval']]
65 |     VOC0712PLUS: [['0712', '0712_trainval_test']]
66 |     VOC0712: [['2012', '2012_trainval']]
67 |     COCO: [['2014', 'train'], ['2014', 'valminusminival']]
68 |     VOC2007: [['0712', "2007_test"]]
69 |     COCOval: [['2014', 'minival']]
70 |   VOCROOT: 'data/datasets/VOCdevkit0712/'
71 |   COCOROOT: 'data/datasets/coco2015'
72 | 
73 | TEST:
74 |   INPUT_WH: [320, 320]
75 |   CONFIDENCE_THRESH: 0.01
76 |   NMS_OVERLAP: 0.45
77 |   BATCH_SIZE: 16
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
--------------------------------------------------------------------------------
/configs/refine_drf_res101_voc.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 |   TYPE: refine_drf_res101
3 |   SIZE: '300'
4 |   REFINE: True
5 |   CONV_BODY: refine_drf_res.RefineDRFRes101
6 |   NUM_CLASSES: 21
7 |   LOAD_PRETRAINED_WEIGHTS: True
8 |   PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth'
9 | 
10 | TRAIN:
11 |   OVERLAP: 0.5
12 |   BGR_MEAN: [104, 117, 123]
13 |   BATCH_SIZE: 32
14 |   OHEM: True
15 |   NEG_RATIO: 3
16 |   WARMUP: True
17 |   WARMUP_EPOCH: 2
18 |   TRAIN_ON: True
19 | 
20 | SMALL:
21 |   FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]]
22 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256]
23 |   ODM_CHANNELS: [512, 1024, 512, 256, 256, 256]
24 |   NUM_ANCHORS: [6, 6, 6, 6, 4, 4]
25 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]]
26 |   MIN_SIZES: [30, 60, 111, 162, 213, 264]
27 |   MAX_SIZES: [60, 111, 162, 213, 264, 315]
28 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
29 |   VARIANCE : [0.1, 0.2]
30 |   CLIP: True
31 |   IMG_WH: [320, 320]
32 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
33 |   USE_MAX_SIZE: True
34 | 
35 | BIG:
36 |   FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]]
37 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
38 |   ODM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
39 |   NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4]
40 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]]
41 |   MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8]
42 |   MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
43 |   ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
44 | 
45 |   CLIP: True
46 |   IMG_WH: [512, 512]
47 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
48 | USE_MAX_SIZE: True 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.001 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [300, 300] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/refine_drf_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_drf_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: refine_drf_vgg.RefineDRFVgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | ODM_CHANNELS: [512, 1024, 512, 256, 256, 256] 25 | NUM_ANCHORS: [6, 6, 6, 6, 4, 4] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 27 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 28 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 29 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: True 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 38 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 39 | ODM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 40 | NUM_ANCHORS: [6, 6, 6, 6, 6, 4, 4] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 42 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 43 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 44 | ASPECT_RATIOS : [[2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 45 | 46 | CLIP: True 47 | IMG_WH: [512, 512] 48 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
49 | USE_MAX_SIZE: True 50 | 51 | SOLVER: 52 | WEIGHT_DECAY: 0.0005 53 | BASE_LR: 0.004 54 | GAMMA: 0.1 55 | MOMENTUM: 0.9 56 | EPOCH_STEPS: [0, 150, 200] 57 | END_EPOCH: 250 58 | START_EPOCH: 0 59 | 60 | DATASETS: 61 | TRAIN_TYPE: [['0712', '0712_trainval']] 62 | VAL_TYPE: [['0712', '2007_test']] 63 | DATAROOT: 'data/datasets/VOCdevkit0712/' 64 | DATA_TYPE: 'VOC' 65 | SETS: 66 | VOC: [['0712', '0712_trainval']] 67 | VOC0712PLUS: [['0712', '0712_trainval_test']] 68 | VOC0712: [['2012', '2012_trainval']] 69 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 70 | VOC2007: [['0712', "2007_test"]] 71 | COCOval: [['2014', 'minival']] 72 | VOCROOT: 'data/datasets/VOCdevkit0712/' 73 | COCOROOT: 'data/datasets/coco2015' 74 | 75 | TEST: 76 | INPUT_WH: [300, 300] 77 | CONFIDENCE_THRESH: 0.01 78 | NMS_OVERLAP: 0.45 79 | BATCH_SIZE: 16 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /configs/refine_res101_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_res 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: refine_res.RefineResnet101 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 512, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 512, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.001 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [320, 320] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/refine_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: refine_vgg.refine_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.002 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [320, 320] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/refine_vgg_voc_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: refine_vgg 3 | SIZE: '512' 4 | REFINE: True 5 | CONV_BODY: refine_vgg.refine_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.002 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [512, 512] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/ssd_darknet19_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_darknet19 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: darknet.SSDarknet19 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/convert_darknet19.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_darknet53_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_darknet53 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: darknet.SSDarknet53 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/convert_darknet53.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [256, 512, 1024, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_mobilenetv2_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_mobilenetv2 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: mobilenetv2.SSDMobilenetv2 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/mobilenetv2_feature.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [32, 96, 1280, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [32, 96, 1280, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 |   USE_MAX_SIZE: True
48 | 
49 | SOLVER:
50 |   WEIGHT_DECAY: 0.0005
51 |   BASE_LR: 0.001
52 |   GAMMA: 0.1
53 |   MOMENTUM: 0.9
54 |   EPOCH_STEPS: [0, 150, 200]
55 |   END_EPOCH: 250
56 |   START_EPOCH: 0
57 | 
58 | DATASETS:
59 |   TRAIN_TYPE: [['0712', '0712_trainval']]
60 |   VAL_TYPE: [['0712', '2007_test']]
61 |   DATAROOT: 'data/datasets/VOCdevkit0712/'
62 |   DATA_TYPE: 'VOC'
63 |   SETS:
64 |     VOC: [['0712', '0712_trainval']]
65 |     VOC0712PLUS: [['0712', '0712_trainval_test']]
66 |     VOC0712: [['2012', '2012_trainval']]
67 |     COCO: [['2014', 'train'], ['2014', 'valminusminival']]
68 |     VOC2007: [['0712', "2007_test"]]
69 |     COCOval: [['2014', 'minival']]
70 |   VOCROOT: 'data/datasets/VOCdevkit0712/'
71 |   COCOROOT: 'data/datasets/coco2015'
72 | 
73 | TEST:
74 |   INPUT_WH: [300, 300]
75 |   CONFIDENCE_THRESH: 0.01
76 |   NMS_OVERLAP: 0.45
77 |   BATCH_SIZE: 16
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
--------------------------------------------------------------------------------
/configs/ssd_res101_voc.yaml:
--------------------------------------------------------------------------------
1 | MODEL:
2 |   TYPE: ssd_res101
3 |   SIZE: '300'
4 |   REFINE: False
5 |   CONV_BODY: resnet.SSDResnet101
6 |   NUM_CLASSES: 21
7 |   LOAD_PRETRAINED_WEIGHTS: True
8 |   PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth'
9 | 
10 | TRAIN:
11 |   OVERLAP: 0.5
12 |   BGR_MEAN: [104, 117, 123]
13 |   BATCH_SIZE: 32
14 |   OHEM: True
15 |   NEG_RATIO: 3
16 |   WARMUP: True
17 |   WARMUP_EPOCH: 2
18 |   TRAIN_ON: True
19 | 
20 | 
21 | SMALL:
22 |   FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]]
23 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256]
24 |   NUM_ANCHORS: [4, 6, 6, 6, 4, 4]
25 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]]
26 |   MIN_SIZES: [30, 60, 111, 162, 213, 264]
27 |   MAX_SIZES: [60, 111, 162, 213, 264, 315]
28 |   ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
29 |   VARIANCE : [0.1, 0.2]
30 |   CLIP: True
31 |   IMG_WH: [300, 300]
32 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
33 |   USE_MAX_SIZE: True
34 | 
35 | BIG:
36 |   FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]]
37 |   ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256]
38 |   NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4]
39 |   STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]]
40 |   MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8]
41 |   MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
42 |   ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]]
43 | 
44 |   CLIP: True
45 |   IMG_WH: [512, 512]
46 |   INPUT_FIXED: True # if you want to input different size, you need to set this False.
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_res18_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_res18 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: resnet.SSDResnet18 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet18-5c106cde.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [128, 256, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [128, 256, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_res50_coco.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_res50 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: resnet.SSDResnet50 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet50-19c8e357.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 60 | VAL_TYPE: [['2014', 'minival']] 61 | DATAROOT: 'data/datasets/coco2015' 62 | DATA_TYPE: 'COCO' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_res50_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_res50 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: resnet.SSDResnet50 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet50-19c8e357.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 16 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/ssd_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: ssd_vgg 3 | SIZE: '300' 4 | REFINE: False 5 | CONV_BODY: vgg.SSDVgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 23 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256] 24 | NUM_ANCHORS: [4, 6, 6, 6, 4, 4] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], [300, 300]] 26 | MIN_SIZES: [30, 60, 111, 162, 213, 264] 27 | MAX_SIZES: [60, 111, 162, 213, 264, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [300, 300] 32 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 33 | USE_MAX_SIZE: True 34 | 35 | BIG: 36 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], [1, 1]] 37 | ARM_CHANNELS: [512, 1024, 512, 256, 256, 256, 256] 38 | NUM_ANCHORS: [4, 6, 6, 6, 6, 4, 4] 39 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], [512, 512]] 40 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 41 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 42 | ASPECT_RATIOS : [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 43 | 44 | CLIP: True 45 | IMG_WH: [512, 512] 46 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
47 | USE_MAX_SIZE: True 48 | 49 | SOLVER: 50 | WEIGHT_DECAY: 0.0005 51 | BASE_LR: 0.001 52 | GAMMA: 0.1 53 | MOMENTUM: 0.9 54 | EPOCH_STEPS: [0, 150, 200] 55 | END_EPOCH: 250 56 | START_EPOCH: 0 57 | 58 | DATASETS: 59 | TRAIN_TYPE: [['0712', '0712_trainval']] 60 | VAL_TYPE: [['0712', '2007_test']] 61 | DATAROOT: 'data/datasets/VOCdevkit0712/' 62 | DATA_TYPE: 'VOC' 63 | SETS: 64 | VOC: [['0712', '0712_trainval']] 65 | VOC0712PLUS: [['0712', '0712_trainval_test']] 66 | VOC0712: [['2012', '2012_trainval']] 67 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 68 | VOC2007: [['0712', "2007_test"]] 69 | COCOval: [['2014', 'minival']] 70 | VOCROOT: 'data/datasets/VOCdevkit0712/' 71 | COCOROOT: 'data/datasets/coco2015' 72 | 73 | TEST: 74 | INPUT_WH: [300, 300] 75 | CONFIDENCE_THRESH: 0.01 76 | NMS_OVERLAP: 0.45 77 | BATCH_SIZE: 32 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /configs/weave_vgg_voc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: weave_vgg 3 | SIZE: '300' 4 | REFINE: True 5 | CONV_BODY: weave_vgg.weave_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.004 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [320, 320] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/weave_vgg_voc_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: weave_512_vgg 3 | SIZE: '512' 4 | REFINE: True 5 | CONV_BODY: weave_vgg.weave_vgg 6 | NUM_CLASSES: 21 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [512, 1024, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [30, 64, 128, 256] 28 | MAX_SIZES: [64, 128, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [512, 1024, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.002 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 150, 200] 56 | END_EPOCH: 250 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['0712', '0712_trainval']] 61 | VAL_TYPE: [['0712', '2007_test']] 62 | DATAROOT: 'data/datasets/VOCdevkit0712/' 63 | DATA_TYPE: 'VOC' 64 | SETS: 65 | VOC: [['0712', '0712_trainval']] 66 | VOC0712PLUS: [['0712', '0712_trainval_test']] 67 | VOC0712: [['2012', '2012_trainval']] 68 | COCO: [['2014', 'train'], ['2014', 'valminusminival']] 69 | VOC2007: [['0712', "2007_test"]] 70 | COCOval: [['2014', 'minival']] 71 | VOCROOT: 'data/datasets/VOCdevkit0712/' 72 | COCOROOT: 'data/datasets/coco2015' 73 | 74 | TEST: 75 | INPUT_WH: [512, 512] 76 | CONFIDENCE_THRESH: 0.01 77 | NMS_OVERLAP: 0.45 78 | BATCH_SIZE: 16 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .voc0712 import VOCDetection, detection_collate 3 | from .coco import * 4 | from .data_augment import * 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | 7 | TODO: implement data_augment for training 8 | 9 | Ellis Brown, Max deGroot 10 | """ 11 | 12 | import torch 13 | from torchvision import transforms 14 | import cv2 15 | import numpy as np 16 | import random 17 | import math 18 | from utils.box_utils import matrix_iou 19 | 20 | 21 | def _crop(image, boxes, labels): 22 | height, width, _ = image.shape 23 | 24 | if len(boxes) == 0: 25 | return image, boxes, labels 26 | 27 | while True: 28 | mode = random.choice(( 29 | None, 30 | (0.1, None), 31 | (0.3, None), 32 | (0.5, None), 33 | (0.7, None), 34 | (0.9, None), 35 | (None, None), 36 | )) 37 | 38 | if mode is None: 39 | return image, boxes, labels 40 | 41 | min_iou, max_iou = mode 42 | if min_iou is None: 43 | min_iou = float('-inf') 44 | if max_iou is None: 45 | max_iou = float('inf') 46 | 47 | for _ in range(50): 48 | scale = random.uniform(0.3, 1.) 49 | min_ratio = max(0.5, scale * scale) 50 | max_ratio = min(2, 1. 
/ scale / scale) 51 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 52 | w = int(scale * ratio * width) 53 | h = int((scale / ratio) * height) 54 | 55 | l = random.randrange(width - w) 56 | t = random.randrange(height - h) 57 | roi = np.array((l, t, l + w, t + h)) 58 | 59 | iou = matrix_iou(boxes, roi[np.newaxis]) 60 | 61 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 62 | continue 63 | 64 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 65 | 66 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 67 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 68 | .all(axis=1) 69 | boxes_t = boxes[mask].copy() 70 | labels_t = labels[mask].copy() 71 | if len(boxes_t) == 0: 72 | continue 73 | 74 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 75 | boxes_t[:, :2] -= roi[:2] 76 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 77 | boxes_t[:, 2:] -= roi[:2] 78 | 79 | return image_t, boxes_t, labels_t 80 | 81 | 82 | def _distort(image): 83 | def _convert(image, alpha=1, beta=0): 84 | tmp = image.astype(float) * alpha + beta 85 | tmp[tmp < 0] = 0 86 | tmp[tmp > 255] = 255 87 | image[:] = tmp 88 | 89 | image = image.copy() 90 | 91 | if random.randrange(2): 92 | _convert(image, beta=random.uniform(-32, 32)) 93 | 94 | if random.randrange(2): 95 | _convert(image, alpha=random.uniform(0.5, 1.5)) 96 | 97 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 98 | 99 | if random.randrange(2): 100 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 101 | tmp %= 180 102 | image[:, :, 0] = tmp 103 | 104 | if random.randrange(2): 105 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 106 | 107 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 108 | 109 | return image 110 | 111 | 112 | def _expand(image, boxes, fill, p): 113 | if random.random() > p: 114 | return image, boxes 115 | 116 | height, width, depth = image.shape 117 | for _ in range(50): 118 | scale = random.uniform(1, 4) 119 | 120 | min_ratio = max(0.5, 1. 
/ scale / scale) 121 | max_ratio = min(2, scale * scale) 122 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 123 | ws = scale * ratio 124 | hs = scale / ratio 125 | if ws < 1 or hs < 1: 126 | continue 127 | w = int(ws * width) 128 | h = int(hs * height) 129 | 130 | left = random.randint(0, w - width) 131 | top = random.randint(0, h - height) 132 | 133 | boxes_t = boxes.copy() 134 | boxes_t[:, :2] += (left, top) 135 | boxes_t[:, 2:] += (left, top) 136 | 137 | expand_image = np.empty((h, w, depth), dtype=image.dtype) 138 | expand_image[:, :] = fill 139 | expand_image[top:top + height, left:left + width] = image 140 | image = expand_image 141 | 142 | return image, boxes_t 143 | 144 | 145 | def _mirror(image, boxes): 146 | _, width, _ = image.shape 147 | if random.randrange(2): 148 | image = image[:, ::-1] 149 | boxes = boxes.copy() 150 | boxes[:, 0::2] = width - boxes[:, 2::-2] 151 | return image, boxes 152 | 153 | 154 | def preproc_for_test(image, resize_wh, mean): 155 | interp_methods = [ 156 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, 157 | cv2.INTER_LANCZOS4 158 | ] 159 | interp_method = interp_methods[random.randrange(5)] 160 | # interp_method = interp_methods[0] 161 | image = cv2.resize( 162 | image, (resize_wh[0], resize_wh[1]), interpolation=interp_method) 163 | image = image.astype(np.float32) 164 | image -= mean 165 | # to rgb 166 | # image = image[:, :, (2, 1, 0)] 167 | return image.transpose(2, 0, 1) 168 | 169 | 170 | class preproc(object): 171 | def __init__(self, resize_wh, rgb_means, p): 172 | self.means = rgb_means 173 | self.resize_wh = resize_wh 174 | self.p = p 175 | 176 | def __call__(self, image, targets): 177 | boxes = targets[:, :-1].copy() 178 | labels = targets[:, -1].copy() 179 | if len(boxes) == 0: 180 | #boxes = np.empty((0, 4)) 181 | targets = np.zeros((1, 5)) 182 | image = preproc_for_test(image, self.resize_wh, self.means) 183 | return torch.from_numpy(image), targets 184 | 185 | image_o = image.copy() 186 | targets_o = targets.copy() 187 | height_o, width_o, _ = image_o.shape 188 | boxes_o = targets_o[:, :-1] 189 | labels_o = targets_o[:, -1] 190 | boxes_o[:, 0::2] /= width_o 191 | boxes_o[:, 1::2] /= height_o 192 | labels_o = np.expand_dims(labels_o, 1) 193 | targets_o = np.hstack((boxes_o, labels_o)) 194 | 195 | image_t, boxes, labels = _crop(image, boxes, labels) 196 | image_t = _distort(image_t) 197 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 198 | image_t, boxes = _mirror(image_t, boxes) 199 | #image_t, boxes = _mirror(image, boxes) 200 | 201 | height, width, _ = image_t.shape 202 | image_t = preproc_for_test(image_t, self.resize_wh, self.means) 203 | boxes = boxes.copy() 204 | boxes[:, 0::2] /= width 205 | boxes[:, 1::2] /= height 206 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 207 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 
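        # At this point boxes are normalized to [0, 1], so b_w and b_h are
        # relative side lengths; the mask below drops any box whose shorter
        # side is under 1% of the image, removing degenerate boxes left over
        # from cropping and resizing.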
208 | mask_b = np.minimum(b_w, b_h) > 0.01 209 | boxes_t = boxes[mask_b] 210 | labels_t = labels[mask_b].copy() 211 | 212 | if len(boxes_t) == 0: 213 | image = preproc_for_test(image_o, self.resize_wh, self.means) 214 | return torch.from_numpy(image), targets_o 215 | 216 | labels_t = np.expand_dims(labels_t, 1) 217 | targets_t = np.hstack((boxes_t, labels_t)) 218 | 219 | return torch.from_numpy(image_t), targets_t 220 | 221 | 222 | class BaseTransform(object): 223 | """Defines the transformations that should be applied to test PIL image 224 | for input into the network 225 | 226 | dimension -> tensorize -> color adj 227 | 228 | Arguments: 229 | resize (int): input dimension to SSD 230 | rgb_means ((int,int,int)): average RGB of the dataset 231 | (104,117,123) 232 | swap ((int,int,int)): final order of channels 233 | Returns: 234 | transform (transform) : callable transform to be applied to test/val 235 | data 236 | """ 237 | 238 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 239 | self.means = rgb_means 240 | self.resize_wh = resize_wh 241 | self.swap = swap 242 | 243 | # assume input is cv2 img for now 244 | def __call__(self, img, target=None): 245 | 246 | interp_methods = [ 247 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 248 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 249 | ] 250 | interp_method = interp_methods[0] 251 | img = cv2.resize( 252 | np.array(img), (self.resize_wh[0], self.resize_wh[1]), 253 | interpolation=interp_method).astype(np.float32) 254 | img -= self.means 255 | img = img.transpose(self.swap) 256 | return torch.from_numpy(img), target 257 | -------------------------------------------------------------------------------- /data/drf_net.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/data/drf_net.jpg -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 
36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | import matplotlib 13 | matplotlib.use('Agg') 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def parse_rec(filename): 18 | """ Parse a PASCAL VOC xml file """ 19 | tree = ET.parse(filename) 20 | objects = [] 21 | for obj in tree.findall('object'): 22 | obj_struct = {} 23 | obj_struct['name'] = obj.find('name').text 24 | obj_struct['pose'] = obj.find('pose').text 25 | obj_struct['truncated'] = int(obj.find('truncated').text) 26 | obj_struct['difficult'] = int(obj.find('difficult').text) 27 | bbox = obj.find('bndbox') 28 | obj_struct['bbox'] = [ 29 | int(bbox.find('xmin').text), 30 | int(bbox.find('ymin').text), 31 | int(bbox.find('xmax').text), 32 | int(bbox.find('ymax').text) 33 | ] 34 | objects.append(obj_struct) 35 | 36 | return objects 37 | 38 | 39 | def voc_ap(rec, prec, use_07_metric=False): 40 | """ ap = voc_ap(rec, prec, [use_07_metric]) 41 | Compute VOC AP given precision and recall. 42 | If use_07_metric is true, uses the 43 | VOC 07 11 point method (default:False). 44 | """ 45 | if use_07_metric: 46 | # 11 point metric 47 | ap = 0. 48 | for t in np.arange(0., 1.1, 0.1): 49 | if np.sum(rec >= t) == 0: 50 | p = 0 51 | else: 52 | p = np.max(prec[rec >= t]) 53 | ap = ap + p / 11. 
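        # The VOC07 metric is the mean of the best precision achievable at
        # each of the eleven recall thresholds 0.0, 0.1, ..., 1.0. For
        # example, a detector with precision 1.0 up to recall 0.5 and zero
        # beyond it scores (6 * 1.0 + 5 * 0.0) / 11, roughly 0.545.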
54 | else: 55 | # correct AP calculation 56 | # first append sentinel values at the end 57 | mrec = np.concatenate(([0.], rec, [1.])) 58 | mpre = np.concatenate(([0.], prec, [0.])) 59 | 60 | # compute the precision envelope 61 | for i in range(mpre.size - 1, 0, -1): 62 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 63 | 64 | # to calculate area under PR curve, look for points 65 | # where X axis (recall) changes value 66 | i = np.where(mrec[1:] != mrec[:-1])[0] 67 | 68 | # and sum (\Delta recall) * prec 69 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 70 | return ap 71 | 72 | 73 | def voc_eval(detpath, 74 | annopath, 75 | imagesetfile, 76 | classname, 77 | cachedir, 78 | ovthresh=0.5, 79 | use_07_metric=False): 80 | """rec, prec, ap = voc_eval(detpath, 81 | annopath, 82 | imagesetfile, 83 | classname, 84 | [ovthresh], 85 | [use_07_metric]) 86 | 87 | Top level function that does the PASCAL VOC evaluation. 88 | 89 | detpath: Path to detections 90 | detpath.format(classname) should produce the detection results file. 91 | annopath: Path to annotations 92 | annopath.format(imagename) should be the xml annotations file. 93 | imagesetfile: Text file containing the list of images, one image per line. 94 | classname: Category name (duh) 95 | cachedir: Directory for caching the annotations 96 | [ovthresh]: Overlap threshold (default = 0.5) 97 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 98 | (default False) 99 | """ 100 | # assumes detections are in detpath.format(classname) 101 | # assumes annotations are in annopath.format(imagename) 102 | # assumes imagesetfile is a text file with each line an image name 103 | # cachedir caches the annotations in a pickle file 104 | 105 | # first load gt 106 | if not os.path.isdir(cachedir): 107 | os.mkdir(cachedir) 108 | cachefile = os.path.join(cachedir, 'annots.pkl') 109 | # read list of images 110 | with open(imagesetfile, 'r') as f: 111 | lines = f.readlines() 112 | imagenames = [x.strip() for x in lines] 113 | 114 | if not os.path.isfile(cachefile): 115 | # load annots 116 | recs = {} 117 | for i, imagename in enumerate(imagenames): 118 | recs[imagename] = parse_rec(annopath.format(imagename)) 119 | if i % 100 == 0: 120 | print('Reading annotation for {:d}/{:d}'.format( 121 | i + 1, len(imagenames))) 122 | # save 123 | print('Saving cached annotations to {:s}'.format(cachefile)) 124 | with open(cachefile, 'wb') as f: 125 | pickle.dump(recs, f) 126 | else: 127 | # load 128 | with open(cachefile, 'rb') as f: 129 | recs = pickle.load(f) 130 | 131 | # extract gt objects for this class 132 | class_recs = {} 133 | npos = 0 134 | for imagename in imagenames: 135 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 136 | bbox = np.array([x['bbox'] for x in R]) 137 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 138 | det = [False] * len(R) 139 | npos = npos + sum(~difficult) 140 | class_recs[imagename] = { 141 | 'bbox': bbox, 142 | 'difficult': difficult, 143 | 'det': det 144 | } 145 | 146 | # read dets 147 | detfile = detpath.format(classname) 148 | with open(detfile, 'r') as f: 149 | lines = f.readlines() 150 | 151 | splitlines = [x.strip().split(' ') for x in lines] 152 | image_ids = [x[0] for x in splitlines] 153 | confidence = np.array([float(x[1]) for x in splitlines]) 154 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 155 | # sort by confidence 156 | sorted_ind = np.argsort(-confidence) 157 | sorted_scores = np.sort(-confidence) 158 | BB = BB[sorted_ind, :] 159 | image_ids = 
[image_ids[x] for x in sorted_ind] 160 | 161 | # go down dets and mark TPs and FPs 162 | nd = len(image_ids) 163 | tp = np.zeros(nd) 164 | fp = np.zeros(nd) 165 | for d in range(nd): 166 | R = class_recs[image_ids[d]] 167 | bb = BB[d, :].astype(float) 168 | ovmax = -np.inf 169 | BBGT = R['bbox'].astype(float) 170 | 171 | if BBGT.size > 0: 172 | # compute overlaps 173 | # intersection 174 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 175 | iymin = np.maximum(BBGT[:, 1], bb[1]) 176 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 177 | iymax = np.minimum(BBGT[:, 3], bb[3]) 178 | iw = np.maximum(ixmax - ixmin + 1., 0.) 179 | ih = np.maximum(iymax - iymin + 1., 0.) 180 | inters = iw * ih 181 | 182 | # union 183 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 184 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 185 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 186 | 187 | overlaps = inters / uni 188 | ovmax = np.max(overlaps) 189 | jmax = np.argmax(overlaps) 190 | 191 | if ovmax > ovthresh: 192 | if not R['difficult'][jmax]: 193 | if not R['det'][jmax]: 194 | tp[d] = 1. 195 | R['det'][jmax] = 1 196 | else: 197 | fp[d] = 1. 198 | else: 199 | fp[d] = 1. 200 | 201 | # compute precision recall 202 | fp = np.cumsum(fp) 203 | tp = np.cumsum(tp) 204 | rec = tp / float(npos) 205 | # avoid divide by zero in case the first detection matches a difficult 206 | # ground truth 207 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 208 | # if classname == 'person': 209 | final_rec = round(rec[-1], 4) 210 | final_prec = round(prec[-1], 4) 211 | plt_save_path = os.path.join(".", "eval", "pr") 212 | if not os.path.exists(plt_save_path): 213 | os.makedirs(plt_save_path) 214 | plt.plot(rec, prec, 'r') 215 | pr_curl = os.path.join( 216 | plt_save_path, '{}_{}_{}pr.jpg'.format(classname, str(final_prec), 217 | str(final_rec))) 218 | plt.savefig(pr_curl) 219 | plt.close() 220 | ap = voc_ap(rec, prec, use_07_metric) 221 | 222 | return rec, prec, ap 223 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1,0" 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, RefineMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file, VOC_CLASSES, COCO_CLASSES 16 | from utils.box_utils import draw_rects 17 | import numpy as np 18 | import time 19 | import os 20 | import sys 21 | import pickle 22 | import datetime 23 | from models.model_builder import SSD 24 | import yaml 25 | import cv2 26 | 27 | 28 | def arg_parse(): 29 | parser = argparse.ArgumentParser( 30 | description='Single Shot MultiBox Detection') 31 | parser.add_argument( 32 | "--images", 33 | dest='images', 34 | help="Image / Directory containing images to perform detection upon", 35 | default="images", 36 | type=str) 37 | parser.add_argument( 38 | '--weights', 39 | default='weights/ssd_darknet_300.pth', 40 | type=str, 41 | help='Trained state_dict file path to open') 42 | parser.add_argument( 43 | '--cfg', 44 | dest='cfg_file', 45 | required=True, 46 | help='Config file for training (and 
optionally testing)') 47 | parser.add_argument( 48 | '--save_folder', 49 | default='eval/', 50 | type=str, 51 | help='File path to save results') 52 | parser.add_argument( 53 | '--num_workers', 54 | default=8, 55 | type=int, 56 | help='Number of workers used in dataloading') 57 | parser.add_argument( 58 | '--retest', default=False, type=bool, help='test cache results') 59 | args = parser.parse_args() 60 | return args 61 | 62 | 63 | def im_detect(img, net, detector, transform, thresh=0.01): 64 | with torch.no_grad(): 65 | t0 = time.time() 66 | w, h = img.shape[1], img.shape[0] 67 | x = transform(img)[0].unsqueeze(0) 68 | x = x.cuda() 69 | t1 = time.time() 70 | output = net(x) 71 | boxes, scores = detector.forward(output) 72 | t2 = time.time() 73 | max_conf, max_id = scores[0].topk(1, 1, True, True) 74 | pos = max_id > 0 75 | if len(pos) == 0: 76 | return np.empty((0, 6)) 77 | boxes = boxes[0][pos.view(-1, 1).expand(len(pos), 4)].view(-1, 4) 78 | scores = max_conf[pos].view(-1, 1) 79 | max_id = max_id[pos].view(-1, 1) 80 | inds = scores > thresh 81 | if len(inds) == 0: 82 | return np.empty((0, 6)) 83 | boxes = boxes[inds.view(-1, 1).expand(len(inds), 4)].view(-1, 4) 84 | scores = scores[inds].view(-1, 1) 85 | max_id = max_id[inds].view(-1, 1) 86 | c_dets = torch.cat((boxes, scores, max_id.float()), 1).cpu().numpy() 87 | img_classes = np.unique(c_dets[:, -1]) 88 | output = None 89 | flag = False 90 | for cls in img_classes: 91 | cls_mask = np.where(c_dets[:, -1] == cls)[0] 92 | image_pred_class = c_dets[cls_mask, :] 93 | keep = nms(image_pred_class, cfg.TEST.NMS_OVERLAP, force_cpu=True) 94 | keep = keep[:50] 95 | image_pred_class = image_pred_class[keep, :] 96 | if not flag: 97 | output = image_pred_class 98 | flag = True 99 | else: 100 | output = np.concatenate((output, image_pred_class), axis=0) 101 | output[:, 0:2][output[:, 0:2] < 0] = 0 102 | output[:, 2:4][output[:, 2:4] > 1] = 1 103 | scale = np.array([w, h, w, h]) 104 | output[:, :4] = output[:, :4] * scale 105 | t3 = time.time() 106 | print("transform_t:", round(t1 - t0, 3), "detect_time:", 107 | round(t2 - t1, 3), "nms_time:", round(t3 - t2, 3)) 108 | return output 109 | 110 | 111 | def main(): 112 | global args 113 | args = arg_parse() 114 | cfg_from_file(args.cfg_file) 115 | bgr_means = cfg.TRAIN.BGR_MEAN 116 | dataset_name = cfg.DATASETS.DATA_TYPE 117 | batch_size = cfg.TEST.BATCH_SIZE 118 | num_workers = args.num_workers 119 | if cfg.DATASETS.DATA_TYPE == 'VOC': 120 | trainvalDataset = VOCDetection 121 | classes = VOC_CLASSES 122 | top_k = 200 123 | else: 124 | trainvalDataset = COCODetection 125 | classes = COCO_CLASSES 126 | top_k = 300 127 | valSet = cfg.DATASETS.VAL_TYPE 128 | num_classes = cfg.MODEL.NUM_CLASSES 129 | save_folder = args.save_folder 130 | if not os.path.exists(save_folder): 131 | os.mkdir(save_folder) 132 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 133 | cfg.TRAIN.TRAIN_ON = False 134 | net = SSD(cfg) 135 | 136 | checkpoint = torch.load(args.weights) 137 | state_dict = checkpoint['model'] 138 | from collections import OrderedDict 139 | new_state_dict = OrderedDict() 140 | for k, v in state_dict.items(): 141 | head = k[:7] 142 | if head == 'module.': 143 | name = k[7:] # remove `module.` 144 | else: 145 | name = k 146 | new_state_dict[name] = v 147 | net.load_state_dict(new_state_dict) 148 | 149 | detector = Detect(cfg) 150 | img_wh = cfg.TEST.INPUT_WH 151 | ValTransform = BaseTransform(img_wh, bgr_means, (2, 0, 1)) 152 | input_folder = args.images 153 | thresh = cfg.TEST.CONFIDENCE_THRESH 154 | 
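    # NOTE: the "[2:3]" slice below restricts the demo to a single image in
    # --images; remove the slice to run detection over the whole folder.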
for item in os.listdir(input_folder)[2:3]: 155 | img_path = os.path.join(input_folder, item) 156 | print(img_path) 157 | img = cv2.imread(img_path) 158 | dets = im_detect(img, net, detector, ValTransform, thresh) 159 | draw_img = draw_rects(img, dets, classes) 160 | out_img_name = "output_" + item 161 | save_path = os.path.join(save_folder, out_img_name) 162 | cv2.imwrite(save_path, img) 163 | 164 | 165 | if __name__ == '__main__': 166 | st = time.time() 167 | main() 168 | print("final time", time.time() - st) 169 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"] = "1,0" 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, RefineMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file 16 | import numpy as np 17 | import time 18 | import os 19 | import sys 20 | import pickle 21 | import datetime 22 | from models.model_builder import SSD 23 | import yaml 24 | 25 | 26 | def arg_parse(): 27 | parser = argparse.ArgumentParser( 28 | description='Single Shot MultiBox Detection') 29 | parser.add_argument( 30 | '--weights', 31 | default='weights/ssd_darknet_300.pth', 32 | type=str, 33 | help='Trained state_dict file path to open') 34 | parser.add_argument( 35 | '--cfg', 36 | dest='cfg_file', 37 | required=True, 38 | help='Config file for training (and optionally testing)') 39 | parser.add_argument( 40 | '--save_folder', 41 | default='eval/', 42 | type=str, 43 | help='File path to save results') 44 | parser.add_argument( 45 | '--num_workers', 46 | default=8, 47 | type=int, 48 | help='Number of workers used in dataloading') 49 | parser.add_argument( 50 | '--retest', default=False, type=bool, help='test cache results') 51 | args = parser.parse_args() 52 | return args 53 | 54 | 55 | def eval_net(val_dataset, 56 | val_loader, 57 | net, 58 | detector, 59 | cfg, 60 | transform, 61 | max_per_image=300, 62 | thresh=0.01, 63 | batch_size=1): 64 | net.eval() 65 | num_images = len(val_dataset) 66 | num_classes = cfg.MODEL.NUM_CLASSES 67 | eval_save_folder = "./eval/" 68 | if not os.path.exists(eval_save_folder): 69 | os.mkdir(eval_save_folder) 70 | all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] 71 | det_file = os.path.join(eval_save_folder, 'detections.pkl') 72 | 73 | if args.retest: 74 | f = open(det_file, 'rb') 75 | all_boxes = pickle.load(f) 76 | print('Evaluating detections') 77 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 78 | return 79 | 80 | for idx, (imgs, _, img_info) in enumerate(val_loader): 81 | with torch.no_grad(): 82 | t1 = time.time() 83 | x = imgs 84 | x = x.cuda() 85 | output = net(x) 86 | t4 = time.time() 87 | boxes, scores = detector.forward(output) 88 | t2 = time.time() 89 | for k in range(boxes.size(0)): 90 | i = idx * batch_size + k 91 | boxes_ = boxes[k] 92 | scores_ = scores[k] 93 | boxes_ = boxes_.cpu().numpy() 94 | scores_ = scores_.cpu().numpy() 95 | img_wh = img_info[k] 96 | scale = np.array([img_wh[0], img_wh[1], img_wh[0], img_wh[1]]) 97 | 
boxes_ *= scale 98 | for j in range(1, num_classes): 99 | inds = np.where(scores_[:, j] > thresh)[0] 100 | if len(inds) == 0: 101 | all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 102 | continue 103 | c_bboxes = boxes_[inds] 104 | c_scores = scores_[inds, j] 105 | c_dets = np.hstack((c_bboxes, 106 | c_scores[:, np.newaxis])).astype( 107 | np.float32, copy=False) 108 | keep = nms(c_dets, cfg.TEST.NMS_OVERLAP, force_cpu=True) 109 | keep = keep[:50] 110 | c_dets = c_dets[keep, :] 111 | all_boxes[j][i] = c_dets 112 | t3 = time.time() 113 | detect_time = t2 - t1 114 | nms_time = t3 - t2 115 | forward_time = t4 - t1 116 | if idx % 10 == 0: 117 | print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s {:.3f}s'.format( 118 | i + 1, num_images, forward_time, detect_time, nms_time)) 119 | 120 | with open(det_file, 'wb') as f: 121 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 122 | print('Evaluating detections') 123 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 124 | print("detect time: ", time.time() - st) 125 | 126 | 127 | def main(): 128 | global args 129 | args = arg_parse() 130 | cfg_from_file(args.cfg_file) 131 | bgr_means = cfg.TRAIN.BGR_MEAN 132 | dataset_name = cfg.DATASETS.DATA_TYPE 133 | batch_size = cfg.TEST.BATCH_SIZE 134 | num_workers = args.num_workers 135 | if cfg.DATASETS.DATA_TYPE == 'VOC': 136 | trainvalDataset = VOCDetection 137 | top_k = 200 138 | else: 139 | trainvalDataset = COCODetection 140 | top_k = 300 141 | dataroot = cfg.DATASETS.DATAROOT 142 | if cfg.MODEL.SIZE == '300': 143 | size_cfg = cfg.SMALL 144 | else: 145 | size_cfg = cfg.BIG 146 | valSet = cfg.DATASETS.VAL_TYPE 147 | num_classes = cfg.MODEL.NUM_CLASSES 148 | save_folder = args.save_folder 149 | if not os.path.exists(save_folder): 150 | os.mkdir(save_folder) 151 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 152 | cfg.TRAIN.TRAIN_ON = False 153 | net = SSD(cfg) 154 | 155 | checkpoint = torch.load(args.weights) 156 | state_dict = checkpoint['model'] 157 | from collections import OrderedDict 158 | new_state_dict = OrderedDict() 159 | for k, v in state_dict.items(): 160 | head = k[:7] 161 | if head == 'module.': 162 | name = k[7:] # remove `module.` 163 | else: 164 | name = k 165 | new_state_dict[name] = v 166 | net.load_state_dict(new_state_dict) 167 | detector = Detect(cfg) 168 | ValTransform = BaseTransform(size_cfg.IMG_WH, bgr_means, (2, 0, 1)) 169 | val_dataset = trainvalDataset(dataroot, valSet, ValTransform, "val") 170 | val_loader = data.DataLoader( 171 | val_dataset, 172 | batch_size, 173 | shuffle=False, 174 | num_workers=num_workers, 175 | collate_fn=detection_collate) 176 | top_k = 300 177 | thresh = cfg.TEST.CONFIDENCE_THRESH 178 | eval_net( 179 | val_dataset, 180 | val_loader, 181 | net, 182 | detector, 183 | cfg, 184 | ValTransform, 185 | top_k, 186 | thresh=thresh, 187 | batch_size=batch_size) 188 | 189 | 190 | if __name__ == '__main__': 191 | st = time.time() 192 | main() 193 | print("final time", time.time() - st) 194 | -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/images/dog.jpg -------------------------------------------------------------------------------- /images/eagle.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/images/eagle.jpg -------------------------------------------------------------------------------- /images/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/images/person.jpg -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | # from .refine_prior_box import RefinePriorBox 4 | 5 | 6 | __all__ = ['Detect', 'PriorBox'] 7 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from torch.autograd import Function 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from utils.box_utils import decode, center_size 8 | 9 | 10 | class Detect(Function): 11 | """At test time, Detect is the final layer of SSD. Decode location preds, 12 | apply non-maximum suppression to location predictions based on conf 13 | scores and threshold to a top_k number of output predictions for both 14 | confidence score and locations. 15 | """ 16 | 17 | def __init__(self, cfg): 18 | self.cfg = cfg 19 | self.num_classes = cfg.MODEL.NUM_CLASSES 20 | #self.thresh = thresh 21 | self.size = cfg.MODEL.SIZE 22 | if self.size == '300': 23 | size_cfg = cfg.SMALL 24 | else: 25 | size_cfg = cfg.BIG 26 | # Parameters used in nms. 
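        # VARIANCE feeds decode() in forward(): in the usual SSD encoding the
        # center offsets are scaled by variance[0] and the log-size offsets by
        # variance[1] before being applied to the priors.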
27 | self.variance = size_cfg.VARIANCE 28 | self.object_score = cfg.MODEL.OBJECT_SCORE 29 | 30 | def forward(self, predictions): 31 | """ 32 | Args: 33 | loc_data: (tensor) Loc preds from loc layers 34 | Shape: [batch,num_priors*4] 35 | conf_data: (tensor) Shape: Conf preds from conf layers 36 | Shape: [batch*num_priors,num_classes] 37 | prior_data: (tensor) Prior boxes and variances from priorbox layers 38 | Shape: [1,num_priors,4] 39 | """ 40 | # loc, conf, priors = predictions 41 | if self.cfg.MODEL.REFINE: 42 | arm_loc, arm_conf, loc, conf, priors = predictions 43 | arm_conf = F.softmax(arm_conf.view(-1, 2), 1) 44 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 45 | arm_loc_data = arm_loc.data 46 | arm_conf_data = arm_conf.data 47 | arm_object_conf = arm_conf_data[:, 1:] 48 | no_object_index = arm_object_conf <= self.object_score 49 | conf.data[no_object_index.expand_as(conf.data)] = 0 50 | else: 51 | loc, conf, priors = predictions 52 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 53 | loc_data = loc.data 54 | conf_data = conf.data 55 | # prior_data = priors.data 56 | prior_data = priors[:loc_data.size(1), :] 57 | 58 | num = loc_data.size(0) # batch size 59 | 60 | self.num_priors = prior_data.size(0) 61 | 62 | self.boxes = torch.zeros(num, self.num_priors, 4) 63 | self.scores = torch.zeros(num, self.num_priors, self.num_classes) 64 | conf_preds = conf_data.view(num, self.num_priors, self.num_classes) 65 | batch_prior = prior_data.view(-1, self.num_priors, 4).expand( 66 | (num, self.num_priors, 4)) 67 | batch_prior = batch_prior.contiguous().view(-1, 4) 68 | if self.cfg.MODEL.REFINE: 69 | default = decode( 70 | arm_loc_data.view(-1, 4), batch_prior, self.variance) 71 | default = center_size(default) 72 | decoded_boxes = decode( 73 | loc_data.view(-1, 4), default, self.variance) 74 | else: 75 | decoded_boxes = decode( 76 | loc_data.view(-1, 4), batch_prior, self.variance) 77 | 78 | self.scores = conf_preds.view(num, self.num_priors, self.num_classes) 79 | self.boxes = decoded_boxes.view(num, self.num_priors, 4) 80 | return self.boxes, self.scores -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | 5 | 6 | class PriorBox(object): 7 | """Compute priorbox coordinates in center-offset form for each source 8 | feature map. 9 | Note: 10 | This 'layer' has changed between versions of the original SSD 11 | paper, so we include both versions, but note v2 is the most tested and most 12 | recent version of the paper. 
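    Worked example: with IMG_WH [320, 320] and STEPS [[8, 8], ...], the first
    feature map is 40x40; the prior at cell (i, j) is centered at
    cx = (j + 0.5) / 40, cy = (i + 0.5) / 40, and its aspect-ratio-1 box has
    relative size MIN_SIZES[0] / 320 = 30 / 320 = 0.09375.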
13 | 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(PriorBox, self).__init__() 18 | self.size = cfg.MODEL.SIZE 19 | if self.size == '300': 20 | size_cfg = cfg.SMALL 21 | else: 22 | size_cfg = cfg.BIG 23 | self.img_wh = size_cfg.IMG_WH 24 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 25 | self.feature_maps = size_cfg.FEATURE_MAPS 26 | self.variance = size_cfg.VARIANCE or [0.1] 27 | self.min_sizes = size_cfg.MIN_SIZES 28 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 29 | if self.use_max_sizes: 30 | self.max_sizes = size_cfg.MAX_SIZES 31 | self.steps = size_cfg.STEPS 32 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 33 | self.clip = size_cfg.CLIP 34 | for v in self.variance: 35 | if v <= 0: 36 | raise ValueError('Variances must be greater than 0') 37 | 38 | def forward(self): 39 | mean = [] 40 | for k, f in enumerate(self.feature_maps): 41 | grid_h, grid_w = f[1], f[0] 42 | for i in range(grid_h): 43 | for j in range(grid_w): 44 | f_k_h = self.img_wh[1] / self.steps[k][1] 45 | f_k_w = self.img_wh[0] / self.steps[k][0] 46 | # unit center x,y 47 | cx = (j + 0.5) / f_k_w 48 | cy = (i + 0.5) / f_k_h 49 | 50 | # aspect_ratio: 1 51 | # rel size: min_size 52 | s_k_h = self.min_sizes[k] / self.img_wh[1] 53 | s_k_w = self.min_sizes[k] / self.img_wh[0] 54 | mean += [cx, cy, s_k_w, s_k_h] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | if self.use_max_sizes: 59 | s_k_prime_w = sqrt( 60 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 61 | s_k_prime_h = sqrt( 62 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 63 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 64 | 65 | for ar in self.aspect_ratios[k]: 66 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 67 | 68 | # back to torch land 69 | output = torch.Tensor(mean).view(-1, 4) 70 | if self.clip: 71 | output.clamp_(max=1, min=0) 72 | # print(output.size()) 73 | return output 74 | -------------------------------------------------------------------------------- /layers/functions/prior_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from math import ceil 4 | import torch.nn as nn 5 | from itertools import product as product 6 | 7 | 8 | class PriorLayer(nn.Module): 9 | def __init__(self, cfg): 10 | super(PriorLayer, self).__init__() 11 | self.size = cfg.MODEL.SIZE 12 | if self.size == '300': 13 | size_cfg = cfg.SMALL 14 | else: 15 | size_cfg = cfg.BIG 16 | self.img_wh = size_cfg.IMG_WH 17 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 18 | self.feature_maps = size_cfg.FEATURE_MAPS 19 | self.variance = size_cfg.VARIANCE or [0.1] 20 | self.min_sizes = size_cfg.MIN_SIZES 21 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 22 | if self.use_max_sizes: 23 | self.max_sizes = size_cfg.MAX_SIZES 24 | self.steps = size_cfg.STEPS 25 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 26 | self.clip = size_cfg.CLIP 27 | for v in self.variance: 28 | if v <= 0: 29 | raise ValueError('Variances must be greater than 0') 30 | 31 | def forward(self, img_wh, feature_maps_wh): 32 | self.img_wh = img_wh 33 | self.feature_maps_wh = feature_maps_wh 34 | mean = [] 35 | for k, f in enumerate(self.feature_maps_wh): 36 | grid_h, grid_w = f[1], f[0] 37 | for i in range(grid_h): 38 | for j in range(grid_w): 39 | f_k_h = self.img_wh[1] / self.steps[k][1] 40 | f_k_w = self.img_wh[0] / self.steps[k][0] 41 | # unit center x,y 42 | cx = (j + 0.5) / f_k_w 43 | cy = (i + 0.5) / f_k_h 44 | 45 | # aspect_ratio: 1 46 | # rel size: min_size 47 | s_k_h = self.min_sizes[k] / 
self.img_wh[1] 48 | s_k_w = self.min_sizes[k] / self.img_wh[0] 49 | mean += [cx, cy, s_k_w, s_k_h] 50 | 51 | # aspect_ratio: 1 52 | # rel size: sqrt(s_k * s_(k+1)) 53 | if self.use_max_sizes: 54 | s_k_prime_w = sqrt( 55 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 56 | s_k_prime_h = sqrt( 57 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 58 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 59 | 60 | for ar in self.aspect_ratios[k]: 61 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 62 | 63 | output = torch.Tensor(mean).view(-1, 4) 64 | if self.clip: 65 | output.clamp_(max=1, min=0) 66 | return output 67 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .weight_smooth_l1_loss import WeightSmoothL1Loss 2 | from .weight_softmax_loss import WeightSoftmaxLoss 3 | from .multibox_loss import MultiBoxLoss 4 | from .refine_multibox_loss import RefineMultiBoxLoss 5 | from .focal_loss_sigmoid import FocalLossSigmoid 6 | from .focal_loss_softmax import FocalLossSoftmax 7 | 8 | 9 | 10 | __all__ = ['MultiBoxLoss', 'WeightSoftmaxLoss', ] 11 | -------------------------------------------------------------------------------- /layers/modules/focal_loss_sigmoid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSigmoid(nn.Module): 11 | ''' 12 | sigmoid version focal loss 13 | ''' 14 | 15 | def __init__(self, alpha=0.25, gamma=2, size_average=False): 16 | super(FocalLossSigmoid, self).__init__() 17 | self.alpha = alpha 18 | self.gamma = gamma 19 | self.size_average = size_average 20 | 21 | def forward(self, inputs, targets): 22 | N = inputs.size(0) 23 | C = inputs.size(1) 24 | P = torch.sigmoid(inputs) 25 | alpha_mask = self.alpha * targets 26 | loss_pos = -1. * torch.pow( 27 | 1 - P, self.gamma) * torch.log(P) * targets * alpha_mask 28 | loss_neg = -1. 
* torch.pow(P, self.gamma) * torch.log(1 - P) * ( 29 | 1 - targets) * (1 - alpha_mask) 30 | batch_loss = loss_neg + loss_pos 31 | if self.size_average: 32 | loss = batch_loss.mean() 33 | else: 34 | loss = batch_loss.sum() 35 | return loss 36 | -------------------------------------------------------------------------------- /layers/modules/focal_loss_softmax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSoftmax(nn.Module): 11 | ''' 12 | softmax version focal loss 13 | ''' 14 | 15 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 16 | super(FocalLossSoftmax, self).__init__() 17 | if alpha is None: 18 | self.alpha = Variable(torch.ones(class_num, 1)) 19 | else: 20 | if isinstance(alpha, Variable): 21 | self.alpha = alpha 22 | else: 23 | self.alpha = Variable(alpha) 24 | self.gamma = gamma 25 | self.class_num = class_num 26 | self.size_average = size_average 27 | 28 | def forward(self, inputs, targets): 29 | N = inputs.size(0) 30 | C = inputs.size(1) 31 | P = F.softmax(inputs) 32 | 33 | class_mask = inputs.data.new(N, C).fill_(0) 34 | class_mask = Variable(class_mask) 35 | ids = targets.view(-1, 1) 36 | class_mask.scatter_(1, ids.data, 1.) 37 | 38 | if inputs.is_cuda and not self.alpha.is_cuda: 39 | self.alpha = self.alpha.cuda() 40 | alpha = self.alpha[ids.data.view(-1)] 41 | probs = (P * class_mask).sum(1).view(-1, 1) 42 | log_p = probs.log() 43 | batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p 44 | 45 | if self.size_average: 46 | loss = batch_loss.mean() 47 | else: 48 | loss = batch_loss.sum() 49 | return loss -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.autograd import Variable 6 | from utils.box_utils import match, log_sum_exp 7 | from .focal_loss_softmax import FocalLossSoftmax 8 | from .focal_loss_sigmoid import FocalLossSigmoid 9 | 10 | GPU = False 11 | if torch.cuda.is_available(): 12 | GPU = True 13 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 14 | 15 | 16 | class MultiBoxLoss(nn.Module): 17 | """SSD Weighted Loss Function 18 | Compute Targets: 19 | 1) Produce Confidence Target Indices by matching ground truth boxes 20 | with (default) 'priorboxes' that have jaccard index > threshold parameter 21 | (default threshold: 0.5). 22 | 2) Produce localization target by 'encoding' variance into offsets of ground 23 | truth boxes and their matched 'priorboxes'. 24 | 3) Hard negative mining to filter the excessive number of negative examples 25 | that comes with using a large number of default bounding boxes. 26 | (default negative:positive ratio 3:1) 27 | Objective Loss: 28 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 29 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 30 | weighted by α which is set to 1 by cross val. 31 | Args: 32 | c: class confidences, 33 | l: predicted boxes, 34 | g: ground truth boxes 35 | N: number of matched default boxes 36 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
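    For example, with the default NEG_RATIO of 3, an image whose priors match
    20 ground truth boxes keeps only the 60 highest-loss negative priors in
    the confidence loss and ignores the rest.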
37 | """ 38 | 39 | def __init__(self, cfg): 40 | super(MultiBoxLoss, self).__init__() 41 | self.cfg = cfg 42 | self.size = cfg.MODEL.SIZE 43 | if self.size == '300': 44 | size_cfg = cfg.SMALL 45 | else: 46 | size_cfg = cfg.BIG 47 | self.variance = size_cfg.VARIANCE 48 | self.num_classes = cfg.MODEL.NUM_CLASSES 49 | self.threshold = cfg.TRAIN.OVERLAP 50 | self.OHEM = cfg.TRAIN.OHEM 51 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 52 | self.variance = size_cfg.VARIANCE 53 | if cfg.TRAIN.FOCAL_LOSS: 54 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 55 | self.focaloss = FocalLossSoftmax( 56 | self.num_classes, gamma=2, size_average=False) 57 | else: 58 | self.focaloss = FocalLossSigmoid() 59 | 60 | def forward(self, predictions, targets): 61 | """Multibox Loss 62 | Args: 63 | predictions (tuple): A tuple containing loc preds, conf preds, 64 | and prior boxes from SSD net. 65 | conf shape: torch.size(batch_size,num_priors,num_classes) 66 | loc shape: torch.size(batch_size,num_priors,4) 67 | priors shape: torch.size(num_priors,4) 68 | 69 | ground_truth (tensor): Ground truth boxes and labels for a batch, 70 | shape: [batch_size,num_objs,5] (last idx is the label). 71 | """ 72 | loc_data, conf_data, priors = predictions 73 | num = loc_data.size(0) 74 | priors = priors[:loc_data.size(1), :] 75 | num_priors = (priors.size(0)) 76 | num_classes = self.num_classes 77 | loc_t = torch.Tensor(num, num_priors, 4) 78 | conf_t = torch.LongTensor(num, num_priors) 79 | for idx in range(num): 80 | truths = targets[idx][:, :-1].data 81 | labels = targets[idx][:, -1].data 82 | if self.num_classes == 2: 83 | labels = labels > 0 84 | defaults = priors.data 85 | match(self.threshold, truths, defaults, self.variance, labels, 86 | loc_t, conf_t, idx) 87 | loc_t = loc_t.cuda() 88 | conf_t = conf_t.cuda() 89 | 90 | pos = conf_t > 0 91 | num_pos = pos.sum(1, keepdim=True) 92 | 93 | if self.OHEM: 94 | # Compute max conf across batch for hard negative mining 95 | batch_conf = conf_data.view(-1, self.num_classes) 96 | 97 | loss_hard = log_sum_exp(batch_conf) - batch_conf.gather( 98 | 1, conf_t.view(-1, 1)) 99 | # Hard Negative Mining 100 | loss_hard[pos.view(-1, 1)] = 0 # filter out pos boxes for now 101 | loss_hard = loss_hard.view(num, -1) 102 | _, loss_idx = loss_hard.sort(1, descending=True) 103 | _, idx_rank = loss_idx.sort(1) 104 | num_pos = pos.long().sum(1, keepdim=True) 105 | if num_pos.data.sum() > 0: 106 | num_neg = torch.clamp( 107 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 108 | else: 109 | fake_num_pos = torch.ones(32, 1).long() * 15 110 | num_neg = torch.clamp( 111 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 112 | neg = idx_rank < num_neg.expand_as(idx_rank) 113 | 114 | # Confidence Loss Including Positive and Negative Examples 115 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 116 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 117 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 118 | -1, self.num_classes) 119 | targets_weighted = conf_t[(pos + neg).gt(0)] 120 | loss_c = F.cross_entropy( 121 | conf_p, targets_weighted, size_average=False) 122 | else: 123 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 124 | # Localization Loss (Smooth L1) 125 | # Shape: [batch,num_priors,4] 126 | if num_pos.data.sum() > 0: 127 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 128 | loc_p = loc_data[pos_idx].view(-1, 4) 129 | loc_t = loc_t[pos_idx].view(-1, 4) 130 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 131 | N = num_pos.data.sum() 132 | else: 133 | 
loss_l = torch.zeros(1) 134 | N = 1.0 135 | loss_l /= float(N) 136 | loss_c /= float(N) 137 | return loss_l, loss_c 138 | -------------------------------------------------------------------------------- /layers/modules/refine_multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from torch.autograd import Variable 9 | from utils.box_utils import match, log_sum_exp, refine_match 10 | from layers.modules import WeightSoftmaxLoss, WeightSmoothL1Loss 11 | GPU = False 12 | if torch.cuda.is_available(): 13 | GPU = True 14 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 15 | 16 | 17 | class RefineMultiBoxLoss(nn.Module): 18 | """SSD Weighted Loss Function 19 | Compute Targets: 20 | 1) Produce Confidence Target Indices by matching ground truth boxes 21 | with (default) 'priorboxes' that have jaccard index > threshold parameter 22 | (default threshold: 0.5). 23 | 2) Produce localization target by 'encoding' variance into offsets of ground 24 | truth boxes and their matched 'priorboxes'. 25 | 3) Hard negative mining to filter the excessive number of negative examples 26 | that comes with using a large number of default bounding boxes. 27 | (default negative:positive ratio 3:1) 28 | Objective Loss: 29 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 30 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 31 | weighted by α which is set to 1 by cross val. 32 | Args: 33 | c: class confidences, 34 | l: predicted boxes, 35 | g: ground truth boxes 36 | N: number of matched default boxes 37 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 38 | """ 39 | 40 | def __init__(self, cfg, num_classes): 41 | super(RefineMultiBoxLoss, self).__init__() 42 | self.cfg = cfg 43 | self.size = cfg.MODEL.SIZE 44 | if self.size == '300': 45 | size_cfg = cfg.SMALL 46 | else: 47 | size_cfg = cfg.BIG 48 | self.variance = size_cfg.VARIANCE 49 | self.num_classes = num_classes 50 | self.threshold = cfg.TRAIN.OVERLAP 51 | self.OHEM = cfg.TRAIN.OHEM 52 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 53 | self.object_score = cfg.MODEL.OBJECT_SCORE 54 | self.variance = size_cfg.VARIANCE 55 | if cfg.TRAIN.FOCAL_LOSS: 56 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 57 | self.focaloss = FocalLossSoftmax( 58 | self.num_classes, gamma=2, size_average=False) 59 | else: 60 | self.focaloss = FocalLossSigmoid() 61 | 62 | def forward(self, 63 | predictions, 64 | targets, 65 | use_arm=False, 66 | filter_object=False, 67 | debug=False): 68 | """Multibox Loss 69 | Args: 70 | predictions (tuple): A tuple containing loc preds, conf preds, 71 | and prior boxes from SSD net. 72 | conf shape: torch.size(batch_size,num_priors,num_classes) 73 | loc shape: torch.size(batch_size,num_priors,4) 74 | priors shape: torch.size(num_priors,4) 75 | 76 | ground_truth (tensor): Ground truth boxes and labels for a batch, 77 | shape: [batch_size,num_objs,5] (last idx is the label). 
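        use_arm (bool): when True, priors are first refined by the ARM
            localization output before matching (refine_match), and
            filter_object additionally removes positives whose ARM
            objectness score is at most OBJECT_SCORE.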
78 | """ 79 | # arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 80 | if use_arm: 81 | arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 82 | else: 83 | loc_data, conf_data, _, _, priors = predictions 84 | num = loc_data.size(0) 85 | priors = priors[:loc_data.size(1), :] 86 | num_priors = (priors.size(0)) 87 | num_classes = self.num_classes 88 | 89 | # match priors (default boxes) and ground truth boxes 90 | loc_t = torch.Tensor(num, num_priors, 4) 91 | conf_t = torch.LongTensor(num, num_priors) 92 | defaults = priors.data 93 | for idx in range(num): 94 | truths = targets[idx][:, :-1].data 95 | labels = targets[idx][:, -1].data 96 | if self.num_classes == 2: 97 | labels = labels > 0 98 | if use_arm: 99 | bbox_weight = refine_match( 100 | self.threshold, 101 | truths, 102 | defaults, 103 | self.variance, 104 | labels, 105 | loc_t, 106 | conf_t, 107 | idx, 108 | arm_loc_data[idx].data, 109 | use_weight=False) 110 | else: 111 | match(self.threshold, truths, defaults, self.variance, labels, 112 | loc_t, conf_t, idx) 113 | 114 | loc_t = loc_t.cuda() 115 | conf_t = conf_t.cuda() 116 | # wrap targets 117 | loc_t = Variable(loc_t, requires_grad=False) 118 | conf_t = Variable(conf_t, requires_grad=False) 119 | 120 | if use_arm and filter_object: 121 | P = F.softmax(arm_conf_data, 2) 122 | arm_conf_data_temp = P[:, :, 1] 123 | object_score_index = arm_conf_data_temp <= self.object_score 124 | pos = conf_t > 0 125 | pos[object_score_index.detach()] = 0 126 | else: 127 | pos = conf_t > 0 128 | num_pos = pos.sum(1, keepdim=True) 129 | if debug: 130 | if use_arm: 131 | print("odm pos num: ", str(loc_t.size(0)), str(loc_t.size(1))) 132 | else: 133 | print("arm pos num", str(loc_t.size(0)), str(loc_t.size(1))) 134 | 135 | if self.OHEM: 136 | # Compute max conf across batch for hard negative mining 137 | batch_conf = conf_data.view(-1, self.num_classes) 138 | 139 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather( 140 | 1, conf_t.view(-1, 1)) 141 | 142 | # Hard Negative Mining 143 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 144 | loss_c = loss_c.view(num, -1) 145 | _, loss_idx = loss_c.sort(1, descending=True) 146 | _, idx_rank = loss_idx.sort(1) 147 | num_pos = pos.long().sum(1, keepdim=True) 148 | 149 | if num_pos.data.sum() > 0: 150 | num_neg = torch.clamp( 151 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 152 | else: 153 | fake_num_pos = torch.ones(32, 1).long() * 15 154 | num_neg = torch.clamp( 155 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 156 | neg = idx_rank < num_neg.expand_as(idx_rank) 157 | 158 | # Confidence Loss Including Positive and Negative Examples 159 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 160 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 161 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 162 | -1, self.num_classes) 163 | 164 | targets_weighted = conf_t[(pos + neg).gt(0)] 165 | loss_c = F.cross_entropy( 166 | conf_p, targets_weighted, size_average=False) 167 | else: 168 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 169 | 170 | # Localization Loss (Smooth L1) 171 | # Shape: [batch,num_priors,4] 172 | if num_pos.data.sum() > 0: 173 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 174 | loc_p = loc_data[pos_idx].view(-1, 4) 175 | loc_t = loc_t[pos_idx].view(-1, 4) 176 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 177 | N = num_pos.data.sum() 178 | else: 179 | loss_l = torch.zeros(1) 180 | N = 1.0 181 | 182 | loss_l /= float(N) 183 | loss_c /= 
float(N) 184 | return loss_l, loss_c 185 | -------------------------------------------------------------------------------- /layers/modules/weight_smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSmoothL1Loss(nn.Module): 11 | def __init__(self, class_num, size_average=False): 12 | super(WeightSmoothL1Loss, self).__init__() 13 | self.class_num = class_num 14 | self.size_average = size_average 15 | 16 | def forward(self, inputs, targets, weights): 17 | N = inputs.size(0) 18 | loc_num = inputs.size(1) 19 | abs_out = torch.abs(inputs - targets) 20 | 21 | if inputs.is_cuda and not weights.is_cuda: 22 | weights = weights.cuda() 23 | 24 | weights = weights.view(-1, 1) 25 | 26 | weights = torch.cat((weights, weights, weights, weights), 1) 27 | mask_big = abs_out >= 1. 28 | mask_small = abs_out < 1. 29 | loss_big = weights[mask_big] * (abs_out[mask_big] - 0.5) 30 | loss_small = weights[mask_small] * 0.5 * torch.pow( 31 | abs_out[mask_small], 2) 32 | loss_sum = loss_big.sum() + loss_small.sum() 33 | 34 | if self.size_average: 35 | loss = loss_sum / N * loc_num 36 | else: 37 | loss = loss_sum 38 | return loss 39 | -------------------------------------------------------------------------------- /layers/modules/weight_softmax_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSoftmaxLoss(nn.Module): 11 | def __init__(self, class_num, gamma=2, size_average=True): 12 | super(WeightSoftmaxLoss, self).__init__() 13 | # if isinstance(weights, Variable): 14 | # self.weights = weights 15 | # else: 16 | # self.weights = Variable(weights) 17 | 18 | self.class_num = class_num 19 | self.gamma = gamma 20 | self.size_average = size_average 21 | 22 | def forward(self, inputs, targets, weights): 23 | N = inputs.size(0) 24 | C = inputs.size(1) 25 | P = F.softmax(inputs) 26 | 27 | class_mask = inputs.data.new(N, C).fill_(0) 28 | class_mask = Variable(class_mask) 29 | ids = targets.view(-1, 1) 30 | class_mask.scatter_(1, ids.data, 1.) 31 | if inputs.is_cuda and not weights.is_cuda: 32 | weights = weights.cuda() 33 | probs = (P * class_mask).sum(1).view(-1, 1) 34 | 35 | log_p = probs.log() 36 | weights = weights.view(-1, 1) 37 | batch_loss = -weights * log_p 38 | 39 | if self.size_average: 40 | loss = batch_loss.mean() 41 | else: 42 | loss = batch_loss.sum() 43 | return loss -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | # if you use anaconda3 maybe you need add this 8 | # change code like https://github.com/rbgirshick/py-faster-rcnn/issues/706 9 | mv nms/cpu_nms.cpython-36m-x86_64-linux-gnu.so nms/cpu_nms.so 10 | mv nms/gpu_nms.cpython-36m-x86_64-linux-gnu.so nms/gpu_nms.so 11 | cd .. 
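make.sh compiles the Cython/CUDA NMS extensions that nms_wrapper.py dispatches to. For reference, below is a minimal pure-Python sketch of the same greedy suppression, along the lines of utils/nms/py_cpu_nms.py; it is not a drop-in replacement for the compiled kernels, just the algorithm, and it assumes each dets row is [x1, y1, x2, y2, score] as produced in eval.py.

```python
import numpy as np

def py_cpu_nms(dets, thresh):
    """Greedy non-maximum suppression on [x1, y1, x2, y2, score] rows."""
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]
    # box areas with the integer-pixel (+1) convention used in voc_eval.py
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]  # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # intersection of the current winner with every remaining box
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # drop boxes overlapping the winner by more than thresh
        order = order[np.where(iou <= thresh)[0] + 1]
    return keep
```

The compiled cpu_nms/gpu_nms kernels implement the same loop; callers such as eval.py pass thresh = cfg.TEST.NMS_OVERLAP (0.45 in the configs above).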
12 | -------------------------------------------------------------------------------- /models/darknet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | # 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | class ConvBN(nn.Module): 31 | def __init__(self, ch_in, ch_out, kernel_size=3, stride=1, padding=0): 32 | super().__init__() 33 | self.conv = nn.Conv2d( 34 | ch_in, 35 | ch_out, 36 | kernel_size=kernel_size, 37 | stride=stride, 38 | padding=padding, 39 | bias=False) 40 | self.bn = nn.BatchNorm2d(ch_out, momentum=0.01, eps=1e-05, affine=True) 41 | 42 | def forward(self, x): 43 | return F.leaky_relu( 44 | self.bn(self.conv(x)), negative_slope=0.1, inplace=True) 45 | 46 | 47 | class DarknetBlock(nn.Module): 48 | def __init__(self, ch_in): 49 | super().__init__() 50 | ch_hid = ch_in // 2 51 | self.conv1 = ConvBN(ch_in, ch_hid, kernel_size=1, stride=1, padding=0) 52 | self.conv2 = ConvBN(ch_hid, ch_in, kernel_size=3, stride=1, padding=1) 53 | 54 | def forward(self, x): 55 | out = self.conv1(x) 56 | out = self.conv2(out) 57 | return out + x 58 | 59 | 60 | class Darknet19(nn.Module): 61 | def __init__(self, size): 62 | super().__init__() 63 | self.conv = ConvBN(3, 32, kernel_size=3, stride=1, padding=1) 64 | self.layer1 = self._make_layer1() 65 | self.layer2 = self._make_layer2() 66 | self.layer3 = self._make_layer3() 67 | self.layer4 = self._make_layer4() 68 | self.layer5 = self._make_layer5() 69 | self.extras = nn.ModuleList(add_extras(str(size), 1024)) 70 | 71 | def _make_layer1(self): 72 | layers = [ 73 | nn.MaxPool2d(kernel_size=2, stride=2), 74 | ConvBN(32, 64, kernel_size=3, stride=1, padding=1) 75 | ] 76 | return nn.Sequential(*layers) 77 | 78 | def _make_layer2(self): 79 | layers = [ 80 | nn.MaxPool2d(kernel_size=2, stride=2), 81 | ConvBN(64, 128, kernel_size=3, stride=1, padding=1), 82 | ConvBN(128, 64, kernel_size=1, stride=1), 83 | ConvBN(64, 128, kernel_size=3, stride=1, padding=1) 84 | ] 85 | return nn.Sequential(*layers) 86 | 87 | def _make_layer3(self): 88 | layers = [ 89 | nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True), 90 | ConvBN(128, 256, kernel_size=3, stride=1, padding=1), 91 | ConvBN(256, 128, kernel_size=1, stride=1), 92 | ConvBN(128, 256, kernel_size=3, stride=1, padding=1) 93 | ] 94 | return nn.Sequential(*layers) 95 | 96 | def _make_layer4(self): 97 | layers = [ 98 | nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True), 99 | ConvBN(256, 512, kernel_size=3, stride=1, padding=1), 
100 | ConvBN(512, 256, kernel_size=1, stride=1), 101 | ConvBN(256, 512, kernel_size=3, stride=1, padding=1), 102 | ConvBN(512, 256, kernel_size=1, stride=1), 103 | ConvBN(256, 512, kernel_size=3, stride=1, padding=1) 104 | ] 105 | return nn.Sequential(*layers) 106 | 107 | def _make_layer5(self): 108 | layers = [ 109 | nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True), 110 | ConvBN(512, 1024, kernel_size=3, stride=1, padding=1), 111 | ConvBN(1024, 512, kernel_size=1, stride=1), 112 | ConvBN(512, 1024, kernel_size=3, stride=1, padding=1), 113 | ConvBN(1024, 512, kernel_size=1, stride=1), 114 | ConvBN(512, 1024, kernel_size=3, stride=1, padding=1) 115 | ] 116 | return nn.Sequential(*layers) 117 | 118 | def forward(self, x): 119 | out = self.conv(x) 120 | c1 = self.layer1(out) 121 | c2 = self.layer2(c1) 122 | c3 = self.layer3(c2) 123 | c4 = self.layer4(c3) 124 | c5 = self.layer5(c4) 125 | sources = [c3, c4, c5] 126 | x = c5 127 | for k, v in enumerate(self.extras): 128 | x = F.relu(v(x), inplace=True) 129 | if k % 2 == 1: 130 | sources.append(x) 131 | return sources 132 | 133 | 134 | class Darknet53(nn.Module): 135 | def __init__(self, num_blocks, size): 136 | super().__init__() 137 | self.conv = ConvBN(3, 32, kernel_size=3, stride=1, padding=1) 138 | self.layer1 = self._make_layer(32, num_blocks[0], stride=2) 139 | self.layer2 = self._make_layer(64, num_blocks[1], stride=2) 140 | self.layer3 = self._make_layer(128, num_blocks[2], stride=2) 141 | self.layer4 = self._make_layer(256, num_blocks[3], stride=2) 142 | self.layer5 = self._make_layer(512, num_blocks[4], stride=2) 143 | self.extras = nn.ModuleList(add_extras(str(size), 1024)) 144 | self._init_modules() 145 | 146 | def _make_layer(self, ch_in, num_blocks, stride=1): 147 | layers = [ConvBN(ch_in, ch_in * 2, stride=stride, padding=1)] 148 | for i in range(num_blocks): 149 | layers.append(DarknetBlock(ch_in * 2)) 150 | return nn.Sequential(*layers) 151 | 152 | def _init_modules(self): 153 | self.extras.apply(weights_init) 154 | 155 | def forward(self, x): 156 | out = self.conv(x) 157 | c1 = self.layer1(out) 158 | c2 = self.layer2(c1) 159 | c3 = self.layer3(c2) 160 | c4 = self.layer4(c3) 161 | c5 = self.layer5(c4) 162 | sources = [c3, c4, c5] 163 | x = c5 164 | for k, v in enumerate(self.extras): 165 | x = F.relu(v(x), inplace=True) 166 | if k % 2 == 1: 167 | sources.append(x) 168 | return sources 169 | 170 | 171 | def SSDarknet53(size, channel_size='48'): 172 | return Darknet53([1, 2, 8, 8, 4], size) 173 | 174 | 175 | def SSDarknet19(size, channel_size='48'): 176 | return Darknet19(size) 177 | 178 | 179 | if __name__ == "__main__": 180 | import os 181 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 182 | model3 = SSDarknet19(size=300) 183 | with torch.no_grad(): 184 | model3.eval() 185 | x = torch.randn(16, 3, 300, 300) 186 | model3.cuda() 187 | model3(x.cuda()) 188 | import time 189 | st = time.time() 190 | for i in range(100): 191 | model3(x.cuda()) 192 | print(time.time() - st) 193 | -------------------------------------------------------------------------------- /models/drf_res.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import models.dense_conv 8 | from torch.autograd import Variable 9 | from models.model_helper import weights_init 10 | 11 | 12 | def add_extras(size, in_channel, batch_norm=False): 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, 
kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | class Bottleneck(nn.Module): 31 | expansion = 4 32 | 33 | def __init__(self, in_planes, planes, stride=1): 34 | super(Bottleneck, self).__init__() 35 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm2d(planes) 37 | self.conv2 = nn.Conv2d( 38 | planes, 39 | planes, 40 | kernel_size=3, 41 | stride=stride, 42 | padding=1, 43 | bias=False) 44 | self.bn2 = nn.BatchNorm2d(planes) 45 | self.conv3 = nn.Conv2d( 46 | planes, self.expansion * planes, kernel_size=1, bias=False) 47 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 48 | 49 | self.downsample = nn.Sequential() 50 | if stride != 1 or in_planes != self.expansion * planes: 51 | self.downsample = nn.Sequential( 52 | nn.Conv2d( 53 | in_planes, 54 | self.expansion * planes, 55 | kernel_size=1, 56 | stride=stride, 57 | bias=False), nn.BatchNorm2d(self.expansion * planes)) 58 | 59 | def forward(self, x): 60 | out = F.relu(self.bn1(self.conv1(x))) 61 | out = F.relu(self.bn2(self.conv2(out))) 62 | out = self.bn3(self.conv3(out)) 63 | out += self.downsample(x) 64 | out = F.relu(out) 65 | return out 66 | 67 | 68 | class DenseSSDResnet(nn.Module): 69 | def __init__(self, block, num_blocks, size='300', channel_size='48'): 70 | super(DenseSSDResnet, self).__init__() 71 | self.in_planes = 64 72 | 73 | self.conv1 = nn.Conv2d( 74 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 75 | self.bn1 = nn.BatchNorm2d(64) 76 | 77 | # Bottom-up layers 78 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 79 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 80 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 81 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 82 | 83 | self.extras = nn.ModuleList(add_extras(str(size), 2048)) 84 | 85 | dense_list = models.dense_conv.dense_list_res(channel_size, size) 86 | self.dense_list0 = nn.ModuleList(dense_list[0]) 87 | self.dense_list1 = nn.ModuleList(dense_list[1]) 88 | self.dense_list2 = nn.ModuleList(dense_list[2]) 89 | self.dense_list3 = nn.ModuleList(dense_list[3]) 90 | self.dense_list4 = nn.ModuleList(dense_list[4]) 91 | self.dense_list5 = nn.ModuleList(dense_list[5]) 92 | self.smooth1 = nn.Conv2d(2048, 512, kernel_size=3, stride=1, padding=1) 93 | self._init_modules() 94 | 95 | def _make_layer(self, block, planes, num_blocks, stride): 96 | strides = [stride] + [1] * (num_blocks - 1) 97 | layers = [] 98 | for stride in strides: 99 | layers.append(block(self.in_planes, planes, stride)) 100 | self.in_planes = planes * block.expansion 101 | return nn.Sequential(*layers) 102 | 103 | def _init_modules(self): 104 | self.extras.apply(weights_init) 105 | self.dense_list0.apply(weights_init) 106 | self.dense_list1.apply(weights_init) 107 | self.dense_list2.apply(weights_init) 108 | 
self.dense_list3.apply(weights_init) 109 | self.dense_list4.apply(weights_init) 110 | self.dense_list5.apply(weights_init) 111 | self.smooth1.apply(weights_init) 112 | 113 | def forward(self, x): 114 | # Bottom-up 115 | c1 = F.relu(self.bn1(self.conv1(x))) 116 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 117 | 118 | c2 = self.layer1(c1) 119 | dense1_p1 = self.dense_list0[0](c2) 120 | dense1_p2 = self.dense_list0[1](dense1_p1) 121 | dense1_p3 = self.dense_list0[2](dense1_p2) 122 | dense1_p1_conv = self.dense_list0[3](dense1_p1) 123 | dense1_p2_conv = self.dense_list0[4](dense1_p2) 124 | dense1_p3_conv = self.dense_list0[5](dense1_p3) 125 | 126 | c3 = self.layer2(c2) 127 | dense2_p1 = self.dense_list1[0](c3) 128 | dense2_p2 = self.dense_list1[1](dense2_p1) 129 | dense2_p3 = self.dense_list1[2](dense2_p2) 130 | dense2_p1_conv = self.dense_list1[3](dense2_p1) 131 | dense2_p2_conv = self.dense_list1[4](dense2_p2) 132 | dense2_p3_conv = self.dense_list1[5](dense2_p3) 133 | 134 | c4 = self.layer3(c3) 135 | dense3_up_conv = self.dense_list2[0](c4) 136 | dense3_up = self.dense_list2[1](dense3_up_conv) 137 | dense3_p1 = self.dense_list2[2](c4) 138 | dense3_p2 = self.dense_list2[3](dense3_p1) 139 | dense3_p1_conv = self.dense_list2[4](dense3_p1) 140 | dense3_p2_conv = self.dense_list2[5](dense3_p2) 141 | 142 | c5 = self.layer4(c4) 143 | c5_ = self.smooth1(c5) 144 | dense4_up1_conv = self.dense_list3[0](c5) 145 | dense4_up2_conv = self.dense_list3[1](c5) 146 | dense4_up1 = self.dense_list3[2](dense4_up1_conv) 147 | dense4_up2 = self.dense_list3[3](dense4_up2_conv) 148 | dense4_p = self.dense_list3[4](c5) 149 | dense4_p_conv = self.dense_list3[5](dense4_p) 150 | 151 | p6 = F.relu(self.extras[0](c5), inplace=True) 152 | p6 = F.relu(self.extras[1](p6), inplace=True) 153 | 154 | x = p6 155 | 156 | dense5_up1_conv = self.dense_list4[0](p6) 157 | dense5_up2_conv = self.dense_list4[1](p6) 158 | dense5_up3_conv = self.dense_list4[2](p6) 159 | dense5_up1 = self.dense_list4[3](dense5_up1_conv) 160 | dense5_up2 = self.dense_list4[4](dense5_up2_conv) 161 | dense5_up3 = self.dense_list4[5](dense5_up3_conv) 162 | 163 | dense_out1 = torch.cat( 164 | (dense1_p1_conv, c3, dense3_up, dense4_up2, dense5_up3), 1) 165 | dense_out1 = F.relu(self.dense_list5[0](dense_out1)) 166 | 167 | dense_out2 = torch.cat( 168 | (dense1_p2_conv, dense2_p1_conv, c4, dense4_up1, dense5_up2), 1) 169 | dense_out2 = F.relu(self.dense_list5[1](dense_out2)) 170 | 171 | dense_out3 = torch.cat( 172 | (dense1_p3_conv, dense2_p2_conv, dense3_p1_conv, c5_, dense5_up1), 173 | 1) 174 | dense_out3 = F.relu(self.dense_list5[2](dense_out3)) 175 | 176 | dense_out4 = torch.cat( 177 | (dense2_p3_conv, dense3_p2_conv, dense4_p_conv, p6), 1) 178 | dense_out4 = F.relu(self.dense_list5[3](dense_out4)) 179 | 180 | sources = [dense_out1, dense_out2, dense_out3, dense_out4] 181 | # apply extra layers and cache source layer outputs 182 | for k, v in enumerate(self.extras): 183 | if k > 1: 184 | x = F.relu(v(x), inplace=True) 185 | if k % 2 == 1: 186 | sources.append(x) 187 | 188 | return sources 189 | 190 | 191 | def DRFSSDRes50(size, channel_size='48'): 192 | return DenseSSDResnet(Bottleneck, [3, 4, 6, 3], size, channel_size) 193 | 194 | 195 | def DRFSSDRes101(size, channel_size='48'): 196 | return DenseSSDResnet(Bottleneck, [3, 4, 23, 3], size, channel_size) 197 | 198 | 199 | def DRFSSDRes152(size, channel_size='48'): 200 | return DenseSSDResnet(Bottleneck, [3, 8, 36, 3], size, channel_size) 201 | 
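Unlike darknet.py above and resnet.py below, drf_res.py has no self-test block. A minimal sketch in the same style, assuming a CUDA device is available and that the dense blocks in models/dense_conv.py (not shown here) accept a 300x300 input; for the '300' configuration the forward pass should return six feature maps.

if __name__ == "__main__":
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    model = DRFSSDRes50(size='300')
    with torch.no_grad():
        model.eval()
        x = torch.randn(1, 3, 300, 300)
        model.cuda()
        sources = model(x.cuda())
        for s in sources:
            print(s.size())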
-------------------------------------------------------------------------------- /models/mobilenetv2.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn import init 7 | from models.model_helper import weights_init 8 | 9 | 10 | def add_extras(size, in_channel, batch_norm=False): 11 | # Extra layers added to resnet for feature scaling 12 | layers = [] 13 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 14 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 15 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 16 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 17 | if size == '300': 18 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 19 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 20 | else: 21 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 22 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 23 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 24 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 25 | 26 | return layers 27 | 28 | 29 | def _make_divisible(v, divisor, min_value=None): 30 | """ 31 | This function is taken from the original tf repo. 32 | It ensures that all layers have a channel number that is divisible by 8 33 | It can be seen here: 34 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 35 | :param v: 36 | :param divisor: 37 | :param min_value: 38 | :return: 39 | """ 40 | if min_value is None: 41 | min_value = divisor 42 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 43 | # Make sure that round down does not go down by more than 10%. 44 | if new_v < 0.9 * v: 45 | new_v += divisor 46 | return new_v 47 | 48 | 49 | class LinearBottleneck(nn.Module): 50 | def __init__(self, inplanes, outplanes, stride=1, t=6, 51 | activation=nn.ReLU6): 52 | super(LinearBottleneck, self).__init__() 53 | self.conv1 = nn.Conv2d( 54 | inplanes, inplanes * t, kernel_size=1, bias=False) 55 | self.bn1 = nn.BatchNorm2d(inplanes * t) 56 | self.conv2 = nn.Conv2d( 57 | inplanes * t, 58 | inplanes * t, 59 | kernel_size=3, 60 | stride=stride, 61 | padding=1, 62 | bias=False, 63 | groups=inplanes * t) 64 | self.bn2 = nn.BatchNorm2d(inplanes * t) 65 | self.conv3 = nn.Conv2d( 66 | inplanes * t, outplanes, kernel_size=1, bias=False) 67 | self.bn3 = nn.BatchNorm2d(outplanes) 68 | self.activation = activation(inplace=True) 69 | self.stride = stride 70 | self.t = t 71 | self.inplanes = inplanes 72 | self.outplanes = outplanes 73 | 74 | def forward(self, x): 75 | residual = x 76 | 77 | out = self.conv1(x) 78 | out = self.bn1(out) 79 | out = self.activation(out) 80 | 81 | out = self.conv2(out) 82 | out = self.bn2(out) 83 | out = self.activation(out) 84 | 85 | out = self.conv3(out) 86 | out = self.bn3(out) 87 | 88 | if self.stride == 1 and self.inplanes == self.outplanes: 89 | out += residual 90 | 91 | return out 92 | 93 | 94 | class MobileNet2(nn.Module): 95 | """MobileNet2 implementation. 96 | """ 97 | 98 | def __init__(self, 99 | scale=1.0, 100 | input_size=224, 101 | t=6, 102 | in_channels=3, 103 | size=300, 104 | activation=nn.ReLU6): 105 | """ 106 | MobileNet2 constructor. 107 | :param in_channels: (int, optional): number of channels in the input tensor. 108 | Default is 3 for RGB image inputs. 
109 | :param input_size: 110 | :param num_classes: number of classes to predict. Default 111 | is 1000 for ImageNet. 112 | :param scale: 113 | :param t: 114 | :param activation: 115 | """ 116 | 117 | super(MobileNet2, self).__init__() 118 | 119 | self.scale = scale 120 | self.t = t 121 | self.activation_type = activation 122 | self.activation = activation(inplace=True) 123 | self.size = size 124 | 125 | self.num_of_channels = [32, 16, 24, 32, 64, 96, 160, 320] 126 | # assert (input_size % 32 == 0) 127 | 128 | self.c = [ 129 | _make_divisible(ch * self.scale, 8) for ch in self.num_of_channels 130 | ] 131 | self.n = [1, 1, 2, 3, 4, 3, 3, 1] 132 | self.s = [2, 1, 2, 2, 2, 1, 2, 1] 133 | self.conv1 = nn.Conv2d( 134 | in_channels, 135 | self.c[0], 136 | kernel_size=3, 137 | bias=False, 138 | stride=self.s[0], 139 | padding=1) 140 | self.bn1 = nn.BatchNorm2d(self.c[0]) 141 | # self.bottlenecks = self._make_bottlenecks() 142 | self.bottlenecks = nn.ModuleList(self._make_bottlenecks()) 143 | 144 | # Last convolution has 1280 output channels for scale <= 1 145 | self.last_conv_out_ch = 1280 if self.scale <= 1 else _make_divisible( 146 | 1280 * self.scale, 8) 147 | self.conv_last = nn.Conv2d( 148 | self.c[-1], self.last_conv_out_ch, kernel_size=1, bias=False) 149 | self.bn_last = nn.BatchNorm2d(self.last_conv_out_ch) 150 | 151 | self.extras = nn.ModuleList( 152 | add_extras(str(self.size), self.last_conv_out_ch)) 153 | self._init_modules() 154 | 155 | def _init_modules(self): 156 | self.extras.apply(weights_init) 157 | 158 | def _make_stage(self, inplanes, outplanes, n, stride, t, stage): 159 | modules = OrderedDict() 160 | stage_name = "LinearBottleneck{}".format(stage) 161 | 162 | # First module is the only one utilizing stride 163 | first_module = LinearBottleneck( 164 | inplanes=inplanes, 165 | outplanes=outplanes, 166 | stride=stride, 167 | t=t, 168 | activation=self.activation_type) 169 | modules[stage_name + "_0"] = first_module 170 | 171 | # add more LinearBottleneck depending on number of repeats 172 | for i in range(n - 1): 173 | name = stage_name + "_{}".format(i + 1) 174 | module = LinearBottleneck( 175 | inplanes=outplanes, 176 | outplanes=outplanes, 177 | stride=1, 178 | t=6, 179 | activation=self.activation_type) 180 | modules[name] = module 181 | return nn.Sequential(modules) 182 | 183 | def _make_bottlenecks(self): 184 | modules = list() 185 | stage_name = "Bottlenecks" 186 | 187 | # First module is the only one with t=1 188 | bottleneck1 = self._make_stage( 189 | inplanes=self.c[0], 190 | outplanes=self.c[1], 191 | n=self.n[1], 192 | stride=self.s[1], 193 | t=1, 194 | stage=0) 195 | modules.append(bottleneck1) 196 | 197 | # add more LinearBottleneck depending on number of repeats 198 | for i in range(1, len(self.c) - 1): 199 | name = stage_name + "_{}".format(i) 200 | module = self._make_stage( 201 | inplanes=self.c[i], 202 | outplanes=self.c[i + 1], 203 | n=self.n[i + 1], 204 | stride=self.s[i + 1], 205 | t=self.t, 206 | stage=i) 207 | modules += module 208 | 209 | return modules 210 | 211 | def forward(self, x): 212 | x = self.conv1(x) 213 | x = self.bn1(x) 214 | x = self.activation(x) 215 | 216 | sources = list() 217 | for i in range(6): 218 | x = self.bottlenecks[i](x) 219 | sources.append(x) 220 | for i in range(6, 13): 221 | x = self.bottlenecks[i](x) 222 | sources.append(x) 223 | for i in range(13, len(self.bottlenecks)): 224 | x = self.bottlenecks[i](x) 225 | x = self.conv_last(x) 226 | x = self.bn_last(x) 227 | x = self.activation(x) 228 | sources.append(x) 229 | for k, v 
in enumerate(self.extras):
230 |             x = F.relu(v(x), inplace=True)
231 |             if k % 2 == 1:
232 |                 sources.append(x)
233 |         return sources
234 | 
235 | 
236 | def SSDMobilenetv2(size, channel_size='48'):
237 |     return MobileNet2(size=size)
238 | 
239 | 
240 | if __name__ == "__main__":
241 |     import os
242 |     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
243 |     model3 = MobileNet2(size=300)
244 |     with torch.no_grad():
245 |         model3.eval()
246 |         x = torch.randn(16, 3, 300, 300)
247 |         model3.cuda()
248 |         model3(x.cuda())
249 |         import time
250 |         st = time.time()
251 |         for i in range(100):
252 |             model3(x.cuda())
253 |         print(time.time() - st)
254 |         # print(model3(x))
255 | 
--------------------------------------------------------------------------------
/models/model_builder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Written by yq_yao
3 | 
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | from layers import *
9 | import os
10 | from models.model_helper import weights_init
11 | import importlib
12 | from layers.functions.prior_layer import PriorLayer
13 | 
14 | 
15 | def get_func(func_name):
16 |     """Helper to return a function object by name. func_name must identify a
17 |     function in this module or the path to a function relative to the base
18 |     'models' package.
19 |     """
20 |     if func_name == '':
21 |         return None
22 |     try:
23 |         parts = func_name.split('.')
24 |         # Refers to a function in this module
25 |         if len(parts) == 1:
26 |             return globals()[parts[0]]
27 |         # Otherwise, assume we're referencing a module under models
28 |         module_name = 'models.' + '.'.join(parts[:-1])
29 |         module = importlib.import_module(module_name)
30 |         return getattr(module, parts[-1])
31 |     except Exception:
32 |         print('Failed to find function: %s' % func_name)
33 |         raise
34 | 
35 | 
36 | class SSD(nn.Module):
37 |     """Single Shot Multibox Architecture
38 |     The network is composed of a base VGG network followed by the
39 |     added multibox conv layers. Each multibox layer branches into
40 |         1) conv2d for class conf scores
41 |         2) conv2d for localization predictions
42 |         3) associated priorbox layer to produce default bounding
43 |            boxes specific to the layer's feature map size.
44 |     See: https://arxiv.org/pdf/1512.02325.pdf for more details.
45 | 46 | Args: 47 | phase: (string) Can be "test" or "train" 48 | base: VGG16 layers for input, size of either 300 or 500 49 | extras: extra layers that feed to multibox loc and conf layers 50 | head: "multibox head" consists of loc and conf conv layers 51 | """ 52 | 53 | def _init_modules(self): 54 | self.arm_loc.apply(weights_init) 55 | self.arm_conf.apply(weights_init) 56 | if self.cfg.MODEL.REFINE: 57 | self.odm_loc.apply(weights_init) 58 | self.odm_conf.apply(weights_init) 59 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 60 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 61 | print("load pretrain model {}".format( 62 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 63 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 64 | self.extractor.vgg.load_state_dict(weights) 65 | else: 66 | self.extractor.load_state_dict(weights, strict=False) 67 | 68 | def __init__(self, cfg): 69 | super(SSD, self).__init__() 70 | self.cfg = cfg 71 | self.size = cfg.MODEL.SIZE 72 | if self.size == '300': 73 | size_cfg = cfg.SMALL 74 | else: 75 | size_cfg = cfg.BIG 76 | self.num_classes = cfg.MODEL.NUM_CLASSES 77 | self.prior_layer = PriorLayer(cfg) 78 | self.priorbox = PriorBox(cfg) 79 | self.priors = self.priorbox.forward() 80 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size, 81 | cfg.TRAIN.CHANNEL_SIZE) 82 | if cfg.MODEL.REFINE: 83 | self.odm_channels = size_cfg.ODM_CHANNELS 84 | self.arm_num_classes = 2 85 | self.odm_loc = nn.ModuleList() 86 | self.odm_conf = nn.ModuleList() 87 | self.arm_loc = nn.ModuleList() 88 | self.arm_conf = nn.ModuleList() 89 | self.arm_channels = size_cfg.ARM_CHANNELS 90 | self.num_anchors = size_cfg.NUM_ANCHORS 91 | self.input_fixed = size_cfg.INPUT_FIXED 92 | self.arm_loc = nn.ModuleList() 93 | self.arm_conf = nn.ModuleList() 94 | for i in range(len(self.arm_channels)): 95 | if cfg.MODEL.REFINE: 96 | self.arm_loc += [ 97 | nn.Conv2d( 98 | self.arm_channels[i], 99 | self.num_anchors[i] * 4, 100 | kernel_size=3, 101 | padding=1) 102 | ] 103 | self.arm_conf += [ 104 | nn.Conv2d( 105 | self.arm_channels[i], 106 | self.num_anchors[i] * self.arm_num_classes, 107 | kernel_size=3, 108 | padding=1) 109 | ] 110 | self.odm_loc += [ 111 | nn.Conv2d( 112 | self.odm_channels[i], 113 | self.num_anchors[i] * 4, 114 | kernel_size=3, 115 | padding=1) 116 | ] 117 | self.odm_conf += [ 118 | nn.Conv2d( 119 | self.odm_channels[i], 120 | self.num_anchors[i] * self.num_classes, 121 | kernel_size=3, 122 | padding=1) 123 | ] 124 | else: 125 | self.arm_loc += [ 126 | nn.Conv2d( 127 | self.arm_channels[i], 128 | self.num_anchors[i] * 4, 129 | kernel_size=3, 130 | padding=1) 131 | ] 132 | self.arm_conf += [ 133 | nn.Conv2d( 134 | self.arm_channels[i], 135 | self.num_anchors[i] * self.num_classes, 136 | kernel_size=3, 137 | padding=1) 138 | ] 139 | if cfg.TRAIN.TRAIN_ON: 140 | self._init_modules() 141 | 142 | def forward(self, x): 143 | 144 | arm_loc = list() 145 | arm_conf = list() 146 | if self.cfg.MODEL.REFINE: 147 | odm_loc = list() 148 | odm_conf = list() 149 | arm_xs, odm_xs = self.extractor(x) 150 | for (x, l, c) in zip(odm_xs, self.odm_loc, self.odm_conf): 151 | odm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 152 | odm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 153 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 154 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 155 | else: 156 | arm_xs = self.extractor(x) 157 | img_wh = (x.size(3), x.size(2)) 158 | feature_maps_wh = [(t.size(3), t.size(2)) for t in arm_xs] 159 | for (x, l, c) in zip(arm_xs, 
self.arm_loc, self.arm_conf): 160 | arm_loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 161 | arm_conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 162 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 163 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 164 | if self.cfg.MODEL.REFINE: 165 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 166 | arm_conf.view( 167 | arm_conf.size(0), -1, self.arm_num_classes), 168 | odm_loc.view(odm_loc.size(0), -1, 4), 169 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 170 | self.priors if self.input_fixed else self.prior_layer( 171 | img_wh, feature_maps_wh)) 172 | else: 173 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 174 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 175 | self.priors if self.input_fixed else self.prior_layer( 176 | img_wh, feature_maps_wh)) 177 | return output 178 | -------------------------------------------------------------------------------- /models/refine_res.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import FpnAdapter, weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | return layers 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | "3x3 convolution with padding" 21 | return nn.Conv2d( 22 | in_planes, 23 | out_planes, 24 | kernel_size=3, 25 | stride=stride, 26 | padding=1, 27 | bias=False) 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm2d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm2d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | out += residual 56 | out = self.relu(out) 57 | 58 | return out 59 | 60 | 61 | class Bottleneck(nn.Module): 62 | expansion = 4 63 | 64 | def __init__(self, inplanes, planes, stride=1, downsample=None): 65 | super(Bottleneck, self).__init__() 66 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 67 | self.bn1 = nn.BatchNorm2d(planes) 68 | self.conv2 = nn.Conv2d( 69 | planes, 70 | planes, 71 | kernel_size=3, 72 | stride=stride, 73 | padding=1, 74 | bias=False) 75 | self.bn2 = nn.BatchNorm2d(planes) 76 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 77 | self.bn3 = nn.BatchNorm2d(planes * 4) 78 | self.relu = nn.ReLU(inplace=True) 79 | self.downsample = downsample 80 | self.stride = stride 81 | 82 | def forward(self, x): 83 | residual = x 84 | 85 | out = self.conv1(x) 86 | out = self.bn1(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv2(out) 90 | out = self.bn2(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv3(out) 94 | out 
= self.bn3(out) 95 | 96 | if self.downsample is not None: 97 | residual = self.downsample(x) 98 | 99 | out += residual 100 | out = self.relu(out) 101 | 102 | return out 103 | 104 | 105 | class RefineResnet(nn.Module): 106 | def __init__(self, block, num_blocks, size): 107 | super(RefineResnet, self).__init__() 108 | self.inplanes = 64 109 | 110 | self.conv1 = nn.Conv2d( 111 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 112 | self.bn1 = nn.BatchNorm2d(64) 113 | 114 | # Bottom-up layers 115 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 116 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 117 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 118 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 119 | self.inchannel = block.expansion * 512 120 | self.extras = nn.ModuleList(add_extras(str(size), self.inchannel)) 121 | self.smooth1 = nn.Conv2d( 122 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 123 | self.fpn = FpnAdapter([512, 1024, 512, 256], 4) 124 | self._init_modules() 125 | 126 | def _make_layer(self, block, planes, blocks, stride=1): 127 | downsample = None 128 | if stride != 1 or self.inplanes != planes * block.expansion: 129 | downsample = nn.Sequential( 130 | nn.Conv2d( 131 | self.inplanes, 132 | planes * block.expansion, 133 | kernel_size=1, 134 | stride=stride, 135 | bias=False), 136 | nn.BatchNorm2d(planes * block.expansion), 137 | ) 138 | 139 | layers = [] 140 | layers.append(block(self.inplanes, planes, stride, downsample)) 141 | self.inplanes = planes * block.expansion 142 | for i in range(1, blocks): 143 | layers.append(block(self.inplanes, planes)) 144 | 145 | return nn.Sequential(*layers) 146 | 147 | def _init_modules(self): 148 | self.extras.apply(weights_init) 149 | self.smooth1.apply(weights_init) 150 | 151 | def forward(self, x): 152 | # Bottom-up 153 | odm_sources = list() 154 | c1 = F.relu(self.bn1(self.conv1(x))) 155 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 156 | c2 = self.layer1(c1) 157 | c3 = self.layer2(c2) 158 | c4 = self.layer3(c3) 159 | c5 = self.layer4(c4) 160 | x = c5 161 | c5_ = self.smooth1(c5) 162 | arm_sources = [c3, c4, c5_] 163 | for k, v in enumerate(self.extras): 164 | x = F.relu(v(x), inplace=True) 165 | if k % 2 == 1: 166 | arm_sources.append(x) 167 | odm_sources = self.fpn(arm_sources) 168 | return arm_sources, odm_sources 169 | 170 | 171 | def RefineResnet50(size, channel_size='48'): 172 | return RefineResnet(Bottleneck, [3, 4, 6, 3], size) 173 | 174 | 175 | def RefineResnet101(size, channel_size='48'): 176 | return RefineResnet(Bottleneck, [3, 4, 23, 3], size) 177 | 178 | 179 | def RefineResnet152(size, channel_size='48'): 180 | return RefineResnet(Bottleneck, [3, 8, 36, 3], size) 181 | 182 | 183 | if __name__ == "__main__": 184 | import os 185 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 186 | model = RefineResnet50(size=300) 187 | print(model) 188 | with torch.no_grad(): 189 | model.eval() 190 | x = torch.randn(1, 3, 320, 320) 191 | model.cuda() 192 | model(x.cuda()) 193 | -------------------------------------------------------------------------------- /models/refine_vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import FpnAdapter, WeaveAdapter, 
weights_init 10 | 11 | 12 | class L2Norm(nn.Module): 13 | def __init__(self, n_channels, scale): 14 | super(L2Norm, self).__init__() 15 | self.n_channels = n_channels 16 | self.gamma = scale or None 17 | self.eps = 1e-10 18 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 19 | self.reset_parameters() 20 | 21 | def reset_parameters(self): 22 | init.constant_(self.weight, self.gamma) 23 | 24 | def forward(self, x): 25 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 26 | x = x / norm 27 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 28 | x) * x 29 | return out 30 | 31 | 32 | # This function is derived from torchvision VGG make_layers() 33 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 34 | 35 | 36 | def vgg(cfg, i, batch_norm=False): 37 | layers = [] 38 | in_channels = i 39 | for v in cfg: 40 | if v == 'M': 41 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 42 | elif v == 'C': 43 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 44 | else: 45 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 46 | if batch_norm: 47 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 48 | else: 49 | layers += [conv2d, nn.ReLU(inplace=True)] 50 | in_channels = v 51 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 52 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 53 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 54 | layers += [ 55 | pool5, conv6, 56 | nn.ReLU(inplace=True), conv7, 57 | nn.ReLU(inplace=True) 58 | ] 59 | return layers 60 | 61 | 62 | base = { 63 | '300': [ 64 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 65 | 512, 512, 512 66 | ], 67 | '512': [ 68 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 69 | 512, 512, 512 70 | ], 71 | } 72 | 73 | 74 | def add_extras(size): 75 | layers = [] 76 | layers += [nn.Conv2d(1024, 256, kernel_size=1, stride=1)] 77 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 78 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 79 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 80 | 81 | return layers 82 | 83 | 84 | # def last_layer_trans(): 85 | # return nn.Sequential(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 86 | # nn.ReLU(inplace=True), 87 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 88 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)) 89 | 90 | # def trans_layers(size): 91 | # layers = list() 92 | # layers += [nn.Sequential(nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 93 | # nn.ReLU(inplace=True), 94 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1))] 95 | # layers += [nn.Sequential(nn.Conv2d(1024, 256, kernel_size=3, stride=1, padding=1), 96 | # nn.ReLU(inplace=True), 97 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1))] 98 | # layers += [nn.Sequential(nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 99 | # nn.ReLU(inplace=True), 100 | # nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1))] 101 | 102 | # return layers 103 | 104 | # def latent_layers(size): 105 | # layers = [] 106 | # for i in range(3): 107 | # layers += [nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)] 108 | # return layers 109 | 110 | # def up_layers(size): 111 | # layers = [] 112 | # for i in range(3): 113 | # layers += [nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0)] 114 | # return layers 115 | 116 | 117 | class VGG16Extractor(nn.Module): 118 | def 
__init__(self, size, channel_size='48'): 119 | super(VGG16Extractor, self).__init__() 120 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 121 | self.extras = nn.ModuleList(add_extras(str(size))) 122 | self.L2Norm_4_3 = L2Norm(512, 10) 123 | self.L2Norm_5_3 = L2Norm(1024, 8) 124 | # self.last_layer_trans = last_layer_trans() 125 | # self.trans_layers = nn.ModuleList(trans_layers(str(size))) 126 | # self.latent_layers = nn.ModuleList(latent_layers((str(size)))) 127 | # self.up_layers = nn.ModuleList(up_layers(str(size))) 128 | self.fpn = FpnAdapter([512, 1024, 256, 256], 4) 129 | self._init_modules() 130 | 131 | def _init_modules(self): 132 | self.extras.apply(weights_init) 133 | # self.last_layer_trans.apply(weights_init) 134 | # self.trans_layers.apply(weights_init) 135 | # self.latent_layers.apply(weights_init) 136 | # self.up_layers.apply(weights_init) 137 | 138 | def forward(self, x): 139 | """Applies network layers and ops on input image(s) x. 140 | Args: 141 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 142 | Return: 143 | Depending on phase: 144 | test: 145 | Variable(tensor) of output class label predictions, 146 | confidence score, and corresponding location predictions for 147 | each object detected. Shape: [batch,topk,7] 148 | train: 149 | list of concat outputs from: 150 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 151 | 2: localization layers, Shape: [batch,num_priors*4] 152 | 3: priorbox layers, Shape: [2,num_priors*4] 153 | """ 154 | arm_sources = list() 155 | 156 | for i in range(23): 157 | x = self.vgg[i](x) 158 | #38x38 159 | c2 = x 160 | c2 = self.L2Norm_4_3(c2) 161 | arm_sources.append(c2) 162 | 163 | for k in range(23, len(self.vgg)): 164 | x = self.vgg[k](x) 165 | #19x19 166 | c3 = x 167 | c3 = self.L2Norm_5_3(c3) 168 | arm_sources.append(c3) 169 | 170 | # 10x10 171 | x = F.relu(self.extras[0](x), inplace=True) 172 | x = F.relu(self.extras[1](x), inplace=True) 173 | c4 = x 174 | arm_sources.append(c4) 175 | 176 | # 5x5 177 | x = F.relu(self.extras[2](x), inplace=True) 178 | x = F.relu(self.extras[3](x), inplace=True) 179 | c5 = x 180 | arm_sources.append(c5) 181 | 182 | if len(self.extras) > 4: 183 | x = F.relu(self.extras[4](x), inplace=True) 184 | x = F.relu(self.extras[5](x), inplace=True) 185 | c6 = x 186 | arm_sources.append(c6) 187 | 188 | # x = self.last_layer_trans(x) 189 | # odm_sources.append(x) 190 | 191 | # trans_layer_list = list() 192 | 193 | # for(p, t) in zip(arm_sources, self.trans_layers): 194 | # trans_layer_list.append(t(p)) 195 | 196 | # trans_layer_list.reverse() 197 | # for (t, u, l) in zip(trans_layer_list, self.up_layers, self.latent_layers): 198 | # x = F.relu(l(F.relu(u(x)+ t, inplace=True)), inplace=True) 199 | # odm_sources.append(x) 200 | 201 | # odm_sources.reverse() 202 | odm_sources = self.fpn(arm_sources) 203 | return arm_sources, odm_sources 204 | 205 | 206 | def refine_vgg(size, channel_size='48'): 207 | return VGG16Extractor(size) -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += 
[nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 17 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 18 | if size == '300': 19 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 20 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)] 21 | else: 22 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 23 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 24 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 25 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 26 | 27 | return layers 28 | 29 | 30 | def conv3x3(in_planes, out_planes, stride=1): 31 | "3x3 convolution with padding" 32 | return nn.Conv2d( 33 | in_planes, 34 | out_planes, 35 | kernel_size=3, 36 | stride=stride, 37 | padding=1, 38 | bias=False) 39 | 40 | 41 | class BasicBlock(nn.Module): 42 | expansion = 1 43 | 44 | def __init__(self, inplanes, planes, stride=1, downsample=None): 45 | super(BasicBlock, self).__init__() 46 | self.conv1 = conv3x3(inplanes, planes, stride) 47 | self.bn1 = nn.BatchNorm2d(planes) 48 | self.relu = nn.ReLU(inplace=True) 49 | self.conv2 = conv3x3(planes, planes) 50 | self.bn2 = nn.BatchNorm2d(planes) 51 | self.downsample = downsample 52 | self.stride = stride 53 | 54 | def forward(self, x): 55 | residual = x 56 | 57 | out = self.conv1(x) 58 | out = self.bn1(out) 59 | out = self.relu(out) 60 | 61 | out = self.conv2(out) 62 | out = self.bn2(out) 63 | 64 | if self.downsample is not None: 65 | residual = self.downsample(x) 66 | out += residual 67 | out = self.relu(out) 68 | 69 | return out 70 | 71 | 72 | class Bottleneck(nn.Module): 73 | expansion = 4 74 | 75 | def __init__(self, inplanes, planes, stride=1, downsample=None): 76 | super(Bottleneck, self).__init__() 77 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 78 | self.bn1 = nn.BatchNorm2d(planes) 79 | self.conv2 = nn.Conv2d( 80 | planes, 81 | planes, 82 | kernel_size=3, 83 | stride=stride, 84 | padding=1, 85 | bias=False) 86 | self.bn2 = nn.BatchNorm2d(planes) 87 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 88 | self.bn3 = nn.BatchNorm2d(planes * 4) 89 | self.relu = nn.ReLU(inplace=True) 90 | self.downsample = downsample 91 | self.stride = stride 92 | 93 | def forward(self, x): 94 | residual = x 95 | 96 | out = self.conv1(x) 97 | out = self.bn1(out) 98 | out = self.relu(out) 99 | 100 | out = self.conv2(out) 101 | out = self.bn2(out) 102 | out = self.relu(out) 103 | 104 | out = self.conv3(out) 105 | out = self.bn3(out) 106 | 107 | if self.downsample is not None: 108 | residual = self.downsample(x) 109 | 110 | out += residual 111 | out = self.relu(out) 112 | 113 | return out 114 | 115 | 116 | class SSDResnet(nn.Module): 117 | def __init__(self, block, num_blocks, size): 118 | super(SSDResnet, self).__init__() 119 | self.inplanes = 64 120 | 121 | self.conv1 = nn.Conv2d( 122 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 123 | self.bn1 = nn.BatchNorm2d(64) 124 | 125 | # Bottom-up layers 126 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 127 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 128 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 129 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 130 | self.inchannel = block.expansion * 512 131 | self.extras = 
nn.ModuleList(add_extras(str(size), self.inchannel)) 132 | self.smooth1 = nn.Conv2d( 133 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 134 | self._init_modules() 135 | 136 | def _make_layer(self, block, planes, blocks, stride=1): 137 | downsample = None 138 | if stride != 1 or self.inplanes != planes * block.expansion: 139 | downsample = nn.Sequential( 140 | nn.Conv2d( 141 | self.inplanes, 142 | planes * block.expansion, 143 | kernel_size=1, 144 | stride=stride, 145 | bias=False), 146 | nn.BatchNorm2d(planes * block.expansion), 147 | ) 148 | 149 | layers = [] 150 | layers.append(block(self.inplanes, planes, stride, downsample)) 151 | self.inplanes = planes * block.expansion 152 | for i in range(1, blocks): 153 | layers.append(block(self.inplanes, planes)) 154 | 155 | return nn.Sequential(*layers) 156 | 157 | def _init_modules(self): 158 | self.extras.apply(weights_init) 159 | self.smooth1.apply(weights_init) 160 | 161 | def forward(self, x): 162 | # Bottom-up 163 | c1 = F.relu(self.bn1(self.conv1(x))) 164 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 165 | c2 = self.layer1(c1) 166 | c3 = self.layer2(c2) 167 | c4 = self.layer3(c3) 168 | c5 = self.layer4(c4) 169 | x = c5 170 | c5_ = self.smooth1(c5) 171 | sources = [c3, c4, c5_] 172 | for k, v in enumerate(self.extras): 173 | x = F.relu(v(x), inplace=True) 174 | if k % 2 == 1: 175 | sources.append(x) 176 | return sources 177 | 178 | 179 | def SSDResnet18(size, channel_size='48'): 180 | return SSDResnet(BasicBlock, [2, 2, 2, 2], size) 181 | 182 | 183 | def SSDResnet34(size, channel_size='48'): 184 | return SSDResnet(BasicBlock, [3, 4, 6, 3], size) 185 | 186 | 187 | def SSDResnet50(size, channel_size='48'): 188 | return SSDResnet(Bottleneck, [3, 4, 6, 3], size) 189 | 190 | 191 | def SSDResnet101(size, channel_size='48'): 192 | return SSDResnet(Bottleneck, [3, 4, 23, 3], size) 193 | 194 | 195 | def SSDResnet152(size, channel_size='48'): 196 | return SSDResnet(Bottleneck, [3, 8, 36, 3], size) 197 | 198 | 199 | if __name__ == "__main__": 200 | import os 201 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 202 | model3 = SSDResnet18(size=300) 203 | with torch.no_grad(): 204 | model3.eval() 205 | x = torch.randn(1, 3, 300, 300) 206 | model3.cuda() 207 | model3(x.cuda()) 208 | import time 209 | st = time.time() 210 | for i in range(1): 211 | model3(x.cuda()) 212 | print(time.time() - st) 213 | # print(model3(x)) 214 | -------------------------------------------------------------------------------- /models/vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import weights_init 10 | 11 | 12 | class L2Norm(nn.Module): 13 | def __init__(self, n_channels, scale): 14 | super(L2Norm, self).__init__() 15 | self.n_channels = n_channels 16 | self.gamma = scale or None 17 | self.eps = 1e-10 18 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 19 | self.reset_parameters() 20 | 21 | def reset_parameters(self): 22 | init.constant_(self.weight, self.gamma) 23 | 24 | def forward(self, x): 25 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 26 | x = x / norm 27 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 28 | x) * x 29 | return out 30 | 31 | 32 | # This function is derived from torchvision VGG make_layers() 33 | # 
https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 34 | 35 | 36 | def vgg(cfg, i, batch_norm=False): 37 | layers = [] 38 | in_channels = i 39 | for v in cfg: 40 | if v == 'M': 41 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 42 | elif v == 'C': 43 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 44 | else: 45 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 46 | if batch_norm: 47 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 48 | else: 49 | layers += [conv2d, nn.ReLU(inplace=True)] 50 | in_channels = v 51 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 52 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 53 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 54 | layers += [ 55 | pool5, conv6, 56 | nn.ReLU(inplace=True), conv7, 57 | nn.ReLU(inplace=True) 58 | ] 59 | return layers 60 | 61 | 62 | extras_cfg = { 63 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 64 | '512': [ 65 | 256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128, 'S', 66 | 256 67 | ], 68 | } 69 | 70 | base = { 71 | '300': [ 72 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 73 | 512, 512, 512 74 | ], 75 | '512': [ 76 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 77 | 512, 512, 512 78 | ], 79 | } 80 | 81 | 82 | def add_extras(cfg, i, batch_norm=False): 83 | # Extra layers added to VGG for feature scaling 84 | layers = [] 85 | in_channels = i 86 | flag = False 87 | for k, v in enumerate(cfg): 88 | if in_channels != 'S': 89 | if v == 'S': 90 | layers += [ 91 | nn.Conv2d( 92 | in_channels, 93 | cfg[k + 1], 94 | kernel_size=(1, 3)[flag], 95 | stride=2, 96 | padding=1) 97 | ] 98 | else: 99 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 100 | flag = not flag 101 | in_channels = v 102 | return layers 103 | 104 | 105 | class VGG16Extractor(nn.Module): 106 | def __init__(self, size): 107 | super(VGG16Extractor, self).__init__() 108 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 109 | self.L2Norm = L2Norm(512, 20) 110 | self.extras = nn.ModuleList(add_extras(extras_cfg[str(size)], 1024)) 111 | self._init_modules() 112 | 113 | def _init_modules(self): 114 | self.extras.apply(weights_init) 115 | self.vgg.apply(weights_init) 116 | 117 | def forward(self, x): 118 | """Applies network layers and ops on input image(s) x. 119 | 120 | Args: 121 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 122 | 123 | Return: 124 | Depending on phase: 125 | test: 126 | Variable(tensor) of output class label predictions, 127 | confidence score, and corresponding location predictions for 128 | each object detected. 
Shape: [batch,topk,7] 129 | 130 | train: 131 | list of concat outputs from: 132 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 133 | 2: localization layers, Shape: [batch,num_priors*4] 134 | 3: priorbox layers, Shape: [2,num_priors*4] 135 | """ 136 | sources = list() 137 | 138 | # apply vgg up to conv4_3 relu 139 | for k in range(23): 140 | x = self.vgg[k](x) 141 | 142 | s = self.L2Norm(x) 143 | sources.append(s) 144 | 145 | # apply vgg up to fc7 146 | for k in range(23, len(self.vgg)): 147 | x = self.vgg[k](x) 148 | sources.append(x) 149 | 150 | # apply extra layers and cache source layer outputs 151 | for k, v in enumerate(self.extras): 152 | x = F.relu(v(x), inplace=True) 153 | if k % 2 == 1: 154 | sources.append(x) 155 | return sources 156 | 157 | 158 | def SSDVgg(size, channel_size='48'): 159 | return VGG16Extractor(size) 160 | 161 | 162 | if __name__ == "__main__": 163 | import os 164 | os.environ["CUDA_VISIBLE_DEVICES"] = "3" 165 | with torch.no_grad(): 166 | model3 = VGG16Extractor(300) 167 | model3.eval() 168 | x = torch.randn(16, 3, 300, 300) 169 | model3.cuda() 170 | model3(x.cuda()) 171 | import time 172 | st = time.time() 173 | for i in range(1000): 174 | model3(x.cuda()) 175 | print(time.time() - st) 176 | -------------------------------------------------------------------------------- /models/weave_res.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.model_helper import FpnAdapter, WeaveAdapter, weights_init 9 | 10 | 11 | def add_extras(size, in_channel, batch_norm=False): 12 | # Extra layers added to resnet for feature scaling 13 | layers = [] 14 | layers += [nn.Conv2d(in_channel, 256, kernel_size=1, stride=1)] 15 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 16 | return layers 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | "3x3 convolution with padding" 21 | return nn.Conv2d( 22 | in_planes, 23 | out_planes, 24 | kernel_size=3, 25 | stride=stride, 26 | padding=1, 27 | bias=False) 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm2d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm2d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | out += residual 56 | out = self.relu(out) 57 | 58 | return out 59 | 60 | 61 | class Bottleneck(nn.Module): 62 | expansion = 4 63 | 64 | def __init__(self, inplanes, planes, stride=1, downsample=None): 65 | super(Bottleneck, self).__init__() 66 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 67 | self.bn1 = nn.BatchNorm2d(planes) 68 | self.conv2 = nn.Conv2d( 69 | planes, 70 | planes, 71 | kernel_size=3, 72 | stride=stride, 73 | padding=1, 74 | bias=False) 75 | self.bn2 = nn.BatchNorm2d(planes) 76 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 77 | self.bn3 = 
nn.BatchNorm2d(planes * 4) 78 | self.relu = nn.ReLU(inplace=True) 79 | self.downsample = downsample 80 | self.stride = stride 81 | 82 | def forward(self, x): 83 | residual = x 84 | 85 | out = self.conv1(x) 86 | out = self.bn1(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv2(out) 90 | out = self.bn2(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv3(out) 94 | out = self.bn3(out) 95 | 96 | if self.downsample is not None: 97 | residual = self.downsample(x) 98 | 99 | out += residual 100 | out = self.relu(out) 101 | 102 | return out 103 | 104 | 105 | class WeaveResnet(nn.Module): 106 | def __init__(self, block, num_blocks, size): 107 | super(WeaveResnet, self).__init__() 108 | self.inplanes = 64 109 | 110 | self.conv1 = nn.Conv2d( 111 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 112 | self.bn1 = nn.BatchNorm2d(64) 113 | 114 | # Bottom-up layers 115 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 116 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 117 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 118 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 119 | self.inchannel = block.expansion * 512 120 | self.extras = nn.ModuleList(add_extras(str(size), self.inchannel)) 121 | self.smooth1 = nn.Conv2d( 122 | self.inchannel, 512, kernel_size=3, stride=1, padding=1) 123 | self.weave = WeaveAdapter([512, 1024, 512, 256], 4) 124 | self._init_modules() 125 | 126 | def _make_layer(self, block, planes, blocks, stride=1): 127 | downsample = None 128 | if stride != 1 or self.inplanes != planes * block.expansion: 129 | downsample = nn.Sequential( 130 | nn.Conv2d( 131 | self.inplanes, 132 | planes * block.expansion, 133 | kernel_size=1, 134 | stride=stride, 135 | bias=False), 136 | nn.BatchNorm2d(planes * block.expansion), 137 | ) 138 | 139 | layers = [] 140 | layers.append(block(self.inplanes, planes, stride, downsample)) 141 | self.inplanes = planes * block.expansion 142 | for i in range(1, blocks): 143 | layers.append(block(self.inplanes, planes)) 144 | 145 | return nn.Sequential(*layers) 146 | 147 | def _init_modules(self): 148 | self.extras.apply(weights_init) 149 | self.smooth1.apply(weights_init) 150 | 151 | def forward(self, x): 152 | # Bottom-up 153 | odm_sources = list() 154 | c1 = F.relu(self.bn1(self.conv1(x))) 155 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 156 | c2 = self.layer1(c1) 157 | c3 = self.layer2(c2) 158 | c4 = self.layer3(c3) 159 | c5 = self.layer4(c4) 160 | x = c5 161 | c5_ = self.smooth1(c5) 162 | arm_sources = [c3, c4, c5_] 163 | for k, v in enumerate(self.extras): 164 | x = F.relu(v(x), inplace=True) 165 | if k % 2 == 1: 166 | arm_sources.append(x) 167 | odm_sources = self.weave(arm_sources) 168 | return arm_sources, odm_sources 169 | 170 | 171 | def WeaveResnet50(size, channel_size='48'): 172 | return WeaveResnet(Bottleneck, [3, 4, 6, 3], size) 173 | 174 | 175 | def WeaveResnet101(size, channel_size='48'): 176 | return WeaveResnet(Bottleneck, [3, 4, 23, 3], size) 177 | 178 | 179 | def WeaveResnet152(size, channel_size='48'): 180 | return WeaveResnet(Bottleneck, [3, 8, 36, 3], size) 181 | 182 | 183 | if __name__ == "__main__": 184 | import os 185 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 186 | model = WeaveResnet50(size=300) 187 | print(model) 188 | with torch.no_grad(): 189 | model.eval() 190 | x = torch.randn(1, 3, 320, 320) 191 | model.cuda() 192 | model(x.cuda()) 193 | -------------------------------------------------------------------------------- 
/models/weave_vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import FpnAdapter, WeaveAdapter, weights_init, WeaveAdapter2 10 | # from model_helper import FpnAdapter, WeaveAdapter, weights_init, WeaveAdapter2 11 | 12 | class L2Norm(nn.Module): 13 | def __init__(self, n_channels, scale): 14 | super(L2Norm, self).__init__() 15 | self.n_channels = n_channels 16 | self.gamma = scale or None 17 | self.eps = 1e-10 18 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 19 | self.reset_parameters() 20 | 21 | def reset_parameters(self): 22 | init.constant_(self.weight, self.gamma) 23 | 24 | def forward(self, x): 25 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 26 | x = x / norm 27 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 28 | x) * x 29 | return out 30 | 31 | 32 | # This function is derived from torchvision VGG make_layers() 33 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 34 | 35 | 36 | def vgg(cfg, i, batch_norm=False): 37 | layers = [] 38 | in_channels = i 39 | for v in cfg: 40 | if v == 'M': 41 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 42 | elif v == 'C': 43 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 44 | else: 45 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 46 | if batch_norm: 47 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 48 | else: 49 | layers += [conv2d, nn.ReLU(inplace=True)] 50 | in_channels = v 51 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 52 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 53 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 54 | layers += [ 55 | pool5, conv6, 56 | nn.ReLU(inplace=True), conv7, 57 | nn.ReLU(inplace=True) 58 | ] 59 | return layers 60 | 61 | 62 | base = { 63 | '300': [ 64 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 65 | 512, 512, 512 66 | ], 67 | '512': [ 68 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 69 | 512, 512, 512 70 | ], 71 | } 72 | 73 | 74 | def add_extras(size): 75 | layers = [] 76 | layers += [nn.Conv2d(1024, 256, kernel_size=1, stride=1)] 77 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)] 78 | layers += [nn.Conv2d(256, 128, kernel_size=1, stride=1)] 79 | layers += [nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)] 80 | 81 | return layers 82 | 83 | 84 | class VGG16Extractor(nn.Module): 85 | def __init__(self, size, channel_size='48'): 86 | super(VGG16Extractor, self).__init__() 87 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 88 | self.extras = nn.ModuleList(add_extras(str(size))) 89 | self.L2Norm_4_3 = L2Norm(512, 10) 90 | self.L2Norm_5_3 = L2Norm(1024, 8) 91 | self.raw_channels = [512, 1024, 256, 256] 92 | self.weave_add_channels = [(48, 48), (48, 48), (48, 48), (48, 48)] 93 | self.weave_channels = [256, 256, 256, 256] 94 | # self.weave = WeaveAdapter([512, 1024, 256, 256], 4) 95 | self.weave = WeaveAdapter2(self.raw_channels, self.weave_add_channels, self.weave_channels) 96 | self._init_modules() 97 | 98 | def _init_modules(self): 99 | self.extras.apply(weights_init) 100 | 101 | def forward(self, x): 102 | """Applies network layers and ops on input image(s) x. 
103 | Args: 104 | x: input image or batch of images. Shape: [batch,3,300,300]. 105 | Return: 106 | arm_sources: feature maps feeding the anchor refinement 107 | module (ARM): the L2-normalized conv4_3 (38x38) and fc7 108 | (19x19) outputs plus the extra-layer outputs (10x10, 5x5). 109 | odm_sources: feature maps produced by the weave adapter for 110 | the object detection module (ODM), one per ARM source, 111 | each with 256 channels. 112 | 113 | Unlike the original SSD forward, this extractor always returns 114 | the tuple (arm_sources, odm_sources); the detection heads are 115 | attached elsewhere (see models/model_builder.py). 116 | """ 117 | arm_sources = list() 118 | odm_sources = list() 119 | 120 | for i in range(23): 121 | x = self.vgg[i](x) 122 | #38x38 123 | c2 = x 124 | c2 = self.L2Norm_4_3(c2) 125 | arm_sources.append(c2) 126 | 127 | for k in range(23, len(self.vgg)): 128 | x = self.vgg[k](x) 129 | #19x19 130 | c3 = x 131 | c3 = self.L2Norm_5_3(c3) 132 | arm_sources.append(c3) 133 | 134 | # 10x10 135 | x = F.relu(self.extras[0](x), inplace=True) 136 | x = F.relu(self.extras[1](x), inplace=True) 137 | c4 = x 138 | arm_sources.append(c4) 139 | 140 | # 5x5 141 | x = F.relu(self.extras[2](x), inplace=True) 142 | x = F.relu(self.extras[3](x), inplace=True) 143 | c5 = x 144 | arm_sources.append(c5) 145 | 146 | if len(self.extras) > 4: 147 | x = F.relu(self.extras[4](x), inplace=True) 148 | x = F.relu(self.extras[5](x), inplace=True) 149 | c6 = x 150 | arm_sources.append(c6) 151 | odm_sources = self.weave(arm_sources) 152 | return arm_sources, odm_sources 153 | 154 | 155 | def weave_vgg(size, channel_size='48'): 156 | return VGG16Extractor(size) 157 | 158 | 159 | if __name__ == "__main__": 160 | import os 161 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 162 | model = weave_vgg(size=300) 163 | print(model) 164 | with torch.no_grad(): 165 | model.eval() 166 | x = torch.randn(1, 3, 320, 320) 167 | model.cuda() 168 | model(x.cuda()) -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/utils/__init__.py -------------------------------------------------------------------------------- /utils/averageMeter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | 4 | def __init__(self): 5 | self.reset() 6 | 7 | def reset(self): 8 | self.val = 0 9 | self.avg = 0 10 | self.sum = 0 11 | self.count = 0 12 | 13 | def update(self, val, n=1): 14 | self.val = val 15 | self.sum += val * n 16 | self.count += n 17 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', 44 | os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError( 47 | 'The nvcc binary could not be ' 48 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME' 49 | ) 50 | home = os.path.dirname(os.path.dirname(nvcc)) 51 | 52 | cudaconfig = { 53 | 'home': home, 54 | 'nvcc': nvcc, 55 | 'include': pjoin(home, 'include'), 56 | 'lib64': pjoin(home, 'lib64') 57 | } 58 | for k, v in cudaconfig.items(): 59 | if not os.path.exists(v): 60 | raise EnvironmentError( 61 | 'The CUDA %s path could not be located in %s' % (k, v)) 62 | 63 | return cudaconfig 64 | 65 | 66 | CUDA = locate_cuda() 67 | 68 | # Obtain the numpy include directory. This logic works across numpy versions. 69 | try: 70 | numpy_include = np.get_include() 71 | except AttributeError: 72 | numpy_include = np.get_numpy_include() 73 | 74 | 75 | def customize_compiler_for_nvcc(self): 76 | """inject deep into distutils to customize how the dispatch 77 | to gcc/nvcc works. 78 | 79 | If you subclass UnixCCompiler, it's not trivial to get your subclass 80 | injected in, and still have the right customizations (i.e. 81 | distutils.sysconfig.customize_compiler) run on it. So instead of going 82 | the OO route, I have this. Note, it's kind of like a weird functional 83 | subclassing going on.""" 84 | 85 | # tell the compiler it can process .cu 86 | self.src_extensions.append('.cu') 87 | 88 | # save references to the default compiler_so and _compile methods 89 | default_compiler_so = self.compiler_so 90 | super = self._compile 91 | 92 | # now redefine the _compile method. This gets executed for each 93 | # object but distutils doesn't have the ability to change compilers 94 | # based on source extension: we add it.
95 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 96 | print(extra_postargs) 97 | if os.path.splitext(src)[1] == '.cu': 98 | # use the cuda for .cu files 99 | self.set_executable('compiler_so', CUDA['nvcc']) 100 | # use only a subset of the extra_postargs, which are 1-1 translated 101 | # from the extra_compile_args in the Extension class 102 | postargs = extra_postargs['nvcc'] 103 | else: 104 | postargs = extra_postargs['gcc'] 105 | 106 | super(obj, src, ext, cc_args, postargs, pp_opts) 107 | # reset the default compiler_so, which we might have changed for cuda 108 | self.compiler_so = default_compiler_so 109 | 110 | # inject our redefined _compile method into the class 111 | self._compile = _compile 112 | 113 | 114 | # run the customize_compiler 115 | class custom_build_ext(build_ext): 116 | def build_extensions(self): 117 | customize_compiler_for_nvcc(self.compiler) 118 | build_ext.build_extensions(self) 119 | 120 | 121 | ext_modules = [ 122 | Extension( 123 | "nms.cpu_nms", ["nms/cpu_nms.pyx"], 124 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 125 | include_dirs=[numpy_include]), 126 | Extension( 127 | 'nms.gpu_nms', 128 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 129 | library_dirs=[CUDA['lib64']], 130 | libraries=['cudart'], 131 | language='c++', 132 | runtime_library_dirs=[CUDA['lib64']], 133 | # this syntax is specific to this build system 134 | # we're only going to use certain compiler args with nvcc and not with gcc 135 | # the implementation of this trick is in customize_compiler() below 136 | extra_compile_args={ 137 | 'gcc': ["-Wno-unused-function"], 138 | 'nvcc': [ 139 | '-arch=sm_61', '--ptxas-options=-v', '-c', 140 | '--compiler-options', "'-fPIC'" 141 | ] 142 | }, 143 | include_dirs=[numpy_include, CUDA['include']]) 144 | ] 145 | 146 | setup( 147 | name='mot_utils', 148 | ext_modules=ext_modules, 149 | # inject our custom trigger 150 | cmdclass={'build_ext': custom_build_ext}, 151 | ) 152 | -------------------------------------------------------------------------------- /utils/collections.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | ############################################################################## 15 | """A simple attribute dictionary used for representing configuration options.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | 22 | 23 | class AttrDict(dict): 24 | 25 | IMMUTABLE = '__immutable__' 26 | 27 | def __init__(self, *args, **kwargs): 28 | super(AttrDict, self).__init__(*args, **kwargs) 29 | self.__dict__[AttrDict.IMMUTABLE] = False 30 | 31 | def __getattr__(self, name): 32 | if name in self.__dict__: 33 | return self.__dict__[name] 34 | elif name in self: 35 | return self[name] 36 | else: 37 | raise AttributeError(name) 38 | 39 | def __setattr__(self, name, value): 40 | if not self.__dict__[AttrDict.IMMUTABLE]: 41 | if name in self.__dict__: 42 | self.__dict__[name] = value 43 | else: 44 | self[name] = value 45 | else: 46 | raise AttributeError( 47 | 'Attempted to set "{}" to "{}", but AttrDict is immutable'. 48 | format(name, value)) 49 | 50 | def immutable(self, is_immutable): 51 | """Set immutability to is_immutable and recursively apply the setting 52 | to all nested AttrDicts. 53 | """ 54 | self.__dict__[AttrDict.IMMUTABLE] = is_immutable 55 | # Recursively set immutable state 56 | for v in self.__dict__.values(): 57 | if isinstance(v, AttrDict): 58 | v.immutable(is_immutable) 59 | for v in self.values(): 60 | if isinstance(v, AttrDict): 61 | v.immutable(is_immutable) 62 | 63 | def is_immutable(self): 64 | return self.__dict__[AttrDict.IMMUTABLE] 65 | -------------------------------------------------------------------------------- /utils/get_class_map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | import os.path as osp 5 | 6 | 7 | def check_size(submit_file): 8 | max_size = 60 * 1024 * 1024 9 | if osp.getsize(submit_file) > max_size: 10 | raise IOError( 11 | "File size exceeds the specified maximum size," 12 | " which is 60M for the server."
13 | ) 14 | 15 | 16 | def parse_submission(submit_file): 17 | with open(submit_file, 'r') as f: 18 | lines = f.readlines() 19 | submit_dict = dict() 20 | final_dict = dict() 21 | splitlines = [x.strip().split(' ') for x in lines] 22 | for idx, val in enumerate(splitlines): 23 | cls = str(int(float(val[1]))) 24 | if cls not in submit_dict: 25 | submit_dict[cls] = list() 26 | final_dict[cls] = dict() 27 | submit_dict[cls].append( 28 | [val[0], val[2], val[3], val[4], val[5], val[6]]) 29 | for k, v in submit_dict.items(): 30 | image_ids = [x[0] for x in v] 31 | confidence = np.array([float(x[1]) for x in v]) 32 | BB = np.array([[float(z) for z in x[2:]] for x in v]) 33 | sorted_ind = np.argsort(-confidence) 34 | sorted_scores = np.sort(-confidence) 35 | BB = BB[sorted_ind, :] 36 | image_ids = [image_ids[x] for x in sorted_ind] 37 | final_dict[k]["image_ids"] = image_ids 38 | final_dict[k]["BB"] = np.array(BB) 39 | return final_dict 40 | 41 | 42 | def parse_gt_annotation(gt_file): 43 | with open(gt_file, 'r') as f: 44 | lines = f.readlines() 45 | info = [x.strip().split() for x in lines] 46 | gt = {} 47 | for item in info: 48 | img_id = item[0] 49 | obj_struct = {} 50 | obj_struct['class'] = item[1] 51 | obj_struct['bbox'] = [ 52 | int(item[2]), 53 | int(item[3]), 54 | int(item[4]), 55 | int(item[5]) 56 | ] 57 | if img_id not in gt: 58 | gt[img_id] = list() 59 | gt[img_id].append(obj_struct) 60 | return gt 61 | 62 | 63 | def get_class_recs(recs, classname): 64 | npos = 0 65 | class_recs = {} 66 | for key in recs.keys(): 67 | R = [obj for obj in recs[key] if obj['class'] == classname] 68 | bbox = np.array([x['bbox'] for x in R]) 69 | det = [False] * len(R) 70 | npos += len(R) 71 | class_recs[key] = {'bbox': bbox, 'det': det} 72 | return class_recs, npos 73 | 74 | 75 | def compute_ap(rec, prec): 76 | mrec = np.concatenate(([0.], rec, [1.])) 77 | mpre = np.concatenate(([0.], prec, [0.])) 78 | for i in range(mpre.size - 1, 0, -1): 79 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 80 | i = np.where(mrec[1:] != mrec[:-1])[0] 81 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 82 | return ap 83 | 84 | 85 | def eval(submit_file, gt_file, ovthresh, classname): 86 | recs = parse_gt_annotation(gt_file) 87 | submit_result = parse_submission(submit_file) 88 | # get one class result 89 | class_recs, npos = get_class_recs(recs, classname) 90 | image_ids = submit_result[classname]["image_ids"] 91 | BB = submit_result[classname]["BB"] 92 | nd = len(image_ids) 93 | tp = np.zeros(nd) 94 | fp = np.zeros(nd) 95 | for d in range(nd): 96 | if image_ids[d] not in recs.keys(): 97 | raise KeyError( 98 | "Can not find image {} in the groundtruth file, did you submit the result file for the right dataset?" 99 | .format(image_ids[d])) 100 | for d in range(nd): 101 | R = class_recs[image_ids[d]] 102 | bb = BB[d, :].astype(float) 103 | ovmax = -np.inf 104 | BBGT = R['bbox'].astype(float) 105 | if BBGT.size > 0: 106 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 107 | iymin = np.maximum(BBGT[:, 1], bb[1]) 108 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 109 | iymax = np.minimum(BBGT[:, 3], bb[3]) 110 | iw = np.maximum(ixmax - ixmin + 1., 0.) 111 | ih = np.maximum(iymax - iymin + 1., 0.) 112 | inters = iw * ih 113 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 114 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 115 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 116 | overlaps = inters / uni 117 | ovmax = np.max(overlaps) 118 | jmax = np.argmax(overlaps) 119 | if ovmax > ovthresh: 120 | if not R['det'][jmax]: 121 | tp[d] = 1. 
122 | R['det'][jmax] = 1 123 | else: 124 | fp[d] = 1. 125 | else: 126 | fp[d] = 1. 127 | fp = np.cumsum(fp) 128 | tp = np.cumsum(tp) 129 | rec = tp / float(npos) 130 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 131 | ap = compute_ap(rec, prec) 132 | return ap 133 | 134 | 135 | def result_eval(submit_file, gt, class_list): 136 | ove_aap = [] 137 | for ove in np.arange(0.5, 1.0, 0.05): 138 | cls_aap = [] 139 | for cls in class_list: 140 | ap = eval(submit_file, gt, ove, cls) 141 | cls_aap.append(ap) 142 | cls_mAP = np.average(cls_aap) 143 | print("thresh", round(ove, 3), "map", round(cls_mAP * 100, 3)) 144 | ove_aap.append(cls_mAP) 145 | mAP = np.average(ove_aap) * 100 146 | return round(mAP, 3) 147 | 148 | 149 | if __name__ == '__main__': 150 | ''' 151 | submit_file: image_id, class, score, xmin, ymin, xmax, ymax 152 | gt_file: image_id, class, xmin, ymin, xmax, ymax 153 | ''' 154 | class_list = [] 155 | for i in range(1, 61): 156 | class_list.append(str(i)) 157 | submit_file = "./results/fpn_dcn_result.csv" 158 | gt_file = "./results/val_label.txt" 159 | check_size(submit_file) 160 | mAP = result_eval(submit_file, gt_file, class_list) 161 | out = {'Average AP': str(round(mAP, 3))} 162 | print(out) -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yqyao/SSD_Pytorch/6060bbb650e7a1df7c12d7c9650a38eaba4ab6a8/utils/nms/__init__.py -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | cdef inline np.float32_t abs(np.float32_t a, np.float32_t b): 18 | return a - b if a >= b else b - a 19 | 20 | def get_iou_weights(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold, float init_weight): 21 | 22 | cdef: 23 | int num = ious.shape[0] 24 | # np.ndarray[np.float32_t, ndim=1] out = np.zeros(num, dtype=np.float) 25 | int idx 26 | float iou 27 | float weight 28 | 29 | for idx, iou in enumerate(ious): 30 | weight = init_weight 31 | if iou > 0.0: 32 | if iou > threshold + 0.1: 33 | weight += 1.0 34 | elif iou < threshold - 0.1: 35 | weight += 1.0 36 | else: 37 | weight += 0.0 38 | ious[idx] = weight 39 | return ious 40 | 41 | def get_mask(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold): 42 | cdef: 43 | int num = ious.shape[0] 44 | int idx = 0 45 | float distance 46 | float iou 47 | np.ndarray[np.int64_t, ndim=1] out = np.zeros((num), dtype=np.int64) 48 | for idx, iou in enumerate(ious): 49 | # if iou >= threshold: 50 | # distance = iou - threshold 51 | # if distance < 0.1: 52 | # out[idx] = 0 53 | # elif distance < 0.2: 54 | # out[idx] = 1 55 | # else: 56 | # out[idx] = 2 57 | # else: 58 | # distance = threshold - iou 59 | # if distance < 0.1: 60 | # out[idx] = 2 61 | # elif distance < 0.2: 62 | # out[idx] = 1 63 | # else: 64 | # out[idx] = 0 65 | 
distance = abs(iou, threshold) 66 | if distance < 0.1: 67 | # out[:,2] = 1 68 | out[idx] = 2 69 | elif distance < 0.2: 70 | # out[:,1] = 1 71 | out[idx] = 1 72 | else: 73 | # out[:,0] = 0 74 | out[idx] = 0 75 | return out 76 | 77 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 78 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 79 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 80 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 81 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 82 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 83 | 84 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 85 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 86 | 87 | cdef int ndets = dets.shape[0] 88 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 89 | np.zeros((ndets), dtype=np.int) 90 | 91 | # nominal indices 92 | cdef int _i, _j 93 | # sorted indices 94 | cdef int i, j 95 | # temp variables for box i's (the box currently under consideration) 96 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 97 | # variables for computing overlap with box j (lower scoring box) 98 | cdef np.float32_t xx1, yy1, xx2, yy2 99 | cdef np.float32_t w, h 100 | cdef np.float32_t inter, ovr 101 | 102 | keep = [] 103 | for _i in range(ndets): 104 | i = order[_i] 105 | if suppressed[i] == 1: 106 | continue 107 | keep.append(i) 108 | ix1 = x1[i] 109 | iy1 = y1[i] 110 | ix2 = x2[i] 111 | iy2 = y2[i] 112 | iarea = areas[i] 113 | for _j in range(_i + 1, ndets): 114 | j = order[_j] 115 | if suppressed[j] == 1: 116 | continue 117 | xx1 = max(ix1, x1[j]) 118 | yy1 = max(iy1, y1[j]) 119 | xx2 = min(ix2, x2[j]) 120 | yy2 = min(iy2, y2[j]) 121 | w = max(0.0, xx2 - xx1 + 1) 122 | h = max(0.0, yy2 - yy1 + 1) 123 | inter = w * h 124 | ovr = inter / (iarea + areas[j] - inter) 125 | if ovr >= thresh: 126 | suppressed[j] = 1 127 | 128 | return keep 129 | 130 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 131 | cdef unsigned int N = boxes.shape[0] 132 | cdef float iw, ih, box_area 133 | cdef float ua 134 | cdef int pos = 0 135 | cdef float maxscore = 0 136 | cdef int maxpos = 0 137 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 138 | 139 | for i in range(N): 140 | maxscore = boxes[i, 4] 141 | maxpos = i 142 | 143 | tx1 = boxes[i,0] 144 | ty1 = boxes[i,1] 145 | tx2 = boxes[i,2] 146 | ty2 = boxes[i,3] 147 | ts = boxes[i,4] 148 | 149 | pos = i + 1 150 | # get max box 151 | while pos < N: 152 | if maxscore < boxes[pos, 4]: 153 | maxscore = boxes[pos, 4] 154 | maxpos = pos 155 | pos = pos + 1 156 | 157 | # add max box as a detection 158 | boxes[i,0] = boxes[maxpos,0] 159 | boxes[i,1] = boxes[maxpos,1] 160 | boxes[i,2] = boxes[maxpos,2] 161 | boxes[i,3] = boxes[maxpos,3] 162 | boxes[i,4] = boxes[maxpos,4] 163 | 164 | # swap ith box with position of max box 165 | boxes[maxpos,0] = tx1 166 | boxes[maxpos,1] = ty1 167 | boxes[maxpos,2] = tx2 168 | boxes[maxpos,3] = ty2 169 | boxes[maxpos,4] = ts 170 | 171 | tx1 = boxes[i,0] 172 | ty1 = boxes[i,1] 173 | tx2 = boxes[i,2] 174 | ty2 = boxes[i,3] 175 | ts = boxes[i,4] 176 | 177 | pos = i + 1 178 | # NMS iterations, note that N changes if detection boxes fall below threshold 179 | while pos < N: 180 | x1 = boxes[pos, 0] 181 | y1 = boxes[pos, 1] 182 | x2 = boxes[pos, 2] 183 | y2 = boxes[pos, 3] 184 | s = boxes[pos, 4] 185 | 186 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 187 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 188 | 
if iw > 0: 189 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 190 | if ih > 0: 191 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 192 | ov = iw * ih / ua #iou between max box and detection box 193 | 194 | if method == 1: # linear 195 | if ov > Nt: 196 | weight = 1 - ov 197 | else: 198 | weight = 1 199 | elif method == 2: # gaussian 200 | weight = np.exp(-(ov * ov)/sigma) 201 | else: # original NMS 202 | if ov > Nt: 203 | weight = 0 204 | else: 205 | weight = 1 206 | 207 | boxes[pos, 4] = weight*boxes[pos, 4] 208 | 209 | # if box score falls below threshold, discard the box by swapping with last box 210 | # update N 211 | if boxes[pos, 4] < threshold: 212 | boxes[pos,0] = boxes[N-1, 0] 213 | boxes[pos,1] = boxes[N-1, 1] 214 | boxes[pos,2] = boxes[N-1, 2] 215 | boxes[pos,3] = boxes[N-1, 3] 216 | boxes[pos,4] = boxes[N-1, 4] 217 | N = N - 1 218 | pos = pos - 1 219 | 220 | pos = pos + 1 221 | 222 | keep = [i for i in range(N)] 223 | return keep 224 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | # def nms(dets, thresh, force_cpu=False): 12 | # """Dispatch to either CPU or GPU NMS implementations.""" 13 | 14 | # if dets.shape[0] == 0: 15 | # return [] 16 | # if cfg.USE_GPU_NMS and not force_cpu: 17 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 18 | # else: 19 | # return cpu_nms(dets, thresh) 20 | 21 | 22 | def nms(dets, thresh, force_cpu=False): 23 | """Dispatch to either CPU or GPU NMS implementations.""" 24 | 25 | if dets.shape[0] == 0: 26 | return [] 27 | if force_cpu: 28 | #return cpu_soft_nms(dets, thresh, method = 0) 29 | return cpu_nms(dets, thresh) 30 | return gpu_nms(dets, thresh) 31 | 32 | 33 | def soft_nms(dets, Nt=0.3, sigma=0.5, thresh=0.001, method=1): 34 | """Dispatch to the CPU soft-NMS implementation.""" 35 | 36 | if dets.shape[0] == 0: 37 | return [] 38 | return cpu_soft_nms(dets, sigma, Nt, thresh, method) -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | 14 | def __init__(self): 15 | self.total_time = 0. 16 | self.calls = 0 17 | self.start_time = 0. 18 | self.diff = 0. 19 | self.average_time = 0. 20 | 21 | def tic(self): 22 | # using time.time instead of time.clock because time.clock 23 | # does not normalize for multithreading 24 | self.start_time = time.time() 25 | 26 | def toc(self, average=True): 27 | self.diff = time.time() - self.start_time 28 | self.total_time += self.diff 29 | self.calls += 1 30 | self.average_time = self.total_time / self.calls 31 | if average: 32 | return self.average_time 33 | else: 34 | return self.diff 35 | 36 | def clear(self): 37 | self.total_time = 0. 38 | self.calls = 0 39 | self.start_time = 0. 40 | self.diff = 0. 41 | self.average_time = 0. 42 | --------------------------------------------------------------------------------
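The AttrDict in utils/collections.py above is the Detectron-style config container: keys double as attributes, and immutable(True) recursively freezes the dict and every nested AttrDict. A minimal usage sketch, assuming the repository root is on PYTHONPATH:

from utils.collections import AttrDict

cfg = AttrDict()
cfg.TRAIN = AttrDict()
cfg.TRAIN.BATCH_SIZE = 32            # attribute writes fall through to dict keys
assert cfg['TRAIN']['BATCH_SIZE'] == 32

cfg.immutable(True)                  # freezes nested AttrDicts recursively
try:
    cfg.TRAIN.BATCH_SIZE = 64
except AttributeError as e:
    print(e)                         # Attempted to set "BATCH_SIZE" to "64", ...

--------------------------------------------------------------------------------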
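compute_ap() in utils/get_class_map.py implements VOC-style all-point interpolation: the precision curve is first made monotonically non-increasing (the backward-maximum loop), then AP is the area under the resulting step function over the distinct recall points. A small worked example, assuming the repository root is on PYTHONPATH; the numbers are made up for illustration:

import numpy as np
from utils.get_class_map import compute_ap

# Four detections sorted by confidence (TP, FP, TP, TP) against 5 ground
# truths: rec = cum_tp / 5, prec = cum_tp / (cum_tp + cum_fp).
rec = np.array([0.2, 0.2, 0.4, 0.6])
prec = np.array([1.0, 0.5, 2.0 / 3.0, 0.75])

# Interpolated precision is 1.0 on recall [0, 0.2], 0.75 on (0.2, 0.6],
# and 0.0 beyond, so AP = 0.2 * 1.0 + 0.4 * 0.75 = 0.5.
print(compute_ap(rec, prec))  # -> 0.5

--------------------------------------------------------------------------------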
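Finally, a dependency-free sanity check for the NMS stack: nms() in utils/nms_wrapper.py dispatches to the compiled Cython/CUDA extensions (built via make.sh and utils/build.py), while soft_nms() decays scores instead of dropping boxes (linear: w = 1 - IoU; Gaussian: w = exp(-IoU^2 / sigma)). The sketch below uses the pure-Python fallback from utils/nms/py_cpu_nms.py so it runs without compiling anything; the boxes and threshold are made up for illustration:

import numpy as np
from utils.nms.py_cpu_nms import py_cpu_nms

dets = np.array([
    [10, 10, 60, 60, 0.9],      # kept: highest score
    [12, 12, 62, 62, 0.8],      # suppressed: IoU with box 0 is ~0.86 > 0.5
    [100, 100, 150, 150, 0.7],  # kept: disjoint from the others
], dtype=np.float32)

print(py_cpu_nms(dets, 0.5))  # -> [0, 2]

--------------------------------------------------------------------------------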