├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── datasets └── .gitignore ├── demo.ipynb ├── demo ├── 15673749081_767a7fa63a_k.jpg ├── 16004479832_a748d55f21_k.jpg ├── 17790319373_bd19b24cfc_k.jpg ├── 18124840932_e42b3e377c_k.jpg ├── 19064748793_bb942deea1_k.jpg ├── 24274813513_0cfd2ce6d0_k.jpg ├── 33823288584_1d21cf0a26_k.jpg ├── 33887522274_eebd074106_k.jpg ├── 34501842524_3c858b3080_k.jpg ├── NOTICE └── output │ └── sample.jpg ├── demo_FPN.ipynb ├── eval_fast.ipynb ├── eval_fast_FPN.ipynb ├── eval_faster.ipynb ├── eval_faster_FPN.ipynb ├── eval_mask.ipynb ├── eval_mask_FPN.ipynb ├── files ├── pretrained_base_cnn │ └── .gitignore ├── proposal_files │ └── .gitignore ├── results │ └── .gitignore └── trained_models │ └── .gitignore ├── lib ├── cppcuda │ ├── build │ │ └── .gitignore │ ├── roi_align_backward_cpu.cpp │ ├── roi_align_backward_cuda.cu │ ├── roi_align_binding.cpp │ ├── roi_align_cpu.cpp │ ├── roi_align_cuda.h │ └── roi_align_forward_cuda.cu ├── cppcuda_cffi │ ├── __init__.py │ ├── bind.py │ ├── get_lib_path.py │ ├── make.sh │ └── src │ │ ├── cpp │ │ ├── roi_align_cpu_loop.cpp │ │ └── roi_align_cpu_loop.h │ │ ├── cuda │ │ ├── roi_align_backward_cuda_kernel.cu │ │ ├── roi_align_backward_cuda_kernel.h │ │ ├── roi_align_forward_cuda_kernel.cu │ │ └── roi_align_forward_cuda_kernel.h │ │ ├── roi_align_backward_cuda.c │ │ ├── roi_align_backward_cuda.h │ │ ├── roi_align_forward_cpu.c │ │ ├── roi_align_forward_cpu.h │ │ ├── roi_align_forward_cuda.c │ │ └── roi_align_forward_cuda.h ├── data │ ├── coco_dataset.py │ ├── json_dataset.py │ └── roidb.py ├── model │ ├── collect_and_distribute_fpn_rpn_proposals.py │ ├── detector.py │ ├── generate_proposals.py │ ├── loss.py │ └── roi_align.py ├── utils │ ├── blob.py │ ├── boxes.py │ ├── collate_custom.py │ ├── collections.py │ ├── colormap.py │ ├── data_parallel.py │ ├── dummy_datasets.py │ ├── fast_rcnn_sample_rois.py │ ├── generate_anchors.py │ ├── io.py │ ├── json_dataset_evaluator.py │ ├── logging.py │ ├── multilevel_rois.py │ ├── preprocess_sample.py │ ├── result_utils.py │ ├── segms.py │ ├── selective_search.py │ ├── solver.py │ ├── timer.py │ ├── training_stats.py │ ├── utils.py │ └── vis.py └── utils_cython │ ├── build_cython.py │ ├── cython_bbox.pyx │ └── cython_nms.pyx └── train_fast.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.o 2 | **/*.so 3 | **/__pycache__/ 4 | .ipynb_checkpoints/ 5 | debug/ 6 | lib/utils_cython/*.c 7 | demo/output/*.pdf 8 | lib/cppcuda_cffi/roialign/ 9 | 10 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/cocoapi"] 2 | path = lib/cocoapi 3 | url = https://github.com/cocodataset/cocoapi 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Ignacio Rocco 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detectorch - detectron for PyTorch 2 | 3 | (Disclaimer: this is work in progress and does not feature all the functionality of Detectron. Currently only inference and evaluation are supported -- no training) 4 | (News: Now supporting FPN and ResNet-101!) 5 | 6 | This code allows you to use some of the [Detectron models for object detection from Facebook AI Research](https://github.com/facebookresearch/Detectron/) with PyTorch. 7 | 8 | It currently supports: 9 | 10 | - Fast R-CNN 11 | - Faster R-CNN 12 | - Mask R-CNN 13 | 14 | It supports ResNet-50/101 models, with or without FPN. The pre-trained models from Caffe2 can be imported and used in PyTorch. 15 | 16 | 
17 | 18 | Example Mask R-CNN with ResNet-101 and FPN. 19 | 
20 | 21 | ## Evaluation 22 | Both bounding box evaluation and instance segmentation evaluation were tested, yielding the same results as the Detectron Caffe2 models. The results below were computed using this PyTorch code: 23 | 24 | | Model | box AP | mask AP | model id | 25 | | --- | --- | --- | --- | 26 | | [fast_rcnn_R-50-C4_2x](https://s3-us-west-2.amazonaws.com/detectron/36224046/12_2017_baselines/fast_rcnn_R-50-C4_2x.yaml.08_22_57.XFxNqEnL/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl) | 35.6 | | 36224046 | 27 | | [fast_rcnn_R-50-FPN_2x](https://s3-us-west-2.amazonaws.com/detectron/36225249/12_2017_baselines/fast_rcnn_R-50-FPN_2x.yaml.08_40_18.zoChak1f/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl) | 36.8 | | 36225249 | 28 | | [e2e_faster_rcnn_R-50-C4_2x](https://s3-us-west-2.amazonaws.com/detectron/35857281/12_2017_baselines/e2e_faster_rcnn_R-50-C4_2x.yaml.01_34_56.ScPH0Z4r/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl) | 36.5 | | 35857281 | 29 | | [e2e_faster_rcnn_R-50-FPN_2x](https://s3-us-west-2.amazonaws.com/detectron/35857389/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_2x.yaml.01_37_22.KSeq0b5q/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl) | 37.9 | | 35857389 | 30 | | [e2e_mask_rcnn_R-50-C4_2x](https://s3-us-west-2.amazonaws.com/detectron/35858828/12_2017_baselines/e2e_mask_rcnn_R-50-C4_2x.yaml.01_46_47.HBThTerB/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl) | 37.8 | 32.8 | 35858828 | 31 | | [e2e_mask_rcnn_R-50-FPN_2x](https://s3-us-west-2.amazonaws.com/detectron/35859007/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_2x.yaml.01_49_07.By8nQcCH/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl) | 38.6 | 34.5 | 35859007 | 32 | | [e2e_mask_rcnn_R-101-FPN_2x](https://s3-us-west-2.amazonaws.com/detectron/35861858/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_2x.yaml.02_32_51.SgT4y1cO/output/train/coco_2014_train:coco_2014_valminusminival/generalized_rcnn/model_final.pkl) | 40.9 | 36.4 | 35861858 | 33 | 34 | 35 | ## Training 36 | Training code is experimental. See `train_fast.py` for training Fast R-CNN. It seems to work, but it is slow. 37 | 38 | ## Installation 39 | First, clone the repo with `git clone --recursive https://github.com/ignacio-rocco/detectorch` so that you also clone the Coco API. 40 | 41 | The code can be used with PyTorch 0.3.1 or PyTorch 0.4 (master) under Python 3. Anaconda is recommended. Other required packages: 42 | 43 | - torchvision (`conda install torchvision -c soumith`) 44 | - opencv (`conda install -c conda-forge opencv`) 45 | - cython (`conda install cython`) 46 | - matplotlib (`conda install matplotlib`) 47 | - scikit-image (`conda install scikit-image`) 48 | - ninja (`conda install ninja`) *(required for PyTorch 0.4 only)* 49 | 50 | Additionally, you need to build the Coco API and the RoIAlign layer. See below. 51 | 52 | #### Compiling the Coco API 53 | If you cloned this repo with `git clone --recursive`, you should also have cloned the cocoapi in `lib/cocoapi`. Compile it with: 54 | ``` 55 | cd lib/cocoapi/PythonAPI 56 | make install 57 | ``` 58 | 59 | 60 | #### Compiling RoIAlign 61 | The RoIAlign layer was converted from the Caffe2 version. There are two different implementations, one for each supported PyTorch version: 62 | 63 | - PyTorch 0.4: RoIAlign using the ATen library (lib/cppcuda), compiled just-in-time (JIT) when loaded. 
64 | - PyTorch 0.3.1: RoIAlign using TH/THC and cffi (lib/cppcuda_cffi). Needs to be compiled with: 65 | 66 | ``` 67 | cd lib/cppcuda_cffi 68 | ./make.sh 69 | ``` 70 | 71 | ## Quick Start 72 | Check the demo notebook. 73 | -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /demo/15673749081_767a7fa63a_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/15673749081_767a7fa63a_k.jpg -------------------------------------------------------------------------------- /demo/16004479832_a748d55f21_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/16004479832_a748d55f21_k.jpg -------------------------------------------------------------------------------- /demo/17790319373_bd19b24cfc_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/17790319373_bd19b24cfc_k.jpg -------------------------------------------------------------------------------- /demo/18124840932_e42b3e377c_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/18124840932_e42b3e377c_k.jpg -------------------------------------------------------------------------------- /demo/19064748793_bb942deea1_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/19064748793_bb942deea1_k.jpg -------------------------------------------------------------------------------- /demo/24274813513_0cfd2ce6d0_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/24274813513_0cfd2ce6d0_k.jpg -------------------------------------------------------------------------------- /demo/33823288584_1d21cf0a26_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/33823288584_1d21cf0a26_k.jpg -------------------------------------------------------------------------------- /demo/33887522274_eebd074106_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/33887522274_eebd074106_k.jpg -------------------------------------------------------------------------------- /demo/34501842524_3c858b3080_k.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/34501842524_3c858b3080_k.jpg 
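Quick-start note: the README's Quick Start section points to the demo and evaluation notebooks (`demo.ipynb`, `eval_fast.ipynb`, `eval_faster.ipynb`), which all follow the same inference pattern. The snippet below is a minimal sketch of that flow for the end-to-end Faster R-CNN model, distilled from `eval_faster.ipynb` further down in this repo dump; the weight and dataset paths are the notebook defaults and assume the corresponding files have already been downloaded.

```python
# Minimal inference sketch distilled from eval_faster.ipynb (Faster R-CNN, ResNet-50-C4).
# Paths follow the notebook defaults; adjust them to where the model/data were downloaded.
import sys
sys.path.insert(0, "lib/")

import torch
from torch.utils.data import DataLoader

from data.coco_dataset import CocoDataset
from utils.preprocess_sample import preprocess_sample
from utils.collate_custom import collate_custom
from utils.utils import to_cuda_variable
import utils.result_utils as result_utils
from model.detector import detector

# Dataset: COCO minival2014, images resized so the shorter side is 800 px.
dataset = CocoDataset(ann_file='datasets/data/coco/annotations/instances_minival2014.json',
                      img_dir='datasets/data/coco/val2014',
                      sample_transform=preprocess_sample(target_sizes=[800]))
dataloader = DataLoader(dataset, batch_size=1,  # only batch_size=1 is supported
                        shuffle=False, num_workers=0, collate_fn=collate_custom)

# Detector with an RPN head (end-to-end Faster R-CNN weights imported from Detectron).
model = detector(arch='resnet50',
                 detector_pkl_file='files/trained_models/faster/model_final.pkl',
                 use_rpn_head=True).cuda()

batch = to_cuda_variable(next(iter(dataloader)))
with torch.no_grad():  # PyTorch 0.4; the notebooks skip this context on 0.3.1
    class_scores, bbox_deltas, rois, _ = model(batch['image'],
                                               scaling_factor=batch['scaling_factors'])

# Rescale boxes to the original image size, threshold by score and apply NMS.
scores, boxes, boxes_per_class = result_utils.postprocess_output(
    rois, batch['scaling_factors'], batch['original_im_size'], class_scores, bbox_deltas)
```

As in the notebooks, only `batch_size=1` is supported, and the same flow works for the pre-computed-proposal (Fast R-CNN) models by passing `batch['rois']` to the model instead of using the RPN head.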
-------------------------------------------------------------------------------- /demo/NOTICE: -------------------------------------------------------------------------------- 1 | The demo images are licensed as United States government work: 2 | https://www.usa.gov/government-works 3 | 4 | The image files were obtained on Jan 13, 2018 from the following 5 | URLs. 6 | 7 | 16004479832_a748d55f21_k.jpg 8 | https://www.flickr.com/photos/archivesnews/16004479832 9 | 10 | 18124840932_e42b3e377c_k.jpg 11 | https://www.flickr.com/photos/usnavy/18124840932 12 | 13 | 33887522274_eebd074106_k.jpg 14 | https://www.flickr.com/photos/usaid_pakistan/33887522274 15 | 16 | 15673749081_767a7fa63a_k.jpg 17 | https://www.flickr.com/photos/usnavy/15673749081 18 | 19 | 34501842524_3c858b3080_k.jpg 20 | https://www.flickr.com/photos/departmentofenergy/34501842524 21 | 22 | 24274813513_0cfd2ce6d0_k.jpg 23 | https://www.flickr.com/photos/dhsgov/24274813513 24 | 25 | 19064748793_bb942deea1_k.jpg 26 | https://www.flickr.com/photos/statephotos/19064748793 27 | 28 | 33823288584_1d21cf0a26_k.jpg 29 | https://www.flickr.com/photos/cbpphotos/33823288584 30 | 31 | 17790319373_bd19b24cfc_k.jpg 32 | https://www.flickr.com/photos/secdef/17790319373 33 | -------------------------------------------------------------------------------- /demo/output/sample.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/demo/output/sample.jpg -------------------------------------------------------------------------------- /eval_fast.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import torch\n", 19 | "from torch.autograd import Variable\n", 20 | "from torch.utils.data import DataLoader\n", 21 | "\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import numpy as np\n", 24 | "\n", 25 | "import sys\n", 26 | "sys.path.insert(0, \"lib/\")\n", 27 | "from data.coco_dataset import CocoDataset\n", 28 | "from utils.preprocess_sample import preprocess_sample\n", 29 | "from utils.collate_custom import collate_custom\n", 30 | "from utils.utils import to_cuda_variable\n", 31 | "import utils.result_utils as result_utils\n", 32 | "from utils.json_dataset_evaluator import evaluate_boxes\n", 33 | "from model.detector import detector\n", 34 | "\n", 35 | "torch_ver = torch.__version__[:3]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Parameters" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# Pretrained model\n", 54 | "# https://s3-us-west-2.amazonaws.com/detectron/36224046/12_2017_baselines/fast_rcnn_R-50-C4_2x.yaml.08_22_57.XFxNqEnL/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl\n", 55 | "arch='resnet50'\n", 56 | "pretrained_model_file = 'files/trained_models/fast/model_final.pkl'\n", 57 | "\n", 58 | "# Pre-computed COCO minival2014 proposals\n", 59 | "# 
https://s3-us-west-2.amazonaws.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_minival/rpn/rpn_proposals.pkl\n", 60 | "proposal_file='files/proposal_files/coco_2014_minival/rpn_proposals.pkl'\n", 61 | "\n", 62 | "# COCO minival2014 dataset path\n", 63 | "coco_ann_file='datasets/data/coco/annotations/instances_minival2014.json'\n", 64 | "img_dir='datasets/data/coco/val2014'" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Create dataset" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "loading annotations into memory...\n", 84 | "Done (t=1.31s)\n", 85 | "creating index...\n", 86 | "index created!\n", 87 | "Loading proposals from: files/proposal_files/coco_2014_minival/rpn_proposals.pkl\n", 88 | " 1/5000\n", 89 | " 2501/5000\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "dataset = CocoDataset(ann_file=coco_ann_file,img_dir=img_dir,proposal_file=proposal_file,\n", 95 | " sample_transform=preprocess_sample(target_sizes=[800]))\n", 96 | "dataloader = DataLoader(dataset, batch_size=1, # only batch_size=1 is supported by now\n", 97 | " shuffle=False, num_workers=0, collate_fn=collate_custom)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Create detector model" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "loading pretrained weights\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "model = detector(arch=arch,\n", 122 | " detector_pkl_file=pretrained_model_file)\n", 123 | "model = model.cuda()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# Evaluate" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# Create data structure to store results\n", 142 | "all_boxes, all_segms, all_keyps = result_utils.empty_results(dataset.num_classes, len(dataset)) \n", 143 | "# (only all_boxes will be used for fast RCNN)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 6, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "batch = next(iter(dataloader))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "1/5000\n", 167 | "101/5000\n", 168 | "201/5000\n", 169 | "301/5000\n", 170 | "401/5000\n", 171 | "501/5000\n", 172 | "601/5000\n", 173 | "701/5000\n", 174 | "801/5000\n", 175 | "901/5000\n", 176 | "1001/5000\n", 177 | "1101/5000\n", 178 | "1201/5000\n", 179 | "1301/5000\n", 180 | "1401/5000\n", 181 | "1501/5000\n", 182 | "1601/5000\n", 183 | "1701/5000\n", 184 | "1801/5000\n", 185 | "1901/5000\n", 186 | "2001/5000\n", 187 | "2101/5000\n", 188 | "2201/5000\n", 189 | "2301/5000\n", 190 | "2401/5000\n", 191 | "2501/5000\n", 192 | "2601/5000\n", 193 | "2701/5000\n", 194 | "2801/5000\n", 195 | "2901/5000\n", 196 | "3001/5000\n", 197 | "3101/5000\n", 198 | "3201/5000\n", 199 | "3301/5000\n", 200 | "3401/5000\n", 201 | "3501/5000\n", 202 | 
"3601/5000\n", 203 | "3701/5000\n", 204 | "3801/5000\n", 205 | "3901/5000\n", 206 | "4001/5000\n", 207 | "4101/5000\n", 208 | "4201/5000\n", 209 | "4301/5000\n", 210 | "4401/5000\n", 211 | "4501/5000\n", 212 | "4601/5000\n", 213 | "4701/5000\n", 214 | "4801/5000\n", 215 | "4901/5000\n", 216 | "Done!\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "# Compute detections for whole dataset\n", 222 | "for i, batch in enumerate(dataloader):\n", 223 | " batch = to_cuda_variable(batch)\n", 224 | " # forward pass\n", 225 | " if torch_ver==\"0.4\": # handle change in \"volatile\"\n", 226 | " with torch.no_grad(): \n", 227 | " class_scores,bbox_deltas,_,_ =model(batch['image'],batch['rois'])\n", 228 | " else:\n", 229 | " class_scores,bbox_deltas,_,_ =model(batch['image'],batch['rois'])\n", 230 | " # postprocess output:\n", 231 | " # - convert coordinates back to original image size, \n", 232 | " # - treshold proposals based on score,\n", 233 | " # - do NMS.\n", 234 | " scores_final, boxes_final, boxes_per_class = result_utils.postprocess_output(batch['rois'],\n", 235 | " batch['scaling_factors'],\n", 236 | " batch['original_im_size'],\n", 237 | " class_scores,\n", 238 | " bbox_deltas)\n", 239 | " # store results\n", 240 | " result_utils.extend_results(i, all_boxes, boxes_per_class)\n", 241 | " \n", 242 | " if i%100==0:\n", 243 | " print(\"{}/{}\".format(i+1,len(dataset)))\n", 244 | "\n", 245 | "print('Done!')" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 8, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "# Save detection results\n", 257 | "np.save('files/results/all_boxes.npy',all_boxes)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 9, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "Loading and preparing results...\n", 270 | "DONE (t=1.64s)\n", 271 | "creating index...\n", 272 | "index created!\n", 273 | "Running per image evaluation...\n", 274 | "Evaluate annotation type *bbox*\n", 275 | "DONE (t=42.18s).\n", 276 | "Accumulating evaluation results...\n", 277 | "DONE (t=6.47s).\n", 278 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.356\n", 279 | " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.567\n", 280 | " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.382\n", 281 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.181\n", 282 | " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.403\n", 283 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.494\n", 284 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.302\n", 285 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.466\n", 286 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.486\n", 287 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.269\n", 288 | " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.545\n", 289 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.651\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "# Compute evaluation metrics\n", 295 | "coco_eval = evaluate_boxes(json_dataset=dataset.coco, \n", 296 | " all_boxes=all_boxes, \n", 297 | " output_dir='files/results/',\n", 298 | " use_salt=False, cleanup=False)" 299 | ] 300 | } 301 | ], 302 | "metadata": { 303 
| "kernelspec": { 304 | "display_name": "Python (detectorch0.3)", 305 | "language": "python", 306 | "name": "detectorch03" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.6.2" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 2 323 | } 324 | -------------------------------------------------------------------------------- /eval_faster.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import torch\n", 19 | "from torch.autograd import Variable\n", 20 | "from torch.utils.data import DataLoader\n", 21 | "\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import numpy as np\n", 24 | "\n", 25 | "import sys\n", 26 | "sys.path.insert(0, \"lib/\")\n", 27 | "from data.coco_dataset import CocoDataset\n", 28 | "from utils.preprocess_sample import preprocess_sample\n", 29 | "from utils.collate_custom import collate_custom\n", 30 | "from utils.utils import to_cuda_variable\n", 31 | "import utils.result_utils as result_utils\n", 32 | "from utils.json_dataset_evaluator import evaluate_boxes\n", 33 | "from model.detector import detector\n", 34 | "\n", 35 | "torch_ver = torch.__version__[:3]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Parameters" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# Pretrained model\n", 54 | "# https://s3-us-west-2.amazonaws.com/detectron/35857281/12_2017_baselines/e2e_faster_rcnn_R-50-C4_2x.yaml.01_34_56.ScPH0Z4r/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl\n", 55 | "arch='resnet50'\n", 56 | "pretrained_model_file = 'files/trained_models/faster/model_final.pkl'\n", 57 | "\n", 58 | "# COCO minival2014 dataset path\n", 59 | "coco_ann_file='datasets/data/coco/annotations/instances_minival2014.json'\n", 60 | "img_dir='datasets/data/coco/val2014'" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Create dataset" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "loading annotations into memory...\n", 80 | "Done (t=1.38s)\n", 81 | "creating index...\n", 82 | "index created!\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "dataset = CocoDataset(ann_file=coco_ann_file,img_dir=img_dir,\n", 88 | " sample_transform=preprocess_sample(target_sizes=[800]))\n", 89 | "dataloader = DataLoader(dataset, batch_size=1, # only batch_size=1 is supported by now\n", 90 | " shuffle=False, num_workers=0, collate_fn=collate_custom)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "# Create detector model" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | 
"text": [ 109 | "loading pretrained weights\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "model = detector(arch=arch,\n", 115 | " detector_pkl_file=pretrained_model_file,\n", 116 | " use_rpn_head = True)\n", 117 | "model = model.cuda()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "# Evaluate" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "# Create data structure to store results\n", 136 | "all_boxes, all_segms, all_keyps = result_utils.empty_results(dataset.num_classes, len(dataset)) \n", 137 | "# (only all_boxes will be used for fast RCNN)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "1/5000\n", 150 | "101/5000\n", 151 | "201/5000\n", 152 | "301/5000\n", 153 | "401/5000\n", 154 | "501/5000\n", 155 | "601/5000\n", 156 | "701/5000\n", 157 | "801/5000\n", 158 | "901/5000\n", 159 | "1001/5000\n", 160 | "1101/5000\n", 161 | "1201/5000\n", 162 | "1301/5000\n", 163 | "1401/5000\n", 164 | "1501/5000\n", 165 | "1601/5000\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "# Compute detections for whole dataset\n", 171 | "for i, batch in enumerate(dataloader):\n", 172 | " batch = to_cuda_variable(batch)\n", 173 | " # forward pass\n", 174 | " if torch_ver==\"0.4\": # handle change in \"volatile\"\n", 175 | " with torch.no_grad():\n", 176 | " class_scores,bbox_deltas,rois,_=model(batch['image'],\n", 177 | " scaling_factor=batch['scaling_factors']) \n", 178 | " else:\n", 179 | " class_scores,bbox_deltas,rois,_=model(batch['image'],\n", 180 | " scaling_factor=batch['scaling_factors']) \n", 181 | " # postprocess output:\n", 182 | " # - convert coordinates back to original image size, \n", 183 | " # - treshold proposals based on score,\n", 184 | " # - do NMS.\n", 185 | " scores_final, boxes_final, boxes_per_class = result_utils.postprocess_output(rois,\n", 186 | " batch['scaling_factors'],\n", 187 | " batch['original_im_size'],\n", 188 | " class_scores,\n", 189 | " bbox_deltas)\n", 190 | " # store results\n", 191 | " result_utils.extend_results(i, all_boxes, boxes_per_class)\n", 192 | " \n", 193 | " if i%100==0:\n", 194 | " print(\"{}/{}\".format(i+1,len(dataset)))\n", 195 | " \n", 196 | "print('Done!')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "# Save detection results\n", 208 | "np.save('files/results/all_boxes_faster.npy',all_boxes)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 12, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "Loading and preparing results...\n", 221 | "DONE (t=1.97s)\n", 222 | "creating index...\n", 223 | "index created!\n", 224 | "Running per image evaluation...\n", 225 | "Evaluate annotation type *bbox*\n", 226 | "DONE (t=40.78s).\n", 227 | "Accumulating evaluation results...\n", 228 | "DONE (t=6.83s).\n", 229 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.365\n", 230 | " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.573\n", 231 | " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.393\n", 232 | " Average 
Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.184\n", 233 | " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.406\n", 234 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.506\n", 235 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.308\n", 236 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.474\n", 237 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.492\n", 238 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.279\n", 239 | " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.540\n", 240 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.657\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "# Compute evaluation metrics\n", 246 | "coco_eval = evaluate_boxes(json_dataset=dataset.coco, \n", 247 | " all_boxes=all_boxes, \n", 248 | " output_dir='files/results/',\n", 249 | " use_salt=False, cleanup=False)" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "kernelspec": { 255 | "display_name": "Python (detectorch0.3)", 256 | "language": "python", 257 | "name": "detectorch03" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.6.2" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 2 274 | } 275 | -------------------------------------------------------------------------------- /files/pretrained_base_cnn/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /files/proposal_files/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /files/results/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /files/trained_models/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /lib/cppcuda/build/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /lib/cppcuda/roi_align_backward_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_gradient_op.cc 2 | // (Ignacio Rocco) 3 | 4 | #include "ATen/NativeFunctions.h" 5 | #include 6 | 7 | namespace at { 8 | namespace contrib { 9 | 10 | template 11 | void bilinear_interpolate_gradient( 12 | const int height, 13 | const int width, 
14 | T y, 15 | T x, 16 | T& w1, 17 | T& w2, 18 | T& w3, 19 | T& w4, 20 | int& x_low, 21 | int& x_high, 22 | int& y_low, 23 | int& y_high, 24 | const int /*index*/ /* index for debug only*/) { 25 | // deal with cases that inverse elements are out of feature map boundary 26 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 27 | // empty 28 | w1 = w2 = w3 = w4 = 0.; 29 | x_low = x_high = y_low = y_high = -1; 30 | return; 31 | } 32 | 33 | if (y <= 0) { 34 | y = 0; 35 | } 36 | if (x <= 0) { 37 | x = 0; 38 | } 39 | 40 | y_low = (int)y; 41 | x_low = (int)x; 42 | 43 | if (y_low >= height - 1) { 44 | y_high = y_low = height - 1; 45 | y = (T)y_low; 46 | } else { 47 | y_high = y_low + 1; 48 | } 49 | 50 | if (x_low >= width - 1) { 51 | x_high = x_low = width - 1; 52 | x = (T)x_low; 53 | } else { 54 | x_high = x_low + 1; 55 | } 56 | 57 | T ly = y - y_low; 58 | T lx = x - x_low; 59 | T hy = 1. - ly, hx = 1. - lx; 60 | 61 | // reference in forward 62 | // T v1 = bottom_data[y_low * width + x_low]; 63 | // T v2 = bottom_data[y_low * width + x_high]; 64 | // T v3 = bottom_data[y_high * width + x_low]; 65 | // T v4 = bottom_data[y_high * width + x_high]; 66 | // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 67 | 68 | w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 69 | 70 | return; 71 | } 72 | 73 | template 74 | inline void add(const T& val, T* address) { 75 | *address += val; 76 | } 77 | 78 | template 79 | void roi_align_backward_loop( 80 | const int nthreads, 81 | const T* top_diff, // input gradient 82 | const int /*num_rois*/, // unused 83 | const T& spatial_scale, 84 | const int channels, 85 | const int height, 86 | const int width, 87 | const int pooled_height, 88 | const int pooled_width, 89 | const int sampling_ratio, 90 | T* bottom_diff, // output gradient 91 | const T* bottom_rois, // input rois 92 | int rois_cols) { 93 | 94 | // DCHECK(rois_cols == 4 || rois_cols == 5); check this before calling loop 95 | 96 | 97 | for (int index = 0; index < nthreads; index++) { 98 | // (n, c, ph, pw) is an element in the pooled output 99 | int pw = index % pooled_width; 100 | int ph = (index / pooled_width) % pooled_height; 101 | int c = (index / pooled_width / pooled_height) % channels; 102 | int n = index / pooled_width / pooled_height / channels; 103 | 104 | const T* offset_bottom_rois = bottom_rois + n * rois_cols; 105 | int roi_batch_ind = 0; 106 | if (rois_cols == 5) { 107 | roi_batch_ind = offset_bottom_rois[0]; 108 | offset_bottom_rois++; 109 | } 110 | 111 | // Do not using rounding; this implementation detail is critical 112 | T roi_start_w = offset_bottom_rois[0] * spatial_scale; 113 | T roi_start_h = offset_bottom_rois[1] * spatial_scale; 114 | T roi_end_w = offset_bottom_rois[2] * spatial_scale; 115 | T roi_end_h = offset_bottom_rois[3] * spatial_scale; 116 | // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); 117 | // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); 118 | // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); 119 | // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); 120 | 121 | // Force malformed ROIs to be 1x1 122 | T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); 123 | T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); 124 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 125 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 126 | 127 | T* offset_bottom_diff = 128 | bottom_diff + (roi_batch_ind * channels + c) * height * width; 129 | 130 | int top_offset = (n * 
channels + c) * pooled_height * pooled_width; 131 | const T* offset_top_diff = top_diff + top_offset; 132 | const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; 133 | 134 | // We use roi_bin_grid to sample the grid and mimic integral 135 | int roi_bin_grid_h = (sampling_ratio > 0) 136 | ? sampling_ratio 137 | : std::ceil(roi_height / pooled_height); // e.g., = 2 138 | int roi_bin_grid_w = 139 | (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / pooled_width); 140 | 141 | // We do average (integral) pooling inside a bin 142 | const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 143 | 144 | for (int iy = 0; iy < roi_bin_grid_h; iy++) { 145 | const T y = roi_start_h + ph * bin_size_h + 146 | static_cast(iy + .5f) * bin_size_h / 147 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 148 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 149 | const T x = roi_start_w + pw * bin_size_w + 150 | static_cast(ix + .5f) * bin_size_w / 151 | static_cast(roi_bin_grid_w); 152 | 153 | T w1, w2, w3, w4; 154 | int x_low, x_high, y_low, y_high; 155 | 156 | bilinear_interpolate_gradient( 157 | height, 158 | width, 159 | y, 160 | x, 161 | w1, 162 | w2, 163 | w3, 164 | w4, 165 | x_low, 166 | x_high, 167 | y_low, 168 | y_high, 169 | index); 170 | 171 | T g1 = top_diff_this_bin * w1 / count; 172 | T g2 = top_diff_this_bin * w2 / count; 173 | T g3 = top_diff_this_bin * w3 / count; 174 | T g4 = top_diff_this_bin * w4 / count; 175 | 176 | if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { 177 | // atomic add is not needed for now since it is single threaded 178 | add(static_cast(g1), offset_bottom_diff + y_low * width + x_low); 179 | add(static_cast(g2), offset_bottom_diff + y_low * width + x_high); 180 | add(static_cast(g3), offset_bottom_diff + y_high * width + x_low); 181 | add(static_cast(g4), offset_bottom_diff + y_high * width + x_high); 182 | } // if 183 | } // ix 184 | } // iy 185 | } // for 186 | } // ROIAlignBackward 187 | 188 | 189 | Tensor roi_align_backward_cpu( 190 | const Tensor& bottom_rois, 191 | const Tensor& grad_output, // gradient of the output of the layer 192 | int64_t b_size, 193 | int64_t channels, 194 | int64_t height, 195 | int64_t width, 196 | int64_t pooled_height, 197 | int64_t pooled_width, 198 | double spatial_scale, 199 | int64_t sampling_ratio) 200 | { 201 | 202 | // ROIs is the set of region proposals to process. 
It is a 2D Tensor where the first 203 | // dim is the # of proposals, and the second dim is the proposal itself in the form 204 | // [batch_index startW startH endW endH] 205 | AT_CHECK(bottom_rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); 206 | AT_CHECK(bottom_rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); 207 | 208 | auto num_rois = bottom_rois.size(0); 209 | auto roi_cols = bottom_rois.size(1); 210 | 211 | AT_CHECK(roi_cols == 4 || roi_cols == 5, "RoI Proposals should have 4 or 5 columns"); 212 | 213 | // Output Tensor is (num_rois, C, pooled_height, pooled_width) 214 | auto output = bottom_rois.type().tensor({b_size, channels, height, width}).zero_(); // gradient wrt input features 215 | 216 | AT_CHECK(bottom_rois.is_contiguous(), "bottom_rois must be contiguous"); 217 | 218 | roi_align_backward_loop( 219 | grad_output.numel(), 220 | grad_output.data(), 221 | num_rois, 222 | static_cast(spatial_scale), 223 | channels, 224 | height, 225 | width, 226 | pooled_height, 227 | pooled_width, 228 | sampling_ratio, 229 | output.data(), 230 | bottom_rois.data(), 231 | roi_cols); 232 | 233 | return output; 234 | } 235 | 236 | 237 | } // namespace 238 | } // namespace -------------------------------------------------------------------------------- /lib/cppcuda/roi_align_backward_cuda.cu: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_gradient_op.cu 2 | // (Ignacio Rocco) 3 | 4 | #include "ATen/NativeFunctions.h" 5 | #include 6 | 7 | namespace at { 8 | namespace contrib { 9 | 10 | // Use 1024 threads per block, which requires cuda sm_2x or above 11 | const int CUDA_NUM_THREADS = 1024; 12 | const int CUDA_MAX_BLOCKS = 65535; 13 | 14 | inline int GET_BLOCKS(const int N) 15 | { 16 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 17 | } 18 | 19 | __host__ __device__ __forceinline__ float fmin(float a, float b) { 20 | return a > b ? b : a; 21 | } 22 | 23 | __host__ __device__ __forceinline__ float fmax(float a, float b) { 24 | return a > b ? a : b; 25 | } 26 | 27 | 28 | template 29 | inline __device__ T gpu_atomic_add(const T val, T* address); 30 | 31 | template <> 32 | inline __device__ float gpu_atomic_add(const float val, float* address) { 33 | return atomicAdd(address, val); 34 | } 35 | 36 | template 37 | __device__ void bilinear_interpolate_gradient( 38 | const int height, 39 | const int width, 40 | T y, 41 | T x, 42 | T& w1, 43 | T& w2, 44 | T& w3, 45 | T& w4, 46 | int& x_low, 47 | int& x_high, 48 | int& y_low, 49 | int& y_high, 50 | const int index /* index for debug only*/) { 51 | // deal with cases that inverse elements are out of feature map boundary 52 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 53 | // empty 54 | w1 = w2 = w3 = w4 = 0.; 55 | x_low = x_high = y_low = y_high = -1; 56 | return; 57 | } 58 | 59 | if (y <= 0) { 60 | y = 0; 61 | } 62 | if (x <= 0) { 63 | x = 0; 64 | } 65 | 66 | y_low = (int)y; 67 | x_low = (int)x; 68 | 69 | if (y_low >= height - 1) { 70 | y_high = y_low = height - 1; 71 | y = (T)y_low; 72 | } else { 73 | y_high = y_low + 1; 74 | } 75 | 76 | if (x_low >= width - 1) { 77 | x_high = x_low = width - 1; 78 | x = (T)x_low; 79 | } else { 80 | x_high = x_low + 1; 81 | } 82 | 83 | T ly = y - y_low; 84 | T lx = x - x_low; 85 | T hy = 1. - ly, hx = 1. 
- lx; 86 | 87 | // reference in forward 88 | // T v1 = bottom_data[y_low * width + x_low]; 89 | // T v2 = bottom_data[y_low * width + x_high]; 90 | // T v3 = bottom_data[y_high * width + x_low]; 91 | // T v4 = bottom_data[y_high * width + x_high]; 92 | // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 93 | 94 | w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 95 | 96 | return; 97 | } 98 | 99 | template 100 | __global__ void roi_align_backward_kernel( 101 | const int nthreads, 102 | const T* top_diff, 103 | const int num_rois, 104 | const T spatial_scale, 105 | const int channels, 106 | const int height, 107 | const int width, 108 | const int pooled_height, 109 | const int pooled_width, 110 | const int sampling_ratio, 111 | T* bottom_diff, 112 | const T* bottom_rois, 113 | int rois_cols) { 114 | //CUDA_1D_KERNEL_LOOP(index, nthreads) { 115 | for (int index = blockIdx.x * blockDim.x + threadIdx.x; 116 | index < nthreads; 117 | index += blockDim.x * gridDim.x) 118 | { 119 | // (n, c, ph, pw) is an element in the pooled output 120 | int pw = index % pooled_width; 121 | int ph = (index / pooled_width) % pooled_height; 122 | int c = (index / pooled_width / pooled_height) % channels; 123 | int n = index / pooled_width / pooled_height / channels; 124 | 125 | const T* offset_bottom_rois = bottom_rois + n * 5; 126 | int roi_batch_ind = offset_bottom_rois[0]; 127 | 128 | // Do not using rounding; this implementation detail is critical 129 | T roi_start_w = offset_bottom_rois[1] * spatial_scale; 130 | T roi_start_h = offset_bottom_rois[2] * spatial_scale; 131 | T roi_end_w = offset_bottom_rois[3] * spatial_scale; 132 | T roi_end_h = offset_bottom_rois[4] * spatial_scale; 133 | // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); 134 | // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); 135 | // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); 136 | // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); 137 | 138 | // Force malformed ROIs to be 1x1 139 | T roi_width = fmax(roi_end_w - roi_start_w, (T)1.); 140 | T roi_height = fmax(roi_end_h - roi_start_h, (T)1.); 141 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 142 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 143 | 144 | T* offset_bottom_diff = 145 | bottom_diff + (roi_batch_ind * channels + c) * height * width; 146 | 147 | int top_offset = (n * channels + c) * pooled_height * pooled_width; 148 | const T* offset_top_diff = top_diff + top_offset; 149 | const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; 150 | 151 | // We use roi_bin_grid to sample the grid and mimic integral 152 | int roi_bin_grid_h = (sampling_ratio > 0) 153 | ? sampling_ratio 154 | : ceilf(roi_height / pooled_height); // e.g., = 2 155 | int roi_bin_grid_w = 156 | (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); 157 | 158 | // We do average (integral) pooling inside a bin 159 | const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 160 | 161 | for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 162 | { 163 | const T y = roi_start_h + ph * bin_size_h + 164 | static_cast(iy + .5f) * bin_size_h / 165 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 166 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 167 | const T x = roi_start_w + pw * bin_size_w + 168 | static_cast(ix + .5f) * bin_size_w / 169 | static_cast(roi_bin_grid_w); 170 | 171 | T w1, w2, w3, w4; 172 | int x_low, x_high, y_low, y_high; 173 | 174 | bilinear_interpolate_gradient( 175 | height, 176 | width, 177 | y, 178 | x, 179 | w1, 180 | w2, 181 | w3, 182 | w4, 183 | x_low, 184 | x_high, 185 | y_low, 186 | y_high, 187 | index); 188 | 189 | T g1 = top_diff_this_bin * w1 / count; 190 | T g2 = top_diff_this_bin * w2 / count; 191 | T g3 = top_diff_this_bin * w3 / count; 192 | T g4 = top_diff_this_bin * w4 / count; 193 | 194 | if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { 195 | gpu_atomic_add( 196 | static_cast(g1), offset_bottom_diff + y_low * width + x_low); 197 | gpu_atomic_add( 198 | static_cast(g2), offset_bottom_diff + y_low * width + x_high); 199 | gpu_atomic_add( 200 | static_cast(g3), offset_bottom_diff + y_high * width + x_low); 201 | gpu_atomic_add( 202 | static_cast(g4), offset_bottom_diff + y_high * width + x_high); 203 | } // if 204 | } // ix 205 | } // iy 206 | } // CUDA_1D_KERNEL_LOOP 207 | } // RoIAlignBackward 208 | 209 | Tensor roi_align_backward_cuda( 210 | const Tensor& bottom_rois, 211 | const Tensor& grad_output, // gradient of the output of the layer 212 | int64_t b_size, 213 | int64_t channels, 214 | int64_t height, 215 | int64_t width, 216 | int64_t pooled_height, 217 | int64_t pooled_width, 218 | double spatial_scale, 219 | int64_t sampling_ratio) 220 | { 221 | 222 | // ROIs is the set of region proposals to process. 
It is a 2D Tensor where the first 223 | // dim is the # of proposals, and the second dim is the proposal itself in the form 224 | // [batch_index startW startH endW endH] 225 | AT_CHECK(bottom_rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); 226 | AT_CHECK(bottom_rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); 227 | 228 | auto num_rois = bottom_rois.size(0); 229 | auto roi_cols = bottom_rois.size(1); 230 | 231 | AT_CHECK(roi_cols == 4 || roi_cols == 5, "RoI Proposals should have 4 or 5 columns"); 232 | 233 | // Output Tensor is (num_rois, C, pooled_height, pooled_width) 234 | auto output = bottom_rois.type().tensor({b_size, channels, height, width}).zero_(); // gradient wrt input features 235 | 236 | AT_CHECK(bottom_rois.is_contiguous(), "bottom_rois must be contiguous"); 237 | 238 | int64_t total_threads = output.numel(); 239 | int64_t blocks = fmin(GET_BLOCKS(total_threads),CUDA_MAX_BLOCKS); 240 | 241 | roi_align_backward_kernel<<>>( 242 | grad_output.numel(), 243 | grad_output.data(), 244 | num_rois, 245 | static_cast(spatial_scale), 246 | channels, 247 | height, 248 | width, 249 | pooled_height, 250 | pooled_width, 251 | sampling_ratio, 252 | output.data(), 253 | bottom_rois.data(), 254 | roi_cols); 255 | 256 | AT_CHECK(cudaGetLastError() == cudaSuccess, "roi_align_forward_kernel failed"); 257 | 258 | return output; 259 | } 260 | 261 | 262 | } 263 | } // namespace caffe2 -------------------------------------------------------------------------------- /lib/cppcuda/roi_align_binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "roi_align_cpu.cpp" 3 | #include "roi_align_backward_cpu.cpp" 4 | #include "roi_align_cuda.h" 5 | 6 | 7 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 8 | m.def("roi_align_forward_cpu", &at::contrib::roi_align_forward_cpu, "roi_align_forward_cpu"); 9 | m.def("roi_align_backward_cpu", &at::contrib::roi_align_backward_cpu, "roi_align_backward_cpu"); 10 | m.def("roi_align_forward_cuda", &at::contrib::roi_align_forward_cuda, "roi_align_forward_cuda"); 11 | m.def("roi_align_backward_cuda", &at::contrib::roi_align_backward_cuda, "roi_align_backward_cuda"); 12 | } 13 | -------------------------------------------------------------------------------- /lib/cppcuda/roi_align_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_op.cc 2 | // (Ignacio Rocco) 3 | 4 | #include "ATen/NativeFunctions.h" 5 | #include 6 | 7 | namespace at { 8 | namespace contrib { 9 | 10 | template 11 | struct PreCalc { 12 | int pos1; 13 | int pos2; 14 | int pos3; 15 | int pos4; 16 | T w1; 17 | T w2; 18 | T w3; 19 | T w4; 20 | }; 21 | 22 | template 23 | void pre_calc_for_bilinear_interpolate( 24 | const int height, 25 | const int width, 26 | const int pooled_height, 27 | const int pooled_width, 28 | const int iy_upper, 29 | const int ix_upper, 30 | T roi_start_h, 31 | T roi_start_w, 32 | T bin_size_h, 33 | T bin_size_w, 34 | int roi_bin_grid_h, 35 | int roi_bin_grid_w, 36 | std::vector>& pre_calc) { 37 | int pre_calc_index = 0; 38 | for (int ph = 0; ph < pooled_height; ph++) { 39 | for (int pw = 0; pw < pooled_width; pw++) { 40 | for (int iy = 0; iy < iy_upper; iy++) { 41 | const T yy = roi_start_h + ph * bin_size_h + 42 | static_cast(iy + .5f) * bin_size_h / 43 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 44 | for (int ix = 
0; ix < ix_upper; ix++) { 45 | const T xx = roi_start_w + pw * bin_size_w + 46 | static_cast(ix + .5f) * bin_size_w / 47 | static_cast(roi_bin_grid_w); 48 | 49 | T x = xx; 50 | T y = yy; 51 | // deal with: inverse elements are out of feature map boundary 52 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 53 | // empty 54 | PreCalc pc; 55 | pc.pos1 = 0; 56 | pc.pos2 = 0; 57 | pc.pos3 = 0; 58 | pc.pos4 = 0; 59 | pc.w1 = 0; 60 | pc.w2 = 0; 61 | pc.w3 = 0; 62 | pc.w4 = 0; 63 | pre_calc[pre_calc_index] = pc; 64 | pre_calc_index += 1; 65 | continue; 66 | } 67 | 68 | if (y <= 0) { 69 | y = 0; 70 | } 71 | if (x <= 0) { 72 | x = 0; 73 | } 74 | 75 | int y_low = (int)y; 76 | int x_low = (int)x; 77 | int y_high; 78 | int x_high; 79 | 80 | if (y_low >= height - 1) { 81 | y_high = y_low = height - 1; 82 | y = (T)y_low; 83 | } else { 84 | y_high = y_low + 1; 85 | } 86 | 87 | if (x_low >= width - 1) { 88 | x_high = x_low = width - 1; 89 | x = (T)x_low; 90 | } else { 91 | x_high = x_low + 1; 92 | } 93 | 94 | T ly = y - y_low; 95 | T lx = x - x_low; 96 | T hy = 1. - ly, hx = 1. - lx; 97 | T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 98 | 99 | // save weights and indeces 100 | PreCalc pc; 101 | pc.pos1 = y_low * width + x_low; 102 | pc.pos2 = y_low * width + x_high; 103 | pc.pos3 = y_high * width + x_low; 104 | pc.pos4 = y_high * width + x_high; 105 | pc.w1 = w1; 106 | pc.w2 = w2; 107 | pc.w3 = w3; 108 | pc.w4 = w4; 109 | pre_calc[pre_calc_index] = pc; 110 | 111 | pre_calc_index += 1; 112 | } 113 | } 114 | } 115 | } 116 | } 117 | 118 | 119 | template 120 | void roi_align_forward_loop( 121 | const int outputElements, 122 | const T* bottom_data, // input tensor 123 | const T* bottom_rois, // input rois 124 | const T& spatial_scale, 125 | const int channels, 126 | const int height, 127 | const int width, 128 | const int pooled_height, 129 | const int pooled_width, 130 | const int sampling_ratio, 131 | const int roi_cols, // rois can have 4 or 5 columns 132 | T* top_data) // output 133 | { 134 | int n_rois = outputElements / channels / pooled_width / pooled_height; 135 | // (n, c, ph, pw) is an element in the pooled output 136 | // can be parallelized using omp 137 | // #pragma omp parallel for num_threads(32) 138 | for (int n = 0; n < n_rois; n++) { 139 | int index_n = n * channels * pooled_width * pooled_height; 140 | 141 | // roi could have 4 or 5 columns 142 | const T* offset_bottom_rois = bottom_rois + n * roi_cols; 143 | int roi_batch_ind = 0; 144 | if (roi_cols == 5) { 145 | roi_batch_ind = offset_bottom_rois[0]; 146 | offset_bottom_rois++; 147 | } 148 | 149 | // Do not using rounding; this implementation detail is critical 150 | T roi_start_w = offset_bottom_rois[0] * spatial_scale; 151 | T roi_start_h = offset_bottom_rois[1] * spatial_scale; 152 | T roi_end_w = offset_bottom_rois[2] * spatial_scale; 153 | T roi_end_h = offset_bottom_rois[3] * spatial_scale; 154 | // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); 155 | // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); 156 | // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); 157 | // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); 158 | 159 | // Force malformed ROIs to be 1x1 160 | T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); 161 | T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); 162 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 163 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 164 | 165 | // We use roi_bin_grid to 
sample the grid and mimic integral 166 | int roi_bin_grid_h = (sampling_ratio > 0) 167 | ? sampling_ratio 168 | : std::ceil(roi_height / pooled_height); // e.g., = 2 169 | int roi_bin_grid_w = 170 | (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / pooled_width); 171 | 172 | // We do average (integral) pooling inside a bin 173 | const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 174 | 175 | // we want to precalculate indeces and weights shared by all chanels, 176 | // this is the key point of optimiation 177 | std::vector> pre_calc( 178 | roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); 179 | pre_calc_for_bilinear_interpolate( 180 | height, 181 | width, 182 | pooled_height, 183 | pooled_width, 184 | roi_bin_grid_h, 185 | roi_bin_grid_w, 186 | roi_start_h, 187 | roi_start_w, 188 | bin_size_h, 189 | bin_size_w, 190 | roi_bin_grid_h, 191 | roi_bin_grid_w, 192 | pre_calc); 193 | 194 | 195 | for (int c = 0; c < channels; c++) { 196 | int index_n_c = index_n + c * pooled_width * pooled_height; 197 | const T* offset_bottom_data = 198 | bottom_data + (roi_batch_ind * channels + c) * height * width; 199 | int pre_calc_index = 0; 200 | 201 | for (int ph = 0; ph < pooled_height; ph++) { 202 | for (int pw = 0; pw < pooled_width; pw++) { 203 | int index = index_n_c + ph * pooled_width + pw; 204 | 205 | T output_val = 0.; 206 | for (int iy = 0; iy < roi_bin_grid_h; iy++) { 207 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 208 | PreCalc pc = pre_calc[pre_calc_index]; 209 | output_val += pc.w1 * offset_bottom_data[pc.pos1] + 210 | pc.w2 * offset_bottom_data[pc.pos2] + 211 | pc.w3 * offset_bottom_data[pc.pos3] + 212 | pc.w4 * offset_bottom_data[pc.pos4]; 213 | 214 | pre_calc_index += 1; 215 | } 216 | } 217 | output_val /= count; 218 | 219 | top_data[index] = output_val; 220 | } // for pw 221 | } // for ph 222 | } // for c 223 | } // for n 224 | } 225 | 226 | 227 | Tensor roi_align_forward_cpu( 228 | const Tensor& input, 229 | const Tensor& bottom_rois, 230 | int64_t pooled_height, 231 | int64_t pooled_width, 232 | double spatial_scale, 233 | int64_t sampling_ratio) 234 | { 235 | // Input is the output of the last convolutional layer in the Backbone network, so 236 | // it should be in the format of NCHW 237 | AT_CHECK(input.ndimension() == 4, "Input to RoI Pooling should be a NCHW Tensor"); 238 | 239 | // ROIs is the set of region proposals to process. 
It is a 2D Tensor where the first 240 | // dim is the # of proposals, and the second dim is the proposal itself in the form 241 | // [batch_index startW startH endW endH] 242 | AT_CHECK(bottom_rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); 243 | AT_CHECK(bottom_rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); 244 | 245 | auto num_rois = bottom_rois.size(0); 246 | auto roi_cols = bottom_rois.size(1); 247 | auto channels = input.size(1); 248 | auto height = input.size(2); 249 | auto width = input.size(3); 250 | 251 | AT_CHECK(roi_cols == 4 || roi_cols == 5, "RoI Proposals should have 4 or 5 columns"); 252 | 253 | 254 | // Output Tensor is (num_rois, C, pooled_height, pooled_width) 255 | auto output = input.type().tensor({num_rois, channels, pooled_height, pooled_width}); 256 | 257 | AT_CHECK(input.is_contiguous(), "input must be contiguous"); 258 | AT_CHECK(bottom_rois.is_contiguous(), "bottom_rois must be contiguous"); 259 | 260 | 261 | roi_align_forward_loop( 262 | output.numel(), 263 | input.data(), 264 | bottom_rois.data(), 265 | static_cast(spatial_scale), 266 | channels, 267 | height, 268 | width, 269 | pooled_height, 270 | pooled_width, 271 | sampling_ratio, 272 | roi_cols, 273 | output.data()); 274 | 275 | return output; 276 | } 277 | 278 | 279 | 280 | } 281 | } -------------------------------------------------------------------------------- /lib/cppcuda/roi_align_cuda.h: -------------------------------------------------------------------------------- 1 | namespace at { 2 | namespace contrib { 3 | 4 | Tensor roi_align_forward_cuda( 5 | const Tensor& input, 6 | const Tensor& bottom_rois, 7 | int64_t pooled_height, 8 | int64_t pooled_width, 9 | double spatial_scale, 10 | int64_t sampling_ratio); 11 | 12 | Tensor roi_align_backward_cuda( 13 | const Tensor& bottom_rois, 14 | const Tensor& grad_output, // gradient of the output of the layer 15 | int64_t b_size, 16 | int64_t channels, 17 | int64_t height, 18 | int64_t width, 19 | int64_t pooled_height, 20 | int64_t pooled_width, 21 | double spatial_scale, 22 | int64_t sampling_ratio); 23 | 24 | 25 | } 26 | } -------------------------------------------------------------------------------- /lib/cppcuda/roi_align_forward_cuda.cu: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_op.cu 2 | // (Ignacio Rocco) 3 | 4 | #include "ATen/NativeFunctions.h" 5 | #include 6 | 7 | namespace at { 8 | namespace contrib { 9 | 10 | // Use 1024 threads per block, which requires cuda sm_2x or above 11 | const int CUDA_NUM_THREADS = 1024; 12 | const int CUDA_MAX_BLOCKS = 65535; 13 | 14 | inline int GET_BLOCKS(const int N) 15 | { 16 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 17 | } 18 | 19 | __host__ __device__ __forceinline__ float fmin(float a, float b) { 20 | return a > b ? b : a; 21 | } 22 | 23 | __host__ __device__ __forceinline__ float fmax(float a, float b) { 24 | return a > b ? 
a : b; 25 | } 26 | 27 | template 28 | __device__ T bilinear_interpolate( 29 | const T* bottom_data, 30 | const int height, 31 | const int width, 32 | T y, 33 | T x, 34 | const int index /* index for debug only*/) { 35 | // deal with cases that inverse elements are out of feature map boundary 36 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 37 | // empty 38 | return 0; 39 | } 40 | 41 | if (y <= 0) { 42 | y = 0; 43 | } 44 | if (x <= 0) { 45 | x = 0; 46 | } 47 | 48 | int y_low = (int)y; 49 | int x_low = (int)x; 50 | int y_high; 51 | int x_high; 52 | 53 | if (y_low >= height - 1) { 54 | y_high = y_low = height - 1; 55 | y = (T)y_low; 56 | } else { 57 | y_high = y_low + 1; 58 | } 59 | 60 | if (x_low >= width - 1) { 61 | x_high = x_low = width - 1; 62 | x = (T)x_low; 63 | } else { 64 | x_high = x_low + 1; 65 | } 66 | 67 | T ly = y - y_low; 68 | T lx = x - x_low; 69 | T hy = 1. - ly, hx = 1. - lx; 70 | // do bilinear interpolation 71 | T v1 = bottom_data[y_low * width + x_low]; 72 | T v2 = bottom_data[y_low * width + x_high]; 73 | T v3 = bottom_data[y_high * width + x_low]; 74 | T v4 = bottom_data[y_high * width + x_high]; 75 | T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 76 | 77 | T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 78 | 79 | return val; 80 | } 81 | 82 | template 83 | __global__ void roi_align_forward_kernel( 84 | const int outputElements, 85 | const T* bottom_data, // input tensor 86 | const T* bottom_rois, // input rois 87 | const T spatial_scale, 88 | const int channels, 89 | const int height, 90 | const int width, 91 | const int pooled_height, 92 | const int pooled_width, 93 | const int sampling_ratio, 94 | T* top_data) // output 95 | { 96 | // CUDA_1D_KERNEL_LOOP(index, nthreads) { 97 | for (int index = blockIdx.x * blockDim.x + threadIdx.x; 98 | index < outputElements; 99 | index += blockDim.x * gridDim.x) 100 | { 101 | // (n, c, ph, pw) is an element in the pooled output 102 | int pw = index % pooled_width; 103 | int ph = (index / pooled_width) % pooled_height; 104 | int c = (index / pooled_width / pooled_height) % channels; 105 | int n = index / pooled_width / pooled_height / channels; 106 | 107 | const T* offset_bottom_rois = bottom_rois + n * 5; 108 | int roi_batch_ind = offset_bottom_rois[0]; 109 | 110 | // Do not using rounding; this implementation detail is critical 111 | T roi_start_w = offset_bottom_rois[1] * spatial_scale; 112 | T roi_start_h = offset_bottom_rois[2] * spatial_scale; 113 | T roi_end_w = offset_bottom_rois[3] * spatial_scale; 114 | T roi_end_h = offset_bottom_rois[4] * spatial_scale; 115 | // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); 116 | // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); 117 | // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); 118 | // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); 119 | 120 | // Force malformed ROIs to be 1x1 121 | T roi_width = fmax(roi_end_w - roi_start_w, (T)1.); 122 | T roi_height = fmax(roi_end_h - roi_start_h, (T)1.); 123 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 124 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 125 | 126 | const T* offset_bottom_data = 127 | bottom_data + (roi_batch_ind * channels + c) * height * width; 128 | 129 | // We use roi_bin_grid to sample the grid and mimic integral 130 | int roi_bin_grid_h = (sampling_ratio > 0) 131 | ? sampling_ratio 132 | : ceilf(roi_height / pooled_height); // e.g., = 2 133 | int roi_bin_grid_w = 134 | (sampling_ratio > 0) ? 
sampling_ratio : ceilf(roi_width / pooled_width); 135 | 136 | // We do average (integral) pooling inside a bin 137 | const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 138 | 139 | T output_val = 0.; 140 | for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 141 | { 142 | const T y = roi_start_h + ph * bin_size_h + 143 | static_cast(iy + .5f) * bin_size_h / 144 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 145 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 146 | const T x = roi_start_w + pw * bin_size_w + 147 | static_cast(ix + .5f) * bin_size_w / 148 | static_cast(roi_bin_grid_w); 149 | 150 | T val = bilinear_interpolate( 151 | offset_bottom_data, height, width, y, x, index); 152 | output_val += val; 153 | } 154 | } 155 | output_val /= count; 156 | 157 | top_data[index] = output_val; 158 | } 159 | } 160 | 161 | 162 | Tensor roi_align_forward_cuda( 163 | const Tensor& input, 164 | const Tensor& bottom_rois, 165 | int64_t pooled_height, 166 | int64_t pooled_width, 167 | double spatial_scale, 168 | int64_t sampling_ratio) 169 | { 170 | 171 | // Input is the output of the last convolutional layer in the Backbone network, so 172 | // it should be in the format of NCHW 173 | AT_CHECK(input.ndimension() == 4, "Input to RoI Align should be a NCHW Tensor"); 174 | 175 | // ROIs is the set of region proposals to process. It is a 2D Tensor where the first 176 | // dim is the # of proposals, and the second dim is the n itself in the form 177 | // [batch_index startW startH endW endH] 178 | AT_CHECK(bottom_rois.ndimension() == 2, "RoI Proposals should be a 2D Tensor, (batch_sz x proposals)"); 179 | AT_CHECK(bottom_rois.size(1) == 5, "Proposals should be of the form [batch_index startW startH endW enH]"); 180 | 181 | auto proposals = bottom_rois.size(0); 182 | auto channels = input.size(1); 183 | auto height = input.size(2); 184 | auto width = input.size(3); 185 | 186 | // Output Tensor is (num_rois, C, pooled_height, pooled_width) 187 | auto output = input.type().tensor({proposals, channels, pooled_height, pooled_width}); 188 | 189 | AT_CHECK(input.is_contiguous(), "input must be contiguous"); 190 | AT_CHECK(bottom_rois.is_contiguous(), "bottom_rois must be contiguous"); 191 | 192 | // dim3 block(512); 193 | // dim3 grid((output.numel() + 512 - 1) / 512); 194 | int64_t total_threads = output.numel(); 195 | int64_t blocks = fmin(GET_BLOCKS(total_threads),CUDA_MAX_BLOCKS); 196 | 197 | roi_align_forward_kernel<<>>( 198 | output.numel(), 199 | input.data(), 200 | bottom_rois.data(), 201 | static_cast(spatial_scale), 202 | channels, 203 | height, 204 | width, 205 | pooled_height, 206 | pooled_width, 207 | sampling_ratio, 208 | output.data()); 209 | AT_CHECK(cudaGetLastError() == cudaSuccess, "roi_align_forward_kernel failed"); 210 | 211 | return output; 212 | } 213 | 214 | 215 | } // at::contrib 216 | } // at -------------------------------------------------------------------------------- /lib/cppcuda_cffi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ignacio-rocco/detectorch/bc2bc84781dfe3cb85aa4639ffd21d71989c6183/lib/cppcuda_cffi/__init__.py -------------------------------------------------------------------------------- /lib/cppcuda_cffi/bind.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | 6 | sources = ['src/roi_align_forward_cpu.c'] 7 | headers = 
['src/roi_align_forward_cpu.h'] 8 | defines = [] 9 | with_cuda = False 10 | 11 | if torch.cuda.is_available(): 12 | print('Including CUDA code.') 13 | sources += ['src/roi_align_forward_cuda.c','src/roi_align_backward_cuda.c'] 14 | headers += ['src/roi_align_forward_cuda.h','src/roi_align_backward_cuda.h'] 15 | defines += [('WITH_CUDA', None)] 16 | with_cuda = True 17 | 18 | this_file = os.path.dirname(os.path.realpath(__file__)) 19 | print(this_file) 20 | extra_objects = ['src/cpp/roi_align_cpu_loop.o', 21 | 'src/cuda/roi_align_forward_cuda_kernel.cu.o', 22 | 'src/cuda/roi_align_backward_cuda_kernel.cu.o'] 23 | 24 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 25 | 26 | ffi = create_extension( 27 | 'roialign', 28 | headers=headers, 29 | sources=sources, 30 | define_macros=defines, 31 | relative_to=__file__, 32 | with_cuda=with_cuda, 33 | extra_objects=extra_objects, 34 | extra_compile_args=['-std=c11'] 35 | ) 36 | 37 | if __name__ == '__main__': 38 | ffi.build() 39 | -------------------------------------------------------------------------------- /lib/cppcuda_cffi/get_lib_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | def main(): 5 | libpath=os.path.join(os.path.dirname(torch.__file__),'lib','include') 6 | print(libpath) 7 | 8 | 9 | if __name__ == "__main__": 10 | main() 11 | 12 | -------------------------------------------------------------------------------- /lib/cppcuda_cffi/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CUDA_PATH=/usr/local/cuda/bin/ 4 | PATH=$CUDA_PATH:$PATH 5 | 6 | TORCHLIBPATH=$(python get_lib_path.py 2>&1) 7 | echo $TORCHLIBPATH 8 | 9 | cd src/cpp/ 10 | 11 | echo "Compiling roi_align_cpu.cpp with g++..." 12 | g++ -I $TORCHLIBPATH -o roi_align_cpu_loop.o roi_align_cpu_loop.cpp -fPIC -shared -std=c++0x 13 | 14 | echo "Compiling roi_align_forward_cuda_kernel.cu with nvcc..." 
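# Note: the nvcc invocations below pre-compile the forward/backward CUDA kernels
# into .cu.o object files; bind.py then links these objects (via its extra_objects
# list) into the cffi "roialign" extension. -arch=sm_52 hard-codes the target GPU
# architecture and may need to be changed for other devices.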
15 | cd ../cuda/ 16 | nvcc -c -o roi_align_forward_cuda_kernel.cu.o roi_align_forward_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 17 | nvcc -c -o roi_align_backward_cuda_kernel.cu.o roi_align_backward_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52 18 | cd ../../ 19 | 20 | python bind.py -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/cpp/roi_align_cpu_loop.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_op.cc 2 | // (Ignacio Rocco) 3 | #ifdef __cplusplus 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | struct PreCalc { 10 | int pos1; 11 | int pos2; 12 | int pos3; 13 | int pos4; 14 | float w1; 15 | float w2; 16 | float w3; 17 | float w4; 18 | }; 19 | 20 | void pre_calc_for_bilinear_interpolate( 21 | const int height, 22 | const int width, 23 | const int pooled_height, 24 | const int pooled_width, 25 | const int iy_upper, 26 | const int ix_upper, 27 | float roi_start_h, 28 | float roi_start_w, 29 | float bin_size_h, 30 | float bin_size_w, 31 | int roi_bin_grid_h, 32 | int roi_bin_grid_w, 33 | std::vector& pre_calc) { 34 | int pre_calc_index = 0; 35 | for (int ph = 0; ph < pooled_height; ph++) { 36 | for (int pw = 0; pw < pooled_width; pw++) { 37 | for (int iy = 0; iy < iy_upper; iy++) { 38 | const float yy = roi_start_h + ph * bin_size_h + 39 | static_cast(iy + .5f) * bin_size_h / 40 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 41 | for (int ix = 0; ix < ix_upper; ix++) { 42 | const float xx = roi_start_w + pw * bin_size_w + 43 | static_cast(ix + .5f) * bin_size_w / 44 | static_cast(roi_bin_grid_w); 45 | 46 | float x = xx; 47 | float y = yy; 48 | // deal with: inverse elements are out of feature map boundary 49 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 50 | // empty 51 | PreCalc pc; 52 | pc.pos1 = 0; 53 | pc.pos2 = 0; 54 | pc.pos3 = 0; 55 | pc.pos4 = 0; 56 | pc.w1 = 0; 57 | pc.w2 = 0; 58 | pc.w3 = 0; 59 | pc.w4 = 0; 60 | pre_calc[pre_calc_index] = pc; 61 | pre_calc_index += 1; 62 | continue; 63 | } 64 | 65 | if (y <= 0) { 66 | y = 0; 67 | } 68 | if (x <= 0) { 69 | x = 0; 70 | } 71 | 72 | int y_low = (int)y; 73 | int x_low = (int)x; 74 | int y_high; 75 | int x_high; 76 | 77 | if (y_low >= height - 1) { 78 | y_high = y_low = height - 1; 79 | y = (float)y_low; 80 | } else { 81 | y_high = y_low + 1; 82 | } 83 | 84 | if (x_low >= width - 1) { 85 | x_high = x_low = width - 1; 86 | x = (float)x_low; 87 | } else { 88 | x_high = x_low + 1; 89 | } 90 | 91 | float ly = y - y_low; 92 | float lx = x - x_low; 93 | float hy = 1. - ly, hx = 1. 
- lx; 94 | float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 95 | 96 | // save weights and indeces 97 | PreCalc pc; 98 | pc.pos1 = y_low * width + x_low; 99 | pc.pos2 = y_low * width + x_high; 100 | pc.pos3 = y_high * width + x_low; 101 | pc.pos4 = y_high * width + x_high; 102 | pc.w1 = w1; 103 | pc.w2 = w2; 104 | pc.w3 = w3; 105 | pc.w4 = w4; 106 | pre_calc[pre_calc_index] = pc; 107 | 108 | pre_calc_index += 1; 109 | } 110 | } 111 | } 112 | } 113 | } 114 | 115 | extern "C" { 116 | #endif 117 | 118 | void roi_align_forward_loop( 119 | const int outputElements, 120 | const float* bottom_data, // input tensor 121 | const float* bottom_rois, // input rois 122 | const float spatial_scale, 123 | const int channels, 124 | const int height, 125 | const int width, 126 | const int pooled_height, 127 | const int pooled_width, 128 | const int sampling_ratio, 129 | const int roi_cols, // rois can have 4 or 5 columns 130 | float* top_data) // output 131 | { 132 | int n_rois = outputElements / channels / pooled_width / pooled_height; 133 | // (n, c, ph, pw) is an element in the pooled output 134 | // can be parallelized using omp 135 | // #pragma omp parallel for num_threads(32) 136 | for (int n = 0; n < n_rois; n++) { 137 | int index_n = n * channels * pooled_width * pooled_height; 138 | 139 | // roi could have 4 or 5 columns 140 | const float* offset_bottom_rois = bottom_rois + n * roi_cols; 141 | int roi_batch_ind = 0; 142 | if (roi_cols == 5) { 143 | roi_batch_ind = offset_bottom_rois[0]; 144 | offset_bottom_rois++; 145 | } 146 | 147 | // Do not using rounding; this implementation detail is critical 148 | float roi_start_w = offset_bottom_rois[0] * spatial_scale; 149 | float roi_start_h = offset_bottom_rois[1] * spatial_scale; 150 | float roi_end_w = offset_bottom_rois[2] * spatial_scale; 151 | float roi_end_h = offset_bottom_rois[3] * spatial_scale; 152 | // float roi_start_w = round(offset_bottom_rois[0] * spatial_scale); 153 | // float roi_start_h = round(offset_bottom_rois[1] * spatial_scale); 154 | // float roi_end_w = round(offset_bottom_rois[2] * spatial_scale); 155 | // float roi_end_h = round(offset_bottom_rois[3] * spatial_scale); 156 | 157 | // Force malformed ROIs to be 1x1 158 | float roi_width = std::max(roi_end_w - roi_start_w, (float)1.); 159 | float roi_height = std::max(roi_end_h - roi_start_h, (float)1.); 160 | float bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 161 | float bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 162 | 163 | // We use roi_bin_grid to sample the grid and mimic integral 164 | int roi_bin_grid_h = (sampling_ratio > 0) 165 | ? sampling_ratio 166 | : std::ceil(roi_height / pooled_height); // e.g., = 2 167 | int roi_bin_grid_w = 168 | (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / pooled_width); 169 | 170 | // We do average (integral) pooling inside a bin 171 | const float count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 172 | 173 | // we want to precalculate indeces and weights shared by all chanels, 174 | // this is the key point of optimiation 175 | std::vector pre_calc( 176 | roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); 177 | pre_calc_for_bilinear_interpolate( 178 | height, 179 | width, 180 | pooled_height, 181 | pooled_width, 182 | roi_bin_grid_h, 183 | roi_bin_grid_w, 184 | roi_start_h, 185 | roi_start_w, 186 | bin_size_h, 187 | bin_size_w, 188 | roi_bin_grid_h, 189 | roi_bin_grid_w, 190 | pre_calc); 191 | 192 | 193 | for (int c = 0; c < channels; c++) { 194 | int index_n_c = index_n + c * pooled_width * pooled_height; 195 | const float* offset_bottom_data = 196 | bottom_data + (roi_batch_ind * channels + c) * height * width; 197 | int pre_calc_index = 0; 198 | 199 | for (int ph = 0; ph < pooled_height; ph++) { 200 | for (int pw = 0; pw < pooled_width; pw++) { 201 | int index = index_n_c + ph * pooled_width + pw; 202 | 203 | float output_val = 0.; 204 | for (int iy = 0; iy < roi_bin_grid_h; iy++) { 205 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 206 | PreCalc pc = pre_calc[pre_calc_index]; 207 | output_val += pc.w1 * offset_bottom_data[pc.pos1] + 208 | pc.w2 * offset_bottom_data[pc.pos2] + 209 | pc.w3 * offset_bottom_data[pc.pos3] + 210 | pc.w4 * offset_bottom_data[pc.pos4]; 211 | 212 | pre_calc_index += 1; 213 | } 214 | } 215 | output_val /= count; 216 | 217 | top_data[index] = output_val; 218 | } // for pw 219 | } // for ph 220 | } // for c 221 | } // for n 222 | } 223 | 224 | 225 | 226 | 227 | #ifdef __cplusplus 228 | } 229 | #endif 230 | -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/cpp/roi_align_cpu_loop.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | void roi_align_forward_loop( 6 | const int outputElements, 7 | const float* bottom_data, // input tensor 8 | const float* bottom_rois, // input rois 9 | const float spatial_scale, 10 | const int channels, 11 | const int height, 12 | const int width, 13 | const int pooled_height, 14 | const int pooled_width, 15 | const int sampling_ratio, 16 | const int roi_cols, // rois can have 4 or 5 columns 17 | float* top_data); 18 | 19 | #ifdef __cplusplus 20 | } 21 | #endif 22 | 23 | -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/cuda/roi_align_backward_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_gradient_op.cu 2 | // (Ignacio Rocco) 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | // Use 1024 threads per block, which requires cuda sm_2x or above 12 | const int CUDA_NUM_THREADS = 1024; 13 | const int CUDA_MAX_BLOCKS = 65535; 14 | 15 | inline int GET_BLOCKS(const int N) 16 | { 17 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 18 | } 19 | 20 | __host__ __device__ __forceinline__ float myfmin(float a, float b) { 21 | return a > b ? b : a; 22 | } 23 | 24 | __host__ __device__ __forceinline__ float myfmax(float a, float b) { 25 | return a > b ? 
a : b; 26 | } 27 | 28 | 29 | inline __device__ float gpu_atomic_add(const float val, float* address) { 30 | return atomicAdd(address, val); 31 | } 32 | 33 | __device__ void bilinear_interpolate_gradient( 34 | const int height, 35 | const int width, 36 | float y, 37 | float x, 38 | float& w1, 39 | float& w2, 40 | float& w3, 41 | float& w4, 42 | int& x_low, 43 | int& x_high, 44 | int& y_low, 45 | int& y_high, 46 | const int index /* index for debug only*/) { 47 | // deal with cases that inverse elements are out of feature map boundary 48 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 49 | // empty 50 | w1 = w2 = w3 = w4 = 0.; 51 | x_low = x_high = y_low = y_high = -1; 52 | return; 53 | } 54 | 55 | if (y <= 0) { 56 | y = 0; 57 | } 58 | if (x <= 0) { 59 | x = 0; 60 | } 61 | 62 | y_low = (int)y; 63 | x_low = (int)x; 64 | 65 | if (y_low >= height - 1) { 66 | y_high = y_low = height - 1; 67 | y = (float)y_low; 68 | } else { 69 | y_high = y_low + 1; 70 | } 71 | 72 | if (x_low >= width - 1) { 73 | x_high = x_low = width - 1; 74 | x = (float)x_low; 75 | } else { 76 | x_high = x_low + 1; 77 | } 78 | 79 | float ly = y - y_low; 80 | float lx = x - x_low; 81 | float hy = 1. - ly, hx = 1. - lx; 82 | 83 | // reference in forward 84 | // float v1 = bottom_data[y_low * width + x_low]; 85 | // float v2 = bottom_data[y_low * width + x_high]; 86 | // float v3 = bottom_data[y_high * width + x_low]; 87 | // float v4 = bottom_data[y_high * width + x_high]; 88 | // float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 89 | 90 | w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 91 | 92 | return; 93 | } 94 | 95 | __global__ void roi_align_backward_kernel( 96 | const int nthreads, 97 | const float* top_diff, 98 | const int num_rois, 99 | const float spatial_scale, 100 | const int channels, 101 | const int height, 102 | const int width, 103 | const int pooled_height, 104 | const int pooled_width, 105 | const int sampling_ratio, 106 | float* bottom_diff, 107 | const float* bottom_rois, 108 | int rois_cols) { 109 | //CUDA_1D_KERNEL_LOOP(index, nthreads) { 110 | for (int index = blockIdx.x * blockDim.x + threadIdx.x; 111 | index < nthreads; 112 | index += blockDim.x * gridDim.x) 113 | { 114 | // (n, c, ph, pw) is an element in the pooled output 115 | int pw = index % pooled_width; 116 | int ph = (index / pooled_width) % pooled_height; 117 | int c = (index / pooled_width / pooled_height) % channels; 118 | int n = index / pooled_width / pooled_height / channels; 119 | 120 | const float* offset_bottom_rois = bottom_rois + n * 5; 121 | int roi_batch_ind = offset_bottom_rois[0]; 122 | 123 | // Do not using rounding; this implementation detail is critical 124 | float roi_start_w = offset_bottom_rois[1] * spatial_scale; 125 | float roi_start_h = offset_bottom_rois[2] * spatial_scale; 126 | float roi_end_w = offset_bottom_rois[3] * spatial_scale; 127 | float roi_end_h = offset_bottom_rois[4] * spatial_scale; 128 | // float roi_start_w = round(offset_bottom_rois[1] * spatial_scale); 129 | // float roi_start_h = round(offset_bottom_rois[2] * spatial_scale); 130 | // float roi_end_w = round(offset_bottom_rois[3] * spatial_scale); 131 | // float roi_end_h = round(offset_bottom_rois[4] * spatial_scale); 132 | 133 | // Force malformed ROIs to be 1x1 134 | float roi_width = myfmax(roi_end_w - roi_start_w, (float)1.); 135 | float roi_height = myfmax(roi_end_h - roi_start_h, (float)1.); 136 | float bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 137 | float bin_size_w = static_cast(roi_width) / 
static_cast(pooled_width); 138 | 139 | float* offset_bottom_diff = 140 | bottom_diff + (roi_batch_ind * channels + c) * height * width; 141 | 142 | int top_offset = (n * channels + c) * pooled_height * pooled_width; 143 | const float* offset_top_diff = top_diff + top_offset; 144 | const float top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; 145 | 146 | // We use roi_bin_grid to sample the grid and mimic integral 147 | int roi_bin_grid_h = (sampling_ratio > 0) 148 | ? sampling_ratio 149 | : ceilf(roi_height / pooled_height); // e.g., = 2 150 | int roi_bin_grid_w = 151 | (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); 152 | 153 | // We do average (integral) pooling inside a bin 154 | const float count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 155 | 156 | for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 157 | { 158 | const float y = roi_start_h + ph * bin_size_h + 159 | static_cast(iy + .5f) * bin_size_h / 160 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 161 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 162 | const float x = roi_start_w + pw * bin_size_w + 163 | static_cast(ix + .5f) * bin_size_w / 164 | static_cast(roi_bin_grid_w); 165 | 166 | float w1, w2, w3, w4; 167 | int x_low, x_high, y_low, y_high; 168 | 169 | bilinear_interpolate_gradient( 170 | height, 171 | width, 172 | y, 173 | x, 174 | w1, 175 | w2, 176 | w3, 177 | w4, 178 | x_low, 179 | x_high, 180 | y_low, 181 | y_high, 182 | index); 183 | 184 | float g1 = top_diff_this_bin * w1 / count; 185 | float g2 = top_diff_this_bin * w2 / count; 186 | float g3 = top_diff_this_bin * w3 / count; 187 | float g4 = top_diff_this_bin * w4 / count; 188 | 189 | if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { 190 | gpu_atomic_add( 191 | static_cast(g1), offset_bottom_diff + y_low * width + x_low); 192 | gpu_atomic_add( 193 | static_cast(g2), offset_bottom_diff + y_low * width + x_high); 194 | gpu_atomic_add( 195 | static_cast(g3), offset_bottom_diff + y_high * width + x_low); 196 | gpu_atomic_add( 197 | static_cast(g4), offset_bottom_diff + y_high * width + x_high); 198 | } // if 199 | } // ix 200 | } // iy 201 | } // CUDA_1D_KERNEL_LOOP 202 | } // RoIAlignBackward 203 | 204 | int launch_roi_align_backward_cuda( 205 | const int nthreads, 206 | const float* top_diff, 207 | const int num_rois, 208 | const float spatial_scale, 209 | const int channels, 210 | const int height, 211 | const int width, 212 | const int pooled_height, 213 | const int pooled_width, 214 | const int sampling_ratio, 215 | float* bottom_diff, 216 | const float* bottom_rois, 217 | int roi_cols, 218 | cudaStream_t stream) 219 | { 220 | 221 | int64_t blocks = myfmin(GET_BLOCKS(nthreads),CUDA_MAX_BLOCKS); 222 | 223 | roi_align_backward_kernel<<>>( 224 | nthreads, 225 | top_diff, 226 | num_rois, 227 | spatial_scale, 228 | channels, 229 | height, 230 | width, 231 | pooled_height, 232 | pooled_width, 233 | sampling_ratio, 234 | bottom_diff, 235 | bottom_rois, 236 | roi_cols); 237 | 238 | // check for errors 239 | cudaError_t err = cudaGetLastError(); 240 | if (err != cudaSuccess) { 241 | printf("error in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); 242 | //THError("aborting"); 243 | return 0; 244 | } 245 | return 1; 246 | 247 | } 248 | 249 | 250 | #ifdef __cplusplus 251 | } 252 | #endif -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/cuda/roi_align_backward_cuda_kernel.h: 
-------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_gradient_op.cu 2 | // (Ignacio Rocco) 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | int launch_roi_align_backward_cuda( 8 | const int nthreads, 9 | const float* top_diff, 10 | const int num_rois, 11 | const float spatial_scale, 12 | const int channels, 13 | const int height, 14 | const int width, 15 | const int pooled_height, 16 | const int pooled_width, 17 | const int sampling_ratio, 18 | float* bottom_diff, 19 | const float* bottom_rois, 20 | int roi_cols, 21 | cudaStream_t stream); 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/cuda/roi_align_forward_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_op.cu 2 | // (Ignacio Rocco) 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | // Use 1024 threads per block, which requires cuda sm_2x or above 13 | const int CUDA_NUM_THREADS = 1024; 14 | const int CUDA_MAX_BLOCKS = 65535; 15 | 16 | inline int GET_BLOCKS(const int N) 17 | { 18 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 19 | } 20 | 21 | __host__ __device__ __forceinline__ float myfmin(float a, float b) { 22 | return a > b ? b : a; 23 | } 24 | 25 | __host__ __device__ __forceinline__ float myfmax(float a, float b) { 26 | return a > b ? a : b; 27 | } 28 | 29 | __device__ float bilinear_interpolate( 30 | const float* bottom_data, 31 | const int height, 32 | const int width, 33 | float y, 34 | float x, 35 | const int index /* index for debug only*/) { 36 | // deal with cases that inverse elements are out of feature map boundary 37 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 38 | // empty 39 | return 0; 40 | } 41 | 42 | if (y <= 0) { 43 | y = 0; 44 | } 45 | if (x <= 0) { 46 | x = 0; 47 | } 48 | 49 | int y_low = (int)y; 50 | int x_low = (int)x; 51 | int y_high; 52 | int x_high; 53 | 54 | if (y_low >= height - 1) { 55 | y_high = y_low = height - 1; 56 | y = (float)y_low; 57 | } else { 58 | y_high = y_low + 1; 59 | } 60 | 61 | if (x_low >= width - 1) { 62 | x_high = x_low = width - 1; 63 | x = (float)x_low; 64 | } else { 65 | x_high = x_low + 1; 66 | } 67 | 68 | float ly = y - y_low; 69 | float lx = x - x_low; 70 | float hy = 1. - ly, hx = 1. 
- lx; 71 | // do bilinear interpolation 72 | float v1 = bottom_data[y_low * width + x_low]; 73 | float v2 = bottom_data[y_low * width + x_high]; 74 | float v3 = bottom_data[y_high * width + x_low]; 75 | float v4 = bottom_data[y_high * width + x_high]; 76 | float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 77 | 78 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 79 | 80 | return val; 81 | } 82 | 83 | __global__ void roi_align_forward_kernel( 84 | const int outputElements, 85 | const float* bottom_data, // input tensor 86 | const float* bottom_rois, // input rois 87 | const float spatial_scale, 88 | const int channels, 89 | const int height, 90 | const int width, 91 | const int pooled_height, 92 | const int pooled_width, 93 | const int sampling_ratio, 94 | float* top_data) // output 95 | { 96 | // CUDA_1D_KERNEL_LOOP(index, nthreads) { 97 | for (int index = blockIdx.x * blockDim.x + threadIdx.x; 98 | index < outputElements; 99 | index += blockDim.x * gridDim.x) 100 | { 101 | // (n, c, ph, pw) is an element in the pooled output 102 | int pw = index % pooled_width; 103 | int ph = (index / pooled_width) % pooled_height; 104 | int c = (index / pooled_width / pooled_height) % channels; 105 | int n = index / pooled_width / pooled_height / channels; 106 | 107 | const float* offset_bottom_rois = bottom_rois + n * 5; 108 | int roi_batch_ind = offset_bottom_rois[0]; 109 | 110 | // Do not using rounding; this implementation detail is critical 111 | float roi_start_w = offset_bottom_rois[1] * spatial_scale; 112 | float roi_start_h = offset_bottom_rois[2] * spatial_scale; 113 | float roi_end_w = offset_bottom_rois[3] * spatial_scale; 114 | float roi_end_h = offset_bottom_rois[4] * spatial_scale; 115 | // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); 116 | // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); 117 | // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); 118 | // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); 119 | 120 | // Force malformed ROIs to be 1x1 121 | float roi_width = myfmax(roi_end_w - roi_start_w, (float)1.); 122 | float roi_height = myfmax(roi_end_h - roi_start_h, (float)1.); 123 | float bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 124 | float bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 125 | 126 | const float* offset_bottom_data = 127 | bottom_data + (roi_batch_ind * channels + c) * height * width; 128 | 129 | // We use roi_bin_grid to sample the grid and mimic integral 130 | int roi_bin_grid_h = (sampling_ratio > 0) 131 | ? sampling_ratio 132 | : ceilf(roi_height / pooled_height); // e.g., = 2 133 | int roi_bin_grid_w = 134 | (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); 135 | 136 | // We do average (integral) pooling inside a bin 137 | const float count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 138 | 139 | float output_val = 0.; 140 | for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 141 | { 142 | const float y = roi_start_h + ph * bin_size_h + 143 | static_cast(iy + .5f) * bin_size_h / 144 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 145 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 146 | const float x = roi_start_w + pw * bin_size_w + 147 | static_cast(ix + .5f) * bin_size_w / 148 | static_cast(roi_bin_grid_w); 149 | 150 | float val = bilinear_interpolate( 151 | offset_bottom_data, height, width, y, x, index); 152 | output_val += val; 153 | } 154 | } 155 | output_val /= count; 156 | 157 | top_data[index] = output_val; 158 | } 159 | } 160 | 161 | int launch_roi_align_forward_cuda( 162 | const int outputElements, 163 | const float* bottom_data, // input tensor 164 | const float* bottom_rois, // input rois 165 | const float spatial_scale, 166 | const int channels, 167 | const int height, 168 | const int width, 169 | const int pooled_height, 170 | const int pooled_width, 171 | const int sampling_ratio, 172 | float* top_data, 173 | cudaStream_t stream) 174 | { 175 | 176 | int64_t blocks = myfmin(GET_BLOCKS(outputElements),CUDA_MAX_BLOCKS); 177 | 178 | roi_align_forward_kernel<<>>( 179 | outputElements, 180 | bottom_data, // input tensor 181 | bottom_rois, // input rois 182 | spatial_scale, 183 | channels, 184 | height, 185 | width, 186 | pooled_height, 187 | pooled_width, 188 | sampling_ratio, 189 | top_data); 190 | 191 | // check for errors 192 | cudaError_t err = cudaGetLastError(); 193 | if (err != cudaSuccess) { 194 | printf("error in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err)); 195 | //THError("aborting"); 196 | return 0; 197 | } 198 | return 1; 199 | 200 | } 201 | 202 | 203 | #ifdef __cplusplus 204 | } 205 | #endif -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/cuda/roi_align_forward_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_op.cu 2 | // (Ignacio Rocco) 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | int launch_roi_align_forward_cuda( 8 | const int outputElements, 9 | const float* bottom_data, // input tensor 10 | const float* bottom_rois, // input rois 11 | const float spatial_scale, 12 | const int channels, 13 | const int height, 14 | const int width, 15 | const int pooled_height, 16 | const int pooled_width, 17 | const int sampling_ratio, 18 | float* top_data, 19 | cudaStream_t stream); 20 | 21 | 22 | #ifdef __cplusplus 23 | } 24 | #endif -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/roi_align_backward_cuda.c: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_gradient_op.cu 2 | // (Ignacio Rocco) 3 | #include 4 | #include 5 | #include 6 | #include "cuda/roi_align_backward_cuda_kernel.h" 7 | 8 | #define real float 9 | 10 | // this symbol will be resolved automatically from PyTorch libs 11 | extern THCState *state; 12 | 13 | int roi_align_backward_cuda( 14 | THCudaTensor *bottom_rois, 15 | THCudaTensor *grad_output, // gradient of the output of the layer 16 | THCudaTensor *output, 17 | int64_t pooled_height, 18 | int64_t pooled_width, 19 | double spatial_scale, 20 | int64_t sampling_ratio) 21 | { 22 | 23 | // ROIs is the set of region proposals to 
process. It is a 2D Tensor where the first 24 | // dim is the # of proposals, and the second dim is the proposal itself in the form 25 | // [batch_index startW startH endW endH] 26 | int num_rois = THCudaTensor_size(state, bottom_rois, 0); 27 | int roi_cols = THCudaTensor_size(state, bottom_rois, 1); 28 | int channels = THCudaTensor_size(state, output, 1); 29 | int height = THCudaTensor_size(state, output, 2); 30 | int width = THCudaTensor_size(state, output, 3); 31 | 32 | 33 | int64_t total_threads = num_rois*channels*pooled_height*pooled_width; 34 | 35 | cudaStream_t stream = THCState_getCurrentStream(state); 36 | 37 | launch_roi_align_backward_cuda( 38 | total_threads, 39 | THCudaTensor_data(state, grad_output), 40 | num_rois, 41 | spatial_scale, 42 | channels, 43 | height, 44 | width, 45 | pooled_height, 46 | pooled_width, 47 | sampling_ratio, 48 | THCudaTensor_data(state, output), 49 | THCudaTensor_data(state, bottom_rois), 50 | roi_cols, 51 | stream); 52 | 53 | return 1; 54 | } 55 | -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/roi_align_backward_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_align_backward_cuda( 2 | THCudaTensor *bottom_rois, 3 | THCudaTensor *grad_output, // gradient of the output of the layer 4 | THCudaTensor *output, 5 | int64_t pooled_height, 6 | int64_t pooled_width, 7 | double spatial_scale, 8 | int64_t sampling_ratio); -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/roi_align_forward_cpu.c: -------------------------------------------------------------------------------- 1 | // Adapted from https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_op.cc 2 | // (Ignacio Rocco) 3 | 4 | #include 5 | #include 6 | #include 7 | #include "cpp/roi_align_cpu_loop.h" 8 | 9 | #define real float 10 | 11 | int roi_align_forward_cpu( 12 | THFloatTensor *input, 13 | THFloatTensor *bottom_rois, 14 | THFloatTensor *output, 15 | int64_t pooled_height, 16 | int64_t pooled_width, 17 | double spatial_scale, 18 | int64_t sampling_ratio) 19 | { 20 | 21 | int proposals = THFloatTensor_size(bottom_rois, 0); 22 | int roi_cols = THFloatTensor_size(bottom_rois, 1); 23 | int channels = THFloatTensor_size(input, 1); 24 | int height = THFloatTensor_size(input, 2); 25 | int width = THFloatTensor_size(input, 3); 26 | 27 | 28 | int64_t total_threads = proposals*channels*pooled_height*pooled_width; 29 | 30 | roi_align_forward_loop( 31 | total_threads, 32 | THFloatTensor_data(input), 33 | THFloatTensor_data(bottom_rois), 34 | (float)(spatial_scale), 35 | channels, 36 | height, 37 | width, 38 | pooled_height, 39 | pooled_width, 40 | sampling_ratio, 41 | roi_cols, 42 | THFloatTensor_data(output)); 43 | 44 | return 1; 45 | } 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/roi_align_forward_cpu.h: -------------------------------------------------------------------------------- 1 | 2 | int roi_align_forward_cpu( 3 | THFloatTensor *input, 4 | THFloatTensor *bottom_rois, 5 | THFloatTensor *output, 6 | int64_t pooled_height, 7 | int64_t pooled_width, 8 | double spatial_scale, 9 | int64_t sampling_ratio); -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/roi_align_forward_cuda.c: -------------------------------------------------------------------------------- 1 | // Adapted from 
https://github.com/caffe2/caffe2/blob/master/caffe2/operators/roi_align_op.cu 2 | // (Ignacio Rocco) 3 | #include 4 | #include 5 | #include 6 | #include "cuda/roi_align_forward_cuda_kernel.h" 7 | 8 | 9 | #define real float 10 | 11 | // this symbol will be resolved automatically from PyTorch libs 12 | extern THCState *state; 13 | 14 | 15 | int roi_align_forward_cuda( 16 | THCudaTensor *input, 17 | THCudaTensor *bottom_rois, 18 | THCudaTensor *output, 19 | int64_t pooled_height, 20 | int64_t pooled_width, 21 | double spatial_scale, 22 | int64_t sampling_ratio) 23 | { 24 | 25 | int proposals = THCudaTensor_size(state, bottom_rois, 0); 26 | int channels = THCudaTensor_size(state, input, 1); 27 | int height = THCudaTensor_size(state, input, 2); 28 | int width = THCudaTensor_size(state, input, 3); 29 | 30 | 31 | int64_t total_threads = proposals*channels*pooled_height*pooled_width; 32 | 33 | cudaStream_t stream = THCState_getCurrentStream(state); 34 | 35 | launch_roi_align_forward_cuda( 36 | total_threads, 37 | THCudaTensor_data(state, input), 38 | THCudaTensor_data(state, bottom_rois), 39 | (float)(spatial_scale), 40 | channels, 41 | height, 42 | width, 43 | pooled_height, 44 | pooled_width, 45 | sampling_ratio, 46 | THCudaTensor_data(state, output), 47 | stream); 48 | 49 | return 1; 50 | } 51 | 52 | -------------------------------------------------------------------------------- /lib/cppcuda_cffi/src/roi_align_forward_cuda.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward_cuda( 2 | THCudaTensor *input, 3 | THCudaTensor *bottom_rois, 4 | THCudaTensor *output, 5 | int64_t pooled_height, 6 | int64_t pooled_width, 7 | double spatial_scale, 8 | int64_t sampling_ratio); -------------------------------------------------------------------------------- /lib/data/coco_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | from torch.utils.data import Dataset 5 | import numpy as np 6 | import skimage.io as io 7 | 8 | from data.json_dataset import JsonDataset 9 | from data.roidb import roidb_for_training 10 | 11 | class CocoDataset(Dataset): 12 | 13 | def __init__(self, 14 | ann_file, 15 | img_dir, 16 | sample_transform=None, 17 | proposal_file=None, 18 | num_classes=81, 19 | proposal_limit=1000, 20 | mode='test'): 21 | self.img_dir = img_dir 22 | if mode=='test': 23 | self.coco = JsonDataset(annotation_file=ann_file,image_directory=img_dir) ## needed for evaluation 24 | #self.img_ids = sorted(list(self.coco.COCO.imgs.keys())) 25 | #self.classes = self.coco.classes 26 | self.num_classes=num_classes 27 | self.sample_transform = sample_transform 28 | # load proposals 29 | self.proposals=None 30 | if mode=='test': 31 | self.roidb = self.coco.get_roidb(proposal_file=proposal_file,proposal_limit=proposal_limit) 32 | #self.proposals = [entry['boxes'][entry['gt_classes'] == 0] for entry in roidb] # remove gt boxes 33 | elif mode=='train': 34 | print('creating roidb for training') 35 | self.roidb = roidb_for_training(annotation_files=ann_file, 36 | image_directories=img_dir, 37 | proposal_files=proposal_file) 38 | 39 | def __len__(self): 40 | return len(self.roidb) 41 | 42 | def __getitem__(self, idx): 43 | # get db entry 44 | dbentry = self.roidb[idx] 45 | # load image 46 | image_fn = dbentry['image'] 47 | image = io.imread(image_fn) 48 | # convert grayscale to RGB 49 | if len(image.shape) == 2: 50 | image = np.repeat(np.expand_dims(image,2), 3, axis=2) 51 | # flip 
if needed (in these cases proposal coords are already flipped in roidb) 52 | if dbentry['flipped']: 53 | image = image[:, ::-1, :] 54 | 55 | # # get proposals 56 | # proposal_coords = torch.FloatTensor([-1]) 57 | # if self.proposals is not None: 58 | # sample['proposal_coords']=torch.FloatTensor(self.roidb[idx]['boxes']) 59 | 60 | # initially the sample is just composed of the loaded image and the dbentry 61 | sample = {'image': image, 'dbentry': dbentry} 62 | 63 | # the sample transform will do the preprocessing and convert to the inputs required by the network 64 | if self.sample_transform is not None: 65 | sample = self.sample_transform(sample) 66 | 67 | return sample 68 | -------------------------------------------------------------------------------- /lib/data/roidb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | 16 | """Functions for common roidb manipulations.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | from __future__ import unicode_literals 22 | 23 | # from past.builtins import basestring # in python 3: pip install future 24 | #import logging 25 | import numpy as np 26 | 27 | from data.json_dataset import JsonDataset 28 | import utils.boxes as box_utils 29 | #import utils.keypoints as keypoint_utils 30 | import utils.segms as segm_utils 31 | 32 | 33 | class logging(): # overwrite logger with dummy class which prints 34 | def info(self,s): 35 | print(s) 36 | def debug(self,s): 37 | # print('debug: '+s) 38 | return 39 | 40 | 41 | #logger = logging.getLogger(__name__) 42 | logger = logging() 43 | 44 | def roidb_for_training(annotation_files, 45 | image_directories, 46 | proposal_files, 47 | train_crowd_filter_thresh=0.7, 48 | use_flipped=True, 49 | train_fg_thresh=0.5, 50 | train_bg_thresh_hi=0.5, 51 | train_bg_thresh_lo=0, 52 | keypoints_on=False, 53 | bbox_thresh=0.5, 54 | cls_agnostic_bbox_reg=False, 55 | bbox_reg_weights=(10.0, 10.0, 5.0, 5.0)): 56 | """Load and concatenate roidbs for one or more datasets, along with optional 57 | object proposals. The roidb entries are then prepared for use in training, 58 | which involves caching certain types of metadata for each roidb entry. 
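
    A minimal usage sketch (the file names are placeholders, not files shipped
    with this repo):
        roidb = roidb_for_training(annotation_files='instances_train.json',
                                   image_directories='images/train/',
                                   proposal_files=())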
59 | """ 60 | def get_roidb(annotation_file, image_directory, proposal_file): 61 | ds = JsonDataset(annotation_file,image_directory) 62 | roidb = ds.get_roidb( 63 | gt=True, 64 | proposal_file=proposal_file, 65 | crowd_filter_thresh=train_crowd_filter_thresh 66 | ) 67 | if use_flipped: 68 | logger.info('Appending horizontally-flipped training examples...') 69 | extend_with_flipped_entries(roidb, ds) 70 | logger.info('Loaded dataset: {:s}'.format(ds.name)) 71 | return roidb 72 | 73 | if isinstance(annotation_files, str): 74 | annotation_files = (annotation_files, ) 75 | if isinstance(image_directories, str): 76 | image_directories = (image_directories, ) 77 | if isinstance(proposal_files, str): 78 | proposal_files = (proposal_files, ) 79 | if len(proposal_files) == 0: 80 | proposal_files = (None, ) * len(annotation_files) 81 | assert len(annotation_files) == len(image_directories) and len(annotation_files) == len(proposal_files) 82 | 83 | # if isinstance(annotation_files,(list,tuple)) and isinstance(image_directories,(list,tuple)) and isinstance(proposal_files,(list,tuple)): 84 | roidbs = [get_roidb(*args) for args in zip(annotation_files, image_directories, proposal_files)] 85 | roidb = roidbs[0] 86 | if len(annotation_files)>1: 87 | for r in roidbs[1:]: 88 | roidb.extend(r) 89 | # elif isinstance(annotation_files,str) and isinstance(image_directories,str) and isinstance(proposal_files,str): 90 | # roidb = get_roidb(annotation_files,image_directories,proposal_files) 91 | 92 | roidb = filter_for_training(roidb,train_fg_thresh,train_bg_thresh_hi,train_bg_thresh_lo,keypoints_on) 93 | 94 | logger.info('Computing bounding-box regression targets...') 95 | add_bbox_regression_targets(roidb,bbox_thresh,cls_agnostic_bbox_reg,bbox_reg_weights) 96 | logger.info('done') 97 | 98 | _compute_and_log_stats(roidb) 99 | 100 | return roidb 101 | 102 | 103 | def extend_with_flipped_entries(roidb, dataset): 104 | """Flip each entry in the given roidb and return a new roidb that is the 105 | concatenation of the original roidb and the flipped entries. 106 | 107 | "Flipping" an entry means that that image and associated metadata (e.g., 108 | ground truth boxes and object proposals) are horizontally flipped. 109 | """ 110 | flipped_roidb = [] 111 | for entry in roidb: 112 | width = entry['width'] 113 | boxes = entry['boxes'].copy() 114 | oldx1 = boxes[:, 0].copy() 115 | oldx2 = boxes[:, 2].copy() 116 | boxes[:, 0] = width - oldx2 - 1 117 | boxes[:, 2] = width - oldx1 - 1 118 | assert (boxes[:, 2] >= boxes[:, 0]).all() 119 | flipped_entry = {} 120 | dont_copy = ('boxes', 'segms', 'gt_keypoints', 'flipped') 121 | for k, v in entry.items(): 122 | if k not in dont_copy: 123 | flipped_entry[k] = v 124 | flipped_entry['boxes'] = boxes 125 | flipped_entry['segms'] = segm_utils.flip_segms( 126 | entry['segms'], entry['height'], entry['width'] 127 | ) 128 | # if dataset.keypoints is not None: 129 | # flipped_entry['gt_keypoints'] = keypoint_utils.flip_keypoints( 130 | # dataset.keypoints, dataset.keypoint_flip_map, 131 | # entry['gt_keypoints'], entry['width'] 132 | # ) 133 | flipped_entry['flipped'] = True 134 | flipped_roidb.append(flipped_entry) 135 | roidb.extend(flipped_roidb) 136 | 137 | 138 | def filter_for_training(roidb, 139 | train_fg_thresh, 140 | train_bg_thresh_hi, 141 | train_bg_thresh_lo, 142 | keypoints_on): 143 | """Remove roidb entries that have no usable RoIs based on config settings. 
144 | """ 145 | def is_valid(entry): 146 | # Valid images have: 147 | # (1) At least one foreground RoI OR 148 | # (2) At least one background RoI 149 | overlaps = entry['max_overlaps'] 150 | # find boxes with sufficient overlap 151 | fg_inds = np.where(overlaps >= train_fg_thresh)[0] 152 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 153 | bg_inds = np.where((overlaps < train_bg_thresh_hi) & 154 | (overlaps >= train_bg_thresh_lo))[0] 155 | # image is only valid if such boxes exist 156 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 157 | if keypoints_on: 158 | # If we're training for keypoints, exclude images with no keypoints 159 | valid = valid and entry['has_visible_keypoints'] 160 | return valid 161 | 162 | num = len(roidb) 163 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 164 | num_after = len(filtered_roidb) 165 | logger.info('Filtered {} roidb entries: {} -> {}'. 166 | format(num - num_after, num, num_after)) 167 | return filtered_roidb 168 | 169 | 170 | def add_bbox_regression_targets(roidb,bbox_thresh,cls_agnostic_bbox_reg,bbox_reg_weights): 171 | """Add information needed to train bounding-box regressors.""" 172 | for entry in roidb: 173 | entry['bbox_targets'] = _compute_targets(entry,bbox_thresh,cls_agnostic_bbox_reg,bbox_reg_weights) 174 | 175 | 176 | def _compute_targets(entry,bbox_thresh,cls_agnostic_bbox_reg,bbox_reg_weights): 177 | """Compute bounding-box regression targets for an image.""" 178 | # Indices of ground-truth ROIs 179 | rois = entry['boxes'] 180 | overlaps = entry['max_overlaps'] 181 | labels = entry['max_classes'] 182 | gt_inds = np.where((entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] 183 | # Targets has format (class, tx, ty, tw, th) 184 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 185 | if len(gt_inds) == 0: 186 | # Bail if the image has no ground-truth ROIs 187 | return targets 188 | 189 | # Indices of examples for which we try to make predictions 190 | ex_inds = np.where(overlaps >= bbox_thresh)[0] 191 | 192 | # Get IoU overlap between each ex ROI and gt ROI 193 | ex_gt_overlaps = box_utils.bbox_overlaps( 194 | rois[ex_inds, :].astype(dtype=np.float32, copy=False), 195 | rois[gt_inds, :].astype(dtype=np.float32, copy=False)) 196 | 197 | # Find which gt ROI each ex ROI has max overlap with: 198 | # this will be the ex ROI's gt target 199 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 200 | gt_rois = rois[gt_inds[gt_assignment], :] 201 | ex_rois = rois[ex_inds, :] 202 | # Use class "1" for all boxes if using class_agnostic_bbox_reg 203 | targets[ex_inds, 0] = ( 204 | 1 if cls_agnostic_bbox_reg else labels[ex_inds]) 205 | targets[ex_inds, 1:] = box_utils.bbox_transform_inv(ex_rois, gt_rois, bbox_reg_weights) 206 | return targets 207 | 208 | 209 | def _compute_and_log_stats(roidb): 210 | classes = roidb[0]['dataset'].classes 211 | char_len = np.max([len(c) for c in classes]) 212 | hist_bins = np.arange(len(classes) + 1) 213 | 214 | # Histogram of ground-truth objects 215 | gt_hist = np.zeros((len(classes)), dtype=np.int) 216 | for entry in roidb: 217 | gt_inds = np.where( 218 | (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] 219 | gt_classes = entry['gt_classes'][gt_inds] 220 | gt_hist += np.histogram(gt_classes, bins=hist_bins)[0] 221 | logger.debug('Ground-truth class histogram:') 222 | for i, v in enumerate(gt_hist): 223 | logger.debug( 224 | '{:d}{:s}: {:d}'.format( 225 | i, classes[i].rjust(char_len), v)) 226 | logger.debug('-' * char_len) 227 | logger.debug( 228 | '{:s}: 
{:d}'.format( 229 | 'total'.rjust(char_len), np.sum(gt_hist))) 230 | -------------------------------------------------------------------------------- /lib/model/collect_and_distribute_fpn_rpn_proposals.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | from __future__ import unicode_literals 20 | 21 | import torch 22 | from torch.autograd import Variable 23 | 24 | import numpy as np 25 | from utils.multilevel_rois import map_rois_to_fpn_levels 26 | 27 | from math import log2 28 | # from core.config import cfg 29 | # from datasets import json_dataset 30 | # import modeling.FPN as fpn 31 | # import roi_data.fast_rcnn 32 | # import utils.blob as blob_utils 33 | 34 | 35 | class CollectAndDistributeFpnRpnProposals(torch.nn.Module): 36 | def __init__(self, spatial_scales, train=False): 37 | super(CollectAndDistributeFpnRpnProposals, self).__init__() 38 | self._train = train 39 | self.rpn_levels = [int(log2(1/s)) for s in spatial_scales] 40 | self.rpn_min_level = self.rpn_levels[0] 41 | self.rpn_max_level = self.rpn_levels[-1] 42 | 43 | def forward(self, roi_list, roi_score_list): 44 | """See modeling.detector.CollectAndDistributeFpnRpnProposals for 45 | inputs/outputs documentation. 46 | """ 47 | # inputs is 48 | # [rpn_rois_fpn2, ..., rpn_rois_fpn6, 49 | # rpn_roi_probs_fpn2, ..., rpn_roi_probs_fpn6] 50 | # If training with Faster R-CNN, then inputs will additionally include 51 | # + [roidb, im_info] 52 | rois = collect(roi_list, roi_score_list, self._train) 53 | 54 | # ************** WARNING *************** 55 | # TRAINING CODE BELOW NOT CONVERTED TO PYTORCH 56 | # ************** WARNING *************** 57 | 58 | # if self._train: 59 | # # During training we reuse the data loader code. We populate roidb 60 | # # entries on the fly using the rois generated by RPN. 61 | # # im_info: [[im_height, im_width, im_scale], ...] 62 | # im_info = inputs[-1].data 63 | # im_scales = im_info[:, 2] 64 | # roidb = blob_utils.deserialize(inputs[-2].data) 65 | # # For historical consistency with the original Faster R-CNN 66 | # # implementation we are *not* filtering crowd proposals. 67 | # # This choice should be investigated in the future (it likely does 68 | # # not matter). 
69 | # json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0) 70 | # # Compute training labels for the RPN proposals; also handles 71 | # # distributing the proposals over FPN levels 72 | # output_blob_names = roi_data.fast_rcnn.get_fast_rcnn_blob_names() 73 | # blobs = {k: [] for k in output_blob_names} 74 | # roi_data.fast_rcnn.add_fast_rcnn_blobs(blobs, im_scales, roidb) 75 | # for i, k in enumerate(output_blob_names): 76 | # blob_utils.py_op_copy_blob(blobs[k], outputs[i]) 77 | # else: 78 | # # For inference we have a special code path that avoids some data 79 | # # loader overhead 80 | # distribute(rois, None, outputs, self._train) 81 | return distribute(rois, self.rpn_min_level, self.rpn_max_level) #, None, outputs, self._train) 82 | 83 | 84 | def collect(roi_inputs, score_inputs, train): 85 | #cfg_key = 'TRAIN' if is_training else 'TEST' 86 | post_nms_topN = 2000 if train else 1000 # cfg[cfg_key].RPN_POST_NMS_TOP_N 87 | # k_max = 6 #cfg.FPN.RPN_MAX_LEVEL 88 | # k_min = 2 #cfg.FPN.RPN_MIN_LEVEL 89 | # num_lvls = k_max - k_min + 1 90 | # roi_inputs = inputs[:num_lvls] 91 | # score_inputs = inputs[num_lvls:] 92 | # if is_training: 93 | # score_inputs = score_inputs[:-2] 94 | 95 | # rois are in [[batch_idx, x0, y0, x1, y2], ...] format 96 | # Combine predictions across all levels and retain the top scoring 97 | #rois = np.concatenate([blob.data for blob in roi_inputs]) 98 | rois = torch.cat(tuple(roi_inputs),0) 99 | #scores = np.concatenate([blob.data for blob in score_inputs]).squeeze() 100 | scores = torch.cat(tuple(score_inputs),0).squeeze() 101 | #inds = np.argsort(-scores)[:post_nms_topN] 102 | vals, inds = torch.sort(-scores) 103 | #rois = rois[inds, :] 104 | rois = rois[inds[:post_nms_topN], :] 105 | return rois 106 | 107 | 108 | def distribute(rois, lvl_min, lvl_max): #, label_blobs, outputs, train): 109 | """To understand the output blob order see return value of 110 | roi_data.fast_rcnn.get_fast_rcnn_blob_names(is_training=False) 111 | """ 112 | # lvl_min = 2 #cfg.FPN.ROI_MIN_LEVEL 113 | # lvl_max = 5 #cfg.FPN.ROI_MAX_LEVEL 114 | lvls = map_rois_to_fpn_levels(rois.data.cpu().numpy(), lvl_min, lvl_max) 115 | 116 | # outputs[0].reshape(rois.shape) 117 | # outputs[0].data[...] = rois 118 | 119 | # Create new roi blobs for each FPN level 120 | # (See: modeling.FPN.add_multilevel_roi_blobs which is similar but annoying 121 | # to generalize to support this particular case.) 
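    # map_rois_to_fpn_levels assigns each RoI to a pyramid level (presumably via the
    # standard FPN heuristic k = floor(k0 + log2(sqrt(w*h)/224)), clipped to
    # [lvl_min, lvl_max]), so small boxes are pooled from fine feature maps and
    # large boxes from coarse ones. The loop below groups the RoIs per level, and
    # rois_idx_restore (an argsort of the concatenation order) lets the caller map
    # the per-level RoIAlign outputs back to the original RoI ordering.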
122 | rois_idx_order = np.empty((0, )) 123 | distr_rois=[] 124 | for output_idx, lvl in enumerate(range(lvl_min, lvl_max + 1)): 125 | idx_lvl = np.where(lvls == lvl)[0] 126 | distr_rois.append(rois[idx_lvl, :]) 127 | rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) 128 | rois_idx_restore = np.argsort(rois_idx_order) 129 | return distr_rois, rois_idx_restore -------------------------------------------------------------------------------- /lib/model/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pickle 3 | import numpy as np 4 | #import copy 5 | import torchvision.models as models 6 | from model.roi_align import RoIAlign 7 | from model.generate_proposals import GenerateProposals 8 | from utils.utils import isnan,infbreak,printmax 9 | 10 | from torch.autograd import Variable 11 | from torch.nn.functional import cross_entropy 12 | 13 | def smooth_L1(pred,targets,alpha_in,alpha_out,beta=1.0): 14 | x=(pred-targets)*alpha_in 15 | xabs=torch.abs(x) 16 | y1=0.5*x**2/beta 17 | y2=xabs-0.5*beta 18 | case1=torch.le(xabs,beta).float() 19 | case2=1-case1 20 | return torch.sum((y1*case1+y2*case2)*alpha_out)/pred.size(0) 21 | 22 | def accuracy(cls_score,cls_labels): 23 | class_dim = cls_score.dim()-1 24 | argmax=torch.max(torch.nn.functional.softmax(cls_score,dim=class_dim),class_dim)[1] 25 | accuracy = torch.mean(torch.eq(argmax,cls_labels.long()).float()) 26 | return accuracy 27 | 28 | # class detector_loss(torch.nn.Module): 29 | # def __init__(self, do_loss_cls=True, do_loss_bbox=True, do_accuracy_cls=True): 30 | # super(detector_loss, self).__init__() 31 | # # Flags 32 | # self.do_loss_cls = do_loss_cls 33 | # self.do_loss_bbox = do_loss_bbox 34 | # self.do_accuracy_cls = do_accuracy_cls 35 | # # Dicts for losses 36 | # # self.losses={} 37 | # # if do_loss_cls: 38 | # # self.losses['loss_cls']=0 39 | # # if do_loss_bbox: 40 | # # self.losses['loss_bbox']=0 41 | # # # Dicts for metrics 42 | # # self.metrics={} 43 | # # if do_accuracy_cls: 44 | # # self.metrics['accuracy_cls']=0 45 | 46 | # def forward(self, 47 | # cls_score, 48 | # cls_labels, 49 | # bbox_pred, 50 | # bbox_targets, 51 | # bbox_inside_weights, 52 | # bbox_outside_weights): 53 | 54 | # # compute losses 55 | # losses=[] 56 | # if self.do_loss_cls: 57 | # loss_cls = cross_entropy(cls_score,cls_labels.long()) 58 | # losses.append(loss_cls) 59 | # if self.do_loss_bbox: 60 | # loss_bbox = smooth_L1(bbox_pred,bbox_targets,bbox_inside_weights,bbox_outside_weights) 61 | # losses.append(loss_bbox) 62 | 63 | # # # compute metrics 64 | # # if self.do_accuracy_cls: 65 | # # self.metrics['accuracy_cls'] = accuracy(cls_score,cls_labels.long()) 66 | 67 | # # sum total loss 68 | # #loss = torch.sum(torch.cat(tuple([v.unsqueeze(0) for v in losses]),0)) 69 | 70 | # # loss.register_hook(printmax) 71 | 72 | # return tuple(losses) 73 | -------------------------------------------------------------------------------- /lib/model/roi_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from torch.nn.modules.module import Module 4 | from torch.autograd import Variable 5 | import os 6 | from torch.autograd.function import once_differentiable 7 | 8 | torch_ver = torch.__version__[:3] 9 | 10 | if torch_ver=="0.4": 11 | from torch.utils.cpp_extension import load 12 | build_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),'../cppcuda/build/')) 13 | 14 | 
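    # Two build paths are used for the RoIAlign extension: with PyTorch 0.4 the
    # C++/CUDA sources in lib/cppcuda are JIT-compiled here via
    # torch.utils.cpp_extension.load, while older versions fall back to the prebuilt
    # cffi module from lib/cppcuda_cffi (built beforehand with its make.sh).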
print('compiling/loading roi_align') 15 | roialign = load(name='roialign',sources=['lib/cppcuda/roi_align_binding.cpp', 16 | 'lib/cppcuda/roi_align_forward_cuda.cu', 17 | 'lib/cppcuda/roi_align_backward_cuda.cu'], 18 | build_directory=build_path,verbose=True) 19 | else: 20 | import cppcuda_cffi.roialign as roialign 21 | 22 | 23 | class RoIAlignFunction(Function): 24 | # def __init__(ctx, pooled_height, pooled_width, spatial_scale, sampling_ratio): 25 | # ctx.pooled_width = int(pooled_width) 26 | # ctx.pooled_height = int(pooled_height) 27 | # ctx.spatial_scale = float(spatial_scale) 28 | # ctx.sampling_ratio = int(sampling_ratio) 29 | # ctx.features_size = None 30 | # ctx.rois=None 31 | 32 | @staticmethod 33 | def forward(ctx, features, rois, pooled_height, pooled_width, spatial_scale, sampling_ratio): 34 | #ctx.save_for_backward(rois) 35 | ctx.rois=rois 36 | ctx.features_size=features.size() 37 | ctx.pooled_height=pooled_height 38 | ctx.pooled_width=pooled_width 39 | ctx.spatial_scale=spatial_scale 40 | ctx.sampling_ratio=sampling_ratio 41 | 42 | # compute 43 | if features.is_cuda != rois.is_cuda: 44 | raise TypeError('features and rois should be on same device (CPU or GPU)') 45 | elif features.is_cuda and rois.is_cuda : 46 | if torch_ver=="0.4": 47 | output = roialign.roi_align_forward_cuda(features, 48 | rois, 49 | pooled_height, 50 | pooled_width, 51 | spatial_scale, 52 | sampling_ratio) 53 | else: 54 | num_channels = features.size(1) 55 | num_rois = rois.size(0) 56 | output = torch.zeros(num_rois, num_channels, pooled_height, pooled_width).cuda() 57 | roialign.roi_align_forward_cuda(features, 58 | rois, 59 | output, 60 | pooled_height, 61 | pooled_width, 62 | spatial_scale, 63 | sampling_ratio) 64 | 65 | elif features.is_cuda==False and rois.is_cuda==False: 66 | if torch_ver=="0.4": 67 | output = roialign.roi_align_forward_cpu(features, 68 | rois, 69 | pooled_height, 70 | pooled_width, 71 | spatial_scale, 72 | sampling_ratio) 73 | else: 74 | num_channels = features.size(1) 75 | num_rois = rois.size(0) 76 | output = torch.zeros(num_rois, num_channels, pooled_height, pooled_width) 77 | roialign.roi_align_forward_cpu(features, 78 | rois, 79 | output, 80 | pooled_height, 81 | pooled_width, 82 | spatial_scale, 83 | sampling_ratio) 84 | 85 | 86 | if torch_ver=="0.4": 87 | return Variable(output,requires_grad=True) 88 | else: 89 | return output 90 | 91 | @staticmethod 92 | @once_differentiable 93 | def backward(ctx, grad_output): 94 | #rois, = ctx.saved_variables 95 | rois = ctx.rois 96 | features_size=ctx.features_size 97 | pooled_height=ctx.pooled_height 98 | pooled_width=ctx.pooled_width 99 | spatial_scale=ctx.spatial_scale 100 | sampling_ratio=ctx.sampling_ratio 101 | 102 | #rois = ctx.rois 103 | if rois.is_cuda: 104 | if torch_ver=="0.4": 105 | grad_input = roialign.roi_align_backward_cuda(rois, 106 | grad_output, 107 | features_size[0], 108 | features_size[1], 109 | features_size[2], 110 | features_size[3], 111 | pooled_height, 112 | pooled_width, 113 | spatial_scale, 114 | sampling_ratio) 115 | else: 116 | #import pdb; pdb.set_trace() 117 | grad_input = torch.zeros(features_size).cuda(rois.get_device()) # <- the problem! 
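# Note: unlike the torch 0.4 cpp-extension branch above, the cffi kernel below accumulates
# gradients into a caller-provided buffer, so a dense zero tensor with the full feature-map
# shape (N, C, H, W) has to be allocated on every backward call, on the same GPU as `rois`
# (hence the explicit .cuda(rois.get_device())).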
118 | roialign.roi_align_backward_cuda(rois, 119 | grad_output, 120 | grad_input, 121 | pooled_height, 122 | pooled_width, 123 | spatial_scale, 124 | sampling_ratio) 125 | 126 | else: 127 | if torch_ver=="0.4": 128 | grad_input = roialign.roi_align_backward_cpu(rois, 129 | grad_output, 130 | features_size[0], 131 | features_size[1], 132 | features_size[2], 133 | features_size[3], 134 | pooled_height, 135 | pooled_width, 136 | spatial_scale, 137 | sampling_ratio) 138 | else: 139 | raise NotImplementedError("backward pass not implemented on cpu in cffi extension") 140 | 141 | # import pdb; pdb.set_trace() 142 | if torch_ver=="0.4": 143 | return Variable(grad_input), None, None, None, None, None 144 | else: 145 | return grad_input, None, None, None, None, None 146 | 147 | 148 | 149 | 150 | class RoIAlign(Module): 151 | def __init__(self, pooled_height, pooled_width, spatial_scale, sampling_ratio=0): 152 | super(RoIAlign, self).__init__() 153 | 154 | self.pooled_height=int(pooled_height) 155 | self.pooled_width=int(pooled_width) 156 | self.spatial_scale=float(spatial_scale) 157 | self.sampling_ratio=int(sampling_ratio) 158 | 159 | def forward(self, features, rois): 160 | # features is a Variable/FloatTensor of size BxCxHxW 161 | # rois is an (optional list of) Variable/FloatTensor IDX,Xmin,Ymin,Xmax,Ymax (normalized to [0,1]) 162 | rois = preprocess_rois(rois) 163 | output = RoIAlignFunction.apply(features, 164 | rois, 165 | self.pooled_height, 166 | self.pooled_width, 167 | self.spatial_scale, 168 | self.sampling_ratio) 169 | return output 170 | 171 | 172 | def preprocess_rois(rois): 173 | # do some verifications on what has been passed as rois 174 | if isinstance(rois,list): # if list, convert to single tensor (used for multiscale) 175 | rois = torch.cat(tuple(rois),0) 176 | if isinstance(rois,Variable): 177 | if rois.dim()==3: 178 | if rois.size(0)==1: 179 | rois = rois.squeeze(0) 180 | else: 181 | raise ValueError("rois has wrong size") 182 | if rois.size(1)==4: 183 | # add zeros 184 | zeros = Variable(torch.zeros((rois.size(0),1))) 185 | if rois.is_cuda: 186 | zeros = zeros.cuda() 187 | rois = torch.cat((zeros,rois),1).contiguous() 188 | return rois -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # Fast R-CNN 19 | # Copyright (c) 2015 Microsoft 20 | # Licensed under The MIT License [see LICENSE for details] 21 | # Written by Ross Girshick 22 | # -------------------------------------------------------- 23 | 24 | import cv2 25 | import numpy as np 26 | 27 | def im_list_to_blob(ims,fpn_on=False,fpn_coarsest_stride=32): 28 | """Convert a list of images into a network input.
Assumes images were 29 | prepared using prep_im_for_blob or equivalent: i.e. 30 | - BGR channel order 31 | - pixel means subtracted 32 | - resized to the desired input size 33 | - float32 numpy ndarray format 34 | Output is a 4D HCHW tensor of the images concatenated along axis 0 with 35 | shape. 36 | """ 37 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 38 | # Pad the image so they can be divisible by a stride 39 | if fpn_on: 40 | stride = float(fpn_coarsest_stride) 41 | max_shape[0] = int(np.ceil(max_shape[0] / stride) * stride) 42 | max_shape[1] = int(np.ceil(max_shape[1] / stride) * stride) 43 | 44 | num_images = len(ims) 45 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 46 | dtype=np.float32) 47 | for i in range(num_images): 48 | im = ims[i] 49 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 50 | # Move channels (axis 3) to axis 1 51 | # Axis order will become: (batch elem, channel, height, width) 52 | channel_swap = (0, 3, 1, 2) 53 | blob = blob.transpose(channel_swap) 54 | return blob 55 | 56 | 57 | def prep_im_for_blob(im, pixel_means=[122.7717, 115.9465, 102.9801], target_sizes=[800], max_size=1333): 58 | """Prepare an image for use as a network input blob. Specially: 59 | - Subtract per-channel pixel mean 60 | - Convert to float32 61 | - Rescale to each of the specified target size (capped at max_size) 62 | Returns a list of transformed images, one for each target size. Also returns 63 | the scale factors that were used to compute each returned image. 64 | """ 65 | im = im.astype(np.float32, copy=False) 66 | im -= pixel_means 67 | im_shape = im.shape 68 | im_size_min = np.min(im_shape[0:2]) 69 | im_size_max = np.max(im_shape[0:2]) 70 | 71 | ims = [] 72 | im_scales = [] 73 | for target_size in target_sizes: 74 | im_scale = float(target_size) / float(im_size_min) 75 | # Prevent the biggest axis from being more than max_size 76 | if np.round(im_scale * im_size_max) > max_size: 77 | im_scale = float(max_size) / float(im_size_max) 78 | # BUGGY im is replaced by scaled im 79 | # im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 80 | # interpolation=cv2.INTER_LINEAR) 81 | # ims.append(im) 82 | im_prime = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 83 | interpolation=cv2.INTER_LINEAR) 84 | ims.append(im_prime) 85 | im_scales.append(im_scale) 86 | 87 | return ims, im_scales 88 | 89 | def get_rois_blob(im_rois, im_scale): 90 | """Converts RoIs into network inputs. 91 | Arguments: 92 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 93 | im_scale_factors (list): scale factors as returned by _get_image_blob 94 | Returns: 95 | blob (ndarray): R x 5 matrix of RoIs in the image pyramid with columns 96 | [level, x1, y1, x2, y2] 97 | """ 98 | rois, levels = project_im_rois(im_rois, im_scale) 99 | rois_blob = np.hstack((levels, rois)) 100 | return rois_blob.astype(np.float32, copy=False) 101 | 102 | 103 | def project_im_rois(im_rois, scales): 104 | """Project image RoIs into the image pyramid built by _get_image_blob. 
105 | Arguments: 106 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 107 | scales (list): scale factors as returned by _get_image_blob 108 | Returns: 109 | rois (ndarray): R x 4 matrix of projected RoI coordinates 110 | levels (ndarray): image pyramid levels used by each projected RoI 111 | """ 112 | rois = im_rois.astype(np.float, copy=False) * scales 113 | levels = np.zeros((im_rois.shape[0], 1), dtype=np.int) 114 | return rois, levels -------------------------------------------------------------------------------- /lib/utils/collate_custom.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import collections 3 | #from torch.utils.data.dataloader import default_collate 4 | import itertools 5 | 6 | def collate_custom(batch,key=None): 7 | """ Custom collate function for the Dataset class 8 | * It doesn't convert numpy arrays to stacked-tensors, but rather combines them in a list 9 | * This is useful for processing annotations of different sizes 10 | """ 11 | 12 | # this case will occur in first pass, and will convert a 13 | # list of dictionaries (returned by the threads by sampling dataset[idx]) 14 | # to a unified dictionary of collated values 15 | if isinstance(batch[0], collections.Mapping): 16 | return {key: collate_custom([d[key] for d in batch],key) for key in batch[0]} 17 | # these cases will occur in recursion 18 | #elif torch.is_tensor(batch[0]): # for tensors, use standrard collating function 19 | #return default_collate(batch) 20 | elif isinstance(batch,list) and isinstance(batch[0],list): # flatten lists of lists 21 | flattened_list = list(itertools.chain(*batch)) 22 | return flattened_list 23 | elif isinstance(batch,list) and len(batch)==1: # lists of length 1, remove list wrap 24 | return batch[0] 25 | else: # for other types (i.e. lists of len!=1), return as is 26 | return batch 27 | 28 | -------------------------------------------------------------------------------- /lib/utils/collections.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
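# Example of the attribute dictionary defined below (illustrative sketch):
#   cfg = AttrDict()
#   cfg.TRAIN = AttrDict()
#   cfg.TRAIN.BATCH_SIZE = 2                  # attribute writes are stored as dict entries
#   assert cfg['TRAIN']['BATCH_SIZE'] == 2    # so dict-style access sees the same value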
14 | ############################################################################## 15 | 16 | """A simple attribute dictionary used for representing configuration options.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | from __future__ import unicode_literals 22 | 23 | 24 | class AttrDict(dict): 25 | 26 | def __getattr__(self, name): 27 | if name in self.__dict__: 28 | return self.__dict__[name] 29 | elif name in self: 30 | return self[name] 31 | else: 32 | raise AttributeError(name) 33 | 34 | def __setattr__(self, name, value): 35 | if name in self.__dict__: 36 | self.__dict__[name] = value 37 | else: 38 | self[name] = value 39 | -------------------------------------------------------------------------------- /lib/utils/colormap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | 16 | """An awesome colormap for really neat visualizations.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | from __future__ import unicode_literals 22 | 23 | import numpy as np 24 | 25 | 26 | def colormap(rgb=False): 27 | color_list = np.array( 28 | [ 29 | 0.000, 0.447, 0.741, 30 | 0.850, 0.325, 0.098, 31 | 0.929, 0.694, 0.125, 32 | 0.494, 0.184, 0.556, 33 | 0.466, 0.674, 0.188, 34 | 0.301, 0.745, 0.933, 35 | 0.635, 0.078, 0.184, 36 | 0.300, 0.300, 0.300, 37 | 0.600, 0.600, 0.600, 38 | 1.000, 0.000, 0.000, 39 | 1.000, 0.500, 0.000, 40 | 0.749, 0.749, 0.000, 41 | 0.000, 1.000, 0.000, 42 | 0.000, 0.000, 1.000, 43 | 0.667, 0.000, 1.000, 44 | 0.333, 0.333, 0.000, 45 | 0.333, 0.667, 0.000, 46 | 0.333, 1.000, 0.000, 47 | 0.667, 0.333, 0.000, 48 | 0.667, 0.667, 0.000, 49 | 0.667, 1.000, 0.000, 50 | 1.000, 0.333, 0.000, 51 | 1.000, 0.667, 0.000, 52 | 1.000, 1.000, 0.000, 53 | 0.000, 0.333, 0.500, 54 | 0.000, 0.667, 0.500, 55 | 0.000, 1.000, 0.500, 56 | 0.333, 0.000, 0.500, 57 | 0.333, 0.333, 0.500, 58 | 0.333, 0.667, 0.500, 59 | 0.333, 1.000, 0.500, 60 | 0.667, 0.000, 0.500, 61 | 0.667, 0.333, 0.500, 62 | 0.667, 0.667, 0.500, 63 | 0.667, 1.000, 0.500, 64 | 1.000, 0.000, 0.500, 65 | 1.000, 0.333, 0.500, 66 | 1.000, 0.667, 0.500, 67 | 1.000, 1.000, 0.500, 68 | 0.000, 0.333, 1.000, 69 | 0.000, 0.667, 1.000, 70 | 0.000, 1.000, 1.000, 71 | 0.333, 0.000, 1.000, 72 | 0.333, 0.333, 1.000, 73 | 0.333, 0.667, 1.000, 74 | 0.333, 1.000, 1.000, 75 | 0.667, 0.000, 1.000, 76 | 0.667, 0.333, 1.000, 77 | 0.667, 0.667, 1.000, 78 | 0.667, 1.000, 1.000, 79 | 1.000, 0.000, 1.000, 80 | 1.000, 0.333, 1.000, 81 | 1.000, 0.667, 1.000, 82 | 0.167, 0.000, 0.000, 83 | 0.333, 0.000, 0.000, 84 | 0.500, 0.000, 0.000, 85 | 0.667, 0.000, 0.000, 86 | 0.833, 0.000, 0.000, 87 | 1.000, 0.000, 0.000, 88 | 0.000, 0.167, 0.000, 89 | 
0.000, 0.333, 0.000, 90 | 0.000, 0.500, 0.000, 91 | 0.000, 0.667, 0.000, 92 | 0.000, 0.833, 0.000, 93 | 0.000, 1.000, 0.000, 94 | 0.000, 0.000, 0.167, 95 | 0.000, 0.000, 0.333, 96 | 0.000, 0.000, 0.500, 97 | 0.000, 0.000, 0.667, 98 | 0.000, 0.000, 0.833, 99 | 0.000, 0.000, 1.000, 100 | 0.000, 0.000, 0.000, 101 | 0.143, 0.143, 0.143, 102 | 0.286, 0.286, 0.286, 103 | 0.429, 0.429, 0.429, 104 | 0.571, 0.571, 0.571, 105 | 0.714, 0.714, 0.714, 106 | 0.857, 0.857, 0.857, 107 | 1.000, 1.000, 1.000 108 | ] 109 | ).astype(np.float32) 110 | color_list = color_list.reshape((-1, 3)) * 255 111 | if not rgb: 112 | color_list = color_list[:, ::-1] 113 | return color_list 114 | -------------------------------------------------------------------------------- /lib/utils/data_parallel.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import torch 3 | import warnings 4 | from torch.nn import Module 5 | from torch.nn.parallel.scatter_gather import scatter_kwargs, gather 6 | from torch.nn.parallel.replicate import replicate 7 | from torch.nn.parallel.parallel_apply import parallel_apply 8 | 9 | class DataParallel(torch.nn.DataParallel): 10 | def __init__(self, *args, **kwargs): 11 | super(DataParallel, self).__init__(*args, **kwargs) 12 | 13 | def scatter(self, inputs, kwargs, device_ids): # scatter a list of len N into N gpus 14 | return scatter_lists(inputs, kwargs, device_ids) 15 | 16 | def scatter_lists(inputs, kwargs,device_ids): 17 | n_inputs = len(inputs) 18 | n_devices = len(device_ids) 19 | for i in range(n_inputs): 20 | assert(len(inputs[i])==n_devices) 21 | inputs=tuple([tuple([inputs[i][j].cuda(device_ids[j]) for i in range(n_inputs)]) for j in range(n_devices)]) 22 | return inputs,kwargs 23 | 24 | 25 | def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None, dont_scatter=False, dont_gather=False): 26 | r"""Evaluates module(input) in parallel across the GPUs given in device_ids. 27 | 28 | This is the functional version of the DataParallel module. 29 | 30 | Args: 31 | module: the module to evaluate in parallel 32 | inputs: inputs to the module 33 | device_ids: GPU ids on which to replicate module 34 | output_device: GPU location of the output. Use -1 to indicate the CPU.
35 | (default: device_ids[0]) 36 | Returns: 37 | a Variable containing the result of module(input) located on 38 | output_device 39 | """ 40 | if not isinstance(inputs, tuple): 41 | inputs = (inputs,) 42 | #print('getting device_ids') 43 | if device_ids is None: 44 | device_ids = list(range(torch.cuda.device_count())) 45 | #print(device_ids) 46 | if output_device is None: 47 | output_device = device_ids[0] 48 | 49 | if dont_scatter==False: 50 | do_scatter_lists=isinstance(inputs[0],list) 51 | if do_scatter_lists: 52 | inputs, module_kwargs = scatter_lists(inputs, module_kwargs, device_ids) 53 | else: 54 | inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim) 55 | 56 | if len(device_ids) == 1: 57 | return module(*inputs[0], **module_kwargs[0]) 58 | #print('getting used device_ids') 59 | used_device_ids = device_ids[:len(inputs)] 60 | #print(used_device_ids) 61 | #print('making model replicas') 62 | replicas = replicate(module, used_device_ids) 63 | #print('applying model') 64 | outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids) 65 | if dont_gather: 66 | return tuple([[out[i] for out in outputs] for i in range(len(outputs[0]))]) 67 | #print('gathering result') 68 | return gather(outputs, output_device, dim) -------------------------------------------------------------------------------- /lib/utils/dummy_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | """Provide stub objects that can act as stand-in "dummy" datasets for simple use 16 | cases, like getting all classes in a dataset. This exists so that demos can be 17 | run without requiring users to download/install datasets first. 
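For example (illustrative):
    >>> ds = get_coco_dataset()
    >>> ds.classes[0], ds.classes[1]
    ('__background__', 'person')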
18 | """ 19 | 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | from __future__ import unicode_literals 24 | 25 | from utils.collections import AttrDict 26 | 27 | 28 | def get_coco_dataset(): 29 | """A dummy COCO dataset that includes only the 'classes' field.""" 30 | ds = AttrDict() 31 | classes = [ 32 | '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 33 | 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 34 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 35 | 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 36 | 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 37 | 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 38 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 39 | 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 40 | 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 41 | 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 42 | 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 43 | 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 44 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush' 45 | ] 46 | ds.classes = {i: name for i, name in enumerate(classes)} 47 | return ds 48 | -------------------------------------------------------------------------------- /lib/utils/fast_rcnn_sample_rois.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # Fast R-CNN 19 | # Copyright (c) 2015 Microsoft 20 | # Licensed under The MIT License [see LICENSE for details] 21 | # Written by Ross Girshick 22 | # -------------------------------------------------------- 23 | 24 | import numpy as np 25 | import numpy.random as npr 26 | 27 | 28 | 29 | def ones(shape, int32=False): 30 | """Return a blob of all ones of the given shape with the correct float or 31 | int data type. 32 | """ 33 | return np.ones(shape, dtype=np.int32 if int32 else np.float32) 34 | 35 | def zeros(shape, int32=False): 36 | """Return a blob of all zeros of the given shape with the correct float or 37 | int data type. 
38 | """ 39 | return np.zeros(shape, dtype=np.int32 if int32 else np.float32) 40 | 41 | def fast_rcnn_sample_rois(roidb, 42 | im_scale, 43 | batch_idx, 44 | train_batch_size_per_image=512, # rois per im 45 | train_fg_roi_fraction=0.25, 46 | train_fg_thresh=0.5, 47 | train_bg_thresh_hi=0.5, 48 | train_bg_thresh_lo=0, 49 | mask_on=False, 50 | keypoints_on=False 51 | ): 52 | #print('debug: setting random seed 1234 in fast_rcnn.py: _sample_rois()') 53 | # npr.seed(1234) # DEBUG 54 | """Generate a random sample of RoIs comprising foreground and background 55 | examples. 56 | """ 57 | rois_per_image = int(train_batch_size_per_image) 58 | fg_rois_per_image = int(np.round(train_fg_roi_fraction * rois_per_image)) 59 | max_overlaps = roidb['max_overlaps'] 60 | 61 | # Select foreground RoIs as those with >= FG_THRESH overlap 62 | fg_inds = np.where(max_overlaps >= train_fg_thresh)[0] 63 | # Guard against the case when an image has fewer than fg_rois_per_image 64 | # foreground RoIs 65 | fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) 66 | # Sample foreground regions without replacement 67 | if fg_inds.size > 0: 68 | fg_inds = npr.choice( 69 | fg_inds, size=fg_rois_per_this_image, replace=False 70 | ) 71 | 72 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 73 | bg_inds = np.where( 74 | (max_overlaps < train_bg_thresh_hi) & 75 | (max_overlaps >= train_bg_thresh_lo) 76 | )[0] 77 | # Compute number of background RoIs to take from this image (guarding 78 | # against there being fewer than desired) 79 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 80 | bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size) 81 | # Sample foreground regions without replacement 82 | if bg_inds.size > 0: 83 | bg_inds = npr.choice( 84 | bg_inds, size=bg_rois_per_this_image, replace=False 85 | ) 86 | 87 | # The indices that we're selecting (both fg and bg) 88 | keep_inds = np.append(fg_inds, bg_inds) 89 | # Label is the class each RoI has max overlap with 90 | sampled_labels = roidb['max_classes'][keep_inds] 91 | sampled_labels[fg_rois_per_this_image:] = 0 # Label bg RoIs with class 0 92 | sampled_boxes = roidb['boxes'][keep_inds] 93 | 94 | if 'bbox_targets' not in roidb: 95 | gt_inds = np.where(roidb['gt_classes'] > 0)[0] 96 | gt_boxes = roidb['boxes'][gt_inds, :] 97 | gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]] 98 | bbox_targets = _compute_targets( 99 | sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels 100 | ) 101 | bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets) 102 | else: 103 | bbox_targets, bbox_inside_weights = _expand_bbox_targets( 104 | roidb['bbox_targets'][keep_inds, :] 105 | ) 106 | 107 | bbox_outside_weights = np.array( 108 | bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype 109 | ) 110 | 111 | # Scale rois and format as (batch_idx, x1, y1, x2, y2) 112 | sampled_rois = sampled_boxes * im_scale 113 | repeated_batch_idx = batch_idx * ones((sampled_rois.shape[0], 1)) 114 | sampled_rois = np.hstack((repeated_batch_idx, sampled_rois)) 115 | 116 | # Base Fast R-CNN blobs 117 | blob_dict = dict( 118 | labels_int32=sampled_labels.astype(np.int32, copy=False), 119 | rois=sampled_rois, 120 | bbox_targets=bbox_targets, 121 | bbox_inside_weights=bbox_inside_weights, 122 | bbox_outside_weights=bbox_outside_weights 123 | ) 124 | 125 | # # Optionally add Mask R-CNN blobs 126 | # if mask_on: 127 | # roi_data.mask_rcnn.add_mask_rcnn_blobs( 128 | # blob_dict, sampled_boxes, roidb, im_scale, 
batch_idx 129 | # ) 130 | 131 | # # Optionally add Keypoint R-CNN blobs 132 | # if keypoints_on: 133 | # roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs( 134 | # blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx 135 | # ) 136 | 137 | return blob_dict 138 | 139 | def _expand_bbox_targets(bbox_target_data, num_classes=81, cls_agnostic_bbox_reg=False): 140 | """Bounding-box regression targets are stored in a compact form in the 141 | roidb. 142 | This function expands those targets into the 4-of-4*K representation used 143 | by the network (i.e. only one class has non-zero targets). The loss weights 144 | are similarly expanded. 145 | Returns: 146 | bbox_target_data (ndarray): N x 4K blob of regression targets 147 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 148 | """ 149 | num_bbox_reg_classes = num_classes 150 | if cls_agnostic_bbox_reg: 151 | num_bbox_reg_classes = 2 # bg and fg 152 | 153 | clss = bbox_target_data[:, 0] 154 | bbox_targets = zeros((clss.size, 4 * num_bbox_reg_classes)) 155 | bbox_inside_weights = zeros(bbox_targets.shape) 156 | inds = np.where(clss > 0)[0] 157 | for ind in inds: 158 | cls = int(clss[ind]) 159 | start = 4 * cls 160 | end = start + 4 161 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 162 | bbox_inside_weights[ind, start:end] = (1.0, 1.0, 1.0, 1.0) 163 | return bbox_targets, bbox_inside_weights -------------------------------------------------------------------------------- /lib/utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
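# Illustrative call of generate_anchors() below (a sketch; the single-size-per-level
# convention and the size 32 at stride 4 follow common FPN practice and are not values
# read from this file):
#   generate_anchors(stride=4, sizes=(32,), aspect_ratios=(0.5, 1, 2))
#   # -> (3, 4) array of (x1, y1, x2, y2) windows, one anchor per aspect ratio,
#   #    each with an area of roughly 32**2 pixels.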
14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # Faster R-CNN 19 | # Copyright (c) 2015 Microsoft 20 | # Licensed under The MIT License [see LICENSE for details] 21 | # Written by Ross Girshick and Sean Bell 22 | # -------------------------------------------------------- 23 | 24 | import numpy as np 25 | 26 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 27 | # 28 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 29 | # >> anchors 30 | # 31 | # anchors = 32 | # 33 | # -83 -39 100 56 34 | # -175 -87 192 104 35 | # -359 -183 376 200 36 | # -55 -55 72 72 37 | # -119 -119 136 136 38 | # -247 -247 264 264 39 | # -35 -79 52 96 40 | # -79 -167 96 184 41 | # -167 -343 184 360 42 | 43 | # array([[ -83., -39., 100., 56.], 44 | # [-175., -87., 192., 104.], 45 | # [-359., -183., 376., 200.], 46 | # [ -55., -55., 72., 72.], 47 | # [-119., -119., 136., 136.], 48 | # [-247., -247., 264., 264.], 49 | # [ -35., -79., 52., 96.], 50 | # [ -79., -167., 96., 184.], 51 | # [-167., -343., 184., 360.]]) 52 | 53 | 54 | def generate_anchors( 55 | stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) 56 | ): 57 | """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors 58 | are centered on stride / 2, have (approximate) sqrt areas of the specified 59 | sizes, and aspect ratios as given. 60 | """ 61 | return _generate_anchors( 62 | stride, 63 | np.array(sizes, dtype=np.float) / stride, 64 | np.array(aspect_ratios, dtype=np.float) 65 | ) 66 | 67 | 68 | def _generate_anchors(base_size, scales, aspect_ratios): 69 | """Generate anchor (reference) windows by enumerating aspect ratios X 70 | scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. 71 | """ 72 | anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 73 | anchors = _ratio_enum(anchor, aspect_ratios) 74 | anchors = np.vstack( 75 | [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] 76 | ) 77 | return anchors 78 | 79 | 80 | def _whctrs(anchor): 81 | """Return width, height, x center, and y center for an anchor (window).""" 82 | w = anchor[2] - anchor[0] + 1 83 | h = anchor[3] - anchor[1] + 1 84 | x_ctr = anchor[0] + 0.5 * (w - 1) 85 | y_ctr = anchor[1] + 0.5 * (h - 1) 86 | return w, h, x_ctr, y_ctr 87 | 88 | 89 | def _mkanchors(ws, hs, x_ctr, y_ctr): 90 | """Given a vector of widths (ws) and heights (hs) around a center 91 | (x_ctr, y_ctr), output a set of anchors (windows). 
92 | """ 93 | ws = ws[:, np.newaxis] 94 | hs = hs[:, np.newaxis] 95 | anchors = np.hstack( 96 | ( 97 | x_ctr - 0.5 * (ws - 1), 98 | y_ctr - 0.5 * (hs - 1), 99 | x_ctr + 0.5 * (ws - 1), 100 | y_ctr + 0.5 * (hs - 1) 101 | ) 102 | ) 103 | return anchors 104 | 105 | 106 | def _ratio_enum(anchor, ratios): 107 | """Enumerate a set of anchors for each aspect ratio wrt an anchor.""" 108 | w, h, x_ctr, y_ctr = _whctrs(anchor) 109 | size = w * h 110 | size_ratios = size / ratios 111 | ws = np.round(np.sqrt(size_ratios)) 112 | hs = np.round(ws * ratios) 113 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 114 | return anchors 115 | 116 | 117 | def _scale_enum(anchor, scales): 118 | """Enumerate a set of anchors for each scale wrt an anchor.""" 119 | w, h, x_ctr, y_ctr = _whctrs(anchor) 120 | ws = w * scales 121 | hs = h * scales 122 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 123 | return anchors -------------------------------------------------------------------------------- /lib/utils/io.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | 16 | """IO utilities.""" 17 | 18 | import pickle 19 | import os 20 | 21 | def save_object(obj, file_name): 22 | """Save a Python object by pickling it.""" 23 | file_name = os.path.abspath(file_name) 24 | with open(file_name, 'wb') as f: 25 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 26 | -------------------------------------------------------------------------------- /lib/utils/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | ############################################################################## 15 | 16 | """Utilities for logging.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | from __future__ import unicode_literals 22 | 23 | from collections import deque 24 | from email.mime.text import MIMEText 25 | import json 26 | import logging 27 | import numpy as np 28 | import smtplib 29 | import sys 30 | 31 | # Print lower precision floating point values than default FLOAT_REPR 32 | json.encoder.FLOAT_REPR = lambda o: format(o, '.6f') 33 | 34 | 35 | def log_json_stats(stats, sort_keys=True): 36 | print('json_stats: {:s}'.format(json.dumps(stats, sort_keys=sort_keys))) 37 | 38 | 39 | class SmoothedValue(object): 40 | """Track a series of values and provide access to smoothed values over a 41 | window or the global series average. 42 | """ 43 | 44 | def __init__(self, window_size): 45 | self.deque = deque(maxlen=window_size) 46 | self.series = [] 47 | self.total = 0.0 48 | self.count = 0 49 | 50 | def AddValue(self, value): 51 | self.deque.append(value) 52 | self.series.append(value) 53 | self.count += 1 54 | self.total += value 55 | 56 | def GetMedianValue(self): 57 | return np.median(self.deque) 58 | 59 | def GetAverageValue(self): 60 | return np.mean(self.deque) 61 | 62 | def GetGlobalAverageValue(self): 63 | return self.total / self.count 64 | 65 | 66 | def send_email(subject, body, to): 67 | s = smtplib.SMTP('localhost') 68 | mime = MIMEText(body) 69 | mime['Subject'] = subject 70 | mime['To'] = to 71 | s.sendmail('detectron', to, mime.as_string()) 72 | 73 | 74 | def setup_logging(name): 75 | FORMAT = '%(levelname)s %(filename)s:%(lineno)4d: %(message)s' 76 | # Manually clear root loggers to prevent any module that may have called 77 | # logging.basicConfig() from blocking our logging setup 78 | logging.root.handlers = [] 79 | logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) 80 | logger = logging.getLogger(name) 81 | return logger -------------------------------------------------------------------------------- /lib/utils/multilevel_rois.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | 16 | import utils.boxes as box_utils 17 | import numpy as np 18 | 19 | def add_multilevel_rois_for_test(blobs, name, roi_min_level=2,roi_max_level=5): 20 | """Distributes a set of RoIs across FPN pyramid levels by creating new level 21 | specific RoI blobs. 
22 | 23 | Arguments: 24 | blobs (dict): dictionary of blobs 25 | name (str): a key in 'blobs' identifying the source RoI blob 26 | 27 | Returns: 28 | [by ref] blobs (dict): new keys named by `name + 'fpn' + level` 29 | are added to dict each with a value that's an R_level x 5 ndarray of 30 | RoIs (see _get_rois_blob for format) 31 | """ 32 | lvl_min = roi_min_level 33 | lvl_max = roi_max_level 34 | #lvls = map_rois_to_fpn_levels(blobs[name][:, 1:5], lvl_min, lvl_max) 35 | lvls = map_rois_to_fpn_levels(blobs[name], lvl_min, lvl_max) 36 | blobs = add_multilevel_roi_blobs( 37 | blobs, name, blobs[name], lvls, lvl_min, lvl_max 38 | ) 39 | return blobs 40 | 41 | def map_rois_to_fpn_levels(rois, k_min, k_max, roi_canonical_scale=224, roi_canonical_level=4): 42 | """Determine which FPN level each RoI in a set of RoIs should map to based 43 | on the heuristic in the FPN paper. 44 | """ 45 | # Compute level ids 46 | s = np.sqrt(box_utils.boxes_area(rois)) 47 | s0 = roi_canonical_scale # default: 224 48 | lvl0 = roi_canonical_level # default: 4 49 | 50 | # Eqn.(1) in FPN paper 51 | target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6)) 52 | target_lvls = np.clip(target_lvls, k_min, k_max) 53 | return target_lvls 54 | 55 | 56 | def add_multilevel_roi_blobs( 57 | #rois,target_lvls, lvl_min, lvl_max): 58 | blobs, blob_prefix, rois, target_lvls, lvl_min, lvl_max): 59 | """Add RoI blobs for multiple FPN levels to the blobs dict. 60 | 61 | blobs: a dict mapping from blob name to numpy ndarray 62 | blob_prefix: name prefix to use for the FPN blobs 63 | rois: the source rois as a 2D numpy array of shape (N, 5) where each row is 64 | an roi and the columns encode (batch_idx, x1, y1, x2, y2) 65 | target_lvls: numpy array of shape (N, ) indicating which FPN level each roi 66 | in rois should be assigned to 67 | lvl_min: the finest (highest resolution) FPN level (e.g., 2) 68 | lvl_max: the coarest (lowest resolution) FPN level (e.g., 6) 69 | """ 70 | rois_idx_order = np.empty((0, )) 71 | rois_stacked = np.zeros((0, 4), dtype=np.float32) # for assert 72 | for lvl in range(lvl_min, lvl_max + 1): 73 | idx_lvl = np.where(target_lvls == lvl)[0] 74 | blobs[blob_prefix + '_fpn' + str(lvl)] = rois[idx_lvl, :] 75 | rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) 76 | rois_stacked = np.vstack( 77 | [rois_stacked, blobs[blob_prefix + '_fpn' + str(lvl)]] 78 | ) 79 | rois_idx_restore = np.argsort(rois_idx_order).astype(np.int32, copy=False) 80 | blobs[blob_prefix + '_idx_restore_int32'] = rois_idx_restore 81 | # Sanity check that restore order is correct 82 | assert (rois_stacked[rois_idx_restore] == rois).all() 83 | return blobs -------------------------------------------------------------------------------- /lib/utils/preprocess_sample.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from utils.blob import prep_im_for_blob,im_list_to_blob 4 | from utils.fast_rcnn_sample_rois import fast_rcnn_sample_rois 5 | from utils.multilevel_rois import add_multilevel_rois_for_test 6 | 7 | class preprocess_sample(object): 8 | # performs the preprocessing (including building image pyramids and scaling the coordinates) 9 | def __init__(self, 10 | target_sizes=800, 11 | max_size=1333, 12 | mean=[122.7717, 115.9465, 102.9801], 13 | remove_dup_proposals=True, 14 | fpn_on=False, 15 | spatial_scale=0.0625, 16 | sample_proposals_for_training=False): 17 | self.mean=mean 18 | self.target_sizes=target_sizes if isinstance(target_sizes,list) else 
[target_sizes] 19 | self.max_size=max_size 20 | self.remove_dup_proposals=remove_dup_proposals 21 | self.fpn_on=fpn_on 22 | self.spatial_scale=spatial_scale 23 | self.sample_proposals_for_training = sample_proposals_for_training 24 | 25 | def __call__(self, sample): 26 | # resizes image and returns scale factors 27 | original_im_size=sample['image'].shape 28 | im_list,im_scales = prep_im_for_blob(sample['image'], 29 | pixel_means=self.mean, 30 | target_sizes=self.target_sizes, 31 | max_size=self.max_size) 32 | sample['image'] = torch.FloatTensor(im_list_to_blob(im_list,self.fpn_on)) # im_list_to blob swaps channels and adds stride in case of fpn 33 | sample['scaling_factors'] = im_scales[0] 34 | sample['original_im_size'] = torch.FloatTensor(original_im_size) 35 | if len(sample['dbentry']['boxes'])!=0 and not self.sample_proposals_for_training: # Fast RCNN test 36 | proposals = sample['dbentry']['boxes']*im_scales[0] 37 | if self.remove_dup_proposals: 38 | proposals,_ = self.remove_dup_prop(proposals) 39 | 40 | if self.fpn_on==False: 41 | sample['rois'] = torch.FloatTensor(proposals) 42 | else: 43 | multiscale_proposals = add_multilevel_rois_for_test({'rois': proposals},'rois') 44 | for k in multiscale_proposals.keys(): 45 | sample[k] = torch.FloatTensor(multiscale_proposals[k]) 46 | 47 | elif self.sample_proposals_for_training: # Fast RCNN training 48 | sampled_rois_labels_and_targets = fast_rcnn_sample_rois(roidb=sample['dbentry'], 49 | im_scale=im_scales[0], 50 | batch_idx=0) # ok as long as we keep batch_size=1 51 | sampled_rois_labels_and_targets = {key: torch.FloatTensor(value) for key,value in sampled_rois_labels_and_targets.items()} 52 | # add to sample 53 | sample = {**sample, **sampled_rois_labels_and_targets} 54 | # remove dbentry from sample 55 | del sample['dbentry'] 56 | return sample 57 | 58 | # from Detectron test.py 59 | # When mapping from image ROIs to feature map ROIs, there's some aliasing 60 | # (some distinct image ROIs get mapped to the same feature ROI). 61 | # Here, we identify duplicate feature ROIs, so we only compute features 62 | # on the unique subset. 63 | def remove_dup_prop(self,proposals): 64 | v = np.array([1e3, 1e6, 1e9, 1e12]) 65 | 66 | hashes = np.round(proposals * self.spatial_scale).dot(v) 67 | _, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True) 68 | proposals = proposals[index, :] 69 | 70 | return (proposals,inv_index) -------------------------------------------------------------------------------- /lib/utils/result_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
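# Typical single-image inference flow through this module (illustrative sketch; `rois`,
# `cls_score` and `bbox_pred` stand for the detector head outputs and are assumed names):
#   scores, boxes, cls_boxes = postprocess_output(rois, scaling_factor, im_size,
#                                                 cls_score, bbox_pred)
#   # `boxes` are (x1, y1, x2, y2) in original-image coordinates after applying the bbox
#   # deltas, clipping, the 0.05 score threshold, per-class NMS at 0.5 IoU and the cap of
#   # 100 detections per image; `cls_boxes[j]` keeps the per-class (x1, y1, x2, y2, score) rows.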
14 | ############################################################################## 15 | 16 | # some functions are from Detectron 17 | 18 | import numpy as np 19 | from torch.autograd import Variable 20 | import utils.boxes as box_utils 21 | import cv2 22 | import pycocotools.mask as mask_util 23 | 24 | 25 | def to_np(x): 26 | if isinstance(x,np.ndarray): 27 | return x 28 | if isinstance(x,Variable): 29 | x=x.data 30 | return x.cpu().numpy() 31 | 32 | def empty_results(num_classes, num_images): 33 | """Return empty results lists for boxes, masks, and keypoints. 34 | Box detections are collected into: 35 | all_boxes[cls][image] = N x 5 array with columns (x1, y1, x2, y2, score) 36 | Instance mask predictions are collected into: 37 | all_segms[cls][image] = [...] list of COCO RLE encoded masks that are in 38 | 1:1 correspondence with the boxes in all_boxes[cls][image] 39 | Keypoint predictions are collected into: 40 | all_keyps[cls][image] = [...] list of keypoints results, each encoded as 41 | a 3D array (#rois, 4, #keypoints) with the 4 rows corresponding to 42 | [x, y, logit, prob] (See: utils.keypoints.heatmaps_to_keypoints). 43 | Keypoints are recorded for person (cls = 1); they are in 1:1 44 | correspondence with the boxes in all_boxes[cls][image]. 45 | """ 46 | # Note: do not be tempted to use [[] * N], which gives N references to the 47 | # *same* empty list. 48 | all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] 49 | all_segms = [[[] for _ in range(num_images)] for _ in range(num_classes)] 50 | all_keyps = [[[] for _ in range(num_images)] for _ in range(num_classes)] 51 | return all_boxes, all_segms, all_keyps 52 | 53 | 54 | def extend_results(index, all_res, im_res): 55 | """Add results for an image to the set of all results at the specified 56 | index. 57 | """ 58 | # Skip cls_idx 0 (__background__) 59 | for cls_idx in range(1, len(im_res)): 60 | all_res[cls_idx][index] = im_res[cls_idx] 61 | 62 | # When mapping from image ROIs to feature map ROIs, there's some aliasing 63 | # (some distinct image ROIs get mapped to the same feature ROI). 64 | # Here, we identify duplicate feature ROIs, so we only compute features 65 | # on the unique subset. 66 | def remove_dup_prop(self,proposals): 67 | proposals=proposals.data.numpy() 68 | v = np.array([1e3, 1e6, 1e9, 1e12]) 69 | 70 | hashes = np.round(proposals * self.spatial_scale).dot(v) 71 | _, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True) 72 | proposals = proposals[index, :] 73 | return torch.FloatTensor(proposals) 74 | 75 | 76 | def postprocess_output(rois,scaling_factor,im_size,class_scores,bbox_deltas,bbox_reg_weights = (10.0,10.0,5.0,5.0)): 77 | boxes = to_np(rois.div(scaling_factor).squeeze(0)) 78 | bbox_deltas = to_np(bbox_deltas) 79 | orig_im_size = to_np(im_size).squeeze() 80 | # apply deltas 81 | pred_boxes = box_utils.bbox_transform(boxes, bbox_deltas, bbox_reg_weights) 82 | # clip on boundaries 83 | pred_boxes = box_utils.clip_tiled_boxes(pred_boxes,orig_im_size) 84 | scores = to_np(class_scores) 85 | # Map scores and predictions back to the original set of boxes 86 | # This re-duplicates the previously removed boxes 87 | # Is there any use for this? 
88 | # inv_index = to_np(batch['proposal_inv_index']).squeeze().astype(np.int64) 89 | # scores = scores[inv_index, :] 90 | # pred_boxes = pred_boxes[inv_index, :] 91 | # threshold on score and run nms to remove duplicates 92 | scores_final, boxes_final, boxes_per_class = box_results_with_nms_and_limit(scores, pred_boxes) 93 | 94 | return (scores_final, boxes_final, boxes_per_class) 95 | 96 | def box_results_with_nms_and_limit(scores, boxes, 97 | num_classes=81, 98 | score_thresh=0.05, 99 | overlap_thresh=0.5, 100 | do_soft_nms=False, 101 | soft_nms_sigma=0.5, 102 | soft_nms_method='linear', 103 | do_bbox_vote=False, 104 | bbox_vote_thresh=0.8, 105 | bbox_vote_method='ID', 106 | max_detections_per_img=100, ### over all classes ### 107 | ): 108 | """Returns bounding-box detection results by thresholding on scores and 109 | applying non-maximum suppression (NMS). 110 | 111 | A number of #detections presist after this and are returned, sorted by class 112 | 113 | `boxes` has shape (#detections, 4 * #classes), where each row represents 114 | a list of predicted bounding boxes for each of the object classes in the 115 | dataset (including the background class). The detections in each row 116 | originate from the same object proposal. 117 | 118 | `scores` has shape (#detection, #classes), where each row represents a list 119 | of object detection confidence scores for each of the object classes in the 120 | dataset (including the background class). `scores[i, j]`` corresponds to the 121 | box at `boxes[i, j * 4:(j + 1) * 4]`. 122 | """ 123 | cls_boxes = [[] for _ in range(num_classes)] 124 | # Apply threshold on detection probabilities and apply NMS 125 | # Skip j = 0, because it's the background class 126 | for j in range(1, num_classes): 127 | inds = np.where(scores[:, j] > score_thresh)[0] 128 | scores_j = scores[inds, j] 129 | boxes_j = boxes[inds, j * 4:(j + 1) * 4] 130 | dets_j = np.hstack((boxes_j, scores_j[:, np.newaxis])).astype( 131 | np.float32, copy=False 132 | ) 133 | if do_soft_nms: 134 | nms_dets, _ = box_utils.soft_nms( 135 | dets_j, 136 | sigma=soft_nms_sigma, 137 | overlap_thresh=overlap_thresh, 138 | score_thresh=0.0001, 139 | method=soft_nms_method 140 | ) 141 | else: 142 | keep = box_utils.nms(dets_j, overlap_thresh) 143 | nms_dets = dets_j[keep, :] 144 | # Refine the post-NMS boxes using bounding-box voting 145 | if do_bbox_vote: 146 | nms_dets = box_utils.box_voting( 147 | nms_dets, 148 | dets_j, 149 | bbox_vote_thresh, 150 | scoring_method=bbox_vote_method 151 | ) 152 | cls_boxes[j] = nms_dets 153 | 154 | # Limit to max_per_image detections **over all classes** 155 | if max_detections_per_img > 0: 156 | image_scores = np.hstack( 157 | [cls_boxes[j][:, -1] for j in range(1, num_classes)] 158 | ) 159 | if len(image_scores) > max_detections_per_img: 160 | image_thresh = np.sort(image_scores)[-max_detections_per_img] 161 | for j in range(1, num_classes): 162 | keep = np.where(cls_boxes[j][:, -1] >= image_thresh)[0] 163 | cls_boxes[j] = cls_boxes[j][keep, :] 164 | 165 | im_results = np.vstack([cls_boxes[j] for j in range(1, num_classes)]) 166 | boxes = im_results[:, :-1] 167 | scores = im_results[:, -1] 168 | return scores, boxes, cls_boxes 169 | 170 | def segm_results(cls_boxes, masks, ref_boxes, im_h, im_w, 171 | num_classes=81, 172 | M=14, # cfg.MRCNN.RESOLUTION 173 | cls_specific_mask=True, 174 | thresh_binarize=0.5): 175 | cls_segms = [[] for _ in range(num_classes)] 176 | mask_ind = 0 177 | # To work around an issue with cv2.resize (it seems to automatically pad 178 | # 
with repeated border values), we manually zero-pad the masks by 1 pixel 179 | # prior to resizing back to the original image resolution. This prevents 180 | # "top hat" artifacts. We therefore need to expand the reference boxes by an 181 | # appropriate factor. 182 | scale = (M + 2.0) / M 183 | ref_boxes = box_utils.expand_boxes(ref_boxes, scale) 184 | ref_boxes = ref_boxes.astype(np.int32) 185 | padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32) 186 | 187 | # skip j = 0, because it's the background class 188 | for j in range(1, num_classes): 189 | segms = [] 190 | for _ in range(cls_boxes[j].shape[0]): 191 | if cls_specific_mask: 192 | padded_mask[1:-1, 1:-1] = masks[mask_ind, j, :, :] 193 | else: 194 | padded_mask[1:-1, 1:-1] = masks[mask_ind, 0, :, :] 195 | 196 | ref_box = ref_boxes[mask_ind, :] 197 | w = ref_box[2] - ref_box[0] + 1 198 | h = ref_box[3] - ref_box[1] + 1 199 | w = np.maximum(w, 1) 200 | h = np.maximum(h, 1) 201 | 202 | mask = cv2.resize(padded_mask, (w, h)) 203 | mask = np.array(mask > thresh_binarize, dtype=np.uint8) 204 | im_mask = np.zeros((im_h, im_w), dtype=np.uint8) 205 | 206 | x_0 = max(ref_box[0], 0) 207 | x_1 = min(ref_box[2] + 1, im_w) 208 | y_0 = max(ref_box[1], 0) 209 | y_1 = min(ref_box[3] + 1, im_h) 210 | 211 | im_mask[y_0:y_1, x_0:x_1] = mask[ 212 | (y_0 - ref_box[1]):(y_1 - ref_box[1]), 213 | (x_0 - ref_box[0]):(x_1 - ref_box[0]) 214 | ] 215 | 216 | # Get RLE encoding used by the COCO evaluation API 217 | rle = mask_util.encode( 218 | np.array(im_mask[:, :, np.newaxis], order='F') 219 | )[0] 220 | rle['counts'] = rle['counts'].decode() # convert back to str so that it can be later saved to json 221 | segms.append(rle) 222 | 223 | mask_ind += 1 224 | 225 | cls_segms[j] = segms 226 | 227 | assert mask_ind == masks.shape[0] 228 | return cls_segms -------------------------------------------------------------------------------- /lib/utils/segms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | 16 | """Functions for interacting with segmentation masks in the COCO format. 
17 | 18 | The following terms are used in this module 19 | mask: a binary mask encoded as a 2D numpy array 20 | segm: a segmentation mask in one of the two COCO formats (polygon or RLE) 21 | polygon: COCO's polygon format 22 | RLE: COCO's run length encoding format 23 | """ 24 | 25 | from __future__ import absolute_import 26 | from __future__ import division 27 | from __future__ import print_function 28 | from __future__ import unicode_literals 29 | 30 | import numpy as np 31 | 32 | import pycocotools.mask as mask_util 33 | 34 | 35 | def flip_segms(segms, height, width): 36 | """Left/right flip each mask in a list of masks.""" 37 | def _flip_poly(poly, width): 38 | flipped_poly = np.array(poly) 39 | flipped_poly[0::2] = width - np.array(poly[0::2]) - 1 40 | return flipped_poly.tolist() 41 | 42 | def _flip_rle(rle, height, width): 43 | if 'counts' in rle and type(rle['counts']) == list: 44 | # Magic RLE format handling painfully discovered by looking at the 45 | # COCO API showAnns function. 46 | rle = mask_util.frPyObjects([rle], height, width) 47 | mask = mask_util.decode(rle) 48 | mask = mask[:, ::-1, :] 49 | rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) 50 | return rle 51 | 52 | flipped_segms = [] 53 | for segm in segms: 54 | if type(segm) == list: 55 | # Polygon format 56 | flipped_segms.append([_flip_poly(poly, width) for poly in segm]) 57 | else: 58 | # RLE format 59 | assert type(segm) == dict 60 | flipped_segms.append(_flip_rle(segm, height, width)) 61 | return flipped_segms 62 | 63 | 64 | def polys_to_mask(polygons, height, width): 65 | """Convert from the COCO polygon segmentation format to a binary mask 66 | encoded as a 2D array of data type numpy.float32. The polygon segmentation 67 | is understood to be enclosed inside a height x width image. The resulting 68 | mask is therefore of shape (height, width). 69 | """ 70 | rle = mask_util.frPyObjects(polygons, height, width) 71 | mask = np.array(mask_util.decode(rle), dtype=np.float32) 72 | # Flatten in case polygons was a list 73 | mask = np.sum(mask, axis=2) 74 | mask = np.array(mask > 0, dtype=np.float32) 75 | return mask 76 | 77 | 78 | def mask_to_bbox(mask): 79 | """Compute the tight bounding box of a binary mask.""" 80 | xs = np.where(np.sum(mask, axis=0) > 0)[0] 81 | ys = np.where(np.sum(mask, axis=1) > 0)[0] 82 | 83 | if len(xs) == 0 or len(ys) == 0: 84 | return None 85 | 86 | x0 = xs[0] 87 | x1 = xs[-1] 88 | y0 = ys[0] 89 | y1 = ys[-1] 90 | return np.array((x0, y0, x1, y1), dtype=np.float32) 91 | 92 | 93 | def polys_to_mask_wrt_box(polygons, box, M): 94 | """Convert from the COCO polygon segmentation format to a binary mask 95 | encoded as a 2D array of data type numpy.float32. The polygon segmentation 96 | is understood to be enclosed in the given box and rasterized to an M x M 97 | mask. The resulting mask is therefore of shape (M, M). 
98 | """ 99 | w = box[2] - box[0] 100 | h = box[3] - box[1] 101 | 102 | w = np.maximum(w, 1) 103 | h = np.maximum(h, 1) 104 | 105 | polygons_norm = [] 106 | for poly in polygons: 107 | p = np.array(poly, dtype=np.float32) 108 | p[0::2] = (p[0::2] - box[0]) * M / w 109 | p[1::2] = (p[1::2] - box[1]) * M / h 110 | polygons_norm.append(p) 111 | 112 | rle = mask_util.frPyObjects(polygons_norm, M, M) 113 | mask = np.array(mask_util.decode(rle), dtype=np.float32) 114 | # Flatten in case polygons was a list 115 | mask = np.sum(mask, axis=2) 116 | mask = np.array(mask > 0, dtype=np.float32) 117 | return mask 118 | 119 | 120 | def polys_to_boxes(polys): 121 | """Convert a list of polygons into an array of tight bounding boxes.""" 122 | boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) 123 | for i in range(len(polys)): 124 | poly = polys[i] 125 | x0 = min(min(p[::2]) for p in poly) 126 | x1 = max(max(p[::2]) for p in poly) 127 | y0 = min(min(p[1::2]) for p in poly) 128 | y1 = max(max(p[1::2]) for p in poly) 129 | boxes_from_polys[i, :] = [x0, y0, x1, y1] 130 | 131 | return boxes_from_polys 132 | 133 | 134 | def rle_mask_voting( 135 | top_masks, all_masks, all_dets, iou_thresh, binarize_thresh, method='AVG' 136 | ): 137 | """Returns new masks (in correspondence with `top_masks`) by combining 138 | multiple overlapping masks coming from the pool of `all_masks`. Two methods 139 | for combining masks are supported: 'AVG' uses a weighted average of 140 | overlapping mask pixels; 'UNION' takes the union of all mask pixels. 141 | """ 142 | if len(top_masks) == 0: 143 | return 144 | 145 | all_not_crowd = [False] * len(all_masks) 146 | top_to_all_overlaps = mask_util.iou(top_masks, all_masks, all_not_crowd) 147 | decoded_all_masks = [ 148 | np.array(mask_util.decode(rle), dtype=np.float32) for rle in all_masks 149 | ] 150 | decoded_top_masks = [ 151 | np.array(mask_util.decode(rle), dtype=np.float32) for rle in top_masks 152 | ] 153 | all_boxes = all_dets[:, :4].astype(np.int32) 154 | all_scores = all_dets[:, 4] 155 | 156 | # Fill box support with weights 157 | mask_shape = decoded_all_masks[0].shape 158 | mask_weights = np.zeros((len(all_masks), mask_shape[0], mask_shape[1])) 159 | for k in range(len(all_masks)): 160 | ref_box = all_boxes[k] 161 | x_0 = max(ref_box[0], 0) 162 | x_1 = min(ref_box[2] + 1, mask_shape[1]) 163 | y_0 = max(ref_box[1], 0) 164 | y_1 = min(ref_box[3] + 1, mask_shape[0]) 165 | mask_weights[k, y_0:y_1, x_0:x_1] = all_scores[k] 166 | mask_weights = np.maximum(mask_weights, 1e-5) 167 | 168 | top_segms_out = [] 169 | for k in range(len(top_masks)): 170 | # Corner case of empty mask 171 | if decoded_top_masks[k].sum() == 0: 172 | top_segms_out.append(top_masks[k]) 173 | continue 174 | 175 | inds_to_vote = np.where(top_to_all_overlaps[k] >= iou_thresh)[0] 176 | # Only matches itself 177 | if len(inds_to_vote) == 1: 178 | top_segms_out.append(top_masks[k]) 179 | continue 180 | 181 | masks_to_vote = [decoded_all_masks[i] for i in inds_to_vote] 182 | if method == 'AVG': 183 | ws = mask_weights[inds_to_vote] 184 | soft_mask = np.average(masks_to_vote, axis=0, weights=ws) 185 | mask = np.array(soft_mask > binarize_thresh, dtype=np.uint8) 186 | elif method == 'UNION': 187 | # Any pixel that's on joins the mask 188 | soft_mask = np.sum(masks_to_vote, axis=0) 189 | mask = np.array(soft_mask > 1e-5, dtype=np.uint8) 190 | else: 191 | raise NotImplementedError('Method {} is unknown'.format(method)) 192 | rle = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] 193 | 
top_segms_out.append(rle) 194 | 195 | return top_segms_out 196 | 197 | 198 | def rle_mask_nms(masks, dets, thresh, mode='IOU'): 199 | """Performs greedy non-maximum suppression based on an overlap measurement 200 | between masks. The type of measurement is determined by `mode` and can be 201 | either 'IOU' (standard intersection over union) or 'IOMA' (intersection over 202 | mininum area). 203 | """ 204 | if len(masks) == 0: 205 | return [] 206 | if len(masks) == 1: 207 | return [0] 208 | 209 | if mode == 'IOU': 210 | # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(union(m1, m2)) 211 | all_not_crowds = [False] * len(masks) 212 | ious = mask_util.iou(masks, masks, all_not_crowds) 213 | elif mode == 'IOMA': 214 | # Computes ious[m1, m2] = area(intersect(m1, m2)) / min(area(m1), area(m2)) 215 | all_crowds = [True] * len(masks) 216 | # ious[m1, m2] = area(intersect(m1, m2)) / area(m2) 217 | ious = mask_util.iou(masks, masks, all_crowds) 218 | # ... = max(area(intersect(m1, m2)) / area(m2), 219 | # area(intersect(m2, m1)) / area(m1)) 220 | ious = np.maximum(ious, ious.transpose()) 221 | elif mode == 'CONTAINMENT': 222 | # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(m2) 223 | # Which measures how much m2 is contained inside m1 224 | all_crowds = [True] * len(masks) 225 | ious = mask_util.iou(masks, masks, all_crowds) 226 | else: 227 | raise NotImplementedError('Mode {} is unknown'.format(mode)) 228 | 229 | scores = dets[:, 4] 230 | order = np.argsort(-scores) 231 | 232 | keep = [] 233 | while order.size > 0: 234 | i = order[0] 235 | keep.append(i) 236 | ovr = ious[i, order[1:]] 237 | inds_to_keep = np.where(ovr <= thresh)[0] 238 | order = order[inds_to_keep + 1] 239 | 240 | return keep 241 | 242 | 243 | def rle_masks_to_boxes(masks): 244 | """Computes the bounding box of each mask in a list of RLE encoded masks.""" 245 | if len(masks) == 0: 246 | return [] 247 | 248 | decoded_masks = [ 249 | np.array(mask_util.decode(rle), dtype=np.float32) for rle in masks 250 | ] 251 | 252 | def get_bounds(flat_mask): 253 | inds = np.where(flat_mask > 0)[0] 254 | return inds.min(), inds.max() 255 | 256 | boxes = np.zeros((len(decoded_masks), 4)) 257 | keep = [True] * len(decoded_masks) 258 | for i, mask in enumerate(decoded_masks): 259 | if mask.sum() == 0: 260 | keep[i] = False 261 | continue 262 | flat_mask = mask.sum(axis=0) 263 | x0, x1 = get_bounds(flat_mask) 264 | flat_mask = mask.sum(axis=1) 265 | y0, y1 = get_bounds(flat_mask) 266 | boxes[i, :] = (x0, y0, x1, y1) 267 | 268 | return boxes, np.where(keep)[0] 269 | -------------------------------------------------------------------------------- /lib/utils/selective_search.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | def selective_search(pil_image=None,quality='f',size=800): 5 | # speed-up using multithreads 6 | cv2.setUseOptimized(True); 7 | cv2.setNumThreads(4); 8 | 9 | # resize image to limit number of proposals and to bypass a bug in OpenCV with non-square images 10 | w,h = pil_image.size 11 | h_factor,w_factor=h/size,w/size 12 | pil_image=pil_image.resize((size,size)) 13 | 14 | im = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) 15 | 16 | # create Selective Search Segmentation Object using default parameters 17 | ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation() 18 | 19 | # set input image on which we will run segmentation 20 | ss.setBaseImage(im) 21 | 22 | # Switch to fast but low recall Selective Search method 23 | if 
(quality == 'f'): 24 | ss.switchToSelectiveSearchFast() 25 | # Switch to high recall but slow Selective Search method 26 | elif (quality == 'q'): 27 | ss.switchToSelectiveSearchQuality() 28 | 29 | # run selective search segmentation on input image 30 | rects = ss.process() 31 | 32 | # rect is in x,y,w,h format 33 | # convert to xmin,ymin,xmax,ymax format 34 | rects = np.vstack((rects[:,0]*w_factor, rects[:,1]*h_factor, (rects[:,0]+rects[:,2])*w_factor, (rects[:,1]+rects[:,3])*h_factor)).transpose() 35 | 36 | return rects -------------------------------------------------------------------------------- /lib/utils/solver.py: -------------------------------------------------------------------------------- 1 | def adjust_learning_rate(optimizer, lr): 2 | for param_group in optimizer.param_groups: 3 | param_group['lr'] = lr 4 | 5 | 6 | def get_step_index(cur_iter,lr_steps=[0, 240000, 320000],max_iter=360000): 7 | """Given an iteration, find which learning rate step we're at.""" 8 | assert lr_steps[0] == 0, 'The first step should always start at 0.' 9 | steps = lr_steps + [max_iter] 10 | for ind, step in enumerate(steps): # NoQA 11 | if cur_iter < step: 12 | break 13 | return ind - 1 14 | 15 | 16 | def lr_func_steps_with_decay(cur_iter,base_lr=0.01,gamma=0.1): 17 | """For cfg.SOLVER.LR_POLICY = 'steps_with_decay' 18 | Change the learning rate specified iterations based on the formula 19 | lr = base_lr * gamma ** lr_step_count. 20 | Example: 21 | cfg.SOLVER.MAX_ITER: 90 22 | cfg.SOLVER.STEPS: [0, 60, 80] 23 | cfg.SOLVER.BASE_LR: 0.02 24 | cfg.SOLVER.GAMMA: 0.1 25 | for cur_iter in [0, 59] use 0.02 = 0.02 * 0.1 ** 0 26 | in [60, 79] use 0.002 = 0.02 * 0.1 ** 1 27 | in [80, inf] use 0.0002 = 0.02 * 0.1 ** 2 28 | """ 29 | ind = get_step_index(cur_iter) 30 | return base_lr * gamma ** ind 31 | 32 | def get_lr_at_iter(it,warm_up_iters=500,warm_up_factor=0.3333333333333333,warm_up_method='linear'): 33 | """Get the learning rate at iteration it according to the cfg.SOLVER 34 | settings. 35 | """ 36 | lr = lr_func_steps_with_decay(it) 37 | if it < warm_up_iters: 38 | if warm_up_method == 'linear': 39 | alpha = it / warm_up_iters 40 | warm_up_factor = warm_up_factor * (1 - alpha) + alpha 41 | elif warm_up_method != 'constant': 42 | raise KeyError('Unknown WARM_UP_METHOD: {}'.format(warm_up_method)) 43 | lr *= warm_up_factor 44 | return lr -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
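# Typical usage of the Timer class defined below (illustrative sketch):
#   timer = Timer()
#   timer.tic()
#   ...                                   # timed work
#   secs = timer.toc(average=False)       # seconds elapsed for this tic/toc pair
#   mean = timer.average_time             # running mean over all tic/toc pairs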
14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # Fast R-CNN 19 | # Copyright (c) 2015 Microsoft 20 | # Licensed under The MIT License [see LICENSE for details] 21 | # Written by Ross Girshick 22 | # -------------------------------------------------------- 23 | 24 | """Timing related functions.""" 25 | 26 | from __future__ import absolute_import 27 | from __future__ import division 28 | from __future__ import print_function 29 | from __future__ import unicode_literals 30 | 31 | import time 32 | 33 | 34 | class Timer(object): 35 | """A simple timer.""" 36 | 37 | def __init__(self): 38 | self.reset() 39 | 40 | def tic(self): 41 | # using time.time instead of time.clock because time time.clock 42 | # does not normalize for multithreading 43 | self.start_time = time.time() 44 | 45 | def toc(self, average=True): 46 | self.diff = time.time() - self.start_time 47 | self.total_time += self.diff 48 | self.calls += 1 49 | self.average_time = self.total_time / self.calls 50 | if average: 51 | return self.average_time 52 | else: 53 | return self.diff 54 | 55 | def reset(self): 56 | self.total_time = 0. 57 | self.calls = 0 58 | self.start_time = 0. 59 | self.diff = 0. 60 | self.average_time = 0. 61 | -------------------------------------------------------------------------------- /lib/utils/training_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright (c) 2017-present, Facebook, Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
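# Typical usage of TrainingStats defined below (illustrative; cf. train_fast.py):
# construct it with the loss and metric names, call IterTic()/IterToc() around each
# SGD iteration, pass scalar values via UpdateIterStats(losses_dict, metrics_dict),
# and call LogIterStats(cur_iter, lr) to emit smoothed JSON stats every
# log_period (20) iterations.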
16 | ############################################################################## 17 | 18 | """Utilities for training.""" 19 | 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | from __future__ import unicode_literals 24 | 25 | import datetime 26 | import numpy as np 27 | 28 | #from caffe2.python import utils as c2_py_utils 29 | #from core.config import cfg 30 | from utils.logging import log_json_stats 31 | from utils.logging import SmoothedValue 32 | from utils.timer import Timer 33 | 34 | 35 | class TrainingStats(object): 36 | """Track vital training statistics.""" 37 | 38 | def __init__(self, metrics, losses, 39 | solver_max_iters): 40 | self.solver_max_iters = solver_max_iters 41 | # Window size for smoothing tracked values (with median filtering) 42 | self.win_sz = 20 43 | # Output logging period in SGD iterations 44 | self.log_period = 20 45 | self.smoothed_losses_and_metrics = { 46 | key: SmoothedValue(self.win_sz) 47 | for key in losses + metrics 48 | } 49 | self.losses_and_metrics = { 50 | key: 0 51 | for key in losses + metrics 52 | } 53 | self.smoothed_total_loss = SmoothedValue(self.win_sz) 54 | self.smoothed_mb_qsize = SmoothedValue(self.win_sz) 55 | self.iter_total_loss = np.nan 56 | self.iter_timer = Timer() 57 | self.metrics = metrics 58 | self.losses = losses 59 | 60 | def IterTic(self): 61 | self.iter_timer.tic() 62 | 63 | def IterToc(self): 64 | return self.iter_timer.toc(average=False) 65 | 66 | def ResetIterTimer(self): 67 | self.iter_timer.reset() 68 | 69 | def UpdateIterStats(self,losses_dict, metrics_dict): 70 | """Update tracked iteration statistics.""" 71 | for k in self.losses_and_metrics.keys(): 72 | if k in self.losses: # if loss 73 | self.losses_and_metrics[k] = losses_dict[k] 74 | else: # if metric 75 | self.losses_and_metrics[k] = metrics_dict[k] 76 | 77 | for k, v in self.smoothed_losses_and_metrics.items(): 78 | v.AddValue(self.losses_and_metrics[k]) 79 | #import pdb; pdb.set_trace() 80 | self.iter_total_loss = np.sum( 81 | np.array([self.losses_and_metrics[k] for k in self.losses]) 82 | ) 83 | self.smoothed_total_loss.AddValue(self.iter_total_loss) 84 | self.smoothed_mb_qsize.AddValue( 85 | #self.model.roi_data_loader._minibatch_queue.qsize() 86 | 64 87 | ) 88 | 89 | def LogIterStats(self, cur_iter, lr): 90 | """Log the tracked statistics.""" 91 | if (cur_iter % self.log_period == 0 or 92 | cur_iter == self.solver_max_iters - 1): 93 | stats = self.GetStats(cur_iter, lr) 94 | log_json_stats(stats) 95 | 96 | def GetStats(self, cur_iter, lr): 97 | eta_seconds = self.iter_timer.average_time * ( 98 | self.solver_max_iters - cur_iter 99 | ) 100 | eta = str(datetime.timedelta(seconds=int(eta_seconds))) 101 | #mem_stats = c2_py_utils.GetGPUMemoryUsageStats() 102 | #mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS]) 103 | stats = dict( 104 | iter=cur_iter, 105 | lr="{:.6f}".format(float(lr)), 106 | time="{:.6f}".format(self.iter_timer.average_time), 107 | loss="{:.6f}".format(self.smoothed_total_loss.GetMedianValue()), 108 | eta=eta, 109 | #mb_qsize=int(np.round(self.smoothed_mb_qsize.GetMedianValue())), 110 | #mem=int(np.ceil(mem_usage / 1024 / 1024)) 111 | ) 112 | for k, v in self.smoothed_losses_and_metrics.items(): 113 | stats[k] = "{:.6f}".format(v.GetMedianValue()) 114 | return stats -------------------------------------------------------------------------------- /lib/utils/utils.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | from torch.autograd import Variable 3 | import os, errno  # needed by create_file_path below 4 | def normalize_axis(x,L): 5 | return (x-1-(L-1)/2)*2/(L-1) 6 | 7 | def unnormalize_axis(x,L): 8 | return x*(L-1)/2+1+(L-1)/2 9 | 10 | def expand_dim(tensor,dim,desired_dim_len): 11 | sz = list(tensor.size()) 12 | sz[dim]=desired_dim_len 13 | return tensor.expand(tuple(sz)) 14 | 15 | def create_file_path(filename): 16 | if not os.path.exists(os.path.dirname(filename)): 17 | try: 18 | os.makedirs(os.path.dirname(filename)) 19 | except OSError as exc: # Guard against race condition 20 | if exc.errno != errno.EEXIST: 21 | raise 22 | 23 | def to_cuda(x): 24 | if isinstance(x,dict): 25 | return {key: to_cuda(x[key]) for key in x.keys()} 26 | if isinstance(x,list): 27 | return [y.cuda() for y in x] 28 | return x.cuda() 29 | 30 | def to_cuda_variable(x,volatile=True): 31 | if isinstance(x,dict): 32 | return {key: to_cuda_variable(x[key],volatile=volatile) for key in x.keys()} 33 | if isinstance(x,list): 34 | return [to_cuda_variable(y,volatile=volatile) for y in x] 35 | if isinstance(x, (int, float)): 36 | return x 37 | if isinstance(x, torch.Tensor): 38 | if torch.__version__[:3]=="0.4" or volatile==False: 39 | return Variable(x.cuda()) 40 | else: 41 | return Variable(x.cuda(),volatile=True) 42 | 43 | 44 | def parse_th_to_caffe2(terms,i=0,parsed=''): 45 | # Convert PyTorch ResNet weight names to caffe2 weight names 46 | if i==0: 47 | if terms[i]=='conv1': 48 | parsed='conv1' 49 | elif terms[i]=='bn1': 50 | parsed='res_conv1' 51 | elif terms[i].startswith('layer'): 52 | parsed='res'+str(int(terms[i][-1])+1) 53 | else: 54 | if terms[i]=='weight' and (terms[i-1].startswith('conv') or terms[i-1]=='0'): 55 | parsed+='_w' 56 | elif terms[i]=='weight' and (terms[i-1].startswith('bn') or terms[i-1]=='1'): 57 | parsed+='_bn_s' 58 | elif terms[i]=='bias' and (terms[i-1].startswith('bn') or terms[i-1]=='1'): 59 | parsed+='_bn_b' 60 | elif terms[i-1].startswith('layer'): 61 | parsed+='_'+terms[i] 62 | elif terms[i].startswith('conv') or terms[i].startswith('bn'): 63 | parsed+='_branch2'+chr(96+int(terms[i][-1])) 64 | elif terms[i]=='downsample': 65 | parsed+='_branch1' 66 | # increase counter 67 | i+=1 68 | # do recursion 69 | if i==len(terms): 70 | return parsed 71 | return parse_th_to_caffe2(terms,i,parsed) 72 | -------------------------------------------------------------------------------- /lib/utils_cython/build_cython.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
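# Build note (illustrative): these Cython extensions are typically compiled in place
# from this directory with
#   python build_cython.py build_ext --inplace
# which produces the cython_bbox and cython_nms shared libraries (ignored by git, see
# the top-level .gitignore).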
14 | ############################################################################## 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | from Cython.Build import cythonize 21 | from setuptools import Extension 22 | from setuptools import setup 23 | 24 | import numpy as np 25 | 26 | _NP_INCLUDE_DIRS = np.get_include() 27 | 28 | 29 | # Extension modules 30 | ext_modules = [ 31 | Extension( 32 | name='cython_bbox', 33 | sources=[ 34 | 'cython_bbox.pyx' 35 | ], 36 | extra_compile_args=[ 37 | '-Wno-cpp' 38 | ], 39 | include_dirs=[ 40 | _NP_INCLUDE_DIRS 41 | ] 42 | ), 43 | Extension( 44 | name='cython_nms', 45 | sources=[ 46 | 'cython_nms.pyx' 47 | ], 48 | extra_compile_args=[ 49 | '-Wno-cpp' 50 | ], 51 | include_dirs=[ 52 | _NP_INCLUDE_DIRS 53 | ] 54 | ) 55 | ] 56 | 57 | setup( 58 | name='Detectron', 59 | ext_modules=cythonize(ext_modules) 60 | ) -------------------------------------------------------------------------------- /lib/utils_cython/cython_bbox.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
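# Example usage (illustrative): boxes and query_boxes are 2D float32 arrays of
# [x1, y1, x2, y2] rows;
#   overlaps = bbox_overlaps(boxes.astype(np.float32), query_boxes.astype(np.float32))
# returns an (N, K) matrix of IoU values computed with the +1 box width/height
# convention used throughout this repository.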
14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # Fast R-CNN 19 | # Copyright (c) 2015 Microsoft 20 | # Licensed under The MIT License [see LICENSE for details] 21 | # Written by Sergey Karayev 22 | # -------------------------------------------------------- 23 | 24 | cimport cython 25 | import numpy as np 26 | cimport numpy as np 27 | 28 | DTYPE = np.float32 29 | ctypedef np.float32_t DTYPE_t 30 | 31 | @cython.boundscheck(False) 32 | def bbox_overlaps( 33 | np.ndarray[DTYPE_t, ndim=2] boxes, 34 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 35 | """ 36 | Parameters 37 | ---------- 38 | boxes: (N, 4) ndarray of float 39 | query_boxes: (K, 4) ndarray of float 40 | Returns 41 | ------- 42 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 43 | """ 44 | cdef unsigned int N = boxes.shape[0] 45 | cdef unsigned int K = query_boxes.shape[0] 46 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 47 | cdef DTYPE_t iw, ih, box_area 48 | cdef DTYPE_t ua 49 | cdef unsigned int k, n 50 | with nogil: 51 | for k in range(K): 52 | box_area = ( 53 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 54 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 55 | ) 56 | for n in range(N): 57 | iw = ( 58 | min(boxes[n, 2], query_boxes[k, 2]) - 59 | max(boxes[n, 0], query_boxes[k, 0]) + 1 60 | ) 61 | if iw > 0: 62 | ih = ( 63 | min(boxes[n, 3], query_boxes[k, 3]) - 64 | max(boxes[n, 1], query_boxes[k, 1]) + 1 65 | ) 66 | if ih > 0: 67 | ua = float( 68 | (boxes[n, 2] - boxes[n, 0] + 1) * 69 | (boxes[n, 3] - boxes[n, 1] + 1) + 70 | box_area - iw * ih 71 | ) 72 | overlaps[n, k] = iw * ih / ua 73 | return overlaps -------------------------------------------------------------------------------- /lib/utils_cython/cython_nms.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
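# Example usage (illustrative): dets is an (N, 5) float32 array of
# [x1, y1, x2, y2, score] rows;
#   keep = nms(dets.astype(np.float32), 0.5)
# returns the indices of detections kept after greedy suppression, while
#   kept_boxes, kept_inds = soft_nms(dets.astype(np.float32), method=2)
# rescores overlapping boxes (method 0 = hard NMS, 1 = linear, 2 = gaussian) instead
# of discarding them outright.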
14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # Fast R-CNN 19 | # Copyright (c) 2015 Microsoft 20 | # Licensed under The MIT License [see LICENSE for details] 21 | # Written by Ross Girshick 22 | # -------------------------------------------------------- 23 | 24 | cimport cython 25 | import numpy as np 26 | cimport numpy as np 27 | 28 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b) nogil: 29 | return a if a >= b else b 30 | 31 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b) nogil: 32 | return a if a <= b else b 33 | 34 | @cython.boundscheck(False) 35 | @cython.cdivision(True) 36 | @cython.wraparound(False) 37 | def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float32_t thresh): 38 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 39 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 40 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 41 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 42 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 43 | 44 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 45 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 46 | 47 | cdef int ndets = dets.shape[0] 48 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 49 | np.zeros((ndets), dtype=np.int) 50 | 51 | # nominal indices 52 | cdef int _i, _j 53 | # sorted indices 54 | cdef int i, j 55 | # temp variables for box i's (the box currently under consideration) 56 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 57 | # variables for computing overlap with box j (lower scoring box) 58 | cdef np.float32_t xx1, yy1, xx2, yy2 59 | cdef np.float32_t w, h 60 | cdef np.float32_t inter, ovr 61 | 62 | with nogil: 63 | for _i in range(ndets): 64 | i = order[_i] 65 | if suppressed[i] == 1: 66 | continue 67 | ix1 = x1[i] 68 | iy1 = y1[i] 69 | ix2 = x2[i] 70 | iy2 = y2[i] 71 | iarea = areas[i] 72 | for _j in range(_i + 1, ndets): 73 | j = order[_j] 74 | if suppressed[j] == 1: 75 | continue 76 | xx1 = max(ix1, x1[j]) 77 | yy1 = max(iy1, y1[j]) 78 | xx2 = min(ix2, x2[j]) 79 | yy2 = min(iy2, y2[j]) 80 | w = max(0.0, xx2 - xx1 + 1) 81 | h = max(0.0, yy2 - yy1 + 1) 82 | inter = w * h 83 | ovr = inter / (iarea + areas[j] - inter) 84 | if ovr >= thresh: 85 | suppressed[j] = 1 86 | 87 | return np.where(suppressed == 0)[0] 88 | 89 | # ---------------------------------------------------------- 90 | # Soft-NMS: Improving Object Detection With One Line of Code 91 | # Copyright (c) University of Maryland, College Park 92 | # Licensed under The MIT License [see LICENSE for details] 93 | # Written by Navaneeth Bodla and Bharat Singh 94 | # ---------------------------------------------------------- 95 | @cython.boundscheck(False) 96 | @cython.cdivision(True) 97 | @cython.wraparound(False) 98 | def soft_nms( 99 | np.ndarray[float, ndim=2] boxes_in, 100 | float sigma=0.5, 101 | float Nt=0.3, 102 | float threshold=0.001, 103 | unsigned int method=0 104 | ): 105 | boxes = boxes_in.copy() 106 | cdef unsigned int N = boxes.shape[0] 107 | cdef float iw, ih, box_area 108 | cdef float ua 109 | cdef int pos = 0 110 | cdef float maxscore = 0 111 | cdef int maxpos = 0 112 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 113 | inds = np.arange(N) 114 | 115 | for i in range(N): 116 | maxscore = boxes[i, 4] 117 | maxpos = i 118 | 119 | tx1 = boxes[i,0] 120 | ty1 = boxes[i,1] 121 | tx2 = boxes[i,2] 122 | 
ty2 = boxes[i,3] 123 | ts = boxes[i,4] 124 | ti = inds[i] 125 | 126 | pos = i + 1 127 | # get max box 128 | while pos < N: 129 | if maxscore < boxes[pos, 4]: 130 | maxscore = boxes[pos, 4] 131 | maxpos = pos 132 | pos = pos + 1 133 | 134 | # add max box as a detection 135 | boxes[i,0] = boxes[maxpos,0] 136 | boxes[i,1] = boxes[maxpos,1] 137 | boxes[i,2] = boxes[maxpos,2] 138 | boxes[i,3] = boxes[maxpos,3] 139 | boxes[i,4] = boxes[maxpos,4] 140 | inds[i] = inds[maxpos] 141 | 142 | # swap ith box with position of max box 143 | boxes[maxpos,0] = tx1 144 | boxes[maxpos,1] = ty1 145 | boxes[maxpos,2] = tx2 146 | boxes[maxpos,3] = ty2 147 | boxes[maxpos,4] = ts 148 | inds[maxpos] = ti 149 | 150 | tx1 = boxes[i,0] 151 | ty1 = boxes[i,1] 152 | tx2 = boxes[i,2] 153 | ty2 = boxes[i,3] 154 | ts = boxes[i,4] 155 | 156 | pos = i + 1 157 | # NMS iterations, note that N changes if detection boxes fall below 158 | # threshold 159 | while pos < N: 160 | x1 = boxes[pos, 0] 161 | y1 = boxes[pos, 1] 162 | x2 = boxes[pos, 2] 163 | y2 = boxes[pos, 3] 164 | s = boxes[pos, 4] 165 | 166 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 167 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 168 | if iw > 0: 169 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 170 | if ih > 0: 171 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 172 | ov = iw * ih / ua #iou between max box and detection box 173 | 174 | if method == 1: # linear 175 | if ov > Nt: 176 | weight = 1 - ov 177 | else: 178 | weight = 1 179 | elif method == 2: # gaussian 180 | weight = np.exp(-(ov * ov)/sigma) 181 | else: # original NMS 182 | if ov > Nt: 183 | weight = 0 184 | else: 185 | weight = 1 186 | 187 | boxes[pos, 4] = weight*boxes[pos, 4] 188 | 189 | # if box score falls below threshold, discard the box by 190 | # swapping with last box update N 191 | if boxes[pos, 4] < threshold: 192 | boxes[pos,0] = boxes[N-1, 0] 193 | boxes[pos,1] = boxes[N-1, 1] 194 | boxes[pos,2] = boxes[N-1, 2] 195 | boxes[pos,3] = boxes[N-1, 3] 196 | boxes[pos,4] = boxes[N-1, 4] 197 | inds[pos] = inds[N-1] 198 | N = N - 1 199 | pos = pos - 1 200 | 201 | pos = pos + 1 202 | 203 | return boxes[:N], inds[:N] -------------------------------------------------------------------------------- /train_fast.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from torch.autograd import Variable 5 | from torch.utils.data import DataLoader 6 | 7 | import numpy as np 8 | 9 | import sys 10 | sys.path.insert(0, "lib/") 11 | from data.coco_dataset import CocoDataset 12 | from utils.preprocess_sample import preprocess_sample 13 | from utils.collate_custom import collate_custom 14 | from utils.utils import to_cuda, to_variable, to_cuda_variable 15 | from model.detector import detector 16 | from model.loss import accuracy, smooth_L1 17 | from utils.solver import adjust_learning_rate,get_lr_at_iter 18 | from utils.training_stats import TrainingStats 19 | from torch.nn.utils.clip_grad import clip_grad_norm 20 | import torch.nn as nn 21 | from utils.data_parallel import data_parallel 22 | from torch.nn.functional import cross_entropy 23 | 24 | 25 | parser = argparse.ArgumentParser(description='PyTorch Fast RCNN Training') 26 | # MODEL 27 | parser.add_argument('--cnn-arch', default='resnet50') 28 | parser.add_argument('--cnn-pkl', default='files/pretrained_base_cnn/R-50.pkl') 29 | parser.add_argument('--cnn-mapping', default='files/mapping_files/resnet50_mapping.npy') 30 | # DATASET 31 | # parser.add_argument('--dset-path', 
default=('datasets/data/coco/coco_train2014', 32 | # 'datasets/data/coco/coco_val2014/')) 33 | # parser.add_argument('--dset-rois', default=('files/proposal_files/coco_2014_train/rpn_proposals.pkl', 34 | # 'files/proposal_files/coco_2014_valminusminival/rpn_proposals.pkl')) 35 | # parser.add_argument('--dset-ann', default=('datasets/data/coco/annotations/instances_train2014.json', 36 | # 'datasets/data/coco/annotations/instances_valminusminival2014.json')) 37 | # parser.add_argument('--dset-path', default=('datasets/data/coco/coco_train2014', 38 | # )) 39 | # parser.add_argument('--dset-rois', default=('files/proposal_files/coco_2014_train/rpn_proposals.pkl', 40 | # )) 41 | # parser.add_argument('--dset-ann', default=('datasets/data/coco/annotations/instances_train2014.json', 42 | # )) 43 | 44 | # use MINIVAL for debugging as it loads fast 45 | parser.add_argument('--dset-path', default=('datasets/data/coco/coco_val2014', 46 | )) 47 | parser.add_argument('--dset-rois', default=('files/proposal_files/coco_2014_minival/rpn_proposals.pkl', 48 | )) 49 | parser.add_argument('--dset-ann', default=('datasets/data/coco/annotations/instances_minival2014.json', 50 | )) 51 | # DATALOADER 52 | 53 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 54 | help='number of data loading workers (default: 0)') 55 | # SOLVER 56 | parser.add_argument('--base-lr', default=0.01, type=float) 57 | parser.add_argument('--lr-steps', default=[0, 240000, 320000]) 58 | parser.add_argument('--momentum', default=0.9, type=float) 59 | parser.add_argument('--wd', default=1e-4, type=float, help='weight decay (default: 1e-4)') 60 | # TRAINING 61 | parser.add_argument('--max-iter', default=360000, type=int) 62 | parser.add_argument('--batch-size', default=1, type=int) 63 | parser.add_argument('--start-iter', default=0, type=int, metavar='N', 64 | help='manual iter number (useful on restarts)') 65 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 66 | help='path to latest checkpoint (default: none)') 67 | parser.add_argument('--checkpoint-period', default=20000, type=int) 68 | parser.add_argument('--checkpoint-fn', default='files/results/fast.pth.tar') 69 | 70 | 71 | def main(): 72 | args = parser.parse_args() 73 | print(args) 74 | # for now, batch_size should match number of gpus 75 | assert(args.batch_size==torch.cuda.device_count()) 76 | 77 | # create model 78 | model = detector(arch=args.cnn_arch, 79 | base_cnn_pkl_file=args.cnn_pkl, 80 | mapping_file=args.cnn_mapping, 81 | output_prob=False, 82 | return_rois=False, 83 | return_img_features=False) 84 | model = model.cuda() 85 | 86 | # freeze part of the net 87 | stop_grad=['conv1','bn1','relu','maxpool','layer1'] 88 | model_no_grad=torch.nn.Sequential(*[getattr(model.model,l) for l in stop_grad]) 89 | for param in model_no_grad.parameters(): 90 | param.requires_grad = False 91 | 92 | # define optimizer 93 | optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), 94 | lr=args.base_lr, 95 | momentum=args.momentum, 96 | weight_decay=args.wd) 97 | 98 | # create dataset 99 | train_dataset = CocoDataset(ann_file=args.dset_ann, 100 | img_dir=args.dset_path, 101 | proposal_file=args.dset_rois, 102 | mode='train', 103 | sample_transform=preprocess_sample(target_sizes=[800], 104 | sample_proposals_for_training=True)) 105 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size,shuffle=False, num_workers=args.workers, collate_fn=collate_custom) 106 | 107 | training_stats = 
TrainingStats(losses=['loss_cls','loss_bbox'], 108 | metrics=['accuracy_cls'], 109 | solver_max_iters=args.max_iter) 110 | 111 | iter = args.start_iter 112 | 113 | print('starting training') 114 | 115 | while iterargs.max_iter: 192 | break 193 | # advance iteration 194 | iter+=1 195 | #import pdb; pdb.set_trace() 196 | 197 | def save_checkpoint(state, filename='checkpoint.pth.tar'): 198 | torch.save(state, filename) 199 | 200 | if __name__ == '__main__': 201 | main() 202 | --------------------------------------------------------------------------------
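The following is a minimal, illustrative sketch of how the pieces above (the detector model, the loss helpers, the solver utilities, and TrainingStats) can be wired together into one SGD iteration of train_fast.py-style training. The batch keys ('labels', 'bbox_targets'), the detector's output format, the smooth_L1/accuracy call signatures, and the gradient-clipping value are assumptions made for illustration; they are not taken from the repository.

# Illustrative sketch only -- names marked "assumed" are not guaranteed to match the repo.
iter = args.start_iter
while iter < args.max_iter:
    for batch in train_loader:
        batch = to_cuda_variable(batch, volatile=False)      # move inputs to the GPU
        lr = get_lr_at_iter(iter)                            # linear warm-up, then step decay
        adjust_learning_rate(optimizer, lr)
        training_stats.IterTic()
        cls_score, bbox_pred = model(batch)                  # assumed output format
        loss_cls = cross_entropy(cls_score, batch['labels'])         # assumed batch key
        loss_bbox = smooth_L1(bbox_pred, batch['bbox_targets'])      # assumed signature/key
        total_loss = loss_cls + loss_bbox
        optimizer.zero_grad()
        total_loss.backward()
        clip_grad_norm(model.parameters(), 35.0)             # assumed max-norm value
        optimizer.step()
        training_stats.IterToc()
        training_stats.UpdateIterStats(
            {'loss_cls': loss_cls.item(), 'loss_bbox': loss_bbox.item()},   # .item() assumes PyTorch >= 0.4
            {'accuracy_cls': accuracy(cls_score, batch['labels'])})         # assumed signature
        training_stats.LogIterStats(iter, lr)
        if iter % args.checkpoint_period == 0:
            save_checkpoint({'iter': iter, 'state_dict': model.state_dict()}, args.checkpoint_fn)
        iter += 1
        if iter >= args.max_iter:
            break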