├── lib
    ├── __init__.py
    ├── nms
    │   ├── __init__.py
    │   ├── src
    │   │   ├── nms_cuda.h
    │   │   ├── nms.h
    │   │   ├── cuda
    │   │   │   ├── nms_kernel.h
    │   │   │   └── nms_kernel.cu
    │   │   ├── nms_cuda.c
    │   │   └── nms.c
    │   ├── build.py
    │   └── pth_nms.py
    └── build.sh
├── images
    ├── 1.jpg
    ├── 3.jpg
    ├── 4.jpg
    ├── 5.jpg
    ├── 6.jpg
    ├── 7.jpg
    └── 8.jpg
├── .gitignore
├── coco_eval.py
├── visualize.py
├── anchors.py
├── utils.py
├── README.md
├── losses.py
├── train.py
├── csv_eval.py
├── oid_dataset.py
├── LICENSE
├── model.py
└── dataloader.py


/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/images/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/1.jpg


--------------------------------------------------------------------------------
/images/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/3.jpg


--------------------------------------------------------------------------------
/images/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/4.jpg


--------------------------------------------------------------------------------
/images/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/5.jpg


--------------------------------------------------------------------------------
/images/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/6.jpg


--------------------------------------------------------------------------------
/images/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/7.jpg


--------------------------------------------------------------------------------
/images/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/8.jpg


--------------------------------------------------------------------------------
/lib/nms/src/nms_cuda.h:
--------------------------------------------------------------------------------
1 | int gpu_nms(THLongTensor * keep_out, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh);


--------------------------------------------------------------------------------
/lib/nms/src/nms.h:
--------------------------------------------------------------------------------
1 | int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh);


--------------------------------------------------------------------------------
/lib/nms/src/cuda/nms_kernel.h:
--------------------------------------------------------------------------------
 1 | #ifndef _NMS_KERNEL
 2 | #define _NMS_KERNEL
 3 | 
 4 | #ifdef __cplusplus
 5 | extern "C" {
 6 | #endif
 7 | 
 8 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
 9 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
10 | 
11 | void _nms(int boxes_num, float * boxes_dev,
12 |           unsigned long long * mask_dev, float nms_overlap_thresh);
13 | 
14 | #ifdef __cplusplus
15 | }
16 | #endif
17 | 
18 | #endif
19 | 
20 | 


--------------------------------------------------------------------------------
/lib/build.sh:
--------------------------------------------------------------------------------
 1 | CUDA_ARCH="-gencode arch=compute_30,code=sm_30 \
 2 |            -gencode arch=compute_35,code=sm_35 \
 3 |            -gencode arch=compute_50,code=sm_50 \
 4 |            -gencode arch=compute_52,code=sm_52 \
 5 |            -gencode arch=compute_60,code=sm_60 \
 6 |            -gencode arch=compute_61,code=sm_61"
 7 | 
 8 | 
 9 | # Build NMS
10 | cd nms/src/cuda
11 | echo "Compiling nms kernels by nvcc..."
12 | /usr/local/cuda/bin/nvcc -c -o nms_kernel.cu.o nms_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH
13 | cd ../../
14 | python build.py
15 | cd ../
16 | 


--------------------------------------------------------------------------------
/lib/nms/build.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from torch.utils.ffi import create_extension
 4 | 
 5 | 
 6 | sources = ['src/nms.c']
 7 | headers = ['src/nms.h']
 8 | defines = []
 9 | with_cuda = False
10 | 
11 | if torch.cuda.is_available():
12 |     print('Including CUDA code.')
13 |     sources += ['src/nms_cuda.c']
14 |     headers += ['src/nms_cuda.h']
15 |     defines += [('WITH_CUDA', None)]
16 |     with_cuda = True
17 | 
18 | this_file = os.path.dirname(os.path.realpath(__file__))
19 | print(this_file)
20 | extra_objects = ['src/cuda/nms_kernel.cu.o']
21 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
22 | 
23 | ffi = create_extension(
24 |     '_ext.nms',
25 |     headers=headers,
26 |     sources=sources,
27 |     define_macros=defines,
28 |     relative_to=__file__,
29 |     with_cuda=with_cuda,
30 |     extra_objects=extra_objects,
31 |     extra_compile_args=['-std=c99']
32 | )
33 | 
34 | if __name__ == '__main__':
35 |     ffi.build()
36 | 


--------------------------------------------------------------------------------
/lib/nms/pth_nms.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from ._ext import nms
 3 | import numpy as np
 4 | 
 5 | def pth_nms(dets, thresh):
 6 |   """
 7 |   dets has to be a tensor
 8 |   """
 9 |   if not dets.is_cuda:
10 |     x1 = dets[:, 0]
11 |     y1 = dets[:, 1]
12 |     x2 = dets[:, 2]
13 |     y2 = dets[:, 3]
14 |     scores = dets[:, 4]
15 | 
16 |     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
17 |     order = scores.sort(0, descending=True)[1]
18 |     # order = torch.from_numpy(np.ascontiguousarray(scores.numpy().argsort()[::-1])).long()
19 | 
20 |     keep = torch.LongTensor(dets.size(0))
21 |     num_out = torch.LongTensor(1)
22 |     nms.cpu_nms(keep, num_out, dets, order, areas, thresh)
23 | 
24 |     return keep[:num_out[0]]
25 |   else:
26 |     x1 = dets[:, 0]
27 |     y1 = dets[:, 1]
28 |     x2 = dets[:, 2]
29 |     y2 = dets[:, 3]
30 |     scores = dets[:, 4]
31 | 
32 |     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
33 |     order = scores.sort(0, descending=True)[1]
34 |     # order = torch.from_numpy(np.ascontiguousarray(scores.cpu().numpy().argsort()[::-1])).long().cuda()
35 | 
36 |     dets = dets[order].contiguous()
37 | 
38 |     keep = torch.LongTensor(dets.size(0))
39 |     num_out = torch.LongTensor(1)
40 |     # keep = torch.cuda.LongTensor(dets.size(0))
41 |     # num_out = torch.cuda.LongTensor(1)
42 |     nms.gpu_nms(keep, num_out, dets, thresh)
43 | 
44 |     return order[keep[:num_out[0]].cuda()].contiguous()
45 |     # return order[keep[:num_out[0]]].contiguous()
46 | 
47 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | 
 27 | # PyInstaller
 28 | #  Usually these files are written by a python script from a template
 29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 30 | *.manifest
 31 | *.spec
 32 | 
 33 | # Installer logs
 34 | pip-log.txt
 35 | pip-delete-this-directory.txt
 36 | 
 37 | # Unit test / coverage reports
 38 | htmlcov/
 39 | .tox/
 40 | .coverage
 41 | .coverage.*
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | *.cover
 46 | .hypothesis/
 47 | 
 48 | # Translations
 49 | *.mo
 50 | *.pot
 51 | 
 52 | # Django stuff:
 53 | *.log
 54 | local_settings.py
 55 | 
 56 | # Flask stuff:
 57 | instance/
 58 | .webassets-cache
 59 | 
 60 | # Scrapy stuff:
 61 | .scrapy
 62 | 
 63 | # Sphinx documentation
 64 | docs/_build/
 65 | 
 66 | # PyBuilder
 67 | target/
 68 | 
 69 | # Jupyter Notebook
 70 | .ipynb_checkpoints
 71 | 
 72 | # pyenv
 73 | .python-version
 74 | 
 75 | # celery beat schedule file
 76 | celerybeat-schedule
 77 | 
 78 | # SageMath parsed files
 79 | *.sage.py
 80 | 
 81 | # dotenv
 82 | .env
 83 | 
 84 | # virtualenv
 85 | .venv
 86 | venv/
 87 | ENV/
 88 | 
 89 | # Spyder project settings
 90 | .spyderproject
 91 | .spyproject
 92 | 
 93 | # Rope project settings
 94 | .ropeproject
 95 | 
 96 | # mkdocs documentation
 97 | /site
 98 | 
 99 | # mypy
100 | .mypy_cache/
101 | 
102 | *.zip
103 | *.pt
104 | 


--------------------------------------------------------------------------------
/lib/nms/src/nms_cuda.c:
--------------------------------------------------------------------------------
 1 | // ------------------------------------------------------------------
 2 | // Faster R-CNN
 3 | // Copyright (c) 2015 Microsoft
 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details]
 5 | // Written by Shaoqing Ren
 6 | // ------------------------------------------------------------------
 7 | #include <THC/THC.h>
 8 | #include <TH/TH.h>
 9 | #include <math.h>
10 | #include <stdio.h>
11 | 
12 | #include "cuda/nms_kernel.h"
13 | 
14 | 
15 | extern THCState *state;
16 | 
17 | int gpu_nms(THLongTensor * keep, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh) {
18 |   // boxes has to be sorted
19 |   THArgCheck(THLongTensor_isContiguous(keep), 0, "boxes must be contiguous");
20 |   THArgCheck(THCudaTensor_isContiguous(state, boxes), 2, "boxes must be contiguous");
21 |   // Number of ROIs
22 |   int boxes_num = THCudaTensor_size(state, boxes, 0);
23 |   int boxes_dim = THCudaTensor_size(state, boxes, 1);
24 | 
25 |   float* boxes_flat = THCudaTensor_data(state, boxes);
26 | 
27 |   const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
28 |   THCudaLongTensor * mask = THCudaLongTensor_newWithSize2d(state, boxes_num, col_blocks);
29 |   unsigned long long* mask_flat = THCudaLongTensor_data(state, mask);
30 | 
31 |   _nms(boxes_num, boxes_flat, mask_flat, nms_overlap_thresh);
32 | 
33 |   THLongTensor * mask_cpu = THLongTensor_newWithSize2d(boxes_num, col_blocks);
34 |   THLongTensor_copyCuda(state, mask_cpu, mask);
35 |   THCudaLongTensor_free(state, mask);
36 | 
37 |   unsigned long long * mask_cpu_flat = THLongTensor_data(mask_cpu);
38 | 
39 |   THLongTensor * remv_cpu = THLongTensor_newWithSize1d(col_blocks);
40 |   unsigned long long* remv_cpu_flat = THLongTensor_data(remv_cpu);
41 |   THLongTensor_fill(remv_cpu, 0);
42 | 
43 |   long * keep_flat = THLongTensor_data(keep);
44 |   long num_to_keep = 0;
45 | 
46 |   int i, j;
47 |   for (i = 0; i < boxes_num; i++) {
48 |     int nblock = i / threadsPerBlock;
49 |     int inblock = i % threadsPerBlock;
50 | 
51 |     if (!(remv_cpu_flat[nblock] & (1ULL << inblock))) {
52 |       keep_flat[num_to_keep++] = i;
53 |       unsigned long long *p = &mask_cpu_flat[0] + i * col_blocks;
54 |       for (j = nblock; j < col_blocks; j++) {
55 |         remv_cpu_flat[j] |= p[j];
56 |       }
57 |     }
58 |   }
59 | 
60 |   long * num_out_flat = THLongTensor_data(num_out);
61 |   * num_out_flat = num_to_keep;
62 | 
63 |   THLongTensor_free(mask_cpu);
64 |   THLongTensor_free(remv_cpu);
65 | 
66 |   return 1;
67 | }
68 | 


--------------------------------------------------------------------------------
/lib/nms/src/nms.c:
--------------------------------------------------------------------------------
 1 | #include <TH/TH.h>
 2 | #include <math.h>
 3 | 
 4 | int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh) {
 5 |     // boxes has to be sorted
 6 |     THArgCheck(THLongTensor_isContiguous(keep_out), 0, "keep_out must be contiguous");
 7 |     THArgCheck(THLongTensor_isContiguous(boxes), 2, "boxes must be contiguous");
 8 |     THArgCheck(THLongTensor_isContiguous(order), 3, "order must be contiguous");
 9 |     THArgCheck(THLongTensor_isContiguous(areas), 4, "areas must be contiguous");
10 |     // Number of ROIs
11 |     long boxes_num = THFloatTensor_size(boxes, 0);
12 |     long boxes_dim = THFloatTensor_size(boxes, 1);
13 | 
14 |     long * keep_out_flat = THLongTensor_data(keep_out);
15 |     float * boxes_flat = THFloatTensor_data(boxes);
16 |     long * order_flat = THLongTensor_data(order);
17 |     float * areas_flat = THFloatTensor_data(areas);
18 | 
19 |     THByteTensor* suppressed = THByteTensor_newWithSize1d(boxes_num);
20 |     THByteTensor_fill(suppressed, 0);
21 |     unsigned char * suppressed_flat =  THByteTensor_data(suppressed);
22 | 
23 |     // nominal indices
24 |     int i, j;
25 |     // sorted indices
26 |     int _i, _j;
27 |     // temp variables for box i's (the box currently under consideration)
28 |     float ix1, iy1, ix2, iy2, iarea;
29 |     // variables for computing overlap with box j (lower scoring box)
30 |     float xx1, yy1, xx2, yy2;
31 |     float w, h;
32 |     float inter, ovr;
33 | 
34 |     long num_to_keep = 0;
35 |     for (_i=0; _i < boxes_num; ++_i) {
36 |         i = order_flat[_i];
37 |         if (suppressed_flat[i] == 1) {
38 |             continue;
39 |         }
40 |         keep_out_flat[num_to_keep++] = i;
41 |         ix1 = boxes_flat[i * boxes_dim];
42 |         iy1 = boxes_flat[i * boxes_dim + 1];
43 |         ix2 = boxes_flat[i * boxes_dim + 2];
44 |         iy2 = boxes_flat[i * boxes_dim + 3];
45 |         iarea = areas_flat[i];
46 |         for (_j = _i + 1; _j < boxes_num; ++_j) {
47 |             j = order_flat[_j];
48 |             if (suppressed_flat[j] == 1) {
49 |                 continue;
50 |             }
51 |             xx1 = fmaxf(ix1, boxes_flat[j * boxes_dim]);
52 |             yy1 = fmaxf(iy1, boxes_flat[j * boxes_dim + 1]);
53 |             xx2 = fminf(ix2, boxes_flat[j * boxes_dim + 2]);
54 |             yy2 = fminf(iy2, boxes_flat[j * boxes_dim + 3]);
55 |             w = fmaxf(0.0, xx2 - xx1 + 1);
56 |             h = fmaxf(0.0, yy2 - yy1 + 1);
57 |             inter = w * h;
58 |             ovr = inter / (iarea + areas_flat[j] - inter);
59 |             if (ovr >= nms_overlap_thresh) {
60 |                 suppressed_flat[j] = 1;
61 |             }
62 |         }
63 |     }
64 | 
65 |     long *num_out_flat = THLongTensor_data(num_out);
66 |     *num_out_flat = num_to_keep;
67 |     THByteTensor_free(suppressed);
68 |     return 1;
69 | }


--------------------------------------------------------------------------------
/coco_eval.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | from pycocotools.coco import COCO
 4 | from pycocotools.cocoeval import COCOeval
 5 | 
 6 | import numpy as np
 7 | import json
 8 | import os
 9 | 
10 | import torch
11 | 
12 | def evaluate_coco(dataset, model, threshold=0.05):
13 |     
14 |     model.eval()
15 |     
16 |     with torch.no_grad():
17 | 
18 |         # start collecting results
19 |         results = []
20 |         image_ids = []
21 | 
22 |         for index in range(len(dataset)):
23 |             data = dataset[index]
24 |             scale = data['scale']
25 | 
26 |             # run network
27 |             scores, labels, boxes = model(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0))
28 |             scores = scores.cpu()
29 |             labels = labels.cpu()
30 |             boxes  = boxes.cpu()
31 | 
32 |             # correct boxes for image scale
33 |             boxes /= scale
34 | 
35 |             if boxes.shape[0] > 0:
36 |                 # change to (x, y, w, h) (MS COCO standard)
37 |                 boxes[:, 2] -= boxes[:, 0]
38 |                 boxes[:, 3] -= boxes[:, 1]
39 | 
40 |                 # compute predicted labels and scores
41 |                 #for box, score, label in zip(boxes[0], scores[0], labels[0]):
42 |                 for box_id in range(boxes.shape[0]):
43 |                     score = float(scores[box_id])
44 |                     label = int(labels[box_id])
45 |                     box = boxes[box_id, :]
46 | 
47 |                     # scores are sorted, so we can break
48 |                     if score < threshold:
49 |                         break
50 | 
51 |                     # append detection for each positively labeled class
52 |                     image_result = {
53 |                         'image_id'    : dataset.image_ids[index],
54 |                         'category_id' : dataset.label_to_coco_label(label),
55 |                         'score'       : float(score),
56 |                         'bbox'        : box.tolist(),
57 |                     }
58 | 
59 |                     # append detection to results
60 |                     results.append(image_result)
61 | 
62 |             # append image to list of processed images
63 |             image_ids.append(dataset.image_ids[index])
64 | 
65 |             # print progress
66 |             print('{}/{}'.format(index, len(dataset)), end='\r')
67 | 
68 |         if not len(results):
69 |             return
70 | 
71 |         # write output
72 |         json.dump(results, open('{}_bbox_results.json'.format(dataset.set_name), 'w'), indent=4)
73 | 
74 |         # load results in COCO evaluation tool
75 |         coco_true = dataset.coco
76 |         coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(dataset.set_name))
77 | 
78 |         # run COCO evaluation
79 |         coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
80 |         coco_eval.params.imgIds = image_ids
81 |         coco_eval.evaluate()
82 |         coco_eval.accumulate()
83 |         coco_eval.summarize()
84 | 
85 |         model.train()
86 | 
87 |         return
88 | 


--------------------------------------------------------------------------------
/lib/nms/src/cuda/nms_kernel.cu:
--------------------------------------------------------------------------------
 1 | // ------------------------------------------------------------------
 2 | // Faster R-CNN
 3 | // Copyright (c) 2015 Microsoft
 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details]
 5 | // Written by Shaoqing Ren
 6 | // ------------------------------------------------------------------
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | #include <math.h>
12 | #include <stdio.h>
13 | #include <float.h>
14 | #include "nms_kernel.h"
15 | 
16 | __device__ inline float devIoU(float const * const a, float const * const b) {
17 |   float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
18 |   float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
19 |   float width = fmaxf(right - left + 1, 0.f), height = fmaxf(bottom - top + 1, 0.f);
20 |   float interS = width * height;
21 |   float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
22 |   float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
23 |   return interS / (Sa + Sb - interS);
24 | }
25 | 
26 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
27 |                            const float *dev_boxes, unsigned long long *dev_mask) {
28 |   const int row_start = blockIdx.y;
29 |   const int col_start = blockIdx.x;
30 | 
31 |   // if (row_start > col_start) return;
32 | 
33 |   const int row_size =
34 |         fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
35 |   const int col_size =
36 |         fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
37 | 
38 |   __shared__ float block_boxes[threadsPerBlock * 5];
39 |   if (threadIdx.x < col_size) {
40 |     block_boxes[threadIdx.x * 5 + 0] =
41 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
42 |     block_boxes[threadIdx.x * 5 + 1] =
43 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
44 |     block_boxes[threadIdx.x * 5 + 2] =
45 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
46 |     block_boxes[threadIdx.x * 5 + 3] =
47 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
48 |     block_boxes[threadIdx.x * 5 + 4] =
49 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
50 |   }
51 |   __syncthreads();
52 | 
53 |   if (threadIdx.x < row_size) {
54 |     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
55 |     const float *cur_box = dev_boxes + cur_box_idx * 5;
56 |     int i = 0;
57 |     unsigned long long t = 0;
58 |     int start = 0;
59 |     if (row_start == col_start) {
60 |       start = threadIdx.x + 1;
61 |     }
62 |     for (i = start; i < col_size; i++) {
63 |       if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
64 |         t |= 1ULL << i;
65 |       }
66 |     }
67 |     const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
68 |     dev_mask[cur_box_idx * col_blocks + col_start] = t;
69 |   }
70 | }
71 | 
72 | 
73 | void _nms(int boxes_num, float * boxes_dev,
74 |           unsigned long long * mask_dev, float nms_overlap_thresh) {
75 | 
76 |   dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
77 |               DIVUP(boxes_num, threadsPerBlock));
78 |   dim3 threads(threadsPerBlock);
79 |   nms_kernel<<<blocks, threads>>>(boxes_num,
80 |                                   nms_overlap_thresh,
81 |                                   boxes_dev,
82 |                                   mask_dev);
83 | }
84 | 
85 | #ifdef __cplusplus
86 | }
87 | #endif
88 | 


--------------------------------------------------------------------------------
/visualize.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torchvision
 3 | import time
 4 | import os
 5 | import copy
 6 | import pdb
 7 | import time
 8 | import argparse
 9 | 
10 | import sys
11 | import cv2
12 | 
13 | import torch
14 | from torch.utils.data import Dataset, DataLoader
15 | from torchvision import datasets, models, transforms
16 | 
17 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer
18 | 
19 | 
20 | assert torch.__version__.split('.')[1] == '4'
21 | 
22 | print('CUDA available: {}'.format(torch.cuda.is_available()))
23 | 
24 | 
25 | def main(args=None):
26 | 	parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')
27 | 
28 | 	parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.')
29 | 	parser.add_argument('--coco_path', help='Path to COCO directory')
30 | 	parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)')
31 | 	parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)')
32 | 
33 | 	parser.add_argument('--model', help='Path to model (.pt) file.')
34 | 
35 | 	parser = parser.parse_args(args)
36 | 
37 | 	if parser.dataset == 'coco':
38 | 		dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()]))
39 | 	elif parser.dataset == 'csv':
40 | 		dataset_val = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()]))
41 | 	else:
42 | 		raise ValueError('Dataset type not understood (must be csv or coco), exiting.')
43 | 
44 | 	sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
45 | 	dataloader_val = DataLoader(dataset_val, num_workers=1, collate_fn=collater, batch_sampler=sampler_val)
46 | 
47 | 	retinanet = torch.load(parser.model)
48 | 
49 | 	use_gpu = True
50 | 
51 | 	if use_gpu:
52 | 		retinanet = retinanet.cuda()
53 | 
54 | 	retinanet.eval()
55 | 
56 | 	unnormalize = UnNormalizer()
57 | 
58 | 	def draw_caption(image, box, caption):
59 | 
60 | 		b = np.array(box).astype(int)
61 | 		cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2)
62 | 		cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
63 | 
64 | 	for idx, data in enumerate(dataloader_val):
65 | 
66 | 		with torch.no_grad():
67 | 			st = time.time()
68 | 			scores, classification, transformed_anchors = retinanet(data['img'].cuda().float())
69 | 			print('Elapsed time: {}'.format(time.time()-st))
70 | 			idxs = np.where(scores>0.5)
71 | 			img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
72 | 
73 | 			img[img<0] = 0
74 | 			img[img>255] = 255
75 | 
76 | 			img = np.transpose(img, (1, 2, 0))
77 | 
78 | 			img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
79 | 
80 | 			for j in range(idxs[0].shape[0]):
81 | 				bbox = transformed_anchors[idxs[0][j], :]
82 | 				x1 = int(bbox[0])
83 | 				y1 = int(bbox[1])
84 | 				x2 = int(bbox[2])
85 | 				y2 = int(bbox[3])
86 | 				label_name = dataset_val.labels[int(classification[idxs[0][j]])]
87 | 				draw_caption(img, (x1, y1, x2, y2), label_name)
88 | 
89 | 				cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 0, 255), thickness=2)
90 | 				print(label_name)
91 | 
92 | 			cv2.imshow('img', img)
93 | 			cv2.waitKey(0)
94 | 
95 | 
96 | 
97 | if __name__ == '__main__':
98 |  main()


--------------------------------------------------------------------------------
/anchors.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import torch.nn as nn
  4 | 
  5 | 
  6 | class Anchors(nn.Module):
  7 |     def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None):
  8 |         super(Anchors, self).__init__()
  9 | 
 10 |         if pyramid_levels is None:
 11 |             self.pyramid_levels = [3, 4, 5, 6, 7]
 12 |         if strides is None:
 13 |             self.strides = [2 ** x for x in self.pyramid_levels]
 14 |         if sizes is None:
 15 |             self.sizes = [2 ** (x + 2) for x in self.pyramid_levels]
 16 |         if ratios is None:
 17 |             self.ratios = np.array([0.5, 1, 2])
 18 |         if scales is None:
 19 |             self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
 20 | 
 21 |     def forward(self, image):
 22 |         
 23 |         image_shape = image.shape[2:]
 24 |         image_shape = np.array(image_shape)
 25 |         image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels]
 26 | 
 27 |         # compute anchors over all pyramid levels
 28 |         all_anchors = np.zeros((0, 4)).astype(np.float32)
 29 | 
 30 |         for idx, p in enumerate(self.pyramid_levels):
 31 |             anchors         = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales)
 32 |             shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors)
 33 |             all_anchors     = np.append(all_anchors, shifted_anchors, axis=0)
 34 | 
 35 |         all_anchors = np.expand_dims(all_anchors, axis=0)
 36 | 
 37 |         return torch.from_numpy(all_anchors.astype(np.float32)).cuda()
 38 | 
 39 | def generate_anchors(base_size=16, ratios=None, scales=None):
 40 |     """
 41 |     Generate anchor (reference) windows by enumerating aspect ratios X
 42 |     scales w.r.t. a reference window.
 43 |     """
 44 | 
 45 |     if ratios is None:
 46 |         ratios = np.array([0.5, 1, 2])
 47 | 
 48 |     if scales is None:
 49 |         scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
 50 | 
 51 |     num_anchors = len(ratios) * len(scales)
 52 | 
 53 |     # initialize output anchors
 54 |     anchors = np.zeros((num_anchors, 4))
 55 | 
 56 |     # scale base_size
 57 |     anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T
 58 | 
 59 |     # compute areas of anchors
 60 |     areas = anchors[:, 2] * anchors[:, 3]
 61 | 
 62 |     # correct for ratios
 63 |     anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales)))
 64 |     anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales))
 65 | 
 66 |     # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2)
 67 |     anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
 68 |     anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
 69 | 
 70 |     return anchors
 71 | 
 72 | def compute_shape(image_shape, pyramid_levels):
 73 |     """Compute shapes based on pyramid levels.
 74 | 
 75 |     :param image_shape:
 76 |     :param pyramid_levels:
 77 |     :return:
 78 |     """
 79 |     image_shape = np.array(image_shape[:2])
 80 |     image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels]
 81 |     return image_shapes
 82 | 
 83 | 
 84 | def anchors_for_shape(
 85 |     image_shape,
 86 |     pyramid_levels=None,
 87 |     ratios=None,
 88 |     scales=None,
 89 |     strides=None,
 90 |     sizes=None,
 91 |     shapes_callback=None,
 92 | ):
 93 | 
 94 |     image_shapes = compute_shape(image_shape, pyramid_levels)
 95 | 
 96 |     # compute anchors over all pyramid levels
 97 |     all_anchors = np.zeros((0, 4))
 98 |     for idx, p in enumerate(pyramid_levels):
 99 |         anchors         = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales)
100 |         shifted_anchors = shift(image_shapes[idx], strides[idx], anchors)
101 |         all_anchors     = np.append(all_anchors, shifted_anchors, axis=0)
102 | 
103 |     return all_anchors
104 | 
105 | 
106 | def shift(shape, stride, anchors):
107 |     shift_x = (np.arange(0, shape[1]) + 0.5) * stride
108 |     shift_y = (np.arange(0, shape[0]) + 0.5) * stride
109 | 
110 |     shift_x, shift_y = np.meshgrid(shift_x, shift_y)
111 | 
112 |     shifts = np.vstack((
113 |         shift_x.ravel(), shift_y.ravel(),
114 |         shift_x.ravel(), shift_y.ravel()
115 |     )).transpose()
116 | 
117 |     # add A anchors (1, A, 4) to
118 |     # cell K shifts (K, 1, 4) to get
119 |     # shift anchors (K, A, 4)
120 |     # reshape to (K*A, 4) shifted anchors
121 |     A = anchors.shape[0]
122 |     K = shifts.shape[0]
123 |     all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
124 |     all_anchors = all_anchors.reshape((K * A, 4))
125 | 
126 |     return all_anchors
127 | 
128 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import numpy as np
  4 | 
  5 | def conv3x3(in_planes, out_planes, stride=1):
  6 |     """3x3 convolution with padding"""
  7 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
  8 |                      padding=1, bias=False)
  9 | 
 10 | class BasicBlock(nn.Module):
 11 |     expansion = 1
 12 | 
 13 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 14 |         super(BasicBlock, self).__init__()
 15 |         self.conv1 = conv3x3(inplanes, planes, stride)
 16 |         self.bn1 = nn.BatchNorm2d(planes)
 17 |         self.relu = nn.ReLU(inplace=True)
 18 |         self.conv2 = conv3x3(planes, planes)
 19 |         self.bn2 = nn.BatchNorm2d(planes)
 20 |         self.downsample = downsample
 21 |         self.stride = stride
 22 | 
 23 |     def forward(self, x):
 24 |         residual = x
 25 | 
 26 |         out = self.conv1(x)
 27 |         out = self.bn1(out)
 28 |         out = self.relu(out)
 29 | 
 30 |         out = self.conv2(out)
 31 |         out = self.bn2(out)
 32 | 
 33 |         if self.downsample is not None:
 34 |             residual = self.downsample(x)
 35 | 
 36 |         out += residual
 37 |         out = self.relu(out)
 38 | 
 39 |         return out
 40 | 
 41 | 
 42 | class Bottleneck(nn.Module):
 43 |     expansion = 4
 44 | 
 45 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 46 |         super(Bottleneck, self).__init__()
 47 |         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
 48 |         self.bn1 = nn.BatchNorm2d(planes)
 49 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
 50 |                                padding=1, bias=False)
 51 |         self.bn2 = nn.BatchNorm2d(planes)
 52 |         self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
 53 |         self.bn3 = nn.BatchNorm2d(planes * 4)
 54 |         self.relu = nn.ReLU(inplace=True)
 55 |         self.downsample = downsample
 56 |         self.stride = stride
 57 | 
 58 |     def forward(self, x):
 59 |         residual = x
 60 | 
 61 |         out = self.conv1(x)
 62 |         out = self.bn1(out)
 63 |         out = self.relu(out)
 64 | 
 65 |         out = self.conv2(out)
 66 |         out = self.bn2(out)
 67 |         out = self.relu(out)
 68 | 
 69 |         out = self.conv3(out)
 70 |         out = self.bn3(out)
 71 | 
 72 |         if self.downsample is not None:
 73 |             residual = self.downsample(x)
 74 | 
 75 |         out += residual
 76 |         out = self.relu(out)
 77 | 
 78 |         return out
 79 | 
 80 | class BBoxTransform(nn.Module):
 81 | 
 82 |     def __init__(self, mean=None, std=None):
 83 |         super(BBoxTransform, self).__init__()
 84 |         if mean is None:
 85 |             self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)).cuda()
 86 |         else:
 87 |             self.mean = mean
 88 |         if std is None:
 89 |             self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)).cuda()
 90 |         else:
 91 |             self.std = std
 92 | 
 93 |     def forward(self, boxes, deltas):
 94 | 
 95 |         widths  = boxes[:, :, 2] - boxes[:, :, 0]
 96 |         heights = boxes[:, :, 3] - boxes[:, :, 1]
 97 |         ctr_x   = boxes[:, :, 0] + 0.5 * widths
 98 |         ctr_y   = boxes[:, :, 1] + 0.5 * heights
 99 | 
100 |         dx = deltas[:, :, 0] * self.std[0] + self.mean[0]
101 |         dy = deltas[:, :, 1] * self.std[1] + self.mean[1]
102 |         dw = deltas[:, :, 2] * self.std[2] + self.mean[2]
103 |         dh = deltas[:, :, 3] * self.std[3] + self.mean[3]
104 | 
105 |         pred_ctr_x = ctr_x + dx * widths
106 |         pred_ctr_y = ctr_y + dy * heights
107 |         pred_w     = torch.exp(dw) * widths
108 |         pred_h     = torch.exp(dh) * heights
109 | 
110 |         pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w
111 |         pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h
112 |         pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w
113 |         pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h
114 | 
115 |         pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2)
116 | 
117 |         return pred_boxes
118 | 
119 | 
120 | class ClipBoxes(nn.Module):
121 | 
122 |     def __init__(self, width=None, height=None):
123 |         super(ClipBoxes, self).__init__()
124 | 
125 |     def forward(self, boxes, img):
126 | 
127 |         batch_size, num_channels, height, width = img.shape
128 | 
129 |         boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
130 |         boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0)
131 | 
132 |         boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width)
133 |         boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height)
134 |       
135 |         return boxes
136 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # pytorch-retinanet
  2 | 
  3 | ![img3](https://github.com/yhenon/pytorch-retinanet/blob/master/images/3.jpg)
  4 | ![img5](https://github.com/yhenon/pytorch-retinanet/blob/master/images/5.jpg)
  5 | 
  6 | Pytorch  implementation of RetinaNet object detection as described in [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) by Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He and Piotr Dollár.
  7 | 
  8 | This implementation is primarily designed to be easy to read and simple to modify.
  9 | 
 10 | ## Results
 11 | Currently, this repo achieves 33.7% mAP at 600px resolution with a Resnet-50 backbone. The published result is 34.0% mAP. The difference is likely due to the use of Adam optimizer instead of SGD with weight decay.
 12 | 
 13 | ## Installation
 14 | 
 15 | 1) Clone this repo
 16 | 
 17 | 2) Install the required packages:
 18 | 
 19 | ```
 20 | apt-get install tk-dev python-tk
 21 | ```
 22 | 
 23 | 3) Install the python packages:
 24 | 	
 25 | ```
 26 | pip install cffi
 27 | 
 28 | pip install pandas
 29 | 
 30 | pip install pycocotools
 31 | 
 32 | pip install cython
 33 | 
 34 | pip install pycocotools
 35 | 
 36 | pip install opencv-python
 37 | 
 38 | pip install requests
 39 | 
 40 | ```
 41 | 
 42 | 4) Build the NMS extension.
 43 | 
 44 | ```
 45 | cd pytorch-retinanet/lib
 46 | bash build.sh
 47 | cd ../
 48 | ```
 49 | 
 50 | Note that you may have to edit line 14 of `build.sh` if you want to change which version of python you are building the extension for.
 51 | 
 52 | ## Training
 53 | 
 54 | The network can be trained using the `train.py` script. Currently, two dataloaders are available: COCO and CSV. For training on coco, use
 55 | 
 56 | ```
 57 | python train.py --dataset coco --coco_path ../coco --depth 50
 58 | ```
 59 | 
 60 | For training using a custom dataset, with annotations in CSV format (see below), use
 61 | 
 62 | ```
 63 | python train.py --dataset csv --csv_train <path/to/train_annots.csv>  --csv_classes <path/to/train/class_list.csv>  --csv_val <path/to/val_annots.csv>
 64 | ```
 65 | 
 66 | Note that the --csv_val argument is optional, in which case no validation will be performed.
 67 | 
 68 | ## Pre-trained model
 69 | 
 70 | A pre-trained model is available at: 
 71 | - https://drive.google.com/open?id=1yLmjq3JtXi841yXWBxst0coAgR26MNBS (this is a pytorch state dict)
 72 | - https://drive.google.com/open?id=1hCtM35R_t6T8RJVSd74K4gB-A1MR-TxC (this is a pytorch model serialized via `torch.save()`)
 73 | 
 74 | The state dict model can be loaded using:
 75 | 
 76 | ```
 77 | retinanet = model.resnet50(num_classes=dataset_train.num_classes(),)
 78 | retinanet.load_state_dict(torch.load(PATH_TO_WEIGHTS))
 79 | ```
 80 | 
 81 | The pytorch model can be loaded directly using:
 82 | 
 83 | ```
 84 | retinanet = torch.load(PATH_TO_MODEL)
 85 | ```
 86 | 
 87 | ## Visualization
 88 | 
 89 | To visualize the network detection, use `visualize.py`:
 90 | 
 91 | ```
 92 | python visualize.py --dataset coco --coco_path ../coco --model <path/to/model.pt>
 93 | ```
 94 | This will visualize bounding boxes on the validation set. To visualise with a CSV dataset, use:
 95 | 
 96 | ```
 97 | python visualize.py --dataset csv --csv_classes <path/to/train/class_list.csv>  --csv_val <path/to/val_annots.csv> --model <path/to/model.pt>
 98 | ```
 99 | 
100 | ## Model
101 | 
102 | The retinanet model uses a resnet backbone. You can set the depth of the resnet model using the --depth argument. Depth must be one of 18, 34, 50, 101 or 152. Note that deeper models are more accurate but are slower and use more memory.
103 | 
104 | ## CSV datasets
105 | The `CSVGenerator` provides an easy way to define your own datasets.
106 | It uses two CSV files: one file containing annotations and one file containing a class name to ID mapping.
107 | 
108 | ### Annotations format
109 | The CSV file with annotations should contain one annotation per line.
110 | Images with multiple bounding boxes should use one row per bounding box.
111 | Note that indexing for pixel values starts at 0.
112 | The expected format of each line is:
113 | ```
114 | path/to/image.jpg,x1,y1,x2,y2,class_name
115 | ```
116 | 
117 | Some images may not contain any labeled objects.
118 | To add these images to the dataset as negative examples,
119 | add an annotation where `x1`, `y1`, `x2`, `y2` and `class_name` are all empty:
120 | ```
121 | path/to/image.jpg,,,,,
122 | ```
123 | 
124 | A full example:
125 | ```
126 | /data/imgs/img_001.jpg,837,346,981,456,cow
127 | /data/imgs/img_002.jpg,215,312,279,391,cat
128 | /data/imgs/img_002.jpg,22,5,89,84,bird
129 | /data/imgs/img_003.jpg,,,,,
130 | ```
131 | 
132 | This defines a dataset with 3 images.
133 | `img_001.jpg` contains a cow.
134 | `img_002.jpg` contains a cat and a bird.
135 | `img_003.jpg` contains no interesting objects/animals.
136 | 
137 | 
138 | ### Class mapping format
139 | The class name to ID mapping file should contain one mapping per line.
140 | Each line should use the following format:
141 | ```
142 | class_name,id
143 | ```
144 | 
145 | Indexing for classes starts at 0.
146 | Do not include a background class as it is implicit.
147 | 
148 | For example:
149 | ```
150 | cow,0
151 | cat,1
152 | bird,2
153 | ```
154 | 
155 | ## Acknowledgements
156 | 
157 | - Significant amounts of code are borrowed from the [keras retinanet implementation](https://github.com/fizyr/keras-retinanet)
158 | - The NMS module used is from the [pytorch faster-rcnn implementation](https://github.com/ruotianluo/pytorch-faster-rcnn)
159 | 
160 | ## Examples
161 | 
162 | ![img1](https://github.com/yhenon/pytorch-retinanet/blob/master/images/1.jpg)
163 | ![img2](https://github.com/yhenon/pytorch-retinanet/blob/master/images/2.jpg)
164 | ![img4](https://github.com/yhenon/pytorch-retinanet/blob/master/images/4.jpg)
165 | ![img6](https://github.com/yhenon/pytorch-retinanet/blob/master/images/6.jpg)
166 | ![img7](https://github.com/yhenon/pytorch-retinanet/blob/master/images/7.jpg)
167 | ![img8](https://github.com/yhenon/pytorch-retinanet/blob/master/images/8.jpg)
168 | 


--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import torch.nn as nn
  4 | 
  5 | def calc_iou(a, b):
  6 |     area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
  7 | 
  8 |     iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0])
  9 |     ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1])
 10 | 
 11 |     iw = torch.clamp(iw, min=0)
 12 |     ih = torch.clamp(ih, min=0)
 13 | 
 14 |     ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih
 15 | 
 16 |     ua = torch.clamp(ua, min=1e-8)
 17 | 
 18 |     intersection = iw * ih
 19 | 
 20 |     IoU = intersection / ua
 21 | 
 22 |     return IoU
 23 | 
 24 | class FocalLoss(nn.Module):
 25 |     #def __init__(self):
 26 | 
 27 |     def forward(self, classifications, regressions, anchors, annotations):
 28 |         alpha = 0.25
 29 |         gamma = 2.0
 30 |         batch_size = classifications.shape[0]
 31 |         classification_losses = []
 32 |         regression_losses = []
 33 | 
 34 |         anchor = anchors[0, :, :]
 35 | 
 36 |         anchor_widths  = anchor[:, 2] - anchor[:, 0]
 37 |         anchor_heights = anchor[:, 3] - anchor[:, 1]
 38 |         anchor_ctr_x   = anchor[:, 0] + 0.5 * anchor_widths
 39 |         anchor_ctr_y   = anchor[:, 1] + 0.5 * anchor_heights
 40 | 
 41 |         for j in range(batch_size):
 42 | 
 43 |             classification = classifications[j, :, :]
 44 |             regression = regressions[j, :, :]
 45 | 
 46 |             bbox_annotation = annotations[j, :, :]
 47 |             bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]
 48 | 
 49 |             if bbox_annotation.shape[0] == 0:
 50 |                 regression_losses.append(torch.tensor(0).float().cuda())
 51 |                 classification_losses.append(torch.tensor(0).float().cuda())
 52 | 
 53 |                 continue
 54 | 
 55 |             classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)
 56 | 
 57 |             IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations
 58 | 
 59 |             IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1
 60 | 
 61 |             #import pdb
 62 |             #pdb.set_trace()
 63 | 
 64 |             # compute the loss for classification
 65 |             targets = torch.ones(classification.shape) * -1
 66 |             targets = targets.cuda()
 67 | 
 68 |             targets[torch.lt(IoU_max, 0.4), :] = 0
 69 | 
 70 |             positive_indices = torch.ge(IoU_max, 0.5)
 71 | 
 72 |             num_positive_anchors = positive_indices.sum()
 73 | 
 74 |             assigned_annotations = bbox_annotation[IoU_argmax, :]
 75 | 
 76 |             targets[positive_indices, :] = 0
 77 |             targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1
 78 | 
 79 |             alpha_factor = torch.ones(targets.shape).cuda() * alpha
 80 | 
 81 |             alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor)
 82 |             focal_weight = torch.where(torch.eq(targets, 1.), 1. - classification, classification)
 83 |             focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
 84 | 
 85 |             bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))
 86 | 
 87 |             # cls_loss = focal_weight * torch.pow(bce, gamma)
 88 |             cls_loss = focal_weight * bce
 89 | 
 90 |             cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda())
 91 | 
 92 |             classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0))
 93 | 
 94 |             # compute the loss for regression
 95 | 
 96 |             if positive_indices.sum() > 0:
 97 |                 assigned_annotations = assigned_annotations[positive_indices, :]
 98 | 
 99 |                 anchor_widths_pi = anchor_widths[positive_indices]
100 |                 anchor_heights_pi = anchor_heights[positive_indices]
101 |                 anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
102 |                 anchor_ctr_y_pi = anchor_ctr_y[positive_indices]
103 | 
104 |                 gt_widths  = assigned_annotations[:, 2] - assigned_annotations[:, 0]
105 |                 gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
106 |                 gt_ctr_x   = assigned_annotations[:, 0] + 0.5 * gt_widths
107 |                 gt_ctr_y   = assigned_annotations[:, 1] + 0.5 * gt_heights
108 | 
109 |                 # clip widths to 1
110 |                 gt_widths  = torch.clamp(gt_widths, min=1)
111 |                 gt_heights = torch.clamp(gt_heights, min=1)
112 | 
113 |                 targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
114 |                 targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
115 |                 targets_dw = torch.log(gt_widths / anchor_widths_pi)
116 |                 targets_dh = torch.log(gt_heights / anchor_heights_pi)
117 | 
118 |                 targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh))
119 |                 targets = targets.t()
120 | 
121 |                 targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda()
122 | 
123 | 
124 |                 negative_indices = 1 - positive_indices
125 | 
126 |                 regression_diff = torch.abs(targets - regression[positive_indices, :])
127 | 
128 |                 regression_loss = torch.where(
129 |                     torch.le(regression_diff, 1.0 / 9.0),
130 |                     0.5 * 9.0 * torch.pow(regression_diff, 2),
131 |                     regression_diff - 0.5 / 9.0
132 |                 )
133 |                 regression_losses.append(regression_loss.mean())
134 |             else:
135 |                 regression_losses.append(torch.tensor(0).float().cuda())
136 | 
137 |         return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True)
138 | 
139 |     
140 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import os
  3 | import copy
  4 | import argparse
  5 | import pdb
  6 | import collections
  7 | import sys
  8 | 
  9 | import numpy as np
 10 | 
 11 | import torch
 12 | import torch.nn as nn
 13 | import torch.optim as optim
 14 | from torch.optim import lr_scheduler
 15 | from torch.autograd import Variable
 16 | from torchvision import datasets, models, transforms
 17 | import torchvision
 18 | 
 19 | import model
 20 | from anchors import Anchors
 21 | import losses
 22 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer
 23 | from torch.utils.data import Dataset, DataLoader
 24 | 
 25 | import coco_eval
 26 | import csv_eval
 27 | 
 28 | assert torch.__version__.split('.')[1] == '4'
 29 | 
 30 | print('CUDA available: {}'.format(torch.cuda.is_available()))
 31 | 
 32 | 
 33 | def main(args=None):
 34 | 
 35 | 	parser     = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')
 36 | 
 37 | 	parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.')
 38 | 	parser.add_argument('--coco_path', help='Path to COCO directory')
 39 | 	parser.add_argument('--csv_train', help='Path to file containing training annotations (see readme)')
 40 | 	parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)')
 41 | 	parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)')
 42 | 
 43 | 	parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50)
 44 | 	parser.add_argument('--epochs', help='Number of epochs', type=int, default=100)
 45 | 
 46 | 	parser = parser.parse_args(args)
 47 | 
 48 | 	# Create the data loaders
 49 | 	if parser.dataset == 'coco':
 50 | 
 51 | 		if parser.coco_path is None:
 52 | 			raise ValueError('Must provide --coco_path when training on COCO,')
 53 | 
 54 | 		dataset_train = CocoDataset(parser.coco_path, set_name='train2017', transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
 55 | 		dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()]))
 56 | 
 57 | 	elif parser.dataset == 'csv':
 58 | 
 59 | 		if parser.csv_train is None:
 60 | 			raise ValueError('Must provide --csv_train when training on COCO,')
 61 | 
 62 | 		if parser.csv_classes is None:
 63 | 			raise ValueError('Must provide --csv_classes when training on COCO,')
 64 | 
 65 | 
 66 | 		dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
 67 | 
 68 | 		if parser.csv_val is None:
 69 | 			dataset_val = None
 70 | 			print('No validation annotations provided.')
 71 | 		else:
 72 | 			dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()]))
 73 | 
 74 | 	else:
 75 | 		raise ValueError('Dataset type not understood (must be csv or coco), exiting.')
 76 | 
 77 | 	sampler = AspectRatioBasedSampler(dataset_train, batch_size=2, drop_last=False)
 78 | 	dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler)
 79 | 
 80 | 	if dataset_val is not None:
 81 | 		sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
 82 | 		dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val)
 83 | 
 84 | 	# Create the model
 85 | 	if parser.depth == 18:
 86 | 		retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True)
 87 | 	elif parser.depth == 34:
 88 | 		retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True)
 89 | 	elif parser.depth == 50:
 90 | 		retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True)
 91 | 	elif parser.depth == 101:
 92 | 		retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True)
 93 | 	elif parser.depth == 152:
 94 | 		retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True)
 95 | 	else:
 96 | 		raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152')		
 97 | 
 98 | 	use_gpu = True
 99 | 
100 | 	if use_gpu:
101 | 		retinanet = retinanet.cuda()
102 | 	
103 | 	retinanet = torch.nn.DataParallel(retinanet).cuda()
104 | 
105 | 	retinanet.training = True
106 | 
107 | 	optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)
108 | 
109 | 	scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
110 | 
111 | 	loss_hist = collections.deque(maxlen=500)
112 | 
113 | 	retinanet.train()
114 | 	retinanet.module.freeze_bn()
115 | 
116 | 	print('Num training images: {}'.format(len(dataset_train)))
117 | 
118 | 	for epoch_num in range(parser.epochs):
119 | 
120 | 		retinanet.train()
121 | 		retinanet.module.freeze_bn()
122 | 		
123 | 		epoch_loss = []
124 | 		
125 | 		for iter_num, data in enumerate(dataloader_train):
126 | 			try:
127 | 				optimizer.zero_grad()
128 | 
129 | 				classification_loss, regression_loss = retinanet([data['img'].cuda().float(), data['annot']])
130 | 
131 | 				classification_loss = classification_loss.mean()
132 | 				regression_loss = regression_loss.mean()
133 | 
134 | 				loss = classification_loss + regression_loss
135 | 				
136 | 				if bool(loss == 0):
137 | 					continue
138 | 
139 | 				loss.backward()
140 | 
141 | 				torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)
142 | 
143 | 				optimizer.step()
144 | 
145 | 				loss_hist.append(float(loss))
146 | 
147 | 				epoch_loss.append(float(loss))
148 | 
149 | 				print('Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'.format(epoch_num, iter_num, float(classification_loss), float(regression_loss), np.mean(loss_hist)))
150 | 				
151 | 				del classification_loss
152 | 				del regression_loss
153 | 			except Exception as e:
154 | 				print(e)
155 | 				continue
156 | 
157 | 		if parser.dataset == 'coco':
158 | 
159 | 			print('Evaluating dataset')
160 | 
161 | 			coco_eval.evaluate_coco(dataset_val, retinanet)
162 | 
163 | 		elif parser.dataset == 'csv' and parser.csv_val is not None:
164 | 
165 | 			print('Evaluating dataset')
166 | 
167 | 			mAP = csv_eval.evaluate(dataset_val, retinanet)
168 | 
169 | 		
170 | 		scheduler.step(np.mean(epoch_loss))	
171 | 
172 | 		torch.save(retinanet.module, '{}_retinanet_{}.pt'.format(parser.dataset, epoch_num))
173 | 
174 | 	retinanet.eval()
175 | 
176 | 	torch.save(retinanet, 'model_final.pt'.format(epoch_num))
177 | 
178 | if __name__ == '__main__':
179 |  main()
180 | 


--------------------------------------------------------------------------------
/csv_eval.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import numpy as np
  4 | import json
  5 | import os
  6 | 
  7 | import torch
  8 | 
  9 | 
 10 | 
 11 | def compute_overlap(a, b):
 12 |     """
 13 |     Parameters
 14 |     ----------
 15 |     a: (N, 4) ndarray of float
 16 |     b: (K, 4) ndarray of float
 17 |     Returns
 18 |     -------
 19 |     overlaps: (N, K) ndarray of overlap between boxes and query_boxes
 20 |     """
 21 |     area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
 22 | 
 23 |     iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])
 24 |     ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])
 25 | 
 26 |     iw = np.maximum(iw, 0)
 27 |     ih = np.maximum(ih, 0)
 28 | 
 29 |     ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih
 30 | 
 31 |     ua = np.maximum(ua, np.finfo(float).eps)
 32 | 
 33 |     intersection = iw * ih
 34 | 
 35 |     return intersection / ua
 36 | 
 37 | 
 38 | def _compute_ap(recall, precision):
 39 |     """ Compute the average precision, given the recall and precision curves.
 40 |     Code originally from https://github.com/rbgirshick/py-faster-rcnn.
 41 |     # Arguments
 42 |         recall:    The recall curve (list).
 43 |         precision: The precision curve (list).
 44 |     # Returns
 45 |         The average precision as computed in py-faster-rcnn.
 46 |     """
 47 |     # correct AP calculation
 48 |     # first append sentinel values at the end
 49 |     mrec = np.concatenate(([0.], recall, [1.]))
 50 |     mpre = np.concatenate(([0.], precision, [0.]))
 51 | 
 52 |     # compute the precision envelope
 53 |     for i in range(mpre.size - 1, 0, -1):
 54 |         mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
 55 | 
 56 |     # to calculate area under PR curve, look for points
 57 |     # where X axis (recall) changes value
 58 |     i = np.where(mrec[1:] != mrec[:-1])[0]
 59 | 
 60 |     # and sum (\Delta recall) * prec
 61 |     ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
 62 |     return ap
 63 | 
 64 | 
 65 | def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None):
 66 |     """ Get the detections from the retinanet using the generator.
 67 |     The result is a list of lists such that the size is:
 68 |         all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes]
 69 |     # Arguments
 70 |         dataset         : The generator used to run images through the retinanet.
 71 |         retinanet           : The retinanet to run on the images.
 72 |         score_threshold : The score confidence threshold to use.
 73 |         max_detections  : The maximum number of detections to use per image.
 74 |         save_path       : The path to save the images with visualized detections to.
 75 |     # Returns
 76 |         A list of lists containing the detections for each image in the generator.
 77 |     """
 78 |     all_detections = [[None for i in range(dataset.num_classes())] for j in range(len(dataset))]
 79 | 
 80 |     retinanet.eval()
 81 |     
 82 |     with torch.no_grad():
 83 | 
 84 |         for index in range(len(dataset)):
 85 |             data = dataset[index]
 86 |             scale = data['scale']
 87 | 
 88 |             # run network
 89 |             scores, labels, boxes = retinanet(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0))
 90 |             scores = scores.cpu().numpy()
 91 |             labels = labels.cpu().numpy()
 92 |             boxes  = boxes.cpu().numpy()
 93 | 
 94 |             # correct boxes for image scale
 95 |             boxes /= scale
 96 | 
 97 |             # select indices which have a score above the threshold
 98 |             indices = np.where(scores > score_threshold)[0]
 99 |             if indices.shape[0] > 0:
100 |                 # select those scores
101 |                 scores = scores[indices]
102 | 
103 |                 # find the order with which to sort the scores
104 |                 scores_sort = np.argsort(-scores)[:max_detections]
105 | 
106 |                 # select detections
107 |                 image_boxes      = boxes[indices[scores_sort], :]
108 |                 image_scores     = scores[scores_sort]
109 |                 image_labels     = labels[indices[scores_sort]]
110 |                 image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)
111 | 
112 |                 # copy detections to all_detections
113 |                 for label in range(dataset.num_classes()):
114 |                     all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1]
115 |             else:
116 |                 # copy detections to all_detections
117 |                 for label in range(dataset.num_classes()):
118 |                     all_detections[index][label] = np.zeros((0, 5))
119 | 
120 |             print('{}/{}'.format(index + 1, len(dataset)), end='\r')
121 | 
122 |     return all_detections
123 | 
124 | 
125 | def _get_annotations(generator):
126 |     """ Get the ground truth annotations from the generator.
127 |     The result is a list of lists such that the size is:
128 |         all_detections[num_images][num_classes] = annotations[num_detections, 5]
129 |     # Arguments
130 |         generator : The generator used to retrieve ground truth annotations.
131 |     # Returns
132 |         A list of lists containing the annotations for each image in the generator.
133 |     """
134 |     all_annotations = [[None for i in range(generator.num_classes())] for j in range(len(generator))]
135 | 
136 |     for i in range(len(generator)):
137 |         # load the annotations
138 |         annotations = generator.load_annotations(i)
139 | 
140 |         # copy detections to all_annotations
141 |         for label in range(generator.num_classes()):
142 |             all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()
143 | 
144 |         print('{}/{}'.format(i + 1, len(generator)), end='\r')
145 | 
146 |     return all_annotations
147 | 
148 | 
149 | def evaluate(
150 |     generator,
151 |     retinanet,
152 |     iou_threshold=0.5,
153 |     score_threshold=0.05,
154 |     max_detections=100,
155 |     save_path=None
156 | ):
157 |     """ Evaluate a given dataset using a given retinanet.
158 |     # Arguments
159 |         generator       : The generator that represents the dataset to evaluate.
160 |         retinanet           : The retinanet to evaluate.
161 |         iou_threshold   : The threshold used to consider when a detection is positive or negative.
162 |         score_threshold : The score confidence threshold to use for detections.
163 |         max_detections  : The maximum number of detections to use per image.
164 |         save_path       : The path to save images with visualized detections to.
165 |     # Returns
166 |         A dict mapping class names to mAP scores.
167 |     """
168 | 
169 | 
170 | 
171 |     # gather all detections and annotations
172 | 
173 |     all_detections     = _get_detections(generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path)
174 |     all_annotations    = _get_annotations(generator)
175 | 
176 |     average_precisions = {}
177 | 
178 |     for label in range(generator.num_classes()):
179 |         false_positives = np.zeros((0,))
180 |         true_positives  = np.zeros((0,))
181 |         scores          = np.zeros((0,))
182 |         num_annotations = 0.0
183 | 
184 |         for i in range(len(generator)):
185 |             detections           = all_detections[i][label]
186 |             annotations          = all_annotations[i][label]
187 |             num_annotations     += annotations.shape[0]
188 |             detected_annotations = []
189 | 
190 |             for d in detections:
191 |                 scores = np.append(scores, d[4])
192 | 
193 |                 if annotations.shape[0] == 0:
194 |                     false_positives = np.append(false_positives, 1)
195 |                     true_positives  = np.append(true_positives, 0)
196 |                     continue
197 | 
198 |                 overlaps            = compute_overlap(np.expand_dims(d, axis=0), annotations)
199 |                 assigned_annotation = np.argmax(overlaps, axis=1)
200 |                 max_overlap         = overlaps[0, assigned_annotation]
201 | 
202 |                 if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
203 |                     false_positives = np.append(false_positives, 0)
204 |                     true_positives  = np.append(true_positives, 1)
205 |                     detected_annotations.append(assigned_annotation)
206 |                 else:
207 |                     false_positives = np.append(false_positives, 1)
208 |                     true_positives  = np.append(true_positives, 0)
209 | 
210 |         # no annotations -> AP for this class is 0 (is this correct?)
211 |         if num_annotations == 0:
212 |             average_precisions[label] = 0, 0
213 |             continue
214 | 
215 |         # sort by score
216 |         indices         = np.argsort(-scores)
217 |         false_positives = false_positives[indices]
218 |         true_positives  = true_positives[indices]
219 | 
220 |         # compute false positives and true positives
221 |         false_positives = np.cumsum(false_positives)
222 |         true_positives  = np.cumsum(true_positives)
223 | 
224 |         # compute recall and precision
225 |         recall    = true_positives / num_annotations
226 |         precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)
227 | 
228 |         # compute average precision
229 |         average_precision  = _compute_ap(recall, precision)
230 |         average_precisions[label] = average_precision, num_annotations
231 |     
232 |     print('\nmAP:')
233 |     for label in range(generator.num_classes()):
234 |         label_name = generator.label_to_name(label)
235 |         print('{}: {}'.format(label_name, average_precisions[label][0]))
236 |     
237 |     return average_precisions
238 | 
239 | 


--------------------------------------------------------------------------------
/oid_dataset.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function, division
  2 | 
  3 | import csv
  4 | import json
  5 | import os
  6 | import warnings
  7 | 
  8 | import numpy as np
  9 | import skimage
 10 | import skimage.color
 11 | import skimage.io
 12 | import skimage.transform
 13 | from PIL import Image
 14 | from torch.utils.data import Dataset
 15 | 
 16 | 
 17 | def get_labels(metadata_dir, version='v4'):
 18 |     if version == 'v4' or version == 'challenge2018':
 19 |         csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv'
 20 | 
 21 |         boxable_classes_descriptions = os.path.join(metadata_dir, csv_file)
 22 |         id_to_labels = {}
 23 |         cls_index = {}
 24 | 
 25 |         i = 0
 26 |         with open(boxable_classes_descriptions) as f:
 27 |             for row in csv.reader(f):
 28 |                 # make sure the csv row is not empty (usually the last one)
 29 |                 if len(row):
 30 |                     label = row[0]
 31 |                     description = row[1].replace("\"", "").replace("'", "").replace('`', '')
 32 | 
 33 |                     id_to_labels[i] = description
 34 |                     cls_index[label] = i
 35 | 
 36 |                     i += 1
 37 |     else:
 38 |         trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt')
 39 |         description_path = os.path.join(metadata_dir, 'class-descriptions.csv')
 40 | 
 41 |         description_table = {}
 42 |         with open(description_path) as f:
 43 |             for row in csv.reader(f):
 44 |                 # make sure the csv row is not empty (usually the last one)
 45 |                 if len(row):
 46 |                     description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '')
 47 | 
 48 |         with open(trainable_classes_path, 'rb') as f:
 49 |             trainable_classes = f.read().split('\n')
 50 | 
 51 |         id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)])
 52 |         cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)])
 53 | 
 54 |     return id_to_labels, cls_index
 55 | 
 56 | 
 57 | def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'):
 58 |     validation_image_ids = {}
 59 | 
 60 |     if version == 'v4':
 61 |         annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset))
 62 |     elif version == 'challenge2018':
 63 |         validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv')
 64 | 
 65 |         with open(validation_image_ids_path, 'r') as csv_file:
 66 |             reader = csv.DictReader(csv_file, fieldnames=['ImageID'])
 67 |             reader.next()
 68 |             for line, row in enumerate(reader):
 69 |                 image_id = row['ImageID']
 70 |                 validation_image_ids[image_id] = True
 71 | 
 72 |         annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv')
 73 |     else:
 74 |         annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv')
 75 | 
 76 |     fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence',
 77 |                   'XMin', 'XMax', 'YMin', 'YMax',
 78 |                   'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside']
 79 | 
 80 |     id_annotations = dict()
 81 |     with open(annotations_path, 'r') as csv_file:
 82 |         reader = csv.DictReader(csv_file, fieldnames=fieldnames)
 83 |         next(reader)
 84 | 
 85 |         images_sizes = {}
 86 |         for line, row in enumerate(reader):
 87 |             frame = row['ImageID']
 88 | 
 89 |             if version == 'challenge2018':
 90 |                 if subset == 'train':
 91 |                     if frame in validation_image_ids:
 92 |                         continue
 93 |                 elif subset == 'validation':
 94 |                     if frame not in validation_image_ids:
 95 |                         continue
 96 |                 else:
 97 |                     raise NotImplementedError('This generator handles only the train and validation subsets')
 98 | 
 99 |             class_name = row['LabelName']
100 | 
101 |             if class_name not in cls_index:
102 |                 continue
103 | 
104 |             cls_id = cls_index[class_name]
105 | 
106 |             if version == 'challenge2018':
107 |                 # We recommend participants to use the provided subset of the training set as a validation set.
108 |                 # This is preferable over using the V4 val/test sets, as the training set is more densely annotated.
109 |                 img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg')
110 |             else:
111 |                 img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg')
112 | 
113 |             if frame in images_sizes:
114 |                 width, height = images_sizes[frame]
115 |             else:
116 |                 try:
117 |                     with Image.open(img_path) as img:
118 |                         width, height = img.width, img.height
119 |                         images_sizes[frame] = (width, height)
120 |                 except Exception as ex:
121 |                     if version == 'challenge2018':
122 |                         raise ex
123 |                     continue
124 | 
125 |             x1 = float(row['XMin'])
126 |             x2 = float(row['XMax'])
127 |             y1 = float(row['YMin'])
128 |             y2 = float(row['YMax'])
129 | 
130 |             x1_int = int(round(x1 * width))
131 |             x2_int = int(round(x2 * width))
132 |             y1_int = int(round(y1 * height))
133 |             y2_int = int(round(y2 * height))
134 | 
135 |             # Check that the bounding box is valid.
136 |             if x2 <= x1:
137 |                 raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
138 |             if y2 <= y1:
139 |                 raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))
140 | 
141 |             if y2_int == y1_int:
142 |                 warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1))
143 |                 continue
144 | 
145 |             if x2_int == x1_int:
146 |                 warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1))
147 |                 continue
148 | 
149 |             img_id = row['ImageID']
150 |             annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2}
151 | 
152 |             if img_id in id_annotations:
153 |                 annotations = id_annotations[img_id]
154 |                 annotations['boxes'].append(annotation)
155 |             else:
156 |                 id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]}
157 |     return id_annotations
158 | 
159 | 
160 | class OidDataset(Dataset):
161 |     """Oid dataset."""
162 | 
163 |     def __init__(self, main_dir, subset, version='v4', annotation_cache_dir='.', transform=None):
164 |         if version == 'v4':
165 |             metadata = '2018_04'
166 |         elif version == 'challenge2018':
167 |             metadata = 'challenge2018'
168 |         elif version == 'v3':
169 |             metadata = '2017_11'
170 |         else:
171 |             raise NotImplementedError('There is currently no implementation for versions older than v3')
172 | 
173 |         self.transform = transform
174 | 
175 |         if version == 'challenge2018':
176 |             self.base_dir = os.path.join(main_dir, 'images', 'train')
177 |         else:
178 |             self.base_dir = os.path.join(main_dir, 'images', subset)
179 | 
180 |         metadata_dir = os.path.join(main_dir, metadata)
181 |         annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json')
182 | 
183 |         self.id_to_labels, cls_index = get_labels(metadata_dir, version=version)
184 | 
185 |         if os.path.exists(annotation_cache_json):
186 |             with open(annotation_cache_json, 'r') as f:
187 |                 self.annotations = json.loads(f.read())
188 |         else:
189 |             self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index,
190 |                                                                 version=version)
191 |             json.dump(self.annotations, open(annotation_cache_json, "w"))
192 | 
193 |         self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)])
194 | 
195 |         # (label -> name)
196 |         self.labels = self.id_to_labels
197 | 
198 |     def __len__(self):
199 |         return len(self.annotations)
200 | 
201 |     def __getitem__(self, idx):
202 | 
203 |         img = self.load_image(idx)
204 |         annot = self.load_annotations(idx)
205 |         sample = {'img': img, 'annot': annot}
206 |         if self.transform:
207 |             sample = self.transform(sample)
208 | 
209 |         return sample
210 | 
211 |     def image_path(self, image_index):
212 |         path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg')
213 |         return path
214 | 
215 |     def load_image(self, image_index):
216 |         path = self.image_path(image_index)
217 |         img = skimage.io.imread(path)
218 | 
219 |         if len(img.shape) == 1:
220 |             img = img[0]
221 | 
222 |         if len(img.shape) == 2:
223 |             img = skimage.color.gray2rgb(img)
224 | 
225 |         try:
226 |             return img.astype(np.float32) / 255.0
227 |         except Exception:
228 |             print (path)
229 |             exit(0)
230 | 
231 |     def load_annotations(self, image_index):
232 |         # get ground truth annotations
233 |         image_annotations = self.annotations[self.id_to_image_id[image_index]]
234 | 
235 |         labels = image_annotations['boxes']
236 |         height, width = image_annotations['h'], image_annotations['w']
237 | 
238 |         boxes = np.zeros((len(labels), 5))
239 |         for idx, ann in enumerate(labels):
240 |             cls_id = ann['cls_id']
241 |             x1 = ann['x1'] * width
242 |             x2 = ann['x2'] * width
243 |             y1 = ann['y1'] * height
244 |             y2 = ann['y2'] * height
245 | 
246 |             boxes[idx, 0] = x1
247 |             boxes[idx, 1] = y1
248 |             boxes[idx, 2] = x2
249 |             boxes[idx, 3] = y2
250 |             boxes[idx, 4] = cls_id
251 | 
252 |         return boxes
253 | 
254 |     def image_aspect_ratio(self, image_index):
255 |         img_annotations = self.annotations[self.id_to_image_id[image_index]]
256 |         height, width = img_annotations['h'], img_annotations['w']
257 |         return float(width) / float(height)
258 | 
259 |     def num_classes(self):
260 |         return len(self.id_to_labels)
261 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import torch
  3 | import math
  4 | import time
  5 | import torch.utils.model_zoo as model_zoo
  6 | from utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes
  7 | from anchors import Anchors
  8 | import losses
  9 | from lib.nms.pth_nms import pth_nms
 10 | 
 11 | def nms(dets, thresh):
 12 |     "Dispatch to either CPU or GPU NMS implementations.\
 13 |     Accept dets as tensor"""
 14 |     return pth_nms(dets, thresh)
 15 | 
 16 | model_urls = {
 17 |     'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
 18 |     'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
 19 |     'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
 20 |     'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
 21 |     'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
 22 | }
 23 | 
 24 | class PyramidFeatures(nn.Module):
 25 |     def __init__(self, C3_size, C4_size, C5_size, feature_size=256):
 26 |         super(PyramidFeatures, self).__init__()
 27 |         
 28 |         # upsample C5 to get P5 from the FPN paper
 29 |         self.P5_1           = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0)
 30 |         self.P5_upsampled   = nn.Upsample(scale_factor=2, mode='nearest')
 31 |         self.P5_2           = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
 32 | 
 33 |         # add P5 elementwise to C4
 34 |         self.P4_1           = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0)
 35 |         self.P4_upsampled   = nn.Upsample(scale_factor=2, mode='nearest')
 36 |         self.P4_2           = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
 37 | 
 38 |         # add P4 elementwise to C3
 39 |         self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0)
 40 |         self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
 41 | 
 42 |         # "P6 is obtained via a 3x3 stride-2 conv on C5"
 43 |         self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1)
 44 | 
 45 |         # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6"
 46 |         self.P7_1 = nn.ReLU()
 47 |         self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1)
 48 | 
 49 |     def forward(self, inputs):
 50 | 
 51 |         C3, C4, C5 = inputs
 52 | 
 53 |         P5_x = self.P5_1(C5)
 54 |         P5_upsampled_x = self.P5_upsampled(P5_x)
 55 |         P5_x = self.P5_2(P5_x)
 56 |         
 57 |         P4_x = self.P4_1(C4)
 58 |         P4_x = P5_upsampled_x + P4_x
 59 |         P4_upsampled_x = self.P4_upsampled(P4_x)
 60 |         P4_x = self.P4_2(P4_x)
 61 | 
 62 |         P3_x = self.P3_1(C3)
 63 |         P3_x = P3_x + P4_upsampled_x
 64 |         P3_x = self.P3_2(P3_x)
 65 | 
 66 |         P6_x = self.P6(C5)
 67 | 
 68 |         P7_x = self.P7_1(P6_x)
 69 |         P7_x = self.P7_2(P7_x)
 70 | 
 71 |         return [P3_x, P4_x, P5_x, P6_x, P7_x]
 72 | 
 73 | 
 74 | class RegressionModel(nn.Module):
 75 |     def __init__(self, num_features_in, num_anchors=9, feature_size=256):
 76 |         super(RegressionModel, self).__init__()
 77 |         
 78 |         self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
 79 |         self.act1 = nn.ReLU()
 80 | 
 81 |         self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
 82 |         self.act2 = nn.ReLU()
 83 | 
 84 |         self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
 85 |         self.act3 = nn.ReLU()
 86 | 
 87 |         self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
 88 |         self.act4 = nn.ReLU()
 89 | 
 90 |         self.output = nn.Conv2d(feature_size, num_anchors*4, kernel_size=3, padding=1)
 91 | 
 92 |     def forward(self, x):
 93 | 
 94 |         out = self.conv1(x)
 95 |         out = self.act1(out)
 96 | 
 97 |         out = self.conv2(out)
 98 |         out = self.act2(out)
 99 | 
100 |         out = self.conv3(out)
101 |         out = self.act3(out)
102 | 
103 |         out = self.conv4(out)
104 |         out = self.act4(out)
105 | 
106 |         out = self.output(out)
107 | 
108 |         # out is B x C x W x H, with C = 4*num_anchors
109 |         out = out.permute(0, 2, 3, 1)
110 | 
111 |         return out.contiguous().view(out.shape[0], -1, 4)
112 | 
113 | class ClassificationModel(nn.Module):
114 |     def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256):
115 |         super(ClassificationModel, self).__init__()
116 | 
117 |         self.num_classes = num_classes
118 |         self.num_anchors = num_anchors
119 |         
120 |         self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
121 |         self.act1 = nn.ReLU()
122 | 
123 |         self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
124 |         self.act2 = nn.ReLU()
125 | 
126 |         self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
127 |         self.act3 = nn.ReLU()
128 | 
129 |         self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
130 |         self.act4 = nn.ReLU()
131 | 
132 |         self.output = nn.Conv2d(feature_size, num_anchors*num_classes, kernel_size=3, padding=1)
133 |         self.output_act = nn.Sigmoid()
134 | 
135 |     def forward(self, x):
136 | 
137 |         out = self.conv1(x)
138 |         out = self.act1(out)
139 | 
140 |         out = self.conv2(out)
141 |         out = self.act2(out)
142 | 
143 |         out = self.conv3(out)
144 |         out = self.act3(out)
145 | 
146 |         out = self.conv4(out)
147 |         out = self.act4(out)
148 | 
149 |         out = self.output(out)
150 |         out = self.output_act(out)
151 | 
152 |         # out is B x C x W x H, with C = n_classes + n_anchors
153 |         out1 = out.permute(0, 2, 3, 1)
154 | 
155 |         batch_size, width, height, channels = out1.shape
156 | 
157 |         out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes)
158 | 
159 |         return out2.contiguous().view(x.shape[0], -1, self.num_classes)
160 | 
161 | class ResNet(nn.Module):
162 | 
163 |     def __init__(self, num_classes, block, layers):
164 |         self.inplanes = 64
165 |         super(ResNet, self).__init__()
166 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
167 |         self.bn1 = nn.BatchNorm2d(64)
168 |         self.relu = nn.ReLU(inplace=True)
169 |         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
170 |         self.layer1 = self._make_layer(block, 64, layers[0])
171 |         self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
172 |         self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
173 |         self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
174 | 
175 |         if block == BasicBlock:
176 |             fpn_sizes = [self.layer2[layers[1]-1].conv2.out_channels, self.layer3[layers[2]-1].conv2.out_channels, self.layer4[layers[3]-1].conv2.out_channels]
177 |         elif block == Bottleneck:
178 |             fpn_sizes = [self.layer2[layers[1]-1].conv3.out_channels, self.layer3[layers[2]-1].conv3.out_channels, self.layer4[layers[3]-1].conv3.out_channels]
179 | 
180 |         self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2])
181 | 
182 |         self.regressionModel = RegressionModel(256)
183 |         self.classificationModel = ClassificationModel(256, num_classes=num_classes)
184 | 
185 |         self.anchors = Anchors()
186 | 
187 |         self.regressBoxes = BBoxTransform()
188 | 
189 |         self.clipBoxes = ClipBoxes()
190 |         
191 |         self.focalLoss = losses.FocalLoss()
192 |                 
193 |         for m in self.modules():
194 |             if isinstance(m, nn.Conv2d):
195 |                 n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
196 |                 m.weight.data.normal_(0, math.sqrt(2. / n))
197 |             elif isinstance(m, nn.BatchNorm2d):
198 |                 m.weight.data.fill_(1)
199 |                 m.bias.data.zero_()
200 | 
201 |         prior = 0.01
202 |         
203 |         self.classificationModel.output.weight.data.fill_(0)
204 |         self.classificationModel.output.bias.data.fill_(-math.log((1.0-prior)/prior))
205 | 
206 |         self.regressionModel.output.weight.data.fill_(0)
207 |         self.regressionModel.output.bias.data.fill_(0)
208 | 
209 |         self.freeze_bn()
210 | 
211 |     def _make_layer(self, block, planes, blocks, stride=1):
212 |         downsample = None
213 |         if stride != 1 or self.inplanes != planes * block.expansion:
214 |             downsample = nn.Sequential(
215 |                 nn.Conv2d(self.inplanes, planes * block.expansion,
216 |                           kernel_size=1, stride=stride, bias=False),
217 |                 nn.BatchNorm2d(planes * block.expansion),
218 |             )
219 | 
220 |         layers = []
221 |         layers.append(block(self.inplanes, planes, stride, downsample))
222 |         self.inplanes = planes * block.expansion
223 |         for i in range(1, blocks):
224 |             layers.append(block(self.inplanes, planes))
225 | 
226 |         return nn.Sequential(*layers)
227 | 
228 |     def freeze_bn(self):
229 |         '''Freeze BatchNorm layers.'''
230 |         for layer in self.modules():
231 |             if isinstance(layer, nn.BatchNorm2d):
232 |                 layer.eval()
233 | 
234 |     def forward(self, inputs):
235 | 
236 |         if self.training:
237 |             img_batch, annotations = inputs
238 |         else:
239 |             img_batch = inputs
240 |             
241 |         x = self.conv1(img_batch)
242 |         x = self.bn1(x)
243 |         x = self.relu(x)
244 |         x = self.maxpool(x)
245 | 
246 |         x1 = self.layer1(x)
247 |         x2 = self.layer2(x1)
248 |         x3 = self.layer3(x2)
249 |         x4 = self.layer4(x3)
250 | 
251 |         features = self.fpn([x2, x3, x4])
252 | 
253 |         regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1)
254 | 
255 |         classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1)
256 | 
257 |         anchors = self.anchors(img_batch)
258 | 
259 |         if self.training:
260 |             return self.focalLoss(classification, regression, anchors, annotations)
261 |         else:
262 |             transformed_anchors = self.regressBoxes(anchors, regression)
263 |             transformed_anchors = self.clipBoxes(transformed_anchors, img_batch)
264 | 
265 |             scores = torch.max(classification, dim=2, keepdim=True)[0]
266 | 
267 |             scores_over_thresh = (scores>0.05)[0, :, 0]
268 | 
269 |             if scores_over_thresh.sum() == 0:
270 |                 # no boxes to NMS, just return
271 |                 return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)]
272 | 
273 |             classification = classification[:, scores_over_thresh, :]
274 |             transformed_anchors = transformed_anchors[:, scores_over_thresh, :]
275 |             scores = scores[:, scores_over_thresh, :]
276 | 
277 |             anchors_nms_idx = nms(torch.cat([transformed_anchors, scores], dim=2)[0, :, :], 0.5)
278 | 
279 |             nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1)
280 | 
281 |             return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]]
282 | 
283 | 
284 | 
285 | def resnet18(num_classes, pretrained=False, **kwargs):
286 |     """Constructs a ResNet-18 model.
287 |     Args:
288 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
289 |     """
290 |     model = ResNet(num_classes, BasicBlock, [2, 2, 2, 2], **kwargs)
291 |     if pretrained:
292 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='.'), strict=False)
293 |     return model
294 | 
295 | 
296 | def resnet34(num_classes, pretrained=False, **kwargs):
297 |     """Constructs a ResNet-34 model.
298 |     Args:
299 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
300 |     """
301 |     model = ResNet(num_classes, BasicBlock, [3, 4, 6, 3], **kwargs)
302 |     if pretrained:
303 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='.'), strict=False)
304 |     return model
305 | 
306 | 
307 | def resnet50(num_classes, pretrained=False, **kwargs):
308 |     """Constructs a ResNet-50 model.
309 |     Args:
310 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
311 |     """
312 |     model = ResNet(num_classes, Bottleneck, [3, 4, 6, 3], **kwargs)
313 |     if pretrained:
314 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='.'), strict=False)
315 |     return model
316 | 
317 | def resnet101(num_classes, pretrained=False, **kwargs):
318 |     """Constructs a ResNet-101 model.
319 |     Args:
320 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
321 |     """
322 |     model = ResNet(num_classes, Bottleneck, [3, 4, 23, 3], **kwargs)
323 |     if pretrained:
324 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='.'), strict=False)
325 |     return model
326 | 
327 | 
328 | def resnet152(num_classes, pretrained=False, **kwargs):
329 |     """Constructs a ResNet-152 model.
330 |     Args:
331 |         pretrained (bool): If True, returns a model pre-trained on ImageNet
332 |     """
333 |     model = ResNet(num_classes, Bottleneck, [3, 8, 36, 3], **kwargs)
334 |     if pretrained:
335 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='.'), strict=False)
336 |     return model


--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function, division
  2 | import sys
  3 | import os
  4 | import torch
  5 | import numpy as np
  6 | import random
  7 | import csv
  8 | 
  9 | from torch.utils.data import Dataset, DataLoader
 10 | from torchvision import transforms, utils
 11 | from torch.utils.data.sampler import Sampler
 12 | 
 13 | from pycocotools.coco import COCO
 14 | 
 15 | import skimage.io
 16 | import skimage.transform
 17 | import skimage.color
 18 | import skimage
 19 | 
 20 | from PIL import Image
 21 | 
 22 | 
 23 | class CocoDataset(Dataset):
 24 |     """Coco dataset."""
 25 | 
 26 |     def __init__(self, root_dir, set_name='train2017', transform=None):
 27 |         """
 28 |         Args:
 29 |             root_dir (string): COCO directory.
 30 |             transform (callable, optional): Optional transform to be applied
 31 |                 on a sample.
 32 |         """
 33 |         self.root_dir = root_dir
 34 |         self.set_name = set_name
 35 |         self.transform = transform
 36 | 
 37 |         self.coco      = COCO(os.path.join(self.root_dir, 'annotations', 'instances_' + self.set_name + '.json'))
 38 |         self.image_ids = self.coco.getImgIds()
 39 | 
 40 |         self.load_classes()
 41 | 
 42 |     def load_classes(self):
 43 |         # load class names (name -> label)
 44 |         categories = self.coco.loadCats(self.coco.getCatIds())
 45 |         categories.sort(key=lambda x: x['id'])
 46 | 
 47 |         self.classes             = {}
 48 |         self.coco_labels         = {}
 49 |         self.coco_labels_inverse = {}
 50 |         for c in categories:
 51 |             self.coco_labels[len(self.classes)] = c['id']
 52 |             self.coco_labels_inverse[c['id']] = len(self.classes)
 53 |             self.classes[c['name']] = len(self.classes)
 54 | 
 55 |         # also load the reverse (label -> name)
 56 |         self.labels = {}
 57 |         for key, value in self.classes.items():
 58 |             self.labels[value] = key
 59 | 
 60 |     def __len__(self):
 61 |         return len(self.image_ids)
 62 | 
 63 |     def __getitem__(self, idx):
 64 | 
 65 |         img = self.load_image(idx)
 66 |         annot = self.load_annotations(idx)
 67 |         sample = {'img': img, 'annot': annot}
 68 |         if self.transform:
 69 |             sample = self.transform(sample)
 70 | 
 71 |         return sample
 72 | 
 73 |     def load_image(self, image_index):
 74 |         image_info = self.coco.loadImgs(self.image_ids[image_index])[0]
 75 |         path       = os.path.join(self.root_dir, 'images', self.set_name, image_info['file_name'])
 76 |         img = skimage.io.imread(path)
 77 | 
 78 |         if len(img.shape) == 2:
 79 |             img = skimage.color.gray2rgb(img)
 80 | 
 81 |         return img.astype(np.float32)/255.0
 82 | 
 83 |     def load_annotations(self, image_index):
 84 |         # get ground truth annotations
 85 |         annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False)
 86 |         annotations     = np.zeros((0, 5))
 87 | 
 88 |         # some images appear to miss annotations (like image with id 257034)
 89 |         if len(annotations_ids) == 0:
 90 |             return annotations
 91 | 
 92 |         # parse annotations
 93 |         coco_annotations = self.coco.loadAnns(annotations_ids)
 94 |         for idx, a in enumerate(coco_annotations):
 95 | 
 96 |             # some annotations have basically no width / height, skip them
 97 |             if a['bbox'][2] < 1 or a['bbox'][3] < 1:
 98 |                 continue
 99 | 
100 |             annotation        = np.zeros((1, 5))
101 |             annotation[0, :4] = a['bbox']
102 |             annotation[0, 4]  = self.coco_label_to_label(a['category_id'])
103 |             annotations       = np.append(annotations, annotation, axis=0)
104 | 
105 |         # transform from [x, y, w, h] to [x1, y1, x2, y2]
106 |         annotations[:, 2] = annotations[:, 0] + annotations[:, 2]
107 |         annotations[:, 3] = annotations[:, 1] + annotations[:, 3]
108 | 
109 |         return annotations
110 | 
111 |     def coco_label_to_label(self, coco_label):
112 |         return self.coco_labels_inverse[coco_label]
113 | 
114 | 
115 |     def label_to_coco_label(self, label):
116 |         return self.coco_labels[label]
117 | 
118 |     def image_aspect_ratio(self, image_index):
119 |         image = self.coco.loadImgs(self.image_ids[image_index])[0]
120 |         return float(image['width']) / float(image['height'])
121 | 
122 |     def num_classes(self):
123 |         return 80
124 | 
125 | 
126 | class CSVDataset(Dataset):
127 |     """CSV dataset."""
128 | 
129 |     def __init__(self, train_file, class_list, transform=None):
130 |         """
131 |         Args:
132 |             train_file (string): CSV file with training annotations
133 |             annotations (string): CSV file with class list
134 |             test_file (string, optional): CSV file with testing annotations
135 |         """
136 |         self.train_file = train_file
137 |         self.class_list = class_list
138 |         self.transform = transform
139 | 
140 |         # parse the provided class file
141 |         try:
142 |             with self._open_for_csv(self.class_list) as file:
143 |                 self.classes = self.load_classes(csv.reader(file, delimiter=','))
144 |         except ValueError as e:
145 |             raise_from(ValueError('invalid CSV class file: {}: {}'.format(self.class_list, e)), None)
146 | 
147 |         self.labels = {}
148 |         for key, value in self.classes.items():
149 |             self.labels[value] = key
150 | 
151 |         # csv with img_path, x1, y1, x2, y2, class_name
152 |         try:
153 |             with self._open_for_csv(self.train_file) as file:
154 |                 self.image_data = self._read_annotations(csv.reader(file, delimiter=','), self.classes)
155 |         except ValueError as e:
156 |             raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(self.train_file, e)), None)
157 |         self.image_names = list(self.image_data.keys())
158 | 
159 |     def _parse(self, value, function, fmt):
160 |         """
161 |         Parse a string into a value, and format a nice ValueError if it fails.
162 |         Returns `function(value)`.
163 |         Any `ValueError` raised is catched and a new `ValueError` is raised
164 |         with message `fmt.format(e)`, where `e` is the caught `ValueError`.
165 |         """
166 |         try:
167 |             return function(value)
168 |         except ValueError as e:
169 |             raise_from(ValueError(fmt.format(e)), None)
170 | 
171 |     def _open_for_csv(self, path):
172 |         """
173 |         Open a file with flags suitable for csv.reader.
174 |         This is different for python2 it means with mode 'rb',
175 |         for python3 this means 'r' with "universal newlines".
176 |         """
177 |         if sys.version_info[0] < 3:
178 |             return open(path, 'rb')
179 |         else:
180 |             return open(path, 'r', newline='')
181 | 
182 | 
183 |     def load_classes(self, csv_reader):
184 |         result = {}
185 | 
186 |         for line, row in enumerate(csv_reader):
187 |             line += 1
188 | 
189 |             try:
190 |                 class_name, class_id = row
191 |             except ValueError:
192 |                 raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None)
193 |             class_id = self._parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line))
194 | 
195 |             if class_name in result:
196 |                 raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name))
197 |             result[class_name] = class_id
198 |         return result
199 | 
200 | 
201 |     def __len__(self):
202 |         return len(self.image_names)
203 | 
204 |     def __getitem__(self, idx):
205 | 
206 |         img = self.load_image(idx)
207 |         annot = self.load_annotations(idx)
208 |         sample = {'img': img, 'annot': annot}
209 |         if self.transform:
210 |             sample = self.transform(sample)
211 | 
212 |         return sample
213 | 
214 |     def load_image(self, image_index):
215 |         img = skimage.io.imread(self.image_names[image_index])
216 | 
217 |         if len(img.shape) == 2:
218 |             img = skimage.color.gray2rgb(img)
219 | 
220 |         return img.astype(np.float32)/255.0
221 | 
222 |     def load_annotations(self, image_index):
223 |         # get ground truth annotations
224 |         annotation_list = self.image_data[self.image_names[image_index]]
225 |         annotations     = np.zeros((0, 5))
226 | 
227 |         # some images appear to miss annotations (like image with id 257034)
228 |         if len(annotation_list) == 0:
229 |             return annotations
230 | 
231 |         # parse annotations
232 |         for idx, a in enumerate(annotation_list):
233 |             # some annotations have basically no width / height, skip them
234 |             x1 = a['x1']
235 |             x2 = a['x2']
236 |             y1 = a['y1']
237 |             y2 = a['y2']
238 | 
239 |             if (x2-x1) < 1 or (y2-y1) < 1:
240 |                 continue
241 | 
242 |             annotation        = np.zeros((1, 5))
243 |             
244 |             annotation[0, 0] = x1
245 |             annotation[0, 1] = y1
246 |             annotation[0, 2] = x2
247 |             annotation[0, 3] = y2
248 | 
249 |             annotation[0, 4]  = self.name_to_label(a['class'])
250 |             annotations       = np.append(annotations, annotation, axis=0)
251 | 
252 |         return annotations
253 | 
254 |     def _read_annotations(self, csv_reader, classes):
255 |         result = {}
256 |         for line, row in enumerate(csv_reader):
257 |             line += 1
258 | 
259 |             try:
260 |                 img_file, x1, y1, x2, y2, class_name = row[:6]
261 |             except ValueError:
262 |                 raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None)
263 | 
264 |             if img_file not in result:
265 |                 result[img_file] = []
266 | 
267 |             # If a row contains only an image path, it's an image without annotations.
268 |             if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''):
269 |                 continue
270 | 
271 |             x1 = self._parse(x1, int, 'line {}: malformed x1: {{}}'.format(line))
272 |             y1 = self._parse(y1, int, 'line {}: malformed y1: {{}}'.format(line))
273 |             x2 = self._parse(x2, int, 'line {}: malformed x2: {{}}'.format(line))
274 |             y2 = self._parse(y2, int, 'line {}: malformed y2: {{}}'.format(line))
275 | 
276 |             # Check that the bounding box is valid.
277 |             if x2 <= x1:
278 |                 raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
279 |             if y2 <= y1:
280 |                 raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))
281 | 
282 |             # check if the current class name is correctly present
283 |             if class_name not in classes:
284 |                 raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes))
285 | 
286 |             result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name})
287 |         return result
288 | 
289 |     def name_to_label(self, name):
290 |         return self.classes[name]
291 | 
292 |     def label_to_name(self, label):
293 |         return self.labels[label]
294 | 
295 |     def num_classes(self):
296 |         return max(self.classes.values()) + 1
297 | 
298 |     def image_aspect_ratio(self, image_index):
299 |         image = Image.open(self.image_names[image_index])
300 |         return float(image.width) / float(image.height)
301 | 
302 | 
303 | def collater(data):
304 | 
305 |     imgs = [s['img'] for s in data]
306 |     annots = [s['annot'] for s in data]
307 |     scales = [s['scale'] for s in data]
308 |         
309 |     widths = [int(s.shape[0]) for s in imgs]
310 |     heights = [int(s.shape[1]) for s in imgs]
311 |     batch_size = len(imgs)
312 | 
313 |     max_width = np.array(widths).max()
314 |     max_height = np.array(heights).max()
315 | 
316 |     padded_imgs = torch.zeros(batch_size, max_width, max_height, 3)
317 | 
318 |     for i in range(batch_size):
319 |         img = imgs[i]
320 |         padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img
321 | 
322 |     max_num_annots = max(annot.shape[0] for annot in annots)
323 |     
324 |     if max_num_annots > 0:
325 | 
326 |         annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1
327 | 
328 |         if max_num_annots > 0:
329 |             for idx, annot in enumerate(annots):
330 |                 #print(annot.shape)
331 |                 if annot.shape[0] > 0:
332 |                     annot_padded[idx, :annot.shape[0], :] = annot
333 |     else:
334 |         annot_padded = torch.ones((len(annots), 1, 5)) * -1
335 | 
336 | 
337 |     padded_imgs = padded_imgs.permute(0, 3, 1, 2)
338 | 
339 |     return {'img': padded_imgs, 'annot': annot_padded, 'scale': scales}
340 | 
341 | class Resizer(object):
342 |     """Convert ndarrays in sample to Tensors."""
343 | 
344 |     def __call__(self, sample, min_side=608, max_side=1024):
345 |         image, annots = sample['img'], sample['annot']
346 | 
347 |         rows, cols, cns = image.shape
348 | 
349 |         smallest_side = min(rows, cols)
350 | 
351 |         # rescale the image so the smallest side is min_side
352 |         scale = min_side / smallest_side
353 | 
354 |         # check if the largest side is now greater than max_side, which can happen
355 |         # when images have a large aspect ratio
356 |         largest_side = max(rows, cols)
357 | 
358 |         if largest_side * scale > max_side:
359 |             scale = max_side / largest_side
360 | 
361 |         # resize the image with the computed scale
362 |         image = skimage.transform.resize(image, (int(round(rows*scale)), int(round((cols*scale)))))
363 |         rows, cols, cns = image.shape
364 | 
365 |         pad_w = 32 - rows%32
366 |         pad_h = 32 - cols%32
367 | 
368 |         new_image = np.zeros((rows + pad_w, cols + pad_h, cns)).astype(np.float32)
369 |         new_image[:rows, :cols, :] = image.astype(np.float32)
370 | 
371 |         annots[:, :4] *= scale
372 | 
373 |         return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale}
374 | 
375 | 
376 | class Augmenter(object):
377 |     """Convert ndarrays in sample to Tensors."""
378 | 
379 |     def __call__(self, sample, flip_x=0.5):
380 | 
381 |         if np.random.rand() < flip_x:
382 |             image, annots = sample['img'], sample['annot']
383 |             image = image[:, ::-1, :]
384 | 
385 |             rows, cols, channels = image.shape
386 | 
387 |             x1 = annots[:, 0].copy()
388 |             x2 = annots[:, 2].copy()
389 |             
390 |             x_tmp = x1.copy()
391 | 
392 |             annots[:, 0] = cols - x2
393 |             annots[:, 2] = cols - x_tmp
394 | 
395 |             sample = {'img': image, 'annot': annots}
396 | 
397 |         return sample
398 | 
399 | 
400 | class Normalizer(object):
401 | 
402 |     def __init__(self):
403 |         self.mean = np.array([[[0.485, 0.456, 0.406]]])
404 |         self.std = np.array([[[0.229, 0.224, 0.225]]])
405 | 
406 |     def __call__(self, sample):
407 | 
408 |         image, annots = sample['img'], sample['annot']
409 | 
410 |         return {'img':((image.astype(np.float32)-self.mean)/self.std), 'annot': annots}
411 | 
412 | class UnNormalizer(object):
413 |     def __init__(self, mean=None, std=None):
414 |         if mean == None:
415 |             self.mean = [0.485, 0.456, 0.406]
416 |         else:
417 |             self.mean = mean
418 |         if std == None:
419 |             self.std = [0.229, 0.224, 0.225]
420 |         else:
421 |             self.std = std
422 | 
423 |     def __call__(self, tensor):
424 |         """
425 |         Args:
426 |             tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
427 |         Returns:
428 |             Tensor: Normalized image.
429 |         """
430 |         for t, m, s in zip(tensor, self.mean, self.std):
431 |             t.mul_(s).add_(m)
432 |         return tensor
433 | 
434 | 
435 | class AspectRatioBasedSampler(Sampler):
436 | 
437 |     def __init__(self, data_source, batch_size, drop_last):
438 |         self.data_source = data_source
439 |         self.batch_size = batch_size
440 |         self.drop_last = drop_last
441 |         self.groups = self.group_images()
442 | 
443 |     def __iter__(self):
444 |         random.shuffle(self.groups)
445 |         for group in self.groups:
446 |             yield group
447 | 
448 |     def __len__(self):
449 |         if self.drop_last:
450 |             return len(self.sampler) // self.batch_size
451 |         else:
452 |             return (len(self.sampler) + self.batch_size - 1) // self.batch_size
453 | 
454 |     def group_images(self):
455 |         # determine the order of the images
456 |         order = list(range(len(self.data_source)))
457 |         order.sort(key=lambda x: self.data_source.image_aspect_ratio(x))
458 | 
459 |         # divide into groups, one group = one batch
460 |         return [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)]
461 | 


--------------------------------------------------------------------------------