├── .gitignore ├── EfficientDet.ipynb ├── LICENSE ├── README.md ├── anchors.py ├── bifpn.py ├── coco_eval.py ├── csv_eval.py ├── dataloader.py ├── efficientdet.py ├── images ├── 1.jpg ├── 3.jpg ├── 4.jpg ├── 5.jpg ├── 6.jpg ├── 7.jpg └── 8.jpg ├── losses.py ├── oid_dataset.py ├── opt ├── nms_wrapper.py ├── soft_nms_cpu.pyx └── src │ ├── nms_cpu.cpp │ ├── nms_cuda.cpp │ ├── nms_kernel.cu │ └── soft_nms_cpu.pyx ├── retinanet.py ├── timeitdec.py ├── train.py ├── utils.py └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | .venv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | .spyproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # mkdocs documentation 97 | /site 98 | 99 | # mypy 100 | .mypy_cache/ 101 | 102 | *.zip 103 | *.pt 104 | -------------------------------------------------------------------------------- /EfficientDet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EfficientDet" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 49, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "CUDA available: True\n", 20 | "loading annotations into memory...\n", 21 | "Done (t=16.03s)\n", 22 | "creating index...\n", 23 | "index created!\n", 24 | "loading annotations into memory...\n", 25 | "Done (t=0.61s)\n", 26 | "creating index...\n", 27 | "index created!\n", 28 | "Loaded pretrained weights for efficientnet-b0\n", 29 | "DataParallel(\n", 30 | " 2.009 M, 100.000% Params, 3.742 GMac, 100.000% MACs, \n", 31 | " (module): EfficientDet(\n", 32 | " 2.009 M, 100.000% Params, 3.742 GMac, 100.000% MACs, \n", 33 | " (efficientnet): Sequential(\n", 34 | " 1.114 M, 55.476% Params, 0.065 GMac, 1.726% MACs, \n", 35 | " (0): Conv2dStaticSamePadding(\n", 36 | " 0.001 M, 0.043% Params, 0.0 GMac, 0.000% MACs, 3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False\n", 37 | " (static_padding): ZeroPad2d(0.0 M, 0.000% 
Params, 0.0 GMac, 0.000% MACs, padding=(0, 1, 0, 1), value=0.0)\n", 38 | " )\n", 39 | " (1): BatchNorm2d(0.0 M, 0.003% Params, 0.004 GMac, 0.112% MACs, 32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 40 | " (2): MBConvBlock(\n", 41 | " 0.001 M, 0.072% Params, 0.006 GMac, 0.168% MACs, \n", 42 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 43 | " 0.0 M, 0.014% Params, 0.0 GMac, 0.000% MACs, 32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False\n", 44 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(1, 1, 1, 1), value=0.0)\n", 45 | " )\n", 46 | " (_bn1): BatchNorm2d(0.0 M, 0.003% Params, 0.004 GMac, 0.112% MACs, 32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 47 | " (_se_reduce): Conv2dStaticSamePadding(\n", 48 | " 0.0 M, 0.013% Params, 0.0 GMac, 0.000% MACs, 32, 8, kernel_size=(1, 1), stride=(1, 1)\n", 49 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 50 | " )\n", 51 | " (_se_expand): Conv2dStaticSamePadding(\n", 52 | " 0.0 M, 0.014% Params, 0.0 GMac, 0.000% MACs, 8, 32, kernel_size=(1, 1), stride=(1, 1)\n", 53 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 54 | " )\n", 55 | " (_project_conv): Conv2dStaticSamePadding(\n", 56 | " 0.001 M, 0.025% Params, 0.0 GMac, 0.000% MACs, 32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 57 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 58 | " )\n", 59 | " (_bn2): BatchNorm2d(0.0 M, 0.002% Params, 0.002 GMac, 0.056% MACs, 16, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 60 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 61 | " )\n", 62 | " (3): MBConvBlock(\n", 63 | " 0.006 M, 0.299% Params, 0.017 GMac, 0.441% MACs, \n", 64 | " (_expand_conv): Conv2dStaticSamePadding(\n", 65 | " 0.002 M, 0.076% Params, 0.0 GMac, 0.000% MACs, 16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 66 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 67 | " )\n", 68 | " (_bn0): BatchNorm2d(0.0 M, 0.010% Params, 0.013 GMac, 0.336% MACs, 96, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 69 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 70 | " 0.001 M, 0.043% Params, 0.0 GMac, 0.000% MACs, 96, 96, kernel_size=(3, 3), stride=[2, 2], groups=96, bias=False\n", 71 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(0, 1, 0, 1), value=0.0)\n", 72 | " )\n", 73 | " (_bn1): BatchNorm2d(0.0 M, 0.010% Params, 0.003 GMac, 0.084% MACs, 96, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 74 | " (_se_reduce): Conv2dStaticSamePadding(\n", 75 | " 0.0 M, 0.019% Params, 0.0 GMac, 0.000% MACs, 96, 4, kernel_size=(1, 1), stride=(1, 1)\n", 76 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 77 | " )\n", 78 | " (_se_expand): Conv2dStaticSamePadding(\n", 79 | " 0.0 M, 0.024% Params, 0.0 GMac, 0.000% MACs, 4, 96, kernel_size=(1, 1), stride=(1, 1)\n", 80 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 81 | " )\n", 82 | " (_project_conv): Conv2dStaticSamePadding(\n", 83 | " 0.002 M, 0.115% Params, 0.0 GMac, 0.000% MACs, 96, 24, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 84 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 85 | " 
)\n", 86 | " (_bn2): BatchNorm2d(0.0 M, 0.002% Params, 0.001 GMac, 0.021% MACs, 24, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 87 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 88 | " )\n", 89 | " (4): MBConvBlock(\n", 90 | " 0.011 M, 0.533% Params, 0.01 GMac, 0.273% MACs, \n", 91 | " (_expand_conv): Conv2dStaticSamePadding(\n", 92 | " 0.003 M, 0.172% Params, 0.0 GMac, 0.000% MACs, 24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 93 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 94 | " )\n", 95 | " (_bn0): BatchNorm2d(0.0 M, 0.014% Params, 0.005 GMac, 0.126% MACs, 144, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 96 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 97 | " 0.001 M, 0.065% Params, 0.0 GMac, 0.000% MACs, 144, 144, kernel_size=(3, 3), stride=(1, 1), groups=144, bias=False\n", 98 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(1, 1, 1, 1), value=0.0)\n", 99 | " )\n", 100 | " (_bn1): BatchNorm2d(0.0 M, 0.014% Params, 0.005 GMac, 0.126% MACs, 144, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 101 | " (_se_reduce): Conv2dStaticSamePadding(\n", 102 | " 0.001 M, 0.043% Params, 0.0 GMac, 0.000% MACs, 144, 6, kernel_size=(1, 1), stride=(1, 1)\n", 103 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 104 | " )\n", 105 | " (_se_expand): Conv2dStaticSamePadding(\n", 106 | " 0.001 M, 0.050% Params, 0.0 GMac, 0.000% MACs, 6, 144, kernel_size=(1, 1), stride=(1, 1)\n", 107 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 108 | " )\n", 109 | " (_project_conv): Conv2dStaticSamePadding(\n", 110 | " 0.003 M, 0.172% Params, 0.0 GMac, 0.000% MACs, 144, 24, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 111 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 112 | " )\n", 113 | " (_bn2): BatchNorm2d(0.0 M, 0.002% Params, 0.001 GMac, 0.021% MACs, 24, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 114 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 115 | " )\n", 116 | " (5): MBConvBlock(\n", 117 | " 0.015 M, 0.764% Params, 0.006 GMac, 0.166% MACs, \n", 118 | " (_expand_conv): Conv2dStaticSamePadding(\n", 119 | " 0.003 M, 0.172% Params, 0.0 GMac, 0.000% MACs, 24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 120 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 121 | " )\n", 122 | " (_bn0): BatchNorm2d(0.0 M, 0.014% Params, 0.005 GMac, 0.126% MACs, 144, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 123 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 124 | " 0.004 M, 0.179% Params, 0.0 GMac, 0.000% MACs, 144, 144, kernel_size=(5, 5), stride=[2, 2], groups=144, bias=False\n", 125 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(1, 2, 1, 2), value=0.0)\n", 126 | " )\n", 127 | " (_bn1): BatchNorm2d(0.0 M, 0.014% Params, 0.001 GMac, 0.032% MACs, 144, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 128 | " (_se_reduce): Conv2dStaticSamePadding(\n", 129 | " 0.001 M, 0.043% Params, 0.0 GMac, 0.000% MACs, 144, 6, kernel_size=(1, 1), stride=(1, 1)\n", 130 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 131 | " 
)\n", 132 | " (_se_expand): Conv2dStaticSamePadding(\n", 133 | " 0.001 M, 0.050% Params, 0.0 GMac, 0.000% MACs, 6, 144, kernel_size=(1, 1), stride=(1, 1)\n", 134 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 135 | " )\n", 136 | " (_project_conv): Conv2dStaticSamePadding(\n", 137 | " 0.006 M, 0.287% Params, 0.0 GMac, 0.000% MACs, 144, 40, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 138 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 139 | " )\n", 140 | " (_bn2): BatchNorm2d(0.0 M, 0.004% Params, 0.0 GMac, 0.009% MACs, 40, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 141 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 142 | " )\n", 143 | " (6): MBConvBlock(\n", 144 | " 0.031 M, 1.558% Params, 0.004 GMac, 0.114% MACs, \n", 145 | " (_expand_conv): Conv2dStaticSamePadding(\n", 146 | " 0.01 M, 0.478% Params, 0.0 GMac, 0.000% MACs, 40, 240, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 147 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 148 | " )\n", 149 | " (_bn0): BatchNorm2d(0.0 M, 0.024% Params, 0.002 GMac, 0.053% MACs, 240, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 150 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 151 | " 0.006 M, 0.299% Params, 0.0 GMac, 0.000% MACs, 240, 240, kernel_size=(5, 5), stride=(1, 1), groups=240, bias=False\n", 152 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(2, 2, 2, 2), value=0.0)\n", 153 | " )\n", 154 | " (_bn1): BatchNorm2d(0.0 M, 0.024% Params, 0.002 GMac, 0.053% MACs, 240, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 155 | " (_se_reduce): Conv2dStaticSamePadding(\n", 156 | " 0.002 M, 0.120% Params, 0.0 GMac, 0.000% MACs, 240, 10, kernel_size=(1, 1), stride=(1, 1)\n", 157 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 158 | " )\n", 159 | " (_se_expand): Conv2dStaticSamePadding(\n", 160 | " 0.003 M, 0.131% Params, 0.0 GMac, 0.000% MACs, 10, 240, kernel_size=(1, 1), stride=(1, 1)\n", 161 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 162 | " )\n", 163 | " (_project_conv): Conv2dStaticSamePadding(\n", 164 | " 0.01 M, 0.478% Params, 0.0 GMac, 0.000% MACs, 240, 40, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 165 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 166 | " )\n", 167 | " (_bn2): BatchNorm2d(0.0 M, 0.004% Params, 0.0 GMac, 0.009% MACs, 40, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 168 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 169 | " )\n", 170 | " (7): MBConvBlock(\n", 171 | " 0.037 M, 1.849% Params, 0.003 GMac, 0.070% MACs, \n", 172 | " (_expand_conv): Conv2dStaticSamePadding(\n", 173 | " 0.01 M, 0.478% Params, 0.0 GMac, 0.000% MACs, 40, 240, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 174 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 175 | " )\n", 176 | " (_bn0): BatchNorm2d(0.0 M, 0.024% Params, 0.002 GMac, 0.053% MACs, 240, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 177 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 178 | " 0.002 M, 0.108% Params, 0.0 GMac, 0.000% MACs, 240, 240, kernel_size=(3, 3), stride=[2, 2], groups=240, bias=False\n", 179 | " 
(static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(0, 1, 0, 1), value=0.0)\n", 180 | " )\n", 181 | " (_bn1): BatchNorm2d(0.0 M, 0.024% Params, 0.0 GMac, 0.013% MACs, 240, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 182 | " (_se_reduce): Conv2dStaticSamePadding(\n", 183 | " 0.002 M, 0.120% Params, 0.0 GMac, 0.000% MACs, 240, 10, kernel_size=(1, 1), stride=(1, 1)\n", 184 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 185 | " )\n", 186 | " (_se_expand): Conv2dStaticSamePadding(\n", 187 | " 0.003 M, 0.131% Params, 0.0 GMac, 0.000% MACs, 10, 240, kernel_size=(1, 1), stride=(1, 1)\n", 188 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 189 | " )\n", 190 | " (_project_conv): Conv2dStaticSamePadding(\n", 191 | " 0.019 M, 0.956% Params, 0.0 GMac, 0.000% MACs, 240, 80, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 192 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 193 | " )\n", 194 | " (_bn2): BatchNorm2d(0.0 M, 0.008% Params, 0.0 GMac, 0.004% MACs, 80, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 195 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 196 | " )\n", 197 | " (8): MBConvBlock(\n", 198 | " 0.103 M, 5.123% Params, 0.002 GMac, 0.057% MACs, \n", 199 | " (_expand_conv): Conv2dStaticSamePadding(\n", 200 | " 0.038 M, 1.912% Params, 0.0 GMac, 0.000% MACs, 80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 201 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 202 | " )\n", 203 | " (_bn0): BatchNorm2d(0.001 M, 0.048% Params, 0.001 GMac, 0.026% MACs, 480, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 204 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 205 | " 0.004 M, 0.215% Params, 0.0 GMac, 0.000% MACs, 480, 480, kernel_size=(3, 3), stride=(1, 1), groups=480, bias=False\n", 206 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(1, 1, 1, 1), value=0.0)\n", 207 | " )\n", 208 | " (_bn1): BatchNorm2d(0.001 M, 0.048% Params, 0.001 GMac, 0.026% MACs, 480, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 209 | " (_se_reduce): Conv2dStaticSamePadding(\n", 210 | " 0.01 M, 0.479% Params, 0.0 GMac, 0.000% MACs, 480, 20, kernel_size=(1, 1), stride=(1, 1)\n", 211 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 212 | " )\n", 213 | " (_se_expand): Conv2dStaticSamePadding(\n", 214 | " 0.01 M, 0.502% Params, 0.0 GMac, 0.000% MACs, 20, 480, kernel_size=(1, 1), stride=(1, 1)\n", 215 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 216 | " )\n", 217 | " (_project_conv): Conv2dStaticSamePadding(\n", 218 | " 0.038 M, 1.912% Params, 0.0 GMac, 0.000% MACs, 480, 80, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 219 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 220 | " )\n", 221 | " (_bn2): BatchNorm2d(0.0 M, 0.008% Params, 0.0 GMac, 0.004% MACs, 80, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 222 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 223 | " )\n", 224 | " (9): MBConvBlock(\n", 225 | " 0.103 M, 5.123% Params, 0.002 GMac, 0.057% MACs, \n", 226 | " (_expand_conv): Conv2dStaticSamePadding(\n", 227 | " 0.038 M, 1.912% Params, 0.0 
GMac, 0.000% MACs, 80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 228 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 229 | " )\n", 230 | " (_bn0): BatchNorm2d(0.001 M, 0.048% Params, 0.001 GMac, 0.026% MACs, 480, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 231 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 232 | " 0.004 M, 0.215% Params, 0.0 GMac, 0.000% MACs, 480, 480, kernel_size=(3, 3), stride=(1, 1), groups=480, bias=False\n", 233 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(1, 1, 1, 1), value=0.0)\n", 234 | " )\n", 235 | " (_bn1): BatchNorm2d(0.001 M, 0.048% Params, 0.001 GMac, 0.026% MACs, 480, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 236 | " (_se_reduce): Conv2dStaticSamePadding(\n", 237 | " 0.01 M, 0.479% Params, 0.0 GMac, 0.000% MACs, 480, 20, kernel_size=(1, 1), stride=(1, 1)\n", 238 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 239 | " )\n", 240 | " (_se_expand): Conv2dStaticSamePadding(\n", 241 | " 0.01 M, 0.502% Params, 0.0 GMac, 0.000% MACs, 20, 480, kernel_size=(1, 1), stride=(1, 1)\n", 242 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 243 | " )\n", 244 | " (_project_conv): Conv2dStaticSamePadding(\n", 245 | " 0.038 M, 1.912% Params, 0.0 GMac, 0.000% MACs, 480, 80, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 246 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 247 | " )\n", 248 | " (_bn2): BatchNorm2d(0.0 M, 0.008% Params, 0.0 GMac, 0.004% MACs, 80, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 249 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 250 | " )\n", 251 | " (10): MBConvBlock(\n", 252 | " 0.126 M, 6.273% Params, 0.002 GMac, 0.059% MACs, \n", 253 | " (_expand_conv): Conv2dStaticSamePadding(\n", 254 | " 0.038 M, 1.912% Params, 0.0 GMac, 0.000% MACs, 80, 480, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 255 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 256 | " )\n", 257 | " (_bn0): BatchNorm2d(0.001 M, 0.048% Params, 0.001 GMac, 0.026% MACs, 480, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 258 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 259 | " 0.012 M, 0.597% Params, 0.0 GMac, 0.000% MACs, 480, 480, kernel_size=(5, 5), stride=[1, 1], groups=480, bias=False\n", 260 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(2, 2, 2, 2), value=0.0)\n", 261 | " )\n", 262 | " (_bn1): BatchNorm2d(0.001 M, 0.048% Params, 0.001 GMac, 0.026% MACs, 480, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 263 | " (_se_reduce): Conv2dStaticSamePadding(\n", 264 | " 0.01 M, 0.479% Params, 0.0 GMac, 0.000% MACs, 480, 20, kernel_size=(1, 1), stride=(1, 1)\n", 265 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 266 | " )\n", 267 | " (_se_expand): Conv2dStaticSamePadding(\n", 268 | " 0.01 M, 0.502% Params, 0.0 GMac, 0.000% MACs, 20, 480, kernel_size=(1, 1), stride=(1, 1)\n", 269 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 270 | " )\n", 271 | " (_project_conv): Conv2dStaticSamePadding(\n", 272 | " 0.054 M, 2.676% Params, 0.0 GMac, 0.000% MACs, 480, 112, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 273 | " 
(static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 274 | " )\n", 275 | " (_bn2): BatchNorm2d(0.0 M, 0.011% Params, 0.0 GMac, 0.006% MACs, 112, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 276 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 277 | " )\n", 278 | " (11): MBConvBlock(\n", 279 | " 0.209 M, 10.384% Params, 0.003 GMac, 0.080% MACs, \n", 280 | " (_expand_conv): Conv2dStaticSamePadding(\n", 281 | " 0.075 M, 3.747% Params, 0.0 GMac, 0.000% MACs, 112, 672, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 282 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 283 | " )\n", 284 | " (_bn0): BatchNorm2d(0.001 M, 0.067% Params, 0.001 GMac, 0.037% MACs, 672, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 285 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 286 | " 0.017 M, 0.836% Params, 0.0 GMac, 0.000% MACs, 672, 672, kernel_size=(5, 5), stride=(1, 1), groups=672, bias=False\n", 287 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(2, 2, 2, 2), value=0.0)\n", 288 | " )\n", 289 | " (_bn1): BatchNorm2d(0.001 M, 0.067% Params, 0.001 GMac, 0.037% MACs, 672, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 290 | " (_se_reduce): Conv2dStaticSamePadding(\n", 291 | " 0.019 M, 0.938% Params, 0.0 GMac, 0.000% MACs, 672, 28, kernel_size=(1, 1), stride=(1, 1)\n", 292 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 293 | " )\n", 294 | " (_se_expand): Conv2dStaticSamePadding(\n", 295 | " 0.019 M, 0.970% Params, 0.0 GMac, 0.000% MACs, 28, 672, kernel_size=(1, 1), stride=(1, 1)\n", 296 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 297 | " )\n", 298 | " (_project_conv): Conv2dStaticSamePadding(\n", 299 | " 0.075 M, 3.747% Params, 0.0 GMac, 0.000% MACs, 672, 112, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 300 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 301 | " )\n", 302 | " (_bn2): BatchNorm2d(0.0 M, 0.011% Params, 0.0 GMac, 0.006% MACs, 112, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 303 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 304 | " )\n", 305 | " (12): MBConvBlock(\n", 306 | " 0.209 M, 10.384% Params, 0.003 GMac, 0.080% MACs, \n", 307 | " (_expand_conv): Conv2dStaticSamePadding(\n", 308 | " 0.075 M, 3.747% Params, 0.0 GMac, 0.000% MACs, 112, 672, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 309 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 310 | " )\n", 311 | " (_bn0): BatchNorm2d(0.001 M, 0.067% Params, 0.001 GMac, 0.037% MACs, 672, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 312 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 313 | " 0.017 M, 0.836% Params, 0.0 GMac, 0.000% MACs, 672, 672, kernel_size=(5, 5), stride=(1, 1), groups=672, bias=False\n", 314 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(2, 2, 2, 2), value=0.0)\n", 315 | " )\n", 316 | " (_bn1): BatchNorm2d(0.001 M, 0.067% Params, 0.001 GMac, 0.037% MACs, 672, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 317 | " (_se_reduce): Conv2dStaticSamePadding(\n", 318 | " 0.019 M, 0.938% Params, 0.0 GMac, 0.000% MACs, 672, 28, kernel_size=(1, 
1), stride=(1, 1)\n", 319 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 320 | " )\n", 321 | " (_se_expand): Conv2dStaticSamePadding(\n", 322 | " 0.019 M, 0.970% Params, 0.0 GMac, 0.000% MACs, 28, 672, kernel_size=(1, 1), stride=(1, 1)\n", 323 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 324 | " )\n", 325 | " (_project_conv): Conv2dStaticSamePadding(\n", 326 | " 0.075 M, 3.747% Params, 0.0 GMac, 0.000% MACs, 672, 112, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 327 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 328 | " )\n", 329 | " (_bn2): BatchNorm2d(0.0 M, 0.011% Params, 0.0 GMac, 0.006% MACs, 112, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 330 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 331 | " )\n", 332 | " (13): MBConvBlock(\n", 333 | " 0.262 M, 13.068% Params, 0.002 GMac, 0.049% MACs, \n", 334 | " (_expand_conv): Conv2dStaticSamePadding(\n", 335 | " 0.075 M, 3.747% Params, 0.0 GMac, 0.000% MACs, 112, 672, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 336 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 337 | " )\n", 338 | " (_bn0): BatchNorm2d(0.001 M, 0.067% Params, 0.001 GMac, 0.037% MACs, 672, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 339 | " (_depthwise_conv): Conv2dStaticSamePadding(\n", 340 | " 0.017 M, 0.836% Params, 0.0 GMac, 0.000% MACs, 672, 672, kernel_size=(5, 5), stride=[2, 2], groups=672, bias=False\n", 341 | " (static_padding): ZeroPad2d(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, padding=(1, 2, 1, 2), value=0.0)\n", 342 | " )\n", 343 | " (_bn1): BatchNorm2d(0.001 M, 0.067% Params, 0.0 GMac, 0.009% MACs, 672, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 344 | " (_se_reduce): Conv2dStaticSamePadding(\n", 345 | " 0.019 M, 0.938% Params, 0.0 GMac, 0.000% MACs, 672, 28, kernel_size=(1, 1), stride=(1, 1)\n", 346 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 347 | " )\n", 348 | " (_se_expand): Conv2dStaticSamePadding(\n", 349 | " 0.019 M, 0.970% Params, 0.0 GMac, 0.000% MACs, 28, 672, kernel_size=(1, 1), stride=(1, 1)\n", 350 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 351 | " )\n", 352 | " (_project_conv): Conv2dStaticSamePadding(\n", 353 | " 0.129 M, 6.424% Params, 0.0 GMac, 0.000% MACs, 672, 192, kernel_size=(1, 1), stride=(1, 1), bias=False\n", 354 | " (static_padding): Identity(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 355 | " )\n", 356 | " (_bn2): BatchNorm2d(0.0 M, 0.019% Params, 0.0 GMac, 0.003% MACs, 192, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)\n", 357 | " (_swish): MemoryEfficientSwish(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 358 | " )\n", 359 | " )\n", 360 | " (fpn): BiFPN(\n", 361 | " 0.237 M, 11.775% Params, 0.086 GMac, 2.301% MACs, \n", 362 | " (p3): Conv2d(0.003 M, 0.131% Params, 0.011 GMac, 0.287% MACs, 40, 64, kernel_size=(1, 1), stride=(1, 1))\n", 363 | " (p4): Conv2d(0.005 M, 0.258% Params, 0.005 GMac, 0.142% MACs, 80, 64, kernel_size=(1, 1), stride=(1, 1))\n", 364 | " (p5): Conv2d(0.012 M, 0.615% Params, 0.003 GMac, 0.085% MACs, 192, 64, kernel_size=(1, 1), stride=(1, 1))\n", 365 | " (p6): Conv2d(0.111 M, 5.509% Params, 0.007 GMac, 0.189% MACs, 192, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", 366 
| " (p7): ConvBlock(\n", 367 | " 0.037 M, 1.845% Params, 0.001 GMac, 0.016% MACs, \n", 368 | " (conv): Conv2d(0.037 M, 1.838% Params, 0.001 GMac, 0.016% MACs, 64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", 369 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.000% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 370 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 371 | " )\n", 372 | " (bifpn): Sequential(\n", 373 | " 0.069 M, 3.418% Params, 0.059 GMac, 1.582% MACs, \n", 374 | " (0): BiFPNBlock(\n", 375 | " 0.034 M, 1.709% Params, 0.03 GMac, 0.791% MACs, \n", 376 | " (p3_td): DepthwiseConvBlock(\n", 377 | " 0.004 M, 0.213% Params, 0.018 GMac, 0.476% MACs, \n", 378 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.007% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 379 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.017 GMac, 0.448% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 380 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.001 GMac, 0.014% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 381 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.007% MACs, )\n", 382 | " )\n", 383 | " (p4_td): DepthwiseConvBlock(\n", 384 | " 0.004 M, 0.213% Params, 0.004 GMac, 0.119% MACs, \n", 385 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.002% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 386 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.004 GMac, 0.112% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 387 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.004% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 388 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.002% MACs, )\n", 389 | " )\n", 390 | " (p5_td): DepthwiseConvBlock(\n", 391 | " 0.004 M, 0.213% Params, 0.001 GMac, 0.030% MACs, \n", 392 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 393 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.001 GMac, 0.028% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 394 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.001% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 395 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 396 | " )\n", 397 | " (p6_td): DepthwiseConvBlock(\n", 398 | " 0.004 M, 0.213% Params, 0.0 GMac, 0.007% MACs, \n", 399 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 400 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.0 GMac, 0.007% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 401 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.000% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 402 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 403 | " )\n", 404 | " (p4_out): DepthwiseConvBlock(\n", 405 | " 0.004 M, 0.213% Params, 0.004 GMac, 0.119% MACs, \n", 406 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.002% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 407 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.004 GMac, 0.112% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 408 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.004% MACs, 64, 
eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 409 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.002% MACs, )\n", 410 | " )\n", 411 | " (p5_out): DepthwiseConvBlock(\n", 412 | " 0.004 M, 0.213% Params, 0.001 GMac, 0.030% MACs, \n", 413 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 414 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.001 GMac, 0.028% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 415 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.001% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 416 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 417 | " )\n", 418 | " (p6_out): DepthwiseConvBlock(\n", 419 | " 0.004 M, 0.213% Params, 0.0 GMac, 0.007% MACs, \n", 420 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 421 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.0 GMac, 0.007% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 422 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.000% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 423 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 424 | " )\n", 425 | " (p7_out): DepthwiseConvBlock(\n", 426 | " 0.004 M, 0.213% Params, 0.0 GMac, 0.002% MACs, \n", 427 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 428 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.0 GMac, 0.002% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 429 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.000% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 430 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 431 | " )\n", 432 | " (w1_relu): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 433 | " (w2_relu): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 434 | " )\n", 435 | " (1): BiFPNBlock(\n", 436 | " 0.034 M, 1.709% Params, 0.03 GMac, 0.791% MACs, \n", 437 | " (p3_td): DepthwiseConvBlock(\n", 438 | " 0.004 M, 0.213% Params, 0.018 GMac, 0.476% MACs, \n", 439 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.007% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 440 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.017 GMac, 0.448% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 441 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.001 GMac, 0.014% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 442 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.007% MACs, )\n", 443 | " )\n", 444 | " (p4_td): DepthwiseConvBlock(\n", 445 | " 0.004 M, 0.213% Params, 0.004 GMac, 0.119% MACs, \n", 446 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.002% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 447 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.004 GMac, 0.112% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 448 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.004% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 449 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.002% MACs, )\n", 450 | " )\n", 451 | " (p5_td): DepthwiseConvBlock(\n", 452 | " 0.004 M, 0.213% Params, 0.001 
GMac, 0.030% MACs, \n", 453 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 454 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.001 GMac, 0.028% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 455 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.001% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 456 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 457 | " )\n", 458 | " (p6_td): DepthwiseConvBlock(\n", 459 | " 0.004 M, 0.213% Params, 0.0 GMac, 0.007% MACs, \n", 460 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 461 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.0 GMac, 0.007% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 462 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.000% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 463 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 464 | " )\n", 465 | " (p4_out): DepthwiseConvBlock(\n", 466 | " 0.004 M, 0.213% Params, 0.004 GMac, 0.119% MACs, \n", 467 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.002% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 468 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.004 GMac, 0.112% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 469 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.004% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 470 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.002% MACs, )\n", 471 | " )\n", 472 | " (p5_out): DepthwiseConvBlock(\n", 473 | " 0.004 M, 0.213% Params, 0.001 GMac, 0.030% MACs, \n", 474 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 475 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.001 GMac, 0.028% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 476 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.001% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 477 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 478 | " )\n", 479 | " (p6_out): DepthwiseConvBlock(\n", 480 | " 0.004 M, 0.213% Params, 0.0 GMac, 0.007% MACs, \n", 481 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 482 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.0 GMac, 0.007% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 483 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.000% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 484 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 485 | " )\n", 486 | " (p7_out): DepthwiseConvBlock(\n", 487 | " 0.004 M, 0.213% Params, 0.0 GMac, 0.002% MACs, \n", 488 | " (depthwise): Conv2d(0.0 M, 0.003% Params, 0.0 GMac, 0.000% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), groups=64, bias=False)\n", 489 | " (pointwise): Conv2d(0.004 M, 0.204% Params, 0.0 GMac, 0.002% MACs, 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 490 | " (bn): BatchNorm2d(0.0 M, 0.006% Params, 0.0 GMac, 0.000% MACs, 64, eps=4e-05, momentum=0.9997, affine=True, track_running_stats=True)\n", 491 | " (act): ReLU(0.0 M, 0.000% Params, 0.0 
GMac, 0.000% MACs, )\n", 492 | " )\n", 493 | " (w1_relu): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 494 | " (w2_relu): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 495 | " )\n", 496 | " )\n", 497 | " )\n", 498 | " (regressionModel): RegressionModel(\n", 499 | " 0.132 M, 6.550% Params, 0.719 GMac, 19.212% MACs, \n", 500 | " (prediction_net): Sequential(\n", 501 | " 0.111 M, 5.515% Params, 0.605 GMac, 16.183% MACs, \n", 502 | " (0): Conv2d(0.037 M, 1.838% Params, 0.201 GMac, 5.385% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 503 | " (1): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.009% MACs, )\n", 504 | " (2): Conv2d(0.037 M, 1.838% Params, 0.201 GMac, 5.385% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 505 | " (3): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.009% MACs, )\n", 506 | " (4): Conv2d(0.037 M, 1.838% Params, 0.201 GMac, 5.385% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 507 | " (5): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.009% MACs, )\n", 508 | " )\n", 509 | " (output): Conv2d(0.021 M, 1.034% Params, 0.113 GMac, 3.029% MACs, 64, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 510 | " )\n", 511 | " (classificationModel): ClassificationModel(\n", 512 | " 0.526 M, 26.199% Params, 2.872 GMac, 76.762% MACs, \n", 513 | " (classification_net): Sequential(\n", 514 | " 0.111 M, 5.515% Params, 0.605 GMac, 16.183% MACs, \n", 515 | " (0): Conv2d(0.037 M, 1.838% Params, 0.201 GMac, 5.385% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 516 | " (1): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.009% MACs, )\n", 517 | " (2): Conv2d(0.037 M, 1.838% Params, 0.201 GMac, 5.385% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 518 | " (3): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.009% MACs, )\n", 519 | " (4): Conv2d(0.037 M, 1.838% Params, 0.201 GMac, 5.385% MACs, 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 520 | " (5): ReLU(0.0 M, 0.000% Params, 0.0 GMac, 0.009% MACs, )\n", 521 | " )\n", 522 | " (output): Conv2d(0.415 M, 20.683% Params, 2.267 GMac, 60.580% MACs, 64, 720, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 523 | " (output_act): Sigmoid(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 524 | " )\n", 525 | " (anchors): Anchors(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 526 | " (regressBoxes): BBoxTransform(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 527 | " (clipBoxes): ClipBoxes(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 528 | " (focalLoss): FocalLoss(0.0 M, 0.000% Params, 0.0 GMac, 0.000% MACs, )\n", 529 | " )\n", 530 | ")\n", 531 | "Computational complexity: 3.74 GMac\n", 532 | "Number of parameters: 2.01 M \n", 533 | "Num training images: 118287\n", 534 | " 0%| | 0/3697 [00:00\n", 540 | " main()\n", 541 | " File \"train.py\", line 153, in main\n", 542 | " classification_loss, regression_loss = model([data['img'].cuda().float(), data['annot']])\n", 543 | " File \"/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 541, in __call__\n", 544 | " result = self.forward(*input, **kwargs)\n", 545 | " File \"/usr/local/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py\", line 150, in forward\n", 546 | " return self.module(*inputs[0], **kwargs[0])\n", 547 | " File \"/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 541, in __call__\n", 548 | " result = self.forward(*input, **kwargs)\n", 549 | " File 
\"/mnt/synology/pelvis/projects/tristan/Repositories/Efficientdet-PT/efficientdet.py\", line 174, in forward\n", 550 | " return self.focalLoss(classification, regression, anchors, annotations)\n", 551 | " File \"/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py\", line 541, in __call__\n", 552 | " result = self.forward(*input, **kwargs)\n", 553 | " File \"/mnt/synology/pelvis/projects/tristan/Repositories/Efficientdet-PT/timeitdec.py\", line 6, in timed\n", 554 | " result = method(*args, **kw)\n", 555 | " File \"/mnt/synology/pelvis/projects/tristan/Repositories/Efficientdet-PT/losses.py\", line 87, in forward\n", 556 | " alpha_factor = torch.ones(targets.shape).cuda() * alpha\n", 557 | "KeyboardInterrupt\n" 558 | ] 559 | } 560 | ], 561 | "source": [ 562 | "!python3.7 train.py --dataset coco --coco_path ../../Datasets/COCO2017 --efficientdet --batch-size 32 --scaling-compound 0 --print-model-complexity" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 12, 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "name": "stdout", 572 | "output_type": "stream", 573 | "text": [ 574 | "Requirement already satisfied: cython in /home/user/.local/lib/python3.7/site-packages (0.29.14)\n", 575 | "\u001b[33mYou are using pip version 18.1, however version 19.3.1 is available.\n", 576 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", 577 | "Requirement already satisfied: pycocotools in /home/user/.local/lib/python3.7/site-packages (2.0.0)\n", 578 | "\u001b[33mYou are using pip version 18.1, however version 19.3.1 is available.\n", 579 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", 580 | "Requirement already satisfied: efficientnet_pytorch in /home/user/.local/lib/python3.7/site-packages (0.5.1)\n", 581 | "Requirement already satisfied: torch in /usr/local/lib/python3.7/site-packages (from efficientnet_pytorch) (1.3.0)\n", 582 | "Requirement already satisfied: numpy in /usr/local/lib/python3.7/site-packages (from torch->efficientnet_pytorch) (1.15.3)\n", 583 | "\u001b[33mYou are using pip version 18.1, however version 19.3.1 is available.\n", 584 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", 585 | "Requirement already satisfied: pthflops in /home/user/.local/lib/python3.7/site-packages (0.2.1)\n", 586 | "Requirement already satisfied: torch in /usr/local/lib/python3.7/site-packages (from pthflops) (1.3.0)\n", 587 | "Requirement already satisfied: numpy in /usr/local/lib/python3.7/site-packages (from torch->pthflops) (1.15.3)\n", 588 | "\u001b[33mYou are using pip version 18.1, however version 19.3.1 is available.\n", 589 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" 590 | ] 591 | } 592 | ], 593 | "source": [ 594 | "!pip3 install cython --user\n", 595 | "!pip3 install pycocotools --user\n", 596 | "!pip3 install efficientnet_pytorch --user\n", 597 | "!pip3 install pthflops --user\n" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": null, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [] 606 | } 607 | ], 608 | "metadata": { 609 | "kernelspec": { 610 | "display_name": "Python 3", 611 | "language": "python", 612 | "name": "python3" 613 | }, 614 | "language_info": { 615 | "codemirror_mode": { 616 | "name": "ipython", 617 | "version": 3 618 | }, 619 | "file_extension": ".py", 620 | "mimetype": "text/x-python", 621 | "name": "python", 622 | 
"nbconvert_exporter": "python", 623 | "pygments_lexer": "ipython3", 624 | "version": "3.7.1" 625 | } 626 | }, 627 | "nbformat": 4, 628 | "nbformat_minor": 4 629 | } 630 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch EfficientDet 2 | Here we implement [EfficientDet](https://arxiv.org/abs/1911.09070). The code is based on a RetinaNet implementation by [yhenon/pytorch-retinanet](https://github.com/yhenon/pytorch-retinanet). We use the EfficientNet backend by [rwightman/gen-efficientnet-pytorch](https://github.com/rwightman/gen-efficientnet-pytorch). 
3 | 4 | ## Current status 5 | The current implementation is able to run. I'll update this document as soon as I have some preliminary results. The paper by Tan et al. gives a few more details, which we would like to implement and report on: 6 | * Use exponential moving average with decay 0.9998. 7 | * Initialize convolution layers. 8 | * Train the model using the SGD optimizer with momentum 0.9 and weight decay 4e-5. 9 | * Implement the described learning-rate schedule, which is first increased linearly from 0 to 0.08 during the initial 5% of warm-up training steps and then annealed down using a cosine decay rule. 10 | * Report performance. 11 | 12 | If you have other issues that need my attention, feel free to make a pull request or leave an [issue](https://github.com/tristandb/EfficientDet-PyTorch/issues). 13 | 14 | ## Results 15 | 16 | Model | mAP | #Params | #FLOPS 17 | --- | --- | --- | --- 18 | 19 | ## Installation 20 | 21 | 1) Clone this repo. 22 | 23 | 2) Install the required packages: 24 | 25 | ``` 26 | apt-get install tk-dev python-tk 27 | ``` 28 | 29 | 3) Install the Python packages: 30 | 31 | ``` 32 | 33 | pip install pandas 34 | 35 | pip install pycocotools 36 | 37 | pip install cython 38 | 39 | pip install opencv-python 40 | 41 | pip install requests 42 | 43 | pip install efficientnet_pytorch 44 | 45 | ``` 46 | 47 | Note that you may have to edit line 14 of `build.sh` if you want to change which version of Python you are building the extension for. 48 | 49 | ## Training 50 | 51 | The network can be trained using the `train.py` script. Currently, two dataloaders are available: COCO and CSV. For training on COCO, use 52 | 53 | ``` 54 | python3 train.py --efficientnet --dataset coco --coco_path ../../Datasets/COCO2017 --scaling-compound 0 --batch-size 8 55 | ``` 56 | 57 | For training on a custom dataset, with annotations in CSV format (see the CSV dataset format section below), use 58 | 59 | ``` 60 | python train.py --dataset csv --csv_train <path/to/train_annotations.csv> --csv_classes <path/to/class_list.csv> --csv_val <path/to/val_annotations.csv> 61 | ``` 62 | 63 | Note that the `--csv_val` argument is optional; if it is omitted, no validation will be performed. 64 | 65 | ## Acknowledgements 66 | - The code is based on a RetinaNet implementation by [yhenon/pytorch-retinanet](https://github.com/yhenon/pytorch-retinanet). 67 | - Significant amounts of code are borrowed from the [keras retinanet implementation](https://github.com/fizyr/keras-retinanet). 68 | - The NMS module used is from the [pytorch faster-rcnn implementation](https://github.com/ruotianluo/pytorch-faster-rcnn). 69 | - We use the EfficientNet backend by [rwightman/gen-efficientnet-pytorch](https://github.com/rwightman/gen-efficientnet-pytorch).
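## CSV dataset format

The CSV dataloader (`CSVDataset` in `dataloader.py`) expects two files. The annotations file has one object per line in the format `img_file,x1,y1,x2,y2,class_name`, with integer pixel coordinates and `x2 > x1`, `y2 > y1`; a line containing only an image path (`img_file,,,,,`) marks an image without annotations. The class list file maps `class_name,class_id`, with ids expected to start at 0. The contents below are only an illustration with made-up paths and classes.

`annotations.csv`:

```
images/0001.jpg,162,84,305,241,dog
images/0001.jpg,10,40,110,130,cat
images/0002.jpg,,,,,
```

`classes.csv`:

```
cat,0
dog,1
```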
-------------------------------------------------------------------------------- /anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Anchors(nn.Module): 7 | def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None): 8 | super(Anchors, self).__init__() 9 | 10 | if pyramid_levels is None: 11 | self.pyramid_levels = [3, 4, 5, 6, 7] 12 | if strides is None: 13 | self.strides = [2 ** x for x in self.pyramid_levels] 14 | if sizes is None: 15 | self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] 16 | if ratios is None: 17 | self.ratios = np.array([0.5, 1, 2]) 18 | if scales is None: 19 | self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 20 | 21 | def forward(self, image): 22 | 23 | image_shape = image.shape[2:] 24 | image_shape = np.array(image_shape) 25 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels] 26 | 27 | # compute anchors over all pyramid levels 28 | all_anchors = np.zeros((0, 4)).astype(np.float32) 29 | 30 | for idx, p in enumerate(self.pyramid_levels): 31 | anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) 32 | shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors) 33 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 34 | 35 | all_anchors = np.expand_dims(all_anchors, axis=0) 36 | 37 | return torch.from_numpy(all_anchors.astype(np.float32)).cuda() 38 | 39 | def generate_anchors(base_size=16, ratios=None, scales=None): 40 | """ 41 | Generate anchor (reference) windows by enumerating aspect ratios X 42 | scales w.r.t. a reference window. 43 | """ 44 | 45 | if ratios is None: 46 | ratios = np.array([0.5, 1, 2]) 47 | 48 | if scales is None: 49 | scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 50 | 51 | num_anchors = len(ratios) * len(scales) 52 | 53 | # initialize output anchors 54 | anchors = np.zeros((num_anchors, 4)) 55 | 56 | # scale base_size 57 | anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T 58 | 59 | # compute areas of anchors 60 | areas = anchors[:, 2] * anchors[:, 3] 61 | 62 | # correct for ratios 63 | anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) 64 | anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) 65 | 66 | # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) 67 | anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T 68 | anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T 69 | 70 | return anchors 71 | 72 | def compute_shape(image_shape, pyramid_levels): 73 | """Compute shapes based on pyramid levels. 
74 | 75 | :param image_shape: 76 | :param pyramid_levels: 77 | :return: 78 | """ 79 | image_shape = np.array(image_shape[:2]) 80 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] 81 | return image_shapes 82 | 83 | 84 | def anchors_for_shape( 85 | image_shape, 86 | pyramid_levels=None, 87 | ratios=None, 88 | scales=None, 89 | strides=None, 90 | sizes=None, 91 | shapes_callback=None, 92 | ): 93 | 94 | image_shapes = compute_shape(image_shape, pyramid_levels) 95 | 96 | # compute anchors over all pyramid levels 97 | all_anchors = np.zeros((0, 4)) 98 | for idx, p in enumerate(pyramid_levels): 99 | anchors = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales) 100 | shifted_anchors = shift(image_shapes[idx], strides[idx], anchors) 101 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 102 | 103 | return all_anchors 104 | 105 | 106 | def shift(shape, stride, anchors): 107 | shift_x = (np.arange(0, shape[1]) + 0.5) * stride 108 | shift_y = (np.arange(0, shape[0]) + 0.5) * stride 109 | 110 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 111 | 112 | shifts = np.vstack(( 113 | shift_x.ravel(), shift_y.ravel(), 114 | shift_x.ravel(), shift_y.ravel() 115 | )).transpose() 116 | 117 | # add A anchors (1, A, 4) to 118 | # cell K shifts (K, 1, 4) to get 119 | # shift anchors (K, A, 4) 120 | # reshape to (K*A, 4) shifted anchors 121 | A = anchors.shape[0] 122 | K = shifts.shape[0] 123 | all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 124 | all_anchors = all_anchors.reshape((K * A, 4)) 125 | 126 | return all_anchors 127 | 128 | -------------------------------------------------------------------------------- /bifpn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from torch.autograd import Variable 6 | 7 | class DepthwiseConvBlock(nn.Module): 8 | """ 9 | Depthwise seperable convolution. 10 | 11 | 12 | """ 13 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, freeze_bn=False): 14 | super(DepthwiseConvBlock,self).__init__() 15 | self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size, stride, 16 | padding, dilation, groups=in_channels, bias=False) 17 | self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, 18 | stride=1, padding=0, dilation=1, groups=1, bias=False) 19 | 20 | 21 | self.bn = nn.BatchNorm2d(out_channels, momentum=0.9997, eps=4e-5) 22 | self.act = nn.ReLU() 23 | 24 | def forward(self, inputs): 25 | x = self.depthwise(inputs) 26 | x = self.pointwise(x) 27 | x = self.bn(x) 28 | return self.act(x) 29 | 30 | class ConvBlock(nn.Module): 31 | """ 32 | Convolution block with Batch Normalization and ReLU activation. 
33 | 34 | """ 35 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, freeze_bn=False): 36 | super(ConvBlock,self).__init__() 37 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding) 38 | self.bn = nn.BatchNorm2d(out_channels, momentum=0.9997, eps=4e-5) 39 | self.act = nn.ReLU() 40 | 41 | def forward(self, inputs): 42 | x = self.conv(inputs) 43 | x = self.bn(x) 44 | return self.act(x) 45 | 46 | class BiFPNBlock(nn.Module): 47 | """ 48 | Bi-directional Feature Pyramid Network 49 | """ 50 | def __init__(self, feature_size=64, epsilon=0.0001): 51 | super(BiFPNBlock, self).__init__() 52 | self.epsilon = epsilon 53 | 54 | self.p3_td = DepthwiseConvBlock(feature_size, feature_size) 55 | self.p4_td = DepthwiseConvBlock(feature_size, feature_size) 56 | self.p5_td = DepthwiseConvBlock(feature_size, feature_size) 57 | self.p6_td = DepthwiseConvBlock(feature_size, feature_size) 58 | 59 | self.p4_out = DepthwiseConvBlock(feature_size, feature_size) 60 | self.p5_out = DepthwiseConvBlock(feature_size, feature_size) 61 | self.p6_out = DepthwiseConvBlock(feature_size, feature_size) 62 | self.p7_out = DepthwiseConvBlock(feature_size, feature_size) 63 | 64 | # TODO: Init weights 65 | self.w1 = nn.Parameter(torch.Tensor(2, 4)) 66 | self.w1_relu = nn.ReLU() 67 | self.w2 = nn.Parameter(torch.Tensor(3, 4)) 68 | self.w2_relu = nn.ReLU() 69 | 70 | def forward(self, inputs): 71 | p3_x, p4_x, p5_x, p6_x, p7_x = inputs 72 | 73 | # Calculate Top-Down Pathway 74 | w1 = self.w1_relu(self.w1) 75 | w1 /= torch.sum(w1, dim=0) + self.epsilon 76 | w2 = self.w2_relu(self.w2) 77 | w2 /= torch.sum(w2, dim=0) + self.epsilon 78 | 79 | p7_td = p7_x 80 | p6_td = self.p6_td(w1[0, 0] * p6_x + w1[1, 0] * F.interpolate(p7_td, scale_factor=2)) 81 | p5_td = self.p5_td(w1[0, 1] * p5_x + w1[1, 1] * F.interpolate(p6_td, scale_factor=2)) 82 | p4_td = self.p4_td(w1[0, 2] * p4_x + w1[1, 2] * F.interpolate(p5_td, scale_factor=2)) 83 | p3_td = self.p3_td(w1[0, 3] * p3_x + w1[1, 3] * F.interpolate(p4_td, scale_factor=2)) 84 | 85 | # Calculate Bottom-Up Pathway 86 | p3_out = p3_td 87 | p4_out = self.p4_out(w2[0, 0] * p4_x + w2[1, 0] * p4_td + w2[2, 0] * nn.Upsample(scale_factor=0.5)(p3_out)) 88 | p5_out = self.p5_out(w2[0, 1] * p5_x + w2[1, 1] * p5_td + w2[2, 1] * nn.Upsample(scale_factor=0.5)(p4_out)) 89 | p6_out = self.p6_out(w2[0, 2] * p6_x + w2[1, 2] * p6_td + w2[2, 2] * nn.Upsample(scale_factor=0.5)(p5_out)) 90 | p7_out = self.p7_out(w2[0, 3] * p7_x + w2[1, 3] * p7_td + w2[2, 3] * nn.Upsample(scale_factor=0.5)(p6_out)) 91 | 92 | return [p3_out, p4_out, p5_out, p6_out, p7_out] 93 | 94 | class BiFPN(nn.Module): 95 | def __init__(self, size, feature_size=64, num_layers=2, epsilon=0.0001): 96 | super(BiFPN, self).__init__() 97 | self.p3 = nn.Conv2d(size[0], feature_size, kernel_size=1, stride=1, padding=0) 98 | self.p4 = nn.Conv2d(size[1], feature_size, kernel_size=1, stride=1, padding=0) 99 | self.p5 = nn.Conv2d(size[2], feature_size, kernel_size=1, stride=1, padding=0) 100 | 101 | # p6 is obtained via a 3x3 stride-2 conv on C5 102 | self.p6 = nn.Conv2d(size[2], feature_size, kernel_size=3, stride=2, padding=1) 103 | 104 | # p7 is computed by applying ReLU followed by a 3x3 stride-2 conv on p6 105 | self.p7 = ConvBlock(feature_size, feature_size, kernel_size=3, stride=2, padding=1) 106 | 107 | bifpns = [] 108 | for _ in range(num_layers): 109 | bifpns.append(BiFPNBlock(feature_size)) 110 | self.bifpn = nn.Sequential(*bifpns) 111 | 112 | def forward(self, 
inputs): 113 | c3, c4, c5 = inputs 114 | 115 | # Calculate the input column of BiFPN 116 | p3_x = self.p3(c3) 117 | p4_x = self.p4(c4) 118 | p5_x = self.p5(c5) 119 | p6_x = self.p6(c5) 120 | p7_x = self.p7(p6_x) 121 | 122 | features = [p3_x, p4_x, p5_x, p6_x, p7_x] 123 | return self.bifpn(features) 124 | -------------------------------------------------------------------------------- /coco_eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from pycocotools.coco import COCO 4 | from pycocotools.cocoeval import COCOeval 5 | 6 | import numpy as np 7 | import json 8 | import os 9 | 10 | import torch 11 | 12 | def evaluate_coco(dataset, model, threshold=0.05): 13 | 14 | model.eval() 15 | 16 | with torch.no_grad(): 17 | 18 | # start collecting results 19 | results = [] 20 | image_ids = [] 21 | 22 | for index in range(len(dataset)): 23 | data = dataset[index] 24 | scale = data['scale'] 25 | 26 | # run network 27 | scores, labels, boxes = model(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0)) 28 | scores = scores.cpu() 29 | labels = labels.cpu() 30 | boxes = boxes.cpu() 31 | 32 | # correct boxes for image scale 33 | boxes /= scale 34 | 35 | if boxes.shape[0] > 0: 36 | # change to (x, y, w, h) (MS COCO standard) 37 | boxes[:, 2] -= boxes[:, 0] 38 | boxes[:, 3] -= boxes[:, 1] 39 | 40 | # compute predicted labels and scores 41 | #for box, score, label in zip(boxes[0], scores[0], labels[0]): 42 | for box_id in range(boxes.shape[0]): 43 | score = float(scores[box_id]) 44 | label = int(labels[box_id]) 45 | box = boxes[box_id, :] 46 | 47 | # scores are sorted, so we can break 48 | if score < threshold: 49 | break 50 | 51 | # append detection for each positively labeled class 52 | image_result = { 53 | 'image_id' : dataset.image_ids[index], 54 | 'category_id' : dataset.label_to_coco_label(label), 55 | 'score' : float(score), 56 | 'bbox' : box.tolist(), 57 | } 58 | 59 | # append detection to results 60 | results.append(image_result) 61 | 62 | # append image to list of processed images 63 | image_ids.append(dataset.image_ids[index]) 64 | 65 | # print progress 66 | print('{}/{}'.format(index, len(dataset)), end='\r') 67 | 68 | if not len(results): 69 | return 70 | 71 | # write output 72 | json.dump(results, open('{}_bbox_results.json'.format(dataset.set_name), 'w'), indent=4) 73 | 74 | # load results in COCO evaluation tool 75 | coco_true = dataset.coco 76 | coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(dataset.set_name)) 77 | 78 | # run COCO evaluation 79 | coco_eval = COCOeval(coco_true, coco_pred, 'bbox') 80 | coco_eval.params.imgIds = image_ids 81 | coco_eval.evaluate() 82 | coco_eval.accumulate() 83 | coco_eval.summarize() 84 | 85 | model.train() 86 | 87 | return 88 | -------------------------------------------------------------------------------- /csv_eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import json 5 | import os 6 | 7 | import torch 8 | 9 | 10 | 11 | def compute_overlap(a, b): 12 | """ 13 | Parameters 14 | ---------- 15 | a: (N, 4) ndarray of float 16 | b: (K, 4) ndarray of float 17 | Returns 18 | ------- 19 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 20 | """ 21 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 22 | 23 | iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 24 | ih = 
np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 25 | 26 | iw = np.maximum(iw, 0) 27 | ih = np.maximum(ih, 0) 28 | 29 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 30 | 31 | ua = np.maximum(ua, np.finfo(float).eps) 32 | 33 | intersection = iw * ih 34 | 35 | return intersection / ua 36 | 37 | 38 | def _compute_ap(recall, precision): 39 | """ Compute the average precision, given the recall and precision curves. 40 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 41 | # Arguments 42 | recall: The recall curve (list). 43 | precision: The precision curve (list). 44 | # Returns 45 | The average precision as computed in py-faster-rcnn. 46 | """ 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], recall, [1.])) 50 | mpre = np.concatenate(([0.], precision, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | 65 | def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None): 66 | """ Get the detections from the retinanet using the generator. 67 | The result is a list of lists such that the size is: 68 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] 69 | # Arguments 70 | dataset : The generator used to run images through the retinanet. 71 | retinanet : The retinanet to run on the images. 72 | score_threshold : The score confidence threshold to use. 73 | max_detections : The maximum number of detections to use per image. 74 | save_path : The path to save the images with visualized detections to. 75 | # Returns 76 | A list of lists containing the detections for each image in the generator. 
77 | """ 78 | all_detections = [[None for i in range(dataset.num_classes())] for j in range(len(dataset))] 79 | 80 | retinanet.eval() 81 | 82 | with torch.no_grad(): 83 | 84 | for index in range(len(dataset)): 85 | data = dataset[index] 86 | scale = data['scale'] 87 | 88 | # run network 89 | scores, labels, boxes = retinanet(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0)) 90 | scores = scores.cpu().numpy() 91 | labels = labels.cpu().numpy() 92 | boxes = boxes.cpu().numpy() 93 | 94 | # correct boxes for image scale 95 | boxes /= scale 96 | 97 | # select indices which have a score above the threshold 98 | indices = np.where(scores > score_threshold)[0] 99 | if indices.shape[0] > 0: 100 | # select those scores 101 | scores = scores[indices] 102 | 103 | # find the order with which to sort the scores 104 | scores_sort = np.argsort(-scores)[:max_detections] 105 | 106 | # select detections 107 | image_boxes = boxes[indices[scores_sort], :] 108 | image_scores = scores[scores_sort] 109 | image_labels = labels[indices[scores_sort]] 110 | image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) 111 | 112 | # copy detections to all_detections 113 | for label in range(dataset.num_classes()): 114 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1] 115 | else: 116 | # copy detections to all_detections 117 | for label in range(dataset.num_classes()): 118 | all_detections[index][label] = np.zeros((0, 5)) 119 | 120 | print('{}/{}'.format(index + 1, len(dataset)), end='\r') 121 | 122 | return all_detections 123 | 124 | 125 | def _get_annotations(generator): 126 | """ Get the ground truth annotations from the generator. 127 | The result is a list of lists such that the size is: 128 | all_detections[num_images][num_classes] = annotations[num_detections, 5] 129 | # Arguments 130 | generator : The generator used to retrieve ground truth annotations. 131 | # Returns 132 | A list of lists containing the annotations for each image in the generator. 133 | """ 134 | all_annotations = [[None for i in range(generator.num_classes())] for j in range(len(generator))] 135 | 136 | for i in range(len(generator)): 137 | # load the annotations 138 | annotations = generator.load_annotations(i) 139 | 140 | # copy detections to all_annotations 141 | for label in range(generator.num_classes()): 142 | all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy() 143 | 144 | print('{}/{}'.format(i + 1, len(generator)), end='\r') 145 | 146 | return all_annotations 147 | 148 | 149 | def evaluate( 150 | generator, 151 | retinanet, 152 | iou_threshold=0.5, 153 | score_threshold=0.05, 154 | max_detections=100, 155 | save_path=None 156 | ): 157 | """ Evaluate a given dataset using a given retinanet. 158 | # Arguments 159 | generator : The generator that represents the dataset to evaluate. 160 | retinanet : The retinanet to evaluate. 161 | iou_threshold : The threshold used to consider when a detection is positive or negative. 162 | score_threshold : The score confidence threshold to use for detections. 163 | max_detections : The maximum number of detections to use per image. 164 | save_path : The path to save images with visualized detections to. 165 | # Returns 166 | A dict mapping class names to mAP scores. 
167 | """ 168 | 169 | 170 | 171 | # gather all detections and annotations 172 | 173 | all_detections = _get_detections(generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) 174 | all_annotations = _get_annotations(generator) 175 | 176 | average_precisions = {} 177 | 178 | for label in range(generator.num_classes()): 179 | false_positives = np.zeros((0,)) 180 | true_positives = np.zeros((0,)) 181 | scores = np.zeros((0,)) 182 | num_annotations = 0.0 183 | 184 | for i in range(len(generator)): 185 | detections = all_detections[i][label] 186 | annotations = all_annotations[i][label] 187 | num_annotations += annotations.shape[0] 188 | detected_annotations = [] 189 | 190 | for d in detections: 191 | scores = np.append(scores, d[4]) 192 | 193 | if annotations.shape[0] == 0: 194 | false_positives = np.append(false_positives, 1) 195 | true_positives = np.append(true_positives, 0) 196 | continue 197 | 198 | overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations) 199 | assigned_annotation = np.argmax(overlaps, axis=1) 200 | max_overlap = overlaps[0, assigned_annotation] 201 | 202 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 203 | false_positives = np.append(false_positives, 0) 204 | true_positives = np.append(true_positives, 1) 205 | detected_annotations.append(assigned_annotation) 206 | else: 207 | false_positives = np.append(false_positives, 1) 208 | true_positives = np.append(true_positives, 0) 209 | 210 | # no annotations -> AP for this class is 0 (is this correct?) 211 | if num_annotations == 0: 212 | average_precisions[label] = 0, 0 213 | continue 214 | 215 | # sort by score 216 | indices = np.argsort(-scores) 217 | false_positives = false_positives[indices] 218 | true_positives = true_positives[indices] 219 | 220 | # compute false positives and true positives 221 | false_positives = np.cumsum(false_positives) 222 | true_positives = np.cumsum(true_positives) 223 | 224 | # compute recall and precision 225 | recall = true_positives / num_annotations 226 | precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) 227 | 228 | # compute average precision 229 | average_precision = _compute_ap(recall, precision) 230 | average_precisions[label] = average_precision, num_annotations 231 | 232 | print('\nmAP:') 233 | for label in range(generator.num_classes()): 234 | label_name = generator.label_to_name(label) 235 | print('{}: {}'.format(label_name, average_precisions[label][0])) 236 | 237 | return average_precisions 238 | 239 | -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import sys 3 | import os 4 | import torch 5 | import numpy as np 6 | import random 7 | import csv 8 | 9 | from torch.utils.data import Dataset, DataLoader 10 | from torchvision import transforms, utils 11 | from torch.utils.data.sampler import Sampler 12 | 13 | from pycocotools.coco import COCO 14 | 15 | import skimage.io 16 | import skimage.transform 17 | import skimage.color 18 | import skimage 19 | 20 | from PIL import Image 21 | 22 | 23 | class CocoDataset(Dataset): 24 | """Coco dataset.""" 25 | 26 | def __init__(self, root_dir, set_name='train2017', transform=None): 27 | """ 28 | Args: 29 | root_dir (string): COCO directory. 
30 | transform (callable, optional): Optional transform to be applied 31 | on a sample. 32 | """ 33 | self.root_dir = root_dir 34 | self.set_name = set_name 35 | self.transform = transform 36 | 37 | self.coco = COCO(os.path.join(self.root_dir, 'annotations', 'instances_' + self.set_name + '.json')) 38 | self.image_ids = self.coco.getImgIds() 39 | 40 | self.load_classes() 41 | 42 | def load_classes(self): 43 | # load class names (name -> label) 44 | categories = self.coco.loadCats(self.coco.getCatIds()) 45 | categories.sort(key=lambda x: x['id']) 46 | 47 | self.classes = {} 48 | self.coco_labels = {} 49 | self.coco_labels_inverse = {} 50 | for c in categories: 51 | self.coco_labels[len(self.classes)] = c['id'] 52 | self.coco_labels_inverse[c['id']] = len(self.classes) 53 | self.classes[c['name']] = len(self.classes) 54 | 55 | # also load the reverse (label -> name) 56 | self.labels = {} 57 | for key, value in self.classes.items(): 58 | self.labels[value] = key 59 | 60 | def __len__(self): 61 | return len(self.image_ids) 62 | 63 | def __getitem__(self, idx): 64 | 65 | img = self.load_image(idx) 66 | annot = self.load_annotations(idx) 67 | sample = {'img': img, 'annot': annot} 68 | if self.transform: 69 | sample = self.transform(sample) 70 | 71 | return sample 72 | 73 | def load_image(self, image_index): 74 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0] 75 | path = os.path.join(self.root_dir, 'images', self.set_name, image_info['file_name']) 76 | img = skimage.io.imread(path) 77 | 78 | if len(img.shape) == 2: 79 | img = skimage.color.gray2rgb(img) 80 | 81 | return img.astype(np.float32)/255.0 82 | 83 | def load_annotations(self, image_index): 84 | # get ground truth annotations 85 | annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False) 86 | annotations = np.zeros((0, 5)) 87 | 88 | # some images appear to miss annotations (like image with id 257034) 89 | if len(annotations_ids) == 0: 90 | return annotations 91 | 92 | # parse annotations 93 | coco_annotations = self.coco.loadAnns(annotations_ids) 94 | for idx, a in enumerate(coco_annotations): 95 | 96 | # some annotations have basically no width / height, skip them 97 | if a['bbox'][2] < 1 or a['bbox'][3] < 1: 98 | continue 99 | 100 | annotation = np.zeros((1, 5)) 101 | annotation[0, :4] = a['bbox'] 102 | annotation[0, 4] = self.coco_label_to_label(a['category_id']) 103 | annotations = np.append(annotations, annotation, axis=0) 104 | 105 | # transform from [x, y, w, h] to [x1, y1, x2, y2] 106 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2] 107 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3] 108 | 109 | return annotations 110 | 111 | def coco_label_to_label(self, coco_label): 112 | return self.coco_labels_inverse[coco_label] 113 | 114 | 115 | def label_to_coco_label(self, label): 116 | return self.coco_labels[label] 117 | 118 | def image_aspect_ratio(self, image_index): 119 | image = self.coco.loadImgs(self.image_ids[image_index])[0] 120 | return float(image['width']) / float(image['height']) 121 | 122 | def num_classes(self): 123 | return 80 124 | 125 | 126 | class CSVDataset(Dataset): 127 | """CSV dataset.""" 128 | 129 | def __init__(self, train_file, class_list, transform=None): 130 | """ 131 | Args: 132 | train_file (string): CSV file with training annotations 133 | annotations (string): CSV file with class list 134 | test_file (string, optional): CSV file with testing annotations 135 | """ 136 | self.train_file = train_file 137 | self.class_list = class_list 138 | 
self.transform = transform 139 | 140 | # parse the provided class file 141 | try: 142 | with self._open_for_csv(self.class_list) as file: 143 | self.classes = self.load_classes(csv.reader(file, delimiter=',')) 144 | except ValueError as e: 145 | raise_from(ValueError('invalid CSV class file: {}: {}'.format(self.class_list, e)), None) 146 | 147 | self.labels = {} 148 | for key, value in self.classes.items(): 149 | self.labels[value] = key 150 | 151 | # csv with img_path, x1, y1, x2, y2, class_name 152 | try: 153 | with self._open_for_csv(self.train_file) as file: 154 | self.image_data = self._read_annotations(csv.reader(file, delimiter=','), self.classes) 155 | except ValueError as e: 156 | raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(self.train_file, e)), None) 157 | self.image_names = list(self.image_data.keys()) 158 | 159 | def _parse(self, value, function, fmt): 160 | """ 161 | Parse a string into a value, and format a nice ValueError if it fails. 162 | Returns `function(value)`. 163 | Any `ValueError` raised is catched and a new `ValueError` is raised 164 | with message `fmt.format(e)`, where `e` is the caught `ValueError`. 165 | """ 166 | try: 167 | return function(value) 168 | except ValueError as e: 169 | raise_from(ValueError(fmt.format(e)), None) 170 | 171 | def _open_for_csv(self, path): 172 | """ 173 | Open a file with flags suitable for csv.reader. 174 | This is different for python2 it means with mode 'rb', 175 | for python3 this means 'r' with "universal newlines". 176 | """ 177 | if sys.version_info[0] < 3: 178 | return open(path, 'rb') 179 | else: 180 | return open(path, 'r', newline='') 181 | 182 | 183 | def load_classes(self, csv_reader): 184 | result = {} 185 | 186 | for line, row in enumerate(csv_reader): 187 | line += 1 188 | 189 | try: 190 | class_name, class_id = row 191 | except ValueError: 192 | raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None) 193 | class_id = self._parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line)) 194 | 195 | if class_name in result: 196 | raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name)) 197 | result[class_name] = class_id 198 | return result 199 | 200 | 201 | def __len__(self): 202 | return len(self.image_names) 203 | 204 | def __getitem__(self, idx): 205 | 206 | img = self.load_image(idx) 207 | annot = self.load_annotations(idx) 208 | sample = {'img': img, 'annot': annot} 209 | if self.transform: 210 | sample = self.transform(sample) 211 | 212 | return sample 213 | 214 | def load_image(self, image_index): 215 | img = skimage.io.imread(self.image_names[image_index]) 216 | 217 | if len(img.shape) == 2: 218 | img = skimage.color.gray2rgb(img) 219 | 220 | return img.astype(np.float32)/255.0 221 | 222 | def load_annotations(self, image_index): 223 | # get ground truth annotations 224 | annotation_list = self.image_data[self.image_names[image_index]] 225 | annotations = np.zeros((0, 5)) 226 | 227 | # some images appear to miss annotations (like image with id 257034) 228 | if len(annotation_list) == 0: 229 | return annotations 230 | 231 | # parse annotations 232 | for idx, a in enumerate(annotation_list): 233 | # some annotations have basically no width / height, skip them 234 | x1 = a['x1'] 235 | x2 = a['x2'] 236 | y1 = a['y1'] 237 | y2 = a['y2'] 238 | 239 | if (x2-x1) < 1 or (y2-y1) < 1: 240 | continue 241 | 242 | annotation = np.zeros((1, 5)) 243 | 244 | annotation[0, 0] = x1 245 | annotation[0, 1] = y1 246 | annotation[0, 2] = 
x2 247 | annotation[0, 3] = y2 248 | 249 | annotation[0, 4] = self.name_to_label(a['class']) 250 | annotations = np.append(annotations, annotation, axis=0) 251 | 252 | return annotations 253 | 254 | def _read_annotations(self, csv_reader, classes): 255 | result = {} 256 | for line, row in enumerate(csv_reader): 257 | line += 1 258 | 259 | try: 260 | img_file, x1, y1, x2, y2, class_name = row[:6] 261 | except ValueError: 262 | raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None) 263 | 264 | if img_file not in result: 265 | result[img_file] = [] 266 | 267 | # If a row contains only an image path, it's an image without annotations. 268 | if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''): 269 | continue 270 | 271 | x1 = self._parse(x1, int, 'line {}: malformed x1: {{}}'.format(line)) 272 | y1 = self._parse(y1, int, 'line {}: malformed y1: {{}}'.format(line)) 273 | x2 = self._parse(x2, int, 'line {}: malformed x2: {{}}'.format(line)) 274 | y2 = self._parse(y2, int, 'line {}: malformed y2: {{}}'.format(line)) 275 | 276 | # Check that the bounding box is valid. 277 | if x2 <= x1: 278 | raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) 279 | if y2 <= y1: 280 | raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) 281 | 282 | # check if the current class name is correctly present 283 | if class_name not in classes: 284 | raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes)) 285 | 286 | result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name}) 287 | return result 288 | 289 | def name_to_label(self, name): 290 | return self.classes[name] 291 | 292 | def label_to_name(self, label): 293 | return self.labels[label] 294 | 295 | def num_classes(self): 296 | return max(self.classes.values()) + 1 297 | 298 | def image_aspect_ratio(self, image_index): 299 | image = Image.open(self.image_names[image_index]) 300 | return float(image.width) / float(image.height) 301 | 302 | 303 | def collater(data): 304 | 305 | imgs = [s['img'] for s in data] 306 | annots = [s['annot'] for s in data] 307 | scales = [s['scale'] for s in data] 308 | 309 | widths = [int(s.shape[0]) for s in imgs] 310 | heights = [int(s.shape[1]) for s in imgs] 311 | batch_size = len(imgs) 312 | 313 | max_width = np.array(widths).max() 314 | max_height = np.array(heights).max() 315 | 316 | padded_imgs = torch.zeros(batch_size, max_width, max_height, 3) 317 | 318 | for i in range(batch_size): 319 | img = imgs[i] 320 | padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img 321 | 322 | max_num_annots = max(annot.shape[0] for annot in annots) 323 | 324 | if max_num_annots > 0: 325 | 326 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 327 | 328 | if max_num_annots > 0: 329 | for idx, annot in enumerate(annots): 330 | #print(annot.shape) 331 | if annot.shape[0] > 0: 332 | annot_padded[idx, :annot.shape[0], :] = annot 333 | else: 334 | annot_padded = torch.ones((len(annots), 1, 5)) * -1 335 | 336 | 337 | padded_imgs = padded_imgs.permute(0, 3, 1, 2) 338 | 339 | return {'img': padded_imgs, 'annot': annot_padded, 'scale': scales} 340 | 341 | class Resizer(object): 342 | """Convert ndarrays in sample to Tensors.""" 343 | def __init__(self, img_size): 344 | self.img_size = img_size 345 | 346 | def __call__(self, sample): 347 | image, annots = sample['img'], sample['annot'] 348 | 349 | rows, cols, cns = 
image.shape 350 | 351 | largest_side = max(rows, cols) 352 | 353 | scale = self.img_size / largest_side 354 | 355 | # resize the image with the computed scale 356 | image = skimage.transform.resize(image, (int(round(rows*scale)), int(round((cols*scale))))) 357 | rows, cols, cns = image.shape 358 | 359 | new_image = np.zeros((self.img_size, self.img_size, cns)).astype(np.float32) 360 | new_image[:rows, :cols, :] = image.astype(np.float32) 361 | 362 | annots[:, :4] *= scale 363 | 364 | return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale} 365 | 366 | 367 | class Augmenter(object): 368 | """Convert ndarrays in sample to Tensors.""" 369 | 370 | def __call__(self, sample, flip_x=0.5): 371 | 372 | if np.random.rand() < flip_x: 373 | image, annots = sample['img'], sample['annot'] 374 | image = image[:, ::-1, :] 375 | 376 | rows, cols, channels = image.shape 377 | 378 | x1 = annots[:, 0].copy() 379 | x2 = annots[:, 2].copy() 380 | 381 | x_tmp = x1.copy() 382 | 383 | annots[:, 0] = cols - x2 384 | annots[:, 2] = cols - x_tmp 385 | 386 | sample = {'img': image, 'annot': annots} 387 | 388 | return sample 389 | 390 | 391 | class Normalizer(object): 392 | 393 | def __init__(self): 394 | self.mean = np.array([[[0.485, 0.456, 0.406]]]) 395 | self.std = np.array([[[0.229, 0.224, 0.225]]]) 396 | 397 | def __call__(self, sample): 398 | 399 | image, annots = sample['img'], sample['annot'] 400 | 401 | return {'img':((image.astype(np.float32)-self.mean)/self.std), 'annot': annots} 402 | 403 | class UnNormalizer(object): 404 | def __init__(self, mean=None, std=None): 405 | if mean == None: 406 | self.mean = [0.485, 0.456, 0.406] 407 | else: 408 | self.mean = mean 409 | if std == None: 410 | self.std = [0.229, 0.224, 0.225] 411 | else: 412 | self.std = std 413 | 414 | def __call__(self, tensor): 415 | """ 416 | Args: 417 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 418 | Returns: 419 | Tensor: Normalized image. 
420 | """ 421 | for t, m, s in zip(tensor, self.mean, self.std): 422 | t.mul_(s).add_(m) 423 | return tensor 424 | 425 | 426 | class AspectRatioBasedSampler(Sampler): 427 | 428 | def __init__(self, data_source, batch_size, drop_last): 429 | self.data_source = data_source 430 | self.batch_size = batch_size 431 | self.drop_last = drop_last 432 | self.groups = self.group_images() 433 | 434 | def __iter__(self): 435 | random.shuffle(self.groups) 436 | for group in self.groups: 437 | yield group 438 | 439 | def __len__(self): 440 | if self.drop_last: 441 | return len(self.data_source) // self.batch_size 442 | else: 443 | return (len(self.data_source) + self.batch_size - 1) // self.batch_size 444 | 445 | def group_images(self): 446 | # determine the order of the images 447 | order = list(range(len(self.data_source))) 448 | order.sort(key=lambda x: self.data_source.image_aspect_ratio(x)) 449 | 450 | # divide into groups, one group = one batch 451 | return [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)] 452 | -------------------------------------------------------------------------------- /efficientdet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import math 4 | import time 5 | import torch.utils.model_zoo as model_zoo 6 | from utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes 7 | from anchors import Anchors 8 | import losses 9 | from torchvision.ops import nms 10 | 11 | from efficientnet_pytorch import EfficientNet 12 | 13 | from bifpn import BiFPN 14 | 15 | from timeitdec import timeit 16 | 17 | w_bifpn = [64, 88, 112, 160, 224, 288, 384, 384] 18 | 19 | class RegressionModel(nn.Module): 20 | def __init__(self, num_features_in, d_class=3, num_anchors=9, feature_size=64): 21 | super(RegressionModel, self).__init__() 22 | 23 | prediction_net = [] 24 | for _ in range(d_class): 25 | prediction_net.append(nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)) 26 | prediction_net.append(nn.ReLU()) 27 | num_features_in = feature_size 28 | self.prediction_net = nn.Sequential(*prediction_net) 29 | 30 | self.output = nn.Conv2d(feature_size, num_anchors*4, kernel_size=3, padding=1) 31 | 32 | def forward(self, x): 33 | out = self.prediction_net(x) 34 | out = self.output(out) 35 | 36 | # out is B x C x W x H, with C = 4*num_anchors 37 | out = out.permute(0, 2, 3, 1) 38 | 39 | return out.contiguous().view(out.shape[0], -1, 4) 40 | 41 | class ClassificationModel(nn.Module): 42 | def __init__(self, num_features_in, num_anchors=9, d_class=3, num_classes=80, prior=0.01, feature_size=64): 43 | super(ClassificationModel, self).__init__() 44 | 45 | self.num_classes = num_classes 46 | self.num_anchors = num_anchors 47 | 48 | classification_net = [] 49 | for _ in range(d_class): 50 | classification_net.append(nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)) 51 | classification_net.append(nn.ReLU()) 52 | num_features_in = feature_size 53 | self.classification_net = nn.Sequential(*classification_net) 54 | 55 | self.output = nn.Conv2d(feature_size, num_anchors*num_classes, kernel_size=3, padding=1) 56 | self.output_act = nn.Sigmoid() 57 | 58 | def forward(self, x): 59 | 60 | out = self.classification_net(x) 61 | 62 | out = self.output(out) 63 | out = self.output_act(out) 64 | 65 | # out is B x C x W x H, with C = n_classes + n_anchors 66 | out1 = out.permute(0, 2, 3, 1) 67 | 68 | batch_size, width, height, channels = out1.shape 69 
| 70 | out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes) 71 | 72 | return out2.contiguous().view(x.shape[0], -1, self.num_classes) 73 | 74 | class EfficientDet(nn.Module): 75 | 76 | def __init__(self, num_classes, block, pretrained=False, phi=0): 77 | self.inplanes = w_bifpn[phi] 78 | super(EfficientDet, self).__init__() 79 | efficientnet = EfficientNet.from_pretrained(f'efficientnet-b{phi}') 80 | blocks = [] 81 | count = 0 82 | fpn_sizes = [] 83 | for block in efficientnet._blocks: 84 | blocks.append(block) 85 | if block._depthwise_conv.stride == [2, 2]: 86 | count += 1 87 | fpn_sizes.append(block._project_conv.out_channels) 88 | if len(fpn_sizes) >= 4: 89 | break 90 | 91 | self.efficientnet = nn.Sequential(efficientnet._conv_stem, efficientnet._bn0, *blocks) 92 | num_layers = min(phi+2, 8) 93 | self.fpn = BiFPN(fpn_sizes[1:], feature_size=w_bifpn[phi], num_layers=num_layers) 94 | 95 | d_class = 3 + (phi // 3) 96 | self.regressionModel = RegressionModel(w_bifpn[phi], feature_size=w_bifpn[phi], d_class=d_class) 97 | self.classificationModel = ClassificationModel(w_bifpn[phi], feature_size=w_bifpn[phi], d_class=d_class, num_classes=num_classes) 98 | 99 | self.anchors = Anchors() 100 | 101 | self.regressBoxes = BBoxTransform() 102 | 103 | self.clipBoxes = ClipBoxes() 104 | 105 | self.focalLoss = losses.FocalLoss().cuda() 106 | 107 | for m in self.modules(): 108 | if isinstance(m, nn.Conv2d): 109 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 110 | m.weight.data.normal_(0, math.sqrt(2. / n)) 111 | elif isinstance(m, nn.BatchNorm2d): 112 | m.weight.data.fill_(1) 113 | m.bias.data.zero_() 114 | 115 | prior = 0.01 116 | 117 | self.classificationModel.output.weight.data.fill_(0) 118 | self.classificationModel.output.bias.data.fill_(-math.log((1.0-prior)/prior)) 119 | 120 | self.regressionModel.output.weight.data.fill_(0) 121 | self.regressionModel.output.bias.data.fill_(0) 122 | 123 | self.freeze_bn() 124 | 125 | def _make_layer(self, block, planes, blocks, stride=1): 126 | downsample = None 127 | if stride != 1 or self.inplanes != planes * block.expansion: 128 | downsample = nn.Sequential( 129 | nn.Conv2d(self.inplanes, planes * block.expansion, 130 | kernel_size=1, stride=stride, bias=False), 131 | nn.BatchNorm2d(planes * block.expansion), 132 | ) 133 | 134 | layers = [] 135 | layers.append(block(self.inplanes, planes, stride, downsample)) 136 | self.inplanes = planes * block.expansion 137 | for i in range(1, blocks): 138 | layers.append(block(self.inplanes, planes)) 139 | 140 | return nn.Sequential(*layers) 141 | 142 | def freeze_bn(self): 143 | '''Freeze BatchNorm layers.''' 144 | for layer in self.modules(): 145 | if isinstance(layer, nn.BatchNorm2d): 146 | layer.eval() 147 | 148 | def forward(self, inputs): 149 | 150 | if self.training: 151 | img_batch, annotations = inputs 152 | else: 153 | img_batch = inputs 154 | 155 | x = self.efficientnet[0](img_batch) 156 | x = self.efficientnet[1](x) 157 | 158 | # Forward batch trough backbone 159 | features = [] 160 | for block in self.efficientnet[2:]: 161 | x = block(x) 162 | if block._depthwise_conv.stride == [2, 2]: 163 | features.append(x) 164 | 165 | features = self.fpn(features[1:]) 166 | 167 | regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1) 168 | 169 | classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1) 170 | 171 | anchors = self.anchors(img_batch) 172 | 173 | if self.training: 174 | return 
self.focalLoss(classification, regression, anchors, annotations) 175 | else: 176 | transformed_anchors = self.regressBoxes(anchors, regression) 177 | transformed_anchors = self.clipBoxes(transformed_anchors, img_batch) 178 | 179 | scores = torch.max(classification, dim=2, keepdim=True)[0] 180 | 181 | scores_over_thresh = (scores>0.05)[0, :, 0] 182 | 183 | if scores_over_thresh.sum() == 0: 184 | # no boxes to NMS, just return 185 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] 186 | 187 | classification = classification[:, scores_over_thresh, :] 188 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :] 189 | scores = scores[:, scores_over_thresh, :] 190 | # torchvision's nms expects boxes of shape [N, 4] and scores of shape [N], so drop the batch dimension 191 | anchors_nms_idx = nms(transformed_anchors[0, :, :], scores[0, :, 0], 0.5) 192 | 193 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) 194 | 195 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] 196 | 197 | 198 | def efficientdet(num_classes, pretrained=True, **kwargs): 199 | """Constructs an EfficientDet model. 200 | Args: 201 | pretrained (bool): If True, uses an EfficientNet backbone pre-trained on ImageNet. 202 | """ 203 | model = EfficientDet(num_classes, Bottleneck, pretrained=pretrained, **kwargs) 204 | return model -------------------------------------------------------------------------------- /images/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tristandb/EfficientDet-PyTorch/b86f3661c9167ed9394bdfd430ea4673ad5177c7/images/1.jpg -------------------------------------------------------------------------------- /images/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tristandb/EfficientDet-PyTorch/b86f3661c9167ed9394bdfd430ea4673ad5177c7/images/3.jpg -------------------------------------------------------------------------------- /images/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tristandb/EfficientDet-PyTorch/b86f3661c9167ed9394bdfd430ea4673ad5177c7/images/4.jpg -------------------------------------------------------------------------------- /images/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tristandb/EfficientDet-PyTorch/b86f3661c9167ed9394bdfd430ea4673ad5177c7/images/5.jpg -------------------------------------------------------------------------------- /images/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tristandb/EfficientDet-PyTorch/b86f3661c9167ed9394bdfd430ea4673ad5177c7/images/6.jpg -------------------------------------------------------------------------------- /images/7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tristandb/EfficientDet-PyTorch/b86f3661c9167ed9394bdfd430ea4673ad5177c7/images/7.jpg -------------------------------------------------------------------------------- /images/8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tristandb/EfficientDet-PyTorch/b86f3661c9167ed9394bdfd430ea4673ad5177c7/images/8.jpg -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 |
import torch 4 | import torch.nn as nn 5 | from timeitdec import timeit 6 | 7 | def calc_iou(a, b): 8 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 9 | 10 | iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) 11 | ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) 12 | 13 | iw = torch.clamp(iw, min=0) 14 | ih = torch.clamp(ih, min=0) 15 | 16 | ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih 17 | 18 | ua = torch.clamp(ua, min=1e-8) 19 | 20 | intersection = iw * ih 21 | 22 | IoU = intersection / ua 23 | 24 | return IoU 25 | 26 | class FocalLoss(nn.Module): 27 | #def __init__(self): 28 | 29 | def forward(self, classifications, regressions, anchors, annotations): 30 | #print("classifications", classifications.shape) 31 | #print("regressions", regressions.shape) 32 | #print("anchors", anchors.shape) 33 | #print("annotations", annotations.shape) 34 | #print(annotations) 35 | alpha = 0.25 36 | gamma = 1.5 37 | batch_size = classifications.shape[0] 38 | classification_losses = [] 39 | regression_losses = [] 40 | 41 | anchor = anchors[0, :, :] 42 | 43 | anchor_widths = anchor[:, 2] - anchor[:, 0] 44 | anchor_heights = anchor[:, 3] - anchor[:, 1] 45 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths 46 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights 47 | 48 | for j in range(batch_size): 49 | 50 | classification = classifications[j, :, :] 51 | regression = regressions[j, :, :] 52 | 53 | bbox_annotation = annotations[j, :, :] 54 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] 55 | 56 | if bbox_annotation.shape[0] == 0: 57 | regression_losses.append(torch.tensor(0).float().cuda()) 58 | classification_losses.append(torch.tensor(0).float().cuda()) 59 | 60 | continue 61 | 62 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) 63 | 64 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations 65 | 66 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 67 | 68 | #import pdb 69 | #pdb.set_trace() 70 | 71 | # compute the loss for classification 72 | targets = torch.ones(classification.shape) * -1 73 | targets = targets.cuda() 74 | 75 | targets[torch.lt(IoU_max, 0.4), :] = 0 76 | 77 | positive_indices = torch.ge(IoU_max, 0.5) 78 | 79 | num_positive_anchors = positive_indices.sum() 80 | 81 | assigned_annotations = bbox_annotation[IoU_argmax, :] 82 | 83 | targets[positive_indices, :] = 0 84 | targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1 85 | 86 | alpha_factor = torch.ones(targets.shape).cuda() * alpha 87 | 88 | alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) 89 | focal_weight = torch.where(torch.eq(targets, 1.), 1. 
- classification, classification) 90 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma) 91 | 92 | bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) 93 | 94 | # cls_loss = focal_weight * torch.pow(bce, gamma) 95 | cls_loss = focal_weight * bce 96 | 97 | cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda()) 98 | 99 | classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0)) 100 | 101 | # compute the loss for regression 102 | 103 | if positive_indices.sum() > 0: 104 | assigned_annotations = assigned_annotations[positive_indices, :] 105 | 106 | anchor_widths_pi = anchor_widths[positive_indices] 107 | anchor_heights_pi = anchor_heights[positive_indices] 108 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices] 109 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices] 110 | 111 | gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0] 112 | gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1] 113 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths 114 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights 115 | 116 | # clip widths to 1 117 | gt_widths = torch.clamp(gt_widths, min=1) 118 | gt_heights = torch.clamp(gt_heights, min=1) 119 | 120 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi 121 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi 122 | targets_dw = torch.log(gt_widths / anchor_widths_pi) 123 | targets_dh = torch.log(gt_heights / anchor_heights_pi) 124 | 125 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)) 126 | targets = targets.t() 127 | 128 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() 129 | 130 | 131 | negative_indices = ~ positive_indices 132 | 133 | regression_diff = torch.abs(targets - regression[positive_indices, :]) 134 | 135 | regression_loss = torch.where( 136 | torch.le(regression_diff, 1.0 / 9.0), 137 | 0.5 * 9.0 * torch.pow(regression_diff, 2), 138 | regression_diff - 0.5 / 9.0 139 | ) 140 | regression_losses.append(regression_loss.mean()) 141 | else: 142 | regression_losses.append(torch.tensor(0).float().cuda()) 143 | 144 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True) 145 | 146 | 147 | 148 | import torch 149 | import torch.nn as nn 150 | import torch.nn.functional as F 151 | 152 | from torch.autograd import Variable 153 | 154 | import time 155 | 156 | 157 | class FocalLoss1(nn.Module): 158 | def __init__(self, num_classes, device): 159 | super(FocalLoss, self).__init__() 160 | self.num_classes = num_classes 161 | self.device = device 162 | 163 | def focal_loss(self, x, y): 164 | '''Focal loss. 165 | Args: 166 | x: (tensor) sized [N,D]. 167 | y: (tensor) sized [N,]. 168 | Return: 169 | (tensor) focal loss. 170 | ''' 171 | alpha = 0.25 172 | gamma = 2 173 | 174 | t = F.one_hot(y.data, 1+self.num_classes) # [N,21] 175 | t = t[:,1:] # exclude background 176 | t = Variable(t) 177 | 178 | p = x.sigmoid() 179 | pt = p*t + (1-p)*(1-t) # pt = p if t > 0 else 1-p 180 | w = alpha*t + (1-alpha)*(1-t) # w = alpha if t > 0 else 1-alpha 181 | w = w * (1-pt).pow(gamma) 182 | return F.binary_cross_entropy_with_logits(x, t, w, reduction='sum') 183 | 184 | def focal_loss_alt(self, x, y, alpha=0.25, gamma=1.5): 185 | '''Focal loss alternative. 186 | 187 | Args: 188 | x: (tensor) sized [N,D]. 189 | y: (tensor) sized [N,]. 190 | 191 | Return: 192 | (tensor) focal loss. 
193 | ''' 194 | t = F.one_hot(y, self.num_classes+1) 195 | t = t[:,1:] 196 | 197 | xt = x*(2*t-1) # xt = x if t > 0 else -x 198 | pt = (2*xt+1).sigmoid() 199 | pt = pt.clamp(1e-7, 1.0) 200 | w = (0+alpha)*(0+t) + (1-alpha)*(1-t) 201 | loss = -w*pt.log() / gamma 202 | return loss.sum() 203 | 204 | 205 | def forward(self, loc_preds, loc_targets, cls_preds, cls_targets): 206 | '''Compute loss between (loc_preds, loc_targets) and (cls_preds, cls_targets). 207 | Args: 208 | loc_preds: (tensor) predicted locations, sized [batch_size, #anchors, 4]. 209 | loc_targets: (tensor) encoded target locations, sized [batch_size, #anchors, 4]. 210 | cls_preds: (tensor) predicted class confidences, sized [batch_size, #anchors, #classes]. 211 | cls_targets: (tensor) encoded target labels, sized [batch_size, #anchors]. 212 | loss: 213 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + FocalLoss(cls_preds, cls_targets). 214 | ''' 215 | 216 | batch_size, num_boxes = cls_targets.size() 217 | pos = cls_targets > 0 # [N,#anchors] 218 | num_pos = pos.data.long().sum() 219 | 220 | ################################################################ 221 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 222 | ################################################################ 223 | mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,#anchors,4] 224 | masked_loc_preds = loc_preds[mask].view(-1,4) # [#pos,4] 225 | masked_loc_targets = loc_targets[mask].view(-1,4) # [#pos,4] 226 | loc_loss = F.smooth_l1_loss(masked_loc_preds, masked_loc_targets, reduction='sum') 227 | 228 | ################################################################ 229 | # cls_loss = FocalLoss(loc_preds, loc_targets) 230 | ################################################################ 231 | pos_neg = cls_targets > -1 # exclude ignored anchors 232 | num_peg = pos_neg.data.long().sum() 233 | mask = pos_neg.unsqueeze(2).expand_as(cls_preds) 234 | masked_cls_preds = cls_preds[mask].view(-1,self.num_classes) 235 | cls_loss = self.focal_loss_alt(masked_cls_preds, cls_targets[pos_neg]) 236 | 237 | #print('loc_loss: %.3f | cls_loss: %.3f' % (loc_loss.data[0]/num_pos, cls_loss.data[0]/num_peg), end=' | ') 238 | loss = loc_loss/num_pos + cls_loss/num_peg 239 | return loss -------------------------------------------------------------------------------- /oid_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | import csv 4 | import json 5 | import os 6 | import warnings 7 | 8 | import numpy as np 9 | import skimage 10 | import skimage.color 11 | import skimage.io 12 | import skimage.transform 13 | from PIL import Image 14 | from torch.utils.data import Dataset 15 | 16 | 17 | def get_labels(metadata_dir, version='v4'): 18 | if version == 'v4' or version == 'challenge2018': 19 | csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv' 20 | 21 | boxable_classes_descriptions = os.path.join(metadata_dir, csv_file) 22 | id_to_labels = {} 23 | cls_index = {} 24 | 25 | i = 0 26 | with open(boxable_classes_descriptions) as f: 27 | for row in csv.reader(f): 28 | # make sure the csv row is not empty (usually the last one) 29 | if len(row): 30 | label = row[0] 31 | description = row[1].replace("\"", "").replace("'", "").replace('`', '') 32 | 33 | id_to_labels[i] = description 34 | cls_index[label] = i 35 | 36 | i += 1 37 | else: 38 | trainable_classes_path = os.path.join(metadata_dir, 
'classes-bbox-trainable.txt')
39 |         description_path = os.path.join(metadata_dir, 'class-descriptions.csv')
40 | 
41 |         description_table = {}
42 |         with open(description_path) as f:
43 |             for row in csv.reader(f):
44 |                 # make sure the csv row is not empty (usually the last one)
45 |                 if len(row):
46 |                     description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '')
47 | 
48 |         with open(trainable_classes_path, 'r') as f:
49 |             trainable_classes = f.read().split('\n')
50 | 
51 |         id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)])
52 |         cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)])
53 | 
54 |     return id_to_labels, cls_index
55 | 
56 | 
57 | def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'):
58 |     validation_image_ids = {}
59 | 
60 |     if version == 'v4':
61 |         annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset))
62 |     elif version == 'challenge2018':
63 |         validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv')
64 | 
65 |         with open(validation_image_ids_path, 'r') as csv_file:
66 |             reader = csv.DictReader(csv_file, fieldnames=['ImageID'])
67 |             next(reader)
68 |             for line, row in enumerate(reader):
69 |                 image_id = row['ImageID']
70 |                 validation_image_ids[image_id] = True
71 | 
72 |         annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv')
73 |     else:
74 |         annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv')
75 | 
76 |     fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence',
77 |                   'XMin', 'XMax', 'YMin', 'YMax',
78 |                   'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside']
79 | 
80 |     id_annotations = dict()
81 |     with open(annotations_path, 'r') as csv_file:
82 |         reader = csv.DictReader(csv_file, fieldnames=fieldnames)
83 |         next(reader)
84 | 
85 |         images_sizes = {}
86 |         for line, row in enumerate(reader):
87 |             frame = row['ImageID']
88 | 
89 |             if version == 'challenge2018':
90 |                 if subset == 'train':
91 |                     if frame in validation_image_ids:
92 |                         continue
93 |                 elif subset == 'validation':
94 |                     if frame not in validation_image_ids:
95 |                         continue
96 |                 else:
97 |                     raise NotImplementedError('This generator handles only the train and validation subsets')
98 | 
99 |             class_name = row['LabelName']
100 | 
101 |             if class_name not in cls_index:
102 |                 continue
103 | 
104 |             cls_id = cls_index[class_name]
105 | 
106 |             if version == 'challenge2018':
107 |                 # We recommend participants to use the provided subset of the training set as a validation set.
108 |                 # This is preferable over using the V4 val/test sets, as the training set is more densely annotated.
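                # For challenge2018, train and validation images alike are read from
                # images/train, so with a hypothetical main_dir='/data/oid' and
                # frame='0001eeaf4aed83f9' the path built below resolves to
                # '/data/oid/images/train/0001eeaf4aed83f9.jpg'.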
109 | img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg') 110 | else: 111 | img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg') 112 | 113 | if frame in images_sizes: 114 | width, height = images_sizes[frame] 115 | else: 116 | try: 117 | with Image.open(img_path) as img: 118 | width, height = img.width, img.height 119 | images_sizes[frame] = (width, height) 120 | except Exception as ex: 121 | if version == 'challenge2018': 122 | raise ex 123 | continue 124 | 125 | x1 = float(row['XMin']) 126 | x2 = float(row['XMax']) 127 | y1 = float(row['YMin']) 128 | y2 = float(row['YMax']) 129 | 130 | x1_int = int(round(x1 * width)) 131 | x2_int = int(round(x2 * width)) 132 | y1_int = int(round(y1 * height)) 133 | y2_int = int(round(y2 * height)) 134 | 135 | # Check that the bounding box is valid. 136 | if x2 <= x1: 137 | raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) 138 | if y2 <= y1: 139 | raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) 140 | 141 | if y2_int == y1_int: 142 | warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1)) 143 | continue 144 | 145 | if x2_int == x1_int: 146 | warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1)) 147 | continue 148 | 149 | img_id = row['ImageID'] 150 | annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2} 151 | 152 | if img_id in id_annotations: 153 | annotations = id_annotations[img_id] 154 | annotations['boxes'].append(annotation) 155 | else: 156 | id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]} 157 | return id_annotations 158 | 159 | 160 | class OidDataset(Dataset): 161 | """Oid dataset.""" 162 | 163 | def __init__(self, main_dir, subset, version='v4', annotation_cache_dir='.', transform=None): 164 | if version == 'v4': 165 | metadata = '2018_04' 166 | elif version == 'challenge2018': 167 | metadata = 'challenge2018' 168 | elif version == 'v3': 169 | metadata = '2017_11' 170 | else: 171 | raise NotImplementedError('There is currently no implementation for versions older than v3') 172 | 173 | self.transform = transform 174 | 175 | if version == 'challenge2018': 176 | self.base_dir = os.path.join(main_dir, 'images', 'train') 177 | else: 178 | self.base_dir = os.path.join(main_dir, 'images', subset) 179 | 180 | metadata_dir = os.path.join(main_dir, metadata) 181 | annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json') 182 | 183 | self.id_to_labels, cls_index = get_labels(metadata_dir, version=version) 184 | 185 | if os.path.exists(annotation_cache_json): 186 | with open(annotation_cache_json, 'r') as f: 187 | self.annotations = json.loads(f.read()) 188 | else: 189 | self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, 190 | version=version) 191 | json.dump(self.annotations, open(annotation_cache_json, "w")) 192 | 193 | self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)]) 194 | 195 | # (label -> name) 196 | self.labels = self.id_to_labels 197 | 198 | def __len__(self): 199 | return len(self.annotations) 200 | 201 | def __getitem__(self, idx): 202 | 203 | img = self.load_image(idx) 204 | annot = self.load_annotations(idx) 205 | sample = {'img': img, 'annot': annot} 206 | if self.transform: 207 | sample = self.transform(sample) 208 | 209 | return sample 210 | 211 | def image_path(self, image_index): 212 | path = 
os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg') 213 | return path 214 | 215 | def load_image(self, image_index): 216 | path = self.image_path(image_index) 217 | img = skimage.io.imread(path) 218 | 219 | if len(img.shape) == 1: 220 | img = img[0] 221 | 222 | if len(img.shape) == 2: 223 | img = skimage.color.gray2rgb(img) 224 | 225 | try: 226 | return img.astype(np.float32) / 255.0 227 | except Exception: 228 | print (path) 229 | exit(0) 230 | 231 | def load_annotations(self, image_index): 232 | # get ground truth annotations 233 | image_annotations = self.annotations[self.id_to_image_id[image_index]] 234 | 235 | labels = image_annotations['boxes'] 236 | height, width = image_annotations['h'], image_annotations['w'] 237 | 238 | boxes = np.zeros((len(labels), 5)) 239 | for idx, ann in enumerate(labels): 240 | cls_id = ann['cls_id'] 241 | x1 = ann['x1'] * width 242 | x2 = ann['x2'] * width 243 | y1 = ann['y1'] * height 244 | y2 = ann['y2'] * height 245 | 246 | boxes[idx, 0] = x1 247 | boxes[idx, 1] = y1 248 | boxes[idx, 2] = x2 249 | boxes[idx, 3] = y2 250 | boxes[idx, 4] = cls_id 251 | 252 | return boxes 253 | 254 | def image_aspect_ratio(self, image_index): 255 | img_annotations = self.annotations[self.id_to_image_id[image_index]] 256 | height, width = img_annotations['h'], img_annotations['w'] 257 | return float(width) / float(height) 258 | 259 | def num_classes(self): 260 | return len(self.id_to_labels) 261 | -------------------------------------------------------------------------------- /opt/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | # from . import nms_cpu, nms_cuda 5 | from .soft_nms_cpu import soft_nms_cpu 6 | 7 | def nms(dets, iou_thr, device_id=None): 8 | """Dispatch to either CPU or GPU NMS implementations. 9 | The input can be either a torch tensor or numpy array. GPU NMS will be used 10 | if the input is a gpu tensor or device_id is specified, otherwise CPU NMS 11 | will be used. The returned type will always be the same as inputs. 12 | Arguments: 13 | dets (torch.Tensor or np.ndarray): bboxes with scores. 14 | iou_thr (float): IoU threshold for NMS. 15 | device_id (int, optional): when `dets` is a numpy array, if `device_id` 16 | is None, then cpu nms is used, otherwise gpu_nms will be used. 17 | Returns: 18 | tuple: kept bboxes and indice, which is always the same data type as 19 | the input. 
20 | Example: 21 | >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9], 22 | >>> [49.3, 32.9, 51.0, 35.3, 0.9], 23 | >>> [49.2, 31.8, 51.0, 35.4, 0.5], 24 | >>> [35.1, 11.5, 39.1, 15.7, 0.5], 25 | >>> [35.6, 11.8, 39.3, 14.2, 0.5], 26 | >>> [35.3, 11.5, 39.9, 14.5, 0.4], 27 | >>> [35.2, 11.7, 39.7, 15.7, 0.3]], dtype=np.float32) 28 | >>> iou_thr = 0.7 29 | >>> supressed, inds = nms(dets, iou_thr) 30 | >>> assert len(inds) == len(supressed) == 3 31 | """ 32 | # convert dets (tensor or numpy array) to tensor 33 | if isinstance(dets, torch.Tensor): 34 | is_numpy = False 35 | dets_th = dets 36 | elif isinstance(dets, np.ndarray): 37 | is_numpy = True 38 | device = 'cpu' if device_id is None else 'cuda:{}'.format(device_id) 39 | dets_th = torch.from_numpy(dets).to(device) 40 | else: 41 | raise TypeError( 42 | 'dets must be either a Tensor or numpy array, but got {}'.format( 43 | type(dets))) 44 | 45 | # execute cpu or cuda nms 46 | if dets_th.shape[0] == 0: 47 | inds = dets_th.new_zeros(0, dtype=torch.long) 48 | else: 49 | if dets_th.is_cuda: 50 | pass 51 | #inds = nms_cuda.nms(dets_th, iou_thr) 52 | else: 53 | pass 54 | #inds = nms_cpu.nms(dets_th, iou_thr) 55 | 56 | if is_numpy: 57 | inds = inds.cpu().numpy() 58 | return dets[inds, :], inds 59 | 60 | 61 | def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3): 62 | """ 63 | Example: 64 | >>> dets = np.array([[4., 3., 5., 3., 0.9], 65 | >>> [4., 3., 5., 4., 0.9], 66 | >>> [3., 1., 3., 1., 0.5], 67 | >>> [3., 1., 3., 1., 0.5], 68 | >>> [3., 1., 3., 1., 0.4], 69 | >>> [3., 1., 3., 1., 0.0]], dtype=np.float32) 70 | >>> iou_thr = 0.7 71 | >>> supressed, inds = soft_nms(dets, iou_thr, sigma=0.5) 72 | >>> assert len(inds) == len(supressed) == 3 73 | """ 74 | if isinstance(dets, torch.Tensor): 75 | is_tensor = True 76 | dets_np = dets.detach().cpu().numpy() 77 | elif isinstance(dets, np.ndarray): 78 | is_tensor = False 79 | dets_np = dets 80 | else: 81 | raise TypeError( 82 | 'dets must be either a Tensor or numpy array, but got {}'.format( 83 | type(dets))) 84 | 85 | method_codes = {'linear': 1, 'gaussian': 2} 86 | if method not in method_codes: 87 | raise ValueError('Invalid method for SoftNMS: {}'.format(method)) 88 | new_dets, inds = soft_nms_cpu( 89 | dets_np, 90 | iou_thr, 91 | method=method_codes[method], 92 | sigma=sigma, 93 | min_score=min_score) 94 | 95 | if is_tensor: 96 | return dets.new_tensor(new_dets), dets.new_tensor( 97 | inds, dtype=torch.long) 98 | else: 99 | return new_dets.astype(np.float32), inds.astype(np.int64) -------------------------------------------------------------------------------- /opt/soft_nms_cpu.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def soft_nms_cpu( 4 | np.ndarray[float, ndim=2] boxes_in, 5 | float iou_thr, 6 | unsigned int method=1, 7 | float sigma=0.5, 8 | float min_score=0.001, 9 | ): 10 | boxes = boxes_in.copy() 11 | cdef unsigned int N = boxes.shape[0] 12 | cdef float iw, ih, box_area 13 | cdef float ua 14 | cdef int pos = 0 15 | cdef float maxscore = 0 16 | cdef int maxpos = 0 17 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 18 | inds = np.arange(N) 19 | 20 | for i in range(N): 21 | maxscore = boxes[i, 4] 22 | maxpos = i 23 | 24 | tx1 = boxes[i, 0] 25 | ty1 = boxes[i, 1] 26 | tx2 = boxes[i, 2] 27 | ty2 = boxes[i, 3] 28 | ts = boxes[i, 4] 29 | ti = inds[i] 30 | 31 | pos = i + 1 32 | # get max box 33 | while pos < N: 34 | if maxscore < boxes[pos, 4]: 35 | maxscore = boxes[pos, 4] 36 | maxpos 
= pos 37 | pos = pos + 1 38 | 39 | # add max box as a detection 40 | boxes[i, 0] = boxes[maxpos, 0] 41 | boxes[i, 1] = boxes[maxpos, 1] 42 | boxes[i, 2] = boxes[maxpos, 2] 43 | boxes[i, 3] = boxes[maxpos, 3] 44 | boxes[i, 4] = boxes[maxpos, 4] 45 | inds[i] = inds[maxpos] 46 | 47 | # swap ith box with position of max box 48 | boxes[maxpos, 0] = tx1 49 | boxes[maxpos, 1] = ty1 50 | boxes[maxpos, 2] = tx2 51 | boxes[maxpos, 3] = ty2 52 | boxes[maxpos, 4] = ts 53 | inds[maxpos] = ti 54 | 55 | tx1 = boxes[i, 0] 56 | ty1 = boxes[i, 1] 57 | tx2 = boxes[i, 2] 58 | ty2 = boxes[i, 3] 59 | ts = boxes[i, 4] 60 | 61 | pos = i + 1 62 | # NMS iterations, note that N changes if detection boxes fall below 63 | # threshold 64 | while pos < N: 65 | x1 = boxes[pos, 0] 66 | y1 = boxes[pos, 1] 67 | x2 = boxes[pos, 2] 68 | y2 = boxes[pos, 3] 69 | s = boxes[pos, 4] 70 | 71 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 72 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 73 | if iw > 0: 74 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 75 | if ih > 0: 76 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 77 | ov = iw * ih / ua # iou between max box and detection box 78 | 79 | if method == 1: # linear 80 | if ov > iou_thr: 81 | weight = 1 - ov 82 | else: 83 | weight = 1 84 | elif method == 2: # gaussian 85 | weight = np.exp(-(ov * ov) / sigma) 86 | else: # original NMS 87 | if ov > iou_thr: 88 | weight = 0 89 | else: 90 | weight = 1 91 | 92 | boxes[pos, 4] = weight * boxes[pos, 4] 93 | 94 | # if box score falls below threshold, discard the box by 95 | # swapping with last box update N 96 | if boxes[pos, 4] < min_score: 97 | boxes[pos, 0] = boxes[N-1, 0] 98 | boxes[pos, 1] = boxes[N-1, 1] 99 | boxes[pos, 2] = boxes[N-1, 2] 100 | boxes[pos, 3] = boxes[N-1, 3] 101 | boxes[pos, 4] = boxes[N-1, 4] 102 | inds[pos] = inds[N - 1] 103 | N = N - 1 104 | pos = pos - 1 105 | 106 | pos = pos + 1 107 | 108 | return boxes[:N], inds[:N] -------------------------------------------------------------------------------- /opt/src/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
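// Greedy hard NMS on the CPU: detections are visited in descending score order,
// and a box is suppressed once its overlap with an already-kept box reaches
// `threshold`, where IoU = inter / (area_i + area_j - inter). The kernel below
// returns the indices of the boxes that survive.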
2 | #include <torch/extension.h>
3 | 
4 | template <typename scalar_t>
5 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) {
6 |   AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
7 | 
8 |   if (dets.numel() == 0) {
9 |     return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
10 |   }
11 | 
12 |   auto x1_t = dets.select(1, 0).contiguous();
13 |   auto y1_t = dets.select(1, 1).contiguous();
14 |   auto x2_t = dets.select(1, 2).contiguous();
15 |   auto y2_t = dets.select(1, 3).contiguous();
16 |   auto scores = dets.select(1, 4).contiguous();
17 | 
18 |   at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
19 | 
20 |   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
21 | 
22 |   auto ndets = dets.size(0);
23 |   at::Tensor suppressed_t =
24 |       at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
25 | 
26 |   auto suppressed = suppressed_t.data<uint8_t>();
27 |   auto order = order_t.data<int64_t>();
28 |   auto x1 = x1_t.data<scalar_t>();
29 |   auto y1 = y1_t.data<scalar_t>();
30 |   auto x2 = x2_t.data<scalar_t>();
31 |   auto y2 = y2_t.data<scalar_t>();
32 |   auto areas = areas_t.data<scalar_t>();
33 | 
34 |   for (int64_t _i = 0; _i < ndets; _i++) {
35 |     auto i = order[_i];
36 |     if (suppressed[i] == 1) continue;
37 |     auto ix1 = x1[i];
38 |     auto iy1 = y1[i];
39 |     auto ix2 = x2[i];
40 |     auto iy2 = y2[i];
41 |     auto iarea = areas[i];
42 | 
43 |     for (int64_t _j = _i + 1; _j < ndets; _j++) {
44 |       auto j = order[_j];
45 |       if (suppressed[j] == 1) continue;
46 |       auto xx1 = std::max(ix1, x1[j]);
47 |       auto yy1 = std::max(iy1, y1[j]);
48 |       auto xx2 = std::min(ix2, x2[j]);
49 |       auto yy2 = std::min(iy2, y2[j]);
50 | 
51 |       auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
52 |       auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
53 |       auto inter = w * h;
54 |       auto ovr = inter / (iarea + areas[j] - inter);
55 |       if (ovr >= threshold) suppressed[j] = 1;
56 |     }
57 |   }
58 |   return at::nonzero(suppressed_t == 0).squeeze(1);
59 | }
60 | 
61 | at::Tensor nms(const at::Tensor& dets, const float threshold) {
62 |   at::Tensor result;
63 |   AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
64 |     result = nms_cpu_kernel<scalar_t>(dets, threshold);
65 |   });
66 |   return result;
67 | }
68 | 
69 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
70 |   m.def("nms", &nms, "non-maximum suppression");
71 | }
-------------------------------------------------------------------------------- /opt/src/nms_cuda.cpp: --------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #include <torch/extension.h>
3 | 
4 | #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
5 | 
6 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
7 | 
8 | at::Tensor nms(const at::Tensor& dets, const float threshold) {
9 |   CHECK_CUDA(dets);
10 |   if (dets.numel() == 0)
11 |     return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
12 |   return nms_cuda(dets, threshold);
13 | }
14 | 
15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
16 |   m.def("nms", &nms, "non-maximum suppression");
17 | }
-------------------------------------------------------------------------------- /opt/src/nms_kernel.cu: --------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
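// CUDA NMS: boxes are processed in tiles of threadsPerBlock (64) entries. Each
// thread compares its box against one column tile and records the boxes it
// suppresses in a 64-bit mask (dev_mask); the host then walks these masks in
// score order to assemble the list of kept indices.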
2 | #include <ATen/ATen.h>
3 | #include <ATen/cuda/CUDAContext.h>
4 | #include <ATen/DeviceGuard.h>
5 | 
6 | #include <THC/THC.h>
7 | #include <THC/THCDeviceUtils.cuh>
8 | 
9 | #include <vector>
10 | #include <iostream>
11 | 
12 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
13 | 
14 | __device__ inline float devIoU(float const * const a, float const * const b) {
15 |   float left = max(a[0], b[0]), right = min(a[2], b[2]);
16 |   float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
17 |   float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
18 |   float interS = width * height;
19 |   float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
20 |   float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
21 |   return interS / (Sa + Sb - interS);
22 | }
23 | 
24 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
25 |                            const float *dev_boxes, unsigned long long *dev_mask) {
26 |   const int row_start = blockIdx.y;
27 |   const int col_start = blockIdx.x;
28 | 
29 |   // if (row_start > col_start) return;
30 | 
31 |   const int row_size =
32 |       min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
33 |   const int col_size =
34 |       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
35 | 
36 |   __shared__ float block_boxes[threadsPerBlock * 5];
37 |   if (threadIdx.x < col_size) {
38 |     block_boxes[threadIdx.x * 5 + 0] =
39 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
40 |     block_boxes[threadIdx.x * 5 + 1] =
41 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
42 |     block_boxes[threadIdx.x * 5 + 2] =
43 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
44 |     block_boxes[threadIdx.x * 5 + 3] =
45 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
46 |     block_boxes[threadIdx.x * 5 + 4] =
47 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
48 |   }
49 |   __syncthreads();
50 | 
51 |   if (threadIdx.x < row_size) {
52 |     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
53 |     const float *cur_box = dev_boxes + cur_box_idx * 5;
54 |     int i = 0;
55 |     unsigned long long t = 0;
56 |     int start = 0;
57 |     if (row_start == col_start) {
58 |       start = threadIdx.x + 1;
59 |     }
60 |     for (i = start; i < col_size; i++) {
61 |       if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
62 |         t |= 1ULL << i;
63 |       }
64 |     }
65 |     const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
66 |     dev_mask[cur_box_idx * col_blocks + col_start] = t;
67 |   }
68 | }
69 | 
70 | // boxes is a N x 5 tensor
71 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
72 | 
73 |   // Ensure CUDA uses the input tensor device.
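  // (The guard below keeps the allocation, kernel launch and copies on the GPU
  // that owns `boxes`, which matters when several devices are visible.)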
74 |   at::DeviceGuard guard(boxes.device());
75 | 
76 |   using scalar_t = float;
77 |   AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
78 |   auto scores = boxes.select(1, 4);
79 |   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
80 |   auto boxes_sorted = boxes.index_select(0, order_t);
81 | 
82 |   int boxes_num = boxes.size(0);
83 | 
84 |   const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
85 | 
86 |   scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
87 | 
88 |   THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
89 | 
90 |   unsigned long long* mask_dev = NULL;
91 |   //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
92 |   //                      boxes_num * col_blocks * sizeof(unsigned long long)));
93 | 
94 |   mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
95 | 
96 |   dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
97 |               THCCeilDiv(boxes_num, threadsPerBlock));
98 |   dim3 threads(threadsPerBlock);
99 |   nms_kernel<<<blocks, threads>>>(boxes_num,
100 |                                   nms_overlap_thresh,
101 |                                   boxes_dev,
102 |                                   mask_dev);
103 | 
104 |   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
105 |   THCudaCheck(cudaMemcpy(&mask_host[0],
106 |                          mask_dev,
107 |                          sizeof(unsigned long long) * boxes_num * col_blocks,
108 |                          cudaMemcpyDeviceToHost));
109 | 
110 |   std::vector<unsigned long long> remv(col_blocks);
111 |   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
112 | 
113 |   at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
114 |   int64_t* keep_out = keep.data<int64_t>();
115 | 
116 |   int num_to_keep = 0;
117 |   for (int i = 0; i < boxes_num; i++) {
118 |     int nblock = i / threadsPerBlock;
119 |     int inblock = i % threadsPerBlock;
120 | 
121 |     if (!(remv[nblock] & (1ULL << inblock))) {
122 |       keep_out[num_to_keep++] = i;
123 |       unsigned long long *p = &mask_host[0] + i * col_blocks;
124 |       for (int j = nblock; j < col_blocks; j++) {
125 |         remv[j] |= p[j];
126 |       }
127 |     }
128 |   }
129 | 
130 |   THCudaFree(state, mask_dev);
131 |   // TODO improve this part
132 |   return std::get<0>(order_t.index({
133 |       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
134 |           order_t.device(), keep.scalar_type())
135 |   }).sort(0, false));
136 | }
137 | 
-------------------------------------------------------------------------------- /opt/src/soft_nms_cpu.pyx: --------------------------------------------------------------------------------
1 | # ----------------------------------------------------------
2 | # Soft-NMS: Improving Object Detection With One Line of Code
3 | # Copyright (c) University of Maryland, College Park
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Navaneeth Bodla and Bharat Singh
6 | # Modified by Kai Chen
7 | # ----------------------------------------------------------
8 | 
9 | # cython: language_level=3, boundscheck=False
10 | 
11 | import numpy as np
12 | cimport numpy as np
13 | 
14 | 
15 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
16 |     return a if a >= b else b
17 | 
18 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
19 |     return a if a <= b else b
20 | 
21 | 
22 | def soft_nms_cpu(
23 |     np.ndarray[float, ndim=2] boxes_in,
24 |     float iou_thr,
25 |     unsigned int method=1,
26 |     float sigma=0.5,
27 |     float min_score=0.001,
28 | ):
29 |     boxes = boxes_in.copy()
30 |     cdef int N = boxes.shape[0]
31 |     cdef float iw, ih, box_area
32 |     cdef float ua
33 |     cdef int pos = 0
34 |     cdef float maxscore = 0
35 |     cdef int maxpos = 0
36 |     cdef float x1, x2, y1, y2, tx1, tx2,
ty1, ty2, ts, area, weight, ov 37 | inds = np.arange(N) 38 | 39 | for i in range(N): 40 | maxscore = boxes[i, 4] 41 | maxpos = i 42 | 43 | tx1 = boxes[i, 0] 44 | ty1 = boxes[i, 1] 45 | tx2 = boxes[i, 2] 46 | ty2 = boxes[i, 3] 47 | ts = boxes[i, 4] 48 | ti = inds[i] 49 | 50 | pos = i + 1 51 | # get max box 52 | while pos < N: 53 | if maxscore < boxes[pos, 4]: 54 | maxscore = boxes[pos, 4] 55 | maxpos = pos 56 | pos = pos + 1 57 | 58 | # add max box as a detection 59 | boxes[i, 0] = boxes[maxpos, 0] 60 | boxes[i, 1] = boxes[maxpos, 1] 61 | boxes[i, 2] = boxes[maxpos, 2] 62 | boxes[i, 3] = boxes[maxpos, 3] 63 | boxes[i, 4] = boxes[maxpos, 4] 64 | inds[i] = inds[maxpos] 65 | 66 | # swap ith box with position of max box 67 | boxes[maxpos, 0] = tx1 68 | boxes[maxpos, 1] = ty1 69 | boxes[maxpos, 2] = tx2 70 | boxes[maxpos, 3] = ty2 71 | boxes[maxpos, 4] = ts 72 | inds[maxpos] = ti 73 | 74 | tx1 = boxes[i, 0] 75 | ty1 = boxes[i, 1] 76 | tx2 = boxes[i, 2] 77 | ty2 = boxes[i, 3] 78 | ts = boxes[i, 4] 79 | 80 | pos = i + 1 81 | # NMS iterations, note that N changes if detection boxes fall below 82 | # threshold 83 | while pos < N: 84 | x1 = boxes[pos, 0] 85 | y1 = boxes[pos, 1] 86 | x2 = boxes[pos, 2] 87 | y2 = boxes[pos, 3] 88 | s = boxes[pos, 4] 89 | 90 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 91 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 92 | if iw > 0: 93 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 94 | if ih > 0: 95 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 96 | ov = iw * ih / ua # iou between max box and detection box 97 | 98 | if method == 1: # linear 99 | if ov > iou_thr: 100 | weight = 1 - ov 101 | else: 102 | weight = 1 103 | elif method == 2: # gaussian 104 | weight = np.exp(-(ov * ov) / sigma) 105 | else: # original NMS 106 | if ov > iou_thr: 107 | weight = 0 108 | else: 109 | weight = 1 110 | 111 | boxes[pos, 4] = weight * boxes[pos, 4] 112 | 113 | # if box score falls below threshold, discard the box by 114 | # swapping with last box update N 115 | if boxes[pos, 4] < min_score: 116 | boxes[pos, 0] = boxes[N-1, 0] 117 | boxes[pos, 1] = boxes[N-1, 1] 118 | boxes[pos, 2] = boxes[N-1, 2] 119 | boxes[pos, 3] = boxes[N-1, 3] 120 | boxes[pos, 4] = boxes[N-1, 4] 121 | inds[pos] = inds[N - 1] 122 | N = N - 1 123 | pos = pos - 1 124 | 125 | pos = pos + 1 126 | 127 | return boxes[:N], inds[:N] 128 | -------------------------------------------------------------------------------- /retinanet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import math 4 | import time 5 | import torch.utils.model_zoo as model_zoo 6 | from utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes 7 | from anchors import Anchors 8 | import losses 9 | from torchvision.ops import nms 10 | 11 | model_urls = { 12 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 13 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 14 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 15 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 16 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 17 | } 18 | 19 | class PyramidFeatures(nn.Module): 20 | def __init__(self, C3_size, C4_size, C5_size, feature_size=256): 21 | super(PyramidFeatures, self).__init__() 22 | 23 | # upsample C5 to get P5 from the FPN paper 24 | self.P5_1 = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0) 25 | self.P5_upsampled = 
nn.Upsample(scale_factor=2, mode='nearest') 26 | self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 27 | 28 | # add P5 elementwise to C4 29 | self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0) 30 | self.P4_upsampled = nn.Upsample(scale_factor=2, mode='nearest') 31 | self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 32 | 33 | # add P4 elementwise to C3 34 | self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0) 35 | self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 36 | 37 | # "P6 is obtained via a 3x3 stride-2 conv on C5" 38 | self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1) 39 | 40 | # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6" 41 | self.P7_1 = nn.ReLU() 42 | self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1) 43 | 44 | def forward(self, inputs): 45 | 46 | C3, C4, C5 = inputs 47 | 48 | P5_x = self.P5_1(C5) 49 | P5_upsampled_x = self.P5_upsampled(P5_x) 50 | P5_x = self.P5_2(P5_x) 51 | 52 | P4_x = self.P4_1(C4) 53 | P4_x = P5_upsampled_x + P4_x 54 | P4_upsampled_x = self.P4_upsampled(P4_x) 55 | P4_x = self.P4_2(P4_x) 56 | 57 | P3_x = self.P3_1(C3) 58 | P3_x = P3_x + P4_upsampled_x 59 | P3_x = self.P3_2(P3_x) 60 | 61 | P6_x = self.P6(C5) 62 | 63 | P7_x = self.P7_1(P6_x) 64 | P7_x = self.P7_2(P7_x) 65 | 66 | return [P3_x, P4_x, P5_x, P6_x, P7_x] 67 | 68 | 69 | class RegressionModel(nn.Module): 70 | def __init__(self, num_features_in, num_anchors=9, feature_size=256): 71 | super(RegressionModel, self).__init__() 72 | 73 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 74 | self.act1 = nn.ReLU() 75 | 76 | self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 77 | self.act2 = nn.ReLU() 78 | 79 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 80 | self.act3 = nn.ReLU() 81 | 82 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 83 | self.act4 = nn.ReLU() 84 | 85 | self.output = nn.Conv2d(feature_size, num_anchors*4, kernel_size=3, padding=1) 86 | 87 | def forward(self, x): 88 | 89 | out = self.conv1(x) 90 | out = self.act1(out) 91 | 92 | out = self.conv2(out) 93 | out = self.act2(out) 94 | 95 | out = self.conv3(out) 96 | out = self.act3(out) 97 | 98 | out = self.conv4(out) 99 | out = self.act4(out) 100 | 101 | out = self.output(out) 102 | 103 | # out is B x C x W x H, with C = 4*num_anchors 104 | out = out.permute(0, 2, 3, 1) 105 | 106 | return out.contiguous().view(out.shape[0], -1, 4) 107 | 108 | class ClassificationModel(nn.Module): 109 | def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256): 110 | super(ClassificationModel, self).__init__() 111 | 112 | self.num_classes = num_classes 113 | self.num_anchors = num_anchors 114 | 115 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 116 | self.act1 = nn.ReLU() 117 | 118 | self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 119 | self.act2 = nn.ReLU() 120 | 121 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 122 | self.act3 = nn.ReLU() 123 | 124 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 125 | self.act4 = nn.ReLU() 126 | 127 | self.output = nn.Conv2d(feature_size, num_anchors*num_classes, kernel_size=3, padding=1) 128 | 
self.output_act = nn.Sigmoid() 129 | 130 | def forward(self, x): 131 | 132 | out = self.conv1(x) 133 | out = self.act1(out) 134 | 135 | out = self.conv2(out) 136 | out = self.act2(out) 137 | 138 | out = self.conv3(out) 139 | out = self.act3(out) 140 | 141 | out = self.conv4(out) 142 | out = self.act4(out) 143 | 144 | out = self.output(out) 145 | out = self.output_act(out) 146 | 147 | # out is B x C x W x H, with C = n_classes + n_anchors 148 | out1 = out.permute(0, 2, 3, 1) 149 | 150 | batch_size, width, height, channels = out1.shape 151 | 152 | out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes) 153 | 154 | return out2.contiguous().view(x.shape[0], -1, self.num_classes) 155 | 156 | class ResNet(nn.Module): 157 | 158 | def __init__(self, num_classes, block, layers): 159 | self.inplanes = 64 160 | super(ResNet, self).__init__() 161 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 162 | self.bn1 = nn.BatchNorm2d(64) 163 | self.relu = nn.ReLU(inplace=True) 164 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 165 | self.layer1 = self._make_layer(block, 64, layers[0]) 166 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 167 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 168 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 169 | 170 | if block == BasicBlock: 171 | fpn_sizes = [self.layer2[layers[1]-1].conv2.out_channels, self.layer3[layers[2]-1].conv2.out_channels, self.layer4[layers[3]-1].conv2.out_channels] 172 | elif block == Bottleneck: 173 | fpn_sizes = [self.layer2[layers[1]-1].conv3.out_channels, self.layer3[layers[2]-1].conv3.out_channels, self.layer4[layers[3]-1].conv3.out_channels] 174 | 175 | self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) 176 | 177 | self.regressionModel = RegressionModel(256) 178 | self.classificationModel = ClassificationModel(256, num_classes=num_classes) 179 | 180 | self.anchors = Anchors() 181 | 182 | self.regressBoxes = BBoxTransform() 183 | 184 | self.clipBoxes = ClipBoxes() 185 | 186 | self.focalLoss = losses.FocalLoss() 187 | 188 | for m in self.modules(): 189 | if isinstance(m, nn.Conv2d): 190 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 191 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 192 | elif isinstance(m, nn.BatchNorm2d): 193 | m.weight.data.fill_(1) 194 | m.bias.data.zero_() 195 | 196 | prior = 0.01 197 | 198 | self.classificationModel.output.weight.data.fill_(0) 199 | self.classificationModel.output.bias.data.fill_(-math.log((1.0-prior)/prior)) 200 | 201 | self.regressionModel.output.weight.data.fill_(0) 202 | self.regressionModel.output.bias.data.fill_(0) 203 | 204 | self.freeze_bn() 205 | 206 | def _make_layer(self, block, planes, blocks, stride=1): 207 | downsample = None 208 | if stride != 1 or self.inplanes != planes * block.expansion: 209 | downsample = nn.Sequential( 210 | nn.Conv2d(self.inplanes, planes * block.expansion, 211 | kernel_size=1, stride=stride, bias=False), 212 | nn.BatchNorm2d(planes * block.expansion), 213 | ) 214 | 215 | layers = [] 216 | layers.append(block(self.inplanes, planes, stride, downsample)) 217 | self.inplanes = planes * block.expansion 218 | for i in range(1, blocks): 219 | layers.append(block(self.inplanes, planes)) 220 | 221 | return nn.Sequential(*layers) 222 | 223 | def freeze_bn(self): 224 | '''Freeze BatchNorm layers.''' 225 | for layer in self.modules(): 226 | if isinstance(layer, nn.BatchNorm2d): 227 | layer.eval() 228 | 229 | def forward(self, inputs): 230 | 231 | if self.training: 232 | img_batch, annotations = inputs 233 | else: 234 | img_batch = inputs 235 | 236 | x = self.conv1(img_batch) 237 | x = self.bn1(x) 238 | x = self.relu(x) 239 | x = self.maxpool(x) 240 | 241 | x1 = self.layer1(x) 242 | x2 = self.layer2(x1) 243 | x3 = self.layer3(x2) 244 | x4 = self.layer4(x3) 245 | 246 | features = self.fpn([x2, x3, x4]) 247 | 248 | regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1) 249 | 250 | classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1) 251 | 252 | anchors = self.anchors(img_batch) 253 | 254 | if self.training: 255 | return self.focalLoss(classification, regression, anchors, annotations) 256 | else: 257 | transformed_anchors = self.regressBoxes(anchors, regression) 258 | transformed_anchors = self.clipBoxes(transformed_anchors, img_batch) 259 | 260 | scores = torch.max(classification, dim=2, keepdim=True)[0] 261 | 262 | scores_over_thresh = (scores>0.05)[0, :, 0] 263 | 264 | if scores_over_thresh.sum() == 0: 265 | # no boxes to NMS, just return 266 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] 267 | 268 | classification = classification[:, scores_over_thresh, :] 269 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :] 270 | scores = scores[:, scores_over_thresh, :] 271 | 272 | anchors_nms_idx = nms(transformed_anchors, scores, 0.5) 273 | 274 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) 275 | 276 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] 277 | 278 | 279 | 280 | def resnet18(num_classes, pretrained=False, **kwargs): 281 | """Constructs a ResNet-18 model. 282 | Args: 283 | pretrained (bool): If True, returns a model pre-trained on ImageNet 284 | """ 285 | model = ResNet(num_classes, BasicBlock, [2, 2, 2, 2], **kwargs) 286 | if pretrained: 287 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='.'), strict=False) 288 | return model 289 | 290 | 291 | def resnet34(num_classes, pretrained=False, **kwargs): 292 | """Constructs a ResNet-34 model. 
293 | Args: 294 | pretrained (bool): If True, returns a model pre-trained on ImageNet 295 | """ 296 | model = ResNet(num_classes, BasicBlock, [3, 4, 6, 3], **kwargs) 297 | if pretrained: 298 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='.'), strict=False) 299 | return model 300 | 301 | 302 | def resnet50(num_classes, pretrained=False, **kwargs): 303 | """Constructs a ResNet-50 model. 304 | Args: 305 | pretrained (bool): If True, returns a model pre-trained on ImageNet 306 | """ 307 | model = ResNet(num_classes, Bottleneck, [3, 4, 6, 3], **kwargs) 308 | if pretrained: 309 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='.'), strict=False) 310 | return model 311 | 312 | def resnet101(num_classes, pretrained=False, **kwargs): 313 | """Constructs a ResNet-101 model. 314 | Args: 315 | pretrained (bool): If True, returns a model pre-trained on ImageNet 316 | """ 317 | model = ResNet(num_classes, Bottleneck, [3, 4, 23, 3], **kwargs) 318 | if pretrained: 319 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='.'), strict=False) 320 | return model 321 | 322 | 323 | def resnet152(num_classes, pretrained=False, **kwargs): 324 | """Constructs a ResNet-152 model. 325 | Args: 326 | pretrained (bool): If True, returns a model pre-trained on ImageNet 327 | """ 328 | model = ResNet(num_classes, Bottleneck, [3, 8, 36, 3], **kwargs) 329 | if pretrained: 330 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='.'), strict=False) 331 | return model -------------------------------------------------------------------------------- /timeitdec.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | def timeit(method): 4 | def timed(*args, **kw): 5 | ts = time.time() 6 | result = method(*args, **kw) 7 | te = time.time() 8 | if 'log_time' in kw: 9 | name = kw.get('log_name', method.__name__.upper()) 10 | kw['log_time'][name] = int((te - ts) * 1000) 11 | else: 12 | print ('%r %2.2f ms' % \ 13 | (method.__name__, (te - ts) * 1000)) 14 | return result 15 | return timed -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import copy 4 | import argparse 5 | import pdb 6 | import collections 7 | import sys 8 | 9 | import numpy as np 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | from torch.optim import lr_scheduler 15 | from torch.autograd import Variable 16 | from torchvision import datasets, models, transforms 17 | import torchvision 18 | 19 | import retinanet 20 | import efficientdet 21 | from anchors import Anchors 22 | import losses 23 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer 24 | from torch.utils.data import Dataset, DataLoader 25 | 26 | import coco_eval 27 | import csv_eval 28 | 29 | from tqdm import tqdm 30 | from ptflops import get_model_complexity_info 31 | 32 | #assert torch.__version__.split('.')[1] == '4' 33 | 34 | print('CUDA available: {}'.format(torch.cuda.is_available())) 35 | 36 | 37 | 38 | def freeze_layer(layer): 39 | for param in layer.parameters(): 40 | param.requires_grad = False 41 | 42 | def main(args=None): 43 | 44 | parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') 45 | 46 | 
parser.add_argument('--efficientdet', help='Use EfficientDet.', action="store_true") 47 | parser.add_argument('--scaling-compound', help='EfficientDet scaling compound phi.', type=int, default=0) 48 | parser.add_argument('--batch-size', help='Batchsize.', type=int, default=6) 49 | parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') 50 | parser.add_argument('--coco_path', help='Path to COCO directory') 51 | parser.add_argument('--csv_train', help='Path to file containing training annotations (see readme)') 52 | parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') 53 | parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') 54 | 55 | parser.add_argument('--print-model-complexity', help='Print model complexity.', action="store_true") 56 | 57 | parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=None) 58 | parser.add_argument('--epochs', help='Number of epochs', type=int, default=100) 59 | 60 | parser = parser.parse_args(args) 61 | 62 | img_size = parser.scaling_compound * 128 + 512 63 | 64 | # Create the data loaders 65 | if parser.dataset == 'coco': 66 | 67 | if parser.coco_path is None: 68 | raise ValueError('Must provide --coco_path when training on COCO,') 69 | 70 | dataset_train = CocoDataset(parser.coco_path, set_name='train2017', transform=transforms.Compose([Normalizer(), Augmenter(), Resizer(img_size=img_size)])) 71 | dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer(img_size=img_size)])) 72 | 73 | elif parser.dataset == 'csv': 74 | 75 | if parser.csv_train is None: 76 | raise ValueError('Must provide --csv_train when training on COCO,') 77 | 78 | if parser.csv_classes is None: 79 | raise ValueError('Must provide --csv_classes when training on COCO,') 80 | 81 | 82 | dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Augmenter(), Resizer(img_size=img_size)])) 83 | 84 | if parser.csv_val is None: 85 | dataset_val = None 86 | print('No validation annotations provided.') 87 | else: 88 | dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer(img_size=img_size)])) 89 | 90 | else: 91 | raise ValueError('Dataset type not understood (must be csv or coco), exiting.') 92 | 93 | sampler = AspectRatioBasedSampler(dataset_train, batch_size=parser.batch_size, drop_last=False) 94 | dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler) 95 | 96 | if dataset_val is not None: 97 | sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) 98 | dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val) 99 | 100 | # Create the model 101 | if parser.depth == 18: 102 | model = retinanet.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) 103 | elif parser.depth == 34: 104 | model = retinanet.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) 105 | elif parser.depth == 50: 106 | model = retinanet.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) 107 | elif parser.depth == 101: 108 | model = retinanet.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) 109 | elif parser.depth == 152: 110 | model = 
retinanet.resnet152(num_classes=dataset_train.num_classes(), pretrained=True)
111 |     elif parser.efficientdet:
112 |         model = efficientdet.efficientdet(num_classes=dataset_train.num_classes(), pretrained=True, phi=parser.scaling_compound)
113 |     else:
114 |         raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152, or specify --efficientdet.')
115 | 
116 |     use_gpu = True
117 | 
118 |     if use_gpu:
119 |         model = model.cuda()
120 | 
121 |     model = torch.nn.DataParallel(model).cuda()
122 | 
123 |     if parser.print_model_complexity:
124 |         flops, params = get_model_complexity_info(model, (3, img_size, img_size), as_strings=True, print_per_layer_stat=True)
125 |         print('{:<30} {:<8}'.format('Computational complexity: ', flops))
126 |         print('{:<30} {:<8}'.format('Number of parameters: ', params))
127 | 
128 |     model.training = True
129 | 
130 |     optimizer = optim.SGD(model.parameters(), lr=4e-5)
131 | 
132 |     scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
133 | 
134 |     loss_hist = collections.deque(maxlen=500)
135 | 
136 |     model.train()
137 |     model.module.freeze_bn()
138 | 
139 |     print('Num training images: {}'.format(len(dataset_train)))
140 | 
141 |     for epoch_num in range(parser.epochs):
142 | 
143 |         model.train()
144 |         model.module.freeze_bn()
145 | 
146 |         if parser.efficientdet: freeze_layer(model.module.efficientnet)  # ResNet variants have no .efficientnet backbone
147 | 
148 |         epoch_loss = []
149 |         pbar = tqdm(enumerate(dataloader_train), total=len(dataloader_train))
150 |         for iter_num, data in pbar:
151 |             optimizer.zero_grad()
152 | 
153 |             classification_loss, regression_loss = model([data['img'].cuda().float(), data['annot']])
154 | 
155 |             classification_loss = classification_loss.mean()
156 |             regression_loss = regression_loss.mean()
157 | 
158 |             loss = classification_loss + regression_loss
159 | 
160 |             if bool(loss == 0):
161 |                 continue
162 | 
163 |             loss.backward()
164 | 
165 |             torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
166 | 
167 |             optimizer.step()
168 | 
169 |             loss_hist.append(float(loss))
170 | 
171 |             epoch_loss.append(float(loss))
172 | 
173 |             mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0
174 |             pbar.set_description(f'{mem:.3g}G | {float(classification_loss):1.5f} | {float(regression_loss):1.5f} | {np.mean(loss_hist):1.5f}')
175 |             #print('Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'.format(epoch_num, iter_num, float(classification_loss), float(regression_loss), np.mean(loss_hist)))
176 | 
177 |             del classification_loss
178 |             del regression_loss
179 | 
180 |         if parser.dataset == 'coco':
181 | 
182 |             print('Evaluating dataset')
183 | 
184 |             coco_eval.evaluate_coco(dataset_val, model)
185 | 
186 |         elif parser.dataset == 'csv' and parser.csv_val is not None:
187 | 
188 |             print('Evaluating dataset')
189 | 
190 |             mAP = csv_eval.evaluate(dataset_val, model)
191 | 
192 | 
193 |         scheduler.step(np.mean(epoch_loss))
194 | 
195 |         torch.save(model.module, '{}_model_{}.pt'.format(parser.dataset, epoch_num))
196 | 
197 |     model.eval()
198 | 
199 |     torch.save(model, 'model_final.pt')
200 | 
201 | if __name__ == '__main__':
202 |     main()
203 | 
-------------------------------------------------------------------------------- /utils.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | 
5 | def conv3x3(in_planes, out_planes, stride=1):
6 |     """3x3 convolution with padding"""
7 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
8 |
padding=1, bias=False) 9 | 10 | class BasicBlock(nn.Module): 11 | expansion = 1 12 | 13 | def __init__(self, inplanes, planes, stride=1, downsample=None): 14 | super(BasicBlock, self).__init__() 15 | self.conv1 = conv3x3(inplanes, planes, stride) 16 | self.bn1 = nn.BatchNorm2d(planes) 17 | self.relu = nn.ReLU(inplace=True) 18 | self.conv2 = conv3x3(planes, planes) 19 | self.bn2 = nn.BatchNorm2d(planes) 20 | self.downsample = downsample 21 | self.stride = stride 22 | 23 | def forward(self, x): 24 | residual = x 25 | 26 | out = self.conv1(x) 27 | out = self.bn1(out) 28 | out = self.relu(out) 29 | 30 | out = self.conv2(out) 31 | out = self.bn2(out) 32 | 33 | if self.downsample is not None: 34 | residual = self.downsample(x) 35 | 36 | out += residual 37 | out = self.relu(out) 38 | 39 | return out 40 | 41 | 42 | class Bottleneck(nn.Module): 43 | expansion = 4 44 | 45 | def __init__(self, inplanes, planes, stride=1, downsample=None): 46 | super(Bottleneck, self).__init__() 47 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 48 | self.bn1 = nn.BatchNorm2d(planes) 49 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 50 | padding=1, bias=False) 51 | self.bn2 = nn.BatchNorm2d(planes) 52 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 53 | self.bn3 = nn.BatchNorm2d(planes * 4) 54 | self.relu = nn.ReLU(inplace=True) 55 | self.downsample = downsample 56 | self.stride = stride 57 | 58 | def forward(self, x): 59 | residual = x 60 | 61 | out = self.conv1(x) 62 | out = self.bn1(out) 63 | out = self.relu(out) 64 | 65 | out = self.conv2(out) 66 | out = self.bn2(out) 67 | out = self.relu(out) 68 | 69 | out = self.conv3(out) 70 | out = self.bn3(out) 71 | 72 | if self.downsample is not None: 73 | residual = self.downsample(x) 74 | 75 | out += residual 76 | out = self.relu(out) 77 | 78 | return out 79 | 80 | class BBoxTransform(nn.Module): 81 | 82 | def __init__(self, mean=None, std=None): 83 | super(BBoxTransform, self).__init__() 84 | if mean is None: 85 | self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)).cuda() 86 | else: 87 | self.mean = mean 88 | if std is None: 89 | self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)).cuda() 90 | else: 91 | self.std = std 92 | 93 | def forward(self, boxes, deltas): 94 | 95 | widths = boxes[:, :, 2] - boxes[:, :, 0] 96 | heights = boxes[:, :, 3] - boxes[:, :, 1] 97 | ctr_x = boxes[:, :, 0] + 0.5 * widths 98 | ctr_y = boxes[:, :, 1] + 0.5 * heights 99 | 100 | dx = deltas[:, :, 0] * self.std[0] + self.mean[0] 101 | dy = deltas[:, :, 1] * self.std[1] + self.mean[1] 102 | dw = deltas[:, :, 2] * self.std[2] + self.mean[2] 103 | dh = deltas[:, :, 3] * self.std[3] + self.mean[3] 104 | 105 | pred_ctr_x = ctr_x + dx * widths 106 | pred_ctr_y = ctr_y + dy * heights 107 | pred_w = torch.exp(dw) * widths 108 | pred_h = torch.exp(dh) * heights 109 | 110 | pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w 111 | pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h 112 | pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w 113 | pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h 114 | 115 | pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2) 116 | 117 | return pred_boxes 118 | 119 | 120 | class ClipBoxes(nn.Module): 121 | 122 | def __init__(self, width=None, height=None): 123 | super(ClipBoxes, self).__init__() 124 | 125 | def forward(self, boxes, img): 126 | 127 | batch_size, num_channels, height, width = img.shape 128 | 129 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], 
min=0) 130 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 131 | 132 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) 133 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) 134 | 135 | return boxes 136 | -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torchvision 3 | import time 4 | import os 5 | import copy 6 | import pdb 7 | import time 8 | import argparse 9 | 10 | import sys 11 | import cv2 12 | 13 | import torch 14 | from torch.utils.data import Dataset, DataLoader 15 | from torchvision import datasets, models, transforms 16 | 17 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer 18 | 19 | 20 | assert torch.__version__.split('.')[1] == '4' 21 | 22 | print('CUDA available: {}'.format(torch.cuda.is_available())) 23 | 24 | 25 | def main(args=None): 26 | parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') 27 | 28 | parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') 29 | parser.add_argument('--coco_path', help='Path to COCO directory') 30 | parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') 31 | parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') 32 | 33 | parser.add_argument('--model', help='Path to model (.pt) file.') 34 | 35 | parser = parser.parse_args(args) 36 | 37 | if parser.dataset == 'coco': 38 | dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()])) 39 | elif parser.dataset == 'csv': 40 | dataset_val = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) 41 | else: 42 | raise ValueError('Dataset type not understood (must be csv or coco), exiting.') 43 | 44 | sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) 45 | dataloader_val = DataLoader(dataset_val, num_workers=1, collate_fn=collater, batch_sampler=sampler_val) 46 | 47 | retinanet = torch.load(parser.model) 48 | 49 | use_gpu = True 50 | 51 | if use_gpu: 52 | retinanet = retinanet.cuda() 53 | 54 | retinanet.eval() 55 | 56 | unnormalize = UnNormalizer() 57 | 58 | def draw_caption(image, box, caption): 59 | 60 | b = np.array(box).astype(int) 61 | cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2) 62 | cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1) 63 | 64 | for idx, data in enumerate(dataloader_val): 65 | 66 | with torch.no_grad(): 67 | st = time.time() 68 | scores, classification, transformed_anchors = retinanet(data['img'].cuda().float()) 69 | print('Elapsed time: {}'.format(time.time()-st)) 70 | idxs = np.where(scores>0.5) 71 | img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy() 72 | 73 | img[img<0] = 0 74 | img[img>255] = 255 75 | 76 | img = np.transpose(img, (1, 2, 0)) 77 | 78 | img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB) 79 | 80 | for j in range(idxs[0].shape[0]): 81 | bbox = transformed_anchors[idxs[0][j], :] 82 | x1 = int(bbox[0]) 83 | y1 = int(bbox[1]) 84 | x2 = int(bbox[2]) 85 | y2 = int(bbox[3]) 86 | label_name = dataset_val.labels[int(classification[idxs[0][j]])] 87 | draw_caption(img, (x1, y1, 
x2, y2), label_name) 88 | 89 | cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 0, 255), thickness=2) 90 | print(label_name) 91 | 92 | cv2.imshow('img', img) 93 | cv2.waitKey(0) 94 | 95 | 96 | 97 | if __name__ == '__main__': 98 | main() --------------------------------------------------------------------------------
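An illustrative sketch (not part of the repository) of calling the Soft-NMS wrapper defined in opt/nms_wrapper.py; it assumes opt/ is importable as a Python package and that soft_nms_cpu.pyx has been compiled with Cython:

import numpy as np
from opt.nms_wrapper import soft_nms

# Five [x1, y1, x2, y2, score] detections; the first three overlap heavily.
dets = np.array([[10., 10., 60., 60., 0.95],
                 [12., 12., 62., 62., 0.90],
                 [11., 9., 59., 61., 0.60],
                 [100., 100., 150., 150., 0.80],
                 [101., 99., 149., 151., 0.20]], dtype=np.float32)

# Gaussian Soft-NMS decays the scores of overlapping boxes instead of dropping
# them outright; boxes whose decayed score falls below min_score are removed.
new_dets, inds = soft_nms(dets, iou_thr=0.5, method='gaussian', sigma=0.5, min_score=1e-3)
print(new_dets)  # surviving boxes with decayed scores
print(inds)      # indices into the original dets array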