├── .dockerignore ├── .gitignore ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.deepstream ├── INFERENCE.md ├── LICENSE ├── README.md ├── TRAINING.md ├── __init__.py ├── csrc ├── calibrator.h ├── cuda │ ├── decode.cu │ ├── decode.h │ ├── nms.cu │ ├── nms.h │ └── utils.h ├── engine.cpp ├── engine.h ├── extensions.cpp └── plugins │ ├── DecodePlugin.h │ └── NMSPlugin.h ├── extras ├── cppapi │ ├── CMakeLists.txt │ ├── README.md │ ├── export.cpp │ ├── infer.cpp │ └── infervideo.cpp ├── deepstream │ ├── README.md │ └── deepstream-sample │ │ ├── CMakeLists.txt │ │ ├── ds_config_1vid.txt │ │ ├── ds_config_8vid.txt │ │ ├── infer_config_batch1.txt │ │ ├── infer_config_batch8.txt │ │ ├── labels_coco.txt │ │ └── nvdsparsebbox_retinanet.cpp ├── tensorrt-6.0.1.5-cp36-none-linux_x86_64.whl └── test.sh ├── markup_utils ├── __init__.py └── supervisly_to_coco.py ├── retinanet ├── __init__.py ├── backbones │ ├── __init__.py │ ├── fpn.py │ ├── layers.py │ ├── resnet.py │ └── utils.py ├── box.py ├── dali.py ├── data.py ├── export_models.sh ├── infer.py ├── infer_example.py ├── inference_no_dali.py ├── loss.py ├── main.py ├── model.py ├── train.py └── utils.py ├── setup.py └── unet ├── common ├── __init__.py ├── models_common.py ├── pt_models.py └── smp_models.py ├── convert_to_trt.py ├── export_onnx.py └── infer_service.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .DS_Store 3 | __pycache__ 4 | *.pyc 5 | *.o 6 | *.so 7 | *.egg-info 8 | build 9 | dist 10 | .vscode 11 | *.jpg 12 | !tests/*.jpg 13 | *.pkl 14 | *.torch 15 | *.plan 16 | 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | *.pyc 4 | *.o 5 | *.so 6 | odtk/tensorrt/src/*.py 7 | odtk/tensorrt/src/*.cxx 8 | *.egg-info 9 | build 10 | dist 11 | .vscode 12 | *.jpg 13 | !tests/*.jpg 14 | *.pkl 15 | *.torch 16 | *.plan 17 | *.idea 18 | /idea 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Reporting problems, asking questions 2 | ------------------------------------ 3 | 4 | 5 | We appreciate feedback, questions or bug reports. When you need help with the code, try to follow the process outlined in the Stack Overflow (https://stackoverflow.com/help/mcve) document. 6 | 7 | At a minimum, your issues should describe the following: 8 | 9 | * What command you ran 10 | * What was the result you observed 11 | * What was the result you expected 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:19.10-py3 2 | 3 | ARG USER=alex 4 | ARG UID=1000 5 | ARG GID=1000 6 | ARG PW=alex 7 | RUN useradd -m ${USER} --uid=${UID} && echo "${USER}:${PW}" | chpasswd 8 | 9 | 10 | RUN apt-get -y update && apt-get -y upgrade && apt-get -y install curl && apt-get -y install wget && apt-get -y install git && apt-get -y install automake && apt-get install -y sudo && adduser ${USER} sudo 11 | #RUN git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ && cd .. 
12 | 13 | # COPY --chown=${USER}:${USER} 14 | 15 | RUN pip install git+https://github.com/bonlime/pytorch-tools.git@master 16 | 17 | COPY . retinanet/ 18 | RUN pip install --no-cache-dir -e retinanet/ 19 | RUN pip install /workspace/retinanet/extras/tensorrt-6.0.1.5-cp36-none-linux_x86_64.whl 20 | RUN pip install tensorboardx 21 | RUN pip install albumentations 22 | RUN pip install RPi.GPIO 23 | RUN pip install setproctitle 24 | RUN pip install paramiko 25 | RUN pip install flask 26 | RUN pip install mem_top 27 | RUN pip install arrow 28 | RUN pip install pycuda 29 | RUN pip install torchvision 30 | RUN pip install pretrainedmodels 31 | RUN pip install efficientnet-pytorch 32 | RUN pip install git+https://github.com/qubvel/segmentation_models.pytorch 33 | 34 | RUN chown -R ${USER}:${USER} retinanet/ 35 | 36 | RUN apt-get install -y openssh-server && apt install -y tmux && apt-get -y install bison flex && apt-cache search pcre && apt-get -y install net-tools && apt-get -y install nmap 37 | RUN apt-get -y install libpcre3 libpcre3-dev && apt-get -y install iputils-ping 38 | 39 | RUN git clone https://github.com/swig/swig.git && cd swig && ./autogen.sh && ./configure && make && make install && cd .. 40 | 41 | RUN wget https://www.baslerweb.com/fp-1551786516/media/downloads/software/pylon_software/pylon-5.2.0.13457-x86_64.tar.gz 42 | RUN tar -xvzf pylon-5.2.0.13457-x86_64.tar.gz && cd pylon-5.2.0.13457-x86_64 && tar -C /opt -xzf pylonSDK-5.2.0.13457-x86_64.tar.gz && cd .. 43 | 44 | RUN git clone https://github.com/basler/pypylon.git && cd pypylon && pip install . && cd .. 45 | 46 | RUN mkdir /var/run/sshd 47 | RUN echo 'root:pass' | chpasswd 48 | RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config 49 | 50 | # SSH login fix. Otherwise user is kicked off after login 51 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 52 | 53 | ENV NOTVISIBLE "in users profile" 54 | RUN echo "export VISIBLE=now" >> /etc/profile 55 | 56 | EXPOSE 22 5000 6000 6001 7000 7001 8000 57 | CMD ["/usr/sbin/sshd", "-D"] 58 | 59 | #USER ${UID}:${GID} 60 | #WORKDIR /home/${USER} 61 | -------------------------------------------------------------------------------- /Dockerfile.deepstream: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:19.02-py3 2 | 3 | COPY . 
/workspace/retinanet-examples/ 4 | 5 | RUN apt-get update && apt-get install -y libssl1.0.0 libgstreamer1.0-0 gstreamer1.0-tools gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav libgstrtspserver-1.0-0 libjansson4 ffmpeg 6 | 7 | WORKDIR /root 8 | 9 | RUN git clone https://github.com/edenhill/librdkafka.git /librdkafka && \ 10 | cd /librdkafka && ./configure && make && make install && \ 11 | mkdir -p /usr/local/deepstream && \ 12 | cp /usr/local/lib/librdkafka* /usr/local/deepstream 13 | 14 | COPY extras/deepstream/DeepStream_Release/binaries.tbz2 \ 15 | extras/deepstream/DeepStream_Release/LicenseAgreement.pdf \ 16 | extras/deepstream/DeepStream_Release/README \ 17 | /root/DeepStream_Release/ 18 | 19 | RUN cd /root/DeepStream_Release && \ 20 | tar -xvf binaries.tbz2 -C / 21 | 22 | # config files + sample apps 23 | COPY extras/deepstream/DeepStream_Release/samples \ 24 | /root/DeepStream_Release/samples 25 | 26 | COPY extras/deepstream/DeepStream_Release/sources \ 27 | /root/DeepStream_Release/sources 28 | 29 | RUN chmod u+x /root/DeepStream_Release/sources/tools/nvds_logger/setup_nvds_logger.sh 30 | 31 | # To get video driver libraries at runtime (libnvidia-encode.so/libnvcuvid.so) 32 | ENV NVIDIA_DRIVER_CAPABILITIES $NVIDIA_DRIVER_CAPABILITIES,video 33 | 34 | RUN ln -sf /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/lib/x86_64-linux-gnu/libnvcuvid.so 35 | 36 | RUN pip install --no-cache-dir -e /workspace/retinanet-examples 37 | 38 | RUN mkdir /workspace/retinanet-examples/extras/deepstream/deepstream-sample/build && \ 39 | cd /workspace/retinanet-examples/extras/deepstream/deepstream-sample/build && \ 40 | cmake -DDeepStream_DIR=/root/DeepStream_Release .. && make 41 | 42 | WORKDIR /workspace/retinanet-examples/extras/deepstream 43 | -------------------------------------------------------------------------------- /INFERENCE.md: -------------------------------------------------------------------------------- 1 | # Inference 2 | 3 | We provide two ways to do inference with `retinanet-examples`: 4 | * PyTorch inference using a trained model (FP32 or FP16 precision) 5 | * Export trained pytorch model to TensorRT for optimized inference (FP32, FP16 or INT8 precision) 6 | 7 | `retinanet-examples infer` will run distributed inference across all available GPUs. When using PyTorch, the default behavior is to run inference with mixed precision. The precision used when running inference with a TensorRT engine will correspond to the precision chosen when the model was exported to TensorRT (see [TensorRT section](#exporting-trained-pytorch-model-to-tensorrt section) below). 8 | 9 | **NOTE**: Availability of HW support for fast FP16 and INT8 precision like [NVIDIA Tensor Cores](https://www.nvidia.com/en-us/data-center/tensorcore/) depends on your GPU architecture: Volta or newer GPUs support both FP16 and INT8, and Pascal GPUs can support either FP16 or INT8. 10 | 11 | ## PyTorch Inference 12 | 13 | Evaluate trained PyTorch detection model on COCO 2017 (mixed precision): 14 | 15 | ```bash 16 | retinanet infer model.pth --images=/data/coco/val2017 --annotations=instances_val2017.json --batch 8 17 | ``` 18 | **NOTE**: `--batch N` specifies *global* batch size to be used for inference. The batch size per GPU will be `N // num_gpus`. 
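For example, with `--batch 8` on a machine with 4 GPUs, each GPU will run inference on batches of `8 // 4 = 2` images.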
19 | 20 | Use full precision (FP32) during evaluation: 21 | 22 | ```bash 23 | retinanet infer model.pth --images=/data/coco/val2017 --annotations=instances_val2017.json --full-precision 24 | ``` 25 | 26 | Evaluate the PyTorch detection model with a smaller input image size: 27 | 28 | ```bash 29 | retinanet infer model.pth --images=/data/coco/val2017 --annotations=instances_val2017.json --resize 400 --max-size 640 30 | ``` 31 | Here, the shorter side of the input images will be resized to `resize` as long as the longer side doesn't get larger than `max-size`; otherwise, the longer side of the input image will be resized to `max-size`. 32 | 33 | **NOTE**: For best accuracy, training the model at the preferred export size is encouraged. 34 | 35 | Run inference using your own dataset: 36 | 37 | ```bash 38 | retinanet infer model.pth --images=/data/your_images --output=detections.json 39 | ``` 40 | 41 | ## Exporting trained PyTorch model to TensorRT 42 | 43 | `retinanet-examples` provides a simple workflow to optimize a trained PyTorch model for inference deployment using TensorRT. The PyTorch model is exported to [ONNX](https://github.com/onnx/onnx), and then the ONNX model is consumed and optimized by TensorRT. 44 | To learn more about TensorRT optimization, refer here: https://developer.nvidia.com/tensorrt 45 | 46 | **NOTE**: When a model is optimized with TensorRT, the output is a TensorRT engine (.plan file) that can be used for deployment. This TensorRT engine has several fixed properties that are specified during the export process: 47 | * Input image size: TensorRT engines only support a fixed input size. 48 | * Precision: TensorRT supports FP32, FP16, or INT8 precision. 49 | * Target GPU: TensorRT optimizations are tied to the type of GPU on the system where optimization is performed. They are not transferable across different types of GPUs. Put another way, if you aim to deploy your TensorRT engine on a Tesla T4 GPU, you must run the optimization on a system with a T4 GPU. 50 | 51 | The workflow for exporting a trained PyTorch detection model to TensorRT is as simple as: 52 | 53 | ```bash 54 | retinanet-examples export model.pth model_fp16.plan --batch 1 --size 1280 55 | ``` 56 | This will create a TensorRT engine optimized for batch size 1, using an input size of 1280x1280. By default, the engine will be created to run in FP16 precision. 57 | 58 | Export your model to use full precision using a non-square input size: 59 | ```bash 60 | retinanet-examples export model.pth model_fp32.plan --full-precision --batch 1 --size 800 1280 61 | ``` 62 | 63 | In order to use INT8 precision with TensorRT, you need to provide calibration images (images that are representative of what will be seen at runtime) that will be used to determine the INT8 scaling factors for the network. 64 | ```bash 65 | retinanet-examples export model.pth model_int8.plan --batch 2 --int8 --calibration-images /data/val/ --calibration-batches 10 --calibration-table model_calibration_table 66 | ``` 67 | 68 | This will randomly select 20 images from `/data/val/` to calibrate the network for INT8 precision. The results from calibration will be saved to `model_calibration_table`, which can be used to create subsequent INT8 engines for this model without needing to recalibrate.
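The number of calibration images used is `--batch` multiplied by `--calibration-batches` (2 x 10 = 20 in the command above), so increase `--calibration-batches` if you want calibration to cover more of your validation images.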
69 | 70 | Build an INT8 engine for a previously calibrated model: 71 | ```bash 72 | retinanet-examples export model.pth model_int8.plan --batch 2 --int8 --calibration-table model_calibration_table 73 | ``` 74 | 75 | 76 | ## Deployment with TensorRT on NVIDIA Jetson AGX Xavier 77 | 78 | We provide a path for deploying trained models with TensorRT onto embedded platforms like [NVIDIA Jetson AGX Xavier](https://developer.nvidia.com/embedded/buy/jetson-agx-xavier-devkit), where PyTorch is not readily available. 79 | 80 | You will need to export your trained PyTorch model to ONNX representation on your host system, and copy the resulting ONNX model to your Jetson AGX Xavier: 81 | ```bash 82 | retinanet-examples export model.pth model.onnx --size 800 1280 83 | ``` 84 | 85 | Refer to additional documentation on using the example cppapi code to build the TensorRT engine and run inference here: [cppapi example code](extras/cppapi/README.md) 86 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RetinaNet Examples 2 | 3 | **Fast** and **accurate** single stage object detection with end-to-end GPU optimization. 4 | 5 | ## Description 6 | 7 | [RetinaNet](#references) is a single shot object detector with multiple backbones offering various performance/accuracy trade-offs. 
8 | 9 | It is optimized for end-to-end GPU processing using: 10 | * The [PyTorch](https://pytorch.org) deep learning framework with [ONNX](https://onnx.ai) support 11 | * NVIDIA [Apex](https://github.com/NVIDIA/apex) for mixed precision and distributed training 12 | * NVIDIA [DALI](https://github.com/NVIDIA/DALI) for optimized data pre-processing 13 | * NVIDIA [TensorRT](https://developer.nvidia.com/tensorrt) for high-performance inference 14 | * NVIDIA [DeepStream](https://developer.nvidia.com/deepstream-sdk) for optimized real-time video streams support 15 | 16 | ## Disclaimer 17 | 18 | This is a research project, not an official NVIDIA product. 19 | 20 | ## Performance 21 | 22 | The detection pipeline allows the user to select a specific backbone depending on the latency-accuracy trade-off preferred. 23 | 24 | Backbone | Resize | mAP @[IoU=0.50:0.95] | Training Time on [DGX1v](https://www.nvidia.com/en-us/data-center/dgx-1/) | TensorRT Inference Latency FP16 on [V100](https://www.nvidia.com/en-us/data-center/tesla-v100/) | TensorRT Inference Latency INT8 on [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) 25 | --- | :---: | :---: | :---: | :---: | :---: 26 | ResNet18FPN | 800 | 0.318 | 5 hrs | 12 ms/im | 12 ms/im 27 | ResNet34FPN | 800 | 0.343 | 6 hrs | 14 ms/im | 14 ms/im 28 | ResNet50FPN | 800 | 0.358 | 7 hrs | 16 ms/im | 16 ms/im 29 | ResNet101FPN | 800 | 0.376 | 10 hrs | 20 ms/im | 20 ms/im 30 | ResNet152FPN | 800 | 0.393 | 12 hrs | 25 ms/im | 24 ms/im 31 | 32 | Training results for [COCO 2017](http://cocodataset.org/#detection-2017) (train/val) after full training schedule with default parameters. Inference results include bounding boxes post-processing for a batch size of 1. 33 | 34 | ## Installation 35 | 36 | For best performance, we encourage using the latest [PyTorch NGC docker container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch): 37 | ```bash 38 | nvidia-docker run --rm --ipc=host -it nvcr.io/nvidia/pytorch:19.05-py3 39 | ``` 40 | 41 | From the container, simply install retinanet using `pip`: 42 | ```bash 43 | pip install --no-cache-dir git+https://github.com/nvidia/retinanet-examples 44 | ``` 45 | 46 | Or you can clone this repository, build and run your own image: 47 | ```bash 48 | git clone https://github.com/nvidia/retinanet-examples 49 | docker build -t retinanet:latest retinanet/ 50 | nvidia-docker run --rm --ipc=host -it retinanet:latest 51 | ``` 52 | 53 | ## Usage 54 | 55 | Training, inference, evaluation and model export can be done through the `retinanet` utility. 56 | 57 | For more details refer to the [INFERENCE](INFERENCE.md) and [TRAINING](TRAINING.md) documentation. 58 | 59 | ### Training 60 | 61 | Train a detection model on [COCO 2017](http://cocodataset.org/#download) from pre-trained backbone: 62 | ```bash 63 | retinanet train retinanet_rn50fpn.pth --backbone ResNet50FPN \ 64 | --images /coco/images/train2017/ --annotations /coco/annotations/instances_train2017.json \ 65 | --val-images /coco/images/val2017/ --val-annotations /coco/annotations/instances_val2017.json 66 | ``` 67 | 68 | ### Fine Tuning 69 | 70 | Fine-tune a pre-trained model on your dataset. 
In the example below we use [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html) with [JSON annotations](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip): 71 | ```bash 72 | retinanet train model_mydataset.pth \ 73 | --fine-tune retinanet_rn50fpn.pth \ 74 | --classes 20 --iters 10000 --val-iters 1000 --lr 0.0005 \ 75 | --resize 512 --jitter 480 640 --images /voc/JPEGImages/ \ 76 | --annotations /voc/pascal_train2012.json --val-annotations /voc/pascal_val2012.json 77 | ``` 78 | 79 | Note: the shorter side of the input images will be resized to `resize` as long as the longer side doesn't get larger than `max-size`. During training, the images will be randomly resized to a new size within the `jitter` range. 80 | 81 | ### Inference 82 | 83 | Evaluate your detection model on [COCO 2017](http://cocodataset.org/#download): 84 | ```bash 85 | retinanet infer retinanet_rn50fpn.pth --images /coco/images/val2017/ --annotations /coco/annotations/instances_val2017.json 86 | ``` 87 | 88 | Run inference on [your dataset](#datasets): 89 | ```bash 90 | retinanet infer retinanet_rn50fpn.pth --images /dataset/val --output detections.json 91 | ``` 92 | 93 | ### Optimized Inference with TensorRT 94 | 95 | For faster inference, export the detection model to an optimized FP16 TensorRT engine: 96 | ```bash 97 | retinanet export model.pth engine.plan 98 | ``` 99 | Note: for older versions of TensorRT (prior to TensorRT 5.1 / 19.03 containers), the ONNX opset version should be specified (using `--opset 8` for instance). 100 | 101 | Evaluate the model with TensorRT backend on [COCO 2017](http://cocodataset.org/#download): 102 | ```bash 103 | retinanet infer engine.plan --images /coco/images/val2017/ --annotations /coco/annotations/instances_val2017.json 104 | ``` 105 | 106 | ### INT8 Inference with TensorRT 107 | 108 | For even faster inference, do INT8 calibration to create an optimized INT8 TensorRT engine: 109 | ```bash 110 | retinanet export model.pth engine.plan --int8 --calibration-images /coco/images/val2017/ 111 | ``` 112 | This will create an INT8CalibrationTable file that can be used to create INT8 TensorRT engines for the same model later on without needing to do calibration. 113 | 114 | Or create an optimized INT8 TensorRT engine using a cached calibration table: 115 | ```bash 116 | retinanet export model.pth engine.plan --int8 --calibration-table /path/to/INT8CalibrationTable 117 | ``` 118 | 119 | ## Datasets 120 | 121 | RetinaNet supports annotations in the [COCO JSON format](http://cocodataset.org/#format-data). 122 | When converting the annotations from your own dataset into JSON, the following entries are required: 123 | ``` 124 | { 125 | "images": [{ 126 | "id" : int, 127 | "file_name" : str 128 | }], 129 | "annotations": [{ 130 | "id" : int, 131 | "image_id" : int, 132 | "category_id" : int, 133 | "bbox" : [x, y, w, h] 134 | }], 135 | "categories": [{ 136 | "id" : int 137 | }] 138 | } 139 | ``` 140 | 141 | ## References 142 | 143 | - [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002). 144 | Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, Piotr Dollár. 145 | ICCV, 2017. 146 | - [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677). 147 | Priya Goyal, Piotr Dollár, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, Kaiming He. 148 | June 2017. 149 | - [Feature Pyramid Networks for Object Detection](https://arxiv.org/abs/1612.03144).
150 | Tsung-Yi Lin, Piotr Dollár, Ross Girshick, Kaiming He, Bharath Hariharan, Serge Belongie. 151 | CVPR, 2017. 152 | - [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385). 153 | Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 154 | CVPR, 2016. 155 | -------------------------------------------------------------------------------- /TRAINING.md: -------------------------------------------------------------------------------- 1 | # Training 2 | 3 | There are two main ways to train a model with `retinanet-examples`: 4 | * Fine-tuning the detection model using a model already trained on a large dataset (like MS-COCO) 5 | * Fully training the detection model from random initialization using a pre-trained backbone (usually on ImageNet) 6 | 7 | ## Fine-tuning 8 | 9 | Fine-tuning an existing model trained on COCO allows you to use transfer learning to get an accurate model for your own dataset with minimal training. 10 | When fine-tuning, we re-initialize the last layer of the classification head so the network will re-learn how to map features to class scores regardless of the number of classes in your own dataset. 11 | 12 | You can fine-tune a pre-trained model on your dataset. In the example below we use [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html) with [JSON annotations](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip): 13 | ```bash 14 | retinanet train model_mydataset.pth \ 15 | --fine-tune retinanet_rn50fpn.pth \ 16 | --classes 20 --iters 10000 --val-iters 1000 --lr 0.0005 \ 17 | --resize 512 --jitter 480 640 --images /voc/JPEGImages/ \ 18 | --annotations /voc/pascal_train2012.json --val-annotations /voc/pascal_val2012.json 19 | ``` 20 | 21 | Even though the COCO model was trained on 80 classes, we can easily use transfer learning to fine-tune it on the Pascal VOC dataset, which contains only 20 classes. 22 | 23 | The shorter side of the input images will be resized to `resize` as long as the longer side doesn't get larger than `max-size`. 24 | During training the images will be randomly resized to a new size within the `jitter` range. 25 | 26 | We usually want to fine-tune the model with a lower learning rate `lr` than during full training and for fewer iterations `iters`. 27 | 28 | ## Full Training 29 | 30 | If you do not have a pre-trained model, if your dataset is substantially large, or if you have written your own backbone, then you should fully train the detection model. 31 | 32 | Full training usually starts from a pre-trained backbone (automatically downloaded with the current backbones we offer) that has been pre-trained on a classification task with a large dataset like [ImageNet](http://www.image-net.org). 33 | This is especially necessary for backbones using batch normalization, as they require large batch sizes during training that cannot be provided on the detection task since the input images have to be relatively large. 34 | 35 | Train a detection model on [COCO 2017](http://cocodataset.org/#download) from a pre-trained backbone: 36 | ```bash 37 | retinanet train retinanet_rn50fpn.pth --backbone ResNet50FPN \ 38 | --images /coco/images/train2017/ --annotations /coco/annotations/instances_train2017.json \ 39 | --val-images /coco/images/val2017/ --val-annotations /coco/annotations/instances_val2017.json 40 | ``` 41 | 42 | We use mixed precision training by default.
Full precision training can be used by providing the `full-precision` option, although it doesn't provide improved accuracy in our experience. 43 | 44 | If you want to set up your own training schedule, the following options are useful: 45 | * `iters` is the total number of iterations you want to train the model for (1 iteration with a `batch` size of 16 corresponds to going through 16 images of your dataset) 46 | * `milestones` is a list of iteration counts at which we want to decay the learning rate 47 | * `lr` represents the initial learning rate and `gamma` is the factor by which we multiply the learning rate at each decay milestone 48 | * `schedule` is a float value that `iters` and `milestones` will be multiplied with to easily scale the learning schedule 49 | * `warmup` is the number of initial iterations during which we want to linearly ramp up the learning rate to avoid early divergence of the loss. 50 | 51 | You can also monitor the loss and learning rate schedule of the training using TensorBoard by specifying a `logdir` path. 52 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aidonchuk/retinanet-examples/b0a9c0ef36c38eb8d602de83d68665b23df4e76f/__init__.py -------------------------------------------------------------------------------- /csrc/calibrator.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE.
21 | */ 22 | 23 | #pragma once 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include "NvInfer.h" 34 | 35 | using namespace std; 36 | using namespace cv; 37 | 38 | class ImageStream { 39 | public: 40 | ImageStream(int batchSize, Dims inputDims, const vector calibrationImages) 41 | : _batchSize(batchSize) 42 | , _calibrationImages(calibrationImages) 43 | , _currentBatch(0) 44 | , _maxBatches(_calibrationImages.size() / _batchSize) 45 | , _inputDims(inputDims) { 46 | _batch.resize(_batchSize * _inputDims.d[0] * _inputDims.d[1] * _inputDims.d[2]); 47 | } 48 | 49 | int getBatchSize() const { return _batchSize;} 50 | 51 | int getMaxBatches() const { return _maxBatches;} 52 | 53 | float* getBatch() { return &_batch[0];} 54 | 55 | Dims getInputDims() { return _inputDims;} 56 | 57 | bool next() { 58 | 59 | if (_currentBatch == _maxBatches) 60 | return false; 61 | 62 | for (int i = 0; i < _batchSize; i++) { 63 | auto image = imread(_calibrationImages[_batchSize * _currentBatch + i].c_str(), IMREAD_COLOR); 64 | cv::resize(image, image, Size(_inputDims.d[2], _inputDims.d[1])); 65 | cv::Mat pixels; 66 | image.convertTo(pixels, CV_32FC3, 1.0 / 255, 0); 67 | 68 | vector img; 69 | 70 | if (pixels.isContinuous()) 71 | img.assign((float*)pixels.datastart, (float*)pixels.dataend); 72 | else 73 | return false; 74 | 75 | auto hw = _inputDims.d[1] * _inputDims.d[2]; 76 | auto channels = _inputDims.d[0]; 77 | auto vol = channels * hw; 78 | 79 | for (int c = 0; c < channels; c++) { 80 | for (int j = 0; j < hw; j++) { 81 | _batch[i * vol + c * hw + j] = (img[channels * j + 2 - c] - _mean[c]) / _std[c]; 82 | } 83 | } 84 | } 85 | 86 | _currentBatch++; 87 | return true; 88 | } 89 | 90 | void reset() { 91 | _currentBatch = 0; 92 | } 93 | 94 | private: 95 | int _batchSize; 96 | vector _calibrationImages; 97 | int _currentBatch; 98 | int _maxBatches; 99 | Dims _inputDims; 100 | 101 | vector _mean {0.485, 0.456, 0.406}; 102 | vector _std {0.229, 0.224, 0.225}; 103 | vector _batch; 104 | 105 | }; 106 | 107 | class Int8EntropyCalibrator: public IInt8EntropyCalibrator { 108 | public: 109 | Int8EntropyCalibrator(ImageStream& stream, const string networkName, const string calibrationCacheName, bool readCache = true) 110 | : _stream(stream) 111 | , _networkName(networkName) 112 | , _calibrationCacheName(calibrationCacheName) 113 | , _readCache(readCache) { 114 | Dims d = _stream.getInputDims(); 115 | _inputCount = _stream.getBatchSize() * d.d[0] * d.d[1] * d.d[2]; 116 | cudaMalloc(&_deviceInput, _inputCount * sizeof(float)); 117 | } 118 | 119 | int getBatchSize() const override {return _stream.getBatchSize();} 120 | 121 | virtual ~Int8EntropyCalibrator() {cudaFree(_deviceInput);} 122 | 123 | bool getBatch(void* bindings[], const char* names[], int nbBindings) override { 124 | 125 | if (!_stream.next()) 126 | return false; 127 | 128 | cudaMemcpy(_deviceInput, _stream.getBatch(), _inputCount * sizeof(float), cudaMemcpyHostToDevice); 129 | bindings[0] = _deviceInput; 130 | return true; 131 | } 132 | 133 | const void* readCalibrationCache(size_t& length) { 134 | _calibrationCache.clear(); 135 | ifstream input(calibrationTableName(), ios::binary); 136 | input >> noskipws; 137 | if (_readCache && input.good()) 138 | copy(istream_iterator(input), istream_iterator(), back_inserter(_calibrationCache)); 139 | 140 | length = _calibrationCache.size(); 141 | return length ? 
&_calibrationCache[0] : nullptr; 142 | } 143 | 144 | void writeCalibrationCache(const void* cache, size_t length) { 145 | std::ofstream output(calibrationTableName(), std::ios::binary); 146 | output.write(reinterpret_cast(cache), length); 147 | } 148 | 149 | private: 150 | std::string calibrationTableName() { 151 | // Use calibration cache if provided 152 | if(_calibrationCacheName.length() > 0) 153 | return _calibrationCacheName; 154 | 155 | assert(_networkName.length() > 0); 156 | Dims d = _stream.getInputDims(); 157 | return std::string("Int8CalibrationTable_") + _networkName + to_string(d.d[1]) + "x" + to_string(d.d[2]) + "_" + to_string(_stream.getMaxBatches()); 158 | } 159 | 160 | ImageStream _stream; 161 | const string _networkName; 162 | const string _calibrationCacheName; 163 | bool _readCache {true}; 164 | size_t _inputCount; 165 | void* _deviceInput {nullptr}; 166 | vector _calibrationCache; 167 | 168 | }; 169 | -------------------------------------------------------------------------------- /csrc/cuda/decode.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #include "decode.h" 24 | #include "utils.h" 25 | 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | namespace retinanet { 40 | namespace cuda { 41 | 42 | int decode(int batch_size, 43 | const void *const *inputs, void **outputs, 44 | size_t height, size_t width, size_t scale, 45 | size_t num_anchors, size_t num_classes, 46 | const std::vector &anchors, float score_thresh, int top_n, 47 | void *workspace, size_t workspace_size, cudaStream_t stream) { 48 | 49 | int scores_size = num_anchors * num_classes * height * width; 50 | 51 | if (!workspace || !workspace_size) { 52 | // Return required scratch space size cub style 53 | workspace_size = get_size_aligned(anchors.size()); // anchors 54 | workspace_size += get_size_aligned(scores_size); // flags 55 | workspace_size += get_size_aligned(scores_size); // indices 56 | workspace_size += get_size_aligned(scores_size); // indices_sorted 57 | workspace_size += get_size_aligned(scores_size); // scores 58 | workspace_size += get_size_aligned(scores_size); // scores_sorted 59 | 60 | size_t temp_size_flag = 0; 61 | thrust::cuda_cub::cub::DeviceSelect::Flagged((void *)nullptr, temp_size_flag, 62 | thrust::cuda_cub::cub::CountingInputIterator(scores_size), 63 | (bool *)nullptr, (int *)nullptr, (int *)nullptr, scores_size); 64 | size_t temp_size_sort = 0; 65 | thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort, 66 | (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, scores_size); 67 | workspace_size += std::max(temp_size_flag, temp_size_sort); 68 | 69 | return workspace_size; 70 | } 71 | 72 | auto anchors_d = get_next_ptr(anchors.size(), workspace, workspace_size); 73 | cudaMemcpyAsync(anchors_d, anchors.data(), anchors.size() * sizeof *anchors_d, cudaMemcpyHostToDevice, stream); 74 | 75 | auto on_stream = thrust::cuda::par.on(stream); 76 | 77 | auto flags = get_next_ptr(scores_size, workspace, workspace_size); 78 | auto indices = get_next_ptr(scores_size, workspace, workspace_size); 79 | auto indices_sorted = get_next_ptr(scores_size, workspace, workspace_size); 80 | auto scores = get_next_ptr(scores_size, workspace, workspace_size); 81 | auto scores_sorted = get_next_ptr(scores_size, workspace, workspace_size); 82 | 83 | for (int batch = 0; batch < batch_size; batch++) { 84 | auto in_scores = static_cast(inputs[0]) + batch * scores_size; 85 | auto in_boxes = static_cast(inputs[1]) + batch * (scores_size / num_classes) * 4; 86 | 87 | auto out_scores = static_cast(outputs[0]) + batch * top_n; 88 | auto out_boxes = static_cast(outputs[1]) + batch * top_n; 89 | auto out_classes = static_cast(outputs[2]) + batch * top_n; 90 | 91 | // Discard scores below threshold 92 | thrust::transform(on_stream, in_scores, in_scores + scores_size, 93 | flags, thrust::placeholders::_1 > score_thresh); 94 | 95 | int *num_selected = reinterpret_cast(indices_sorted); 96 | thrust::cuda_cub::cub::DeviceSelect::Flagged(workspace, workspace_size, 97 | thrust::cuda_cub::cub::CountingInputIterator(0), 98 | flags, indices, num_selected, scores_size, stream); 99 | cudaStreamSynchronize(stream); 100 | int num_detections = *thrust::device_pointer_cast(num_selected); 101 | 102 | // Only keep top n scores 103 | auto indices_filtered = indices; 104 | if (num_detections > top_n) { 105 | thrust::gather(on_stream, indices, indices + num_detections, 106 | in_scores, scores); 107 | 
thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, 108 | scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores)*8, stream); 109 | indices_filtered = indices_sorted; 110 | num_detections = top_n; 111 | } 112 | 113 | // Gather boxes 114 | bool has_anchors = !anchors.empty(); 115 | thrust::transform(on_stream, indices_filtered, indices_filtered + num_detections, 116 | thrust::make_zip_iterator(thrust::make_tuple(out_scores, out_boxes, out_classes)), 117 | [=] __device__ (int i) { 118 | int x = i % width; 119 | int y = (i / width) % height; 120 | int a = (i / num_classes / height / width) % num_anchors; 121 | int cls = (i / height / width) % num_classes; 122 | float4 box = float4{ 123 | in_boxes[((a * 4 + 0) * height + y) * width + x], 124 | in_boxes[((a * 4 + 1) * height + y) * width + x], 125 | in_boxes[((a * 4 + 2) * height + y) * width + x], 126 | in_boxes[((a * 4 + 3) * height + y) * width + x] 127 | }; 128 | 129 | if (has_anchors) { 130 | // Add anchors offsets to deltas 131 | float x = (i % width) * scale; 132 | float y = ((i / width) % height) * scale; 133 | float *d = anchors_d + 4*a; 134 | 135 | float x1 = x + d[0]; 136 | float y1 = y + d[1]; 137 | float x2 = x + d[2]; 138 | float y2 = y + d[3]; 139 | float w = x2 - x1 + 1.0f; 140 | float h = y2 - y1 + 1.0f; 141 | float pred_ctr_x = box.x * w + x1 + 0.5f * w; 142 | float pred_ctr_y = box.y * h + y1 + 0.5f * h; 143 | float pred_w = exp(box.z) * w; 144 | float pred_h = exp(box.w) * h; 145 | 146 | box = float4{ 147 | max(0.0f, pred_ctr_x - 0.5f * pred_w), 148 | max(0.0f, pred_ctr_y - 0.5f * pred_h), 149 | min(pred_ctr_x + 0.5f * pred_w - 1.0f, width * scale - 1.0f), 150 | min(pred_ctr_y + 0.5f * pred_h - 1.0f, height * scale - 1.0f) 151 | }; 152 | } 153 | 154 | return thrust::make_tuple(in_scores[i], box, cls); 155 | }); 156 | 157 | // Zero-out unused scores 158 | if (num_detections < top_n) { 159 | thrust::fill(on_stream, out_scores + num_detections, 160 | out_scores + top_n, 0.0f); 161 | thrust::fill(on_stream, out_classes + num_detections, 162 | out_classes + top_n, 0.0f); 163 | } 164 | } 165 | 166 | return 0; 167 | } 168 | 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /csrc/cuda/decode.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #pragma once 24 | 25 | #include 26 | 27 | namespace retinanet { 28 | namespace cuda { 29 | 30 | int decode(int batchSize, 31 | const void *const *inputs, void **outputs, 32 | size_t height, size_t width, size_t scale, 33 | size_t num_anchors, size_t num_classes, 34 | const std::vector &anchors, float score_thresh, int top_n, 35 | void *workspace, size_t workspace_size, cudaStream_t stream); 36 | 37 | } 38 | } -------------------------------------------------------------------------------- /csrc/cuda/nms.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #include "nms.h" 24 | #include "utils.h" 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | namespace retinanet { 42 | namespace cuda { 43 | 44 | __global__ void nms_kernel( 45 | const int num_per_thread, const float threshold, const int num_detections, 46 | const int *indices, float *scores, const float *classes, const float4 *boxes) { 47 | 48 | // Go through detections by descending score 49 | for (int m = 0; m < num_detections; m++) { 50 | for (int n = 0; n < num_per_thread; n++) { 51 | int i = threadIdx.x * num_per_thread + n; 52 | if (i < num_detections && m < i && scores[m] > 0.0f) { 53 | int idx = indices[i]; 54 | int max_idx = indices[m]; 55 | int icls = classes[idx]; 56 | int mcls = classes[max_idx]; 57 | if (mcls == icls) { 58 | float4 ibox = boxes[idx]; 59 | float4 mbox = boxes[max_idx]; 60 | float x1 = max(ibox.x, mbox.x); 61 | float y1 = max(ibox.y, mbox.y); 62 | float x2 = min(ibox.z, mbox.z); 63 | float y2 = min(ibox.w, mbox.w); 64 | float w = max(0.0f, x2 - x1 + 1); 65 | float h = max(0.0f, y2 - y1 + 1); 66 | float iarea = (ibox.z - ibox.x + 1) * (ibox.w - ibox.y + 1); 67 | float marea = (mbox.z - mbox.x + 1) * (mbox.w - mbox.y + 1); 68 | float inter = w * h; 69 | float overlap = inter / (iarea + marea - inter); 70 | if (overlap > threshold) { 71 | scores[i] = 0.0f; 72 | } 73 | } 74 | } 75 | } 76 | 77 | // Sync discarded detections 78 | __syncthreads(); 79 | } 80 | } 81 | 82 | int nms(int batch_size, 83 | const void *const *inputs, void **outputs, 84 | size_t count, int detections_per_im, float nms_thresh, 85 | void *workspace, size_t workspace_size, cudaStream_t stream) { 86 | 87 | if (!workspace || !workspace_size) { 88 | // Return required scratch space size cub style 89 | workspace_size = get_size_aligned(count); // flags 90 | workspace_size += get_size_aligned(count); // indices 91 | workspace_size += get_size_aligned(count); // indices_sorted 92 | workspace_size += get_size_aligned(count); // scores 93 | workspace_size += get_size_aligned(count); // scores_sorted 94 | 95 | size_t temp_size_flag = 0; 96 | thrust::cuda_cub::cub::DeviceSelect::Flagged((void *)nullptr, temp_size_flag, 97 | thrust::cuda_cub::cub::CountingInputIterator(count), 98 | (bool *)nullptr, (int *)nullptr, (int *)nullptr, count); 99 | size_t temp_size_sort = 0; 100 | thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort, 101 | (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, count); 102 | workspace_size += std::max(temp_size_flag, temp_size_sort); 103 | 104 | return workspace_size; 105 | } 106 | 107 | auto on_stream = thrust::cuda::par.on(stream); 108 | 109 | auto flags = get_next_ptr(count, workspace, workspace_size); 110 | auto indices = get_next_ptr(count, workspace, workspace_size); 111 | auto indices_sorted = get_next_ptr(count, workspace, workspace_size); 112 | auto scores = get_next_ptr(count, workspace, workspace_size); 113 | auto scores_sorted = get_next_ptr(count, workspace, workspace_size); 114 | 115 | for (int batch = 0; batch < batch_size; batch++) { 116 | auto in_scores = static_cast(inputs[0]) + batch * count; 117 | auto in_boxes = static_cast(inputs[1]) + batch * count; 118 | auto in_classes = static_cast(inputs[2]) + batch * count; 119 | 120 | auto out_scores = static_cast(outputs[0]) + batch * detections_per_im; 121 | auto out_boxes = static_cast(outputs[1]) + 
batch * detections_per_im; 122 | auto out_classes = static_cast(outputs[2]) + batch * detections_per_im; 123 | 124 | // Discard null scores 125 | thrust::transform(on_stream, in_scores, in_scores + count, 126 | flags, thrust::placeholders::_1 > 0.0f); 127 | 128 | int *num_selected = reinterpret_cast(indices_sorted); 129 | thrust::cuda_cub::cub::DeviceSelect::Flagged(workspace, workspace_size, 130 | thrust::cuda_cub::cub::CountingInputIterator(0), 131 | flags, indices, num_selected, count, stream); 132 | cudaStreamSynchronize(stream); 133 | int num_detections = *thrust::device_pointer_cast(num_selected); 134 | 135 | // Sort scores and corresponding indices 136 | thrust::gather(on_stream, indices, indices + num_detections, in_scores, scores); 137 | thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, 138 | scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores)*8, stream); 139 | 140 | // Launch actual NMS kernel - 1 block with each thread handling n detections 141 | const int max_threads = 1024; 142 | int num_per_thread = ceil((float)num_detections / max_threads); 143 | nms_kernel<<<1, max_threads, 0, stream>>>(num_per_thread, nms_thresh, num_detections, 144 | indices_sorted, scores_sorted, in_classes, in_boxes); 145 | 146 | // Re-sort with updated scores 147 | thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, 148 | scores_sorted, scores, indices_sorted, indices, num_detections, 0, sizeof(*scores)*8, stream); 149 | 150 | // Gather filtered scores, boxes, classes 151 | num_detections = min(detections_per_im, num_detections); 152 | cudaMemcpyAsync(out_scores, scores, num_detections * sizeof *scores, cudaMemcpyDeviceToDevice, stream); 153 | if (num_detections < detections_per_im) { 154 | thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0); 155 | } 156 | thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes); 157 | thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes); 158 | } 159 | 160 | return 0; 161 | } 162 | 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /csrc/cuda/nms.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #pragma once 24 | 25 | namespace retinanet { 26 | namespace cuda { 27 | 28 | int nms(int batchSize, 29 | const void *const *inputs, void **outputs, 30 | size_t count, int detections_per_im, float nms_thresh, 31 | void *workspace, size_t workspace_size, cudaStream_t stream); 32 | 33 | } 34 | } -------------------------------------------------------------------------------- /csrc/cuda/utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #pragma once 24 | 25 | #include 26 | #include 27 | 28 | #define CUDA_ALIGN 256 29 | 30 | template 31 | inline size_t get_size_aligned(size_t num_elem) { 32 | size_t size = num_elem * sizeof(T); 33 | size_t extra_align = 0; 34 | if (size % CUDA_ALIGN != 0) { 35 | extra_align = CUDA_ALIGN - size % CUDA_ALIGN; 36 | } 37 | return size + extra_align; 38 | } 39 | 40 | template 41 | inline T *get_next_ptr(size_t num_elem, void *&workspace, size_t &workspace_size) { 42 | size_t size = get_size_aligned(num_elem); 43 | if (size > workspace_size) { 44 | throw std::runtime_error("Workspace is too small!"); 45 | } 46 | workspace_size -= size; 47 | T *ptr = reinterpret_cast(workspace); 48 | workspace = reinterpret_cast(reinterpret_cast(workspace) + size); 49 | return ptr; 50 | } 51 | -------------------------------------------------------------------------------- /csrc/engine.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #include "engine.h" 24 | 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | 31 | #include "plugins/DecodePlugin.h" 32 | #include "plugins/NMSPlugin.h" 33 | #include "calibrator.h" 34 | 35 | using namespace nvinfer1; 36 | using namespace nvonnxparser; 37 | 38 | namespace retinanet { 39 | 40 | class Logger : public ILogger { 41 | public: 42 | Logger(bool verbose) 43 | : _verbose(verbose) { 44 | } 45 | 46 | void log(Severity severity, const char *msg) override { 47 | if (_verbose || (severity != Severity::kINFO) && (severity != Severity::kVERBOSE)) 48 | cout << msg << endl; 49 | } 50 | 51 | private: 52 | bool _verbose{false}; 53 | }; 54 | 55 | void Engine::_load(const string &path) { 56 | ifstream file(path, ios::in | ios::binary); 57 | file.seekg (0, file.end); 58 | size_t size = file.tellg(); 59 | file.seekg (0, file.beg); 60 | 61 | char *buffer = new char[size]; 62 | file.read(buffer, size); 63 | file.close(); 64 | 65 | _engine = _runtime->deserializeCudaEngine(buffer, size, nullptr); 66 | 67 | delete[] buffer; 68 | } 69 | 70 | void Engine::_prepare() { 71 | _context = _engine->createExecutionContext(); 72 | cudaStreamCreate(&_stream); 73 | } 74 | 75 | Engine::Engine(const string &path, bool verbose) { 76 | Logger logger(verbose); 77 | _runtime = createInferRuntime(logger); 78 | _load(path); 79 | _prepare(); 80 | } 81 | 82 | Engine::~Engine() { 83 | if (_stream) cudaStreamDestroy(_stream); 84 | if (_context) _context->destroy(); 85 | if (_engine) _engine->destroy(); 86 | if (_runtime) _runtime->destroy(); 87 | } 88 | 89 | Engine::Engine(const char *onnx_model, size_t onnx_size, size_t batch, string precision, 90 | float score_thresh, int top_n, const vector>& anchors, 91 | float nms_thresh, int detections_per_im, const vector& calibration_images, 92 | string model_name, string calibration_table, bool verbose, size_t workspace_size) { 93 | 94 | Logger logger(verbose); 95 | _runtime = createInferRuntime(logger); 96 | 97 | bool fp16 = precision.compare("FP16") == 0; 98 | bool int8 = precision.compare("INT8") == 0; 99 | 100 | // Create builder 101 | auto builder = createInferBuilder(logger); 102 | auto builderConfig = builder->createBuilderConfig(); 103 | builder->setMaxBatchSize(batch); 104 | // Allow use of FP16 layers when running in INT8 105 | if(fp16 || int8) builderConfig->setFlag(BuilderFlag::kFP16); 106 | builderConfig->setMaxWorkspaceSize(workspace_size); 107 | 108 | // Parse ONNX FCN 109 | cout << "Building " << precision << " core model..." 
<< endl; 110 | const auto flags = 0U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); 111 | auto network = builder->createNetworkV2(flags); 112 | auto parser = createParser(*network, logger); 113 | parser->parse(onnx_model, onnx_size); 114 | 115 | auto input = network->getInput(0); 116 | auto inputDims = input->getDimensions(); 117 | 118 | std::unique_ptr calib; 119 | if (int8) { 120 | builderConfig->setFlag(BuilderFlag::kINT8); 121 | ImageStream stream(batch, inputDims, calibration_images); 122 | calib = std::unique_ptr(new Int8EntropyCalibrator(stream, model_name, calibration_table)); 123 | builderConfig->setInt8Calibrator(calib.get()); 124 | } 125 | 126 | // Add decode plugins 127 | cout << "Building accelerated plugins..." << endl; 128 | vector decodePlugins; 129 | vector scores, boxes, classes; 130 | auto nbOutputs = network->getNbOutputs(); 131 | for (int i = 0; i < nbOutputs / 2; i++) { 132 | auto classOutput = network->getOutput(i); 133 | auto boxOutput = network->getOutput(nbOutputs / 2 + i); 134 | auto outputDims = classOutput->getDimensions(); 135 | 136 | int scale = inputDims.d[2] / outputDims.d[2]; 137 | auto decodePlugin = DecodePlugin(score_thresh, top_n, anchors[i], scale); 138 | decodePlugins.push_back(decodePlugin); 139 | vector inputs = {classOutput, boxOutput}; 140 | auto layer = network->addPluginV2(inputs.data(), inputs.size(), decodePlugin); 141 | scores.push_back(layer->getOutput(0)); 142 | boxes.push_back(layer->getOutput(1)); 143 | classes.push_back(layer->getOutput(2)); 144 | } 145 | 146 | // Cleanup outputs 147 | for (int i = 0; i < nbOutputs; i++) { 148 | auto output = network->getOutput(0); 149 | network->unmarkOutput(*output); 150 | } 151 | 152 | // Concat tensors from each feature map 153 | vector concat; 154 | for (auto tensors : {scores, boxes, classes}) { 155 | auto layer = network->addConcatenation(tensors.data(), tensors.size()); 156 | concat.push_back(layer->getOutput(0)); 157 | } 158 | 159 | // Add NMS plugin 160 | auto nmsPlugin = NMSPlugin(nms_thresh, detections_per_im); 161 | auto layer = network->addPluginV2(concat.data(), concat.size(), nmsPlugin); 162 | vector names = {"scores", "boxes", "classes"}; 163 | for (int i = 0; i < layer->getNbOutputs(); i++) { 164 | auto output = layer->getOutput(i); 165 | network->markOutput(*output); 166 | output->setName(names[i].c_str()); 167 | } 168 | 169 | // Build engine 170 | cout << "Applying optimizations and building TRT CUDA engine..." << endl; 171 | _engine = builder->buildEngineWithConfig(*network, *builderConfig); 172 | 173 | // Housekeeping 174 | parser->destroy(); 175 | network->destroy(); 176 | builderConfig->destroy(); 177 | builder->destroy(); 178 | 179 | _prepare(); 180 | } 181 | 182 | void Engine::save(const string &path) { 183 | cout << "Writing to " << path << "..." 
<< endl; 184 | auto serialized = _engine->serialize(); 185 | ofstream file(path, ios::out | ios::binary); 186 | file.write(reinterpret_cast(serialized->data()), serialized->size()); 187 | 188 | serialized->destroy(); 189 | } 190 | 191 | void Engine::infer(vector &buffers, int batch) { 192 | _context->enqueue(batch, buffers.data(), _stream, nullptr); 193 | cudaStreamSynchronize(_stream); 194 | } 195 | 196 | vector Engine::getInputSize() { 197 | auto dims = _engine->getBindingDimensions(0); 198 | return {dims.d[1], dims.d[2]}; 199 | } 200 | 201 | int Engine::getMaxBatchSize() { 202 | return _engine->getMaxBatchSize(); 203 | } 204 | 205 | int Engine::getMaxDetections() { 206 | return _engine->getBindingDimensions(1).d[0]; 207 | } 208 | 209 | int Engine::getStride() { 210 | return 1; 211 | } 212 | 213 | } 214 | -------------------------------------------------------------------------------- /csrc/engine.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #pragma once 24 | 25 | #include 26 | #include 27 | 28 | #include 29 | 30 | #include 31 | 32 | using namespace std; 33 | using namespace nvinfer1; 34 | 35 | namespace retinanet { 36 | 37 | // RetinaNet wrapper around TensorRT CUDA engine 38 | class Engine { 39 | public: 40 | // Create engine from engine path 41 | Engine(const string &engine_path, bool verbose=false); 42 | 43 | // Create engine from serialized onnx model 44 | Engine(const char *onnx_model, size_t onnx_size, size_t batch, string precision, 45 | float score_thresh, int top_n, const vector>& anchors, 46 | float nms_thresh, int detections_per_im, const vector& calibration_files, 47 | string model_name, string calibration_table, bool verbose, size_t workspace_size=(1ULL << 30)); 48 | 49 | ~Engine(); 50 | 51 | // Save model to path 52 | void save(const string &path); 53 | 54 | // Infer using pre-allocated GPU buffers {data, scores, boxes, classes} 55 | void infer(vector &buffers, int batch=1); 56 | 57 | // Get (h, w) size of the fixed input 58 | vector getInputSize(); 59 | 60 | // Get max allowed batch size 61 | int getMaxBatchSize(); 62 | 63 | // Get max number of detections 64 | int getMaxDetections(); 65 | 66 | // Get stride 67 | int getStride(); 68 | 69 | private: 70 | IRuntime *_runtime = nullptr; 71 | ICudaEngine *_engine = nullptr; 72 | IExecutionContext *_context = nullptr; 73 | cudaStream_t _stream = nullptr; 74 | 75 | void _load(const string &path); 76 | void _prepare(); 77 | 78 | }; 79 | 80 | } 81 | -------------------------------------------------------------------------------- /csrc/extensions.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | 29 | #include "engine.h" 30 | #include "cuda/decode.h" 31 | #include "cuda/nms.h" 32 | 33 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 34 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 35 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 36 | 37 | vector decode(at::Tensor cls_head, at::Tensor box_head, 38 | vector &anchors, int scale, float score_thresh, int top_n) { 39 | 40 | CHECK_INPUT(cls_head); 41 | CHECK_INPUT(box_head); 42 | 43 | int batch = cls_head.size(0); 44 | int num_anchors = anchors.size() / 4; 45 | int num_classes = cls_head.size(1) / num_anchors; 46 | int height = cls_head.size(2); 47 | int width = cls_head.size(3); 48 | auto options = cls_head.options(); 49 | 50 | auto scores = at::zeros({batch, top_n}, options); 51 | auto boxes = at::zeros({batch, top_n, 4}, options); 52 | auto classes = at::zeros({batch, top_n}, options); 53 | 54 | 55 | // Create scratch buffer 56 | int size = retinanet::cuda::decode(batch, nullptr, nullptr, height, width, scale, 57 | num_anchors, num_classes, anchors, score_thresh, top_n, nullptr, 0, nullptr); 58 | auto scratch = at::zeros({size}, options.dtype(torch::kUInt8)); 59 | 60 | // Decode boxes 61 | vector inputs = {cls_head.data_ptr(), box_head.data_ptr()}; 62 | vector outputs = {scores.data_ptr(), boxes.data_ptr(), classes.data_ptr()}; 63 | retinanet::cuda::decode(batch, inputs.data(), outputs.data(), height, width, scale, 64 | num_anchors, num_classes, anchors, score_thresh, top_n, 65 | scratch.data_ptr(), size, at::cuda::getCurrentCUDAStream()); 66 | 67 | return {scores, boxes, classes}; 68 | } 69 | 70 | vector nms(at::Tensor scores, at::Tensor boxes, at::Tensor classes, 71 | float nms_thresh, int detections_per_im) { 72 | 73 | CHECK_INPUT(scores); 74 | CHECK_INPUT(boxes); 75 | CHECK_INPUT(classes); 76 | 77 | int batch = scores.size(0); 78 | int count = scores.size(1); 79 | auto options = scores.options(); 80 | 81 | auto nms_scores = at::zeros({batch, detections_per_im}, scores.options()); 82 | auto nms_boxes = at::zeros({batch, detections_per_im, 4}, boxes.options()); 83 | auto nms_classes = at::zeros({batch, detections_per_im}, classes.options()); 84 | 85 | // Create scratch buffer 86 | int size = retinanet::cuda::nms(batch, nullptr, nullptr, count, 87 | detections_per_im, nms_thresh, nullptr, 0, nullptr); 88 | auto scratch = at::zeros({size}, options.dtype(torch::kUInt8)); 89 | 90 | // Perform NMS 91 | vector inputs = {scores.data_ptr(), boxes.data_ptr(), classes.data_ptr()}; 92 | vector outputs = {nms_scores.data_ptr(), nms_boxes.data_ptr(), nms_classes.data_ptr()}; 93 | retinanet::cuda::nms(batch, inputs.data(), outputs.data(), count, 94 | detections_per_im, nms_thresh, 95 | scratch.data_ptr(), size, at::cuda::getCurrentCUDAStream()); 96 | 97 | return {nms_scores, nms_boxes, nms_classes}; 98 | } 99 | 100 | vector infer(retinanet::Engine &engine, at::Tensor data) { 101 | CHECK_INPUT(data); 102 | 103 | int batch = data.size(0); 104 | auto input_size = engine.getInputSize(); 105 | data = at::constant_pad_nd(data, {0, input_size[1] - data.size(3), 0, input_size[0] - data.size(2)}); 106 | 107 | int num_detections = engine.getMaxDetections(); 108 | auto scores = at::zeros({batch, num_detections}, data.options()); 109 | auto boxes = at::zeros({batch, num_detections, 4}, data.options()); 110 | auto classes = at::zeros({batch, num_detections}, data.options()); 111 | 112 | vector 
buffers; 113 | for (auto buffer : {data, scores, boxes, classes}) { 114 | buffers.push_back(buffer.data()); 115 | } 116 | 117 | engine.infer(buffers, batch); 118 | 119 | return {scores, boxes, classes}; 120 | } 121 | 122 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 123 | pybind11::class_(m, "Engine") 124 | .def(pybind11::init>&, float, int, const vector&, string, string, bool>()) 126 | .def("save", &retinanet::Engine::save) 127 | .def("infer", &retinanet::Engine::infer) 128 | .def_property_readonly("stride", &retinanet::Engine::getStride) 129 | .def_property_readonly("input_size", &retinanet::Engine::getInputSize) 130 | .def_static("load", [](const string &path) { 131 | return new retinanet::Engine(path); 132 | }) 133 | .def("__call__", [](retinanet::Engine &engine, at::Tensor data) { 134 | return infer(engine, data); 135 | }); 136 | m.def("decode", &decode); 137 | m.def("nms", &nms); 138 | } 139 | -------------------------------------------------------------------------------- /csrc/plugins/DecodePlugin.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 
21 | */ 22 | 23 | #pragma once 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | #include "../cuda/decode.h" 31 | 32 | using namespace nvinfer1; 33 | 34 | #define RETINANET_PLUGIN_NAME "RetinaNetDecode" 35 | #define RETINANET_PLUGIN_VERSION "1" 36 | #define RETINANET_PLUGIN_NAMESPACE "" 37 | 38 | namespace retinanet { 39 | 40 | class DecodePlugin : public IPluginV2 { 41 | float _score_thresh; 42 | int _top_n; 43 | std::vector _anchors; 44 | float _scale; 45 | 46 | size_t _height; 47 | size_t _width; 48 | size_t _num_anchors; 49 | size_t _num_classes; 50 | 51 | protected: 52 | void deserialize(void const* data, size_t length) { 53 | const char* d = static_cast(data); 54 | read(d, _score_thresh); 55 | read(d, _top_n); 56 | size_t anchors_size; 57 | read(d, anchors_size); 58 | while( anchors_size-- ) { 59 | float val; 60 | read(d, val); 61 | _anchors.push_back(val); 62 | } 63 | read(d, _scale); 64 | read(d, _height); 65 | read(d, _width); 66 | read(d, _num_anchors); 67 | read(d, _num_classes); 68 | } 69 | 70 | size_t getSerializationSize() const override { 71 | return sizeof(_score_thresh) + sizeof(_top_n) 72 | + sizeof(size_t) + sizeof(float) * _anchors.size() + sizeof(_scale) 73 | + sizeof(_height) + sizeof(_width) + sizeof(_num_anchors) + sizeof(_num_classes); 74 | } 75 | 76 | void serialize(void *buffer) const override { 77 | char* d = static_cast(buffer); 78 | write(d, _score_thresh); 79 | write(d, _top_n); 80 | write(d, _anchors.size()); 81 | for( auto &val : _anchors ) { 82 | write(d, val); 83 | } 84 | write(d, _scale); 85 | write(d, _height); 86 | write(d, _width); 87 | write(d, _num_anchors); 88 | write(d, _num_classes); 89 | } 90 | 91 | public: 92 | DecodePlugin(float score_thresh, int top_n, std::vector const& anchors, int scale) 93 | : _score_thresh(score_thresh), _top_n(top_n), _anchors(anchors), _scale(scale) {} 94 | 95 | DecodePlugin(void const* data, size_t length) { 96 | this->deserialize(data, length); 97 | } 98 | 99 | const char *getPluginType() const override { 100 | return RETINANET_PLUGIN_NAME; 101 | } 102 | 103 | const char *getPluginVersion() const override { 104 | return RETINANET_PLUGIN_VERSION; 105 | } 106 | 107 | int getNbOutputs() const override { 108 | return 3; 109 | } 110 | 111 | Dims getOutputDimensions(int index, 112 | const Dims *inputs, int nbInputDims) override { 113 | assert(nbInputDims == 2); 114 | assert(index < this->getNbOutputs()); 115 | return Dims3(_top_n * (index == 1 ? 
4 : 1), 1, 1); 116 | } 117 | 118 | bool supportsFormat(DataType type, PluginFormat format) const override { 119 | return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; 120 | } 121 | 122 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, 123 | int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override { 124 | assert(type == nvinfer1::DataType::kFLOAT && format == nvinfer1::PluginFormat::kLINEAR); 125 | assert(nbInputs == 2); 126 | auto const& scores_dims = inputDims[0]; 127 | auto const& boxes_dims = inputDims[1]; 128 | assert(scores_dims.d[1] == boxes_dims.d[1]); 129 | assert(scores_dims.d[2] == boxes_dims.d[2]); 130 | _height = scores_dims.d[1]; 131 | _width = scores_dims.d[2]; 132 | _num_anchors = boxes_dims.d[0] / 4; 133 | _num_classes = scores_dims.d[0] / _num_anchors; 134 | } 135 | 136 | int initialize() override { return 0; } 137 | 138 | void terminate() override {} 139 | 140 | size_t getWorkspaceSize(int maxBatchSize) const override { 141 | static int size = -1; 142 | if (size < 0) { 143 | size = cuda::decode(maxBatchSize, nullptr, nullptr, _height, _width, _scale, 144 | _num_anchors, _num_classes, _anchors, _score_thresh, _top_n, 145 | nullptr, 0, nullptr); 146 | } 147 | return size; 148 | } 149 | 150 | int enqueue(int batchSize, 151 | const void *const *inputs, void **outputs, 152 | void *workspace, cudaStream_t stream) override { 153 | return cuda::decode(batchSize, inputs, outputs, _height, _width, _scale, 154 | _num_anchors, _num_classes, _anchors, _score_thresh, _top_n, 155 | workspace, getWorkspaceSize(batchSize), stream); 156 | } 157 | 158 | void destroy() override { 159 | delete this; 160 | }; 161 | 162 | const char *getPluginNamespace() const override { 163 | return RETINANET_PLUGIN_NAMESPACE; 164 | } 165 | 166 | void setPluginNamespace(const char *N) override { 167 | 168 | } 169 | 170 | IPluginV2 *clone() const override { 171 | return new DecodePlugin(_score_thresh, _top_n, _anchors, _scale); 172 | } 173 | 174 | private: 175 | template void write(char*& buffer, const T& val) const { 176 | *reinterpret_cast(buffer) = val; 177 | buffer += sizeof(T); 178 | } 179 | 180 | template void read(const char*& buffer, T& val) { 181 | val = *reinterpret_cast(buffer); 182 | buffer += sizeof(T); 183 | } 184 | }; 185 | 186 | class DecodePluginCreator : public IPluginCreator { 187 | public: 188 | DecodePluginCreator() {} 189 | 190 | const char *getPluginName () const override { 191 | return RETINANET_PLUGIN_NAME; 192 | } 193 | 194 | const char *getPluginVersion () const override { 195 | return RETINANET_PLUGIN_VERSION; 196 | } 197 | 198 | const char *getPluginNamespace() const override { 199 | return RETINANET_PLUGIN_NAMESPACE; 200 | } 201 | 202 | IPluginV2 *deserializePlugin (const char *name, const void *serialData, size_t serialLength) override { 203 | return new DecodePlugin(serialData, serialLength); 204 | } 205 | 206 | void setPluginNamespace(const char *N) override {} 207 | const PluginFieldCollection *getFieldNames() override { return nullptr; } 208 | IPluginV2 *createPlugin (const char *name, const PluginFieldCollection *fc) override { return nullptr; } 209 | }; 210 | 211 | REGISTER_TENSORRT_PLUGIN(DecodePluginCreator); 212 | 213 | } 214 | 215 | #undef RETINANET_PLUGIN_NAME 216 | #undef RETINANET_PLUGIN_VERSION 217 | #undef RETINANET_PLUGIN_NAMESPACE 218 | -------------------------------------------------------------------------------- /csrc/plugins/NMSPlugin.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #pragma once 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | #include "../cuda/nms.h" 31 | 32 | using namespace nvinfer1; 33 | 34 | #define RETINANET_PLUGIN_NAME "RetinaNetNMS" 35 | #define RETINANET_PLUGIN_VERSION "1" 36 | #define RETINANET_PLUGIN_NAMESPACE "" 37 | 38 | namespace retinanet { 39 | 40 | class NMSPlugin : public IPluginV2 { 41 | float _nms_thresh; 42 | int _detections_per_im; 43 | 44 | size_t _count; 45 | 46 | protected: 47 | void deserialize(void const* data, size_t length) { 48 | const char* d = static_cast(data); 49 | read(d, _nms_thresh); 50 | read(d, _detections_per_im); 51 | read(d, _count); 52 | } 53 | 54 | size_t getSerializationSize() const override { 55 | return sizeof(_nms_thresh) + sizeof(_detections_per_im) 56 | + sizeof(_count); 57 | } 58 | 59 | void serialize(void *buffer) const override { 60 | char* d = static_cast(buffer); 61 | write(d, _nms_thresh); 62 | write(d, _detections_per_im); 63 | write(d, _count); 64 | } 65 | 66 | public: 67 | NMSPlugin(float nms_thresh, int detections_per_im) 68 | : _nms_thresh(nms_thresh), _detections_per_im(detections_per_im) { 69 | assert(nms_thresh > 0); 70 | assert(detections_per_im > 0); 71 | } 72 | 73 | NMSPlugin(void const* data, size_t length) { 74 | this->deserialize(data, length); 75 | } 76 | 77 | const char *getPluginType() const override { 78 | return RETINANET_PLUGIN_NAME; 79 | } 80 | 81 | const char *getPluginVersion() const override { 82 | return RETINANET_PLUGIN_VERSION; 83 | } 84 | 85 | int getNbOutputs() const override { 86 | return 3; 87 | } 88 | 89 | Dims getOutputDimensions(int index, 90 | const Dims *inputs, int nbInputDims) override { 91 | assert(nbInputDims == 3); 92 | assert(index < this->getNbOutputs()); 93 | return Dims3(_detections_per_im * (index == 1 ? 
4 : 1), 1, 1); 94 | } 95 | 96 | bool supportsFormat(DataType type, PluginFormat format) const override { 97 | return type == DataType::kFLOAT && format == PluginFormat::kLINEAR; 98 | } 99 | 100 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, 101 | int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override { 102 | assert(type == nvinfer1::DataType::kFLOAT && format == nvinfer1::PluginFormat::kLINEAR); 103 | assert(nbInputs == 3); 104 | assert(inputDims[0].d[0] == inputDims[2].d[0]); 105 | assert(inputDims[1].d[0] == inputDims[2].d[0] * 4); 106 | _count = inputDims[0].d[0]; 107 | } 108 | 109 | int initialize() override { return 0; } 110 | 111 | void terminate() override {} 112 | 113 | size_t getWorkspaceSize(int maxBatchSize) const override { 114 | static int size = -1; 115 | if (size < 0) { 116 | size = cuda::nms(maxBatchSize, nullptr, nullptr, _count, 117 | _detections_per_im, _nms_thresh, 118 | nullptr, 0, nullptr); 119 | } 120 | return size; 121 | } 122 | 123 | int enqueue(int batchSize, 124 | const void *const *inputs, void **outputs, 125 | void *workspace, cudaStream_t stream) override { 126 | return cuda::nms(batchSize, inputs, outputs, _count, 127 | _detections_per_im, _nms_thresh, 128 | workspace, getWorkspaceSize(batchSize), stream); 129 | } 130 | 131 | void destroy() override { 132 | delete this; 133 | } 134 | 135 | const char *getPluginNamespace() const override { 136 | return RETINANET_PLUGIN_NAMESPACE; 137 | } 138 | 139 | void setPluginNamespace(const char *N) override { 140 | 141 | } 142 | 143 | IPluginV2 *clone() const override { 144 | return new NMSPlugin(_nms_thresh, _detections_per_im); 145 | } 146 | 147 | private: 148 | template void write(char*& buffer, const T& val) const { 149 | *reinterpret_cast(buffer) = val; 150 | buffer += sizeof(T); 151 | } 152 | 153 | template void read(const char*& buffer, T& val) { 154 | val = *reinterpret_cast(buffer); 155 | buffer += sizeof(T); 156 | } 157 | }; 158 | 159 | class NMSPluginCreator : public IPluginCreator { 160 | public: 161 | NMSPluginCreator() {} 162 | 163 | const char *getPluginNamespace() const override { 164 | return RETINANET_PLUGIN_NAMESPACE; 165 | } 166 | const char *getPluginName () const override { 167 | return RETINANET_PLUGIN_NAME; 168 | } 169 | 170 | const char *getPluginVersion () const override { 171 | return RETINANET_PLUGIN_VERSION; 172 | } 173 | 174 | IPluginV2 *deserializePlugin (const char *name, const void *serialData, size_t serialLength) override { 175 | return new NMSPlugin(serialData, serialLength); 176 | } 177 | 178 | void setPluginNamespace(const char *N) override {} 179 | const PluginFieldCollection *getFieldNames() override { return nullptr; } 180 | IPluginV2 *createPlugin (const char *name, const PluginFieldCollection *fc) override { return nullptr; } 181 | }; 182 | 183 | REGISTER_TENSORRT_PLUGIN(NMSPluginCreator); 184 | 185 | } 186 | 187 | #undef RETINANET_PLUGIN_NAME 188 | #undef RETINANET_PLUGIN_VERSION 189 | #undef RETINANET_PLUGIN_NAMESPACE 190 | -------------------------------------------------------------------------------- /extras/cppapi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9 FATAL_ERROR) 2 | 3 | project(retinanet_infer LANGUAGES CXX) 4 | set(CMAKE_CXX_STANDARD 11) 5 | find_package(CUDA REQUIRED) 6 | enable_language(CUDA) 7 | 8 | if(DEFINED TensorRT_DIR) 9 | include_directories("${TensorRT_DIR}/include") 10 | 
link_directories("${TensorRT_DIR}/lib") 11 | endif(DEFINED TensorRT_DIR) 12 | include_directories(${CUDA_INCLUDE_DIRS}) 13 | 14 | add_library(retinanet 15 | ../../csrc/cuda/decode.h 16 | ../../csrc/cuda/decode.cu 17 | ../../csrc/cuda/nms.h 18 | ../../csrc/cuda/nms.cu 19 | ../../csrc/cuda/utils.h 20 | ../../csrc/engine.h 21 | ../../csrc/engine.cpp 22 | ../../csrc/calibrator.h 23 | ) 24 | set_target_properties(retinanet PROPERTIES 25 | CUDA_RESOLVE_DEVICE_SYMBOLS ON 26 | ) 27 | target_link_libraries(retinanet PUBLIC nvinfer nvonnxparser ${OPENCV_LIBS}) 28 | 29 | find_package(OpenCV REQUIRED) 30 | add_executable(export export.cpp) 31 | target_link_libraries(export PRIVATE retinanet ${OpenCV_LIBS}) 32 | 33 | find_package(OpenCV REQUIRED) 34 | 35 | add_executable(infer infer.cpp) 36 | target_link_libraries(infer PRIVATE retinanet ${OpenCV_LIBS} cuda ${CUDA_LIBRARIES}) 37 | 38 | if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 39 | add_executable(infervideo infervideo.cpp) 40 | target_link_libraries(infervideo PRIVATE retinanet ${OpenCV_LIBS} cuda ${CUDA_LIBRARIES}) 41 | endif() 42 | -------------------------------------------------------------------------------- /extras/cppapi/README.md: -------------------------------------------------------------------------------- 1 | # RetinaNet C++ Inference API - Sample Code 2 | 3 | The C++ API allows you to build a TensorRT engine for inference using the ONNX export of a core model. 4 | 5 | The following shows how to build and run code samples for exporting an ONNX core model (from RetinaNet or other toolkit supporting the same sort of core model structure) to a TensorRT engine and doing inference on images. 6 | 7 | ## Building 8 | 9 | Building the example requires the following toolkits and libraries to be set up properly on your system: 10 | * A proper C++ toolchain (MSVS on Windows) 11 | * [CMake](https://cmake.org/download/) version 3.9 or later 12 | * NVIDIA [CUDA](https://developer.nvidia.com/cuda-toolkit) 13 | * NVIDIA [CuDNN](https://developer.nvidia.com/cudnn) 14 | * NVIDIA [TensorRT](https://developer.nvidia.com/tensorrt) 15 | * [OpenCV](https://opencv.org/releases.html) 16 | 17 | ### Linux 18 | ```bash 19 | mkdir build && cd build 20 | cmake -DCMAKE_CUDA_FLAGS="--expt-extended-lambda -std=c++11" .. 21 | make 22 | ``` 23 | 24 | ### Windows 25 | ```bash 26 | mkdir build && cd build 27 | cmake -G "Visual Studio 15 2017" -A x64 -T host=x64,cuda=10.0 -DTensorRT_DIR="C:\path\to\tensorrt" -DOpenCV_DIR="C:\path\to\opencv\build" .. 28 | msbuild retinanet_infer.sln 29 | ``` 30 | 31 | ## Running 32 | 33 | If you don't have an ONNX core model, generate one from your RetinaNet model: 34 | ```bash 35 | retinanet export model.pth model.onnx 36 | ``` 37 | 38 | Load the ONNX core model and export it to a RetinaNet TensorRT engine (using FP16 precision): 39 | ```bash 40 | export{.exe} model.onnx engine.plan 41 | ``` 42 | 43 | You can also export the ONNX core model to an INT8 TensorRT engine if you have already done INT8 calibration: 44 | ```bash 45 | export{.exe} model.onnx engine.plan INT8CalibrationTable 46 | ``` 47 | 48 | Run a test inference: 49 | ```bash 50 | infer{.exe} engine.plan image.jpg 51 | ``` 52 | 53 | Note: make sure the TensorRT, CuDNN and OpenCV libraries are available in your environment and path. 
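If you would rather drive the engine from your own application than use the provided `infer` sample, the `retinanet::Engine` wrapper declared in `csrc/engine.h` is the only interface you need. The sketch below is not one of the shipped samples: it assumes it is built alongside them in `extras/cppapi` against an existing `engine.plan`, and it omits image preprocessing and error handling, so treat it as a starting point rather than a complete program:

```cpp
// Minimal sketch: load a serialized TensorRT engine and run one inference
// on pre-allocated GPU buffers. See infer.cpp for the full sample with
// preprocessing, timing and result readback.
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

#include "../../csrc/engine.h"

int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " engine.plan" << std::endl;
    return 1;
  }

  retinanet::Engine engine(argv[1]);        // deserialize the TensorRT engine
  auto inputSize = engine.getInputSize();   // fixed {height, width} of the input binding
  int numDet = engine.getMaxDetections();   // max detections per image

  // Device buffers in the order the engine expects: {data, scores, boxes, classes}
  void *data_d, *scores_d, *boxes_d, *classes_d;
  cudaMalloc(&data_d, 3 * inputSize[0] * inputSize[1] * sizeof(float));
  cudaMalloc(&scores_d, numDet * sizeof(float));
  cudaMalloc(&boxes_d, numDet * 4 * sizeof(float));
  cudaMalloc(&classes_d, numDet * sizeof(float));

  // A real application would cudaMemcpy a normalized CHW float image into data_d here.
  std::vector<void *> buffers = {data_d, scores_d, boxes_d, classes_d};
  engine.infer(buffers, 1);                 // synchronous inference, batch size 1

  // Detections can now be copied back from scores_d, boxes_d and classes_d.
  cudaFree(data_d);
  cudaFree(scores_d);
  cudaFree(boxes_d);
  cudaFree(classes_d);
  return 0;
}
```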
54 | 55 | We have verified these steps with the following configurations: 56 | * DGX-1V using the provided Docker container (CUDA 10, cuDNN 7.4.2, TensorRT 5.0.2, OpenCV 3.4.3) 57 | * Jetson AGX Xavier with JetPack 4.1.1 Developer Preview (CUDA 10, cuDNN 7.3.1, TensorRT 5.0.3, OpenCV 3.3.1) 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /extras/cppapi/export.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../../csrc/engine.h" 7 | 8 | using namespace std; 9 | 10 | // Sample program to build a TensorRT Engine from an ONNX model from RetinaNet 11 | // 12 | // By default TensorRT will target FP16 precision (supported on Pascal, Volta, and Turing GPUs) 13 | // 14 | // You can optionally provide an INT8CalibrationTable file created during RetinaNet INT8 calibration 15 | // to build a TensorRT engine with INT8 precision 16 | 17 | int main(int argc, char *argv[]) { 18 | if (argc != 3 && argc != 4) { 19 | cerr << "Usage: " << argv[0] << " core_model.onnx engine.plan {Int8CalibrationTable}" << endl; 20 | return 1; 21 | } 22 | 23 | ifstream onnxFile; 24 | onnxFile.open(argv[1], ios::in | ios::binary); 25 | 26 | if (!onnxFile.good()) { 27 | cerr << "\nERROR: Unable to read specified ONNX model " << argv[1] << endl; 28 | return -1; 29 | } 30 | 31 | onnxFile.seekg (0, onnxFile.end); 32 | size_t size = onnxFile.tellg(); 33 | onnxFile.seekg (0, onnxFile.beg); 34 | 35 | auto *buffer = new char[size]; 36 | onnxFile.read(buffer, size); 37 | onnxFile.close(); 38 | 39 | // Define default RetinaNet parameters to use for TRT export 40 | int batch = 1; 41 | float score_thresh = 0.05f; 42 | int top_n = 1000; 43 | size_t workspace_size =(1ULL << 30); 44 | float nms_thresh = 0.5; 45 | int detections_per_im = 100; 46 | bool verbose = false; 47 | vector> anchors = { 48 | {-12.0, -12.0, 19.0, 19.0, -8.0, -20.0, 15.0, 27.0, -18.0, -8.0, 25.0, 15.0, -16.15, -16.15, 23.15, 23.15, -11.11, -26.23, 18.11, 33.23, -23.71, -11.11, 30.71, 18.11, -21.39, -21.39, 28.39, 28.39, -15.04, -34.09, 22.04, 41.09, -30.92, -15.04, 37.92, 22.04}, 49 | {-24.0, -24.0, 39.0, 39.0, -14.0, -36.0, 29.0, 51.0, -38.0, -16.0, 53.0, 31.0, -32.31, -32.31, 47.31, 47.31, -19.71, -47.43, 34.71, 62.43, -49.95, -22.23, 64.95, 37.23, -42.79, -42.79, 57.79, 57.79, -26.92, -61.84, 41.92, 76.84, -65.02, -30.09, 80.02, 45.09}, 50 | {-48.0, -48.0, 79.0, 79.0, -30.0, -76.0, 61.0, 107.0, -74.0, -28.0, 105.0, 59.0, -64.63, -64.63, 95.63, 95.63, -41.95, -99.91, 72.95, 130.91, -97.39, -39.43, 128.39, 70.43, -85.59, -85.59, 116.59, 116.59, -57.02, -130.04, 88.02, 161.04, -126.86, -53.84, 157.86, 84.84}, 51 | {-96.0, -96.0, 159.0, 159.0, -58.0, -148.0, 121.0, 211.0, -150.0, -60.0, 213.0, 123.0, -129.26, -129.26, 192.26, 192.26, -81.39, -194.78, 144.39, 257.78, -197.30, -83.91, 260.30, 146.91, -171.18, -171.18, 234.18, 234.18, -110.86, -253.73, 173.86, 316.73, -256.90, -114.04, 319.90, 177.04}, 52 | {-192.0, -192.0, 319.0, 319.0, -118.0, -300.0, 245.0, 427.0, -298.0, -116.0, 425.0, 243.0, -258.53, -258.53, 385.53, 385.53, -165.30, -394.61, 292.30, 521.61, -392.09, -162.78, 519.09, 289.78, -342.37, -342.37, 469.37, 469.37, -224.90, -513.81, 351.90, 640.81, -510.63, -221.73, 637.63, 348.73} 53 | }; 54 | 55 | // For now, assume we have already done calibration elsewhere 56 | // if we want to create an INT8 TensorRT engine, so no need 57 | // to provide calibration files or model name 58 | const vector 
calibration_files; 59 | string model_name = ""; 60 | string calibration_table = argc == 4 ? string(argv[3]) : ""; 61 | 62 | // Use FP16 precision by default, use INT8 if calibration table is provided 63 | string precision = "FP16"; 64 | if (argc == 4) 65 | precision = "INT8"; 66 | 67 | cout << "Building engine..." << endl; 68 | auto engine = retinanet::Engine(buffer, size, batch, precision, score_thresh, top_n, 69 | anchors, nms_thresh, detections_per_im, calibration_files, model_name, calibration_table, verbose, workspace_size); 70 | engine.save(string(argv[2])); 71 | 72 | 73 | delete [] buffer; 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /extras/cppapi/infer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include "../../csrc/engine.h" 15 | 16 | using namespace std; 17 | using namespace cv; 18 | 19 | int main(int argc, char *argv[]) { 20 | if (argc != 3) { 21 | cerr << "Usage: " << argv[0] << " engine.plan image.jpg" << endl; 22 | return 1; 23 | } 24 | 25 | cout << "Loading engine..." << endl; 26 | auto engine = retinanet::Engine(argv[1]); 27 | 28 | cout << "Preparing data..." << endl; 29 | auto image = imread(argv[2], IMREAD_COLOR); 30 | auto inputSize = engine.getInputSize(); 31 | cv::resize(image, image, Size(inputSize[1], inputSize[0])); 32 | cv::Mat pixels; 33 | image.convertTo(pixels, CV_32FC3, 1.0 / 255, 0); 34 | 35 | int channels = 3; 36 | vector img; 37 | vector data (channels * inputSize[0] * inputSize[1]); 38 | 39 | if (pixels.isContinuous()) 40 | img.assign((float*)pixels.datastart, (float*)pixels.dataend); 41 | else { 42 | cerr << "Error reading image " << argv[2] << endl; 43 | return -1; 44 | } 45 | 46 | vector mean {0.485, 0.456, 0.406}; 47 | vector std {0.229, 0.224, 0.225}; 48 | 49 | for (int c = 0; c < channels; c++) { 50 | for (int j = 0, hw = inputSize[0] * inputSize[1]; j < hw; j++) { 51 | data[c * hw + j] = (img[channels * j + 2 - c] - mean[c]) / std[c]; 52 | } 53 | } 54 | 55 | // Create device buffers 56 | void *data_d, *scores_d, *boxes_d, *classes_d; 57 | auto num_det = engine.getMaxDetections(); 58 | cudaMalloc(&data_d, 3 * inputSize[0] * inputSize[1] * sizeof(float)); 59 | cudaMalloc(&scores_d, num_det * sizeof(float)); 60 | cudaMalloc(&boxes_d, num_det * 4 * sizeof(float)); 61 | cudaMalloc(&classes_d, num_det * sizeof(float)); 62 | 63 | // Copy image to device 64 | size_t dataSize = data.size() * sizeof(float); 65 | cudaMemcpy(data_d, data.data(), dataSize, cudaMemcpyHostToDevice); 66 | 67 | // Run inference n times 68 | cout << "Running inference..." << endl; 69 | const int count = 100; 70 | auto start = chrono::steady_clock::now(); 71 | vector buffers = { data_d, scores_d, boxes_d, classes_d }; 72 | for (int i = 0; i < count; i++) { 73 | engine.infer(buffers, 1); 74 | } 75 | auto stop = chrono::steady_clock::now(); 76 | auto timing = chrono::duration_cast>(stop - start); 77 | cout << "Took " << timing.count() / count << " seconds per inference." 
<< endl; 78 | 79 | cudaFree(data_d); 80 | 81 | // Get back the bounding boxes 82 | unique_ptr scores(new float[num_det]); 83 | unique_ptr boxes(new float[num_det * 4]); 84 | unique_ptr classes(new float[num_det]); 85 | cudaMemcpy(scores.get(), scores_d, sizeof(float) * num_det, cudaMemcpyDeviceToHost); 86 | cudaMemcpy(boxes.get(), boxes_d, sizeof(float) * num_det * 4, cudaMemcpyDeviceToHost); 87 | cudaMemcpy(classes.get(), classes_d, sizeof(float) * num_det, cudaMemcpyDeviceToHost); 88 | 89 | cudaFree(scores_d); 90 | cudaFree(boxes_d); 91 | cudaFree(classes_d); 92 | 93 | for (int i = 0; i < num_det; i++) { 94 | // Show results over confidence threshold 95 | if (scores[i] >= 0.3f) { 96 | float x1 = boxes[i*4+0]; 97 | float y1 = boxes[i*4+1]; 98 | float x2 = boxes[i*4+2]; 99 | float y2 = boxes[i*4+3]; 100 | cout << "Found box {" << x1 << ", " << y1 << ", " << x2 << ", " << y2 101 | << "} with score " << scores[i] << " and class " << classes[i] << endl; 102 | 103 | // Draw bounding box on image 104 | cv::rectangle(image, Point(x1, y1), Point(x2, y2), cv::Scalar(0, 255, 0)); 105 | } 106 | } 107 | 108 | // Write image 109 | imwrite("detections.png", image); 110 | 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /extras/cppapi/infervideo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include "../../csrc/engine.h" 15 | 16 | using namespace std; 17 | using namespace cv; 18 | 19 | int main(int argc, char *argv[]) { 20 | if (argc != 4) { 21 | cerr << "Usage: " << argv[0] << " engine.plan input.mov output.mp4" << endl; 22 | return 1; 23 | } 24 | 25 | cout << "Loading engine..." << endl; 26 | auto engine = retinanet::Engine(argv[1]); 27 | VideoCapture src(argv[2]); 28 | 29 | if (!src.isOpened()){ 30 | cerr << "Could not read " << argv[2] << endl; 31 | return 1; 32 | } 33 | 34 | auto fh=src.get(CV_CAP_PROP_FRAME_HEIGHT); 35 | auto fw=src.get(CV_CAP_PROP_FRAME_WIDTH); 36 | auto fps=src.get(CV_CAP_PROP_FPS); 37 | auto nframes=src.get(CV_CAP_PROP_FRAME_COUNT); 38 | 39 | VideoWriter sink; 40 | sink.open(argv[3], 0x31637661, fps, Size(fw, fh)); 41 | Mat frame; 42 | Mat resized_frame; 43 | Mat inferred_frame; 44 | int count=1; 45 | 46 | auto inputSize = engine.getInputSize(); 47 | // Create device buffers 48 | void *data_d, *scores_d, *boxes_d, *classes_d; 49 | auto num_det = engine.getMaxDetections(); 50 | cudaMalloc(&data_d, 3 * inputSize[0] * inputSize[1] * sizeof(float)); 51 | cudaMalloc(&scores_d, num_det * sizeof(float)); 52 | cudaMalloc(&boxes_d, num_det * 4 * sizeof(float)); 53 | cudaMalloc(&classes_d, num_det * sizeof(float)); 54 | 55 | auto scores = new float[num_det]; 56 | auto boxes = new float[num_det * 4]; 57 | auto classes = new float[num_det]; 58 | 59 | vector mean {0.485, 0.456, 0.406}; 60 | vector std {0.229, 0.224, 0.225}; 61 | 62 | vector blues {0,63,127,191,255,0}; //colors for bonuding boxes 63 | vector greens {0,255,191,127,63,0}; 64 | vector reds {191,255,0,0,63,127}; 65 | 66 | int channels = 3; 67 | vector img; 68 | vector data (channels * inputSize[0] * inputSize[1]); 69 | 70 | while (1){ 71 | src >> frame; 72 | if (frame.empty()){ 73 | cout << "Finished inference!" 
<< endl; 74 | break; 75 | } 76 | 77 | cv::resize(frame, resized_frame, Size(inputSize[1], inputSize[0])); 78 | cv::Mat pixels; 79 | resized_frame.convertTo(pixels, CV_32FC3, 1.0 / 255, 0); 80 | 81 | img.assign((float*)pixels.datastart, (float*)pixels.dataend); 82 | 83 | for (int c = 0; c < channels; c++) { 84 | for (int j = 0, hw = inputSize[0] * inputSize[1]; j < hw; j++) { 85 | data[c * hw + j] = (img[channels * j + 2 - c] - mean[c]) / std[c]; 86 | } 87 | } 88 | 89 | // Copy image to device 90 | size_t dataSize = data.size() * sizeof(float); 91 | cudaMemcpy(data_d, data.data(), dataSize, cudaMemcpyHostToDevice); 92 | 93 | 94 | //Do inference 95 | cout << "Inferring on frame: " << count <<"/" << nframes << endl; 96 | count++; 97 | vector buffers = { data_d, scores_d, boxes_d, classes_d }; 98 | engine.infer(buffers, 1); 99 | 100 | cudaMemcpy(scores, scores_d, sizeof(float) * num_det, cudaMemcpyDeviceToHost); 101 | cudaMemcpy(boxes, boxes_d, sizeof(float) * num_det * 4, cudaMemcpyDeviceToHost); 102 | cudaMemcpy(classes, classes_d, sizeof(float) * num_det, cudaMemcpyDeviceToHost); 103 | 104 | // Get back the bounding boxes 105 | for (int i = 0; i < num_det; i++) { 106 | // Show results over confidence threshold 107 | if (scores[i] >= 0.2f) { 108 | float x1 = boxes[i*4+0]; 109 | float y1 = boxes[i*4+1]; 110 | float x2 = boxes[i*4+2]; 111 | float y2 = boxes[i*4+3]; 112 | int cls=classes[i]; 113 | // Draw bounding box on image 114 | cv::rectangle(resized_frame, Point(x1, y1), Point(x2, y2), cv::Scalar(blues[cls], greens[cls], reds[cls])); 115 | } 116 | } 117 | cv::resize(resized_frame, inferred_frame, Size(fw, fh)); 118 | sink.write(inferred_frame); 119 | } 120 | src.release(); 121 | sink.release(); 122 | return 0; 123 | } 124 | -------------------------------------------------------------------------------- /extras/deepstream/README.md: -------------------------------------------------------------------------------- 1 | # Deploying RetinaNet in DeepStream 4.0 2 | 3 | This shows how to export a trained RetinaNet model to TensorRT and deploy it in a video analytics application using NVIDIA DeepStream 4.0. 4 | 5 | ## Prerequisites 6 | * A GPU supported by DeepStream: Jetson Xavier, Tesla P4/P40/V100/T4 7 | * A trained PyTorch RetinaNet model. 8 | * A video source, either `.mp4` files or a webcam. 9 | 10 | ## Tesla GPUs 11 | Setup instructions: 12 | 13 | #### 1. Download DeepStream 4.0 14 | Download DeepStream 4.0 SDK for Tesla "Download .tar" from [https://developer.nvidia.com/deepstream-download](https://developer.nvidia.com/deepstream-download) and place in the `extras/deepstream` directory. 15 | 16 | This file should be called `deepstream_sdk_v4.0.1_x86_64.tbz2`. 17 | 18 | #### 2. Unpack DeepStream 19 | You may need to adjust the permissions on the `.tbz2` file before you can extract it. 20 | 21 | ``` 22 | cd extras/deepstream 23 | mkdir DeepStream_Release 24 | tar -xvf deepstream_sdk_v4.0.1_x86_64.tbz2 -C DeepStream_Release/ 25 | ``` 26 | 27 | #### 3. Build and enter the DeepStream docker container 28 | ``` 29 | docker build -f /retinanet-examples/Dockerfile.deepstream -t ds_retinanet:latest /retinanet-examples 30 | docker run --gpus all -it --rm --ipc=host -v :/data ds_retinanet:latest 31 | ``` 32 | 33 | #### 4. 
Export your trained PyTorch RetinaNet model to TensorRT per the [INFERENCE](https://github.com/NVIDIA/retinanet-examples/blob/master/INFERENCE.md) instructions:
34 | ```
35 | retinanet export <PyTorch model> <TensorRT engine> --opset 8 --batch n
36 |
37 | OR
38 |
39 | retinanet export <PyTorch model> <TensorRT engine> --opset 8 --int8 --calibration-images <path to calibration images> --batch n
40 | ```
41 |
42 | #### 5. Run deepstream-app
43 | Once all of the config files have been modified, launch the DeepStream application:
44 | ```
45 | cd /workspace/retinanet-examples/extras/deepstream/deepstream-sample/
46 | LD_PRELOAD=build/libnvdsparsebbox_retinanet.so deepstream-app -c <config file>
47 | ```
48 |
49 | ## Jetson AGX Xavier
50 | Setup instructions:
51 |
52 | Note that for compatibility reasons, you must use the **TRT5 branch** of this repository.
53 |
54 | #### 1. Flash Jetson Xavier with [Jetpack 4.2.3](https://developer.nvidia.com/embedded/jetpack)
55 |
56 | **Ensure that you tick the DeepStream box under Additional SDKs.**
57 |
58 | #### 2. (on Jetson) Install additional DeepStream dependencies:
59 | ```
60 | sudo apt install \
61 | libssl1.0.0 \
62 | libgstreamer1.0-0 \
63 | gstreamer1.0-tools \
64 | gstreamer1.0-plugins-good \
65 | gstreamer1.0-plugins-bad \
66 | gstreamer1.0-plugins-ugly \
67 | gstreamer1.0-libav \
68 | libgstrtspserver-1.0-0 \
69 | libjansson4=2.11-1 \
70 | librdkafka1=0.11.3-1build1
71 | ```
72 |
73 | #### 3. (on host) Convert the PyTorch model to ONNX.
74 |
75 | ```bash
76 | retinanet export model.pth model.onnx
77 | ```
78 |
79 | #### 4. Copy the ONNX RetinaNet model and config files to Jetson Xavier
80 |
81 | Use `scp` or a memory card.
82 |
83 | #### 5. (on Jetson) Make the C++ API
84 |
85 | **Reminder:** You must use the **TRT5 branch**.
86 | ```bash
87 | cd extras/cppapi
88 | mkdir build && cd build
89 | cmake -DCMAKE_CUDA_FLAGS="--expt-extended-lambda -std=c++11" ..
90 | make
91 | ```
92 |
93 | #### 6. (on Jetson) Make the RetinaNet plugin
94 |
95 | ```bash
96 | cd extras/deepstream/deepstream-sample
97 | mkdir build && cd build
98 | cmake -DDeepStream_DIR=/opt/nvidia/deepstream/deepstream-4.0 .. && make -j
99 | ```
100 |
101 | #### 7. (on Jetson) Build the TensorRT engine
102 |
103 | ```bash
104 | cd extras/cppapi
105 | ./export model.onnx engine.plan
106 | ```
107 |
108 | #### 8. (on Jetson) Modify the DeepStream config files
109 | As described in the "Preparing the DeepStream config file" section below.
110 |
111 | #### 9. (on Jetson) Run deepstream-app
112 | Once all of the config files have been modified, launch the DeepStream application:
113 | ```
114 | cd extras/deepstream/deepstream-sample
115 | LD_PRELOAD=build/libnvdsparsebbox_retinanet.so deepstream-app -c <config file>
116 | ```
117 |
118 | ## Preparing the DeepStream config file
119 | We have included two example DeepStream config files in `deepstream-sample`.
120 | - `ds_config_1vid.txt`: Performs detection on a single video, using the detector specified by `infer_config_batch1.txt`.
121 | - `ds_config_8vid.txt`: Performs detection on multiple video streams simultaneously, using the detector specified by `infer_config_batch8.txt`. Frames from each video are combined into a single batch and passed to the detector for inference.
122 |
123 | The `ds_config_*` files are DeepStream config files that describe the overall processing pipeline; the `infer_config_*` files define the individual detectors, which can be chained in series.
124 |
125 | Before they can be used, these config files must be modified to specify the correct paths to the input and output video files, and to the TensorRT engines.
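For example, after editing, the relevant entries in `ds_config_1vid.txt` might look like the following (the paths shown here are illustrative placeholders, not files shipped with this repository):

```
[source0]
uri=file:///data/videos/input1.mp4

[sink0]
output-file=/data/output/output1.mp4

[primary-gie]
model-engine-file=/data/engines/engine.plan
config-file=infer_config_batch1.txt
```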
126 | 127 | * **Input files** are specified in the deepstream config files by the `uri=file://` parameter. 128 | 129 | * **Output files** are specified in the deepstream config files by the `output-file=` parameter. 130 | 131 | * **TensorRT engines** are specified in both the DeepStream config files, and also the detector config files, by the `model-engine-file=` parameters. 132 | 133 | On Xavier, you can optionally set `enable=1` to `[sink1]` in `ds_config_*` files to display the processed video stream. 134 | 135 | 136 | ## Convert output video file to mp4 137 | You can convert the outputted `.mkv` file to `.mp4` using `ffmpeg`. 138 | ``` 139 | ffmpeg -i /data/output/file1.mkv -c copy /data/output/file2.mp4 140 | ``` 141 | -------------------------------------------------------------------------------- /extras/deepstream/deepstream-sample/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5.1) 2 | 3 | project(deepstream-retinanet) 4 | enable_language(CXX) 5 | include(FindCUDA) 6 | 7 | set(CMAKE_CXX_STANDARD 11) 8 | find_package(CUDA REQUIRED) 9 | 10 | if(DEFINED TensorRT_DIR) 11 | include_directories("${TensorRT_DIR}/include") 12 | link_directories("${TensorRT_DIR}/lib") 13 | endif(DEFINED TensorRT_DIR) 14 | if(DEFINED DeepStream_DIR) 15 | include_directories("${DeepStream_DIR}/sources/includes") 16 | endif(DEFINED DeepStream_DIR) 17 | include_directories(${CUDA_INCLUDE_DIRS}) 18 | 19 | if(NOT DEFINED ARCH) 20 | set(ARCH "sm_70") 21 | endif(NOT DEFINED ARCH) 22 | 23 | cuda_add_library(nvdsparsebbox_retinanet SHARED 24 | ../../../csrc/cuda/decode.h 25 | ../../../csrc/cuda/decode.cu 26 | ../../../csrc/cuda/nms.h 27 | ../../../csrc/cuda/nms.cu 28 | ../../../csrc/cuda/utils.h 29 | ../../../csrc/engine.cpp 30 | nvdsparsebbox_retinanet.cpp 31 | OPTIONS -arch ${ARCH} -std=c++11 --expt-extended-lambda 32 | ) 33 | target_link_libraries(nvdsparsebbox_retinanet ${CUDA_LIBRARIES} nvinfer nvinfer_plugin nvonnxparser) 34 | -------------------------------------------------------------------------------- /extras/deepstream/deepstream-sample/ds_config_1vid.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 NVIDIA Corporation. All rights reserved. 2 | # 3 | # NVIDIA Corporation and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA Corporation is strictly prohibited. 8 | 9 | [application] 10 | enable-perf-measurement=1 11 | perf-measurement-interval-sec=1 12 | 13 | [tiled-display] 14 | enable=0 15 | rows=1 16 | columns=1 17 | width=1280 18 | height=720 19 | gpu-id=0 20 | 21 | [source0] 22 | enable=1 23 | type=2 24 | num-sources=1 25 | uri=file:// 26 | gpu-id=0 27 | 28 | [streammux] 29 | gpu-id=0 30 | batch-size=1 31 | batched-push-timeout=-1 32 | ## Set muxer output width and height 33 | width=1280 34 | height=720 35 | cuda-memory-type=1 36 | enable-padding=1 37 | 38 | [sink0] 39 | enable=1 40 | type=3 41 | #1=mp4 2=mkv 42 | container=1 43 | #1=h264 2=h265 3=mpeg4 44 | ## only SW mpeg4 is supported right now. 
45 | codec=3 46 | sync=1 47 | bitrate=80000000 48 | output-file= 49 | source-id=0 50 | 51 | [sink1] 52 | enable=0 53 | #Type - 1=FakeSink 2=EglSink 3=File 54 | type=2 55 | sync=1 56 | source-id=0 57 | gpu-id=0 58 | cuda-memory-type=1 59 | 60 | 61 | [osd] 62 | enable=1 63 | gpu-id=0 64 | border-width=2 65 | text-size=12 66 | text-color=1;1;1;1; 67 | text-bg-color=0.3;0.3;0.3;1 68 | font=Arial 69 | show-clock=0 70 | clock-x-offset=800 71 | clock-y-offset=820 72 | clock-text-size=12 73 | clock-color=1;0;0;0 74 | 75 | [primary-gie] 76 | enable=1 77 | gpu-id=0 78 | batch-size=1 79 | gie-unique-id=1 80 | interval=0 81 | labelfile-path=labels_coco.txt 82 | model-engine-file= 83 | config-file=infer_config_batch1.txt 84 | -------------------------------------------------------------------------------- /extras/deepstream/deepstream-sample/ds_config_8vid.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 NVIDIA Corporation. All rights reserved. 2 | # 3 | # NVIDIA Corporation and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA Corporation is strictly prohibited. 8 | 9 | [application] 10 | enable-perf-measurement=1 11 | perf-measurement-interval-sec=5 12 | 13 | [tiled-display] 14 | enable=1 15 | rows=2 16 | columns=4 17 | width=1280 18 | height=720 19 | gpu-id=0 20 | cuda-memory-type=1 21 | 22 | [source0] 23 | enable=1 24 | type=3 25 | num-sources=4 26 | uri=file:// 27 | gpu-id=0 28 | cuda-memory-type=1 29 | 30 | [source1] 31 | enable=1 32 | type=3 33 | num-sources=4 34 | uri=file:// 35 | gpu-id=0 36 | cuda-memory-type=1 37 | 38 | [streammux] 39 | gpu-id=0 40 | batched-push-timeout=-1 41 | ## Set muxer output width and height 42 | width=1280 43 | height=720 44 | cuda-memory-type=1 45 | enable-padding=1 46 | batch-size=8 47 | 48 | [sink0] 49 | enable=1 50 | type=3 51 | #1=mp4 2=mkv 52 | container=1 53 | #1=h264 2=h265 3=mpeg4 54 | ## only SW mpeg4 is supported right now. 55 | codec=3 56 | sync=0 57 | bitrate=32000000 58 | output-file= 59 | source-id=0 60 | cuda-memory-type=1 61 | 62 | [sink1] 63 | enable=0 64 | #Type - 1=FakeSink 2=EglSink 3=File 65 | type=2 66 | sync=1 67 | source-id=0 68 | gpu-id=0 69 | cuda-memory-type=1 70 | 71 | 72 | [osd] 73 | enable=1 74 | gpu-id=0 75 | border-width=2 76 | text-size=12 77 | text-color=1;1;1;1; 78 | text-bg-color=0.3;0.3;0.3;1 79 | font=Arial 80 | show-clock=0 81 | clock-x-offset=800 82 | clock-y-offset=820 83 | clock-text-size=12 84 | clock-color=1;0;0;0 85 | 86 | [primary-gie] 87 | enable=1 88 | gpu-id=0 89 | batch-size=8 90 | gie-unique-id=1 91 | interval=0 92 | labelfile-path=labels_coco.txt 93 | model-engine-file= 94 | config-file=infer_config_batch8.txt 95 | cuda-memory-type=1 96 | -------------------------------------------------------------------------------- /extras/deepstream/deepstream-sample/infer_config_batch1.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 NVIDIA Corporation. All rights reserved. 2 | # NVIDIA Corporation and its licensors retain all intellectual property 3 | # and proprietary rights in and to this software, related documentation 4 | # and any modifications thereto. 
Any use, reproduction, disclosure or 5 | # distribution of this software and related documentation without an express 6 | # license agreement from NVIDIA Corporation is strictly prohibited. 7 | 8 | # Following properties are mandatory when engine files are not specified: 9 | # int8-calib-file(Only in INT8) 10 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 11 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 12 | # ONNX: onnx-file 13 | # 14 | # Mandatory properties for detectors: 15 | # parse-func, num-detected-classes, 16 | # custom-lib-path (when parse-func=0 i.e. custom), 17 | # parse-bbox-func-name (when parse-func=0) 18 | # 19 | # Optional properties for detectors: 20 | # enable-dbscan(Default=false), interval(Primary mode only, Default=0) 21 | # 22 | # Mandatory properties for classifiers: 23 | # classifier-threshold, is-classifier 24 | # 25 | # Optional properties for classifiers: 26 | # classifier-async-mode(Secondary mode only, Default=false) 27 | # 28 | # Optional properties in secondary mode: 29 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 30 | # input-object-min-width, input-object-min-height, input-object-max-width, 31 | # input-object-max-height 32 | # 33 | # Following properties are always recommended: 34 | # batch-size(Default=1) 35 | # 36 | # Other optional properties: 37 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 38 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 39 | # mean-file, gie-unique-id(Default=0), offsets, gie-mode (Default=1 i.e. primary), 40 | # custom-lib-path, network-mode(Default=0 i.e FP32) 41 | # 42 | # The values in the config file are overridden by values set through GObject 43 | # properties. 44 | 45 | [property] 46 | gpu-id=0 47 | net-scale-factor=0.017352074 48 | offsets=123.675;116.28;103.53 49 | model-engine-file= 50 | labelfile-path=labels_coco.txt 51 | batch-size=1 52 | ## 0=FP32, 1=INT8, 2=FP16 mode 53 | network-mode=2 54 | num-detected-classes=80 55 | interval=0 56 | gie-unique-id=1 57 | parse-func=0 58 | is-classifier=0 59 | output-blob-names=boxes;scores;classes 60 | parse-bbox-func-name=NvDsInferParseRetinaNet 61 | custom-lib-path=build/libnvdsparsebbox_retinanet.so 62 | #enable-dbscan=1 63 | 64 | 65 | [class-attrs-all] 66 | threshold=0.5 67 | group-threshold=0 68 | ## Set eps=0.7 and minBoxes for enable-dbscan=1 69 | #eps=0.2 70 | ##minBoxes=3 71 | #roi-top-offset=0 72 | #roi-bottom-offset=0 73 | detected-min-w=4 74 | detected-min-h=4 75 | #detected-max-w=0 76 | #detected-max-h=0 77 | 78 | ## Per class configuration 79 | #[class-attrs-2] 80 | #threshold=0.6 81 | #eps=0.5 82 | #group-threshold=3 83 | #roi-top-offset=20 84 | #roi-bottom-offset=10 85 | #detected-min-w=40 86 | #detected-min-h=40 87 | #detected-max-w=400 88 | #detected-max-h=800 89 | -------------------------------------------------------------------------------- /extras/deepstream/deepstream-sample/infer_config_batch8.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 NVIDIA Corporation. All rights reserved. 2 | # NVIDIA Corporation and its licensors retain all intellectual property 3 | # and proprietary rights in and to this software, related documentation 4 | # and any modifications thereto. Any use, reproduction, disclosure or 5 | # distribution of this software and related documentation without an express 6 | # license agreement from NVIDIA Corporation is strictly prohibited. 
7 | 8 | # Following properties are mandatory when engine files are not specified: 9 | # int8-calib-file(Only in INT8) 10 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 11 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 12 | # ONNX: onnx-file 13 | # 14 | # Mandatory properties for detectors: 15 | # parse-func, num-detected-classes, 16 | # custom-lib-path (when parse-func=0 i.e. custom), 17 | # parse-bbox-func-name (when parse-func=0) 18 | # 19 | # Optional properties for detectors: 20 | # enable-dbscan(Default=false), interval(Primary mode only, Default=0) 21 | # 22 | # Mandatory properties for classifiers: 23 | # classifier-threshold, is-classifier 24 | # 25 | # Optional properties for classifiers: 26 | # classifier-async-mode(Secondary mode only, Default=false) 27 | # 28 | # Optional properties in secondary mode: 29 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 30 | # input-object-min-width, input-object-min-height, input-object-max-width, 31 | # input-object-max-height 32 | # 33 | # Following properties are always recommended: 34 | # batch-size(Default=1) 35 | # 36 | # Other optional properties: 37 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 38 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 39 | # mean-file, gie-unique-id(Default=0), offsets, gie-mode (Default=1 i.e. primary), 40 | # custom-lib-path, network-mode(Default=0 i.e FP32) 41 | # 42 | # The values in the config file are overridden by values set through GObject 43 | # properties. 44 | 45 | [property] 46 | gpu-id=0 47 | net-scale-factor=0.017352074 48 | offsets=123.675;116.28;103.53 49 | model-engine-file= 50 | labelfile-path=labels_coco.txt 51 | #int8-calib-file=cal_trt4.bin 52 | batch-size=8 53 | ## 0=FP32, 1=INT8, 2=FP16 mode 54 | network-mode=2 55 | num-detected-classes=80 56 | interval=0 57 | gie-unique-id=1 58 | parse-func=0 59 | is-classifier=0 60 | output-blob-names=boxes;scores;classes 61 | parse-bbox-func-name=NvDsInferParseRetinaNet 62 | custom-lib-path=build/libnvdsparsebbox_retinanet.so 63 | #enable-dbscan=1 64 | 65 | 66 | [class-attrs-all] 67 | threshold=0.5 68 | group-threshold=0 69 | ## Set eps=0.7 and minBoxes for enable-dbscan=1 70 | #eps=0.2 71 | ##minBoxes=3 72 | #roi-top-offset=0 73 | #roi-bottom-offset=0 74 | detected-min-w=4 75 | detected-min-h=4 76 | #detected-max-w=0 77 | #detected-max-h=0 78 | 79 | ## Per class configuration 80 | #[class-attrs-2] 81 | #threshold=0.6 82 | #eps=0.5 83 | #group-threshold=3 84 | #roi-top-offset=20 85 | #roi-bottom-offset=10 86 | #detected-min-w=40 87 | #detected-min-h=40 88 | #detected-max-w=400 89 | #detected-max-h=800 90 | -------------------------------------------------------------------------------- /extras/deepstream/deepstream-sample/labels_coco.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | 
sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /extras/deepstream/deepstream-sample/nvdsparsebbox_retinanet.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 9 | * 10 | */ 11 | 12 | #include <cstring> 13 | #include <iostream> 14 | #include "nvdsinfer_custom_impl.h" 15 | 16 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 17 | 18 | /* This is a sample bounding box parsing function for the RetinaNet 19 | * detector model exported by this repository. */ 20 | 21 | /* C-linkage to prevent name-mangling */ 22 | extern "C" 23 | bool NvDsInferParseRetinaNet (std::vector<NvDsInferLayerInfo> const &outputLayersInfo, 24 | NvDsInferNetworkInfo const &networkInfo, 25 | NvDsInferParseDetectionParams const &detectionParams, 26 | std::vector<NvDsInferParseObjectInfo> &objectList) 27 | { 28 | static int bboxLayerIndex = -1; 29 | static int classesLayerIndex = -1; 30 | static int scoresLayerIndex = -1; 31 | static NvDsInferDimsCHW scoresLayerDims; 32 | int numDetsToParse; 33 | 34 | /* Find the bbox layer */ 35 | if (bboxLayerIndex == -1) { 36 | for (unsigned int i = 0; i < outputLayersInfo.size(); i++) { 37 | if (strcmp(outputLayersInfo[i].layerName, "boxes") == 0) { 38 | bboxLayerIndex = i; 39 | break; 40 | } 41 | } 42 | if (bboxLayerIndex == -1) { 43 | std::cerr << "Could not find bbox layer buffer while parsing" << std::endl; 44 | return false; 45 | } 46 | } 47 | 48 | /* Find the scores layer */ 49 | if (scoresLayerIndex == -1) { 50 | for (unsigned int i = 0; i < outputLayersInfo.size(); i++) { 51 | if (strcmp(outputLayersInfo[i].layerName, "scores") == 0) { 52 | scoresLayerIndex = i; 53 | getDimsCHWFromDims(scoresLayerDims, outputLayersInfo[i].dims); 54 | break; 55 | } 56 | } 57 | if (scoresLayerIndex == -1) { 58 | std::cerr << "Could not find scores layer buffer while parsing" << std::endl; 59 | return false; 60 | } 61 | } 62 | 63 | /* Find the classes layer */ 64 | if (classesLayerIndex == -1) { 65 | for (unsigned int i = 0; i < outputLayersInfo.size(); i++) { 66 | if (strcmp(outputLayersInfo[i].layerName, "classes") == 0) { 67 | classesLayerIndex = i; 68 | break; 69 | } 70 | } 71 | if (classesLayerIndex == -1) { 72 | std::cerr << "Could not find classes layer buffer while parsing" << std::endl; 73 | return false; 74 | } 75 | } 76 | 77 | 78 | /* Calculate the number of detections to parse */ 79 | numDetsToParse = scoresLayerDims.c; 80 | 81 | float *bboxes = (float *) outputLayersInfo[bboxLayerIndex].buffer; 82 | float *classes = (float *) outputLayersInfo[classesLayerIndex].buffer; 83 | float *scores = (float *) outputLayersInfo[scoresLayerIndex].buffer; 84 | 85 | for (int indx = 0; indx < numDetsToParse; indx++) 86 | { 87 | float outputX1 =
bboxes[indx * 4]; 88 | float outputY1 = bboxes[indx * 4 + 1]; 89 | float outputX2 = bboxes[indx * 4 + 2]; 90 | float outputY2 = bboxes[indx * 4 + 3]; 91 | float this_class = classes[indx]; 92 | float this_score = scores[indx]; 93 | float threshold = detectionParams.perClassThreshold[this_class]; 94 | 95 | if (this_score >= threshold) 96 | { 97 | NvDsInferParseObjectInfo object; 98 | 99 | object.classId = this_class; 100 | object.detectionConfidence = this_score; 101 | 102 | object.left = outputX1; 103 | object.top = outputY1; 104 | object.width = outputX2 - outputX1; 105 | object.height = outputY2 - outputY1; 106 | 107 | objectList.push_back(object); 108 | } 109 | } 110 | return true; 111 | } 112 | 113 | /* Check that the custom function has been defined correctly */ 114 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseRetinaNet); 115 | -------------------------------------------------------------------------------- /extras/tensorrt-6.0.1.5-cp36-none-linux_x86_64.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aidonchuk/retinanet-examples/b0a9c0ef36c38eb8d602de83d68665b23df4e76f/extras/tensorrt-6.0.1.5-cp36-none-linux_x86_64.whl -------------------------------------------------------------------------------- /extras/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -ne 2 ]; then 4 | echo "Usage: $0 images_path annotations.json" 5 | exit 1 6 | fi 7 | 8 | tmp="/tmp/retinanet" 9 | 10 | tests=( 11 | "retinanet train ${tmp}/model.pth --images $1 --annotations $2 --max-size 640 --override --iters 100 --backbone ResNet18FPN ResNet50FPN" 12 | "retinanet train ${tmp}/model.pth --images $1 --annotations $2 --max-size 640 --override --iters 100" 13 | "retinanet train ${tmp}/model.pth --fine-tune ${tmp}/model.pth --images $1 --annotations $2 --max-size 640 --override --iters 100" 14 | "retinanet infer ${tmp}/model.pth --images ${tmp}/test_images --max-size 640" 15 | "retinanet export ${tmp}/model.pth ${tmp}/engine.plan --size 640" 16 | "retinanet infer ${tmp}/engine.plan --images ${tmp}/test_images --max-size 640" 17 | ) 18 | 19 | start=`date +%s` 20 | 21 | # Prepare small image folder for inference 22 | if [ ! -d ${tmp}/test_images ]; then 23 | mkdir -p ${tmp}/test_images 24 | cp $(find $1 | tail -n 10) ${tmp}/test_images 25 | fi 26 | 27 | # Run all tests 28 | for test in "${tests[@]}"; do 29 | echo "Running \"${test}\"" 30 | ${test} 31 | if [ $? -ne 0 ]; then 32 | echo "Test failed!" 33 | exit 1 34 | fi 35 | done 36 | 37 | end=`date +%s` 38 | 39 | echo "All test succeeded in $((end-start)) seconds!" 
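# Example invocation (the paths below are placeholders, not files shipped with this repo):
#   ./extras/test.sh /data/coco/val2017 /data/coco/annotations/instances_val2017.json
# The sequence above trains, fine-tunes, runs PyTorch inference, exports a TensorRT
# engine and runs engine inference on a 10-image subset copied to /tmp/retinanet.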
-------------------------------------------------------------------------------- /markup_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aidonchuk/retinanet-examples/b0a9c0ef36c38eb8d602de83d68665b23df4e76f/markup_utils/__init__.py -------------------------------------------------------------------------------- /markup_utils/supervisly_to_coco.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | train_path = '/workspace/mounted_vol/dataset/train_supervisely/ann/' 5 | val_path = '/workspace/mounted_vol/dataset/val_supervisely/ann/' 6 | 7 | 8 | def mark(name, path): 9 | files = [x[:-9] for x in filter(lambda x: os.path.isfile(path + x), os.listdir(path))] 10 | files.sort() 11 | # files = [i for i in os.listdir(path)] 12 | images = [] 13 | annotations = [] 14 | categories = [{'id': 1, 'name': '1'}, 15 | {'id': 2, 'name': '2'}, 16 | {'id': 3, 'name': '3'}, 17 | {'id': 4, 'name': '4'}] 18 | 19 | id_image = 1 20 | id_ann = 1 21 | 22 | for i in files: 23 | with open(path + str(i) + '.png.json') as json_file: 24 | data = json.load(json_file) 25 | size = data['size'] 26 | objects = data['objects'] 27 | images.append( 28 | {'id': id_image, 'file_name': str(i) + '.png', 'width': size['width'], 'height': size['height']}) 29 | 30 | for j in objects: 31 | p = j['points'] 32 | e = p['exterior'] 33 | x0 = e[0][0] 34 | y0 = e[0][1] 35 | 36 | w = e[1][0] - x0 37 | h = e[1][1] - y0 38 | 39 | ann = { 40 | 'id': id_ann, 41 | 'image_id': id_image, 42 | 'category_id': 10 if int(j['classTitle']) == 0 else int(j['classTitle']), 43 | 'area': 0, 44 | 'segmentation': [[]], 45 | 'iscrowd': 0, 46 | 'ignore': 0, 47 | 'bbox': [x0, y0, w, h] 48 | } 49 | annotations.append(ann) 50 | 51 | id_image += 1 52 | print(images) 53 | print(annotations) 54 | 55 | r = {'images': images, 'annotations': annotations, 'categories': categories} 56 | # r = json.dumps(r) 57 | with open(name + ".json", "w") as f: 58 | json.dump(r, f, indent=4, sort_keys=True) 59 | 60 | 61 | mark('train', train_path) 62 | mark('val', val_path) 63 | -------------------------------------------------------------------------------- /retinanet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aidonchuk/retinanet-examples/b0a9c0ef36c38eb8d602de83d68665b23df4e76f/retinanet/__init__.py -------------------------------------------------------------------------------- /retinanet/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from .resnet import * 4 | from .fpn import * 5 | -------------------------------------------------------------------------------- /retinanet/backbones/fpn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from torchvision.models import resnet as vrn 4 | 5 | from .resnet import ResNet 6 | from .utils import register, register_torchvision_030 7 | 8 | class FPN(nn.Module): 9 | 'Feature Pyramid Network - https://arxiv.org/abs/1612.03144' 10 | 11 | def __init__(self, features): 12 | super().__init__() 13 | 14 | self.stride = 128 15 | self.features = features 16 | 17 | is_light = features.bottleneck == vrn.BasicBlock 18 | channels = [128, 256, 512] if is_light else [512, 1024, 2048] 19 | 20 | self.lateral3 = nn.Conv2d(channels[0], 256, 1) 21 | 
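# The 1x1 lateral convs (lateral3-5) project backbone features C3-C5 to 256 channels;
# pyramid6/pyramid7 add the extra P6/P7 levels with stride-2 3x3 convs, and the smooth
# convs refine the top-down sums computed in forward().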
self.lateral4 = nn.Conv2d(channels[1], 256, 1) 22 | self.lateral5 = nn.Conv2d(channels[2], 256, 1) 23 | self.pyramid6 = nn.Conv2d(channels[2], 256, 3, stride=2, padding=1) 24 | self.pyramid7 = nn.Conv2d(256, 256, 3, stride=2, padding=1) 25 | self.smooth3 = nn.Conv2d(256, 256, 3, padding=1) 26 | self.smooth4 = nn.Conv2d(256, 256, 3, padding=1) 27 | self.smooth5 = nn.Conv2d(256, 256, 3, padding=1) 28 | 29 | def initialize(self): 30 | def init_layer(layer): 31 | if isinstance(layer, nn.Conv2d): 32 | nn.init.xavier_uniform_(layer.weight) 33 | if layer.bias is not None: 34 | nn.init.constant_(layer.bias, val=0) 35 | self.apply(init_layer) 36 | 37 | self.features.initialize() 38 | 39 | def forward(self, x): 40 | c3, c4, c5 = self.features(x) 41 | 42 | p5 = self.lateral5(c5) 43 | p4 = self.lateral4(c4) 44 | p4 = F.interpolate(p5, scale_factor=2) + p4 45 | p3 = self.lateral3(c3) 46 | p3 = F.interpolate(p4, scale_factor=2) + p3 47 | 48 | p6 = self.pyramid6(c5) 49 | p7 = self.pyramid7(F.relu(p6)) 50 | 51 | p3 = self.smooth3(p3) 52 | p4 = self.smooth4(p4) 53 | p5 = self.smooth5(p5) 54 | 55 | return [p3, p4, p5, p6, p7] 56 | 57 | @register 58 | def ResNet18FPN(): 59 | return FPN(ResNet(layers=[2, 2, 2, 2], bottleneck=vrn.BasicBlock, outputs=[3, 4, 5], url=vrn.model_urls['resnet18'])) 60 | 61 | @register 62 | def ResNet34FPN(): 63 | return FPN(ResNet(layers=[3, 4, 6, 3], bottleneck=vrn.BasicBlock, outputs=[3, 4, 5], url=vrn.model_urls['resnet34'])) 64 | 65 | @register 66 | def ResNet50FPN(): 67 | return FPN(ResNet(layers=[3, 4, 6, 3], bottleneck=vrn.Bottleneck, outputs=[3, 4, 5], url=vrn.model_urls['resnet50'])) 68 | 69 | @register 70 | def ResNet101FPN(): 71 | return FPN(ResNet(layers=[3, 4, 23, 3], bottleneck=vrn.Bottleneck, outputs=[3, 4, 5], url=vrn.model_urls['resnet101'])) 72 | 73 | @register 74 | def ResNet152FPN(): 75 | return FPN(ResNet(layers=[3, 8, 36, 3], bottleneck=vrn.Bottleneck, outputs=[3, 4, 5], url=vrn.model_urls['resnet152'])) 76 | 77 | @register_torchvision_030 78 | def ResNeXt50_32x4dFPN(): 79 | return FPN(ResNet(layers=[3, 4, 6, 3], bottleneck=vrn.Bottleneck, outputs=[3, 4, 5], groups=32, width_per_group=4, url=vrn.model_urls['resnext50_32x4d'])) 80 | 81 | @register_torchvision_030 82 | def ResNeXt101_32x8dFPN(): 83 | return FPN(ResNet(layers=[3, 4, 23, 3], bottleneck=vrn.Bottleneck, outputs=[3, 4, 5], groups=32, width_per_group=8, url=vrn.model_urls['resnext101_32x8d'])) 84 | -------------------------------------------------------------------------------- /retinanet/backbones/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | class FixedBatchNorm2d(nn.Module): 6 | 'BatchNorm2d where the batch statistics and the affine parameters are fixed' 7 | 8 | def __init__(self, n): 9 | super().__init__() 10 | self.register_buffer("weight", torch.ones(n)) 11 | self.register_buffer("bias", torch.zeros(n)) 12 | self.register_buffer("running_mean", torch.zeros(n)) 13 | self.register_buffer("running_var", torch.ones(n)) 14 | 15 | def forward(self, x): 16 | return F.batch_norm(x, running_mean=self.running_mean, running_var=self.running_var, weight=self.weight, bias=self.bias) 17 | 18 | def convert_fixedbn_model(module): 19 | 'Convert batch norm layers to fixed' 20 | 21 | mod = module 22 | if isinstance(module, nn.BatchNorm2d): 23 | mod = FixedBatchNorm2d(module.num_features) 24 | mod.running_mean = module.running_mean 25 | mod.running_var = module.running_var 26 | if 
module.affine: 27 | mod.weight.data = module.weight.data.clone().detach() 28 | mod.bias.data = module.bias.data.clone().detach() 29 | for name, child in module.named_children(): 30 | mod.add_module(name, convert_fixedbn_model(child)) 31 | 32 | return mod 33 | -------------------------------------------------------------------------------- /retinanet/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | import torchvision 2 | from torchvision.models import resnet as vrn 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | from .utils import register 6 | 7 | class ResNet(vrn.ResNet): 8 | 'Deep Residual Network - https://arxiv.org/abs/1512.03385' 9 | 10 | def __init__(self, layers=[3, 4, 6, 3], bottleneck=vrn.Bottleneck, outputs=[5], groups=1, width_per_group=64, url=None): 11 | self.stride = 128 12 | self.bottleneck = bottleneck 13 | self.outputs = outputs 14 | self.url = url 15 | 16 | # torchvision added support for ResNeXt in version 0.3.0, 17 | # and introduces additional args to torchvision.models.resnet constructor 18 | kwargs_common = {'block': bottleneck, 'layers': layers} 19 | kwargs_extra = {'groups': groups, 'width_per_group': width_per_group} if torchvision.__version__ > '0.2.1' else {} 20 | kwargs = {**kwargs_common, **kwargs_extra} 21 | super().__init__(**kwargs) 22 | 23 | def initialize(self): 24 | if self.url: 25 | self.load_state_dict(model_zoo.load_url(self.url)) 26 | 27 | def forward(self, x): 28 | x = self.conv1(x) 29 | x = self.bn1(x) 30 | x = self.relu(x) 31 | x = self.maxpool(x) 32 | 33 | outputs = [] 34 | for i, layer in enumerate([self.layer1, self.layer2, self.layer3, self.layer4]): 35 | level = i + 2 36 | if level > max(self.outputs): 37 | break 38 | x = layer(x) 39 | if level in self.outputs: 40 | outputs.append(x) 41 | 42 | return outputs 43 | 44 | @register 45 | def ResNet18C4(): 46 | return ResNet(layers=[2, 2, 2, 2], bottleneck=vrn.BasicBlock, outputs=[4], url=vrn.model_urls['resnet18']) 47 | 48 | @register 49 | def ResNet34C4(): 50 | return ResNet(layers=[3, 4, 6, 3], bottleneck=vrn.BasicBlock, outputs=[4], url=vrn.model_urls['resnet34']) 51 | -------------------------------------------------------------------------------- /retinanet/backbones/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torchvision 3 | 4 | def register_torchvision_030(f): 5 | if torchvision.__version__ > '0.2.1': 6 | return register(f) 7 | 8 | def register(f): 9 | all = sys.modules[f.__module__].__dict__.setdefault('__all__', []) 10 | if f.__name__ in all: 11 | raise RuntimeError('{} already exist!'.format(f.__name__)) 12 | all.append(f.__name__) 13 | return f 14 | -------------------------------------------------------------------------------- /retinanet/box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ._C import decode as decode_cuda 4 | from ._C import nms as nms_cuda 5 | 6 | 7 | def generate_anchors(stride, ratio_vals, scales_vals): 8 | 'Generate anchors coordinates from scales/ratios' 9 | 10 | scales = torch.FloatTensor(scales_vals).repeat(len(ratio_vals), 1) 11 | scales = scales.transpose(0, 1).contiguous().view(-1, 1) 12 | ratios = torch.FloatTensor(ratio_vals * len(scales_vals)) 13 | 14 | wh = torch.FloatTensor([stride]).repeat(len(ratios), 2) 15 | ws = torch.round(torch.sqrt(wh[:, 0] * wh[:, 1] / ratios)) 16 | dwh = torch.stack([ws, torch.round(ws * ratios)], dim=1) 17 | xy1 = 0.5 * (wh - dwh 
* scales) 18 | xy2 = 0.5 * (wh + dwh * scales) - 1 19 | return torch.cat([xy1, xy2], dim=1) 20 | 21 | 22 | def box2delta(boxes, anchors): 23 | 'Convert boxes to deltas from anchors' 24 | 25 | anchors_wh = anchors[:, 2:] - anchors[:, :2] + 1 26 | anchors_ctr = anchors[:, :2] + 0.5 * anchors_wh 27 | boxes_wh = boxes[:, 2:] - boxes[:, :2] + 1 28 | boxes_ctr = boxes[:, :2] + 0.5 * boxes_wh 29 | 30 | return torch.cat([ 31 | (boxes_ctr - anchors_ctr) / anchors_wh, 32 | torch.log(boxes_wh / anchors_wh) 33 | ], 1) 34 | 35 | 36 | def delta2box(deltas, anchors, size, stride): 37 | 'Convert deltas from anchors to boxes' 38 | 39 | anchors_wh = anchors[:, 2:] - anchors[:, :2] + 1 40 | ctr = anchors[:, :2] + 0.5 * anchors_wh 41 | pred_ctr = deltas[:, :2] * anchors_wh + ctr 42 | pred_wh = torch.exp(deltas[:, 2:]) * anchors_wh 43 | 44 | m = torch.zeros([2], device=deltas.device, dtype=deltas.dtype) 45 | M = (torch.tensor([size], device=deltas.device, dtype=deltas.dtype) * stride - 1) 46 | clamp = lambda t: torch.max(m, torch.min(t, M)) 47 | return torch.cat([ 48 | clamp(pred_ctr - 0.5 * pred_wh), 49 | clamp(pred_ctr + 0.5 * pred_wh - 1) 50 | ], 1) 51 | 52 | 53 | def snap_to_anchors(boxes, size, stride, anchors, num_classes, device): 54 | 'Snap target boxes (x, y, w, h) to anchors' 55 | 56 | num_anchors = anchors.size()[0] if anchors is not None else 1 57 | width, height = (int(size[0] / stride), int(size[1] / stride)) 58 | 59 | if boxes.nelement() == 0: 60 | return (torch.zeros([num_anchors, num_classes, height, width], device=device), 61 | torch.zeros([num_anchors, 4, height, width], device=device), 62 | torch.zeros([num_anchors, 1, height, width], device=device)) 63 | 64 | boxes, classes = boxes.split(4, dim=1) 65 | 66 | # Generate anchors 67 | x, y = torch.meshgrid([torch.arange(0, size[i], stride, device=device, dtype=classes.dtype) for i in range(2)]) 68 | xyxy = torch.stack((x, y, x, y), 2).unsqueeze(0) 69 | anchors = anchors.view(-1, 1, 1, 4).to(dtype=classes.dtype) 70 | anchors = (xyxy + anchors).contiguous().view(-1, 4) 71 | 72 | # Compute overlap between boxes and anchors 73 | boxes = torch.cat([boxes[:, :2], boxes[:, :2] + boxes[:, 2:] - 1], 1) 74 | xy1 = torch.max(anchors[:, None, :2], boxes[:, :2]) 75 | xy2 = torch.min(anchors[:, None, 2:], boxes[:, 2:]) 76 | inter = torch.prod((xy2 - xy1 + 1).clamp(0), 2) 77 | boxes_area = torch.prod(boxes[:, 2:] - boxes[:, :2] + 1, 1) 78 | anchors_area = torch.prod(anchors[:, 2:] - anchors[:, :2] + 1, 1) 79 | overlap = inter / (anchors_area[:, None] + boxes_area - inter) 80 | 81 | # Keep best box per anchor 82 | overlap, indices = overlap.max(1) 83 | box_target = box2delta(boxes[indices], anchors) 84 | box_target = box_target.view(num_anchors, 1, width, height, 4) 85 | box_target = box_target.transpose(1, 4).transpose(2, 3) 86 | box_target = box_target.squeeze().contiguous() 87 | 88 | depth = torch.ones_like(overlap) * -1 89 | depth[overlap < 0.4] = 0 # background 90 | depth[overlap >= 0.5] = classes[indices][overlap >= 0.5].squeeze() + 1 # objects 91 | depth = depth.view(num_anchors, width, height).transpose(1, 2).contiguous() 92 | 93 | # Generate target classes 94 | cls_target = torch.zeros((anchors.size()[0], num_classes + 1), device=device, dtype=boxes.dtype) 95 | if classes.nelement() == 0: 96 | classes = torch.LongTensor([num_classes], device=device).expand_as(indices) 97 | else: 98 | classes = classes[indices].long() 99 | classes = classes.view(-1, 1) 100 | classes[overlap < 0.4] = num_classes # background has no class 101 | cls_target.scatter_(1, 
classes, 1) 102 | cls_target = cls_target[:, :num_classes].view(-1, 1, width, height, num_classes) 103 | cls_target = cls_target.transpose(1, 4).transpose(2, 3) 104 | cls_target = cls_target.squeeze().contiguous() 105 | 106 | return (cls_target.view(num_anchors, num_classes, height, width), 107 | box_target.view(num_anchors, 4, height, width), 108 | depth.view(num_anchors, 1, height, width)) 109 | 110 | 111 | def decode(all_cls_head, all_box_head, stride=1, threshold=0.05, top_n=1000, anchors=None): 112 | 'Box Decoding and Filtering' 113 | 114 | if torch.cuda.is_available(): 115 | return decode_cuda(all_cls_head.float(), all_box_head.float(), 116 | anchors.view(-1).tolist(), stride, threshold, top_n) 117 | 118 | device = all_cls_head.device 119 | anchors = anchors.to(device).type(all_cls_head.type()) 120 | num_anchors = anchors.size()[0] if anchors is not None else 1 121 | num_classes = all_cls_head.size()[1] // num_anchors 122 | height, width = all_cls_head.size()[-2:] 123 | 124 | batch_size = all_cls_head.size()[0] 125 | out_scores = torch.zeros((batch_size, top_n), device=device) 126 | out_boxes = torch.zeros((batch_size, top_n, 4), device=device) 127 | out_classes = torch.zeros((batch_size, top_n), device=device) 128 | 129 | # Per item in batch 130 | for batch in range(batch_size): 131 | cls_head = all_cls_head[batch, :, :, :].contiguous().view(-1) 132 | box_head = all_box_head[batch, :, :, :].contiguous().view(-1, 4) 133 | 134 | # Keep scores over threshold 135 | keep = (cls_head >= threshold).nonzero().view(-1) 136 | if keep.nelement() == 0: 137 | continue 138 | 139 | # Gather top elements 140 | scores = torch.index_select(cls_head, 0, keep) 141 | scores, indices = torch.topk(scores, min(top_n, keep.size()[0]), dim=0) 142 | indices = torch.index_select(keep, 0, indices).view(-1) 143 | classes = (indices / width / height) % num_classes 144 | classes = classes.type(all_cls_head.type()) 145 | 146 | # Infer kept bboxes 147 | x = indices % width 148 | y = (indices / width) % height 149 | a = indices / num_classes / height / width 150 | box_head = box_head.view(num_anchors, 4, height, width) 151 | boxes = box_head[a, :, y, x] 152 | 153 | if anchors is not None: 154 | grid = torch.stack([x, y, x, y], 1).type(all_cls_head.type()) * stride + anchors[a, :] 155 | boxes = delta2box(boxes, grid, [width, height], stride) 156 | 157 | out_scores[batch, :scores.size()[0]] = scores 158 | out_boxes[batch, :boxes.size()[0], :] = boxes 159 | out_classes[batch, :classes.size()[0]] = classes 160 | 161 | return out_scores, out_boxes, out_classes 162 | 163 | 164 | def nms(all_scores, all_boxes, all_classes, nms=0.5, ndetections=100): 165 | 'Non Maximum Suppression' 166 | 167 | if torch.cuda.is_available(): 168 | return nms_cuda( 169 | all_scores.float(), all_boxes.float(), all_classes.float(), nms, ndetections) 170 | 171 | device = all_scores.device 172 | batch_size = all_scores.size()[0] 173 | out_scores = torch.zeros((batch_size, ndetections), device=device) 174 | out_boxes = torch.zeros((batch_size, ndetections, 4), device=device) 175 | out_classes = torch.zeros((batch_size, ndetections), device=device) 176 | 177 | # Per item in batch 178 | for batch in range(batch_size): 179 | # Discard null scores 180 | keep = (all_scores[batch, :].view(-1) > 0).nonzero() 181 | scores = all_scores[batch, keep].view(-1) 182 | boxes = all_boxes[batch, keep, :].view(-1, 4) 183 | classes = all_classes[batch, keep].view(-1) 184 | 185 | if scores.nelement() == 0: 186 | continue 187 | 188 | # Sort boxes 189 | scores, indices = 
torch.sort(scores, descending=True) 190 | boxes, classes = boxes[indices], classes[indices] 191 | areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1).view(-1) 192 | keep = torch.ones(scores.nelement(), device=device, dtype=torch.uint8).view(-1) 193 | 194 | for i in range(ndetections): 195 | if i >= keep.nonzero().nelement() or i >= scores.nelement(): 196 | i -= 1 197 | break 198 | 199 | # Find overlapping boxes with lower score 200 | xy1 = torch.max(boxes[:, :2], boxes[i, :2]) 201 | xy2 = torch.min(boxes[:, 2:], boxes[i, 2:]) 202 | inter = torch.prod((xy2 - xy1 + 1).clamp(0), 1) 203 | criterion = ((scores > scores[i]) | 204 | (inter / (areas + areas[i] - inter) <= nms) | 205 | (classes != classes[i])) 206 | criterion[i] = 1 207 | 208 | # Only keep relevant boxes 209 | scores = scores[criterion.nonzero()].view(-1) 210 | boxes = boxes[criterion.nonzero(), :].view(-1, 4) 211 | classes = classes[criterion.nonzero()].view(-1) 212 | areas = areas[criterion.nonzero()].view(-1) 213 | keep[(~criterion).nonzero()] = 0 214 | 215 | out_scores[batch, :i + 1] = scores[:i + 1] 216 | out_boxes[batch, :i + 1, :] = boxes[:i + 1, :] 217 | out_classes[batch, :i + 1] = classes[:i + 1] 218 | print(out_scores, out_boxes, out_classes) 219 | return out_scores, out_boxes, out_classes 220 | -------------------------------------------------------------------------------- /retinanet/dali.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import redirect_stdout 3 | from math import ceil 4 | import ctypes 5 | import numpy as np 6 | import torch 7 | import numpy as np 8 | from nvidia.dali import pipeline, ops, types 9 | from pycocotools.coco import COCO 10 | 11 | 12 | class COCOPipeline(pipeline.Pipeline): 13 | 'Dali pipeline for COCO' 14 | 15 | def __init__(self, batch_size, num_threads, path, training, annotations, world, device_id, mean, std, resize, 16 | max_size, stride): 17 | super().__init__(batch_size=batch_size, num_threads=num_threads, device_id=device_id, 18 | prefetch_queue_depth=num_threads, seed=42) 19 | 20 | self.path = path 21 | self.training = training 22 | self.stride = stride 23 | self.iter = 0 24 | 25 | self.reader = ops.COCOReader(annotations_file=annotations, file_root=path, num_shards=world, 26 | shard_id=torch.cuda.current_device(), 27 | ltrb=True, ratio=True, shuffle_after_epoch=True, save_img_ids=True) 28 | 29 | self.decode_train = ops.ImageDecoderSlice(device="mixed", output_type=types.RGB) 30 | self.decode_infer = ops.ImageDecoder(device="mixed", output_type=types.RGB) 31 | self.bbox_crop = ops.RandomBBoxCrop(device='cpu', ltrb=True, scaling=[0.3, 1.0], 32 | thresholds=[0.1, 0.3, 0.5, 0.7, 0.9]) 33 | 34 | self.bbox_flip = ops.BbFlip(device='cpu', ltrb=True) 35 | self.img_flip = ops.Flip(device='gpu') 36 | self.coin_flip = ops.CoinFlip(probability=0.5) 37 | 38 | if isinstance(resize, list): resize = max(resize) 39 | self.rand_resize = ops.Uniform(range=[resize, float(max_size)]) 40 | 41 | self.resize_train = ops.Resize(device='gpu', interp_type=types.DALIInterpType.INTERP_CUBIC, save_attrs=True) 42 | self.resize_infer = ops.Resize(device='gpu', interp_type=types.DALIInterpType.INTERP_CUBIC, 43 | resize_longer=max_size, save_attrs=True) 44 | 45 | padded_size = max_size + ((self.stride - max_size % self.stride) % self.stride) 46 | 47 | self.pad = ops.Paste(device='gpu', fill_value=0, ratio=1.1, min_canvas_size=padded_size, paste_x=0, paste_y=0) 48 | self.normalize = ops.CropMirrorNormalize(device='gpu', 
mean=mean, std=std, crop=(padded_size, padded_size), 49 | crop_pos_x=0, crop_pos_y=0) 50 | 51 | def define_graph(self): 52 | 53 | images, bboxes, labels, img_ids = self.reader() 54 | 55 | if self.training: 56 | crop_begin, crop_size, bboxes, labels = self.bbox_crop(bboxes, labels) 57 | images = self.decode_train(images, crop_begin, crop_size) 58 | resize = self.rand_resize() 59 | images, attrs = self.resize_train(images, resize_longer=resize) 60 | 61 | flip = self.coin_flip() 62 | bboxes = self.bbox_flip(bboxes, horizontal=flip) 63 | images = self.img_flip(images, horizontal=flip) 64 | 65 | else: 66 | images = self.decode_infer(images) 67 | images, attrs = self.resize_infer(images) 68 | 69 | resized_images = images 70 | images = self.normalize(self.pad(images)) 71 | 72 | return images, bboxes, labels, img_ids, attrs, resized_images 73 | 74 | 75 | class DaliDataIterator(): 76 | 'Data loader for data parallel using Dali' 77 | 78 | def __init__(self, path, resize, max_size, batch_size, stride, world, annotations, training=False): 79 | self.training = training 80 | self.resize = resize 81 | self.max_size = max_size 82 | self.stride = stride 83 | self.batch_size = batch_size // world 84 | self.mean = [255. * x for x in [0.485, 0.456, 0.406]] 85 | self.std = [255. * x for x in [0.229, 0.224, 0.225]] 86 | self.world = world 87 | self.path = path 88 | 89 | # Setup COCO 90 | with redirect_stdout(None): 91 | self.coco = COCO(annotations) 92 | self.ids = list(self.coco.imgs.keys()) 93 | if 'categories' in self.coco.dataset: 94 | self.categories_inv = {k: i for i, k in enumerate(self.coco.getCatIds())} 95 | 96 | self.pipe = COCOPipeline(batch_size=self.batch_size, num_threads=2, 97 | path=path, training=training, annotations=annotations, world=world, 98 | device_id=torch.cuda.current_device(), mean=self.mean, std=self.std, resize=resize, 99 | max_size=max_size, stride=self.stride) 100 | 101 | self.pipe.build() 102 | 103 | def __repr__(self): 104 | return '\n'.join([ 105 | ' loader: dali', 106 | ' resize: {}, max: {}'.format(self.resize, self.max_size), 107 | ]) 108 | 109 | def __len__(self): 110 | return ceil(len(self.ids) // self.world / self.batch_size) 111 | 112 | def __iter__(self): 113 | for _ in range(self.__len__()): 114 | 115 | data, ratios, ids, num_detections = [], [], [], [] 116 | dali_data, dali_boxes, dali_labels, dali_ids, dali_attrs, dali_resize_img = self.pipe.run() 117 | 118 | for l in range(len(dali_boxes)): 119 | num_detections.append(dali_boxes.at(l).shape[0]) 120 | 121 | pyt_targets = -1 * torch.ones([len(dali_boxes), max(max(num_detections), 1), 5]) 122 | 123 | for batch in range(self.batch_size): 124 | id = int(dali_ids.at(batch)[0]) 125 | 126 | # Convert dali tensor to pytorch 127 | dali_tensor = dali_data.at(batch) 128 | tensor_shape = dali_tensor.shape() 129 | 130 | datum = torch.zeros(dali_tensor.shape(), dtype=torch.float, device=torch.device('cuda')) 131 | c_type_pointer = ctypes.c_void_p(datum.data_ptr()) 132 | dali_tensor.copy_to_external(c_type_pointer) 133 | 134 | # Calculate image resize ratio to rescale boxes 135 | prior_size = dali_attrs.as_cpu().at(batch) 136 | resized_size = dali_resize_img.at(batch).shape() 137 | ratio = max(resized_size) / max(prior_size) 138 | 139 | if self.training: 140 | # Rescale boxes 141 | b_arr = dali_boxes.at(batch) 142 | num_dets = b_arr.shape[0] 143 | if num_dets is not 0: 144 | pyt_bbox = torch.from_numpy(b_arr).float() 145 | 146 | pyt_bbox[:, 0] *= float(prior_size[1]) 147 | pyt_bbox[:, 1] *= float(prior_size[0]) 148 | pyt_bbox[:, 2] 
*= float(prior_size[1]) 149 | pyt_bbox[:, 3] *= float(prior_size[0]) 150 | # (l,t,r,b) -> (x,y,w,h) == (l,r, r-l, b-t) 151 | pyt_bbox[:, 2] -= pyt_bbox[:, 0] 152 | pyt_bbox[:, 3] -= pyt_bbox[:, 1] 153 | pyt_targets[batch, :num_dets, :4] = pyt_bbox * ratio 154 | 155 | # Arrange labels in target tensor 156 | l_arr = dali_labels.at(batch) 157 | if num_dets is not 0: 158 | pyt_label = torch.from_numpy(l_arr).float() 159 | pyt_label -= 1 # Rescale labels to [0,79] instead of [1,80] 160 | pyt_targets[batch, :num_dets, 4] = pyt_label.squeeze() 161 | 162 | ids.append(id) 163 | data.append(datum.unsqueeze(0)) 164 | ratios.append(ratio) 165 | 166 | data = torch.cat(data, dim=0) 167 | 168 | if self.training: 169 | pyt_targets = pyt_targets.cuda(non_blocking=True) 170 | 171 | yield data, pyt_targets 172 | 173 | else: 174 | ids = torch.Tensor(ids).int().cuda(non_blocking=True) 175 | ratios = torch.Tensor(ratios).cuda(non_blocking=True) 176 | 177 | yield data, ids, ratios -------------------------------------------------------------------------------- /retinanet/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from contextlib import redirect_stdout 4 | 5 | import albumentations as A 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | from PIL import Image 10 | from albumentations import CLAHE, IAASharpen, IAAEmboss, RandomBrightnessContrast, RGBShift, ImageCompression, \ 11 | RandomGamma, ChannelShuffle, InvertImg, ToGray, RandomSnow, RandomRain, RandomFog, ChannelDropout, ISONoise, OneOf, \ 12 | IAAAdditiveGaussianNoise, GaussNoise, Blur, MotionBlur, MedianBlur, HueSaturationValue 13 | from pycocotools.coco import COCO 14 | from torch.utils import data 15 | 16 | 17 | class CocoDataset(data.dataset.Dataset): 18 | 'Dataset looping through a set of images' 19 | 20 | def __init__(self, path, resize, max_size, stride, annotations=None, training=False, crop_number=False): 21 | super().__init__() 22 | 23 | self.path = os.path.expanduser(path) 24 | self.resize = resize 25 | self.max_size = max_size 26 | self.stride = stride 27 | self.mean = [0.485, 0.456, 0.406] 28 | self.std = [0.229, 0.224, 0.225] 29 | self.training = training 30 | self.crop_number = crop_number 31 | 32 | with redirect_stdout(None): 33 | self.coco = COCO(annotations) 34 | self.ids = list(self.coco.imgs.keys()) 35 | if 'categories' in self.coco.dataset: 36 | self.categories_inv = {k: i for i, k in enumerate(self.coco.getCatIds())} 37 | 38 | def __len__(self): 39 | return len(self.ids) 40 | 41 | def __getitem__(self, index): 42 | ' Get sample' 43 | 44 | # Load image 45 | id = self.ids[index] 46 | if self.coco: 47 | image = self.coco.loadImgs(id)[0]['file_name'] 48 | im = Image.open('{}/{}'.format(self.path, image)).convert("RGB") 49 | 50 | if self.crop_number: 51 | boxes, categories = self._get_target(id) 52 | for i, j in enumerate(boxes): 53 | if categories[i] == 11: 54 | b = np.asarray(j, dtype=np.int) 55 | im = np.asarray(im) 56 | im = im[b[1]:b[1] + b[3], b[0]:b[0] + b[2], :] 57 | im = Image.fromarray(im) 58 | 59 | # Randomly sample scale for resize during training 60 | resize = self.resize 61 | if isinstance(resize, list): 62 | resize = random.randint(self.resize[0], self.resize[-1]) 63 | 64 | ratio = resize / min(im.size) 65 | if ratio * max(im.size) > self.max_size: 66 | ratio = self.max_size / max(im.size) 67 | im = im.resize((int(ratio * d) for d in im.size), Image.BILINEAR) 68 | # im.save(str(id) + '.png', 'PNG') 69 | 70 | if 
self.training: 71 | # Get annotations 72 | boxes, categories = self._get_target(id) 73 | if self.crop_number: 74 | boxes, categories = self.new_bbox_coords(boxes, categories) 75 | boxes *= ratio 76 | 77 | annotations = {'image': np.asarray(im), 'bboxes': np.asarray(boxes), 78 | 'category_id': np.asarray(categories)} 79 | # print(image) 80 | aug = self.get_aug([ 81 | OneOf([ 82 | CLAHE(), 83 | IAASharpen(), 84 | IAAEmboss(), 85 | RandomBrightnessContrast(), 86 | RGBShift(), 87 | ImageCompression(), 88 | RandomGamma(), 89 | ChannelShuffle(), 90 | InvertImg(), 91 | ToGray(), 92 | RandomSnow(), 93 | RandomRain(), 94 | RandomFog(), 95 | ChannelDropout(), 96 | ISONoise() 97 | ], p=0.4), 98 | OneOf([ 99 | IAAAdditiveGaussianNoise(), 100 | GaussNoise(), 101 | ], p=0.3), 102 | OneOf([ 103 | Blur(), 104 | MotionBlur(), 105 | MedianBlur(), 106 | ], p=0.4), 107 | HueSaturationValue() 108 | ]) 109 | 110 | try: 111 | augmented = aug(**annotations) 112 | im, boxes, categories = augmented['image'], torch.tensor(augmented['bboxes']), torch.tensor( 113 | augmented['category_id']) 114 | target = torch.cat([boxes, categories], dim=1) 115 | im = Image.fromarray(im) 116 | # im.save(str(id) + '.png', 'PNG') 117 | # print(str(id) + ' ' + str(boxes)) 118 | except Exception as e: 119 | print(image) 120 | print(e) 121 | # Convert to tensor and normalize 122 | data = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) 123 | data = data.float().div(255).view(*im.size[::-1], len(im.mode)) 124 | data = data.permute(2, 0, 1) 125 | 126 | for t, mean, std in zip(data, self.mean, self.std): 127 | t.sub_(mean).div_(std) 128 | 129 | # Apply padding 130 | pw, ph = ((self.stride - d % self.stride) % self.stride for d in im.size) 131 | data = F.pad(data, (0, pw, 0, ph)) 132 | 133 | if self.training: 134 | return data, target 135 | 136 | return data, id, ratio 137 | 138 | def new_bbox_coords(self, boxes, categories): 139 | boxes, categories = np.asarray(boxes), np.asarray(categories) 140 | parent = None 141 | for i, j in enumerate(boxes): 142 | if categories[i] == 11: 143 | parent = j.copy() 144 | b = [] 145 | c = [] 146 | for i, j in enumerate(boxes): 147 | if not categories[i] == 11: 148 | if parent[0] < j[0] < (parent[0] + parent[2]) and parent[1] < j[1] < (parent[1] + parent[3]): 149 | j[0] = j[0] - parent[0] 150 | j[1] = j[1] - parent[1] 151 | b.append([j[0], j[1], j[2], j[3]]) 152 | c.append(categories[i]) 153 | 154 | return torch.tensor(b), torch.tensor(c) 155 | 156 | def get_aug(self, aug, min_area=0., min_visibility=0.): 157 | return A.Compose(aug, A.BboxParams(format='coco', min_area=min_area, 158 | min_visibility=min_visibility, label_fields=['category_id'])) 159 | 160 | def _get_target(self, id): 161 | 'Get annotations for sample' 162 | 163 | ann_ids = self.coco.getAnnIds(imgIds=[id]) 164 | annotations = self.coco.loadAnns(ann_ids) 165 | 166 | boxes, categories = [], [] 167 | for ann in annotations: 168 | if ann['bbox'][2] < 1 and ann['bbox'][3] < 1: 169 | continue 170 | boxes.append(ann['bbox']) 171 | cat = ann['category_id'] 172 | # if 'categories' in self.coco.dataset: 173 | # cat = self.categories_inv[cat] 174 | categories.append(cat) 175 | 176 | if boxes: 177 | target = (torch.FloatTensor(boxes), 178 | torch.FloatTensor(categories).unsqueeze(1)) 179 | else: 180 | target = (torch.ones([1, 4]), torch.ones([1, 1]) * -1) 181 | 182 | return target 183 | 184 | def collate_fn(self, batch): 185 | 'Create batch from multiple samples' 186 | 187 | if self.training: 188 | data, targets = zip(*batch) 189 | max_det = 
max([t.size()[0] for t in targets]) 190 | targets = [torch.cat([t, torch.ones([max_det - t.size()[0], 5]) * -1]) for t in targets] 191 | targets = torch.stack(targets, 0) 192 | else: 193 | data, indices, ratios = zip(*batch) 194 | 195 | # Pad data to match max batch dimensions 196 | sizes = [d.size()[-2:] for d in data] 197 | w, h = (max(dim) for dim in zip(*sizes)) 198 | 199 | data_stack = [] 200 | for datum in data: 201 | pw, ph = w - datum.size()[-2], h - datum.size()[-1] 202 | data_stack.append( 203 | F.pad(datum, (0, ph, 0, pw)) if max(ph, pw) > 0 else datum) 204 | 205 | data = torch.stack(data_stack) 206 | 207 | if self.training: 208 | return data, targets 209 | 210 | ratios = torch.FloatTensor(ratios).view(-1, 1, 1) 211 | return data, torch.IntTensor(indices), ratios 212 | 213 | 214 | class DataIterator(): 215 | 'Data loader for data parallel' 216 | 217 | def __init__(self, path, resize, max_size, batch_size, stride, world, annotations, training=False, 218 | crop_number=False): 219 | self.resize = resize 220 | self.max_size = max_size 221 | 222 | self.dataset = CocoDataset(path, resize=resize, max_size=max_size, 223 | stride=stride, annotations=annotations, training=training, crop_number=crop_number) 224 | self.ids = self.dataset.ids 225 | self.coco = self.dataset.coco 226 | 227 | self.sampler = data.distributed.DistributedSampler(self.dataset) if world > 1 else None 228 | self.dataloader = data.DataLoader(self.dataset, batch_size=batch_size // world, 229 | sampler=self.sampler, collate_fn=self.dataset.collate_fn, num_workers=2, 230 | pin_memory=True) 231 | 232 | def __repr__(self): 233 | return '\n'.join([ 234 | ' loader: pytorch', 235 | ' resize: {}, max: {}'.format(self.resize, self.max_size), 236 | ]) 237 | 238 | def __len__(self): 239 | return len(self.dataloader) 240 | 241 | def __iter__(self): 242 | for output in self.dataloader: 243 | if self.dataset.training: 244 | data, target = output 245 | else: 246 | data, ids, ratio = output 247 | 248 | if torch.cuda.is_available(): 249 | data = data.cuda(non_blocking=True) 250 | 251 | if self.dataset.training: 252 | if torch.cuda.is_available(): 253 | target = target.cuda(non_blocking=True) 254 | yield data, target 255 | else: 256 | if torch.cuda.is_available(): 257 | ids = ids.cuda(non_blocking=True) 258 | ratio = ratio.cuda(non_blocking=True) 259 | yield data, ids, ratio 260 | -------------------------------------------------------------------------------- /retinanet/export_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | /opt/conda/bin/python -W ignore main.py export ../models/onnx/retinanet_rn18fpn_1_stage.pth ../models/retinanet_rn18fpn_1_stage.engine --full-precision --size 1280 1024 --batch 1 4 | /opt/conda/bin/python -W ignore main.py export ../models/onnx/retinanet_rn34fpn_2_stage.pth ../models/retinanet_rn34fpn_2_stage.engine --full-precision --size 256 512 --batch 1 5 | -------------------------------------------------------------------------------- /retinanet/infer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import tempfile 4 | from contextlib import redirect_stdout 5 | 6 | import torch 7 | from apex import amp 8 | from apex.parallel import DistributedDataParallel as DDP 9 | from pycocotools.cocoeval import COCOeval 10 | 11 | from .dali import DaliDataIterator 12 | from .data import DataIterator 13 | from .model import Model 14 | from .utils import Profiler 15 | 16 | def infer(model, 
path, detections_file, resize, max_size, batch_size, mixed_precision=True, is_master=True, world=0, 17 | annotations=None, use_dali=True, is_validation=False, verbose=True): 18 | 'Run inference on images from path' 19 | 20 | backend = 'pytorch' if isinstance(model, Model) or isinstance(model, DDP) else 'tensorrt' 21 | 22 | stride = model.module.stride if isinstance(model, DDP) else model.stride 23 | 24 | # Create annotations if none was provided 25 | if not annotations: 26 | annotations = tempfile.mktemp('.json') 27 | images = [{'id': i, 'file_name': f} for i, f in enumerate(os.listdir(path))] 28 | json.dump({'images': images}, open(annotations, 'w')) 29 | 30 | # TensorRT only supports fixed input sizes, so override input size accordingly 31 | if backend == 'tensorrt': max_size = max(model.input_size) 32 | 33 | # Prepare dataset 34 | if verbose: print('Preparing dataset...') 35 | data_iterator = (DaliDataIterator if use_dali else DataIterator)( 36 | path, resize, max_size, batch_size, stride, 37 | world, annotations, training=False) 38 | if verbose: print(data_iterator) 39 | 40 | # Prepare model 41 | if backend is 'pytorch': 42 | # If we are doing validation during training, 43 | # no need to register model with AMP again 44 | if not is_validation: 45 | if torch.cuda.is_available(): model = model.cuda() 46 | model = amp.initialize(model, None, 47 | opt_level='O2' if mixed_precision else 'O0', 48 | keep_batchnorm_fp32=True, 49 | verbosity=0) 50 | 51 | model.eval() 52 | 53 | if verbose: 54 | print(' backend: {}'.format(backend)) 55 | print(' device: {} {}'.format( 56 | world, 'cpu' if not torch.cuda.is_available() else 'gpu' if world == 1 else 'gpus')) 57 | print(' batch: {}, precision: {}'.format(batch_size, 58 | 'unknown' if backend is 'tensorrt' else 'mixed' if mixed_precision else 'full')) 59 | print('Running inference...') 60 | 61 | results = [] 62 | profiler = Profiler(['infer', 'fw']) 63 | with torch.no_grad(): 64 | for i, (data, ids, ratios) in enumerate(data_iterator): 65 | # Forward pass 66 | profiler.start('fw') 67 | scores, boxes, classes = model(data) 68 | profiler.stop('fw') 69 | 70 | results.append([scores, boxes, classes, ids, ratios]) 71 | 72 | profiler.bump('infer') 73 | if verbose and (profiler.totals['infer'] > 60 or i == len(data_iterator) - 1): 74 | size = len(data_iterator.ids) 75 | msg = '[{:{len}}/{}]'.format(min((i + 1) * batch_size, 76 | size), size, len=len(str(size))) 77 | msg += ' {:.3f}s/{}-batch'.format(profiler.means['infer'], batch_size) 78 | msg += ' (fw: {:.3f}s)'.format(profiler.means['fw']) 79 | msg += ', {:.1f} im/s'.format(batch_size / profiler.means['infer']) 80 | print(msg, flush=True) 81 | 82 | profiler.reset() 83 | 84 | # Gather results from all devices 85 | if verbose: print('Gathering results...') 86 | results = [torch.cat(r, dim=0) for r in zip(*results)] 87 | if world > 1: 88 | for r, result in enumerate(results): 89 | all_result = [torch.ones_like(result, device=result.device) for _ in range(world)] 90 | torch.distributed.all_gather(list(all_result), result) 91 | results[r] = torch.cat(all_result, dim=0) 92 | 93 | if is_master: 94 | # Copy buffers back to host 95 | results = [r.cpu() for r in results] 96 | 97 | # Collect detections 98 | detections = [] 99 | processed_ids = set() 100 | for scores, boxes, classes, image_id, ratios in zip(*results): 101 | image_id = image_id.item() 102 | if image_id in processed_ids: 103 | continue 104 | processed_ids.add(image_id) 105 | 106 | keep = (scores > 0).nonzero() 107 | scores = scores[keep].view(-1) 108 
| boxes = boxes[keep, :].view(-1, 4) / ratios 109 | classes = classes[keep].view(-1).int() 110 | 111 | for score, box, cat in zip(scores, boxes, classes): 112 | x1, y1, x2, y2 = box.data.tolist() 113 | cat = cat.item() 114 | if 'annotations' in data_iterator.coco.dataset: 115 | cat = data_iterator.coco.getCatIds()[cat] 116 | detections.append({ 117 | 'image_id': image_id, 118 | 'score': score.item(), 119 | 'bbox': [x1, y1, x2 - x1 + 1, y2 - y1 + 1], 120 | 'category_id': cat 121 | }) 122 | 123 | if detections: 124 | # Save detections 125 | if detections_file and verbose: print('Writing {}...'.format(detections_file)) 126 | detections = {'annotations': detections} 127 | detections['images'] = data_iterator.coco.dataset['images'] 128 | if 'categories' in data_iterator.coco.dataset: 129 | detections['categories'] = [data_iterator.coco.dataset['categories']] 130 | if detections_file: 131 | json.dump(detections, open(detections_file, 'w'), indent=4) 132 | 133 | # Evaluate model on dataset 134 | if 'annotations' in data_iterator.coco.dataset: 135 | if verbose: print('Evaluating model...') 136 | with redirect_stdout(None): 137 | coco_pred = data_iterator.coco.loadRes(detections['annotations']) 138 | coco_eval = COCOeval(data_iterator.coco, coco_pred, 'bbox') 139 | coco_eval.evaluate() 140 | coco_eval.accumulate() 141 | coco_eval.summarize() 142 | # print(coco_eval.stats) 143 | return 1 - 2 * (coco_eval.stats[0] * coco_eval.stats[8] / (coco_eval.stats[0] + coco_eval.stats[8])) 144 | else: 145 | print('No detections!') 146 | 147 | return 'asd' 148 | -------------------------------------------------------------------------------- /retinanet/infer_example.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL.Image import Image 3 | import torch.nn.functional as F 4 | from retinanet.main import get_model_trt 5 | 6 | model = get_model_trt('trt_engine_path_retina') 7 | 8 | CUDA_VISIBLE_DEVICES = 0 9 | 10 | 11 | def infer(model, imgs): 12 | r = [] 13 | resize = 1024 14 | max_size = 1280 15 | stride = model.stride 16 | 17 | for k, i in enumerate(imgs): 18 | data, ration = resize_normalize_pad(i, stride, resize, max_size) 19 | scores, boxes, classes = model(data) 20 | score, box, clazz = scores[0][0].cpu().numpy(), boxes[0][0].cpu().numpy(), classes[0][0].cpu().numpy() 21 | if score > 0.8: 22 | print(score, box, clazz) 23 | 24 | return r 25 | 26 | 27 | def resize_normalize_pad(img, stride, resize, max_size): 28 | mean = [0.485, 0.456, 0.406] 29 | std = [0.229, 0.224, 0.225] 30 | img = img.copy()[..., ::-1] 31 | im = Image.fromarray(img) 32 | ratio = resize / min(im.size) 33 | if ratio * max(im.size) > max_size: 34 | ratio = max_size / max(im.size) 35 | im = im.resize((int(ratio * d) for d in im.size), Image.BILINEAR) 36 | 37 | data = torch.ByteTensor(torch.ByteStorage.from_buffer(im.tobytes())) 38 | data = data.float().div(255).view(*im.size[::-1], len(im.mode)) 39 | data = data.permute(2, 0, 1) 40 | 41 | for t, mean, std in zip(data, mean, std): 42 | t.sub_(mean).div_(std) 43 | 44 | pw, ph = ((stride - d % stride) % stride for d in im.size) 45 | data = F.pad(data, (0, pw, 0, ph)).to(CUDA_VISIBLE_DEVICES) 46 | data = data.unsqueeze_(0) 47 | return data, ratio 48 | -------------------------------------------------------------------------------- /retinanet/inference_no_dali.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, os 3 | 4 | import pycuda.driver as cuda 5 | import 
pycuda.autoinit 6 | import tensorrt as trt 7 | from PIL import Image 8 | import numpy as np 9 | 10 | import tensorflow as tf 11 | # For preprocessing_factory and nets_factory, we need Slim to be available in the PYTHONPATH 12 | # This can be cloned from https://github.com/tensorflow/models.git into the current directory 13 | ROOT_DIR = os.path.dirname(__file__) 14 | SLIM_DIR = os.path.join(ROOT_DIR, "models", "research", "slim") 15 | sys.path.insert(1, SLIM_DIR) 16 | from preprocessing import preprocessing_factory 17 | from nets import nets_factory 18 | 19 | TRT_LOGGER = trt.Logger() 20 | IMAGE_PATH = os.path.join(ROOT_DIR, "lfw_cropped", "Zach_Parise", "Cropped1.bmp") 21 | 22 | class HostDeviceMem(object): 23 | def __init__(self, host_mem, device_mem): 24 | self.host = host_mem 25 | self.device = device_mem 26 | 27 | def allocate_buffers(engine): 28 | inputs = [] 29 | outputs = [] 30 | bindings = [] 31 | for binding in engine: 32 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 33 | dtype = engine.get_binding_dtype(binding) 34 | # Allocate device buffers for each binding. 35 | device_mem = cuda.mem_alloc(size * dtype.itemsize) 36 | bindings.append(int(device_mem)) 37 | if engine.binding_is_input(binding): 38 | inputs.append(HostDeviceMem(None, device_mem)) 39 | else: 40 | # We only need to allocate host buffers for outputs. 41 | host_mem = cuda.pagelocked_empty(size, trt.nptype(dtype)) 42 | outputs.append(HostDeviceMem(host_mem, device_mem)) 43 | return inputs, outputs, bindings 44 | 45 | def do_inference(context, bindings, inputs, outputs, stream): 46 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 47 | context.execute_async(bindings=bindings, stream_handle=stream.handle) 48 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 49 | stream.synchronize() 50 | return [out.host for out in outputs] 51 | 52 | def load_engine(): 53 | with trt.Runtime(TRT_LOGGER) as runtime, open(os.path.join(ROOT_DIR, "resnet50.engine"), "rb") as f: 54 | return runtime.deserialize_cuda_engine(f.read()) 55 | 56 | # Build the TF preprocessing graph. 57 | def build_preprocessing_graph(): 58 | MODEL_NAME = "resnet_v1_50" 59 | image_size = nets_factory.get_network_fn(MODEL_NAME, 8, is_training=False).default_image_size 60 | image_preprocessing_fn = preprocessing_factory.get_preprocessing(MODEL_NAME, is_training=False) 61 | 62 | graph = tf.Graph() 63 | with graph.as_default(): 64 | inp_tensor = tf.placeholder(tf.uint8, shape=[None, None, 3]) 65 | pre_image = image_preprocessing_fn(inp_tensor, image_size, image_size) 66 | out_tensor = tf.expand_dims(pre_image, 0) 67 | return graph, inp_tensor, out_tensor 68 | 69 | def main(): 70 | graph, inp_tensor, out_tensor = build_preprocessing_graph() 71 | # Using a ConfigProto, we can limit the amount of GPU memory TensorFlow uses. 72 | # By default, it will try to reserve ALL memory, which is clearly not needed here. 73 | config = tf.ConfigProto() 74 | # The two options are: 75 | # 1. Explicitly set the fraction of GPU memory available to TensorFlow 76 | # 2. Use allow_growth to allocate memory only as needed. 77 | # Option 1. 78 | # config.gpu_options.per_process_gpu_memory_fraction = 0.1 79 | # Option 2. 80 | config.gpu_options.allow_growth = True 81 | # Finally, make sure you set up the session with the config. 
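# Note: this script targets the TF 1.x Session/ConfigProto API. A rough TF 2.x
# equivalent (untested here) would be:
#   for gpu in tf.config.experimental.list_physical_devices('GPU'):
#       tf.config.experimental.set_memory_growth(gpu, True)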
82 | with tf.Session(graph=graph, config=config) as sess, load_engine() as engine, engine.create_execution_context() as context: 83 | inputs, outputs, bindings = allocate_buffers(engine) 84 | stream = cuda.Stream() 85 | 86 | # Inference loop. 87 | img = np.array(Image.open(IMAGE_PATH)) 88 | inputs[0].host = sess.run(out_tensor, feed_dict={inp_tensor: img}).ravel() 89 | [output] = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) 90 | 91 | if __name__ == '__main__': 92 | main() 93 | -------------------------------------------------------------------------------- /retinanet/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class FocalLoss(nn.Module): 7 | 'Focal Loss - https://arxiv.org/abs/1708.02002' 8 | 9 | def __init__(self, alpha=0.25, gamma=2): 10 | super().__init__() 11 | self.alpha = alpha 12 | self.gamma = gamma 13 | 14 | def forward(self, pred_logits, target): 15 | pred = pred_logits.sigmoid() 16 | ce = F.binary_cross_entropy_with_logits(pred_logits, target, reduction='none') 17 | alpha = target * self.alpha + (1. - target) * (1. - self.alpha) 18 | pt = torch.where(target == 1, pred, 1 - pred) 19 | return alpha * (1. - pt) ** self.gamma * ce 20 | 21 | 22 | class SmoothL1Loss(nn.Module): 23 | 'Smooth L1 Loss' 24 | 25 | def __init__(self, beta=0.11): 26 | super().__init__() 27 | self.beta = beta 28 | 29 | def forward(self, pred, target): 30 | x = (pred - target).abs() 31 | l1 = x - 0.5 * self.beta 32 | l2 = 0.5 * x ** 2 / self.beta 33 | return torch.where(x >= self.beta, l1, l2) 34 | 35 | 36 | class LossBinary(nn.Module): 37 | """ 38 | Loss defined as \alpha BCE - (1 - \alpha) SoftJaccard 39 | """ 40 | 41 | def __init__(self, jaccard_weight=0.3): 42 | super().__init__() 43 | self.nll_loss = nn.BCEWithLogitsLoss() 44 | self.jaccard_weight = jaccard_weight 45 | 46 | def forward(self, outputs, targets): 47 | loss = (1 - self.jaccard_weight) * self.nll_loss(outputs, targets) 48 | 49 | if self.jaccard_weight: 50 | eps = 1e-15 51 | jaccard_target = (targets == 1).float() 52 | jaccard_output = F.sigmoid(outputs) 53 | 54 | intersection = (jaccard_output * jaccard_target).sum() 55 | union = jaccard_output.sum() + jaccard_target.sum() 56 | 57 | loss -= self.jaccard_weight * torch.log((intersection + eps) / (union - intersection + eps)) 58 | return loss 59 | -------------------------------------------------------------------------------- /retinanet/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import random 5 | import sys 6 | 7 | import torch.cuda 8 | import torch.distributed 9 | import torch.multiprocessing 10 | from retinanet._C import Engine 11 | 12 | from retinanet import infer, train 13 | from retinanet.model import Model 14 | 15 | CUDA_VISIBLE_DEVICES = 1 16 | 17 | 18 | def parse(args): 19 | parser = argparse.ArgumentParser(description='RetinaNet Detection Utility.') 20 | parser.add_argument('--master', metavar='address:port', type=str, help='Adress and port of the master worker', 21 | default='127.0.0.1:29500') 22 | 23 | subparsers = parser.add_subparsers(help='sub-command', dest='command') 24 | subparsers.required = True 25 | 26 | devcount = max(1, torch.cuda.device_count()) 27 | 28 | parser_train = subparsers.add_parser('train', help='train a network') 29 | parser_train.add_argument('model', type=str, help='path 
to output model or checkpoint to resume from') 30 | parser_train.add_argument('--annotations', metavar='path', type=str, help='path to COCO style annotations', 31 | required=True) 32 | parser_train.add_argument('--images', metavar='path', type=str, help='path to images', default='.') 33 | parser_train.add_argument('--backbone', action='store', type=str, nargs='+', help='backbone model (or list of)', 34 | default=['ResNet50FPN']) 35 | parser_train.add_argument('--classes', metavar='num', type=int, help='number of classes', default=80) 36 | parser_train.add_argument('--batch', metavar='size', type=int, help='batch size', default=1 * devcount) 37 | parser_train.add_argument('--resize', metavar='scale', type=int, help='resize to given size', default=1280) 38 | parser_train.add_argument('--max-size', metavar='max', type=int, help='maximum resizing size', default=1280) 39 | parser_train.add_argument('--jitter', metavar='min max', type=int, nargs=2, help='jitter size within range', 40 | default=[1024, 1280]) 41 | parser_train.add_argument('--iters', metavar='number', type=int, help='number of iterations to train for', 42 | default=20000) 43 | parser_train.add_argument('--milestones', action='store', type=int, nargs='*', 44 | help='list of iteration indices where learning rate decays', default=[15000, 30000]) 45 | parser_train.add_argument('--schedule', metavar='scale', type=float, 46 | help='scale schedule (affecting iters and milestones)', default=1) 47 | parser_train.add_argument('--full-precision', help='train in full precision', action='store_true') 48 | parser_train.add_argument('--lr', metavar='value', help='learning rate', type=float, default=0.0001) 49 | parser_train.add_argument('--warmup', metavar='iterations', help='numer of warmup iterations', type=int, 50 | default=200) 51 | parser_train.add_argument('--gamma', metavar='value', type=float, 52 | help='multiplicative factor of learning rate decay', default=0.1) 53 | parser_train.add_argument('--override', help='override model', action='store_true') 54 | parser_train.add_argument('--val-annotations', metavar='path', type=str, 55 | help='path to COCO style validation annotations') 56 | parser_train.add_argument('--val-images', metavar='path', type=str, help='path to validation images') 57 | parser_train.add_argument('--post-metrics', metavar='url', type=str, help='post metrics to specified url') 58 | parser_train.add_argument('--fine-tune', metavar='path', type=str, help='fine tune a pretrained model') 59 | parser_train.add_argument('--logdir', metavar='logdir', type=str, help='directory where to write logs') 60 | parser_train.add_argument('--val-iters', metavar='number', type=int, 61 | help='number of iterations between each validation', default=20) 62 | parser_train.add_argument('--with-dali', help='use dali for data loading', action='store_true') 63 | parser_train.add_argument('--crop-number', help='crop number', type=bool, default=False) 64 | parser_train.add_argument('--model-threshold', type=float, default=0.05) 65 | parser_train.add_argument('--model-top-n', type=int, default=1000) 66 | parser_train.add_argument('--model-nms', type=float, default=0.5) 67 | parser_train.add_argument('--model-detections', type=int, default=100) 68 | 69 | parser_infer = subparsers.add_parser('infer', help='run inference') 70 | parser_infer.add_argument('model', type=str, help='path to model') 71 | parser_infer.add_argument('--images', metavar='path', type=str, help='path to images', default='.') 72 | parser_infer.add_argument('--annotations', 
metavar='annotations', type=str, 73 | help='evaluate using provided annotations') 74 | parser_infer.add_argument('--output', metavar='file', type=str, help='save detections to specified JSON file', 75 | default='detections.json') 76 | parser_infer.add_argument('--batch', metavar='size', type=int, help='batch size', default=2 * devcount) 77 | parser_infer.add_argument('--resize', metavar='scale', type=int, help='resize to given size', default=800) 78 | parser_infer.add_argument('--max-size', metavar='max', type=int, help='maximum resizing size', default=1333) 79 | parser_infer.add_argument('--with-dali', help='use dali for data loading', action='store_true') 80 | parser_infer.add_argument('--full-precision', help='inference in full precision', action='store_true') 81 | 82 | parser_export = subparsers.add_parser('export', help='export a model into a TensorRT engine') 83 | parser_export.add_argument('model', type=str, help='path to model') 84 | parser_export.add_argument('export', type=str, help='path to exported output') 85 | parser_export.add_argument('--size', metavar='height width', type=int, nargs='+', 86 | help='input size (square) or sizes (h w) to use when generating TensorRT engine', 87 | default=[1280]) 88 | parser_export.add_argument('--batch', metavar='size', type=int, help='max batch size to use for TensorRT engine', 89 | default=2) 90 | parser_export.add_argument('--full-precision', help='export in full instead of half precision', action='store_true') 91 | parser_export.add_argument('--int8', help='calibrate model and export in int8 precision', action='store_true') 92 | parser_export.add_argument('--opset', metavar='version', type=int, help='ONNX opset version') 93 | parser_export.add_argument('--calibration-batches', metavar='size', type=int, 94 | help='number of batches to use for int8 calibration', default=10) 95 | parser_export.add_argument('--calibration-images', metavar='path', type=str, 96 | help='path to calibration images to use for int8 calibration', default="") 97 | parser_export.add_argument('--calibration-table', metavar='path', type=str, 98 | help='path of existing calibration table to load from, or name of new calibration table', 99 | default="") 100 | parser_export.add_argument('--verbose', help='enable verbose logging', action='store_true') 101 | 102 | return parser.parse_args(args) 103 | 104 | 105 | def load_model(args, verbose=False): 106 | if args.command != 'train' and not os.path.isfile(args.model): 107 | raise RuntimeError('Model file {} does not exist!'.format(args.model)) 108 | 109 | model = None 110 | state = {} 111 | _, ext = os.path.splitext(args.model) 112 | 113 | if args.command == 'train' and (not os.path.exists(args.model) or args.override): 114 | if verbose: print('Initializing model...') 115 | model = Model(args.backbone, args.classes, 116 | config={'threshold': args.model_threshold, 'top_n': args.model_top_n, 'nms': args.model_nms, 117 | 'detections': args.model_detections}) 118 | model.initialize(args.fine_tune) 119 | if verbose: print(model) 120 | 121 | elif ext == '.pth' or ext == '.torch': 122 | if verbose: print('Loading model from {}...'.format(os.path.basename(args.model))) 123 | model, state = Model.load(args.model) 124 | if verbose: print(model) 125 | 126 | elif args.command == 'infer' and ext in ['.engine', '.plan']: 127 | model = None 128 | 129 | else: 130 | raise RuntimeError('Invalid model format "{}"!'.format(args.ext)) 131 | 132 | state['path'] = args.model 133 | return model, state 134 | 135 | 136 | def worker(rank, args, world, 
model, state): 137 | 'Per-device distributed worker' 138 | 139 | if torch.cuda.is_available(): 140 | os.environ.update({ 141 | 'MASTER_PORT': args.master.split(':')[-1], 142 | 'MASTER_ADDR': ':'.join(args.master.split(':')[:-1]), 143 | 'WORLD_SIZE': str(world), 144 | 'RANK': str(rank), 145 | 'CUDA_DEVICE': str(rank) 146 | }) 147 | 148 | torch.cuda.set_device(rank) 149 | torch.distributed.init_process_group(backend='nccl', init_method='env://') 150 | 151 | if args.batch % world != 0: 152 | raise RuntimeError('Batch size should be a multiple of the number of GPUs') 153 | 154 | if args.command == 'train': 155 | train.train(model, state, args.images, args.annotations, 156 | args.val_images or args.images, args.val_annotations, args.resize, args.max_size, args.jitter, 157 | args.batch, int(args.iters * args.schedule), args.val_iters, not args.full_precision, args.lr, 158 | args.warmup, [int(m * args.schedule) for m in args.milestones], args.gamma, 159 | is_master=(rank == 0), world=world, use_dali=args.with_dali, 160 | metrics_url=args.post_metrics, logdir=args.logdir, verbose=(rank == 0), 161 | crop_number=args.crop_number) 162 | 163 | elif args.command == 'infer': 164 | if model is None: 165 | if rank == 0: print('Loading CUDA engine from {}...'.format(os.path.basename(args.model))) 166 | model = Engine.load(args.model) 167 | 168 | infer.infer(model, args.images, args.output, args.resize, args.max_size, args.batch, 169 | annotations=args.annotations, mixed_precision=not args.full_precision, 170 | is_master=(rank == 0), world=world, use_dali=args.with_dali, verbose=(rank == 0)) 171 | 172 | elif args.command == 'export': 173 | onnx_only = args.export.split('.')[-1] == 'onnx' 174 | input_size = args.size * 2 if len(args.size) == 1 else args.size 175 | 176 | calibration_files = [] 177 | if args.int8: 178 | # Get list of images to use for calibration 179 | if os.path.isdir(args.calibration_images): 180 | import glob 181 | file_extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG'] 182 | for ex in file_extensions: 183 | calibration_files += glob.glob("{}/*{}".format(args.calibration_images, ex), recursive=True) 184 | # Only need enough images for specified num of calibration batches 185 | if len(calibration_files) >= args.calibration_batches * args.batch: 186 | calibration_files = calibration_files[:(args.calibration_batches * args.batch)] 187 | else: 188 | print('Only found enough images for {} batches. 
Continuing anyway...'.format( 189 | len(calibration_files) // args.batch)) 190 | 191 | random.shuffle(calibration_files) 192 | 193 | precision = "FP32" 194 | if args.int8: 195 | precision = "INT8" 196 | elif not args.full_precision: 197 | precision = "FP16" 198 | 199 | exported = model.export(input_size, args.batch, precision, calibration_files, args.calibration_table, 200 | args.verbose, onnx_only=onnx_only) 201 | if onnx_only: 202 | with open(args.export, 'wb') as out: 203 | out.write(exported) 204 | else: 205 | exported.save(args.export) 206 | 207 | 208 | def main(args=None): 209 | 'Entry point for the retinanet command' 210 | 211 | args = parse(args or sys.argv[1:]) 212 | 213 | model, state = load_model(args, verbose=True) 214 | if model: model.share_memory() 215 | 216 | world = torch.cuda.device_count() 217 | if args.command == 'export' or world <= 1: 218 | worker(0, args, 1, model, state) 219 | else: 220 | torch.multiprocessing.spawn(worker, args=(args, world, model, state), nprocs=world) 221 | 222 | 223 | def get_model_trt(path): 224 | return Engine.load(path) 225 | 226 | 227 | if __name__ == '__main__': 228 | main() 229 | -------------------------------------------------------------------------------- /retinanet/model.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import io 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | 7 | from . import backbones as backbones_mod 8 | from ._C import Engine 9 | from .box import generate_anchors, snap_to_anchors, decode, nms 10 | from .loss import FocalLoss, SmoothL1Loss 11 | 12 | 13 | class Model(nn.Module): 14 | 'RetinaNet - https://arxiv.org/abs/1708.02002' 15 | 16 | def __init__(self, backbones='ResNet50FPN', classes=80, config={}): 17 | super().__init__() 18 | 19 | if not isinstance(backbones, list): 20 | backbones = [backbones] 21 | 22 | self.backbones = nn.ModuleDict({b: getattr(backbones_mod, b)() for b in backbones}) 23 | self.name = 'RetinaNet' 24 | self.exporting = False 25 | 26 | self.ratios = [1.0, 2.0, 0.5] 27 | self.scales = [4 * 2 ** (i / 3) for i in range(3)] 28 | self.anchors = {} 29 | self.classes = classes 30 | 31 | self.threshold = config.get('threshold', 0.05) 32 | self.top_n = config.get('top_n', 1000) 33 | self.nms = config.get('nms', 0.5) 34 | self.detections = config.get('detections', 100) 35 | 36 | self.stride = max([b.stride for _, b in self.backbones.items()]) 37 | 38 | # classification and box regression heads 39 | def make_head(out_size): 40 | layers = [] 41 | for _ in range(4): 42 | layers += [nn.Conv2d(256, 256, 3, padding=1), nn.ReLU()] 43 | layers += [nn.Conv2d(256, out_size, 3, padding=1)] 44 | return nn.Sequential(*layers) 45 | 46 | anchors = len(self.ratios) * len(self.scales) 47 | self.cls_head = make_head(classes * anchors) 48 | self.box_head = make_head(4 * anchors) 49 | 50 | self.cls_criterion = FocalLoss() 51 | self.box_criterion = SmoothL1Loss(beta=0.11) 52 | 53 | def __repr__(self): 54 | return '\n'.join([ 55 | ' model: {}'.format(self.name), 56 | ' backbone: {}'.format(', '.join([k for k, _ in self.backbones.items()])), 57 | ' classes: {}, anchors: {}'.format(self.classes, len(self.ratios) * len(self.scales)), 58 | ]) 59 | 60 | def initialize(self, pre_trained): 61 | if pre_trained: 62 | # Initialize using weights from pre-trained model 63 | if not os.path.isfile(pre_trained): 64 | raise ValueError('No checkpoint {}'.format(pre_trained)) 65 | 66 | print('Fine-tuning weights from {}...'.format(os.path.basename(pre_trained))) 67 | 
state_dict = self.state_dict() 68 | chk = torch.load(pre_trained, map_location=lambda storage, loc: storage) 69 | ignored = ['cls_head.8.bias', 'cls_head.8.weight'] 70 | weights = {k: v for k, v in chk['state_dict'].items() if k not in ignored} 71 | state_dict.update(weights) 72 | self.load_state_dict(state_dict) 73 | 74 | del chk, weights 75 | torch.cuda.empty_cache() 76 | 77 | else: 78 | # Initialize backbone(s) 79 | for _, backbone in self.backbones.items(): 80 | backbone.initialize() 81 | 82 | # Initialize heads 83 | def initialize_layer(layer): 84 | if isinstance(layer, nn.Conv2d): 85 | nn.init.normal_(layer.weight, std=0.01) 86 | if layer.bias is not None: 87 | nn.init.constant_(layer.bias, val=0) 88 | 89 | self.cls_head.apply(initialize_layer) 90 | self.box_head.apply(initialize_layer) 91 | 92 | # Initialize class head prior 93 | def initialize_prior(layer): 94 | pi = 0.01 95 | b = - math.log((1 - pi) / pi) 96 | nn.init.constant_(layer.bias, b) 97 | nn.init.normal_(layer.weight, std=0.01) 98 | 99 | self.cls_head[-1].apply(initialize_prior) 100 | 101 | def forward(self, x): 102 | if self.training: x, targets = x 103 | 104 | # Backbones forward pass 105 | features = [] 106 | for _, backbone in self.backbones.items(): 107 | features.extend(backbone(x)) 108 | 109 | # Heads forward pass 110 | cls_heads = [self.cls_head(t) for t in features] 111 | box_heads = [self.box_head(t) for t in features] 112 | 113 | if self.training: 114 | return self._compute_loss(x, cls_heads, box_heads, targets.float()) 115 | 116 | cls_heads = [cls_head.sigmoid() for cls_head in cls_heads] 117 | 118 | if self.exporting: 119 | self.strides = [x.shape[-1] // cls_head.shape[-1] for cls_head in cls_heads] 120 | return cls_heads, box_heads 121 | 122 | # Inference post-processing 123 | decoded = [] 124 | for cls_head, box_head in zip(cls_heads, box_heads): 125 | # Generate level's anchors 126 | stride = x.shape[-1] // cls_head.shape[-1] 127 | if stride not in self.anchors: 128 | self.anchors[stride] = generate_anchors(stride, self.ratios, self.scales) 129 | 130 | # Decode and filter boxes 131 | decoded.append(decode(cls_head, box_head, stride, 132 | self.threshold, self.top_n, self.anchors[stride])) 133 | 134 | # Perform non-maximum suppression 135 | decoded = [torch.cat(tensors, 1) for tensors in zip(*decoded)] 136 | return nms(*decoded, self.nms, self.detections) 137 | 138 | def _extract_targets(self, targets, stride, size): 139 | cls_target, box_target, depth = [], [], [] 140 | for target in targets: 141 | target = target[target[:, -1] > -1] 142 | if stride not in self.anchors: 143 | self.anchors[stride] = generate_anchors(stride, self.ratios, self.scales) 144 | snapped = snap_to_anchors( 145 | target, [s * stride for s in size[::-1]], stride, 146 | self.anchors[stride].to(targets.device), self.classes, targets.device) 147 | for l, s in zip((cls_target, box_target, depth), snapped): l.append(s) 148 | return torch.stack(cls_target), torch.stack(box_target), torch.stack(depth) 149 | 150 | def _compute_loss(self, x, cls_heads, box_heads, targets): 151 | cls_losses, box_losses, fg_targets = [], [], [] 152 | for cls_head, box_head in zip(cls_heads, box_heads): 153 | size = cls_head.shape[-2:] 154 | stride = x.shape[-1] / cls_head.shape[-1] 155 | 156 | cls_target, box_target, depth = self._extract_targets(targets, stride, size) 157 | fg_targets.append((depth > 0).sum().float().clamp(min=1)) 158 | 159 | cls_head = cls_head.view_as(cls_target).float() 160 | cls_mask = (depth >= 0).expand_as(cls_target).float() 161 | 
cls_loss = self.cls_criterion(cls_head, cls_target) 162 | cls_loss = cls_mask * cls_loss 163 | cls_losses.append(cls_loss.sum()) 164 | 165 | box_head = box_head.view_as(box_target).float() 166 | box_mask = (depth > 0).expand_as(box_target).float() 167 | box_loss = self.box_criterion(box_head, box_target) 168 | box_loss = box_mask * box_loss 169 | box_losses.append(box_loss.sum()) 170 | 171 | fg_targets = torch.stack(fg_targets).sum() 172 | cls_loss = torch.stack(cls_losses).sum() / fg_targets 173 | box_loss = torch.stack(box_losses).sum() / fg_targets 174 | return cls_loss, box_loss 175 | 176 | def save(self, state): 177 | checkpoint = { 178 | 'backbone': [k for k, _ in self.backbones.items()], 179 | 'classes': self.classes, 180 | 'state_dict': self.state_dict() 181 | } 182 | 183 | for key in ('iteration', 'optimizer', 'scheduler'): 184 | if key in state: 185 | checkpoint[key] = state[key] 186 | 187 | dir_name = state['path'][:-4] 188 | if not os.path.exists(dir_name): 189 | os.makedirs(dir_name) 190 | 191 | torch.save(checkpoint, dir_name + '/' + str(state['iteration']) + '_' + os.path.basename(state['path'])) 192 | 193 | 194 | @classmethod 195 | def load(cls, filename): 196 | if not os.path.isfile(filename): 197 | raise ValueError('No checkpoint {}'.format(filename)) 198 | 199 | checkpoint = torch.load(filename, map_location=lambda storage, loc: storage) 200 | # Recreate model from checkpoint instead of from individual backbones 201 | model = cls(backbones=checkpoint['backbone'], classes=checkpoint['classes']) 202 | model.load_state_dict(checkpoint['state_dict']) 203 | 204 | state = {} 205 | for key in ('iteration', 'optimizer', 'scheduler'): 206 | if key in checkpoint: 207 | state[key] = checkpoint[key] 208 | 209 | del checkpoint 210 | torch.cuda.empty_cache() 211 | 212 | return model, state 213 | 214 | def export(self, size, batch, precision, calibration_files, calibration_table, verbose, onnx_only=False): 215 | 216 | import torch.onnx.symbolic_opset9 as onnx_symbolic 217 | def upsample_nearest2d(g, input, output_size): 218 | # Currently, TRT 5.1/6.0 ONNX Parser does not support all ONNX ops 219 | # needed to support the dynamic upsampling ONNX formulation 220 | # Here we hardcode scale=2 as a temporary workaround 221 | scales = g.op("Constant", value_t=torch.tensor([1., 1., 2., 2.])) 222 | return g.op("Upsample", input, scales, mode_s="nearest") 223 | 224 | onnx_symbolic.upsample_nearest2d = upsample_nearest2d 225 | 226 | # Export to ONNX 227 | print('Exporting to ONNX...') 228 | self.exporting = True 229 | onnx_bytes = io.BytesIO() 230 | zero_input = torch.zeros([1, 3, *size]).cuda() 231 | extra_args = {'verbose': verbose} 232 | torch.onnx.export(self.cuda(), zero_input, onnx_bytes, **extra_args) 233 | self.exporting = False 234 | 235 | if onnx_only: 236 | return onnx_bytes.getvalue() 237 | 238 | # Build TensorRT engine 239 | model_name = '_'.join([k for k, _ in self.backbones.items()]) 240 | anchors = [generate_anchors(stride, self.ratios, self.scales).view(-1).tolist() 241 | for stride in self.strides] 242 | return Engine(onnx_bytes.getvalue(), len(onnx_bytes.getvalue()), batch, precision, 243 | self.threshold, self.top_n, anchors, self.nms, self.detections, calibration_files, model_name, 244 | calibration_table, verbose) -------------------------------------------------------------------------------- /retinanet/train.py: -------------------------------------------------------------------------------- 1 | from math import isfinite 2 | from statistics import mean 3 | 4 | import torch 5 | from apex import 
amp 6 | from apex.parallel import DistributedDataParallel 7 | from torch.optim import Adam, SGD 8 | from torch.optim.lr_scheduler import ReduceLROnPlateau 9 | 10 | from .backbones.layers import convert_fixedbn_model 11 | from .dali import DaliDataIterator 12 | from .data import DataIterator 13 | from .infer import infer 14 | from .utils import ignore_sigint, post_metrics, Profiler 15 | 16 | 17 | def train(model, state, path, annotations, val_path, val_annotations, resize, max_size, jitter, batch_size, iterations, 18 | val_iterations, mixed_precision, lr, warmup, milestones, gamma, is_master=True, world=1, use_dali=True, 19 | verbose=True, metrics_url=None, logdir=None, crop_number=False): 20 | 'Train the model on the given dataset' 21 | 22 | # Prepare model 23 | nn_model = model 24 | stride = model.stride 25 | 26 | model = convert_fixedbn_model(model) 27 | if torch.cuda.is_available(): 28 | model = model.cuda() 29 | 30 | # Setup optimizer and schedule 31 | # optimizer = SGD(model.parameters(), lr=lr, weight_decay=0.0000001) # , momentum=0.9 32 | optimizer = Adam(model.parameters(), lr=lr, weight_decay=0.0000001) 33 | 34 | model, optimizer = amp.initialize(model, optimizer, 35 | opt_level='O2' if mixed_precision else 'O0', 36 | # keep_batchnorm_fp32=True, 37 | loss_scale=128.0, 38 | verbosity=is_master) 39 | 40 | if world > 1: 41 | model = DistributedDataParallel(model) 42 | model.train() 43 | 44 | if 'optimizer' in state: 45 | optimizer.load_state_dict(state['optimizer']) 46 | 47 | ''' 48 | def schedule(train_iter): 49 | if warmup and train_iter <= warmup: 50 | return 0.9 * train_iter / warmup + 0.1 51 | return gamma ** len([m for m in milestones if m <= train_iter]) 52 | scheduler = LambdaLR(optimizer, schedule) 53 | ''' 54 | 55 | # the scheduler is stepped with a validation F1 score, so reduce the LR when that score stops improving 56 | scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True) 57 | 57 | # Prepare dataset 58 | if verbose: print('Preparing dataset...') 59 | data_iterator = (DaliDataIterator if use_dali else DataIterator)( 60 | path, jitter, max_size, batch_size, stride, 61 | world, annotations, training=True, crop_number=crop_number) 62 | if verbose: print(data_iterator) 63 | 64 | if verbose: 65 | print(' device: {} {}'.format( 66 | world, 'cpu' if not torch.cuda.is_available() else 'gpu' if world == 1 else 'gpus')) 67 | print(' batch: {}, precision: {}'.format(batch_size, 'mixed' if mixed_precision else 'full')) 68 | print('Training model for {} iterations...'.format(iterations)) 69 | 70 | # Create TensorBoard writer 71 | if logdir is not None: 72 | from tensorboardX import SummaryWriter 73 | if is_master and verbose: 74 | print('Writing TensorBoard logs to: {}'.format(logdir)) 75 | writer = SummaryWriter(logdir=logdir) 76 | 77 | profiler = Profiler(['train', 'fw', 'bw']) 78 | iteration = state.get('iteration', 0) 79 | try: 80 | while iteration < iterations: 81 | cls_losses, box_losses = [], [] 82 | for i, (data, target) in enumerate(data_iterator): 83 | # scheduler.step(iteration) 84 | 85 | # Forward pass 86 | profiler.start('fw') 87 | 88 | optimizer.zero_grad() 89 | cls_loss, box_loss = model([data, target]) 90 | del data 91 | profiler.stop('fw') 92 | 93 | # Backward pass 94 | profiler.start('bw') 95 | with amp.scale_loss(cls_loss + box_loss, optimizer) as scaled_loss: 96 | scaled_loss.backward() 97 | optimizer.step() 98 | 99 | # Reduce all losses 100 | cls_loss, box_loss = cls_loss.mean().clone(), box_loss.mean().clone() 101 | if world > 1: 102 | torch.distributed.all_reduce(cls_loss) 103 | torch.distributed.all_reduce(box_loss) 104 | cls_loss 
/= world 105 | box_loss /= world 106 | if is_master: 107 | cls_losses.append(cls_loss) 108 | box_losses.append(box_loss) 109 | 110 | if is_master and not isfinite(cls_loss + box_loss): 111 | raise RuntimeError('Loss is diverging!\n{}'.format( 112 | 'Try lowering the learning rate.')) 113 | 114 | del cls_loss, box_loss 115 | profiler.stop('bw') 116 | 117 | iteration += 1 118 | profiler.bump('train') 119 | if is_master and (profiler.totals['train'] > 2 or iteration == iterations): 120 | focal_loss = torch.stack(list(cls_losses)).mean().item() 121 | box_loss = torch.stack(list(box_losses)).mean().item() 122 | learning_rate = optimizer.param_groups[0]['lr'] 123 | if verbose: 124 | msg = '[{:{len}}/{}]'.format(iteration, iterations, len=len(str(iterations))) 125 | msg += ' focal loss: {:.5f}'.format(focal_loss) 126 | msg += ', box loss: {:.5f}'.format(box_loss) 127 | msg += ', {:.3f}s/{}-batch'.format(profiler.means['train'], batch_size) 128 | msg += ' (fw: {:.3f}s, bw: {:.3f}s)'.format(profiler.means['fw'], profiler.means['bw']) 129 | msg += ', {:.1f} im/s'.format(batch_size / profiler.means['train']) 130 | msg += ', lr: {:.2g}'.format(learning_rate) 131 | print(msg, flush=True) 132 | 133 | if logdir is not None: 134 | writer.add_scalar('focal_loss', focal_loss, iteration) 135 | writer.add_scalar('box_loss', box_loss, iteration) 136 | writer.add_scalar('learning_rate', learning_rate, iteration) 137 | del box_loss, focal_loss 138 | 139 | if metrics_url: 140 | post_metrics(metrics_url, { 141 | 'focal loss': mean(cls_losses), 142 | 'box loss': mean(box_losses), 143 | 'im_s': batch_size / profiler.means['train'], 144 | 'lr': learning_rate 145 | }) 146 | 147 | # Save model weights 148 | state.update({ 149 | 'iteration': iteration, 150 | 'optimizer': optimizer.state_dict(), 151 | 'scheduler': scheduler.state_dict(), 152 | }) 153 | # with ignore_sigint(): 154 | # nn_model.save(state) 155 | 156 | profiler.reset() 157 | del cls_losses[:], box_losses[:] 158 | 159 | if val_annotations and (iteration == iterations or iteration % val_iterations == 0): 160 | f1_m = infer(model, val_path, None, resize, max_size, batch_size, annotations=val_annotations, 161 | mixed_precision=mixed_precision, is_master=is_master, world=world, use_dali=use_dali, 162 | is_validation=True, verbose=True) 163 | 164 | if not isinstance(f1_m, str): 165 | print('f1_m:' + str(f1_m)) 166 | scheduler.step(f1_m) 167 | model.train() 168 | if is_master: 169 | print('Saving model: ' + str(state['iteration'])) 170 | with ignore_sigint(): 171 | nn_model.save(state) 172 | 173 | if iteration == iterations: 174 | break 175 | except Exception as e: 176 | print(e) 177 | 178 | if logdir is not None: 179 | writer.close() 180 | -------------------------------------------------------------------------------- /retinanet/utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import time 3 | import json 4 | import warnings 5 | import signal 6 | from datetime import datetime 7 | from contextlib import contextmanager 8 | from PIL import Image, ImageDraw 9 | import requests 10 | 11 | def show_detections(detections): 12 | 'Show image with drawn detections' 13 | 14 | for image, detections in detections.items(): 15 | im = Image.open(image).convert('RGBA') 16 | overlay = Image.new('RGBA', im.size, (255,255,255,0)) 17 | draw = ImageDraw.Draw(overlay) 18 | detections.sort(key=lambda d: d['score']) 19 | for detection in detections: 20 | box = detection['bbox'] 21 | alpha = int(detection['score'] * 255) 22 | 
draw.rectangle(box, outline=(255, 255, 255, alpha)) 23 | draw.text((box[0]+2, box[1]), '[{}]'.format(detection['class']), 24 | fill=(255, 255, 255, alpha)) 25 | draw.text((box[0]+2, box[1]+10), '{:.2}'.format(detection['score']), 26 | fill=(255, 255, 255, alpha)) 27 | im = Image.alpha_composite(im, overlay) 28 | im.show() 29 | 30 | def save_detections(path, detections): 31 | print('Writing detections to {}...'.format(os.path.basename(path))) 32 | with open(path, 'w') as f: 33 | json.dump(detections, f) 34 | 35 | @contextmanager 36 | def ignore_sigint(): 37 | handler = signal.getsignal(signal.SIGINT) 38 | signal.signal(signal.SIGINT, signal.SIG_IGN) 39 | try: 40 | yield 41 | finally: 42 | signal.signal(signal.SIGINT, handler) 43 | 44 | class Profiler(object): 45 | def __init__(self, names=['main']): 46 | self.names = names 47 | self.lasts = { k: 0 for k in names } 48 | self.totals = self.lasts.copy() 49 | self.counts = self.lasts.copy() 50 | self.means = self.lasts.copy() 51 | self.reset() 52 | 53 | def reset(self): 54 | last = time.time() 55 | for name in self.names: 56 | self.lasts[name] = last 57 | self.totals[name] = 0 58 | self.counts[name] = 0 59 | self.means[name] = 0 60 | 61 | def start(self, name='main'): 62 | self.lasts[name] = time.time() 63 | 64 | def stop(self, name='main'): 65 | self.totals[name] += time.time() - self.lasts[name] 66 | self.counts[name] += 1 67 | self.means[name] = self.totals[name] / self.counts[name] 68 | 69 | def bump(self, name='main'): 70 | self.stop(name) 71 | self.start(name) 72 | 73 | def post_metrics(url, metrics): 74 | try: 75 | for k, v in metrics.items(): 76 | requests.post(url, 77 | data={ 'time': int(datetime.now().timestamp() * 1e9), 78 | 'metric': k, 'value': v }) 79 | except Exception as e: 80 | warnings.warn('Warning: posting metrics failed: {}'.format(e)) 81 | 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | setup( 5 | name='retinanet', 6 | version='0.1', 7 | description='Fast and accurate single shot object detector', 8 | author='NVIDIA Corporation', 9 | author_email='fchabert@nvidia.com', 10 | packages=['retinanet', 'retinanet.backbones'], 11 | ext_modules=[CUDAExtension('retinanet._C', 12 | ['csrc/extensions.cpp', 'csrc/engine.cpp', 'csrc/cuda/decode.cu', 'csrc/cuda/nms.cu'], 13 | extra_compile_args={ 14 | 'cxx': ['-std=c++11', '-O2', '-Wall'], 15 | 'nvcc': [ 16 | '-std=c++11', '--expt-extended-lambda', '--use_fast_math', '-Xcompiler', '-Wall', 17 | '-gencode=arch=compute_60,code=sm_60', '-gencode=arch=compute_61,code=sm_61', 18 | '-gencode=arch=compute_70,code=sm_70', '-gencode=arch=compute_72,code=sm_72', 19 | '-gencode=arch=compute_75,code=sm_75', '-gencode=arch=compute_75,code=compute_75' 20 | ], 21 | }, 22 | libraries=['nvinfer', 'nvinfer_plugin', 'nvonnxparser']) 23 | ], 24 | cmdclass={'build_ext': BuildExtension.with_options(no_python_abi_suffix=True)}, 25 | install_requires=[ 26 | 'torch>=1.0.0a0', 27 | 'torchvision', 28 | 'apex @ git+https://github.com/NVIDIA/apex', 29 | 'pycocotools @ git+https://github.com/nvidia/cocoapi.git#subdirectory=PythonAPI', 30 | 'pillow==6.2.2', 31 | 'requests', 32 | ], 33 | entry_points={'console_scripts': ['retinanet=retinanet.main:main']} 34 | ) 35 | -------------------------------------------------------------------------------- /unet/common/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aidonchuk/retinanet-examples/b0a9c0ef36c38eb8d602de83d68665b23df4e76f/unet/common/__init__.py -------------------------------------------------------------------------------- /unet/common/models_common.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torchvision.models import resnet18 4 | 5 | 6 | class ResNet18(nn.Module): 7 | 8 | def __init__(self, num_classes=1, num_filters=32, pretrained=True, is_deconv=True, requires_grad=True): 9 | super().__init__() 10 | self.num_classes = num_classes 11 | 12 | self.pool = nn.MaxPool2d(2, 2) 13 | 14 | self.encoder = resnet18(pretrained=pretrained) 15 | 16 | for params in self.encoder.parameters(): 17 | params.requires_grad = requires_grad 18 | 19 | self.relu = nn.ReLU(inplace=True) 20 | 21 | self.conv1 = nn.Sequential(self.encoder.conv1, 22 | self.encoder.bn1, 23 | self.encoder.relu, 24 | self.pool 25 | ) 26 | 27 | self.conv2 = self.encoder.layer1 28 | self.conv3 = self.encoder.layer2 29 | self.conv4 = self.encoder.layer3 30 | self.conv5 = self.encoder.layer4 31 | 32 | self.center = DecoderBlock(512, num_filters * 8 * 2, num_filters * 8, is_deconv) 33 | 34 | self.dec5 = DecoderBlock(512 + num_filters * 8, num_filters * 8 * 2, num_filters * 8, is_deconv) 35 | self.dec4 = DecoderBlock(256 + num_filters * 8, num_filters * 8 * 2, num_filters * 8, is_deconv) 36 | self.dec3 = DecoderBlock(128 + num_filters * 8, num_filters * 4 * 2, num_filters * 2, is_deconv) 37 | self.dec2 = DecoderBlock(64 + num_filters * 2, num_filters * 2 * 2, num_filters * 2 * 2, is_deconv) 38 | self.dec1 = DecoderBlock(num_filters * 2 * 2, num_filters * 2 * 2, num_filters, is_deconv) 39 | self.dec0 = ConvRelu(num_filters, num_filters) 40 | self.final = nn.Conv2d(num_filters, num_classes, kernel_size=1) 41 | 42 | def forward(self, x): 43 | conv1 = self.conv1(x) 44 | conv2 = self.conv2(conv1) 45 | conv3 = self.conv3(conv2) 46 | conv4 = self.conv4(conv3) 47 | conv5 = self.conv5(conv4) 48 | 49 | center = self.center(self.pool(conv5)) 50 | 51 | dec5 = self.dec5(torch.cat([center, conv5], 1)) 52 | dec4 = self.dec4(torch.cat([dec5, conv4], 1)) 53 | dec3 = self.dec3(torch.cat([dec4, conv3], 1)) 54 | dec2 = self.dec2(torch.cat([dec3, conv2], 1)) 55 | dec1 = self.dec1(dec2) 56 | dec0 = self.dec0(dec1) 57 | 58 | x_out = self.final(dec0) 59 | return x_out 60 | 61 | 62 | class DecoderBlock(nn.Module): 63 | 64 | def __init__(self, in_channels, middle_channels, out_channels, is_deconv=False, one_conv=False): 65 | super(DecoderBlock, self).__init__() 66 | self.in_channels = in_channels 67 | 68 | if is_deconv: 69 | 70 | self.block = nn.Sequential( 71 | nn.ConvTranspose2d(in_channels, out_channels, 72 | kernel_size=4, stride=2, padding=1), 73 | nn.ReLU(inplace=True) 74 | ) 75 | else: 76 | if not one_conv: 77 | self.block = nn.Sequential( 78 | nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False), 79 | ConvRelu(in_channels, middle_channels), 80 | ConvRelu(middle_channels, out_channels), 81 | ) 82 | else: 83 | self.block = nn.Sequential( 84 | nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False), 85 | ConvRelu(in_channels, out_channels), 86 | ) 87 | 88 | def forward(self, x): 89 | return self.block(x) 90 | 91 | 92 | def conv3x3(in_, out): 93 | return nn.Conv2d(in_, out, 3, padding=1) 94 | 95 | 96 | def conv1x1(in_, out): 97 | return nn.Conv2d(in_, out, 1, padding=0) 98 | 99 | 100 | 
class ConvRelu(nn.Module): 101 | def __init__(self, in_: int, out: int): 102 | super(ConvRelu, self).__init__() 103 | self.conv = conv3x3(in_, out) 104 | self.activation = nn.ReLU(inplace=True) 105 | 106 | def forward(self, x): 107 | x = self.conv(x) 108 | x = self.activation(x) 109 | return x 110 | 111 | 112 | class ConvRelu1x1(nn.Module): 113 | def __init__(self, in_: int, out: int): 114 | super(ConvRelu1x1, self).__init__() 115 | self.conv = conv1x1(in_, out) 116 | self.activation = nn.ReLU(inplace=True) 117 | 118 | def forward(self, x): 119 | x = self.conv(x) 120 | x = self.activation(x) 121 | return x 122 | -------------------------------------------------------------------------------- /unet/common/pt_models.py: -------------------------------------------------------------------------------- 1 | from pytorch_tools.segmentation_models import Unet 2 | 3 | 4 | def resnet34_blur(num_classes): 5 | model = Unet(decoder_use_batchnorm=False, classes=num_classes, antialias=False) 6 | return model 7 | -------------------------------------------------------------------------------- /unet/common/smp_models.py: -------------------------------------------------------------------------------- 1 | import segmentation_models_pytorch as smp 2 | 3 | 4 | def se_resnet50(num_classes): 5 | ENCODER = 'se_resnet50' 6 | ENCODER_WEIGHTS = 'imagenet' 7 | 8 | model = smp.Unet( 9 | encoder_name=ENCODER, 10 | encoder_weights=ENCODER_WEIGHTS, 11 | classes=num_classes, 12 | activation=None, 13 | ) 14 | 15 | return model 16 | 17 | 18 | def resnet34_fpn(num_classes): 19 | ENCODER = 'resnet34' 20 | ENCODER_WEIGHTS = 'imagenet' 21 | 22 | model = smp.FPN( 23 | encoder_name=ENCODER, 24 | encoder_weights=ENCODER_WEIGHTS, 25 | classes=num_classes, 26 | activation=None, 27 | ) 28 | 29 | return model 30 | 31 | 32 | def resnet34_psp(num_classes): 33 | ENCODER = 'resnet34' 34 | ENCODER_WEIGHTS = 'imagenet' 35 | 36 | model = smp.PSPNet( 37 | encoder_name=ENCODER, 38 | encoder_weights=ENCODER_WEIGHTS, 39 | classes=num_classes, 40 | activation=None, 41 | ) 42 | 43 | return model 44 | -------------------------------------------------------------------------------- /unet/convert_to_trt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | 5 | import tensorrt as trt 6 | 7 | try: 8 | import pycuda.driver as cuda 9 | import pycuda.autoinit 10 | except NameError: 11 | FileNotFoundError = IOError 12 | 13 | sys.path.insert(1, os.path.join(sys.path[0], "..")) 14 | 15 | TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 16 | trt.init_libnvinfer_plugins(TRT_LOGGER, '') 17 | 18 | 19 | def GiB(val): 20 | return val * 1 << 30 21 | 22 | 23 | def onnx_engine_trt(engine_path): 24 | with trt.Builder(TRT_LOGGER) as builder, \ 25 | builder.create_network() as network, \ 26 | trt.OnnxParser(network, TRT_LOGGER) as parser: 27 | builder.max_workspace_size = GiB(1) 28 | builder.fp16_mode = False 29 | with open(engine_path, 'rb') as model: 30 | parser.parse(model.read()) 31 | # network.mark_output(network.get_layer(network.num_layers - 1).get_output(0)) 32 | engine = builder.build_cuda_engine(network) 33 | # write the serialized engine next to the .onnx file, with the .engine extension the move below expects 34 | with open(engine_path[:-5] + '.engine', 'wb') as f: 35 | f.write(engine.serialize()) 36 | 37 | 38 | print('Export...') 39 | onnx_engine_trt('../models/onnx/resnet.onnx') 40 | shutil.move('../models/onnx/resnet.engine', '../models/resnet.engine') 41 | -------------------------------------------------------------------------------- /unet/export_onnx.py: 
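(A sketch, not a file from this repository: once the script below has produced resnet34.onnx, the graph can be sanity-checked with onnxruntime before building a TensorRT engine with unet/convert_to_trt.py. onnxruntime is an extra dependency the repository does not otherwise use, and the shapes assume the 4-class, 320x320 model exported below.)

import numpy as np
import onnxruntime as ort  # assumed extra dependency: pip install onnxruntime

sess = ort.InferenceSession('resnet34.onnx')
x = np.random.randn(1, 3, 320, 320).astype(np.float32)
# run the exported graph once on random data; a single output tensor of logits is expected
(logits,) = sess.run(None, {sess.get_inputs()[0].name: x})
assert logits.shape == (1, 4, 320, 320)  # matches OUTPUT_SHAPE used by unet/infer_service.py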
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from unet.common.models_common import ResNet18 4 | 5 | 6 | def get_model(model_path, model_type='ResNet18'): 7 | num_classes = 4 8 | 9 | if model_type == 'ResNet18': 10 | model = ResNet18(num_classes=num_classes) 11 | 12 | state = torch.load(str(model_path)) 13 | state = {key.replace('module.', ''): value for key, value in state['model'].items()} 14 | model.load_state_dict(state) 15 | 16 | model.eval() 17 | if torch.cuda.is_available(): 18 | return model.cuda() 19 | 20 | return model 21 | 22 | 23 | model = get_model('39_model_0.pt', model_type='ResNet18')  # only 'ResNet18' is implemented in get_model() 24 | 25 | p = torch.randn(1, 3, 320, 320, device='cuda') 26 | 27 | torch.onnx.export(model, p, 'resnet34.onnx', verbose=True) 28 | 29 | import onnx 30 | 31 | # Load the ONNX model 32 | model = onnx.load('resnet34.onnx') 33 | 34 | # Check that the IR is well formed 35 | onnx.checker.check_model(model) 36 | 37 | # Print a human readable representation of the graph 38 | print(onnx.helper.printable_graph(model.graph)) 39 | -------------------------------------------------------------------------------- /unet/infer_service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import cv2 5 | import numpy as np 6 | import tensorrt as trt 7 | # from common.pre_process_utils import normalized, DTYPE  # module not present; normalized and DTYPE are defined below 8 | 9 | remove_after_predict = False 10 | camera = None 11 | cam_id = None 12 | 13 | try: 14 | import pycuda.driver as cuda 15 | import pycuda.autoinit 16 | except NameError: 17 | FileNotFoundError = IOError 18 | 19 | sys.path.insert(1, os.path.join(sys.path[0], "..")) 20 | 21 | TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 22 | trt.init_libnvinfer_plugins(TRT_LOGGER, '') 23 | 24 | DTYPE = trt.float32 25 | 26 | def normalized(img, buff, input_shape): 27 | def normalize_image(image): 28 | image_arr = np.asarray(cv2.resize(image, (input_shape[1], input_shape[2])).transpose( 29 | [2, 0, 1]).astype( 30 | trt.nptype(DTYPE)).ravel()) 31 | return (image_arr / 255.0 - 0.456) / 0.225 32 | 33 | np.copyto(buff, normalize_image(img)) 34 | return img 35 | 36 | 37 | def allocate_buffers(engine): 38 | h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(DTYPE)) 39 | h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(DTYPE)) 40 | d_input = cuda.mem_alloc(h_input.nbytes) 41 | d_output = cuda.mem_alloc(h_output.nbytes) 42 | stream = cuda.Stream() 43 | return h_input, d_input, h_output, d_output, stream 44 | 45 | 46 | def do_inference(context, h_input, d_input, h_output, d_output, stream): 47 | cuda.memcpy_htod_async(d_input, h_input, stream) 48 | context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) 49 | cuda.memcpy_dtoh_async(h_output, d_output, stream) 50 | stream.synchronize() 51 | 52 | 53 | def load_engine_trt(engine_path): 54 | with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as trt_runtime: 55 | engine_data = f.read() 56 | engine = trt_runtime.deserialize_cuda_engine(engine_data) 57 | return engine 58 | 59 | 60 | def infer(context, images, h_input, d_input, h_output, d_output, stream, input_shape, output_shape): 61 | r = [] 62 | for img in images: 63 | normalized(img, h_input, input_shape) 64 | # start_inf = time.time() 65 | do_inference(context, h_input, d_input, h_output, d_output, stream) 66 | r.append(np.reshape(h_output.copy(), output_shape)) 67 | # print("TensorRT 
inference time: {} ms".format(int(round((time.time() - start_inf) * 1000)))) 68 | return r 69 | 70 | 71 | imgs = None # np array 72 | 73 | INPUT_SHAPE = (3, 320, 320) 74 | OUTPUT_SHAPE = (4, 320, 320) 75 | 76 | with load_engine_trt('trt_engine_path_pads') as trt_engine: 77 | h_input, d_input, h_output, d_output, stream = allocate_buffers(trt_engine) 78 | with trt_engine.create_execution_context() as context: 79 | output = infer(context, imgs, h_input, d_input, h_output, d_output, stream, INPUT_SHAPE, OUTPUT_SHAPE) 80 | del context 81 | del trt_engine 82 | del h_input, d_input, h_output, d_output, stream 83 | --------------------------------------------------------------------------------
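The module-level code at the bottom of unet/infer_service.py leaves imgs as None and points at a placeholder engine path, so it cannot run as written. A minimal sketch of a working driver, assuming the helper functions above (load_engine_trt, allocate_buffers, infer) are in scope and that the engine file and test image below exist (both paths are placeholders, not files shipped with the repository):

import cv2
import numpy as np

INPUT_SHAPE = (3, 320, 320)
OUTPUT_SHAPE = (4, 320, 320)

frame = cv2.imread('frame.jpg')                    # placeholder image path
engine = load_engine_trt('models/resnet.engine')   # placeholder engine path
h_input, d_input, h_output, d_output, stream = allocate_buffers(engine)
with engine.create_execution_context() as context:
    masks = infer(context, [frame], h_input, d_input, h_output, d_output,
                  stream, INPUT_SHAPE, OUTPUT_SHAPE)
# infer() returns one (4, 320, 320) array of raw logits per image; a sigmoid gives per-class masks
pred = 1.0 / (1.0 + np.exp(-masks[0]))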