├── .gitattributes ├── LICENCE ├── README.md ├── assets ├── animate.gif ├── banner.gif ├── figures │ ├── 000000006471.jpg │ └── 000000014439.jpg └── sparseinst.png ├── configs ├── Base-SparseInst.yaml ├── Sparse_Inst_r50_giam_onnx.yaml ├── sparse_inst_cspdarknet53_giam.yaml ├── sparse_inst_darknet53_giam.yaml ├── sparse_inst_pvt_b1_giam.yaml ├── sparse_inst_pvt_b2_li_giam.yaml ├── sparse_inst_r101_dcn_giam.yaml ├── sparse_inst_r101_giam.yaml ├── sparse_inst_r50_base.yaml ├── sparse_inst_r50_dcn_giam_aug.yaml ├── sparse_inst_r50_giam.yaml ├── sparse_inst_r50_giam_aug.yaml ├── sparse_inst_r50_giam_fp16.yaml ├── sparse_inst_r50_giam_soft.yaml ├── sparse_inst_r50vd_base.yaml ├── sparse_inst_r50vd_dcn_giam.yaml ├── sparse_inst_r50vd_dcn_giam_aug.yaml ├── sparse_inst_r50vd_giam.yaml └── sparse_inst_r50vd_giam_aug.yaml ├── convert_onnx.py ├── convert_tensorrt.py ├── datasets ├── prepare_ade20k_sem_seg.py ├── prepare_cocofied_lvis.py ├── prepare_for_tests.sh └── prepare_panoptic_fpn.py ├── demo.py ├── engine ├── __pycache__ │ └── defaults.cpython-36.pyc └── defaults.py ├── eval_tensorrt_onnx.py ├── input └── input_image │ ├── 640x640.jpg │ ├── cup.jpg │ ├── femme.jpg │ ├── homme.jpg │ ├── horses.jpg │ ├── image1.jpg │ ├── input.jpg │ ├── results.png │ ├── skate.jpg │ └── turkish_coffee.jpg ├── onnx └── __pycache__ │ └── image_processing.cpython-36.pyc ├── output ├── mnist.tar.gz ├── mnist │ ├── model.onnx │ ├── test_data_set_0 │ │ ├── input_0.pb │ │ └── output_0.pb │ ├── test_data_set_1 │ │ ├── input_0.pb │ │ └── output_0.pb │ └── test_data_set_2 │ │ ├── input_0.pb │ │ └── output_0.pb └── sparse_inst_r50_giam │ ├── config.yaml │ └── log.txt ├── results ├── 640_result.jpg ├── result_onnx.png └── result_tensorrt.png ├── sparseinst ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── caffe2sparseinst.cpython-36.pyc │ ├── coco_evaluation.cpython-36.pyc │ ├── config.cpython-36.pyc │ ├── d2_predictor.cpython-36.pyc │ ├── dataset_mapper.cpython-36.pyc │ ├── decoder.cpython-36.pyc │ ├── encoder.cpython-36.pyc │ ├── loss.cpython-36.pyc │ ├── sparseinst.cpython-36.pyc │ └── utils.cpython-36.pyc ├── backbones │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── cspnet.cpython-36.pyc │ │ ├── pvt.cpython-36.pyc │ │ └── resnet.cpython-36.pyc │ ├── cspnet.py │ ├── pvt.py │ └── resnet.py ├── caffe2sparseinst.py ├── coco_evaluation.py ├── config.py ├── d2_predictor.py ├── dataset_mapper.py ├── decoder.py ├── encoder.py ├── input.ppm ├── loss.py ├── sparseinst.py └── utils.py ├── test.py ├── test_net.py └── train_net.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Hust Visual Learning Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | 
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SparseInst_TensorRT 2 | **This repository implements the real-time instance segmentation algorithm SparseInst with TensorRT and ONNX.** 3 | 4 | ## Some remarks 5 | - This repository is built on top of the **hustvl/SparseInst** repository (https://github.com/hustvl/SparseInst.git); for additional information about the installation of SparseInst, refer to the original repository. 6 | - This project is built upon the excellent detectron2 framework; you should install detectron2 first, please check the official installation guide for more details (https://github.com/facebookresearch/detectron2.git). 7 | - For commands other than TensorRT and ONNX inference (e.g. test_net.py), please refer to the initial repository. 8 | - If you face any problem at parsing time, don't hesitate to open an issue (or drop a :star: if there aren't any). _**If you have compatibility problems, use the model weights uploaded in the table below and go directly to the testing section.**_ 9 | - Be aware that, in order to parse the model to ONNX and TensorRT, the files sparseinst.py, encoder.py and decoder.py have been slightly modified; don't forget to check the modifications if you come from the initial repository. 10 | 11 | 12 | ## Prerequisites 13 | <details>
14 | <summary>Click me</summary> 15 | 16 | - Install PyTorch (1.10.0) and TorchVision (0.11.1) 17 | ``` 18 | pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 19 | 20 | # If another torch version is needed, pin it explicitly, e.g. torch==1.11.0+cu102 21 | ``` 22 | - Install CUDA (10.2) and cuDNN (8.0.0): https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=WSL-Ubuntu&target_version=2.0&target_type=deb_local 23 | 24 | - For WSL-Ubuntu: 25 | ``` 26 | sudo wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin 27 | sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600 28 | sudo wget https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb 29 | 30 | sudo dpkg -i cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb 31 | sudo cp /var/cuda-repo-wsl-ubuntu-11-7-local/cuda-96193861-keyring.gpg /usr/share/keyrings/ 32 | sudo apt-get update 33 | sudo apt-get -y install cuda 34 | ``` 35 | 36 | - Install TensorRT (8.0.1.6); if you are using an NVIDIA edge device, TensorRT should already be installed 37 | ``` 38 | python3 -m pip install --upgrade setuptools pip 39 | python3 -m pip install nvidia-pyindex 40 | python3 -m pip install --upgrade nvidia-tensorrt 41 | 42 | # Verify the installation with: assert tensorrt.Builder(tensorrt.Logger()) -- see the sanity-check snippet below 43 | ``` 44 | - Install ONNX and ONNX Runtime 45 | ``` 46 | pip install onnxruntime-gpu 47 | pip install onnxruntime 48 | pip install numpy protobuf==4.21.5 49 | pip install onnx 50 | ``` 51 | - Install all the other packages needed to run the original SparseInst algorithm (this should already be done if you have installed Detectron2) 52 | 53 | </details>
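To make sure TensorRT and ONNX Runtime are usable before converting any model, you can run a small sanity check such as the one below. This is only a minimal sketch; the reported versions and providers depend on your setup:

```
import tensorrt
import onnx
import onnxruntime

print("TensorRT     :", tensorrt.__version__)
print("ONNX         :", onnx.__version__)
print("ONNX Runtime :", onnxruntime.__version__)
print("ORT providers:", onnxruntime.get_available_providers())

# Creating a TensorRT builder fails if the CUDA / TensorRT installation is broken.
assert tensorrt.Builder(tensorrt.Logger())
print("TensorRT builder created successfully.")
```

If the GPU is visible to ONNX Runtime, `CUDAExecutionProvider` (or `TensorrtExecutionProvider`) should appear in the provider list.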
54 | 55 | ## Models and Results for the TensorRT and ONNX inference scripts 56 | 57 | The inference speed of PyTorch, ONNX and TensorRT has been compared and is shown in the table below. SparseInst running with TensorRT achieves roughly 3 times faster inference than SparseInst running with PyTorch, and lowering the input size of the image can lead to a decent real-time speed. 58 | The TensorRT and ONNX models are built from the first PyTorch weights listed in the table below: SparseInst R-50 G-IAM. A minimal sketch of how such per-image speeds can be measured is given after the table. 59 | 60 | *Note: all the computations have been done on an NVIDIA Jetson TX2 with JetPack 4.6. Further tests will be done on an NVIDIA RTX 2070.* 61 | 62 | 
63 | 64 | | Model | Input Size | Inference Speed | Weights | 65 | | :--- | :---: | :---: | ---: | 66 | | PyTorch | 640 | 1.71 FPS | [model](https://drive.google.com/file/d/130gyxYT6r9j5Nwp5nCo_wthYPuTwa9c4/view?usp=sharing) | 67 | | TensorRT | 320 | 20.32 FPS | [model](https://drive.google.com/file/d/17-eBWVrpnwv0ueeDsEmAqSKlNh3If4AI/view?usp=sharing) | 68 | | TensorRT | 640 | 6.00 FPS | [model](https://drive.google.com/file/d/1Kh97LZNzsuBJTeDVXwRKx8CiX7CeMI3v/view?usp=sharing) | 69 | | ONNX | 320 | 0.22 FPS | [model](https://drive.google.com/file/d/1H6YH3YUPaA4vO3IyIGaZNAkGBsU9xHCH/view?usp=sharing) | 70 | | ONNX | 640 | 0.03 FPS | [model](https://drive.google.com/file/d/1GEoQssyJ9MZRnEISiatF_tREpdGAnSjk/view?usp=sharing) | 71 | 72 | 73 | 74 | *(example results image)* 79 | 80 | 
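As a rough illustration of how per-image FPS values like those above can be measured, here is a generic timing sketch. It is not the benchmarking code of this repository, and `predict` is a placeholder for any single-image inference callable (PyTorch predictor, ONNX Runtime session wrapper or TensorRT engine wrapper):

```
import time

def measure_fps(predict, image, warmup=5, iterations=50):
    # Warm-up runs (CUDA context creation, memory allocation, caching, ...)
    for _ in range(warmup):
        predict(image)
    # When timing GPU code, make sure the work has finished before reading the
    # clock (e.g. torch.cuda.synchronize() for PyTorch), otherwise the numbers
    # mostly reflect kernel launch time.
    start = time.perf_counter()
    for _ in range(iterations):
        predict(image)
    elapsed = time.perf_counter() - start
    return iterations / elapsed

# Hypothetical usage:
# image = cv2.imread("input/input_image/640x640.jpg")
# print(f"{measure_fps(predictor, image):.2f} FPS")
```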
81 | 82 | ## Building the ONNX model: 83 | 84 | To convert the model from PyTorch to ONNX, run the following command. The arguments can be left at their default values; just check that the config path and the model weights path are correctly set up. 85 | ``` 86 | python3 convert_onnx.py   # default arguments; check convert_onnx.py for the exact flags to override 87 | ``` 88 | 89 | ## Building the TensorRT model: 90 | 91 | To convert the model from ONNX to TensorRT, run the following command. The arguments can be left at their default values. If you have any problem while parsing the model to TensorRT, don't hesitate to ask. 92 | ``` 93 | 0, "Please specify a directory with args.output" 120 | out_filename = args.output 121 | visualized_output.save(out_filename) 122 | else: 123 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 124 | cv2.imshow( 125 | WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 126 | if cv2.waitKey(0) == 27: 127 | break # esc to quit 128 | elif args.webcam: 129 | assert args.input is None, "Cannot have both --input and --webcam!" 130 | assert args.output is None, "output not yet supported with --webcam!" 131 | cam = cv2.VideoCapture(0) 132 | for vis in tqdm.tqdm(demo.run_on_video(cam, args.confidence_threshold)): 133 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 134 | cv2.imshow(WINDOW_NAME, vis) 135 | if cv2.waitKey(1) == 27: 136 | break # esc to quit 137 | cam.release() 138 | cv2.destroyAllWindows() 139 | elif args.video_input: 140 | video = cv2.VideoCapture(args.video_input) 141 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 142 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 143 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 144 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 145 | basename = os.path.basename(args.video_input) 146 | 147 | if args.output: 148 | if os.path.isdir(args.output): 149 | output_fname = os.path.join(args.output, basename) 150 | output_fname = os.path.splitext(output_fname)[0] + ".mkv" 151 | else: 152 | output_fname = args.output 153 | assert not os.path.isfile(output_fname), output_fname 154 | output_file = cv2.VideoWriter( 155 | filename=output_fname, 156 | # some installation of opencv may not support x264 (due to its license), 157 | # you can try other format (e.g. MPEG) 158 | fourcc=cv2.VideoWriter_fourcc(*"mp4v"), 159 | fps=float(frames_per_second), 160 | frameSize=(width, height), 161 | isColor=True, 162 | ) 163 | assert os.path.isfile(args.video_input) 164 | for vis_frame in tqdm.tqdm(demo.run_on_video(video, args.confidence_threshold), total=num_frames): 165 | if args.output: 166 | output_file.write(vis_frame) 167 | else: 168 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 169 | cv2.imshow(basename, vis_frame) 170 | if cv2.waitKey(1) == 27: 171 | break # esc to quit 172 | video.release() 173 | if args.output: 174 | output_file.release() 175 | else: 176 | cv2.destroyAllWindows() 177 | -------------------------------------------------------------------------------- /engine/__pycache__/defaults.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/engine/__pycache__/defaults.cpython-36.pyc -------------------------------------------------------------------------------- /engine/defaults.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | 4 | """ 5 | This file contains components with some default boilerplate logic user may need 6 | in training / testing. They will not work for everyone, but many users may find them useful. 7 | 8 | The behavior of functions/classes in this file is subject to change, 9 | since they are meant to represent the "common default behavior" people need in their projects. 10 | """ 11 | import numpy as np 12 | import argparse 13 | import logging 14 | import os 15 | import sys 16 | import weakref 17 | from collections import OrderedDict 18 | from typing import Optional 19 | import torch 20 | from fvcore.nn.precise_bn import get_bn_modules 21 | from omegaconf import OmegaConf 22 | from torch.nn.parallel import DistributedDataParallel 23 | 24 | import detectron2.data.transforms as T 25 | from detectron2.checkpoint import DetectionCheckpointer 26 | from detectron2.config import CfgNode, LazyConfig 27 | from detectron2.data import ( 28 | MetadataCatalog, 29 | build_detection_test_loader, 30 | build_detection_train_loader, 31 | ) 32 | from detectron2.evaluation import ( 33 | DatasetEvaluator, 34 | inference_on_dataset, 35 | print_csv_format, 36 | verify_results, 37 | ) 38 | from detectron2.modeling import build_model 39 | from detectron2.solver import build_lr_scheduler, build_optimizer 40 | from detectron2.utils import comm 41 | from detectron2.utils.collect_env import collect_env_info 42 | from detectron2.utils.env import seed_all_rng 43 | from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter 44 | from detectron2.utils.file_io import PathManager 45 | from detectron2.utils.logger import setup_logger 46 | 47 | 48 | 49 | __all__ = [ 50 | "DefaultPredictor", 51 | ] 52 | 53 | class DefaultPredictor: 54 | 55 | 56 | def __init__(self, cfg): 57 | self.cfg = cfg.clone() # cfg can be modified by model 58 | self.model = build_model(self.cfg) 59 | self.model.eval() 60 | if len(cfg.DATASETS.TEST): 61 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) 62 | 63 | checkpointer = DetectionCheckpointer(self.model) 64 | checkpointer.load(cfg.MODEL.WEIGHTS) 65 | 66 | self.aug = T.ResizeShortestEdge( 67 | [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST 68 | ) 69 | 70 | self.input_format = cfg.INPUT.FORMAT 71 | assert self.input_format in ["RGB", "BGR"], self.input_format 72 | 73 | def __call__(self, original_image): 74 | """ 75 | Args: 76 | original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). 77 | 78 | Returns: 79 | predictions (dict): 80 | the output of the model for one image only. 81 | See :doc:`/tutorials/models` for details about the format. 82 | """ 83 | with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 84 | # Apply pre-processing to image. 
85 | if self.input_format == "RGB": 86 | # whether the model expects BGR inputs or RGB 87 | original_image = original_image[:, :, ::-1] 88 | height, width = original_image.shape[:2] 89 | image = self.aug.get_transform(original_image).apply_image(original_image) 90 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) 91 | print("shape of image", np.shape(image)) 92 | print("image defaults", image) 93 | inputs = {"image": image, "height": height, "width": width} 94 | predictions = self.model([inputs])[0] 95 | 96 | return predictions 97 | 98 | -------------------------------------------------------------------------------- /input/input_image/640x640.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/640x640.jpg -------------------------------------------------------------------------------- /input/input_image/cup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/cup.jpg -------------------------------------------------------------------------------- /input/input_image/femme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/femme.jpg -------------------------------------------------------------------------------- /input/input_image/homme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/homme.jpg -------------------------------------------------------------------------------- /input/input_image/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/horses.jpg -------------------------------------------------------------------------------- /input/input_image/image1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/image1.jpg -------------------------------------------------------------------------------- /input/input_image/input.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/input.jpg -------------------------------------------------------------------------------- /input/input_image/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/results.png -------------------------------------------------------------------------------- /input/input_image/skate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/skate.jpg 
-------------------------------------------------------------------------------- /input/input_image/turkish_coffee.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/turkish_coffee.jpg -------------------------------------------------------------------------------- /onnx/__pycache__/image_processing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/onnx/__pycache__/image_processing.cpython-36.pyc -------------------------------------------------------------------------------- /output/mnist.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist.tar.gz -------------------------------------------------------------------------------- /output/mnist/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/model.onnx -------------------------------------------------------------------------------- /output/mnist/test_data_set_0/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_0/input_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_0/output_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_0/output_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_1/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_1/input_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_1/output_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_1/output_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_2/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_2/input_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_2/output_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_2/output_0.pb -------------------------------------------------------------------------------- 
/output/sparse_inst_r50_giam/config.yaml: -------------------------------------------------------------------------------- 1 | CUDNN_BENCHMARK: false 2 | DATALOADER: 3 | ASPECT_RATIO_GROUPING: true 4 | FILTER_EMPTY_ANNOTATIONS: true 5 | NUM_WORKERS: 4 6 | REPEAT_THRESHOLD: 0.0 7 | SAMPLER_TRAIN: TrainingSampler 8 | DATASETS: 9 | PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000 10 | PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000 11 | PROPOSAL_FILES_TEST: [] 12 | PROPOSAL_FILES_TRAIN: [] 13 | TEST: 14 | - coco_2017_val 15 | TRAIN: 16 | - coco_2017_train 17 | GLOBAL: 18 | HACK: 1.0 19 | INPUT: 20 | CROP: 21 | ENABLED: false 22 | SIZE: 23 | - 0.9 24 | - 0.9 25 | TYPE: relative_range 26 | FORMAT: RGB 27 | MASK_FORMAT: bitmask 28 | MAX_SIZE_TEST: 853 29 | MAX_SIZE_TRAIN: 853 30 | MIN_SIZE_TEST: 512 31 | MIN_SIZE_TRAIN: 32 | - 416 33 | - 448 34 | - 480 35 | - 512 36 | - 544 37 | - 576 38 | - 608 39 | - 640 40 | MIN_SIZE_TRAIN_SAMPLING: choice 41 | RANDOM_FLIP: horizontal 42 | MODEL: 43 | ANCHOR_GENERATOR: 44 | ANGLES: 45 | - - -90 46 | - 0 47 | - 90 48 | ASPECT_RATIOS: 49 | - - 0.5 50 | - 1.0 51 | - 2.0 52 | NAME: DefaultAnchorGenerator 53 | OFFSET: 0.0 54 | SIZES: 55 | - - 32 56 | - 64 57 | - 128 58 | - 256 59 | - 512 60 | BACKBONE: 61 | FREEZE_AT: 0 62 | NAME: build_resnet_backbone 63 | CSPNET: 64 | NAME: darknet53 65 | NORM: '' 66 | OUT_FEATURES: 67 | - csp1 68 | - csp2 69 | - csp3 70 | - csp4 71 | DEVICE: cuda 72 | FPN: 73 | FUSE_TYPE: sum 74 | IN_FEATURES: [] 75 | NORM: '' 76 | OUT_CHANNELS: 256 77 | KEYPOINT_ON: false 78 | LOAD_PROPOSALS: false 79 | MASK_ON: true 80 | META_ARCHITECTURE: SparseInst 81 | PANOPTIC_FPN: 82 | COMBINE: 83 | ENABLED: true 84 | INSTANCES_CONFIDENCE_THRESH: 0.5 85 | OVERLAP_THRESH: 0.5 86 | STUFF_AREA_LIMIT: 4096 87 | INSTANCE_LOSS_WEIGHT: 1.0 88 | PIXEL_MEAN: 89 | - 123.675 90 | - 116.28 91 | - 103.53 92 | PIXEL_STD: 93 | - 58.395 94 | - 57.12 95 | - 57.375 96 | PROPOSAL_GENERATOR: 97 | MIN_SIZE: 0 98 | NAME: RPN 99 | PVT: 100 | LINEAR: false 101 | NAME: b1 102 | OUT_FEATURES: 103 | - p2 104 | - p3 105 | - p4 106 | RESNETS: 107 | DEFORM_MODULATED: false 108 | DEFORM_NUM_GROUPS: 1 109 | DEFORM_ON_PER_STAGE: 110 | - false 111 | - false 112 | - false 113 | - false 114 | DEPTH: 50 115 | NORM: FrozenBN 116 | NUM_GROUPS: 1 117 | OUT_FEATURES: 118 | - res3 119 | - res4 120 | - res5 121 | RES2_OUT_CHANNELS: 256 122 | RES5_DILATION: 1 123 | STEM_OUT_CHANNELS: 64 124 | STRIDE_IN_1X1: false 125 | WIDTH_PER_GROUP: 64 126 | RETINANET: 127 | BBOX_REG_LOSS_TYPE: smooth_l1 128 | BBOX_REG_WEIGHTS: 129 | - 1.0 130 | - 1.0 131 | - 1.0 132 | - 1.0 133 | FOCAL_LOSS_ALPHA: 0.25 134 | FOCAL_LOSS_GAMMA: 2.0 135 | IN_FEATURES: 136 | - p3 137 | - p4 138 | - p5 139 | - p6 140 | - p7 141 | IOU_LABELS: 142 | - 0 143 | - -1 144 | - 1 145 | IOU_THRESHOLDS: 146 | - 0.4 147 | - 0.5 148 | NMS_THRESH_TEST: 0.5 149 | NORM: '' 150 | NUM_CLASSES: 80 151 | NUM_CONVS: 4 152 | PRIOR_PROB: 0.01 153 | SCORE_THRESH_TEST: 0.05 154 | SMOOTH_L1_LOSS_BETA: 0.1 155 | TOPK_CANDIDATES_TEST: 1000 156 | ROI_BOX_CASCADE_HEAD: 157 | BBOX_REG_WEIGHTS: 158 | - - 10.0 159 | - 10.0 160 | - 5.0 161 | - 5.0 162 | - - 20.0 163 | - 20.0 164 | - 10.0 165 | - 10.0 166 | - - 30.0 167 | - 30.0 168 | - 15.0 169 | - 15.0 170 | IOUS: 171 | - 0.5 172 | - 0.6 173 | - 0.7 174 | ROI_BOX_HEAD: 175 | BBOX_REG_LOSS_TYPE: smooth_l1 176 | BBOX_REG_LOSS_WEIGHT: 1.0 177 | BBOX_REG_WEIGHTS: 178 | - 10.0 179 | - 10.0 180 | - 5.0 181 | - 5.0 182 | CLS_AGNOSTIC_BBOX_REG: false 183 | CONV_DIM: 256 184 | FC_DIM: 1024 185 | NAME: '' 186 | NORM: '' 187 | NUM_CONV: 0 188 | 
NUM_FC: 0 189 | POOLER_RESOLUTION: 14 190 | POOLER_SAMPLING_RATIO: 0 191 | POOLER_TYPE: ROIAlignV2 192 | SMOOTH_L1_BETA: 0.0 193 | TRAIN_ON_PRED_BOXES: false 194 | ROI_HEADS: 195 | BATCH_SIZE_PER_IMAGE: 512 196 | IN_FEATURES: 197 | - res4 198 | IOU_LABELS: 199 | - 0 200 | - 1 201 | IOU_THRESHOLDS: 202 | - 0.5 203 | NAME: Res5ROIHeads 204 | NMS_THRESH_TEST: 0.5 205 | NUM_CLASSES: 80 206 | POSITIVE_FRACTION: 0.25 207 | PROPOSAL_APPEND_GT: true 208 | SCORE_THRESH_TEST: 0.05 209 | ROI_KEYPOINT_HEAD: 210 | CONV_DIMS: 211 | - 512 212 | - 512 213 | - 512 214 | - 512 215 | - 512 216 | - 512 217 | - 512 218 | - 512 219 | LOSS_WEIGHT: 1.0 220 | MIN_KEYPOINTS_PER_IMAGE: 1 221 | NAME: KRCNNConvDeconvUpsampleHead 222 | NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true 223 | NUM_KEYPOINTS: 17 224 | POOLER_RESOLUTION: 14 225 | POOLER_SAMPLING_RATIO: 0 226 | POOLER_TYPE: ROIAlignV2 227 | ROI_MASK_HEAD: 228 | CLS_AGNOSTIC_MASK: false 229 | CONV_DIM: 256 230 | NAME: MaskRCNNConvUpsampleHead 231 | NORM: '' 232 | NUM_CONV: 0 233 | POOLER_RESOLUTION: 14 234 | POOLER_SAMPLING_RATIO: 0 235 | POOLER_TYPE: ROIAlignV2 236 | RPN: 237 | BATCH_SIZE_PER_IMAGE: 256 238 | BBOX_REG_LOSS_TYPE: smooth_l1 239 | BBOX_REG_LOSS_WEIGHT: 1.0 240 | BBOX_REG_WEIGHTS: 241 | - 1.0 242 | - 1.0 243 | - 1.0 244 | - 1.0 245 | BOUNDARY_THRESH: -1 246 | CONV_DIMS: 247 | - -1 248 | HEAD_NAME: StandardRPNHead 249 | IN_FEATURES: 250 | - res4 251 | IOU_LABELS: 252 | - 0 253 | - -1 254 | - 1 255 | IOU_THRESHOLDS: 256 | - 0.3 257 | - 0.7 258 | LOSS_WEIGHT: 1.0 259 | NMS_THRESH: 0.7 260 | POSITIVE_FRACTION: 0.5 261 | POST_NMS_TOPK_TEST: 1000 262 | POST_NMS_TOPK_TRAIN: 2000 263 | PRE_NMS_TOPK_TEST: 6000 264 | PRE_NMS_TOPK_TRAIN: 12000 265 | SMOOTH_L1_BETA: 0.0 266 | SEM_SEG_HEAD: 267 | COMMON_STRIDE: 4 268 | CONVS_DIM: 128 269 | IGNORE_VALUE: 255 270 | IN_FEATURES: 271 | - p2 272 | - p3 273 | - p4 274 | - p5 275 | LOSS_WEIGHT: 1.0 276 | NAME: SemSegFPNHead 277 | NORM: GN 278 | NUM_CLASSES: 54 279 | SPARSE_INST: 280 | CLS_THRESHOLD: 0.005 281 | DATASET_MAPPER: SparseInstDatasetMapper 282 | DECODER: 283 | GROUPS: 4 284 | INST: 285 | CONVS: 4 286 | DIM: 256 287 | KERNEL_DIM: 128 288 | MASK: 289 | CONVS: 4 290 | DIM: 256 291 | NAME: GroupIAMDecoder 292 | NUM_CLASSES: 80 293 | NUM_MASKS: 100 294 | OUTPUT_IAM: false 295 | SCALE_FACTOR: 2.0 296 | ENCODER: 297 | IN_FEATURES: 298 | - res3 299 | - res4 300 | - res5 301 | NAME: InstanceContextEncoder 302 | NORM: '' 303 | NUM_CHANNELS: 256 304 | LOSS: 305 | CLASS_WEIGHT: 2.0 306 | ITEMS: 307 | - labels 308 | - masks 309 | MASK_DICE_WEIGHT: 2.0 310 | MASK_PIXEL_WEIGHT: 5.0 311 | NAME: SparseInstCriterion 312 | OBJECTNESS_WEIGHT: 1.0 313 | MASK_THRESHOLD: 0.45 314 | MATCHER: 315 | ALPHA: 0.8 316 | BETA: 0.2 317 | NAME: SparseInstMatcher 318 | MAX_DETECTIONS: 100 319 | WEIGHTS: sparse_inst_r50_giam_aug_2b7d68.pth 320 | OUTPUT_DIR: output/sparse_inst_r50_giam 321 | SEED: -1 322 | SOLVER: 323 | AMP: 324 | ENABLED: false 325 | AMSGRAD: false 326 | BACKBONE_MULTIPLIER: 1.0 327 | BASE_LR: 5.0e-05 328 | BIAS_LR_FACTOR: 1.0 329 | CHECKPOINT_PERIOD: 5000 330 | CLIP_GRADIENTS: 331 | CLIP_TYPE: value 332 | CLIP_VALUE: 1.0 333 | ENABLED: false 334 | NORM_TYPE: 2.0 335 | GAMMA: 0.1 336 | IMS_PER_BATCH: 64 337 | LR_SCHEDULER_NAME: WarmupMultiStepLR 338 | MAX_ITER: 270000 339 | MOMENTUM: 0.9 340 | NESTEROV: false 341 | OPTIMIZER: ADAMW 342 | REFERENCE_WORLD_SIZE: 0 343 | STEPS: 344 | - 210000 345 | - 250000 346 | WARMUP_FACTOR: 0.001 347 | WARMUP_ITERS: 1000 348 | WARMUP_METHOD: linear 349 | WEIGHT_DECAY: 0.05 350 | 
WEIGHT_DECAY_BIAS: null 351 | WEIGHT_DECAY_NORM: 0.0 352 | TEST: 353 | AUG: 354 | ENABLED: false 355 | FLIP: true 356 | MAX_SIZE: 4000 357 | MIN_SIZES: 358 | - 400 359 | - 500 360 | - 600 361 | - 700 362 | - 800 363 | - 900 364 | - 1000 365 | - 1100 366 | - 1200 367 | DETECTIONS_PER_IMAGE: 100 368 | EVAL_PERIOD: 7330 369 | EXPECTED_RESULTS: [] 370 | KEYPOINT_OKS_SIGMAS: [] 371 | PRECISE_BN: 372 | ENABLED: false 373 | NUM_ITER: 200 374 | VERSION: 2 375 | VIS_PERIOD: 0 376 | -------------------------------------------------------------------------------- /results/640_result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/640_result.jpg -------------------------------------------------------------------------------- /results/result_onnx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/result_onnx.png -------------------------------------------------------------------------------- /results/result_tensorrt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/result_tensorrt.png -------------------------------------------------------------------------------- /sparseinst/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparseinst import SparseInst 2 | from .encoder import build_sparse_inst_encoder 3 | from .decoder import build_sparse_inst_decoder 4 | from .config import add_sparse_inst_config 5 | from .loss import build_sparse_inst_criterion 6 | from .dataset_mapper import SparseInstDatasetMapper 7 | from .coco_evaluation import COCOMaskEvaluator 8 | from .backbones import build_resnet_vd_backbone, build_pyramid_vision_transformer 9 | from .d2_predictor import VisualizationDemo 10 | -------------------------------------------------------------------------------- /sparseinst/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/caffe2sparseinst.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/caffe2sparseinst.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/coco_evaluation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/coco_evaluation.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/d2_predictor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/d2_predictor.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/dataset_mapper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/dataset_mapper.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/decoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/decoder.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/encoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/encoder.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/loss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/loss.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/sparseinst.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/sparseinst.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import build_resnet_vd_backbone 2 | from .pvt import build_pyramid_vision_transformer 3 | from .cspnet import build_cspnet_backbone -------------------------------------------------------------------------------- /sparseinst/backbones/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
/sparseinst/backbones/__pycache__/cspnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/cspnet.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/__pycache__/pvt.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/pvt.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/__pycache__/resnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/resnet.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/cspnet.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from timm.models.layers import ConvBnAct, DropPath, AvgPool2dSame, create_attn 7 | 8 | 9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d 10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 11 | 12 | 13 | model_cfgs = dict( 14 | cspresnet50=dict( 15 | stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'), 16 | stage=dict( 17 | out_chs=(128, 256, 512, 1024), 18 | depth=(3, 3, 5, 2), 19 | stride=(1,) + (2,) * 3, 20 | exp_ratio=(2.,) * 4, 21 | bottle_ratio=(0.5,) * 4, 22 | block_ratio=(1.,) * 4, 23 | cross_linear=True, 24 | ) 25 | ), 26 | cspresnet50d=dict( 27 | stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'), 28 | stage=dict( 29 | out_chs=(128, 256, 512, 1024), 30 | depth=(3, 3, 5, 2), 31 | stride=(1,) + (2,) * 3, 32 | exp_ratio=(2.,) * 4, 33 | bottle_ratio=(0.5,) * 4, 34 | block_ratio=(1.,) * 4, 35 | cross_linear=True, 36 | ) 37 | ), 38 | cspresnet50w=dict( 39 | stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'), 40 | stage=dict( 41 | out_chs=(256, 512, 1024, 2048), 42 | depth=(3, 3, 5, 2), 43 | stride=(1,) + (2,) * 3, 44 | exp_ratio=(1.,) * 4, 45 | bottle_ratio=(0.25,) * 4, 46 | block_ratio=(0.5,) * 4, 47 | cross_linear=True, 48 | ) 49 | ), 50 | cspresnext50=dict( 51 | stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'), 52 | stage=dict( 53 | out_chs=(256, 512, 1024, 2048), 54 | depth=(3, 3, 5, 2), 55 | stride=(1,) + (2,) * 3, 56 | groups=(32,) * 4, 57 | exp_ratio=(1.,) * 4, 58 | bottle_ratio=(1.,) * 4, 59 | block_ratio=(0.5,) * 4, 60 | cross_linear=True, 61 | ) 62 | ), 63 | cspdarknet53=dict( 64 | stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''), 65 | stage=dict( 66 | out_chs=(64, 128, 256, 512, 1024), 67 | depth=(1, 2, 8, 8, 4), 68 | stride=(2,) * 5, 69 | exp_ratio=(2.,) + (1.,) * 4, 70 | bottle_ratio=(0.5,) + (1.0,) * 4, 71 | block_ratio=(1.,) + (0.5,) * 4, 72 | down_growth=True, 73 | ) 74 | ), 75 | darknet53=dict( 76 | stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''), 77 | stage=dict( 78 | out_chs=(64, 128, 256, 512, 1024), 79 | depth=(1, 2, 8, 8, 4), 80 | stride=(2,) * 5, 81 | bottle_ratio=(0.5,) * 5, 82 | block_ratio=(1.,) * 5, 83 | ) 84 | ) 85 | ) 86 | 87 | 88 | def create_stem( 
89 | in_chans=3, out_chs=32, kernel_size=3, stride=2, pool='', 90 | act_layer=None, norm_layer=None, aa_layer=None): 91 | stem = nn.Sequential() 92 | if not isinstance(out_chs, (tuple, list)): 93 | out_chs = [out_chs] 94 | assert len(out_chs) 95 | in_c = in_chans 96 | for i, out_c in enumerate(out_chs): 97 | conv_name = f'conv{i + 1}' 98 | stem.add_module(conv_name, ConvBnAct( 99 | in_c, out_c, kernel_size, stride=stride if i == 0 else 1, 100 | act_layer=act_layer, norm_layer=norm_layer)) 101 | in_c = out_c 102 | last_conv = conv_name 103 | if pool: 104 | if aa_layer is not None: 105 | stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) 106 | stem.add_module('aa', aa_layer(channels=in_c, stride=2)) 107 | else: 108 | stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) 109 | return stem, dict(num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv])) 110 | 111 | 112 | class ResBottleneck(nn.Module): 113 | """ ResNe(X)t Bottleneck Block 114 | """ 115 | 116 | def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.25, groups=1, 117 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_last=False, 118 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): 119 | super(ResBottleneck, self).__init__() 120 | mid_chs = int(round(out_chs * bottle_ratio)) 121 | ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, 122 | aa_layer=aa_layer, drop_block=drop_block) 123 | 124 | self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) 125 | self.conv2 = ConvBnAct(mid_chs, mid_chs, kernel_size=3, 126 | dilation=dilation, groups=groups, **ckwargs) 127 | self.attn2 = create_attn(attn_layer, channels=mid_chs) if not attn_last else None 128 | self.conv3 = ConvBnAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs) 129 | self.attn3 = create_attn(attn_layer, channels=out_chs) if attn_last else None 130 | self.drop_path = drop_path 131 | self.act3 = act_layer(inplace=True) 132 | 133 | def zero_init_last_bn(self): 134 | nn.init.zeros_(self.conv3.bn.weight) 135 | 136 | def forward(self, x): 137 | shortcut = x 138 | x = self.conv1(x) 139 | x = self.conv2(x) 140 | if self.attn2 is not None: 141 | x = self.attn2(x) 142 | x = self.conv3(x) 143 | if self.attn3 is not None: 144 | x = self.attn3(x) 145 | if self.drop_path is not None: 146 | x = self.drop_path(x) 147 | x = x + shortcut 148 | # FIXME partial shortcut needed if first block handled as per original, not used for my current impl 149 | #x[:, :shortcut.size(1)] += shortcut 150 | x = self.act3(x) 151 | return x 152 | 153 | 154 | class DarkBlock(nn.Module): 155 | """ DarkNet Block 156 | """ 157 | 158 | def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.5, groups=1, 159 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None, 160 | drop_block=None, drop_path=None): 161 | super(DarkBlock, self).__init__() 162 | mid_chs = int(round(out_chs * bottle_ratio)) 163 | ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, 164 | aa_layer=aa_layer, drop_block=drop_block) 165 | self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) 166 | self.conv2 = ConvBnAct(mid_chs, out_chs, kernel_size=3, 167 | dilation=dilation, groups=groups, **ckwargs) 168 | self.attn = create_attn(attn_layer, channels=out_chs) 169 | self.drop_path = drop_path 170 | 171 | def zero_init_last_bn(self): 172 | nn.init.zeros_(self.conv2.bn.weight) 173 | 174 | def forward(self, x): 175 | shortcut = x 176 | x = self.conv1(x) 177 | x = self.conv2(x) 178 | if 
self.attn is not None: 179 | x = self.attn(x) 180 | if self.drop_path is not None: 181 | x = self.drop_path(x) 182 | x = x + shortcut 183 | return x 184 | 185 | 186 | class CrossStage(nn.Module): 187 | """Cross Stage.""" 188 | 189 | def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., exp_ratio=1., 190 | groups=1, first_dilation=None, down_growth=False, cross_linear=False, block_dpr=None, 191 | block_fn=ResBottleneck, **block_kwargs): 192 | super(CrossStage, self).__init__() 193 | first_dilation = first_dilation or dilation 194 | down_chs = out_chs if down_growth else in_chs # grow downsample channels to output channels 195 | exp_chs = int(round(out_chs * exp_ratio)) 196 | block_out_chs = int(round(out_chs * block_ratio)) 197 | conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), 198 | norm_layer=block_kwargs.get('norm_layer')) 199 | 200 | if stride != 1 or first_dilation != dilation: 201 | self.conv_down = ConvBnAct( 202 | in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups, 203 | aa_layer=block_kwargs.get('aa_layer', None), **conv_kwargs) 204 | prev_chs = down_chs 205 | else: 206 | self.conv_down = None 207 | prev_chs = in_chs 208 | 209 | # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also, 210 | # there is also special case for the first stage for some of the model that results in uneven split 211 | # across the two paths. I did it this way for simplicity for now. 212 | self.conv_exp = ConvBnAct(prev_chs, exp_chs, kernel_size=1, 213 | apply_act=not cross_linear, **conv_kwargs) 214 | prev_chs = exp_chs // 2 # output of conv_exp is always split in two 215 | 216 | self.blocks = nn.Sequential() 217 | for i in range(depth): 218 | drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None 219 | self.blocks.add_module(str(i), block_fn( 220 | prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs)) 221 | prev_chs = block_out_chs 222 | 223 | # transition convs 224 | self.conv_transition_b = ConvBnAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs) 225 | self.conv_transition = ConvBnAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs) 226 | 227 | def forward(self, x): 228 | if self.conv_down is not None: 229 | x = self.conv_down(x) 230 | x = self.conv_exp(x) 231 | split = x.shape[1] // 2 232 | xs, xb = x[:, :split], x[:, split:] 233 | xb = self.blocks(xb) 234 | xb = self.conv_transition_b(xb).contiguous() 235 | out = self.conv_transition(torch.cat([xs, xb], dim=1)) 236 | return out 237 | 238 | 239 | class DarkStage(nn.Module): 240 | """DarkNet stage.""" 241 | 242 | def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., groups=1, 243 | first_dilation=None, block_fn=ResBottleneck, block_dpr=None, **block_kwargs): 244 | super(DarkStage, self).__init__() 245 | first_dilation = first_dilation or dilation 246 | 247 | self.conv_down = ConvBnAct( 248 | in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups, 249 | act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'), 250 | aa_layer=block_kwargs.get('aa_layer', None)) 251 | 252 | prev_chs = out_chs 253 | block_out_chs = int(round(out_chs * block_ratio)) 254 | self.blocks = nn.Sequential() 255 | for i in range(depth): 256 | drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None 257 | self.blocks.add_module(str(i), block_fn( 258 | prev_chs, block_out_chs, 
dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs)) 259 | prev_chs = block_out_chs 260 | 261 | def forward(self, x): 262 | x = self.conv_down(x) 263 | x = self.blocks(x) 264 | return x 265 | 266 | 267 | def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32, drop_path_rate=0.): 268 | # get per stage args for stage and containing blocks, calculate strides to meet target output_stride 269 | num_stages = len(cfg['depth']) 270 | if 'groups' not in cfg: 271 | cfg['groups'] = (1,) * num_stages 272 | if 'down_growth' in cfg and not isinstance(cfg['down_growth'], (list, tuple)): 273 | cfg['down_growth'] = (cfg['down_growth'],) * num_stages 274 | if 'cross_linear' in cfg and not isinstance(cfg['cross_linear'], (list, tuple)): 275 | cfg['cross_linear'] = (cfg['cross_linear'],) * num_stages 276 | cfg['block_dpr'] = [None] * num_stages if not drop_path_rate else \ 277 | [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg['depth'])).split(cfg['depth'])] 278 | stage_strides = [] 279 | stage_dilations = [] 280 | stage_first_dilations = [] 281 | dilation = 1 282 | for cfg_stride in cfg['stride']: 283 | stage_first_dilations.append(dilation) 284 | if curr_stride >= output_stride: 285 | dilation *= cfg_stride 286 | stride = 1 287 | else: 288 | stride = cfg_stride 289 | curr_stride *= stride 290 | stage_strides.append(stride) 291 | stage_dilations.append(dilation) 292 | cfg['stride'] = stage_strides 293 | cfg['dilation'] = stage_dilations 294 | cfg['first_dilation'] = stage_first_dilations 295 | stage_args = [dict(zip(cfg.keys(), values)) for values in zip(*cfg.values())] 296 | return stage_args 297 | 298 | 299 | class CSPNet(Backbone): 300 | """Cross Stage Partial base model. 301 | 302 | Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929 303 | Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks 304 | 305 | NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the 306 | darknet impl. I did it this way for simplicity and less special cases. 
307 | """ 308 | 309 | def __init__(self, cfg, in_chans=3, output_stride=32, global_pool='avg', drop_rate=0., 310 | act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_path_rate=0., 311 | zero_init_last_bn=True, stage_fn=CrossStage, block_fn=ResBottleneck, out_features=None): 312 | super().__init__() 313 | self.drop_rate = drop_rate 314 | assert output_stride in (8, 16, 32) 315 | layer_args = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer) 316 | 317 | # Construct the stem 318 | self.stem, stem_feat_info = create_stem(in_chans, **cfg['stem'], **layer_args) 319 | self.feature_info = [stem_feat_info] 320 | prev_chs = stem_feat_info['num_chs'] 321 | curr_stride = stem_feat_info['reduction'] # reduction does not include pool 322 | if cfg['stem']['pool']: 323 | curr_stride *= 2 324 | 325 | # Construct the stages 326 | per_stage_args = _cfg_to_stage_args( 327 | cfg['stage'], curr_stride=curr_stride, output_stride=output_stride, drop_path_rate=drop_path_rate) 328 | self.stages = nn.Sequential() 329 | out_channels = [] 330 | out_strides = [] 331 | for i, sa in enumerate(per_stage_args): 332 | self.stages.add_module( 333 | str(i), stage_fn(prev_chs, **sa, **layer_args, block_fn=block_fn)) 334 | prev_chs = sa['out_chs'] 335 | curr_stride *= sa['stride'] 336 | self.feature_info += [dict(num_chs=prev_chs, 337 | reduction=curr_stride, module=f'stages.{i}')] 338 | out_channels.append(prev_chs) 339 | out_strides.append(curr_stride) 340 | 341 | for m in self.modules(): 342 | if isinstance(m, nn.Conv2d): 343 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 344 | elif isinstance(m, nn.BatchNorm2d): 345 | nn.init.ones_(m.weight) 346 | nn.init.zeros_(m.bias) 347 | elif isinstance(m, nn.Linear): 348 | nn.init.normal_(m.weight, mean=0.0, std=0.01) 349 | nn.init.zeros_(m.bias) 350 | if zero_init_last_bn: 351 | for m in self.modules(): 352 | if hasattr(m, 'zero_init_last_bn'): 353 | m.zero_init_last_bn() 354 | 355 | # cspdarknet: csp1, csp2, csp3, csp4 356 | # cspresnet: csp0, csp1, csp2, csp3 357 | out_features_names = ["csp{}".format(i) for i in range(len(per_stage_args))] 358 | self._out_feature_strides = dict(zip(out_features_names, out_strides)) 359 | self._out_feature_channels = dict(zip(out_features_names, out_channels)) 360 | if out_features is None: 361 | self._out_features = out_features_names 362 | else: 363 | self._out_features = out_features 364 | 365 | def output_shape(self): 366 | return { 367 | name: ShapeSpec( 368 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 369 | ) 370 | for name in self._out_features 371 | } 372 | 373 | def size_divisibility(self): 374 | return 32 375 | 376 | def forward(self, x): 377 | x = self.stem(x) 378 | outputs = {} 379 | for i, stage in enumerate(self.stages): 380 | name = f"csp{i}" 381 | x = stage(x) 382 | if name in self._out_features: 383 | outputs[name] = x 384 | return outputs 385 | 386 | 387 | @BACKBONE_REGISTRY.register() 388 | def build_cspnet_backbone(cfg, input_shape=None): 389 | 390 | cspnet_name = cfg.MODEL.CSPNET.NAME 391 | norm_name = cfg.MODEL.CSPNET.NORM 392 | out_features = cfg.MODEL.CSPNET.OUT_FEATURES 393 | # DarkNet53 doesn't have batch norm 394 | if norm_name == "FrozenBN": 395 | norm = FrozenBatchNorm2d 396 | elif norm_name == "SyncBN": 397 | from detectron2.layers import NaiveSyncBatchNorm 398 | norm = NaiveSyncBatchNorm 399 | else: 400 | norm = nn.BatchNorm2d 401 | 402 | assert cspnet_name in ["cspresnet50", "cspresnet50d", "cspresnet50w", 403 | 
"cspresnext50", "cspdarknet53", "darknet53"] 404 | 405 | model_cfg = model_cfgs[cspnet_name] 406 | 407 | if "darknet" in cspnet_name: 408 | block_fn = DarkBlock 409 | else: 410 | block_fn = ResBottleneck 411 | 412 | if cspnet_name == "darknet53": 413 | stage_fn = DarkStage 414 | else: 415 | stage_fn = CrossStage 416 | 417 | model = CSPNet( 418 | model_cfg, 419 | in_chans=input_shape.channels, 420 | norm_layer=norm, 421 | stage_fn=stage_fn, 422 | block_fn=block_fn, 423 | out_features=out_features) 424 | return model 425 | -------------------------------------------------------------------------------- /sparseinst/backbones/pvt.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from functools import partial 6 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 7 | from detectron2.layers import ShapeSpec 8 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 9 | 10 | 11 | class Mlp(nn.Module): 12 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False): 13 | super().__init__() 14 | out_features = out_features or in_features 15 | hidden_features = hidden_features or in_features 16 | self.fc1 = nn.Linear(in_features, hidden_features) 17 | self.dwconv = DWConv(hidden_features) 18 | self.act = act_layer() 19 | self.fc2 = nn.Linear(hidden_features, out_features) 20 | self.drop = nn.Dropout(drop) 21 | self.linear = linear 22 | if self.linear: 23 | self.relu = nn.ReLU(inplace=True) 24 | self.apply(self._init_weights) 25 | 26 | def _init_weights(self, m): 27 | if isinstance(m, nn.Linear): 28 | trunc_normal_(m.weight, std=.02) 29 | if isinstance(m, nn.Linear) and m.bias is not None: 30 | nn.init.constant_(m.bias, 0) 31 | elif isinstance(m, nn.LayerNorm): 32 | nn.init.constant_(m.bias, 0) 33 | nn.init.constant_(m.weight, 1.0) 34 | elif isinstance(m, nn.Conv2d): 35 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 36 | fan_out //= m.groups 37 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 38 | if m.bias is not None: 39 | m.bias.data.zero_() 40 | 41 | def forward(self, x, H, W): 42 | x = self.fc1(x) 43 | if self.linear: 44 | x = self.relu(x) 45 | x = self.dwconv(x, H, W) 46 | x = self.act(x) 47 | x = self.drop(x) 48 | x = self.fc2(x) 49 | x = self.drop(x) 50 | return x 51 | 52 | 53 | class Attention(nn.Module): 54 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False): 55 | super().__init__() 56 | assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
57 | 58 | self.dim = dim 59 | self.num_heads = num_heads 60 | head_dim = dim // num_heads 61 | self.scale = qk_scale or head_dim ** -0.5 62 | 63 | self.q = nn.Linear(dim, dim, bias=qkv_bias) 64 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) 65 | self.attn_drop = nn.Dropout(attn_drop) 66 | self.proj = nn.Linear(dim, dim) 67 | self.proj_drop = nn.Dropout(proj_drop) 68 | 69 | self.linear = linear 70 | self.sr_ratio = sr_ratio 71 | if not linear: 72 | if sr_ratio > 1: 73 | self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) 74 | self.norm = nn.LayerNorm(dim) 75 | else: 76 | self.pool = nn.AdaptiveAvgPool2d(7) 77 | self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1) 78 | self.norm = nn.LayerNorm(dim) 79 | self.act = nn.GELU() 80 | self.apply(self._init_weights) 81 | 82 | def _init_weights(self, m): 83 | if isinstance(m, nn.Linear): 84 | trunc_normal_(m.weight, std=.02) 85 | if isinstance(m, nn.Linear) and m.bias is not None: 86 | nn.init.constant_(m.bias, 0) 87 | elif isinstance(m, nn.LayerNorm): 88 | nn.init.constant_(m.bias, 0) 89 | nn.init.constant_(m.weight, 1.0) 90 | elif isinstance(m, nn.Conv2d): 91 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 92 | fan_out //= m.groups 93 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 94 | if m.bias is not None: 95 | m.bias.data.zero_() 96 | 97 | def forward(self, x, H, W): 98 | B, N, C = x.shape 99 | q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) 100 | 101 | if not self.linear: 102 | if self.sr_ratio > 1: 103 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 104 | x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) 105 | x_ = self.norm(x_) 106 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 107 | else: 108 | kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 109 | else: 110 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 111 | x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1) 112 | x_ = self.norm(x_) 113 | x_ = self.act(x_) 114 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 115 | k, v = kv[0], kv[1] 116 | 117 | attn = (q @ k.transpose(-2, -1)) * self.scale 118 | attn = attn.softmax(dim=-1) 119 | attn = self.attn_drop(attn) 120 | 121 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 122 | x = self.proj(x) 123 | x = self.proj_drop(x) 124 | 125 | return x 126 | 127 | 128 | class Block(nn.Module): 129 | 130 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., 131 | drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False): 132 | super().__init__() 133 | self.norm1 = norm_layer(dim) 134 | self.attn = Attention( 135 | dim, 136 | num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, 137 | attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear) 138 | # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here 139 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 140 | self.norm2 = norm_layer(dim) 141 | mlp_hidden_dim = int(dim * mlp_ratio) 142 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear) 143 | 144 | self.apply(self._init_weights) 145 | 146 | def _init_weights(self, m): 147 | if isinstance(m, nn.Linear): 148 | trunc_normal_(m.weight, std=.02) 149 | if isinstance(m, nn.Linear) and m.bias is not None: 150 | nn.init.constant_(m.bias, 0) 151 | elif isinstance(m, nn.LayerNorm): 152 | nn.init.constant_(m.bias, 0) 153 | nn.init.constant_(m.weight, 1.0) 154 | elif isinstance(m, nn.Conv2d): 155 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 156 | fan_out //= m.groups 157 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 158 | if m.bias is not None: 159 | m.bias.data.zero_() 160 | 161 | def forward(self, x, H, W): 162 | x = x + self.drop_path(self.attn(self.norm1(x), H, W)) 163 | x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) 164 | 165 | return x 166 | 167 | 168 | class OverlapPatchEmbed(nn.Module): 169 | """ Image to Patch Embedding 170 | """ 171 | 172 | def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768): 173 | super().__init__() 174 | img_size = to_2tuple(img_size) 175 | patch_size = to_2tuple(patch_size) 176 | 177 | self.img_size = img_size 178 | self.patch_size = patch_size 179 | self.H, self.W = img_size[0] // stride, img_size[1] // stride 180 | self.num_patches = self.H * self.W 181 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, 182 | padding=(patch_size[0] // 2, patch_size[1] // 2)) 183 | self.norm = nn.LayerNorm(embed_dim) 184 | 185 | self.apply(self._init_weights) 186 | 187 | def _init_weights(self, m): 188 | if isinstance(m, nn.Linear): 189 | trunc_normal_(m.weight, std=.02) 190 | if isinstance(m, nn.Linear) and m.bias is not None: 191 | nn.init.constant_(m.bias, 0) 192 | elif isinstance(m, nn.LayerNorm): 193 | nn.init.constant_(m.bias, 0) 194 | nn.init.constant_(m.weight, 1.0) 195 | elif isinstance(m, nn.Conv2d): 196 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 197 | fan_out //= m.groups 198 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 199 | if m.bias is not None: 200 | m.bias.data.zero_() 201 | 202 | def forward(self, x): 203 | x = self.proj(x) 204 | _, _, H, W = x.shape 205 | x = x.flatten(2).transpose(1, 2) 206 | x = self.norm(x) 207 | 208 | return x, H, W 209 | 210 | 211 | class PyramidVisionTransformerV2(Backbone): 212 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dims=[64, 128, 256, 512], 213 | num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., 214 | attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, depths=[3, 4, 6, 3], 215 | sr_ratios=[8, 4, 2, 1], num_stages=4, linear=False, out_features=None): 216 | super().__init__() 217 | self.depths = depths 218 | self.num_stages = num_stages 219 | self.linear = linear 220 | 221 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule 222 | cur = 0 223 | 224 | for i in range(num_stages): 225 | patch_embed = OverlapPatchEmbed(img_size=img_size if i == 0 else img_size // (2 ** (i + 1)), 226 | patch_size=7 if i == 0 else 3, 227 | stride=4 if i == 0 else 2, 228 | in_chans=in_chans if i == 0 else embed_dims[i - 1], 229 | embed_dim=embed_dims[i]) 230 | 231 | block = nn.ModuleList([Block( 232 | dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], 
qkv_bias=qkv_bias, 233 | qk_scale=qk_scale, 234 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer, 235 | sr_ratio=sr_ratios[i], linear=linear) 236 | for j in range(depths[i])]) 237 | norm = norm_layer(embed_dims[i]) 238 | cur += depths[i] 239 | 240 | setattr(self, f"patch_embed{i + 1}", patch_embed) 241 | setattr(self, f"block{i + 1}", block) 242 | setattr(self, f"norm{i + 1}", norm) 243 | 244 | out_features_names = ["p1", "p2", "p3", "p4"] 245 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32])) 246 | self._out_feature_channels = dict(zip(out_features_names, embed_dims)) 247 | if out_features is None: 248 | self._out_features = out_features_names 249 | else: 250 | self._out_features = out_features 251 | self.out_features_names = out_features_names 252 | self.apply(self._init_weights) 253 | 254 | def _init_weights(self, m): 255 | if isinstance(m, nn.Linear): 256 | trunc_normal_(m.weight, std=.02) 257 | if isinstance(m, nn.Linear) and m.bias is not None: 258 | nn.init.constant_(m.bias, 0) 259 | elif isinstance(m, nn.LayerNorm): 260 | nn.init.constant_(m.bias, 0) 261 | nn.init.constant_(m.weight, 1.0) 262 | elif isinstance(m, nn.Conv2d): 263 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 264 | fan_out //= m.groups 265 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 266 | if m.bias is not None: 267 | m.bias.data.zero_() 268 | 269 | def freeze_patch_emb(self): 270 | self.patch_embed1.requires_grad = False 271 | 272 | @torch.jit.ignore 273 | def no_weight_decay(self): 274 | return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better 275 | 276 | 277 | def output_shape(self): 278 | return { 279 | name: ShapeSpec( 280 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 281 | ) 282 | for name in self._out_features 283 | } 284 | 285 | def size_divisibility(self): 286 | return 32 287 | 288 | 289 | def forward(self, x): 290 | B = x.shape[0] 291 | outputs = {} 292 | 293 | for i in range(self.num_stages): 294 | patch_embed = getattr(self, f"patch_embed{i + 1}") 295 | block = getattr(self, f"block{i + 1}") 296 | norm = getattr(self, f"norm{i + 1}") 297 | x, H, W = patch_embed(x) 298 | for blk in block: 299 | x = blk(x, H, W) 300 | x = norm(x) 301 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 302 | if self.out_features_names[i] in self._out_features: 303 | outputs[self.out_features_names[i]] = x 304 | return outputs 305 | 306 | 307 | class DWConv(nn.Module): 308 | def __init__(self, dim=768): 309 | super(DWConv, self).__init__() 310 | self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) 311 | 312 | def forward(self, x, H, W): 313 | B, N, C = x.shape 314 | x = x.transpose(1, 2).view(B, C, H, W) 315 | x = self.dwconv(x) 316 | x = x.flatten(2).transpose(1, 2) 317 | 318 | return x 319 | 320 | 321 | def _conv_filter(state_dict, patch_size=16): 322 | """ convert patch embedding weight from manual patchify + linear proj to conv""" 323 | out_dict = {} 324 | for k, v in state_dict.items(): 325 | if 'patch_embed.proj.weight' in k: 326 | v = v.reshape((v.shape[0], 3, patch_size, patch_size)) 327 | out_dict[k] = v 328 | 329 | return out_dict 330 | 331 | 332 | @BACKBONE_REGISTRY.register() 333 | def build_pyramid_vision_transformer(cfg, input_shape): 334 | name = cfg.MODEL.PVT.NAME 335 | linear = cfg.MODEL.PVT.LINEAR 336 | out_features = cfg.MODEL.PVT.OUT_FEATURES 337 | 338 | if linear: 339 | name = "b2" 340 | 341 | if name == 
"b0": 342 | embed_dims=[32, 64, 160, 256] 343 | else: 344 | embed_dims=[64, 128, 320, 512] 345 | 346 | depths = { 347 | "b0": [2, 2, 2, 2], 348 | "b1": [2, 2, 2, 2], 349 | "b2": [3, 4, 6, 3], 350 | "b3": [3, 4, 18, 3], 351 | "b4": [3, 8, 27, 3], 352 | "b5": [3, 6, 40, 3] 353 | } 354 | 355 | if name == "b5": 356 | mlp_ratios = [4, 4, 4, 4] 357 | else: 358 | mlp_ratios = [8, 8, 4, 4] 359 | 360 | in_channels = input_shape.channels 361 | 362 | return PyramidVisionTransformerV2( 363 | patch_size=4, 364 | depths=depths[name], 365 | in_chans=in_channels, 366 | embed_dims=embed_dims, 367 | num_heads=[1, 2, 5, 8], 368 | mlp_ratios=mlp_ratios, 369 | drop_rate=0.0, 370 | drop_path_rate=0.1, 371 | sr_ratios=[8, 4, 2, 1], 372 | qkv_bias=True, 373 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 374 | out_features=out_features, 375 | linear=linear 376 | ) 377 | 378 | -------------------------------------------------------------------------------- /sparseinst/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 3 | 4 | import math 5 | import torch.nn as nn 6 | from timm.models.resnet import BasicBlock, Bottleneck 7 | from timm.models.layers import DropBlock2d, DropPath, AvgPool2dSame 8 | 9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d 10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 11 | from detectron2.layers import NaiveSyncBatchNorm, DeformConv 12 | 13 | 14 | def get_padding(kernel_size, stride, dilation=1): 15 | padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 16 | return padding 17 | 18 | 19 | """ 20 | inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, 21 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, 22 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None 23 | """ 24 | 25 | 26 | class DeformableBottleneck(nn.Module): 27 | expansion = 4 28 | 29 | def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, 30 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, 31 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): 32 | super().__init__() 33 | 34 | width = int(math.floor(planes * (base_width / 64)) * cardinality) 35 | first_planes = width // reduce_first 36 | outplanes = planes * self.expansion 37 | first_dilation = first_dilation or dilation 38 | # use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) 39 | 40 | self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False) 41 | self.bn1 = norm_layer(first_planes) 42 | self.act1 = act_layer(inplace=True) 43 | 44 | self.conv2_offset = nn.Conv2d( 45 | first_planes, 46 | 18, 47 | kernel_size=3, 48 | stride=stride, 49 | padding=first_dilation, 50 | dilation=first_dilation 51 | ) 52 | self.conv2 = DeformConv( 53 | first_planes, 54 | width, 55 | kernel_size=3, 56 | stride=stride, 57 | padding=first_dilation, 58 | bias=False, 59 | dilation=first_dilation, 60 | ) 61 | 62 | self.bn2 = norm_layer(width) 63 | self.act2 = act_layer(inplace=True) 64 | # self.aa = aa_layer(channels=width, stride=stride) if use_aa else None 65 | 66 | self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False) 67 | self.bn3 = norm_layer(outplanes) 68 | 69 | # self.se = create_attn(attn_layer, outplanes) 70 | 71 | self.act3 = act_layer(inplace=True) 72 | self.downsample = downsample 73 | 
self.stride = stride 74 | self.dilation = dilation 75 | # self.drop_block = drop_block 76 | # self.drop_path = drop_path 77 | 78 | nn.init.constant_(self.conv2_offset.weight, 0) 79 | nn.init.constant_(self.conv2_offset.bias, 0) 80 | 81 | def zero_init_last_bn(self): 82 | nn.init.zeros_(self.bn3.weight) 83 | 84 | def forward(self, x): 85 | shortcut = x 86 | 87 | x = self.conv1(x) 88 | x = self.bn1(x) 89 | 90 | x = self.act1(x) 91 | 92 | offset = self.conv2_offset(x) 93 | x = self.conv2(x, offset) 94 | x = self.bn2(x) 95 | x = self.act2(x) 96 | 97 | x = self.conv3(x) 98 | x = self.bn3(x) 99 | 100 | if self.downsample is not None: 101 | shortcut = self.downsample(shortcut) 102 | x += shortcut 103 | x = self.act3(x) 104 | 105 | return x 106 | 107 | 108 | BLOCK_TYPE = { 109 | "basic": BasicBlock, 110 | "bottleneck": Bottleneck, 111 | "deform_bottleneck": DeformableBottleneck 112 | } 113 | 114 | 115 | def downsample_conv( 116 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): 117 | norm_layer = norm_layer or nn.BatchNorm2d 118 | kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size 119 | first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1 120 | p = get_padding(kernel_size, stride, first_dilation) 121 | 122 | return nn.Sequential(*[ 123 | nn.Conv2d( 124 | in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False), 125 | norm_layer(out_channels) 126 | ]) 127 | 128 | 129 | def downsample_avg( 130 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): 131 | norm_layer = norm_layer or nn.BatchNorm2d 132 | avg_stride = stride if dilation == 1 else 1 133 | if stride == 1 and dilation == 1: 134 | pool = nn.Identity() 135 | else: 136 | avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d 137 | pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) 138 | 139 | return nn.Sequential(*[ 140 | pool, 141 | nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False), 142 | norm_layer(out_channels) 143 | ]) 144 | 145 | 146 | def drop_blocks(drop_block_rate=0.): 147 | return [ 148 | None, None, 149 | DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None, 150 | DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None] 151 | 152 | 153 | def make_blocks( 154 | stage_block, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, 155 | down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): 156 | stages = [] 157 | feature_info = [] 158 | net_num_blocks = sum(block_repeats) 159 | net_block_idx = 0 160 | net_stride = 4 161 | dilation = prev_dilation = 1 162 | for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): 163 | # choose block_fn through the BLOCK_TYPE 164 | block_fn = BLOCK_TYPE[stage_block[stage_idx]] 165 | 166 | stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it 167 | stride = 1 if stage_idx == 0 else 2 168 | if net_stride >= output_stride: 169 | dilation *= stride 170 | stride = 1 171 | else: 172 | net_stride *= stride 173 | 174 | downsample = None 175 | if stride != 1 or inplanes != planes * block_fn.expansion: 176 | down_kwargs = dict( 177 | in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size, 178 | stride=stride, dilation=dilation, first_dilation=prev_dilation, 
norm_layer=kwargs.get('norm_layer')) 179 | downsample = downsample_avg( 180 | **down_kwargs) if avg_down else downsample_conv(**down_kwargs) 181 | 182 | block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs) 183 | blocks = [] 184 | for block_idx in range(num_blocks): 185 | downsample = downsample if block_idx == 0 else None 186 | stride = stride if block_idx == 0 else 1 187 | block_dpr = drop_path_rate * net_block_idx / \ 188 | (net_num_blocks - 1) # stochastic depth linear decay rule 189 | blocks.append(block_fn( 190 | inplanes, planes, stride, downsample, first_dilation=prev_dilation, 191 | drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs)) 192 | prev_dilation = dilation 193 | inplanes = planes * block_fn.expansion 194 | net_block_idx += 1 195 | 196 | stages.append((stage_name, nn.Sequential(*blocks))) 197 | feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name)) 198 | 199 | return stages, feature_info 200 | 201 | 202 | class ResNet(Backbone): 203 | """ResNet / ResNeXt / SE-ResNeXt / SE-Net 204 | 205 | This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that 206 | * have > 1 stride in the 3x3 conv layer of bottleneck 207 | * have conv-bn-act ordering 208 | 209 | This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s 210 | variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the 211 | 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. 212 | 213 | ResNet variants (the same modifications can be used in SE/ResNeXt models as well): 214 | * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b 215 | * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) 216 | * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample 217 | * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample 218 | * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) 219 | * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample 220 | * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample 221 | 222 | ResNeXt 223 | * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths 224 | * same c,d, e, s variants as ResNet can be enabled 225 | 226 | SE-ResNeXt 227 | * normal - 7x7 stem, stem_width = 64 228 | * same c, d, e, s variants as ResNet can be enabled 229 | 230 | SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, 231 | reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block 232 | 233 | Parameters 234 | ---------- 235 | block : Block 236 | Class for the residual block. Options are BasicBlockGl, BottleneckGl. 237 | layers : list of int 238 | Numbers of layers in each block 239 | num_classes : int, default 1000 240 | Number of classification classes. 241 | in_chans : int, default 3 242 | Number of input (color) channels. 243 | cardinality : int, default 1 244 | Number of convolution groups for 3x3 conv in Bottleneck. 245 | base_width : int, default 64 246 | Factor determining bottleneck channels. 
`planes * base_width / 64 * cardinality` 247 | stem_width : int, default 64 248 | Number of channels in stem convolutions 249 | stem_type : str, default '' 250 | The type of stem: 251 | * '', default - a single 7x7 conv with a width of stem_width 252 | * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2 253 | * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2 254 | block_reduce_first: int, default 1 255 | Reduction factor for first convolution output width of residual blocks, 256 | 1 for all archs except senets, where 2 257 | down_kernel_size: int, default 1 258 | Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for senets 259 | avg_down : bool, default False 260 | Whether to use average pooling for projection skip connection between stages/downsample. 261 | output_stride : int, default 32 262 | Set the output stride of the network, 32, 16, or 8. Typically used in segmentation. 263 | act_layer : nn.Module, activation layer 264 | norm_layer : nn.Module, normalization layer 265 | aa_layer : nn.Module, anti-aliasing layer 266 | drop_rate : float, default 0. 267 | Dropout probability before classifier, for training 268 | global_pool : str, default 'avg' 269 | Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax' 270 | """ 271 | 272 | def __init__(self, block_types, layers, in_chans=3, 273 | cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False, 274 | output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False, 275 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0., 276 | drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, out_features=None): 277 | block_args = block_args or dict() 278 | assert output_stride in (8, 16, 32) 279 | # self.num_classes = num_classes 280 | self.drop_rate = drop_rate 281 | super(ResNet, self).__init__() 282 | 283 | # Stem 284 | deep_stem = 'deep' in stem_type 285 | inplanes = stem_width * 2 if deep_stem else 64 286 | if deep_stem: 287 | stem_chs = (stem_width, stem_width) 288 | if 'tiered' in stem_type: 289 | stem_chs = (3 * (stem_width // 4), stem_width) 290 | self.conv1 = nn.Sequential(*[ 291 | nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False), 292 | norm_layer(stem_chs[0]), 293 | act_layer(inplace=True), 294 | nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False), 295 | norm_layer(stem_chs[1]), 296 | act_layer(inplace=True), 297 | nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)]) 298 | else: 299 | self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7, 300 | stride=2, padding=3, bias=False) 301 | self.bn1 = norm_layer(inplanes) 302 | self.act1 = act_layer(inplace=True) 303 | self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] 304 | 305 | # Stem Pooling 306 | if replace_stem_pool: 307 | self.maxpool = nn.Sequential(*filter(None, [ 308 | nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False), 309 | aa_layer(channels=inplanes, stride=2) if aa_layer else None, 310 | norm_layer(inplanes), 311 | act_layer(inplace=True) 312 | ])) 313 | else: 314 | if aa_layer is not None: 315 | self.maxpool = nn.Sequential(*[ 316 | nn.MaxPool2d(kernel_size=3, stride=1, padding=1), 317 | aa_layer(channels=inplanes, stride=2)]) 318 | else: 319 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 320 | 321 | # Feature Blocks 322 | channels = 
[64, 128, 256, 512] 323 | stage_modules, stage_feature_info = make_blocks( 324 | block_types, channels, layers, inplanes, cardinality=cardinality, base_width=base_width, 325 | output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, 326 | down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, 327 | drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) 328 | for stage in stage_modules: 329 | self.add_module(*stage) # layer1, layer2, etc 330 | self.feature_info.extend(stage_feature_info) 331 | 332 | for n, m in self.named_modules(): 333 | if isinstance(m, nn.BatchNorm2d): 334 | nn.init.constant_(m.weight, 1.) 335 | nn.init.constant_(m.bias, 0.) 336 | if zero_init_last_bn: 337 | for m in self.modules(): 338 | if hasattr(m, 'zero_init_last_bn'): 339 | m.zero_init_last_bn() 340 | 341 | out_features_names = ["res2", "res3", "res4", "res5"] 342 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32])) 343 | self._out_feature_channels = dict( 344 | zip(out_features_names, [x * BLOCK_TYPE[block_types[0]].expansion for x in [64, 128, 256, 512]])) 345 | if out_features is None: 346 | self._out_features = out_features_names 347 | else: 348 | self._out_features = out_features 349 | 350 | def output_shape(self): 351 | return { 352 | name: ShapeSpec( 353 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 354 | ) 355 | for name in self._out_features 356 | } 357 | 358 | def size_divisibility(self): 359 | return 32 360 | 361 | def forward(self, x): 362 | x = self.conv1(x) 363 | x = self.bn1(x) 364 | x = self.act1(x) 365 | x = self.maxpool(x) 366 | outputs = {} 367 | x = self.layer1(x) 368 | # outputs["res2"] = x 369 | x = self.layer2(x) 370 | outputs["res3"] = x 371 | x = self.layer3(x) 372 | outputs["res4"] = x 373 | x = self.layer4(x) 374 | outputs["res5"] = x 375 | return outputs 376 | 377 | 378 | @BACKBONE_REGISTRY.register() 379 | def build_resnet_vd_backbone(cfg, input_shape): 380 | 381 | depth = cfg.MODEL.RESNETS.DEPTH 382 | norm_name = cfg.MODEL.RESNETS.NORM 383 | if norm_name == "FrozenBN": 384 | norm = FrozenBatchNorm2d 385 | elif norm_name == "SyncBN": 386 | norm = NaiveSyncBatchNorm 387 | else: 388 | norm = nn.BatchNorm2d 389 | if depth == 50: 390 | layers = [3, 4, 6, 3] 391 | elif depth == 101: 392 | layers = [3, 4, 23, 3] 393 | else: 394 | raise NotImplementedError() 395 | 396 | stage_blocks = [] 397 | use_deformable = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE 398 | for idx in range(4): 399 | if use_deformable[idx]: 400 | stage_blocks.append("deform_bottleneck") 401 | else: 402 | stage_blocks.append("bottleneck") 403 | 404 | model = ResNet(stage_blocks, layers, stem_type="deep", 405 | stem_width=32, avg_down=True, norm_layer=norm) 406 | return model 407 | -------------------------------------------------------------------------------- /sparseinst/caffe2sparseinst.py: -------------------------------------------------------------------------------- 1 | from detectron2.export.caffe2_modeling import * 2 | from sparseinst import SparseInst 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | 6 | class Caffe2SparseInst(Caffe2MetaArch): 7 | def __init__(self, cfg, torch_model): 8 | assert isinstance(torch_model, SparseInst) 9 | # torch_model.backbone.size_divisibility = 32 10 | super().__init__(cfg, torch_model) 11 | self.torch_model = torch_model 12 | self.pixel_mean = self.torch_model.pixel_mean/255 13 | self.pixel_std = 
self.torch_model.pixel_std/255 14 | 15 | def get_caffe2_inputs(self, batched_inputs): 16 | inputs = super().get_caffe2_inputs(batched_inputs) 17 | return inputs[0]/255 18 | 19 | def encode_additional_info(self, predict_net, init_net): 20 | pass 21 | 22 | def normalizer(self, image): 23 | image = (image - self.pixel_mean) / self.pixel_std 24 | return image 25 | 26 | @mock_torch_nn_functional_interpolate() 27 | def forward(self, inputs): 28 | images = self.normalizer(inputs) 29 | images = ImageList.from_tensors([images], 32)[0] 30 | # forward 31 | features = self.torch_model.backbone(images) 32 | features = self.torch_model.encoder(features) 33 | output = self.torch_model.decoder(features) 34 | pred_scores = output["pred_logits"].sigmoid() 35 | pred_masks = output["pred_masks"].sigmoid() 36 | pred_objectness = output["pred_scores"].sigmoid() 37 | pred_scores2 = torch.sqrt(pred_scores * pred_objectness) 38 | 39 | # scores, masks = np.squeeze(pred_scores2), np.squeeze(pred_masks) 40 | # keep = torch.argmax(scores, axis=1) 41 | # masks = [masks[label, :, :] for i, label in enumerate(keep) if scores[i, label] > 0.35] 42 | # fig = plt.figure() 43 | # num_masks = len(masks) 44 | # for i, mask in enumerate(masks, 1): 45 | # fig.add_subplot(1, num_masks, i) 46 | # plt.imshow(mask.data.cpu()) 47 | # plt.show() 48 | # plt.ion() 49 | 50 | # return 51 | 52 | return pred_scores2, pred_masks 53 | 54 | @staticmethod 55 | def get_outputs_converter(predict_net, init_net): 56 | pass 57 | 58 | 59 | META_ARCH_CAFFE2_EXPORT_TYPE_MAP['SparseInst'] = Caffe2SparseInst -------------------------------------------------------------------------------- /sparseinst/coco_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycocotools.mask as mask_util 3 | from detectron2.structures import BoxMode 4 | from detectron2.evaluation import COCOEvaluator 5 | 6 | 7 | def instances_to_coco_json(instances, img_id): 8 | """ 9 | Dump an "Instances" object to a COCO-format json that's used for evaluation. 10 | 11 | Args: 12 | instances (Instances): 13 | img_id (int): the image id 14 | 15 | Returns: 16 | list[dict]: list of json annotations in COCO format. 17 | """ 18 | num_instance = len(instances) 19 | if num_instance == 0: 20 | return [] 21 | 22 | # NOTE: pure instance segmentation 23 | has_box = instances.has("pred_boxes") 24 | if has_box: 25 | boxes = instances.pred_boxes.tensor.numpy() 26 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 27 | boxes = boxes.tolist() 28 | 29 | scores = instances.scores.tolist() 30 | classes = instances.pred_classes.tolist() 31 | 32 | has_mask = instances.has("pred_masks") 33 | if has_mask: 34 | # use RLE to encode the masks, because they are too large and takes memory 35 | # since this evaluator stores outputs of the entire dataset 36 | rles = [ 37 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 38 | for mask in instances.pred_masks 39 | ] 40 | for rle in rles: 41 | # "counts" is an array encoded by mask_util as a byte-stream. Python3's 42 | # json writer which always produces strings cannot serialize a bytestream 43 | # unless you decode it. Thankfully, utf-8 works out (which is also what 44 | # the pycocotools/_mask.pyx does). 
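# --- Illustrative aside (not part of the original coco_evaluation.py) --------
# Minimal demonstration of the bytes-vs-JSON issue described in the comment
# above, using the same pycocotools call as this loop (toy mask, invented
# sizes); the decode on the next line is the actual per-instance fix.
import numpy as np
import pycocotools.mask as mask_util

toy_mask = np.zeros((8, 8), dtype=np.uint8)
toy_mask[2:5, 3:6] = 1
toy_rle = mask_util.encode(np.asfortranarray(toy_mask))
print(type(toy_rle["counts"]))                         # <class 'bytes'>
toy_rle["counts"] = toy_rle["counts"].decode("utf-8")  # now JSON-serializable
# ------------------------------------------------------------------------------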
45 | rle["counts"] = rle["counts"].decode("utf-8") 46 | 47 | has_keypoints = instances.has("pred_keypoints") 48 | if has_keypoints: 49 | keypoints = instances.pred_keypoints 50 | 51 | results = [] 52 | for k in range(num_instance): 53 | result = { 54 | "image_id": img_id, 55 | "category_id": classes[k], 56 | "score": scores[k], 57 | } 58 | if has_box: 59 | result["bbox"] = boxes[k] 60 | if has_mask: 61 | result["segmentation"] = rles[k] 62 | if has_keypoints: 63 | # In COCO annotations, 64 | # keypoints coordinates are pixel indices. 65 | # However our predictions are floating point coordinates. 66 | # Therefore we subtract 0.5 to be consistent with the annotation format. 67 | # This is the inverse of data loading logic in `datasets/coco.py`. 68 | keypoints[k][:, :2] -= 0.5 69 | result["keypoints"] = keypoints[k].flatten().tolist() 70 | results.append(result) 71 | return results 72 | 73 | 74 | class COCOMaskEvaluator(COCOEvaluator): 75 | 76 | def process(self, inputs, outputs): 77 | """ 78 | Args: 79 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 80 | It is a list of dict. Each dict corresponds to an image and 81 | contains keys like "height", "width", "file_name", "image_id". 82 | outputs: the outputs of a COCO model. It is a list of dicts with key 83 | "instances" that contains :class:`Instances`. 84 | """ 85 | for input, output in zip(inputs, outputs): 86 | prediction = {"image_id": input["image_id"]} 87 | 88 | if "instances" in output: 89 | instances = output["instances"].to(self._cpu_device) 90 | prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) 91 | if "proposals" in output: 92 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 93 | if len(prediction) > 1: 94 | self._predictions.append(prediction) -------------------------------------------------------------------------------- /sparseinst/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. 
All Rights Reserved 2 | 3 | from detectron2.config import CfgNode as CN 4 | 5 | def add_sparse_inst_config(cfg): 6 | 7 | cfg.MODEL.DEVICE = 'cuda' 8 | cfg.MODEL.MASK_ON = True 9 | # [SparseInst] 10 | cfg.MODEL.SPARSE_INST = CN() 11 | 12 | # parameters for inference 13 | cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005 14 | cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45 15 | cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100 16 | 17 | # [Encoder] 18 | cfg.MODEL.SPARSE_INST.ENCODER = CN() 19 | cfg.MODEL.SPARSE_INST.ENCODER.NAME = "FPNEncoder" 20 | cfg.MODEL.SPARSE_INST.ENCODER.NORM = "" 21 | cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES = ["res3", "res4", "res5"] 22 | cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS = 256 23 | 24 | # [Decoder] 25 | cfg.MODEL.SPARSE_INST.DECODER = CN() 26 | cfg.MODEL.SPARSE_INST.DECODER.NAME = "BaseIAMDecoder" 27 | cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100 28 | cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80 29 | # kernels for mask features 30 | cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128 31 | # upsample factor for output masks 32 | cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2.0 33 | cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False 34 | cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4 35 | # decoder.inst_branch 36 | cfg.MODEL.SPARSE_INST.DECODER.INST = CN() 37 | cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256 38 | cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4 39 | # decoder.mask_branch 40 | cfg.MODEL.SPARSE_INST.DECODER.MASK = CN() 41 | cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256 42 | cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4 43 | 44 | # [Loss] 45 | cfg.MODEL.SPARSE_INST.LOSS = CN() 46 | cfg.MODEL.SPARSE_INST.LOSS.NAME = "SparseInstCriterion" 47 | cfg.MODEL.SPARSE_INST.LOSS.ITEMS = ("labels", "masks") 48 | # loss weight 49 | cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT = 2.0 50 | cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT = 5.0 51 | cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT = 2.0 52 | # iou-aware objectness loss weight 53 | cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT = 1.0 54 | 55 | # [Matcher] 56 | cfg.MODEL.SPARSE_INST.MATCHER = CN() 57 | cfg.MODEL.SPARSE_INST.MATCHER.NAME = "SparseInstMatcher" 58 | cfg.MODEL.SPARSE_INST.MATCHER.ALPHA = 0.8 59 | cfg.MODEL.SPARSE_INST.MATCHER.BETA = 0.2 60 | 61 | # [Optimizer] 62 | cfg.SOLVER.OPTIMIZER = "ADAMW" 63 | cfg.SOLVER.BACKBONE_MULTIPLIER = 1.0 64 | cfg.SOLVER.AMSGRAD = False 65 | 66 | # [Dataset mapper] 67 | cfg.MODEL.SPARSE_INST.DATASET_MAPPER = "SparseInstDatasetMapper" 68 | 69 | # [Pyramid Vision Transformer] 70 | cfg.MODEL.PVT = CN() 71 | cfg.MODEL.PVT.NAME = "b1" 72 | cfg.MODEL.PVT.OUT_FEATURES = ["p2", "p3", "p4"] 73 | cfg.MODEL.PVT.LINEAR = False 74 | 75 | cfg.MODEL.CSPNET = CN() 76 | cfg.MODEL.CSPNET.NAME = "darknet53" 77 | cfg.MODEL.CSPNET.NORM = "" 78 | # (csp-)darknet: csp1, csp2, csp3, csp4 79 | cfg.MODEL.CSPNET.OUT_FEATURES = ["csp1", "csp2", "csp3", "csp4"] 80 | 81 | -------------------------------------------------------------------------------- /sparseinst/d2_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import atexit 3 | import bisect 4 | import multiprocessing as mp 5 | from collections import deque 6 | import cv2 7 | import torch 8 | 9 | from detectron2.data import MetadataCatalog 10 | from detectron2.engine.defaults import DefaultPredictor 11 | from detectron2.utils.video_visualizer import VideoVisualizer 12 | from detectron2.utils.visualizer import ColorMode, Visualizer 13 | 14 | 15 | class VisualizationDemo(object): 16 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 17 | """ 18 | Args: 19 | cfg (CfgNode): 20 | instance_mode (ColorMode): 21 | parallel (bool): whether to run the model in different processes from visualization. 22 | Useful since the visualization logic can be slow. 23 | """ 24 | self.img_format = cfg.INPUT.FORMAT 25 | self.metadata = MetadataCatalog.get( 26 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 27 | ) 28 | self.cpu_device = torch.device("cpu") 29 | self.instance_mode = instance_mode 30 | 31 | self.parallel = parallel 32 | if parallel: 33 | num_gpu = torch.cuda.device_count() 34 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 35 | else: 36 | self.predictor = DefaultPredictor(cfg) 37 | 38 | def run_on_image(self, image, confidence_threshold): 39 | """ 40 | Args: 41 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 42 | This is the format used by OpenCV. 43 | 44 | Returns: 45 | predictions (dict): the output of the model. 46 | vis_output (VisImage): the visualized image output. 47 | """ 48 | vis_output = None 49 | predictions = self.predictor(image) 50 | visualizer = Visualizer(image, self.metadata, 51 | instance_mode=self.instance_mode) 52 | if "panoptic_seg" in predictions: 53 | panoptic_seg, segments_info = predictions["panoptic_seg"] 54 | vis_output = visualizer.draw_panoptic_seg_predictions( 55 | panoptic_seg.to(self.cpu_device), segments_info 56 | ) 57 | else: 58 | if "sem_seg" in predictions: 59 | vis_output = visualizer.draw_sem_seg( 60 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 61 | ) 62 | if "instances" in predictions: 63 | instances = predictions["instances"].to(self.cpu_device) 64 | instances = instances[instances.scores > confidence_threshold] 65 | predictions["instances"] = instances 66 | vis_output = visualizer.draw_instance_predictions( 67 | predictions=instances) 68 | 69 | return predictions, vis_output 70 | 71 | def _frame_from_video(self, video): 72 | while video.isOpened(): 73 | success, frame = video.read() 74 | if success: 75 | yield frame 76 | else: 77 | break 78 | 79 | def run_on_video(self, video, confidence_threshold): 80 | """ 81 | Visualizes predictions on frames of the input video. 82 | 83 | Args: 84 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 85 | either a webcam or a video file. 86 | 87 | Yields: 88 | ndarray: BGR visualizations of each video frame. 
89 | """ 90 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 91 | 92 | def process_predictions(frame, predictions): 93 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) 94 | if "panoptic_seg" in predictions: 95 | panoptic_seg, segments_info = predictions["panoptic_seg"] 96 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 97 | frame, panoptic_seg.to(self.cpu_device), segments_info 98 | ) 99 | elif "instances" in predictions: 100 | predictions = predictions["instances"].to(self.cpu_device) 101 | predictions = predictions[predictions.scores > 102 | confidence_threshold] 103 | vis_frame = video_visualizer.draw_instance_predictions( 104 | frame, predictions) 105 | elif "sem_seg" in predictions: 106 | vis_frame = video_visualizer.draw_sem_seg( 107 | frame, predictions["sem_seg"].argmax( 108 | dim=0).to(self.cpu_device) 109 | ) 110 | 111 | # Converts Matplotlib RGB format to OpenCV BGR format 112 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 113 | return vis_frame 114 | 115 | frame_gen = self._frame_from_video(video) 116 | if self.parallel: 117 | buffer_size = self.predictor.default_buffer_size 118 | 119 | frame_data = deque() 120 | 121 | for cnt, frame in enumerate(frame_gen): 122 | frame_data.append(frame) 123 | self.predictor.put(frame) 124 | 125 | if cnt >= buffer_size: 126 | frame = frame_data.popleft() 127 | predictions = self.predictor.get() 128 | yield process_predictions(frame, predictions) 129 | 130 | while len(frame_data): 131 | frame = frame_data.popleft() 132 | predictions = self.predictor.get() 133 | yield process_predictions(frame, predictions) 134 | else: 135 | for frame in frame_gen: 136 | yield process_predictions(frame, self.predictor(frame)) 137 | 138 | 139 | class AsyncPredictor: 140 | """ 141 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 142 | Because rendering the visualization takes considerably amount of time, 143 | this helps improve throughput a little bit when rendering videos. 
144 | """ 145 | 146 | class _StopToken: 147 | pass 148 | 149 | class _PredictWorker(mp.Process): 150 | def __init__(self, cfg, task_queue, result_queue): 151 | self.cfg = cfg 152 | self.task_queue = task_queue 153 | self.result_queue = result_queue 154 | super().__init__() 155 | 156 | def run(self): 157 | predictor = DefaultPredictor(self.cfg) 158 | 159 | while True: 160 | task = self.task_queue.get() 161 | if isinstance(task, AsyncPredictor._StopToken): 162 | break 163 | idx, data = task 164 | result = predictor(data) 165 | self.result_queue.put((idx, result)) 166 | 167 | def __init__(self, cfg, num_gpus: int = 1): 168 | """ 169 | Args: 170 | cfg (CfgNode): 171 | num_gpus (int): if 0, will run on CPU 172 | """ 173 | num_workers = max(num_gpus, 1) 174 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 175 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 176 | self.procs = [] 177 | for gpuid in range(max(num_gpus, 1)): 178 | cfg = cfg.clone() 179 | cfg.defrost() 180 | cfg.MODEL.DEVICE = "cuda:{}".format( 181 | gpuid) if num_gpus > 0 else "cpu" 182 | self.procs.append( 183 | AsyncPredictor._PredictWorker( 184 | cfg, self.task_queue, self.result_queue) 185 | ) 186 | 187 | self.put_idx = 0 188 | self.get_idx = 0 189 | self.result_rank = [] 190 | self.result_data = [] 191 | 192 | for p in self.procs: 193 | p.start() 194 | atexit.register(self.shutdown) 195 | 196 | def put(self, image): 197 | self.put_idx += 1 198 | self.task_queue.put((self.put_idx, image)) 199 | 200 | def get(self): 201 | self.get_idx += 1 # the index needed for this request 202 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 203 | res = self.result_data[0] 204 | del self.result_data[0], self.result_rank[0] 205 | return res 206 | 207 | while True: 208 | # make sure the results are returned in the correct order 209 | idx, res = self.result_queue.get() 210 | if idx == self.get_idx: 211 | return res 212 | insert = bisect.bisect(self.result_rank, idx) 213 | self.result_rank.insert(insert, idx) 214 | self.result_data.insert(insert, res) 215 | 216 | def __len__(self): 217 | return self.put_idx - self.get_idx 218 | 219 | def __call__(self, image): 220 | self.put(image) 221 | return self.get() 222 | 223 | def shutdown(self): 224 | for _ in self.procs: 225 | self.task_queue.put(AsyncPredictor._StopToken()) 226 | 227 | @property 228 | def default_buffer_size(self): 229 | return len(self.procs) * 5 -------------------------------------------------------------------------------- /sparseinst/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import numpy as np 4 | import torch 5 | 6 | 7 | from detectron2.data import detection_utils as utils 8 | from detectron2.data import transforms as T 9 | 10 | """ 11 | This file contains the default mapping that's applied to "dataset dicts". 12 | """ 13 | 14 | __all__ = ["SparseInstDatasetMapper"] 15 | 16 | 17 | def build_transform_gen(cfg, is_train): 18 | """ 19 | Create a list of default :class:`Augmentation` from config. 20 | Now it includes resizing and flipping. 
21 | Returns: 22 | list[Augmentation] 23 | """ 24 | augmentation = [] 25 | 26 | if is_train: 27 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 28 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 29 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 30 | else: 31 | min_size = cfg.INPUT.MIN_SIZE_TEST 32 | max_size = cfg.INPUT.MAX_SIZE_TEST 33 | sample_style = "choice" 34 | if is_train and cfg.INPUT.RANDOM_FLIP != "none": 35 | augmentation.append( 36 | T.RandomFlip( 37 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 38 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 39 | ) 40 | ) 41 | if is_train: 42 | augmentation.append( 43 | T.ResizeShortestEdge(min_size, max_size, sample_style) 44 | ) 45 | return augmentation 46 | 47 | 48 | class SparseInstDatasetMapper: 49 | """ 50 | A callable which takes a dataset dict in Detectron2 Dataset format, 51 | and map it into a format used by the model. 52 | This is the default callable to be used to map your dataset dict into training data. 53 | You may need to follow it to implement your own one for customized logic, 54 | such as a different way to read or transform images. 55 | See :doc:`/tutorials/data_loading` for details. 56 | The callable currently does the following: 57 | 1. Read the image from "file_name" 58 | 2. Applies cropping/geometric transforms to the image and annotations 59 | 3. Prepare data and annotations to Tensor and :class:`Instances` 60 | """ 61 | # @classmethod 62 | 63 | def __init__(self, cfg, is_train: bool = True): 64 | augs = build_transform_gen(cfg, is_train) 65 | self.default_aug = T.AugmentationList(augs) 66 | if cfg.INPUT.CROP.ENABLED and is_train: 67 | crop_gen = [ 68 | T.ResizeShortestEdge([400, 500, 600], sample_style='choice'), 69 | T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) 70 | ] 71 | recompute_boxes = cfg.MODEL.MASK_ON 72 | augs = augs[:-1] + crop_gen + augs[-1:] 73 | self.crop_aug = T.AugmentationList(augs) 74 | else: 75 | self.crop_aug = None 76 | recompute_boxes = False 77 | 78 | # self.augs = augs 79 | self.is_train = is_train 80 | self.image_format = cfg.INPUT.FORMAT 81 | self.use_instance_mask = cfg.MODEL.MASK_ON 82 | self.instance_mask_format = cfg.INPUT.MASK_FORMAT 83 | self.recompute_boxes = recompute_boxes 84 | 85 | logger = logging.getLogger(__name__) 86 | mode = "training" if is_train else "inference" 87 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augs}") 88 | 89 | def __call__(self, dataset_dict): 90 | """ 91 | Args: 92 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 97 | # USER: Write your own image loading if it's not from a file 98 | image = utils.read_image(dataset_dict["file_name"], format=self.image_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | # USER: Remove if you don't do semantic/panoptic segmentation. 
102 | if "sem_seg_file_name" in dataset_dict: 103 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2) 104 | else: 105 | sem_seg_gt = None 106 | 107 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 108 | 109 | if self.crop_aug is None: 110 | transforms = self.default_aug(aug_input) 111 | else: 112 | if np.random.rand() > 0.5: 113 | transforms = self.crop_aug(aug_input) 114 | else: 115 | transforms = self.default_aug(aug_input) 116 | # transforms = self.augmentations(aug_input) 117 | image, sem_seg_gt = aug_input.image, aug_input.sem_seg 118 | 119 | image_shape = image.shape[:2] # h, w 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | if sem_seg_gt is not None: 125 | dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) 126 | 127 | if not self.is_train: 128 | # USER: Modify this if you want to keep them for some reason. 129 | dataset_dict.pop("annotations", None) 130 | dataset_dict.pop("sem_seg_file_name", None) 131 | return dataset_dict 132 | 133 | if "annotations" in dataset_dict: 134 | # USER: Modify this if you want to keep them for some reason. 135 | for anno in dataset_dict["annotations"]: 136 | anno.pop("keypoints", None) 137 | if not self.use_instance_mask: 138 | anno.pop("segmentation", None) 139 | 140 | # USER: Implement additional transformations if you have other types of data 141 | annos = [ 142 | utils.transform_instance_annotations( 143 | obj, transforms, image_shape) 144 | for obj in dataset_dict.pop("annotations") 145 | if obj.get("iscrowd", 0) == 0 146 | ] 147 | instances = utils.annotations_to_instances( 148 | annos, image_shape, mask_format=self.instance_mask_format 149 | ) 150 | 151 | # After transforms such as cropping are applied, the bounding box may no longer 152 | # tightly bound the object. As an example, imagine a triangle object 153 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 154 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 155 | # the intersection of original bounding box and the cropping box. 
156 | if self.recompute_boxes: 157 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 158 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 159 | return dataset_dict -------------------------------------------------------------------------------- /sparseinst/decoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import init 5 | import torch.nn.functional as F 6 | from torch.utils.checkpoint import checkpoint 7 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill 8 | 9 | from detectron2.utils.registry import Registry 10 | from detectron2.layers import Conv2d 11 | 12 | SPARSE_INST_DECODER_REGISTRY = Registry("SPARSE_INST_DECODER") 13 | SPARSE_INST_DECODER_REGISTRY.__doc__ = "registry for SparseInst decoder" 14 | 15 | def _make_stack_3x3_convs(num_convs, in_channels, out_channels): 16 | convs = [] 17 | for _ in range(num_convs): 18 | convs.append( 19 | Conv2d(in_channels, out_channels, 3, padding=1)) 20 | convs.append(nn.ReLU(True)) 21 | in_channels = out_channels 22 | return nn.Sequential(*convs) 23 | 24 | 25 | class InstanceBranch(nn.Module): 26 | 27 | def __init__(self, cfg, in_channels): 28 | super().__init__() 29 | # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM 30 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 31 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 32 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 33 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 34 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 35 | 36 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 37 | # iam prediction, a simple conv 38 | self.iam_conv = nn.Conv2d(dim, num_masks, 3, padding=1) 39 | 40 | # outputs 41 | self.cls_score = nn.Linear(dim, self.num_classes) 42 | self.mask_kernel = nn.Linear(dim, kernel_dim) 43 | self.objectness = nn.Linear(dim, 1) 44 | 45 | self.prior_prob = 0.01 46 | self._init_weights() 47 | 48 | def _init_weights(self): 49 | for m in self.inst_convs.modules(): 50 | if isinstance(m, nn.Conv2d): 51 | c2_msra_fill(m) 52 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) 53 | for module in [self.iam_conv, self.cls_score]: 54 | init.constant_(module.bias, bias_value) 55 | init.normal_(self.iam_conv.weight, std=0.01) 56 | init.normal_(self.cls_score.weight, std=0.01) 57 | 58 | init.normal_(self.mask_kernel.weight, std=0.01) 59 | init.constant_(self.mask_kernel.bias, 0.0) 60 | 61 | def forward(self, features): 62 | # instance features (x4 convs) 63 | features = self.inst_convs(features) 64 | # predict instance activation maps 65 | iam = self.iam_conv(features) 66 | iam_prob = iam.sigmoid() 67 | 68 | B, N = iam_prob.shape[:2] 69 | C = features.size(1) 70 | # BxNxHxW -> BxNx(HW) 71 | iam_prob = iam_prob.view(B, N, -1) 72 | # aggregate features: BxCxHxW -> Bx(HW)xC 73 | inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1)) 74 | normalizer = iam_prob.sum(-1).clamp(min=1e-6) 75 | inst_features = inst_features / normalizer[:, :, None] 76 | # predict classification & segmentation kernel & objectness 77 | pred_logits = self.cls_score(inst_features) 78 | pred_kernel = self.mask_kernel(inst_features) 79 | pred_scores = self.objectness(inst_features) 80 | return pred_logits, pred_kernel, pred_scores, iam 81 | 82 | 83 | class MaskBranch(nn.Module): 84 | 85 | def __init__(self, cfg, in_channels): 86 | super().__init__() 87 | dim = 
cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM 88 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS 89 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 90 | self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 91 | self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1) 92 | self._init_weights() 93 | 94 | def _init_weights(self): 95 | for m in self.mask_convs.modules(): 96 | if isinstance(m, nn.Conv2d): 97 | c2_msra_fill(m) 98 | c2_msra_fill(self.projection) 99 | 100 | def forward(self, features): 101 | # mask features (x4 convs) 102 | features = self.mask_convs(features) 103 | # features = checkpoint(self.mask_convs,features) 104 | return self.projection(features) 105 | 106 | 107 | @SPARSE_INST_DECODER_REGISTRY.register() 108 | class BaseIAMDecoder(nn.Module): 109 | 110 | def __init__(self, cfg): 111 | super().__init__() 112 | # add 2 for coordinates 113 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 114 | 115 | self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR 116 | self.output_iam = cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM 117 | 118 | self.inst_branch = InstanceBranch(cfg, in_channels) 119 | self.mask_branch = MaskBranch(cfg, in_channels) 120 | 121 | @torch.no_grad() 122 | def compute_coordinates(self, x): 123 | h, w = x.size(2), x.size(3) 124 | input_1 = -1 125 | input_1 = int(input_1) 126 | input_3 = 1 127 | input_3= int(input_3) 128 | input_2 = h 129 | input_2= int(input_2) 130 | input_4 = w 131 | input_4= int(input_4) 132 | 133 | y_loc = torch.linspace(input_1, input_3, input_2, device=x.device) 134 | x_loc = torch.linspace(input_1, input_3, input_4, device=x.device) 135 | #y_loc = torch.arange(-1, 1+(2/h), 2/(h-1), device=x.device) 136 | #x_loc = torch.arange(-1, 1+(2/w), 2/(w-1), device=x.device) 137 | y_loc, x_loc = torch.meshgrid(y_loc, x_loc) 138 | y_loc = y_loc.expand([x.shape[0], 1, -1, -1]) 139 | x_loc = x_loc.expand([x.shape[0], 1, -1, -1]) 140 | locations = torch.cat([x_loc, y_loc], 1) 141 | return locations.to(x) 142 | 143 | def forward(self, features): 144 | coord_features = self.compute_coordinates(features) 145 | features = torch.cat([coord_features, features], dim=1) 146 | pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features) 147 | mask_features = self.mask_branch(features) 148 | 149 | N = pred_kernel.shape[1] 150 | # mask_features: BxCxHxW 151 | B, C, H, W = mask_features.shape 152 | pred_masks = torch.bmm(pred_kernel, mask_features.view(B, C, H * W)).view(B, N, H, W) 153 | 154 | pred_masks = F.interpolate( 155 | pred_masks, scale_factor=self.scale_factor, 156 | mode='bilinear', align_corners=False) 157 | 158 | output = { 159 | "pred_logits": pred_logits, 160 | "pred_masks": pred_masks, 161 | "pred_scores": pred_scores, 162 | } 163 | 164 | if self.output_iam: 165 | iam = F.interpolate(iam, scale_factor=self.scale_factor, 166 | mode='bilinear', align_corners=False) 167 | output['pred_iam'] = iam 168 | 169 | return output 170 | 171 | 172 | class GroupInstanceBranch(nn.Module): 173 | 174 | def __init__(self, cfg, in_channels): 175 | super().__init__() 176 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 177 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 178 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 179 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 180 | self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS 181 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 182 | 183 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 184 | # iam 
prediction, a group conv 185 | expand_dim = dim * self.num_groups 186 | self.iam_conv = nn.Conv2d(dim, num_masks * self.num_groups, 3, padding=1, groups=self.num_groups) 187 | # outputs 188 | self.fc = nn.Linear(expand_dim, expand_dim) 189 | 190 | self.cls_score = nn.Linear(expand_dim, self.num_classes) 191 | self.mask_kernel = nn.Linear(expand_dim, kernel_dim) 192 | self.objectness = nn.Linear(expand_dim, 1) 193 | 194 | self.prior_prob = 0.01 195 | self._init_weights() 196 | 197 | def _init_weights(self): 198 | for m in self.inst_convs.modules(): 199 | if isinstance(m, nn.Conv2d): 200 | c2_msra_fill(m) 201 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) 202 | for module in [self.iam_conv, self.cls_score]: 203 | init.constant_(module.bias, bias_value) 204 | init.normal_(self.iam_conv.weight, std=0.01) 205 | init.normal_(self.cls_score.weight, std=0.01) 206 | 207 | init.normal_(self.mask_kernel.weight, std=0.01) 208 | init.constant_(self.mask_kernel.bias, 0.0) 209 | c2_xavier_fill(self.fc) 210 | 211 | def forward(self, features): 212 | # instance features (x4 convs) 213 | features = self.inst_convs(features) 214 | # predict instance activation maps 215 | iam = self.iam_conv(features) 216 | iam_prob = iam.sigmoid() 217 | 218 | B, N = iam_prob.shape[:2] 219 | C = features.size(1) 220 | # BxNxHxW -> BxNx(HW) 221 | iam_prob = iam_prob.view(B, N, -1) 222 | # aggregate features: BxCxHxW -> Bx(HW)xC 223 | inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1)) 224 | normalizer = iam_prob.sum(-1).clamp(min=1e-6) 225 | inst_features = inst_features / normalizer[:, :, None] 226 | 227 | inst_features = inst_features.reshape( 228 | B, 4, N // 4, -1).transpose(1, 2).reshape(B, N // 4, -1) 229 | 230 | inst_features = F.relu_(self.fc(inst_features)) 231 | # predict classification & segmentation kernel & objectness 232 | pred_logits = self.cls_score(inst_features) 233 | pred_kernel = self.mask_kernel(inst_features) 234 | pred_scores = self.objectness(inst_features) 235 | return pred_logits, pred_kernel, pred_scores, iam 236 | 237 | 238 | 239 | @SPARSE_INST_DECODER_REGISTRY.register() 240 | class GroupIAMDecoder(BaseIAMDecoder): 241 | 242 | def __init__(self, cfg): 243 | super().__init__(cfg) 244 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 245 | self.inst_branch = GroupInstanceBranch(cfg, in_channels) 246 | 247 | 248 | 249 | def build_sparse_inst_decoder(cfg): 250 | name = cfg.MODEL.SPARSE_INST.DECODER.NAME 251 | return SPARSE_INST_DECODER_REGISTRY.get(name)(cfg) 252 | -------------------------------------------------------------------------------- /sparseinst/encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | import torch.nn.functional as F 6 | 7 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill 8 | 9 | from detectron2.utils.registry import Registry 10 | from detectron2.layers import Conv2d 11 | 12 | SPARSE_INST_ENCODER_REGISTRY = Registry("SPARSE_INST_ENCODER") 13 | SPARSE_INST_ENCODER_REGISTRY.__doc__ = "registry for SparseInst decoder" 14 | 15 | 16 | class PyramidPoolingModule(nn.Module): 17 | 18 | def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)): 19 | super().__init__() 20 | self.stages = [] 21 | self.stages = nn.ModuleList( 22 | [self._make_stage(in_channels, channels, size) for size in sizes] 23 | ) 24 | self.bottleneck = Conv2d( 25 | in_channels + len(sizes) * channels, in_channels, 
1) 26 | 27 | def _make_stage(self, features, out_features, size): 28 | # prior = nn.AdaptiveAvgPool2d(output_size=(size, size)) 29 | stride = np.floor(10/size).astype(np.int32) 30 | kernel = 10-(size-1)*stride 31 | # print(size, stride, kernel) 32 | prior = torch.nn.AvgPool2d(kernel_size=kernel,stride=stride) 33 | conv = Conv2d(features, out_features, 1) 34 | return nn.Sequential(prior, conv) 35 | 36 | def forward(self, feats): 37 | h, w = feats.size(2), feats.size(3) 38 | priors = [F.interpolate(input=F.relu_(stage(feats)), size=( 39 | h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats] 40 | out = F.relu_(self.bottleneck(torch.cat(priors, 1))) 41 | return out 42 | 43 | 44 | @SPARSE_INST_ENCODER_REGISTRY.register() 45 | class InstanceContextEncoder(nn.Module): 46 | """ 47 | Instance Context Encoder 48 | 1. construct feature pyramids from ResNet 49 | 2. enlarge receptive fields (ppm) 50 | 3. multi-scale fusion 51 | """ 52 | 53 | def __init__(self, cfg, input_shape): 54 | super().__init__() 55 | self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS 56 | self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES 57 | # self.norm = cfg.MODEL.SPARSE_INST.ENCODER.NORM 58 | # depthwise = cfg.MODEL.SPARSE_INST.ENCODER.DEPTHWISE 59 | self.in_channels = [input_shape[f].channels for f in self.in_features] 60 | # self.using_bias = self.norm == "" 61 | fpn_laterals = [] 62 | fpn_outputs = [] 63 | # groups = self.num_channels if depthwise else 1 64 | for in_channel in reversed(self.in_channels): 65 | lateral_conv = Conv2d(in_channel, self.num_channels, 1) 66 | output_conv = Conv2d(self.num_channels, self.num_channels, 3, padding=1) 67 | c2_xavier_fill(lateral_conv) 68 | c2_xavier_fill(output_conv) 69 | fpn_laterals.append(lateral_conv) 70 | fpn_outputs.append(output_conv) 71 | self.fpn_laterals = nn.ModuleList(fpn_laterals) 72 | self.fpn_outputs = nn.ModuleList(fpn_outputs) 73 | # ppm 74 | self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4) 75 | # final fusion 76 | self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1) 77 | c2_msra_fill(self.fusion) 78 | 79 | def forward(self, features): 80 | features = [features[f] for f in self.in_features] 81 | features = features[::-1] 82 | prev_features = self.ppm(self.fpn_laterals[0](features[0])) 83 | outputs = [self.fpn_outputs[0](prev_features)] 84 | for feature, lat_conv, output_conv in zip(features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:]): 85 | lat_features = lat_conv(feature) 86 | top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode='nearest') 87 | prev_features = lat_features + top_down_features 88 | outputs.insert(0, output_conv(prev_features)) 89 | size = outputs[0].shape[2:] 90 | features = [ 91 | outputs[0]] + [F.interpolate(x, size, mode='bilinear', align_corners=False) for x in outputs[1:]] 92 | features = self.fusion(torch.cat(features, dim=1)) 93 | return features 94 | 95 | 96 | def build_sparse_inst_encoder(cfg, input_shape): 97 | name = cfg.MODEL.SPARSE_INST.ENCODER.NAME 98 | return SPARSE_INST_ENCODER_REGISTRY.get(name)(cfg, input_shape) -------------------------------------------------------------------------------- /sparseinst/input.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/input.ppm -------------------------------------------------------------------------------- 
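A minimal sketch of how the encoder above slots between the backbone and the decoder (assumptions: the script is run from the repository root, add_sparse_inst_config has populated the SPARSE_INST config keys, and the modules are randomly initialised since no checkpoint is loaded):

import torch
from detectron2.config import get_cfg
from detectron2.modeling import build_backbone
from sparseinst import add_sparse_inst_config, build_sparse_inst_encoder, build_sparse_inst_decoder

cfg = get_cfg()
add_sparse_inst_config(cfg)
cfg.merge_from_file("configs/sparse_inst_r50_giam.yaml")  # any config from configs/ works here

backbone = build_backbone(cfg)
encoder = build_sparse_inst_encoder(cfg, backbone.output_shape())
decoder = build_sparse_inst_decoder(cfg)

x = torch.randn(1, 3, 640, 640)   # padded, normalized input batch (H and W divisible by 32)
feats = backbone(x)               # dict of feature maps keyed by ENCODER.IN_FEATURES
feats = encoder(feats)            # single fused map with ENCODER.NUM_CHANNELS channels
out = decoder(feats)              # {"pred_logits", "pred_masks", "pred_scores", ...}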
/sparseinst/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.cuda.amp import autocast 7 | from scipy.optimize import linear_sum_assignment 8 | from fvcore.nn import sigmoid_focal_loss_jit 9 | 10 | from detectron2.utils.registry import Registry 11 | 12 | from .utils import nested_masks_from_list, is_dist_avail_and_initialized, get_world_size 13 | 14 | SPARSE_INST_MATCHER_REGISTRY = Registry("SPARSE_INST_MATCHER") 15 | SPARSE_INST_MATCHER_REGISTRY.__doc__ = "Matcher for SparseInst" 16 | SPARSE_INST_CRITERION_REGISTRY = Registry("SPARSE_INST_CRITERION") 17 | SPARSE_INST_CRITERION_REGISTRY.__doc__ = "Criterion for SparseInst" 18 | 19 | 20 | def compute_mask_iou(inputs, targets): 21 | inputs = inputs.sigmoid() 22 | # thresholding 23 | binarized_inputs = (inputs >= 0.4).float() 24 | targets = (targets > 0.5).float() 25 | intersection = (binarized_inputs * targets).sum(-1) 26 | union = targets.sum(-1) + binarized_inputs.sum(-1) - intersection 27 | score = intersection / (union + 1e-6) 28 | return score 29 | 30 | 31 | def dice_score(inputs, targets): 32 | inputs = inputs.sigmoid() 33 | numerator = 2 * torch.matmul(inputs, targets.t()) 34 | denominator = ( 35 | inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1) 36 | score = numerator / (denominator + 1e-4) 37 | return score 38 | 39 | 40 | def dice_loss(inputs, targets, reduction='sum'): 41 | inputs = inputs.sigmoid() 42 | assert inputs.shape == targets.shape 43 | numerator = 2 * (inputs * targets).sum(1) 44 | denominator = (inputs * inputs).sum(-1) + (targets * targets).sum(-1) 45 | loss = 1 - (numerator) / (denominator + 1e-4) 46 | if reduction == 'none': 47 | return loss 48 | return loss.sum() 49 | 50 | 51 | @SPARSE_INST_CRITERION_REGISTRY.register() 52 | class SparseInstCriterion(nn.Module): 53 | # This part is partially derivated from: https://github.com/facebookresearch/detr/blob/main/models/detr.py 54 | 55 | def __init__(self, cfg, matcher): 56 | super().__init__() 57 | self.matcher = matcher 58 | self.losses = cfg.MODEL.SPARSE_INST.LOSS.ITEMS 59 | self.weight_dict = self.get_weight_dict(cfg) 60 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 61 | 62 | def get_weight_dict(self, cfg): 63 | losses = ("loss_ce", "loss_mask", "loss_dice", "loss_objectness") 64 | weight_dict = {} 65 | ce_weight = cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT 66 | mask_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT 67 | dice_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT 68 | objectness_weight = cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT 69 | 70 | weight_dict = dict( 71 | zip(losses, (ce_weight, mask_weight, dice_weight, objectness_weight))) 72 | return weight_dict 73 | 74 | def _get_src_permutation_idx(self, indices): 75 | # permute predictions following indices 76 | batch_idx = torch.cat([torch.full_like(src, i) 77 | for i, (src, _) in enumerate(indices)]) 78 | src_idx = torch.cat([src for (src, _) in indices]) 79 | return batch_idx, src_idx 80 | 81 | def _get_tgt_permutation_idx(self, indices): 82 | # permute targets following indices 83 | batch_idx = torch.cat([torch.full_like(tgt, i) 84 | for i, (_, tgt) in enumerate(indices)]) 85 | tgt_idx = torch.cat([tgt for (_, tgt) in indices]) 86 | return batch_idx, tgt_idx 87 | 88 | def loss_labels(self, outputs, targets, indices, num_instances, input_shape=None): 89 | 
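# classification loss: scatter the matched ground-truth classes into a one-hot
# target of the same shape as pred_logits, then apply sigmoid focal loss
# (alpha=0.25, gamma=2.0) normalized by the number of target instances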
assert "pred_logits" in outputs 90 | src_logits = outputs['pred_logits'] 91 | idx = self._get_src_permutation_idx(indices) 92 | target_classes_o = torch.cat([t["labels"][J] 93 | for t, (_, J) in zip(targets, indices)]) 94 | target_classes = torch.full(src_logits.shape[:2], self.num_classes, 95 | dtype=torch.int64, device=src_logits.device) 96 | target_classes[idx] = target_classes_o 97 | 98 | src_logits = src_logits.flatten(0, 1) 99 | # prepare one_hot target. 100 | target_classes = target_classes.flatten(0, 1) 101 | pos_inds = torch.nonzero( 102 | target_classes != self.num_classes, as_tuple=True)[0] 103 | labels = torch.zeros_like(src_logits) 104 | labels[pos_inds, target_classes[pos_inds]] = 1 105 | # comp focal loss. 106 | class_loss = sigmoid_focal_loss_jit( 107 | src_logits, 108 | labels, 109 | alpha=0.25, 110 | gamma=2.0, 111 | reduction="sum", 112 | ) / num_instances 113 | losses = {'loss_ce': class_loss} 114 | return losses 115 | 116 | def loss_masks_with_iou_objectness(self, outputs, targets, indices, num_instances, input_shape): 117 | src_idx = self._get_src_permutation_idx(indices) 118 | tgt_idx = self._get_tgt_permutation_idx(indices) 119 | # Bx100xHxW 120 | assert "pred_masks" in outputs 121 | assert "pred_scores" in outputs 122 | src_iou_scores = outputs["pred_scores"] 123 | src_masks = outputs["pred_masks"] 124 | with torch.no_grad(): 125 | target_masks, _ = nested_masks_from_list( 126 | [t["masks"].tensor for t in targets], input_shape).decompose() 127 | num_masks = [len(t["masks"]) for t in targets] 128 | target_masks = target_masks.to(src_masks) 129 | if len(target_masks) == 0: 130 | losses = { 131 | "loss_dice": src_masks.sum() * 0.0, 132 | "loss_mask": src_masks.sum() * 0.0, 133 | "loss_objectness": src_iou_scores.sum() * 0.0 134 | } 135 | return losses 136 | 137 | src_masks = src_masks[src_idx] 138 | target_masks = F.interpolate( 139 | target_masks[:, None], size=src_masks.shape[-2:], mode='bilinear', align_corners=False).squeeze(1) 140 | 141 | src_masks = src_masks.flatten(1) 142 | # FIXME: tgt_idx 143 | mix_tgt_idx = torch.zeros_like(tgt_idx[1]) 144 | cum_sum = 0 145 | for num_mask in num_masks: 146 | mix_tgt_idx[cum_sum: cum_sum + num_mask] = cum_sum 147 | cum_sum += num_mask 148 | mix_tgt_idx += tgt_idx[1] 149 | 150 | target_masks = target_masks[mix_tgt_idx].flatten(1) 151 | 152 | with torch.no_grad(): 153 | ious = compute_mask_iou(src_masks, target_masks) 154 | 155 | tgt_iou_scores = ious 156 | src_iou_scores = src_iou_scores[src_idx] 157 | tgt_iou_scores = tgt_iou_scores.flatten(0) 158 | src_iou_scores = src_iou_scores.flatten(0) 159 | 160 | losses = { 161 | "loss_objectness": F.binary_cross_entropy_with_logits(src_iou_scores, tgt_iou_scores, reduction='mean'), 162 | "loss_dice": dice_loss(src_masks, target_masks) / num_instances, 163 | "loss_mask": F.binary_cross_entropy_with_logits(src_masks, target_masks, reduction='mean') 164 | } 165 | return losses 166 | 167 | def get_loss(self, loss, outputs, targets, indices, num_instances, **kwargs): 168 | loss_map = { 169 | "labels": self.loss_labels, 170 | "masks": self.loss_masks_with_iou_objectness, 171 | } 172 | if loss == "loss_objectness": 173 | # NOTE: loss_objectness will be calculated in `loss_masks_with_iou_objectness` 174 | return {} 175 | assert loss in loss_map 176 | return loss_map[loss](outputs, targets, indices, num_instances, **kwargs) 177 | 178 | def forward(self, outputs, targets, input_shape): 179 | 180 | outputs_without_aux = {k: v for k, 181 | v in outputs.items() if k != 'aux_outputs'} 182 | 183 | 
# Retrieve the matching between the outputs of the last layer and the targets 184 | indices = self.matcher(outputs_without_aux, targets, input_shape) 185 | # Compute the average number of target boxes accross all nodes, for normalization purposes 186 | num_instances = sum(len(t["labels"]) for t in targets) 187 | num_instances = torch.as_tensor( 188 | [num_instances], dtype=torch.float, device=next(iter(outputs.values())).device) 189 | if is_dist_avail_and_initialized(): 190 | torch.distributed.all_reduce(num_instances) 191 | num_instances = torch.clamp( 192 | num_instances / get_world_size(), min=1).item() 193 | # Compute all the requested losses 194 | losses = {} 195 | for loss in self.losses: 196 | losses.update(self.get_loss(loss, outputs, targets, indices, 197 | num_instances, input_shape=input_shape)) 198 | 199 | for k in losses.keys(): 200 | if k in self.weight_dict: 201 | losses[k] *= self.weight_dict[k] 202 | 203 | return losses 204 | 205 | 206 | @SPARSE_INST_MATCHER_REGISTRY.register() 207 | class SparseInstMatcherV1(nn.Module): 208 | 209 | def __init__(self, cfg): 210 | super().__init__() 211 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA 212 | self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA 213 | self.mask_score = dice_score 214 | 215 | @torch.no_grad() 216 | def forward(self, outputs, targets, input_shape): 217 | B, N, H, W = outputs["pred_masks"].shape 218 | pred_masks = outputs['pred_masks'] 219 | pred_logits = outputs['pred_logits'].sigmoid() 220 | 221 | indices = [] 222 | 223 | for i in range(B): 224 | tgt_ids = targets[i]["labels"] 225 | # no annotations 226 | if tgt_ids.shape[0] == 0: 227 | indices.append((torch.as_tensor([]), 228 | torch.as_tensor([]))) 229 | continue 230 | 231 | tgt_masks = targets[i]['masks'].tensor.to(pred_masks) 232 | pred_logit = pred_logits[i] 233 | out_masks = pred_masks[i] 234 | 235 | # upsampling: 236 | # (1) padding/ 237 | # (2) upsampling to 1x input size (input_shape) 238 | # (3) downsampling to 0.25x input size (output mask size) 239 | ori_h, ori_w = tgt_masks.size(1), tgt_masks.size(2) 240 | tgt_masks_ = torch.zeros( 241 | (1, tgt_masks.size(0), input_shape[0], input_shape[1])).to(pred_masks) 242 | tgt_masks_[0, :, :ori_h, :ori_w] = tgt_masks 243 | tgt_masks = F.interpolate( 244 | tgt_masks_, size=out_masks.shape[-2:], mode='bilinear', align_corners=False)[0] 245 | 246 | # compute dice score and classification score 247 | tgt_masks = tgt_masks.flatten(1) 248 | out_masks = out_masks.flatten(1) 249 | 250 | mask_score = self.mask_score(out_masks, tgt_masks) 251 | # Nx(Number of gts) 252 | matching_prob = pred_logit[:, tgt_ids] 253 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta) 254 | # hungarian matching 255 | inds = linear_sum_assignment(C.cpu(), maximize=True) 256 | indices.append(inds) 257 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 258 | 259 | 260 | @SPARSE_INST_MATCHER_REGISTRY.register() 261 | class SparseInstMatcher(nn.Module): 262 | 263 | def __init__(self, cfg): 264 | super().__init__() 265 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA 266 | self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA 267 | self.mask_score = dice_score 268 | 269 | def forward(self, outputs, targets, input_shape): 270 | with torch.no_grad(): 271 | B, N, H, W = outputs["pred_masks"].shape 272 | pred_masks = outputs['pred_masks'] 273 | pred_logits = outputs['pred_logits'].sigmoid() 274 | 275 | tgt_ids = torch.cat([v["labels"] for v in targets]) 276 | 277 | if tgt_ids.shape[0] == 
0: 278 | return [(torch.as_tensor([]).to(pred_logits), torch.as_tensor([]).to(pred_logits))] * B 279 | tgt_masks, _ = nested_masks_from_list( 280 | [t["masks"].tensor for t in targets], input_shape).decompose() 281 | device = pred_masks.device 282 | tgt_masks = tgt_masks.to(pred_masks) 283 | 284 | tgt_masks = F.interpolate( 285 | tgt_masks[:, None], size=pred_masks.shape[-2:], mode="bilinear", align_corners=False).squeeze(1) 286 | 287 | pred_masks = pred_masks.view(B * N, -1) 288 | tgt_masks = tgt_masks.flatten(1) 289 | with autocast(enabled=False): 290 | pred_masks = pred_masks.float() 291 | tgt_masks = tgt_masks.float() 292 | pred_logits = pred_logits.float() 293 | mask_score = self.mask_score(pred_masks, tgt_masks) 294 | # Nx(Number of gts) 295 | matching_prob = pred_logits.view(B * N, -1)[:, tgt_ids] 296 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta) 297 | 298 | C = C.view(B, N, -1).cpu() 299 | # hungarian matching 300 | sizes = [len(v["masks"]) for v in targets] 301 | indices = [linear_sum_assignment(c[i], maximize=True) 302 | for i, c in enumerate(C.split(sizes, -1))] 303 | indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor( 304 | j, dtype=torch.int64)) for i, j in indices] 305 | return indices 306 | 307 | 308 | def build_sparse_inst_matcher(cfg): 309 | name = cfg.MODEL.SPARSE_INST.MATCHER.NAME 310 | return SPARSE_INST_MATCHER_REGISTRY.get(name)(cfg) 311 | 312 | 313 | def build_sparse_inst_criterion(cfg): 314 | matcher = build_sparse_inst_matcher(cfg) 315 | name = cfg.MODEL.SPARSE_INST.LOSS.NAME 316 | return SPARSE_INST_CRITERION_REGISTRY.get(name)(cfg, matcher) 317 | -------------------------------------------------------------------------------- /sparseinst/sparseinst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. 
All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from detectron2.modeling import build_backbone 8 | from detectron2.structures import ImageList, Instances, BitMasks 9 | from detectron2.modeling import META_ARCH_REGISTRY 10 | import numpy as np 11 | from .encoder import build_sparse_inst_encoder 12 | from .decoder import build_sparse_inst_decoder 13 | from .loss import build_sparse_inst_criterion 14 | from .utils import nested_tensor_from_tensor_list 15 | 16 | __all__ = ["SparseInst"] 17 | 18 | 19 | @torch.jit.script 20 | def rescoring_mask(scores, mask_pred, masks): 21 | mask_pred_ = mask_pred.float() 22 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]).double() + 1e-6).float()) 23 | 24 | 25 | @META_ARCH_REGISTRY.register() 26 | class SparseInst(nn.Module): 27 | 28 | def __init__(self, cfg): 29 | super().__init__() 30 | 31 | # move to target device 32 | self.device = torch.device(cfg.MODEL.DEVICE) 33 | self.use_cp = True  # only referenced by the commented-out checkpointing path in forward_test_3 34 | 35 | print("Device in use:", torch.cuda.get_device_name()) 36 | # backbone 37 | self.backbone = build_backbone(cfg) 38 | self.size_divisibility = self.backbone.size_divisibility 39 | output_shape = self.backbone.output_shape() 40 | 41 | # encoder & decoder 42 | self.encoder = build_sparse_inst_encoder(cfg, output_shape) 43 | self.decoder = build_sparse_inst_decoder(cfg) 44 | 45 | # matcher & loss (matcher is built in loss) 46 | self.criterion = build_sparse_inst_criterion(cfg) 47 | 48 | # data and preprocessing 49 | self.mask_format = cfg.INPUT.MASK_FORMAT 50 | 51 | self.pixel_mean = torch.Tensor( 52 | cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 53 | self.pixel_std = torch.Tensor( 54 | cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 55 | # self.normalizer = lambda x: (x - pixel_mean) / pixel_std 56 | 57 | # inference 58 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD 59 | self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD 60 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS 61 | 62 | def normalizer(self, image): 63 | image = (image - self.pixel_mean) / self.pixel_std 64 | return image 65 | 66 | def preprocess_inputs(self, batched_inputs): 67 | images = [x["image"].to(self.device) for x in batched_inputs] 68 | images = [self.normalizer(x) for x in images] 69 | images = ImageList.from_tensors(images, 32) 70 | return images 71 | 72 | def prepare_targets(self, targets): 73 | new_targets = [] 74 | for targets_per_image in targets: 75 | target = {} 76 | gt_classes = targets_per_image.gt_classes 77 | target["labels"] = gt_classes.to(self.device) 78 | h, w = targets_per_image.image_size 79 | if not targets_per_image.has('gt_masks'): 80 | gt_masks = BitMasks(torch.empty(0, h, w)) 81 | else: 82 | gt_masks = targets_per_image.gt_masks 83 | if self.mask_format == "polygon": 84 | if len(gt_masks.polygons) == 0: 85 | gt_masks = BitMasks(torch.empty(0, h, w)) 86 | else: 87 | gt_masks = BitMasks.from_polygon_masks( 88 | gt_masks.polygons, h, w) 89 | 90 | target["masks"] = gt_masks.to(self.device) 91 | new_targets.append(target) 92 | 93 | return new_targets 94 | 95 | def forward(self, batched_inputs): 96 | images = self.preprocess_inputs(batched_inputs) 97 | if isinstance(images, (list, torch.Tensor)): 98 | images = nested_tensor_from_tensor_list(images) 99 | max_shape = images.tensor.shape[2:] 100 | features = self.backbone(images.tensor) 101 | features = self.encoder(features) 102 | output = self.decoder(features) 103 | 104 | if 
self.training: 105 | gt_instances = [x["instances"].to( 106 | self.device) for x in batched_inputs] 107 | targets = self.prepare_targets(gt_instances) 108 | losses = self.criterion(output, targets, max_shape) 109 | return losses 110 | else: 111 | results = self.inference( 112 | output, batched_inputs, max_shape, images.image_sizes) 113 | processed_results = [{"instances": r} for r in results] 114 | return processed_results 115 | 116 | def forward_test_3(self, images): 117 | # images = self.preprocess_inputs(batched_inputs) 118 | # if isinstance(images, (list, torch.Tensor)): 119 | # images = nested_tensor_from_tensor_list(images) 120 | max_shape = images.shape[2:] 121 | # forward 122 | # if self.use_cp: 123 | # features = self.backbone(images.tensor) 124 | # features = checkpoint(self.encoder,features) 125 | # output = self.decoder(features) 126 | # else: 127 | features = self.backbone(images) 128 | features = self.encoder(features) 129 | output = self.decoder(features) 130 | 131 | if self.training: 132 | # this entry point takes a plain image tensor and has no targets, 133 | # so training is not supported here; use forward() instead 134 | raise NotImplementedError( 135 | "forward_test_3 is inference-only; use SparseInst.forward() for training") 136 | else: 137 | results = self.inference_test_3(output, images) 138 | # import pdb; pdb.set_trace() 139 | # processed_results = [{"instances": r} for r in results] 140 | 141 | out_scores = torch.cat([r.scores.unsqueeze(0) for r in results], dim=0) 142 | out_pred_classes = torch.cat([r.pred_classes.unsqueeze(0) for r in results], dim=0) 143 | out_pred_masks = torch.cat([r.pred_masks for r in results], dim=0) 144 | return (out_scores, out_pred_classes, out_pred_masks) 145 | 146 | 147 | def inference(self, output, batched_inputs, max_shape, image_sizes): 148 | # max_detections = self.max_detections 149 | results = [] 150 | pred_scores = output["pred_logits"].sigmoid() 151 | pred_masks = output["pred_masks"].sigmoid() 152 | pred_objectness = output["pred_scores"].sigmoid() 153 | pred_scores = torch.sqrt(pred_scores * pred_objectness) 154 | for _, (scores_per_image, mask_pred_per_image, batched_input, img_shape) in enumerate(zip( 155 | pred_scores, pred_masks, batched_inputs, image_sizes)): 156 | 157 | ori_shape = (batched_input["height"], batched_input["width"]) 158 | result = Instances(ori_shape) 159 | # max/argmax 160 | scores, labels = scores_per_image.max(dim=-1) 161 | # cls threshold 162 | keep = scores > self.cls_threshold 163 | scores = scores[keep] 164 | labels = labels[keep] 165 | mask_pred_per_image = mask_pred_per_image[keep] 166 | if scores.size(0) == 0: 167 | result.scores = scores 168 | result.pred_classes = labels 169 | results.append(result) 170 | continue 171 | 172 | h, w = img_shape 173 | # rescoring mask using maskness 174 | scores = rescoring_mask( 175 | scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image) 176 | # upsample the masks to the original resolution: 177 | # (1) upsampling the masks to the padded inputs, remove the padding area 178 | # (2) upsampling/downsampling the masks to the original sizes 179 | 180 | mask_pred_per_image = F.interpolate( 181 | mask_pred_per_image.unsqueeze(1), size=max_shape, mode="bilinear", align_corners=False)[:, :, :h, :w] 182 | mask_pred_per_image = F.interpolate( 183 | mask_pred_per_image, size=ori_shape, mode='bilinear', align_corners=False).squeeze(1) 184 | 185 | mask_pred = mask_pred_per_image > self.mask_threshold 186 | # fix the bug for visualization 187 | # mask_pred = 
BitMasks(mask_pred) 188 | 189 | # using Detectron2 Instances to store the final results 190 | result.pred_masks = mask_pred 191 | result.scores = scores 192 | result.pred_classes = labels 193 | results.append(result) 194 | return results 195 | 196 | def inference_test_3(self, output, images): 197 | # max_detections = self.max_detections 198 | results = [] 199 | pred_scores = output["pred_logits"].sigmoid() 200 | pred_masks = output["pred_masks"].sigmoid() 201 | pred_objectness = output["pred_scores"].sigmoid() 202 | pred_scores = torch.sqrt(pred_scores * pred_objectness) 203 | 204 | for _, (scores_per_image, mask_pred_per_image, image) in enumerate(zip( 205 | pred_scores, pred_masks, images)): 206 | 207 | shape = image.shape[1:] 208 | result = Instances(shape) 209 | 210 | scores, labels = scores_per_image.max(dim=-1) 211 | 212 | if scores.size(0) == 0: 213 | result.scores = scores 214 | result.pred_classes = labels 215 | results.append(result) 216 | continue 217 | 218 | h, w = shape 219 | # rescoring mask using maskness 220 | scores = rescoring_mask(scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image) 221 | # using Detectron2 Instances to store the final results 222 | 223 | result.pred_masks = mask_pred_per_image #mask_pred 224 | result.scores = scores 225 | result.pred_classes = labels 226 | results.append(result) 227 | 228 | return results 229 | -------------------------------------------------------------------------------- /sparseinst/utils.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Optional, List 3 | 4 | import torch 5 | from torch import Tensor 6 | import torch.distributed as dist 7 | import torch.nn.functional as F 8 | import torchvision 9 | 10 | 11 | def _max_by_axis(the_list): 12 | # type: (List[List[int]]) -> List[int] 13 | maxes = the_list[0] 14 | for sublist in the_list[1:]: 15 | for index, item in enumerate(sublist): 16 | maxes[index] = max(maxes[index], item) 17 | return maxes 18 | 19 | 20 | class NestedTensor(object): 21 | def __init__(self, tensors, mask: Optional[Tensor]): 22 | self.tensors = tensors 23 | self.mask = mask 24 | 25 | def to(self, device): 26 | cast_tensor = self.tensors.to(device) 27 | mask = self.mask 28 | if mask is not None: 29 | assert mask is not None 30 | cast_mask = mask.to(device) 31 | else: 32 | cast_mask = None 33 | return NestedTensor(cast_tensor, cast_mask) 34 | 35 | def decompose(self): 36 | return self.tensors, self.mask 37 | 38 | def __repr__(self): 39 | return str(self.tensors) 40 | 41 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 42 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
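# It computes the per-dimension max size with tensor ops and pads each image and
# mask via F.pad, avoiding the in-place copy_()/boolean-mask assignment used in
# the eager path below, which ONNX tracing cannot export.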
43 | 44 | 45 | @torch.jit.unused 46 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 47 | max_size = [] 48 | for i in range(tensor_list[0].dim()): 49 | max_size_i = torch.max(torch.stack([img.shape[i] 50 | for img in tensor_list]).to(torch.float32)).to(torch.int64) 51 | max_size.append(max_size_i) 52 | max_size = tuple(max_size) 53 | 54 | # work around for 55 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 56 | # m[: img.shape[1], :img.shape[2]] = False 57 | # which is not yet supported in onnx 58 | padded_imgs = [] 59 | padded_masks = [] 60 | for img in tensor_list: 61 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 62 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 63 | padded_imgs.append(padded_img) 64 | 65 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 66 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 67 | padded_masks.append(padded_mask.to(torch.bool)) 68 | 69 | tensor = torch.stack(padded_imgs) 70 | mask = torch.stack(padded_masks) 71 | 72 | return NestedTensor(tensor, mask=mask) 73 | 74 | 75 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 76 | # TODO make this more general 77 | if tensor_list[0].ndim == 3: 78 | if torchvision._is_tracing(): 79 | # nested_tensor_from_tensor_list() does not export well to ONNX 80 | # call _onnx_nested_tensor_from_tensor_list() instead 81 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 82 | 83 | # TODO make it support different-sized images 84 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 85 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 86 | batch_shape = [len(tensor_list)] + max_size 87 | b, c, h, w = batch_shape 88 | dtype = tensor_list[0].dtype 89 | device = tensor_list[0].device 90 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 91 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 92 | for img, pad_img, m in zip(tensor_list, tensor, mask): 93 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 94 | m[: img.shape[1], :img.shape[2]] = False 95 | else: 96 | raise ValueError('not supported') 97 | return NestedTensor(tensor, mask) 98 | 99 | 100 | def nested_masks_from_list(tensor_list: List[Tensor], input_shape=None): 101 | if tensor_list[0].ndim == 3: 102 | dim_size = sum([img.shape[0] for img in tensor_list]) 103 | if input_shape is None: 104 | max_size = _max_by_axis([list(img.shape[-2:]) for img in tensor_list]) 105 | else: 106 | max_size = [input_shape[0], input_shape[1]] 107 | batch_shape = [dim_size] + max_size 108 | # b, h, w = batch_shape 109 | dtype = tensor_list[0].dtype 110 | device = tensor_list[0].device 111 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 112 | mask = torch.zeros(batch_shape, dtype=torch.bool, device=device) 113 | idx = 0 114 | for img in tensor_list: 115 | c = img.shape[0] 116 | c_ = idx + c 117 | tensor[idx: c_, :img.shape[1], : img.shape[2]].copy_(img) 118 | mask[idx: c_, :img.shape[1], :img.shape[2]] = True 119 | idx = c_ 120 | else: 121 | raise ValueError('not supported') 122 | return NestedTensor(tensor, mask) 123 | 124 | 125 | def is_dist_avail_and_initialized(): 126 | if not dist.is_available(): 127 | return False 128 | if not dist.is_initialized(): 129 | return False 130 | return True 131 | 132 | 133 | def get_world_size(): 134 | if not is_dist_avail_and_initialized(): 135 | return 1 
136 | return dist.get_world_size() 137 | 138 | 139 | def aligned_bilinear(tensor, factor): 140 | # borrowed from Adelaidet: https://github1s.com/aim-uofa/AdelaiDet/blob/HEAD/adet/utils/comm.py 141 | assert tensor.dim() == 4 142 | assert factor >= 1 143 | assert int(factor) == factor 144 | 145 | if factor == 1: 146 | return tensor 147 | 148 | h, w = tensor.size()[2:] 149 | tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") 150 | oh = factor * h + 1 151 | ow = factor * w + 1 152 | tensor = F.interpolate( 153 | tensor, size=(oh, ow), 154 | mode='bilinear', 155 | align_corners=True 156 | ) 157 | tensor = F.pad( 158 | tensor, pad=(factor // 2, 0, factor // 2, 0), 159 | mode="replicate" 160 | ) 161 | 162 | return tensor[:, :, :oh - 1, :ow - 1] 163 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | print("It works") 2 | print("okok") -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.cuda.amp import autocast 8 | 9 | from detectron2.config import get_cfg 10 | from detectron2.modeling import build_backbone 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.structures import ImageList, Instances, BitMasks 13 | from detectron2.engine import default_argument_parser, default_setup 14 | from detectron2.data import build_detection_test_loader 15 | from detectron2.evaluation import COCOEvaluator, print_csv_format 16 | 17 | from sparseinst import build_sparse_inst_encoder, build_sparse_inst_decoder, add_sparse_inst_config 18 | from sparseinst import COCOMaskEvaluator 19 | 20 | 21 | device = torch.device('cuda:0') 22 | dtype = torch.float32 23 | 24 | __all__ = ["SparseInst"] 25 | 26 | pixel_mean = torch.Tensor([123.675, 116.280, 103.530]).to(device).view(3, 1, 1) 27 | pixel_std = torch.Tensor([58.395, 57.120, 57.375]).to(device).view(3, 1, 1) 28 | 29 | 30 | @torch.jit.script 31 | def normalizer(x, mean, std): return (x - mean) / std 32 | 33 | 34 | def synchronize(): 35 | torch.cuda.synchronize() 36 | 37 | 38 | def process_batched_inputs(batched_inputs): 39 | images = [x["image"].to(device) for x in batched_inputs] 40 | images = [normalizer(x, pixel_mean, pixel_std) for x in images] 41 | images = ImageList.from_tensors(images, 32) 42 | ori_size = (batched_inputs[0]["height"], batched_inputs[0]["width"]) 43 | return images.tensor, images.image_sizes[0], ori_size 44 | 45 | 46 | @torch.jit.script 47 | def rescoring_mask(scores, mask_pred, masks): 48 | mask_pred_ = mask_pred.float() 49 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6)) 50 | 51 | 52 | class SparseInst(nn.Module): 53 | 54 | def __init__(self, cfg): 55 | 56 | super().__init__() 57 | 58 | self.device = torch.device(cfg.MODEL.DEVICE) 59 | # backbone 60 | self.backbone = build_backbone(cfg) 61 | self.size_divisibility = self.backbone.size_divisibility 62 | 63 | output_shape = self.backbone.output_shape() 64 | 65 | self.encoder = build_sparse_inst_encoder(cfg, output_shape) 66 | self.decoder = build_sparse_inst_decoder(cfg) 67 | 68 | self.to(self.device) 69 | 70 | # inference 71 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD 72 | self.mask_threshold = 
cfg.MODEL.SPARSE_INST.MASK_THRESHOLD 73 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS 74 | self.mask_format = cfg.INPUT.MASK_FORMAT 75 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 76 | 77 | def forward(self, image, resized_size, ori_size): 78 | max_size = image.shape[2:] 79 | features = self.backbone(image) 80 | features = self.encoder(features) 81 | output = self.decoder(features) 82 | result = self.inference_single( 83 | output, resized_size, max_size, ori_size) 84 | return result 85 | 86 | def inference_single(self, outputs, img_shape, pad_shape, ori_shape): 87 | """ 88 | inference for only one sample 89 | Args: 90 | scores (tensor): [NxC] 91 | masks (tensor): [NxHxW] 92 | img_shape (list): (h1, w1), image after resized 93 | pad_shape (list): (h2, w2), padded resized image 94 | ori_shape (list): (h3, w3), original shape h3*w3 < h1*w1 < h2*w2 95 | """ 96 | result = Instances(ori_shape) 97 | # scoring 98 | pred_logits = outputs["pred_logits"][0].sigmoid() 99 | pred_scores = outputs["pred_scores"][0].sigmoid().squeeze() 100 | pred_masks = outputs["pred_masks"][0].sigmoid() 101 | # obtain scores 102 | scores, labels = pred_logits.max(dim=-1) 103 | # remove by thresholding 104 | keep = scores > self.cls_threshold 105 | scores = torch.sqrt(scores[keep] * pred_scores[keep]) 106 | labels = labels[keep] 107 | pred_masks = pred_masks[keep] 108 | 109 | if scores.size(0) == 0: 110 | return None 111 | scores = rescoring_mask(scores, pred_masks > 0.45, pred_masks) 112 | h, w = img_shape 113 | # resize masks 114 | pred_masks = F.interpolate(pred_masks.unsqueeze(1), size=pad_shape, 115 | mode="bilinear", align_corners=False)[:, :, :h, :w] 116 | pred_masks = F.interpolate(pred_masks, size=ori_shape, mode='bilinear', 117 | align_corners=False).squeeze(1) 118 | mask_pred = pred_masks > self.mask_threshold 119 | 120 | mask_pred = BitMasks(mask_pred) 121 | result.pred_masks = mask_pred 122 | result.scores = scores 123 | result.pred_classes = labels 124 | return result 125 | 126 | 127 | def test_sparseinst_speed(cfg, fp16=True): 128 | device = torch.device('cuda:0') 129 | 130 | model = SparseInst(cfg) 131 | model.eval() 132 | model.to(device) 133 | print(model) 134 | size = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) 135 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 136 | cfg.MODEL.WEIGHTS, resume=False) 137 | 138 | torch.backends.cudnn.enable = True 139 | torch.backends.cudnn.benchmark = True 140 | 141 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 142 | 143 | evaluator = COCOMaskEvaluator( 144 | cfg.DATASETS.TEST[0], ("segm",), False, output_folder) 145 | evaluator.reset() 146 | model.to(device) 147 | model.eval() 148 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 149 | durations = [] 150 | 151 | with autocast(enabled=fp16): 152 | with torch.no_grad(): 153 | for idx, inputs in enumerate(data_loader): 154 | images, resized_size, ori_size = process_batched_inputs(inputs) 155 | synchronize() 156 | start_time = time.perf_counter() 157 | output = model(images, resized_size, ori_size) 158 | print(len(output)) 159 | print(output) 160 | synchronize() 161 | end = time.perf_counter() - start_time 162 | 163 | durations.append(end) 164 | if idx % 100 == 0: 165 | print("process: [{}/{}] fps: {:.3f}".format(idx, 166 | len(data_loader), 1/np.mean(durations[100:]))) 167 | evaluator.process(inputs, [{"instances": output}]) 168 | # evaluate 169 | results = evaluator.evaluate() 170 | print_csv_format(results) 171 | 172 | 
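# average over all but the first 100 iterations, which are treated as warm-up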
latency = np.mean(durations[100:]) 173 | fps = 1 / latency 174 | print("speed: {:.4f}s FPS: {:.2f}".format(latency, fps)) 175 | 176 | 177 | def setup(args): 178 | """ 179 | Create configs and perform basic setups. 180 | """ 181 | cfg = get_cfg() 182 | add_sparse_inst_config(cfg) 183 | cfg.merge_from_file(args.config_file) 184 | cfg.merge_from_list(args.opts) 185 | cfg.freeze() 186 | default_setup(cfg, args) 187 | return cfg 188 | 189 | 190 | if __name__ == '__main__': 191 | 192 | args = default_argument_parser() 193 | args.add_argument("--fp16", action="store_true", 194 | help="support fp16 for inference") 195 | args = args.parse_args() 196 | print("Command Line Args:", args) 197 | cfg = setup(args) 198 | test_sparseinst_speed(cfg, fp16=args.fp16) 199 | -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import itertools 3 | import time 4 | from typing import Any, Dict, List, Set 5 | 6 | import torch 7 | from torch import optim 8 | 9 | import detectron2.utils.comm as comm 10 | from detectron2.checkpoint import DetectionCheckpointer 11 | from detectron2.config import get_cfg 12 | from detectron2.utils.logger import setup_logger 13 | from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetMapper 14 | from detectron2.engine import AutogradProfiler, DefaultTrainer, default_argument_parser, default_setup, launch 15 | from detectron2.evaluation import COCOEvaluator, verify_results 16 | from detectron2.solver.build import maybe_add_gradient_clipping 17 | from detectron2.evaluation import ( 18 | CityscapesInstanceEvaluator, 19 | CityscapesSemSegEvaluator, 20 | COCOEvaluator, 21 | COCOPanopticEvaluator, 22 | DatasetEvaluators, 23 | LVISEvaluator, 24 | PascalVOCDetectionEvaluator, 25 | SemSegEvaluator, 26 | verify_results, 27 | ) 28 | 29 | from sparseinst import add_sparse_inst_config, COCOMaskEvaluator 30 | 31 | 32 | class Trainer(DefaultTrainer): 33 | 34 | @classmethod 35 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 36 | """ 37 | Create evaluator(s) for a given dataset. 38 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 39 | For your own dataset, you can simply create an evaluator manually in your 40 | script and do not have to worry about the hacky if-else logic here. 41 | """ 42 | if output_folder is None: 43 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 44 | evaluator_list = [] 45 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 46 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 47 | evaluator_list.append( 48 | SemSegEvaluator( 49 | dataset_name, 50 | distributed=True, 51 | num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 52 | ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 53 | output_dir=output_folder, 54 | ) 55 | ) 56 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 57 | evaluator_list.append(COCOMaskEvaluator(dataset_name, ("segm", ), True, output_folder)) 58 | if evaluator_type == "coco_panoptic_seg": 59 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 60 | if evaluator_type == "cityscapes_instance": 61 | assert ( 62 | torch.cuda.device_count() >= comm.get_rank() 63 | ), "CityscapesEvaluator currently do not work with multiple machines." 
64 | return CityscapesInstanceEvaluator(dataset_name) 65 | if evaluator_type == "cityscapes_sem_seg": 66 | assert ( 67 | torch.cuda.device_count() >= comm.get_rank() 68 | ), "CityscapesEvaluator currently do not work with multiple machines." 69 | return CityscapesSemSegEvaluator(dataset_name) 70 | elif evaluator_type == "pascal_voc": 71 | return PascalVOCDetectionEvaluator(dataset_name) 72 | elif evaluator_type == "lvis": 73 | return LVISEvaluator(dataset_name, cfg, True, output_folder) 74 | if len(evaluator_list) == 0: 75 | raise NotImplementedError( 76 | "no Evaluator for the dataset {} with the type {}".format( 77 | dataset_name, evaluator_type 78 | ) 79 | ) 80 | elif len(evaluator_list) == 1: 81 | return evaluator_list[0] 82 | return DatasetEvaluators(evaluator_list) 83 | 84 | @classmethod 85 | def build_optimizer(cls, cfg, model): 86 | params: List[Dict[str, Any]] = [] 87 | memo: Set[torch.nn.parameter.Parameter] = set() 88 | for key, value in model.named_parameters(recurse=True): 89 | if not value.requires_grad: 90 | continue 91 | # Avoid duplicating parameters 92 | if value in memo: 93 | continue 94 | memo.add(value) 95 | lr = cfg.SOLVER.BASE_LR 96 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 97 | if "backbone" in key: 98 | lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER 99 | # for transformer 100 | if "patch_embed" in key or "cls_token" in key: 101 | weight_decay = 0.0 102 | if "norm" in key: 103 | weight_decay = 0.0 104 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 105 | 106 | def maybe_add_full_model_gradient_clipping(optim): # optim: the optimizer class 107 | # detectron2 doesn't have full model gradient clipping now 108 | clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE 109 | enable = ( 110 | cfg.SOLVER.CLIP_GRADIENTS.ENABLED 111 | and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" 112 | and clip_norm_val > 0.0 113 | ) 114 | 115 | class FullModelGradientClippingOptimizer(optim): 116 | def step(self, closure=None): 117 | all_params = itertools.chain(*[x["params"] for x in self.param_groups]) 118 | torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) 119 | super().step(closure=closure) 120 | 121 | return FullModelGradientClippingOptimizer if enable else optim 122 | 123 | optimizer_type = cfg.SOLVER.OPTIMIZER 124 | if optimizer_type == "SGD": 125 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)( 126 | params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM 127 | ) 128 | elif optimizer_type == "ADAMW": 129 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)( 130 | params, cfg.SOLVER.BASE_LR, amsgrad=cfg.SOLVER.AMSGRAD 131 | ) 132 | else: 133 | raise NotImplementedError(f"no optimizer type {optimizer_type}") 134 | if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": 135 | optimizer = maybe_add_gradient_clipping(cfg, optimizer) 136 | return optimizer 137 | 138 | @classmethod 139 | def build_train_loader(cls, cfg): 140 | if cfg.MODEL.SPARSE_INST.DATASET_MAPPER == "SparseInstDatasetMapper": 141 | from sparseinst import SparseInstDatasetMapper 142 | mapper = SparseInstDatasetMapper(cfg, is_train=True) 143 | else: 144 | mapper = None 145 | return build_detection_train_loader(cfg, mapper=mapper) 146 | 147 | 148 | def setup(args): 149 | """ 150 | Create configs and perform basic setups. 
151 | """ 152 | cfg = get_cfg() 153 | add_sparse_inst_config(cfg) 154 | cfg.merge_from_file(args.config_file) 155 | cfg.merge_from_list(args.opts) 156 | cfg.freeze() 157 | default_setup(cfg, args) 158 | # Setup logger for "sparseinst" module 159 | setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="sparseinst") 160 | return cfg 161 | 162 | 163 | def main(args): 164 | cfg = setup(args) 165 | 166 | if args.eval_only: 167 | model = Trainer.build_model(cfg) 168 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 169 | cfg.MODEL.WEIGHTS, resume=args.resume) 170 | res = Trainer.test(cfg, model) 171 | if comm.is_main_process(): 172 | verify_results(cfg, res) 173 | return res 174 | 175 | trainer = Trainer(cfg) 176 | trainer.resume_or_load(resume=args.resume) 177 | return trainer.train() 178 | 179 | 180 | if __name__ == "__main__": 181 | args = default_argument_parser().parse_args() 182 | print("Command Line Args:", args) 183 | launch( 184 | main, 185 | args.num_gpus, 186 | num_machines=args.num_machines, 187 | machine_rank=args.machine_rank, 188 | dist_url=args.dist_url, 189 | args=(args,), 190 | ) 191 | --------------------------------------------------------------------------------
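For reference, a minimal sketch of plain-PyTorch inference with the SparseInst meta-architecture registered above (assumptions: a CUDA device is available, since the constructor calls torch.cuda.get_device_name(); the checkpoint path is a placeholder; the batched-input dict follows the detectron2 convention used by SparseInst.forward):

import torch
from detectron2.config import get_cfg
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from sparseinst import add_sparse_inst_config

cfg = get_cfg()
add_sparse_inst_config(cfg)
cfg.merge_from_file("configs/sparse_inst_r50_giam.yaml")
cfg.MODEL.WEIGHTS = "path/to/sparse_inst_r50_giam.pth"  # placeholder checkpoint path

model = build_model(cfg)                         # builds the registered SparseInst meta-arch on cfg.MODEL.DEVICE
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
model.eval()

# CHW uint8 image tensor; "height"/"width" are used to resize the masks back
image = torch.randint(0, 256, (3, 640, 640), dtype=torch.uint8)
with torch.no_grad():
    predictions = model([{"image": image, "height": 640, "width": 640}])
instances = predictions[0]["instances"]          # Instances with scores, pred_classes, pred_masks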