├── .gitattributes
├── LICENCE
├── README.md
├── assets
│   ├── animate.gif
│   ├── banner.gif
│   ├── figures
│   │   ├── 000000006471.jpg
│   │   └── 000000014439.jpg
│   └── sparseinst.png
├── configs
│   ├── Base-SparseInst.yaml
│   ├── Sparse_Inst_r50_giam_onnx.yaml
│   ├── sparse_inst_cspdarknet53_giam.yaml
│   ├── sparse_inst_darknet53_giam.yaml
│   ├── sparse_inst_pvt_b1_giam.yaml
│   ├── sparse_inst_pvt_b2_li_giam.yaml
│   ├── sparse_inst_r101_dcn_giam.yaml
│   ├── sparse_inst_r101_giam.yaml
│   ├── sparse_inst_r50_base.yaml
│   ├── sparse_inst_r50_dcn_giam_aug.yaml
│   ├── sparse_inst_r50_giam.yaml
│   ├── sparse_inst_r50_giam_aug.yaml
│   ├── sparse_inst_r50_giam_fp16.yaml
│   ├── sparse_inst_r50_giam_soft.yaml
│   ├── sparse_inst_r50vd_base.yaml
│   ├── sparse_inst_r50vd_dcn_giam.yaml
│   ├── sparse_inst_r50vd_dcn_giam_aug.yaml
│   ├── sparse_inst_r50vd_giam.yaml
│   └── sparse_inst_r50vd_giam_aug.yaml
├── convert_onnx.py
├── convert_tensorrt.py
├── datasets
│   ├── prepare_ade20k_sem_seg.py
│   ├── prepare_cocofied_lvis.py
│   ├── prepare_for_tests.sh
│   └── prepare_panoptic_fpn.py
├── demo.py
├── engine
│   ├── __pycache__
│   │   └── defaults.cpython-36.pyc
│   └── defaults.py
├── eval_tensorrt_onnx.py
├── input
│   └── input_image
│       ├── 640x640.jpg
│       ├── cup.jpg
│       ├── femme.jpg
│       ├── homme.jpg
│       ├── horses.jpg
│       ├── image1.jpg
│       ├── input.jpg
│       ├── results.png
│       ├── skate.jpg
│       └── turkish_coffee.jpg
├── onnx
│   └── __pycache__
│       └── image_processing.cpython-36.pyc
├── output
│   ├── mnist.tar.gz
│   ├── mnist
│   │   ├── model.onnx
│   │   ├── test_data_set_0
│   │   │   ├── input_0.pb
│   │   │   └── output_0.pb
│   │   ├── test_data_set_1
│   │   │   ├── input_0.pb
│   │   │   └── output_0.pb
│   │   └── test_data_set_2
│   │       ├── input_0.pb
│   │       └── output_0.pb
│   └── sparse_inst_r50_giam
│       ├── config.yaml
│       └── log.txt
├── results
│   ├── 640_result.jpg
│   ├── result_onnx.png
│   └── result_tensorrt.png
├── sparseinst
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   ├── caffe2sparseinst.cpython-36.pyc
│   │   ├── coco_evaluation.cpython-36.pyc
│   │   ├── config.cpython-36.pyc
│   │   ├── d2_predictor.cpython-36.pyc
│   │   ├── dataset_mapper.cpython-36.pyc
│   │   ├── decoder.cpython-36.pyc
│   │   ├── encoder.cpython-36.pyc
│   │   ├── loss.cpython-36.pyc
│   │   ├── sparseinst.cpython-36.pyc
│   │   └── utils.cpython-36.pyc
│   ├── backbones
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── cspnet.cpython-36.pyc
│   │   │   ├── pvt.cpython-36.pyc
│   │   │   └── resnet.cpython-36.pyc
│   │   ├── cspnet.py
│   │   ├── pvt.py
│   │   └── resnet.py
│   ├── caffe2sparseinst.py
│   ├── coco_evaluation.py
│   ├── config.py
│   ├── d2_predictor.py
│   ├── dataset_mapper.py
│   ├── decoder.py
│   ├── encoder.py
│   ├── input.ppm
│   ├── loss.py
│   ├── sparseinst.py
│   └── utils.py
├── test.py
├── test_net.py
└── train_net.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Hust Visual Learning Team
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SparseInst_TensorRT
2 | **This repository implements the real-time instance segmentation algorithm SparseInst with TensorRT and ONNX.**
3 |
4 | ## Some remarks
5 | - The initial repository on which I built mine is the **hustvl/SparseInst** repository (https://github.com/hustvl/SparseInst.git); for additional information about the installation of SparseInst, refer to the original repository.
6 | - This project is built upon the excellent detectron2 framework, so you should install detectron2 first; please check the official installation guide for more details. (https://github.com/facebookresearch/detectron2.git)
7 | - For commands other than TensorRT and ONNX inference, please refer to the initial repository (e.g. test_net.py).
8 | - If you face any problem during parsing, don't hesitate to open an issue, or drop a :star: if there aren't any. _**If you have a compatibility problem, check the model weights uploaded in the table below and go directly to the testing section**_.
9 | - Be aware that, in order to parse the model to ONNX and TensorRT, the files sparseinst.py, encoder.py and decoder.py have been slightly modified; don't forget to check the modifications if you come from the initial repository.
10 |
11 |
12 | ## Prerequisites
13 |
15 |
16 | - Install PyTorch (1.10.0) and TorchVision (0.11.1)
17 | ```
18 | pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
19 |
20 | If another version of torch is needed, select yours by specifying torch==1.11.0+cu102, for example.
21 | ```
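A quick sanity check that the GPU build of PyTorch is actually usable (optional, purely for verification):
```python
import torch

print(torch.__version__)            # e.g. 1.10.0+cu102
print(torch.cuda.is_available())    # should print True once CUDA and the driver are set up
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```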
22 | - Install CUDA (10.2) and cuDNN (8.0.0) : https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=WSL-Ubuntu&target_version=2.0&target_type=deb_local
23 |
24 | - For WSL-Ubuntu :
25 | ```
26 | sudo wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin
27 | sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600
28 | sudo wget https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb
30 | sudo dpkg -i cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb
31 | sudo cp /var/cuda-repo-wsl-ubuntu-11-7-local/cuda-96193861-keyring.gpg /usr/share/keyrings/
32 | sudo apt-get update
33 | sudo apt-get -y install cuda
34 | ```
35 |
36 | - Install TensorRT (8.0.1.6); if you are using an NVIDIA edge device, TensorRT should already be installed.
37 | ```
38 | python3 -m pip install --upgrade setuptools pip
39 | python3 -m pip install nvidia-pyindex
40 | python3 -m pip install --upgrade nvidia-tensorrt
41 |
42 | Verify installation by writing : assert tensorrt.Builder(tensorrt.Logger())
43 | ```
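The verification mentioned above, written out as a small standalone script:
```python
import tensorrt as trt

# If TensorRT and its Python bindings are correctly installed,
# creating a Builder succeeds and the assert passes.
logger = trt.Logger(trt.Logger.WARNING)
assert trt.Builder(logger), "TensorRT is not correctly installed"
print("TensorRT version:", trt.__version__)
```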
44 | - Install ONNX and ONNXruntime
45 | ```
46 | pip install onnxruntime-gpu
47 | pip install onnxruntime
48 | pip install numpy protobuf==4.21.5
49 | pip install onnx
50 | ```
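To make sure ONNX and ONNX Runtime are importable and that the GPU execution provider is visible, a small check such as this can be used:
```python
import onnx
import onnxruntime as ort

print("onnx:", onnx.__version__)
print("onnxruntime:", ort.__version__)
# CUDAExecutionProvider should be listed when onnxruntime-gpu is installed correctly.
print("providers:", ort.get_available_providers())
```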
51 | - Install all the other packages needed to run the original SparseInst algorithm (this should already be done if you have installed Detectron2).
52 |
53 |
54 |
55 | ## Models and Results for TensorRT and ONNX inference script:
56 |
57 | The inference speed of PyTorch, ONNX and TensorRT has been compared and is shown in the table below. SparseInst running with TensorRT achieves roughly 3 times the inference speed of SparseInst running with PyTorch. Lowering the input size of the image can lead to a decent real-time speed.
58 | The TensorRT and ONNX models are built from the first PyTorch weights listed in the table below: SparseInst R-50 G-IAM.
59 |
60 | *Note: All the computations were done on an NVIDIA Jetson TX2 with JetPack 4.6. Further tests will be done on an NVIDIA RTX 2070.*
61 |
62 |
63 |
64 | | Model | Input Size | Inference Speed | Weights |
65 | | :--- | :---: | :---: | ---: |
66 | | Pytorch | 640 | 1.71 FPS | [model](https://drive.google.com/file/d/130gyxYT6r9j5Nwp5nCo_wthYPuTwa9c4/view?usp=sharing)|
67 | | TensorRT | 320 | 20.32 FPS |[model](https://drive.google.com/file/d/17-eBWVrpnwv0ueeDsEmAqSKlNh3If4AI/view?usp=sharing)|
68 | | TensorRT | 640 | 6.00 FPS |[model](https://drive.google.com/file/d/1Kh97LZNzsuBJTeDVXwRKx8CiX7CeMI3v/view?usp=sharing)|
69 | | ONNX | 320 | 0.22 FPS |[model](https://drive.google.com/file/d/1H6YH3YUPaA4vO3IyIGaZNAkGBsU9xHCH/view?usp=sharing)|
70 | | ONNX | 640 |0.03 FPS |[model](https://drive.google.com/file/d/1GEoQssyJ9MZRnEISiatF_tREpdGAnSjk/view?usp=sharing)|
71 |
72 |
73 |
74 |

79 |
80 |
81 |
82 | ## Building the ONNX model :
83 |
84 | To convert the model from PyTorch to ONNX, you need to run the following command. You can set the arguments to default, but please check that the config path and the model weights path are correctly set up.
85 | ```
86 |
87 | ```
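Once the export has finished, the resulting file can be sanity-checked independently of the repository. The snippet below is only an illustrative sketch: the output path and the 1x3x640x640 input shape are assumptions, so adapt them to the arguments you passed to convert_onnx.py.
```python
import numpy as np
import onnx
import onnxruntime as ort

onnx_path = "output/sparseinst.onnx"  # assumption: use the path produced by convert_onnx.py

# Structural check of the exported graph
onnx.checker.check_model(onnx.load(onnx_path))

# One dummy forward pass with ONNX Runtime (input name and shape are assumptions)
sess = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
outputs = sess.run(None, {input_name: dummy})
print([o.shape for o in outputs])
```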
88 |
89 | ## Building the TensorRT model :
90 |
91 | To build the TensorRT engine from the ONNX model, you need to run the following command. You can set the arguments to default. If you have any problem while parsing the model to TensorRT, don't hesitate to ask.
92 | ```
93 |
94 | ```
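After the engine has been built and serialized, it can be loaded back with the TensorRT Python API to make sure it deserializes correctly. This is only an illustrative sketch; the engine path is an assumption, so adapt it to the file produced by convert_tensorrt.py.
```python
import tensorrt as trt

engine_path = "output/sparseinst_640.trt"  # assumption: use the path produced by convert_tensorrt.py

logger = trt.Logger(trt.Logger.WARNING)
with open(engine_path, "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
assert engine is not None, "engine failed to deserialize"
print("bindings:", [engine.get_binding_name(i) for i in range(engine.num_bindings)])
```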
93 | 0, "Please specify a directory with args.output"
120 | out_filename = args.output
121 | visualized_output.save(out_filename)
122 | else:
123 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
124 | cv2.imshow(
125 | WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
126 | if cv2.waitKey(0) == 27:
127 | break # esc to quit
128 | elif args.webcam:
129 | assert args.input is None, "Cannot have both --input and --webcam!"
130 | assert args.output is None, "output not yet supported with --webcam!"
131 | cam = cv2.VideoCapture(0)
132 | for vis in tqdm.tqdm(demo.run_on_video(cam, args.confidence_threshold)):
133 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
134 | cv2.imshow(WINDOW_NAME, vis)
135 | if cv2.waitKey(1) == 27:
136 | break # esc to quit
137 | cam.release()
138 | cv2.destroyAllWindows()
139 | elif args.video_input:
140 | video = cv2.VideoCapture(args.video_input)
141 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
142 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
143 | frames_per_second = video.get(cv2.CAP_PROP_FPS)
144 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
145 | basename = os.path.basename(args.video_input)
146 |
147 | if args.output:
148 | if os.path.isdir(args.output):
149 | output_fname = os.path.join(args.output, basename)
150 | output_fname = os.path.splitext(output_fname)[0] + ".mkv"
151 | else:
152 | output_fname = args.output
153 | assert not os.path.isfile(output_fname), output_fname
154 | output_file = cv2.VideoWriter(
155 | filename=output_fname,
156 | # some installation of opencv may not support x264 (due to its license),
157 | # you can try other format (e.g. MPEG)
158 | fourcc=cv2.VideoWriter_fourcc(*"mp4v"),
159 | fps=float(frames_per_second),
160 | frameSize=(width, height),
161 | isColor=True,
162 | )
163 | assert os.path.isfile(args.video_input)
164 | for vis_frame in tqdm.tqdm(demo.run_on_video(video, args.confidence_threshold), total=num_frames):
165 | if args.output:
166 | output_file.write(vis_frame)
167 | else:
168 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
169 | cv2.imshow(basename, vis_frame)
170 | if cv2.waitKey(1) == 27:
171 | break # esc to quit
172 | video.release()
173 | if args.output:
174 | output_file.release()
175 | else:
176 | cv2.destroyAllWindows()
177 |
--------------------------------------------------------------------------------
/engine/__pycache__/defaults.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/engine/__pycache__/defaults.cpython-36.pyc
--------------------------------------------------------------------------------
/engine/defaults.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 |
4 | """
5 | This file contains components with some default boilerplate logic user may need
6 | in training / testing. They will not work for everyone, but many users may find them useful.
7 |
8 | The behavior of functions/classes in this file is subject to change,
9 | since they are meant to represent the "common default behavior" people need in their projects.
10 | """
11 | import numpy as np
12 | import argparse
13 | import logging
14 | import os
15 | import sys
16 | import weakref
17 | from collections import OrderedDict
18 | from typing import Optional
19 | import torch
20 | from fvcore.nn.precise_bn import get_bn_modules
21 | from omegaconf import OmegaConf
22 | from torch.nn.parallel import DistributedDataParallel
23 |
24 | import detectron2.data.transforms as T
25 | from detectron2.checkpoint import DetectionCheckpointer
26 | from detectron2.config import CfgNode, LazyConfig
27 | from detectron2.data import (
28 | MetadataCatalog,
29 | build_detection_test_loader,
30 | build_detection_train_loader,
31 | )
32 | from detectron2.evaluation import (
33 | DatasetEvaluator,
34 | inference_on_dataset,
35 | print_csv_format,
36 | verify_results,
37 | )
38 | from detectron2.modeling import build_model
39 | from detectron2.solver import build_lr_scheduler, build_optimizer
40 | from detectron2.utils import comm
41 | from detectron2.utils.collect_env import collect_env_info
42 | from detectron2.utils.env import seed_all_rng
43 | from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
44 | from detectron2.utils.file_io import PathManager
45 | from detectron2.utils.logger import setup_logger
46 |
47 |
48 |
49 | __all__ = [
50 | "DefaultPredictor",
51 | ]
52 |
53 | class DefaultPredictor:
54 |
55 |
56 | def __init__(self, cfg):
57 | self.cfg = cfg.clone() # cfg can be modified by model
58 | self.model = build_model(self.cfg)
59 | self.model.eval()
60 | if len(cfg.DATASETS.TEST):
61 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
62 |
63 | checkpointer = DetectionCheckpointer(self.model)
64 | checkpointer.load(cfg.MODEL.WEIGHTS)
65 |
66 | self.aug = T.ResizeShortestEdge(
67 | [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
68 | )
69 |
70 | self.input_format = cfg.INPUT.FORMAT
71 | assert self.input_format in ["RGB", "BGR"], self.input_format
72 |
73 | def __call__(self, original_image):
74 | """
75 | Args:
76 | original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
77 |
78 | Returns:
79 | predictions (dict):
80 | the output of the model for one image only.
81 | See :doc:`/tutorials/models` for details about the format.
82 | """
83 | with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258
84 | # Apply pre-processing to image.
85 | if self.input_format == "RGB":
86 | # whether the model expects BGR inputs or RGB
87 | original_image = original_image[:, :, ::-1]
88 | height, width = original_image.shape[:2]
89 | image = self.aug.get_transform(original_image).apply_image(original_image)
90 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
91 | print("shape of image", np.shape(image))
92 | print("image defaults", image)
93 | inputs = {"image": image, "height": height, "width": width}
94 | predictions = self.model([inputs])[0]
95 |
96 | return predictions
97 |
98 |
--------------------------------------------------------------------------------
/input/input_image/640x640.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/640x640.jpg
--------------------------------------------------------------------------------
/input/input_image/cup.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/cup.jpg
--------------------------------------------------------------------------------
/input/input_image/femme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/femme.jpg
--------------------------------------------------------------------------------
/input/input_image/homme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/homme.jpg
--------------------------------------------------------------------------------
/input/input_image/horses.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/horses.jpg
--------------------------------------------------------------------------------
/input/input_image/image1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/image1.jpg
--------------------------------------------------------------------------------
/input/input_image/input.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/input.jpg
--------------------------------------------------------------------------------
/input/input_image/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/results.png
--------------------------------------------------------------------------------
/input/input_image/skate.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/skate.jpg
--------------------------------------------------------------------------------
/input/input_image/turkish_coffee.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/turkish_coffee.jpg
--------------------------------------------------------------------------------
/onnx/__pycache__/image_processing.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/onnx/__pycache__/image_processing.cpython-36.pyc
--------------------------------------------------------------------------------
/output/mnist.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist.tar.gz
--------------------------------------------------------------------------------
/output/mnist/model.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/model.onnx
--------------------------------------------------------------------------------
/output/mnist/test_data_set_0/input_0.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_0/input_0.pb
--------------------------------------------------------------------------------
/output/mnist/test_data_set_0/output_0.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_0/output_0.pb
--------------------------------------------------------------------------------
/output/mnist/test_data_set_1/input_0.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_1/input_0.pb
--------------------------------------------------------------------------------
/output/mnist/test_data_set_1/output_0.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_1/output_0.pb
--------------------------------------------------------------------------------
/output/mnist/test_data_set_2/input_0.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_2/input_0.pb
--------------------------------------------------------------------------------
/output/mnist/test_data_set_2/output_0.pb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_2/output_0.pb
--------------------------------------------------------------------------------
/output/sparse_inst_r50_giam/config.yaml:
--------------------------------------------------------------------------------
1 | CUDNN_BENCHMARK: false
2 | DATALOADER:
3 | ASPECT_RATIO_GROUPING: true
4 | FILTER_EMPTY_ANNOTATIONS: true
5 | NUM_WORKERS: 4
6 | REPEAT_THRESHOLD: 0.0
7 | SAMPLER_TRAIN: TrainingSampler
8 | DATASETS:
9 | PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
10 | PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
11 | PROPOSAL_FILES_TEST: []
12 | PROPOSAL_FILES_TRAIN: []
13 | TEST:
14 | - coco_2017_val
15 | TRAIN:
16 | - coco_2017_train
17 | GLOBAL:
18 | HACK: 1.0
19 | INPUT:
20 | CROP:
21 | ENABLED: false
22 | SIZE:
23 | - 0.9
24 | - 0.9
25 | TYPE: relative_range
26 | FORMAT: RGB
27 | MASK_FORMAT: bitmask
28 | MAX_SIZE_TEST: 853
29 | MAX_SIZE_TRAIN: 853
30 | MIN_SIZE_TEST: 512
31 | MIN_SIZE_TRAIN:
32 | - 416
33 | - 448
34 | - 480
35 | - 512
36 | - 544
37 | - 576
38 | - 608
39 | - 640
40 | MIN_SIZE_TRAIN_SAMPLING: choice
41 | RANDOM_FLIP: horizontal
42 | MODEL:
43 | ANCHOR_GENERATOR:
44 | ANGLES:
45 | - - -90
46 | - 0
47 | - 90
48 | ASPECT_RATIOS:
49 | - - 0.5
50 | - 1.0
51 | - 2.0
52 | NAME: DefaultAnchorGenerator
53 | OFFSET: 0.0
54 | SIZES:
55 | - - 32
56 | - 64
57 | - 128
58 | - 256
59 | - 512
60 | BACKBONE:
61 | FREEZE_AT: 0
62 | NAME: build_resnet_backbone
63 | CSPNET:
64 | NAME: darknet53
65 | NORM: ''
66 | OUT_FEATURES:
67 | - csp1
68 | - csp2
69 | - csp3
70 | - csp4
71 | DEVICE: cuda
72 | FPN:
73 | FUSE_TYPE: sum
74 | IN_FEATURES: []
75 | NORM: ''
76 | OUT_CHANNELS: 256
77 | KEYPOINT_ON: false
78 | LOAD_PROPOSALS: false
79 | MASK_ON: true
80 | META_ARCHITECTURE: SparseInst
81 | PANOPTIC_FPN:
82 | COMBINE:
83 | ENABLED: true
84 | INSTANCES_CONFIDENCE_THRESH: 0.5
85 | OVERLAP_THRESH: 0.5
86 | STUFF_AREA_LIMIT: 4096
87 | INSTANCE_LOSS_WEIGHT: 1.0
88 | PIXEL_MEAN:
89 | - 123.675
90 | - 116.28
91 | - 103.53
92 | PIXEL_STD:
93 | - 58.395
94 | - 57.12
95 | - 57.375
96 | PROPOSAL_GENERATOR:
97 | MIN_SIZE: 0
98 | NAME: RPN
99 | PVT:
100 | LINEAR: false
101 | NAME: b1
102 | OUT_FEATURES:
103 | - p2
104 | - p3
105 | - p4
106 | RESNETS:
107 | DEFORM_MODULATED: false
108 | DEFORM_NUM_GROUPS: 1
109 | DEFORM_ON_PER_STAGE:
110 | - false
111 | - false
112 | - false
113 | - false
114 | DEPTH: 50
115 | NORM: FrozenBN
116 | NUM_GROUPS: 1
117 | OUT_FEATURES:
118 | - res3
119 | - res4
120 | - res5
121 | RES2_OUT_CHANNELS: 256
122 | RES5_DILATION: 1
123 | STEM_OUT_CHANNELS: 64
124 | STRIDE_IN_1X1: false
125 | WIDTH_PER_GROUP: 64
126 | RETINANET:
127 | BBOX_REG_LOSS_TYPE: smooth_l1
128 | BBOX_REG_WEIGHTS:
129 | - 1.0
130 | - 1.0
131 | - 1.0
132 | - 1.0
133 | FOCAL_LOSS_ALPHA: 0.25
134 | FOCAL_LOSS_GAMMA: 2.0
135 | IN_FEATURES:
136 | - p3
137 | - p4
138 | - p5
139 | - p6
140 | - p7
141 | IOU_LABELS:
142 | - 0
143 | - -1
144 | - 1
145 | IOU_THRESHOLDS:
146 | - 0.4
147 | - 0.5
148 | NMS_THRESH_TEST: 0.5
149 | NORM: ''
150 | NUM_CLASSES: 80
151 | NUM_CONVS: 4
152 | PRIOR_PROB: 0.01
153 | SCORE_THRESH_TEST: 0.05
154 | SMOOTH_L1_LOSS_BETA: 0.1
155 | TOPK_CANDIDATES_TEST: 1000
156 | ROI_BOX_CASCADE_HEAD:
157 | BBOX_REG_WEIGHTS:
158 | - - 10.0
159 | - 10.0
160 | - 5.0
161 | - 5.0
162 | - - 20.0
163 | - 20.0
164 | - 10.0
165 | - 10.0
166 | - - 30.0
167 | - 30.0
168 | - 15.0
169 | - 15.0
170 | IOUS:
171 | - 0.5
172 | - 0.6
173 | - 0.7
174 | ROI_BOX_HEAD:
175 | BBOX_REG_LOSS_TYPE: smooth_l1
176 | BBOX_REG_LOSS_WEIGHT: 1.0
177 | BBOX_REG_WEIGHTS:
178 | - 10.0
179 | - 10.0
180 | - 5.0
181 | - 5.0
182 | CLS_AGNOSTIC_BBOX_REG: false
183 | CONV_DIM: 256
184 | FC_DIM: 1024
185 | NAME: ''
186 | NORM: ''
187 | NUM_CONV: 0
188 | NUM_FC: 0
189 | POOLER_RESOLUTION: 14
190 | POOLER_SAMPLING_RATIO: 0
191 | POOLER_TYPE: ROIAlignV2
192 | SMOOTH_L1_BETA: 0.0
193 | TRAIN_ON_PRED_BOXES: false
194 | ROI_HEADS:
195 | BATCH_SIZE_PER_IMAGE: 512
196 | IN_FEATURES:
197 | - res4
198 | IOU_LABELS:
199 | - 0
200 | - 1
201 | IOU_THRESHOLDS:
202 | - 0.5
203 | NAME: Res5ROIHeads
204 | NMS_THRESH_TEST: 0.5
205 | NUM_CLASSES: 80
206 | POSITIVE_FRACTION: 0.25
207 | PROPOSAL_APPEND_GT: true
208 | SCORE_THRESH_TEST: 0.05
209 | ROI_KEYPOINT_HEAD:
210 | CONV_DIMS:
211 | - 512
212 | - 512
213 | - 512
214 | - 512
215 | - 512
216 | - 512
217 | - 512
218 | - 512
219 | LOSS_WEIGHT: 1.0
220 | MIN_KEYPOINTS_PER_IMAGE: 1
221 | NAME: KRCNNConvDeconvUpsampleHead
222 | NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
223 | NUM_KEYPOINTS: 17
224 | POOLER_RESOLUTION: 14
225 | POOLER_SAMPLING_RATIO: 0
226 | POOLER_TYPE: ROIAlignV2
227 | ROI_MASK_HEAD:
228 | CLS_AGNOSTIC_MASK: false
229 | CONV_DIM: 256
230 | NAME: MaskRCNNConvUpsampleHead
231 | NORM: ''
232 | NUM_CONV: 0
233 | POOLER_RESOLUTION: 14
234 | POOLER_SAMPLING_RATIO: 0
235 | POOLER_TYPE: ROIAlignV2
236 | RPN:
237 | BATCH_SIZE_PER_IMAGE: 256
238 | BBOX_REG_LOSS_TYPE: smooth_l1
239 | BBOX_REG_LOSS_WEIGHT: 1.0
240 | BBOX_REG_WEIGHTS:
241 | - 1.0
242 | - 1.0
243 | - 1.0
244 | - 1.0
245 | BOUNDARY_THRESH: -1
246 | CONV_DIMS:
247 | - -1
248 | HEAD_NAME: StandardRPNHead
249 | IN_FEATURES:
250 | - res4
251 | IOU_LABELS:
252 | - 0
253 | - -1
254 | - 1
255 | IOU_THRESHOLDS:
256 | - 0.3
257 | - 0.7
258 | LOSS_WEIGHT: 1.0
259 | NMS_THRESH: 0.7
260 | POSITIVE_FRACTION: 0.5
261 | POST_NMS_TOPK_TEST: 1000
262 | POST_NMS_TOPK_TRAIN: 2000
263 | PRE_NMS_TOPK_TEST: 6000
264 | PRE_NMS_TOPK_TRAIN: 12000
265 | SMOOTH_L1_BETA: 0.0
266 | SEM_SEG_HEAD:
267 | COMMON_STRIDE: 4
268 | CONVS_DIM: 128
269 | IGNORE_VALUE: 255
270 | IN_FEATURES:
271 | - p2
272 | - p3
273 | - p4
274 | - p5
275 | LOSS_WEIGHT: 1.0
276 | NAME: SemSegFPNHead
277 | NORM: GN
278 | NUM_CLASSES: 54
279 | SPARSE_INST:
280 | CLS_THRESHOLD: 0.005
281 | DATASET_MAPPER: SparseInstDatasetMapper
282 | DECODER:
283 | GROUPS: 4
284 | INST:
285 | CONVS: 4
286 | DIM: 256
287 | KERNEL_DIM: 128
288 | MASK:
289 | CONVS: 4
290 | DIM: 256
291 | NAME: GroupIAMDecoder
292 | NUM_CLASSES: 80
293 | NUM_MASKS: 100
294 | OUTPUT_IAM: false
295 | SCALE_FACTOR: 2.0
296 | ENCODER:
297 | IN_FEATURES:
298 | - res3
299 | - res4
300 | - res5
301 | NAME: InstanceContextEncoder
302 | NORM: ''
303 | NUM_CHANNELS: 256
304 | LOSS:
305 | CLASS_WEIGHT: 2.0
306 | ITEMS:
307 | - labels
308 | - masks
309 | MASK_DICE_WEIGHT: 2.0
310 | MASK_PIXEL_WEIGHT: 5.0
311 | NAME: SparseInstCriterion
312 | OBJECTNESS_WEIGHT: 1.0
313 | MASK_THRESHOLD: 0.45
314 | MATCHER:
315 | ALPHA: 0.8
316 | BETA: 0.2
317 | NAME: SparseInstMatcher
318 | MAX_DETECTIONS: 100
319 | WEIGHTS: sparse_inst_r50_giam_aug_2b7d68.pth
320 | OUTPUT_DIR: output/sparse_inst_r50_giam
321 | SEED: -1
322 | SOLVER:
323 | AMP:
324 | ENABLED: false
325 | AMSGRAD: false
326 | BACKBONE_MULTIPLIER: 1.0
327 | BASE_LR: 5.0e-05
328 | BIAS_LR_FACTOR: 1.0
329 | CHECKPOINT_PERIOD: 5000
330 | CLIP_GRADIENTS:
331 | CLIP_TYPE: value
332 | CLIP_VALUE: 1.0
333 | ENABLED: false
334 | NORM_TYPE: 2.0
335 | GAMMA: 0.1
336 | IMS_PER_BATCH: 64
337 | LR_SCHEDULER_NAME: WarmupMultiStepLR
338 | MAX_ITER: 270000
339 | MOMENTUM: 0.9
340 | NESTEROV: false
341 | OPTIMIZER: ADAMW
342 | REFERENCE_WORLD_SIZE: 0
343 | STEPS:
344 | - 210000
345 | - 250000
346 | WARMUP_FACTOR: 0.001
347 | WARMUP_ITERS: 1000
348 | WARMUP_METHOD: linear
349 | WEIGHT_DECAY: 0.05
350 | WEIGHT_DECAY_BIAS: null
351 | WEIGHT_DECAY_NORM: 0.0
352 | TEST:
353 | AUG:
354 | ENABLED: false
355 | FLIP: true
356 | MAX_SIZE: 4000
357 | MIN_SIZES:
358 | - 400
359 | - 500
360 | - 600
361 | - 700
362 | - 800
363 | - 900
364 | - 1000
365 | - 1100
366 | - 1200
367 | DETECTIONS_PER_IMAGE: 100
368 | EVAL_PERIOD: 7330
369 | EXPECTED_RESULTS: []
370 | KEYPOINT_OKS_SIGMAS: []
371 | PRECISE_BN:
372 | ENABLED: false
373 | NUM_ITER: 200
374 | VERSION: 2
375 | VIS_PERIOD: 0
376 |
--------------------------------------------------------------------------------
/results/640_result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/640_result.jpg
--------------------------------------------------------------------------------
/results/result_onnx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/result_onnx.png
--------------------------------------------------------------------------------
/results/result_tensorrt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/result_tensorrt.png
--------------------------------------------------------------------------------
/sparseinst/__init__.py:
--------------------------------------------------------------------------------
1 | from .sparseinst import SparseInst
2 | from .encoder import build_sparse_inst_encoder
3 | from .decoder import build_sparse_inst_decoder
4 | from .config import add_sparse_inst_config
5 | from .loss import build_sparse_inst_criterion
6 | from .dataset_mapper import SparseInstDatasetMapper
7 | from .coco_evaluation import COCOMaskEvaluator
8 | from .backbones import build_resnet_vd_backbone, build_pyramid_vision_transformer
9 | from .d2_predictor import VisualizationDemo
10 |
--------------------------------------------------------------------------------
/sparseinst/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/caffe2sparseinst.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/caffe2sparseinst.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/coco_evaluation.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/coco_evaluation.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/d2_predictor.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/d2_predictor.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/dataset_mapper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/dataset_mapper.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/decoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/decoder.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/encoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/encoder.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/loss.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/loss.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/sparseinst.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/sparseinst.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | from .resnet import build_resnet_vd_backbone
2 | from .pvt import build_pyramid_vision_transformer
3 | from .cspnet import build_cspnet_backbone
--------------------------------------------------------------------------------
/sparseinst/backbones/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/backbones/__pycache__/cspnet.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/cspnet.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/backbones/__pycache__/pvt.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/pvt.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/backbones/__pycache__/resnet.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/resnet.cpython-36.pyc
--------------------------------------------------------------------------------
/sparseinst/backbones/cspnet.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from timm.models.layers import ConvBnAct, DropPath, AvgPool2dSame, create_attn
7 |
8 |
9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d
10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY
11 |
12 |
13 | model_cfgs = dict(
14 | cspresnet50=dict(
15 | stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'),
16 | stage=dict(
17 | out_chs=(128, 256, 512, 1024),
18 | depth=(3, 3, 5, 2),
19 | stride=(1,) + (2,) * 3,
20 | exp_ratio=(2.,) * 4,
21 | bottle_ratio=(0.5,) * 4,
22 | block_ratio=(1.,) * 4,
23 | cross_linear=True,
24 | )
25 | ),
26 | cspresnet50d=dict(
27 | stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'),
28 | stage=dict(
29 | out_chs=(128, 256, 512, 1024),
30 | depth=(3, 3, 5, 2),
31 | stride=(1,) + (2,) * 3,
32 | exp_ratio=(2.,) * 4,
33 | bottle_ratio=(0.5,) * 4,
34 | block_ratio=(1.,) * 4,
35 | cross_linear=True,
36 | )
37 | ),
38 | cspresnet50w=dict(
39 | stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'),
40 | stage=dict(
41 | out_chs=(256, 512, 1024, 2048),
42 | depth=(3, 3, 5, 2),
43 | stride=(1,) + (2,) * 3,
44 | exp_ratio=(1.,) * 4,
45 | bottle_ratio=(0.25,) * 4,
46 | block_ratio=(0.5,) * 4,
47 | cross_linear=True,
48 | )
49 | ),
50 | cspresnext50=dict(
51 | stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'),
52 | stage=dict(
53 | out_chs=(256, 512, 1024, 2048),
54 | depth=(3, 3, 5, 2),
55 | stride=(1,) + (2,) * 3,
56 | groups=(32,) * 4,
57 | exp_ratio=(1.,) * 4,
58 | bottle_ratio=(1.,) * 4,
59 | block_ratio=(0.5,) * 4,
60 | cross_linear=True,
61 | )
62 | ),
63 | cspdarknet53=dict(
64 | stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''),
65 | stage=dict(
66 | out_chs=(64, 128, 256, 512, 1024),
67 | depth=(1, 2, 8, 8, 4),
68 | stride=(2,) * 5,
69 | exp_ratio=(2.,) + (1.,) * 4,
70 | bottle_ratio=(0.5,) + (1.0,) * 4,
71 | block_ratio=(1.,) + (0.5,) * 4,
72 | down_growth=True,
73 | )
74 | ),
75 | darknet53=dict(
76 | stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''),
77 | stage=dict(
78 | out_chs=(64, 128, 256, 512, 1024),
79 | depth=(1, 2, 8, 8, 4),
80 | stride=(2,) * 5,
81 | bottle_ratio=(0.5,) * 5,
82 | block_ratio=(1.,) * 5,
83 | )
84 | )
85 | )
86 |
87 |
88 | def create_stem(
89 | in_chans=3, out_chs=32, kernel_size=3, stride=2, pool='',
90 | act_layer=None, norm_layer=None, aa_layer=None):
91 | stem = nn.Sequential()
92 | if not isinstance(out_chs, (tuple, list)):
93 | out_chs = [out_chs]
94 | assert len(out_chs)
95 | in_c = in_chans
96 | for i, out_c in enumerate(out_chs):
97 | conv_name = f'conv{i + 1}'
98 | stem.add_module(conv_name, ConvBnAct(
99 | in_c, out_c, kernel_size, stride=stride if i == 0 else 1,
100 | act_layer=act_layer, norm_layer=norm_layer))
101 | in_c = out_c
102 | last_conv = conv_name
103 | if pool:
104 | if aa_layer is not None:
105 | stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
106 | stem.add_module('aa', aa_layer(channels=in_c, stride=2))
107 | else:
108 | stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
109 | return stem, dict(num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv]))
110 |
111 |
112 | class ResBottleneck(nn.Module):
113 | """ ResNe(X)t Bottleneck Block
114 | """
115 |
116 | def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.25, groups=1,
117 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_last=False,
118 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
119 | super(ResBottleneck, self).__init__()
120 | mid_chs = int(round(out_chs * bottle_ratio))
121 | ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer,
122 | aa_layer=aa_layer, drop_block=drop_block)
123 |
124 | self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
125 | self.conv2 = ConvBnAct(mid_chs, mid_chs, kernel_size=3,
126 | dilation=dilation, groups=groups, **ckwargs)
127 | self.attn2 = create_attn(attn_layer, channels=mid_chs) if not attn_last else None
128 | self.conv3 = ConvBnAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs)
129 | self.attn3 = create_attn(attn_layer, channels=out_chs) if attn_last else None
130 | self.drop_path = drop_path
131 | self.act3 = act_layer(inplace=True)
132 |
133 | def zero_init_last_bn(self):
134 | nn.init.zeros_(self.conv3.bn.weight)
135 |
136 | def forward(self, x):
137 | shortcut = x
138 | x = self.conv1(x)
139 | x = self.conv2(x)
140 | if self.attn2 is not None:
141 | x = self.attn2(x)
142 | x = self.conv3(x)
143 | if self.attn3 is not None:
144 | x = self.attn3(x)
145 | if self.drop_path is not None:
146 | x = self.drop_path(x)
147 | x = x + shortcut
148 | # FIXME partial shortcut needed if first block handled as per original, not used for my current impl
149 | #x[:, :shortcut.size(1)] += shortcut
150 | x = self.act3(x)
151 | return x
152 |
153 |
154 | class DarkBlock(nn.Module):
155 | """ DarkNet Block
156 | """
157 |
158 | def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.5, groups=1,
159 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None,
160 | drop_block=None, drop_path=None):
161 | super(DarkBlock, self).__init__()
162 | mid_chs = int(round(out_chs * bottle_ratio))
163 | ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer,
164 | aa_layer=aa_layer, drop_block=drop_block)
165 | self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
166 | self.conv2 = ConvBnAct(mid_chs, out_chs, kernel_size=3,
167 | dilation=dilation, groups=groups, **ckwargs)
168 | self.attn = create_attn(attn_layer, channels=out_chs)
169 | self.drop_path = drop_path
170 |
171 | def zero_init_last_bn(self):
172 | nn.init.zeros_(self.conv2.bn.weight)
173 |
174 | def forward(self, x):
175 | shortcut = x
176 | x = self.conv1(x)
177 | x = self.conv2(x)
178 | if self.attn is not None:
179 | x = self.attn(x)
180 | if self.drop_path is not None:
181 | x = self.drop_path(x)
182 | x = x + shortcut
183 | return x
184 |
185 |
186 | class CrossStage(nn.Module):
187 | """Cross Stage."""
188 |
189 | def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., exp_ratio=1.,
190 | groups=1, first_dilation=None, down_growth=False, cross_linear=False, block_dpr=None,
191 | block_fn=ResBottleneck, **block_kwargs):
192 | super(CrossStage, self).__init__()
193 | first_dilation = first_dilation or dilation
194 | down_chs = out_chs if down_growth else in_chs # grow downsample channels to output channels
195 | exp_chs = int(round(out_chs * exp_ratio))
196 | block_out_chs = int(round(out_chs * block_ratio))
197 | conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'),
198 | norm_layer=block_kwargs.get('norm_layer'))
199 |
200 | if stride != 1 or first_dilation != dilation:
201 | self.conv_down = ConvBnAct(
202 | in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
203 | aa_layer=block_kwargs.get('aa_layer', None), **conv_kwargs)
204 | prev_chs = down_chs
205 | else:
206 | self.conv_down = None
207 | prev_chs = in_chs
208 |
209 | # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also,
210 | # there is also special case for the first stage for some of the model that results in uneven split
211 | # across the two paths. I did it this way for simplicity for now.
212 | self.conv_exp = ConvBnAct(prev_chs, exp_chs, kernel_size=1,
213 | apply_act=not cross_linear, **conv_kwargs)
214 | prev_chs = exp_chs // 2 # output of conv_exp is always split in two
215 |
216 | self.blocks = nn.Sequential()
217 | for i in range(depth):
218 | drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
219 | self.blocks.add_module(str(i), block_fn(
220 | prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
221 | prev_chs = block_out_chs
222 |
223 | # transition convs
224 | self.conv_transition_b = ConvBnAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs)
225 | self.conv_transition = ConvBnAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)
226 |
227 | def forward(self, x):
228 | if self.conv_down is not None:
229 | x = self.conv_down(x)
230 | x = self.conv_exp(x)
231 | split = x.shape[1] // 2
232 | xs, xb = x[:, :split], x[:, split:]
233 | xb = self.blocks(xb)
234 | xb = self.conv_transition_b(xb).contiguous()
235 | out = self.conv_transition(torch.cat([xs, xb], dim=1))
236 | return out
237 |
238 |
239 | class DarkStage(nn.Module):
240 | """DarkNet stage."""
241 |
242 | def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., groups=1,
243 | first_dilation=None, block_fn=ResBottleneck, block_dpr=None, **block_kwargs):
244 | super(DarkStage, self).__init__()
245 | first_dilation = first_dilation or dilation
246 |
247 | self.conv_down = ConvBnAct(
248 | in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
249 | act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'),
250 | aa_layer=block_kwargs.get('aa_layer', None))
251 |
252 | prev_chs = out_chs
253 | block_out_chs = int(round(out_chs * block_ratio))
254 | self.blocks = nn.Sequential()
255 | for i in range(depth):
256 | drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
257 | self.blocks.add_module(str(i), block_fn(
258 | prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
259 | prev_chs = block_out_chs
260 |
261 | def forward(self, x):
262 | x = self.conv_down(x)
263 | x = self.blocks(x)
264 | return x
265 |
266 |
267 | def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32, drop_path_rate=0.):
268 | # get per stage args for stage and containing blocks, calculate strides to meet target output_stride
269 | num_stages = len(cfg['depth'])
270 | if 'groups' not in cfg:
271 | cfg['groups'] = (1,) * num_stages
272 | if 'down_growth' in cfg and not isinstance(cfg['down_growth'], (list, tuple)):
273 | cfg['down_growth'] = (cfg['down_growth'],) * num_stages
274 | if 'cross_linear' in cfg and not isinstance(cfg['cross_linear'], (list, tuple)):
275 | cfg['cross_linear'] = (cfg['cross_linear'],) * num_stages
276 | cfg['block_dpr'] = [None] * num_stages if not drop_path_rate else \
277 | [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg['depth'])).split(cfg['depth'])]
278 | stage_strides = []
279 | stage_dilations = []
280 | stage_first_dilations = []
281 | dilation = 1
282 | for cfg_stride in cfg['stride']:
283 | stage_first_dilations.append(dilation)
284 | if curr_stride >= output_stride:
285 | dilation *= cfg_stride
286 | stride = 1
287 | else:
288 | stride = cfg_stride
289 | curr_stride *= stride
290 | stage_strides.append(stride)
291 | stage_dilations.append(dilation)
292 | cfg['stride'] = stage_strides
293 | cfg['dilation'] = stage_dilations
294 | cfg['first_dilation'] = stage_first_dilations
295 | stage_args = [dict(zip(cfg.keys(), values)) for values in zip(*cfg.values())]
296 | return stage_args
297 |
298 |
299 | class CSPNet(Backbone):
300 | """Cross Stage Partial base model.
301 |
302 | Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
303 | Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks
304 |
305 | NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the
306 | darknet impl. I did it this way for simplicity and less special cases.
307 | """
308 |
309 | def __init__(self, cfg, in_chans=3, output_stride=32, global_pool='avg', drop_rate=0.,
310 | act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_path_rate=0.,
311 | zero_init_last_bn=True, stage_fn=CrossStage, block_fn=ResBottleneck, out_features=None):
312 | super().__init__()
313 | self.drop_rate = drop_rate
314 | assert output_stride in (8, 16, 32)
315 | layer_args = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer)
316 |
317 | # Construct the stem
318 | self.stem, stem_feat_info = create_stem(in_chans, **cfg['stem'], **layer_args)
319 | self.feature_info = [stem_feat_info]
320 | prev_chs = stem_feat_info['num_chs']
321 | curr_stride = stem_feat_info['reduction'] # reduction does not include pool
322 | if cfg['stem']['pool']:
323 | curr_stride *= 2
324 |
325 | # Construct the stages
326 | per_stage_args = _cfg_to_stage_args(
327 | cfg['stage'], curr_stride=curr_stride, output_stride=output_stride, drop_path_rate=drop_path_rate)
328 | self.stages = nn.Sequential()
329 | out_channels = []
330 | out_strides = []
331 | for i, sa in enumerate(per_stage_args):
332 | self.stages.add_module(
333 | str(i), stage_fn(prev_chs, **sa, **layer_args, block_fn=block_fn))
334 | prev_chs = sa['out_chs']
335 | curr_stride *= sa['stride']
336 | self.feature_info += [dict(num_chs=prev_chs,
337 | reduction=curr_stride, module=f'stages.{i}')]
338 | out_channels.append(prev_chs)
339 | out_strides.append(curr_stride)
340 |
341 | for m in self.modules():
342 | if isinstance(m, nn.Conv2d):
343 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
344 | elif isinstance(m, nn.BatchNorm2d):
345 | nn.init.ones_(m.weight)
346 | nn.init.zeros_(m.bias)
347 | elif isinstance(m, nn.Linear):
348 | nn.init.normal_(m.weight, mean=0.0, std=0.01)
349 | nn.init.zeros_(m.bias)
350 | if zero_init_last_bn:
351 | for m in self.modules():
352 | if hasattr(m, 'zero_init_last_bn'):
353 | m.zero_init_last_bn()
354 |
355 | # cspdarknet: csp1, csp2, csp3, csp4
356 | # cspresnet: csp0, csp1, csp2, csp3
357 | out_features_names = ["csp{}".format(i) for i in range(len(per_stage_args))]
358 | self._out_feature_strides = dict(zip(out_features_names, out_strides))
359 | self._out_feature_channels = dict(zip(out_features_names, out_channels))
360 | if out_features is None:
361 | self._out_features = out_features_names
362 | else:
363 | self._out_features = out_features
364 |
365 | def output_shape(self):
366 | return {
367 | name: ShapeSpec(
368 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
369 | )
370 | for name in self._out_features
371 | }
372 |
373 | def size_divisibility(self):
374 | return 32
375 |
376 | def forward(self, x):
377 | x = self.stem(x)
378 | outputs = {}
379 | for i, stage in enumerate(self.stages):
380 | name = f"csp{i}"
381 | x = stage(x)
382 | if name in self._out_features:
383 | outputs[name] = x
384 | return outputs
385 |
386 |
387 | @BACKBONE_REGISTRY.register()
388 | def build_cspnet_backbone(cfg, input_shape=None):
389 |
390 | cspnet_name = cfg.MODEL.CSPNET.NAME
391 | norm_name = cfg.MODEL.CSPNET.NORM
392 | out_features = cfg.MODEL.CSPNET.OUT_FEATURES
393 | # DarkNet53 doesn't have batch norm
394 | if norm_name == "FrozenBN":
395 | norm = FrozenBatchNorm2d
396 | elif norm_name == "SyncBN":
397 | from detectron2.layers import NaiveSyncBatchNorm
398 | norm = NaiveSyncBatchNorm
399 | else:
400 | norm = nn.BatchNorm2d
401 |
402 | assert cspnet_name in ["cspresnet50", "cspresnet50d", "cspresnet50w",
403 | "cspresnext50", "cspdarknet53", "darknet53"]
404 |
405 | model_cfg = model_cfgs[cspnet_name]
406 |
407 | if "darknet" in cspnet_name:
408 | block_fn = DarkBlock
409 | else:
410 | block_fn = ResBottleneck
411 |
412 | if cspnet_name == "darknet53":
413 | stage_fn = DarkStage
414 | else:
415 | stage_fn = CrossStage
416 |
417 | model = CSPNet(
418 | model_cfg,
419 | in_chans=input_shape.channels,
420 | norm_layer=norm,
421 | stage_fn=stage_fn,
422 | block_fn=block_fn,
423 | out_features=out_features)
424 | return model
425 |
--------------------------------------------------------------------------------
/sparseinst/backbones/pvt.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from functools import partial
6 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_
7 | from detectron2.layers import ShapeSpec
8 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY
9 |
10 |
11 | class Mlp(nn.Module):
12 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
13 | super().__init__()
14 | out_features = out_features or in_features
15 | hidden_features = hidden_features or in_features
16 | self.fc1 = nn.Linear(in_features, hidden_features)
17 | self.dwconv = DWConv(hidden_features)
18 | self.act = act_layer()
19 | self.fc2 = nn.Linear(hidden_features, out_features)
20 | self.drop = nn.Dropout(drop)
21 | self.linear = linear
22 | if self.linear:
23 | self.relu = nn.ReLU(inplace=True)
24 | self.apply(self._init_weights)
25 |
26 | def _init_weights(self, m):
27 | if isinstance(m, nn.Linear):
28 | trunc_normal_(m.weight, std=.02)
29 | if isinstance(m, nn.Linear) and m.bias is not None:
30 | nn.init.constant_(m.bias, 0)
31 | elif isinstance(m, nn.LayerNorm):
32 | nn.init.constant_(m.bias, 0)
33 | nn.init.constant_(m.weight, 1.0)
34 | elif isinstance(m, nn.Conv2d):
35 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
36 | fan_out //= m.groups
37 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
38 | if m.bias is not None:
39 | m.bias.data.zero_()
40 |
41 | def forward(self, x, H, W):
42 | x = self.fc1(x)
43 | if self.linear:
44 | x = self.relu(x)
45 | x = self.dwconv(x, H, W)
46 | x = self.act(x)
47 | x = self.drop(x)
48 | x = self.fc2(x)
49 | x = self.drop(x)
50 | return x
51 |
52 |
53 | class Attention(nn.Module):
54 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False):
55 | super().__init__()
56 | assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
57 |
58 | self.dim = dim
59 | self.num_heads = num_heads
60 | head_dim = dim // num_heads
61 | self.scale = qk_scale or head_dim ** -0.5
62 |
63 | self.q = nn.Linear(dim, dim, bias=qkv_bias)
64 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
65 | self.attn_drop = nn.Dropout(attn_drop)
66 | self.proj = nn.Linear(dim, dim)
67 | self.proj_drop = nn.Dropout(proj_drop)
68 |
69 | self.linear = linear
70 | self.sr_ratio = sr_ratio
71 | if not linear:
72 | if sr_ratio > 1:
73 | self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
74 | self.norm = nn.LayerNorm(dim)
75 | else:
76 | self.pool = nn.AdaptiveAvgPool2d(7)
77 | self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
78 | self.norm = nn.LayerNorm(dim)
79 | self.act = nn.GELU()
80 | self.apply(self._init_weights)
81 |
82 | def _init_weights(self, m):
83 | if isinstance(m, nn.Linear):
84 | trunc_normal_(m.weight, std=.02)
85 | if isinstance(m, nn.Linear) and m.bias is not None:
86 | nn.init.constant_(m.bias, 0)
87 | elif isinstance(m, nn.LayerNorm):
88 | nn.init.constant_(m.bias, 0)
89 | nn.init.constant_(m.weight, 1.0)
90 | elif isinstance(m, nn.Conv2d):
91 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
92 | fan_out //= m.groups
93 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
94 | if m.bias is not None:
95 | m.bias.data.zero_()
96 |
97 | def forward(self, x, H, W):
98 | B, N, C = x.shape
99 | q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
100 |
101 | if not self.linear:
102 | if self.sr_ratio > 1:
103 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
104 | x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
105 | x_ = self.norm(x_)
106 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
107 | else:
108 | kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
109 | else:
110 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
111 | x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1)
112 | x_ = self.norm(x_)
113 | x_ = self.act(x_)
114 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
115 | k, v = kv[0], kv[1]
116 |
117 | attn = (q @ k.transpose(-2, -1)) * self.scale
118 | attn = attn.softmax(dim=-1)
119 | attn = self.attn_drop(attn)
120 |
121 | x = (attn @ v).transpose(1, 2).reshape(B, N, C)
122 | x = self.proj(x)
123 | x = self.proj_drop(x)
124 |
125 | return x
126 |
127 |
128 | class Block(nn.Module):
129 |
130 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
131 | drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
132 | super().__init__()
133 | self.norm1 = norm_layer(dim)
134 | self.attn = Attention(
135 | dim,
136 | num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
137 | attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
138 | # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
139 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
140 | self.norm2 = norm_layer(dim)
141 | mlp_hidden_dim = int(dim * mlp_ratio)
142 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear)
143 |
144 | self.apply(self._init_weights)
145 |
146 | def _init_weights(self, m):
147 | if isinstance(m, nn.Linear):
148 | trunc_normal_(m.weight, std=.02)
149 | if isinstance(m, nn.Linear) and m.bias is not None:
150 | nn.init.constant_(m.bias, 0)
151 | elif isinstance(m, nn.LayerNorm):
152 | nn.init.constant_(m.bias, 0)
153 | nn.init.constant_(m.weight, 1.0)
154 | elif isinstance(m, nn.Conv2d):
155 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
156 | fan_out //= m.groups
157 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
158 | if m.bias is not None:
159 | m.bias.data.zero_()
160 |
161 | def forward(self, x, H, W):
162 | x = x + self.drop_path(self.attn(self.norm1(x), H, W))
163 | x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
164 |
165 | return x
166 |
167 |
168 | class OverlapPatchEmbed(nn.Module):
169 | """ Image to Patch Embedding
170 | """
171 |
172 | def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
173 | super().__init__()
174 | img_size = to_2tuple(img_size)
175 | patch_size = to_2tuple(patch_size)
176 |
177 | self.img_size = img_size
178 | self.patch_size = patch_size
179 | self.H, self.W = img_size[0] // stride, img_size[1] // stride
180 | self.num_patches = self.H * self.W
181 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
182 | padding=(patch_size[0] // 2, patch_size[1] // 2))
183 | self.norm = nn.LayerNorm(embed_dim)
184 |
185 | self.apply(self._init_weights)
186 |
187 | def _init_weights(self, m):
188 | if isinstance(m, nn.Linear):
189 | trunc_normal_(m.weight, std=.02)
190 | if isinstance(m, nn.Linear) and m.bias is not None:
191 | nn.init.constant_(m.bias, 0)
192 | elif isinstance(m, nn.LayerNorm):
193 | nn.init.constant_(m.bias, 0)
194 | nn.init.constant_(m.weight, 1.0)
195 | elif isinstance(m, nn.Conv2d):
196 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
197 | fan_out //= m.groups
198 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
199 | if m.bias is not None:
200 | m.bias.data.zero_()
201 |
202 | def forward(self, x):
203 | x = self.proj(x)
204 | _, _, H, W = x.shape
205 | x = x.flatten(2).transpose(1, 2)
206 | x = self.norm(x)
207 |
208 | return x, H, W
209 |
210 |
211 | class PyramidVisionTransformerV2(Backbone):
212 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dims=[64, 128, 256, 512],
213 | num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
214 | attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, depths=[3, 4, 6, 3],
215 | sr_ratios=[8, 4, 2, 1], num_stages=4, linear=False, out_features=None):
216 | super().__init__()
217 | self.depths = depths
218 | self.num_stages = num_stages
219 | self.linear = linear
220 |
221 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
222 | cur = 0
223 |
224 | for i in range(num_stages):
225 | patch_embed = OverlapPatchEmbed(img_size=img_size if i == 0 else img_size // (2 ** (i + 1)),
226 | patch_size=7 if i == 0 else 3,
227 | stride=4 if i == 0 else 2,
228 | in_chans=in_chans if i == 0 else embed_dims[i - 1],
229 | embed_dim=embed_dims[i])
230 |
231 | block = nn.ModuleList([Block(
232 | dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
233 | qk_scale=qk_scale,
234 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer,
235 | sr_ratio=sr_ratios[i], linear=linear)
236 | for j in range(depths[i])])
237 | norm = norm_layer(embed_dims[i])
238 | cur += depths[i]
239 |
240 | setattr(self, f"patch_embed{i + 1}", patch_embed)
241 | setattr(self, f"block{i + 1}", block)
242 | setattr(self, f"norm{i + 1}", norm)
243 |
244 | out_features_names = ["p1", "p2", "p3", "p4"]
245 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32]))
246 | self._out_feature_channels = dict(zip(out_features_names, embed_dims))
247 | if out_features is None:
248 | self._out_features = out_features_names
249 | else:
250 | self._out_features = out_features
251 | self.out_features_names = out_features_names
252 | self.apply(self._init_weights)
253 |
254 | def _init_weights(self, m):
255 | if isinstance(m, nn.Linear):
256 | trunc_normal_(m.weight, std=.02)
257 | if isinstance(m, nn.Linear) and m.bias is not None:
258 | nn.init.constant_(m.bias, 0)
259 | elif isinstance(m, nn.LayerNorm):
260 | nn.init.constant_(m.bias, 0)
261 | nn.init.constant_(m.weight, 1.0)
262 | elif isinstance(m, nn.Conv2d):
263 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
264 | fan_out //= m.groups
265 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
266 | if m.bias is not None:
267 | m.bias.data.zero_()
268 |
269 | def freeze_patch_emb(self):
270 | self.patch_embed1.requires_grad = False
271 |
272 | @torch.jit.ignore
273 | def no_weight_decay(self):
274 | return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better
275 |
276 |
277 | def output_shape(self):
278 | return {
279 | name: ShapeSpec(
280 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
281 | )
282 | for name in self._out_features
283 | }
284 |
285 | def size_divisibility(self):
286 | return 32
287 |
288 |
289 | def forward(self, x):
290 | B = x.shape[0]
291 | outputs = {}
292 |
293 | for i in range(self.num_stages):
294 | patch_embed = getattr(self, f"patch_embed{i + 1}")
295 | block = getattr(self, f"block{i + 1}")
296 | norm = getattr(self, f"norm{i + 1}")
297 | x, H, W = patch_embed(x)
298 | for blk in block:
299 | x = blk(x, H, W)
300 | x = norm(x)
301 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
302 | if self.out_features_names[i] in self._out_features:
303 | outputs[self.out_features_names[i]] = x
304 | return outputs
305 |
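# A minimal usage sketch (hyper-parameters roughly follow the "b1" variant and
# are chosen here purely for illustration; shapes assume a 224x224 input):
def _pvt_usage_example():
    backbone = PyramidVisionTransformerV2(
        depths=[2, 2, 2, 2], embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True,
        out_features=["p2", "p3", "p4"])
    feats = backbone(torch.randn(1, 3, 224, 224))
    # e.g. feats["p2"]: (1, 128, 28, 28), feats["p4"]: (1, 512, 7, 7)
    return {name: tuple(f.shape) for name, f in feats.items()}
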
306 |
307 | class DWConv(nn.Module):
308 | def __init__(self, dim=768):
309 | super(DWConv, self).__init__()
310 | self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
311 |
312 | def forward(self, x, H, W):
313 | B, N, C = x.shape
314 | x = x.transpose(1, 2).view(B, C, H, W)
315 | x = self.dwconv(x)
316 | x = x.flatten(2).transpose(1, 2)
317 |
318 | return x
319 |
320 |
321 | def _conv_filter(state_dict, patch_size=16):
322 | """ convert patch embedding weight from manual patchify + linear proj to conv"""
323 | out_dict = {}
324 | for k, v in state_dict.items():
325 | if 'patch_embed.proj.weight' in k:
326 | v = v.reshape((v.shape[0], 3, patch_size, patch_size))
327 | out_dict[k] = v
328 |
329 | return out_dict
330 |
331 |
332 | @BACKBONE_REGISTRY.register()
333 | def build_pyramid_vision_transformer(cfg, input_shape):
334 | name = cfg.MODEL.PVT.NAME
335 | linear = cfg.MODEL.PVT.LINEAR
336 | out_features = cfg.MODEL.PVT.OUT_FEATURES
337 |
338 | if linear:
339 | name = "b2"
340 |
341 | if name == "b0":
342 | embed_dims=[32, 64, 160, 256]
343 | else:
344 | embed_dims=[64, 128, 320, 512]
345 |
346 | depths = {
347 | "b0": [2, 2, 2, 2],
348 | "b1": [2, 2, 2, 2],
349 | "b2": [3, 4, 6, 3],
350 | "b3": [3, 4, 18, 3],
351 | "b4": [3, 8, 27, 3],
352 | "b5": [3, 6, 40, 3]
353 | }
354 |
355 | if name == "b5":
356 | mlp_ratios = [4, 4, 4, 4]
357 | else:
358 | mlp_ratios = [8, 8, 4, 4]
359 |
360 | in_channels = input_shape.channels
361 |
362 | return PyramidVisionTransformerV2(
363 | patch_size=4,
364 | depths=depths[name],
365 | in_chans=in_channels,
366 | embed_dims=embed_dims,
367 | num_heads=[1, 2, 5, 8],
368 | mlp_ratios=mlp_ratios,
369 | drop_rate=0.0,
370 | drop_path_rate=0.1,
371 | sr_ratios=[8, 4, 2, 1],
372 | qkv_bias=True,
373 | norm_layer=partial(nn.LayerNorm, eps=1e-6),
374 | out_features=out_features,
375 | linear=linear
376 | )
377 |
378 |
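# The registered name above is resolved by detectron2 through
# cfg.MODEL.BACKBONE.NAME, so a config along the lines of (illustrative):
#   MODEL:
#     BACKBONE:
#       NAME: "build_pyramid_vision_transformer"
#     PVT:
#       NAME: "b1"
#       OUT_FEATURES: ["p2", "p3", "p4"]
# makes detectron2.modeling.build_backbone(cfg) return this PVTv2 backbone.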
--------------------------------------------------------------------------------
/sparseinst/backbones/resnet.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
3 |
4 | import math
5 | import torch.nn as nn
6 | from timm.models.resnet import BasicBlock, Bottleneck
7 | from timm.models.layers import DropBlock2d, DropPath, AvgPool2dSame
8 |
9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d
10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY
11 | from detectron2.layers import NaiveSyncBatchNorm, DeformConv
12 |
13 |
14 | def get_padding(kernel_size, stride, dilation=1):
15 | padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
16 | return padding
17 |
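# e.g. get_padding(3, 1) == 1, get_padding(3, 2) == 1, get_padding(7, 2) == 3:
# the "same"-style padding used by downsample_conv below.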
18 |
19 | """
20 | inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
21 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
22 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None
23 | """
24 |
25 |
26 | class DeformableBottleneck(nn.Module):
27 | expansion = 4
28 |
29 | def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
30 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
31 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
32 | super().__init__()
33 |
34 | width = int(math.floor(planes * (base_width / 64)) * cardinality)
35 | first_planes = width // reduce_first
36 | outplanes = planes * self.expansion
37 | first_dilation = first_dilation or dilation
38 | # use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation)
39 |
40 | self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False)
41 | self.bn1 = norm_layer(first_planes)
42 | self.act1 = act_layer(inplace=True)
43 |
44 | self.conv2_offset = nn.Conv2d(
45 | first_planes,
46 | 18,
47 | kernel_size=3,
48 | stride=stride,
49 | padding=first_dilation,
50 | dilation=first_dilation
51 | )
52 | self.conv2 = DeformConv(
53 | first_planes,
54 | width,
55 | kernel_size=3,
56 | stride=stride,
57 | padding=first_dilation,
58 | bias=False,
59 | dilation=first_dilation,
60 | )
61 |
62 | self.bn2 = norm_layer(width)
63 | self.act2 = act_layer(inplace=True)
64 | # self.aa = aa_layer(channels=width, stride=stride) if use_aa else None
65 |
66 | self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False)
67 | self.bn3 = norm_layer(outplanes)
68 |
69 | # self.se = create_attn(attn_layer, outplanes)
70 |
71 | self.act3 = act_layer(inplace=True)
72 | self.downsample = downsample
73 | self.stride = stride
74 | self.dilation = dilation
75 | # self.drop_block = drop_block
76 | # self.drop_path = drop_path
77 |
78 | nn.init.constant_(self.conv2_offset.weight, 0)
79 | nn.init.constant_(self.conv2_offset.bias, 0)
80 |
81 | def zero_init_last_bn(self):
82 | nn.init.zeros_(self.bn3.weight)
83 |
84 | def forward(self, x):
85 | shortcut = x
86 |
87 | x = self.conv1(x)
88 | x = self.bn1(x)
89 |
90 | x = self.act1(x)
91 |
92 | offset = self.conv2_offset(x)
93 | x = self.conv2(x, offset)
94 | x = self.bn2(x)
95 | x = self.act2(x)
96 |
97 | x = self.conv3(x)
98 | x = self.bn3(x)
99 |
100 | if self.downsample is not None:
101 | shortcut = self.downsample(shortcut)
102 | x += shortcut
103 | x = self.act3(x)
104 |
105 | return x
106 |
107 |
108 | BLOCK_TYPE = {
109 | "basic": BasicBlock,
110 | "bottleneck": Bottleneck,
111 | "deform_bottleneck": DeformableBottleneck
112 | }
113 |
114 |
115 | def downsample_conv(
116 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
117 | norm_layer = norm_layer or nn.BatchNorm2d
118 | kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size
119 | first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1
120 | p = get_padding(kernel_size, stride, first_dilation)
121 |
122 | return nn.Sequential(*[
123 | nn.Conv2d(
124 | in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False),
125 | norm_layer(out_channels)
126 | ])
127 |
128 |
129 | def downsample_avg(
130 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
131 | norm_layer = norm_layer or nn.BatchNorm2d
132 | avg_stride = stride if dilation == 1 else 1
133 | if stride == 1 and dilation == 1:
134 | pool = nn.Identity()
135 | else:
136 | avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
137 | pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
138 |
139 | return nn.Sequential(*[
140 | pool,
141 | nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False),
142 | norm_layer(out_channels)
143 | ])
144 |
145 |
146 | def drop_blocks(drop_block_rate=0.):
147 | return [
148 | None, None,
149 | DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None,
150 | DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None]
151 |
152 |
153 | def make_blocks(
154 | stage_block, channels, block_repeats, inplanes, reduce_first=1, output_stride=32,
155 | down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs):
156 | stages = []
157 | feature_info = []
158 | net_num_blocks = sum(block_repeats)
159 | net_block_idx = 0
160 | net_stride = 4
161 | dilation = prev_dilation = 1
162 | for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))):
163 | # choose block_fn through the BLOCK_TYPE
164 | block_fn = BLOCK_TYPE[stage_block[stage_idx]]
165 |
166 | stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it
167 | stride = 1 if stage_idx == 0 else 2
168 | if net_stride >= output_stride:
169 | dilation *= stride
170 | stride = 1
171 | else:
172 | net_stride *= stride
173 |
174 | downsample = None
175 | if stride != 1 or inplanes != planes * block_fn.expansion:
176 | down_kwargs = dict(
177 | in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size,
178 | stride=stride, dilation=dilation, first_dilation=prev_dilation, norm_layer=kwargs.get('norm_layer'))
179 | downsample = downsample_avg(
180 | **down_kwargs) if avg_down else downsample_conv(**down_kwargs)
181 |
182 | block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs)
183 | blocks = []
184 | for block_idx in range(num_blocks):
185 | downsample = downsample if block_idx == 0 else None
186 | stride = stride if block_idx == 0 else 1
187 | block_dpr = drop_path_rate * net_block_idx / \
188 | (net_num_blocks - 1) # stochastic depth linear decay rule
189 | blocks.append(block_fn(
190 | inplanes, planes, stride, downsample, first_dilation=prev_dilation,
191 | drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs))
192 | prev_dilation = dilation
193 | inplanes = planes * block_fn.expansion
194 | net_block_idx += 1
195 |
196 | stages.append((stage_name, nn.Sequential(*blocks)))
197 | feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name))
198 |
199 | return stages, feature_info
200 |
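# Worked example of the linear decay rule above: with drop_path_rate=0.1 and
# block_repeats=[3, 4, 6, 3] (16 blocks in total), block i receives a drop-path
# rate of 0.1 * i / 15, i.e. 0.0 for the first block rising linearly to 0.1 for
# the last.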
201 |
202 | class ResNet(Backbone):
203 | """ResNet / ResNeXt / SE-ResNeXt / SE-Net
204 |
205 | This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that
206 | * have > 1 stride in the 3x3 conv layer of bottleneck
207 | * have conv-bn-act ordering
208 |
209 | This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s
210 | variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the
211 | 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default.
212 |
213 | ResNet variants (the same modifications can be used in SE/ResNeXt models as well):
214 | * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b
215 | * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64)
216 | * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample
217 | * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample
218 | * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128)
219 | * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample
220 | * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample
221 |
222 | ResNeXt
223 | * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths
224 |         * same c, d, e, s variants as ResNet can be enabled
225 |
226 | SE-ResNeXt
227 | * normal - 7x7 stem, stem_width = 64
228 | * same c, d, e, s variants as ResNet can be enabled
229 |
230 | SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64,
231 | reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block
232 |
233 | Parameters
234 | ----------
235 | block : Block
236 | Class for the residual block. Options are BasicBlockGl, BottleneckGl.
237 | layers : list of int
238 | Numbers of layers in each block
239 | num_classes : int, default 1000
240 | Number of classification classes.
241 | in_chans : int, default 3
242 | Number of input (color) channels.
243 | cardinality : int, default 1
244 | Number of convolution groups for 3x3 conv in Bottleneck.
245 | base_width : int, default 64
246 | Factor determining bottleneck channels. `planes * base_width / 64 * cardinality`
247 | stem_width : int, default 64
248 | Number of channels in stem convolutions
249 | stem_type : str, default ''
250 | The type of stem:
251 | * '', default - a single 7x7 conv with a width of stem_width
252 | * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2
253 | * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2
254 | block_reduce_first: int, default 1
255 | Reduction factor for first convolution output width of residual blocks,
256 | 1 for all archs except senets, where 2
257 | down_kernel_size: int, default 1
258 | Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for senets
259 | avg_down : bool, default False
260 | Whether to use average pooling for projection skip connection between stages/downsample.
261 | output_stride : int, default 32
262 | Set the output stride of the network, 32, 16, or 8. Typically used in segmentation.
263 | act_layer : nn.Module, activation layer
264 | norm_layer : nn.Module, normalization layer
265 | aa_layer : nn.Module, anti-aliasing layer
266 | drop_rate : float, default 0.
267 | Dropout probability before classifier, for training
268 | global_pool : str, default 'avg'
269 | Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
270 | """
271 |
272 | def __init__(self, block_types, layers, in_chans=3,
273 | cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False,
274 | output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False,
275 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0.,
276 | drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, out_features=None):
277 | block_args = block_args or dict()
278 | assert output_stride in (8, 16, 32)
279 | # self.num_classes = num_classes
280 | self.drop_rate = drop_rate
281 | super(ResNet, self).__init__()
282 |
283 | # Stem
284 | deep_stem = 'deep' in stem_type
285 | inplanes = stem_width * 2 if deep_stem else 64
286 | if deep_stem:
287 | stem_chs = (stem_width, stem_width)
288 | if 'tiered' in stem_type:
289 | stem_chs = (3 * (stem_width // 4), stem_width)
290 | self.conv1 = nn.Sequential(*[
291 | nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False),
292 | norm_layer(stem_chs[0]),
293 | act_layer(inplace=True),
294 | nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False),
295 | norm_layer(stem_chs[1]),
296 | act_layer(inplace=True),
297 | nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)])
298 | else:
299 | self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7,
300 | stride=2, padding=3, bias=False)
301 | self.bn1 = norm_layer(inplanes)
302 | self.act1 = act_layer(inplace=True)
303 | self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')]
304 |
305 | # Stem Pooling
306 | if replace_stem_pool:
307 | self.maxpool = nn.Sequential(*filter(None, [
308 | nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False),
309 | aa_layer(channels=inplanes, stride=2) if aa_layer else None,
310 | norm_layer(inplanes),
311 | act_layer(inplace=True)
312 | ]))
313 | else:
314 | if aa_layer is not None:
315 | self.maxpool = nn.Sequential(*[
316 | nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
317 | aa_layer(channels=inplanes, stride=2)])
318 | else:
319 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
320 |
321 | # Feature Blocks
322 | channels = [64, 128, 256, 512]
323 | stage_modules, stage_feature_info = make_blocks(
324 | block_types, channels, layers, inplanes, cardinality=cardinality, base_width=base_width,
325 | output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down,
326 | down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer,
327 | drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args)
328 | for stage in stage_modules:
329 | self.add_module(*stage) # layer1, layer2, etc
330 | self.feature_info.extend(stage_feature_info)
331 |
332 | for n, m in self.named_modules():
333 | if isinstance(m, nn.BatchNorm2d):
334 | nn.init.constant_(m.weight, 1.)
335 | nn.init.constant_(m.bias, 0.)
336 | if zero_init_last_bn:
337 | for m in self.modules():
338 | if hasattr(m, 'zero_init_last_bn'):
339 | m.zero_init_last_bn()
340 |
341 | out_features_names = ["res2", "res3", "res4", "res5"]
342 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32]))
343 | self._out_feature_channels = dict(
344 | zip(out_features_names, [x * BLOCK_TYPE[block_types[0]].expansion for x in [64, 128, 256, 512]]))
345 | if out_features is None:
346 | self._out_features = out_features_names
347 | else:
348 | self._out_features = out_features
349 |
350 | def output_shape(self):
351 | return {
352 | name: ShapeSpec(
353 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
354 | )
355 | for name in self._out_features
356 | }
357 |
358 | def size_divisibility(self):
359 | return 32
360 |
361 | def forward(self, x):
362 | x = self.conv1(x)
363 | x = self.bn1(x)
364 | x = self.act1(x)
365 | x = self.maxpool(x)
366 | outputs = {}
367 | x = self.layer1(x)
368 | # outputs["res2"] = x
369 | x = self.layer2(x)
370 | outputs["res3"] = x
371 | x = self.layer3(x)
372 | outputs["res4"] = x
373 | x = self.layer4(x)
374 | outputs["res5"] = x
375 | return outputs
376 |
377 |
378 | @BACKBONE_REGISTRY.register()
379 | def build_resnet_vd_backbone(cfg, input_shape):
380 |
381 | depth = cfg.MODEL.RESNETS.DEPTH
382 | norm_name = cfg.MODEL.RESNETS.NORM
383 | if norm_name == "FrozenBN":
384 | norm = FrozenBatchNorm2d
385 | elif norm_name == "SyncBN":
386 | norm = NaiveSyncBatchNorm
387 | else:
388 | norm = nn.BatchNorm2d
389 | if depth == 50:
390 | layers = [3, 4, 6, 3]
391 | elif depth == 101:
392 | layers = [3, 4, 23, 3]
393 | else:
394 | raise NotImplementedError()
395 |
396 | stage_blocks = []
397 | use_deformable = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
398 | for idx in range(4):
399 | if use_deformable[idx]:
400 | stage_blocks.append("deform_bottleneck")
401 | else:
402 | stage_blocks.append("bottleneck")
403 |
404 | model = ResNet(stage_blocks, layers, stem_type="deep",
405 | stem_width=32, avg_down=True, norm_layer=norm)
406 | return model
407 |
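# A minimal construction sketch (hyper-parameters mirror the r50vd configs;
# the input size and the shapes in the comment are illustrative):
def _resnet50_vd_example():
    import torch  # this module only imports torch.nn at the top
    model = ResNet(["bottleneck"] * 4, [3, 4, 6, 3],
                   stem_type="deep", stem_width=32, avg_down=True)
    feats = model(torch.randn(1, 3, 224, 224))
    # {"res3": (1, 512, 28, 28), "res4": (1, 1024, 14, 14), "res5": (1, 2048, 7, 7)}
    return {name: tuple(f.shape) for name, f in feats.items()}
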
--------------------------------------------------------------------------------
/sparseinst/caffe2sparseinst.py:
--------------------------------------------------------------------------------
1 | from detectron2.export.caffe2_modeling import *
2 | from sparseinst import SparseInst
3 | import numpy as np
4 | from matplotlib import pyplot as plt
5 |
6 | class Caffe2SparseInst(Caffe2MetaArch):
7 | def __init__(self, cfg, torch_model):
8 | assert isinstance(torch_model, SparseInst)
9 | # torch_model.backbone.size_divisibility = 32
10 | super().__init__(cfg, torch_model)
11 | self.torch_model = torch_model
12 | self.pixel_mean = self.torch_model.pixel_mean/255
13 | self.pixel_std = self.torch_model.pixel_std/255
14 |
15 | def get_caffe2_inputs(self, batched_inputs):
16 | inputs = super().get_caffe2_inputs(batched_inputs)
17 | return inputs[0]/255
18 |
19 | def encode_additional_info(self, predict_net, init_net):
20 | pass
21 |
22 | def normalizer(self, image):
23 | image = (image - self.pixel_mean) / self.pixel_std
24 | return image
25 |
26 | @mock_torch_nn_functional_interpolate()
27 | def forward(self, inputs):
28 | images = self.normalizer(inputs)
29 | images = ImageList.from_tensors([images], 32)[0]
30 | # forward
31 | features = self.torch_model.backbone(images)
32 | features = self.torch_model.encoder(features)
33 | output = self.torch_model.decoder(features)
34 | pred_scores = output["pred_logits"].sigmoid()
35 | pred_masks = output["pred_masks"].sigmoid()
36 | pred_objectness = output["pred_scores"].sigmoid()
37 | pred_scores2 = torch.sqrt(pred_scores * pred_objectness)
38 |
39 | # scores, masks = np.squeeze(pred_scores2), np.squeeze(pred_masks)
40 | # keep = torch.argmax(scores, axis=1)
41 | # masks = [masks[label, :, :] for i, label in enumerate(keep) if scores[i, label] > 0.35]
42 | # fig = plt.figure()
43 | # num_masks = len(masks)
44 | # for i, mask in enumerate(masks, 1):
45 | # fig.add_subplot(1, num_masks, i)
46 | # plt.imshow(mask.data.cpu())
47 | # plt.show()
48 | # plt.ion()
49 |
50 | # return
51 |
52 | return pred_scores2, pred_masks
53 |
54 | @staticmethod
55 | def get_outputs_converter(predict_net, init_net):
56 | pass
57 |
58 |
59 | META_ARCH_CAFFE2_EXPORT_TYPE_MAP['SparseInst'] = Caffe2SparseInst
--------------------------------------------------------------------------------
/sparseinst/coco_evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pycocotools.mask as mask_util
3 | from detectron2.structures import BoxMode
4 | from detectron2.evaluation import COCOEvaluator
5 |
6 |
7 | def instances_to_coco_json(instances, img_id):
8 | """
9 | Dump an "Instances" object to a COCO-format json that's used for evaluation.
10 |
11 | Args:
12 | instances (Instances):
13 | img_id (int): the image id
14 |
15 | Returns:
16 | list[dict]: list of json annotations in COCO format.
17 | """
18 | num_instance = len(instances)
19 | if num_instance == 0:
20 | return []
21 |
22 | # NOTE: pure instance segmentation
23 | has_box = instances.has("pred_boxes")
24 | if has_box:
25 | boxes = instances.pred_boxes.tensor.numpy()
26 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
27 | boxes = boxes.tolist()
28 |
29 | scores = instances.scores.tolist()
30 | classes = instances.pred_classes.tolist()
31 |
32 | has_mask = instances.has("pred_masks")
33 | if has_mask:
34 |         # use RLE to encode the masks, because they are too large and take too much memory
35 | # since this evaluator stores outputs of the entire dataset
36 | rles = [
37 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
38 | for mask in instances.pred_masks
39 | ]
40 | for rle in rles:
41 | # "counts" is an array encoded by mask_util as a byte-stream. Python3's
42 | # json writer which always produces strings cannot serialize a bytestream
43 | # unless you decode it. Thankfully, utf-8 works out (which is also what
44 | # the pycocotools/_mask.pyx does).
45 | rle["counts"] = rle["counts"].decode("utf-8")
46 |
47 | has_keypoints = instances.has("pred_keypoints")
48 | if has_keypoints:
49 | keypoints = instances.pred_keypoints
50 |
51 | results = []
52 | for k in range(num_instance):
53 | result = {
54 | "image_id": img_id,
55 | "category_id": classes[k],
56 | "score": scores[k],
57 | }
58 | if has_box:
59 | result["bbox"] = boxes[k]
60 | if has_mask:
61 | result["segmentation"] = rles[k]
62 | if has_keypoints:
63 | # In COCO annotations,
64 | # keypoints coordinates are pixel indices.
65 | # However our predictions are floating point coordinates.
66 | # Therefore we subtract 0.5 to be consistent with the annotation format.
67 | # This is the inverse of data loading logic in `datasets/coco.py`.
68 | keypoints[k][:, :2] -= 0.5
69 | result["keypoints"] = keypoints[k].flatten().tolist()
70 | results.append(result)
71 | return results
72 |
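# A single entry of the returned list looks roughly like (values illustrative;
# "category_id" is the model's contiguous class index at this point):
#   {"image_id": 139, "category_id": 17, "score": 0.91,
#    "segmentation": {"size": [480, 640], "counts": "<utf-8 RLE string>"}}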
73 |
74 | class COCOMaskEvaluator(COCOEvaluator):
75 |
76 | def process(self, inputs, outputs):
77 | """
78 | Args:
79 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
80 | It is a list of dict. Each dict corresponds to an image and
81 | contains keys like "height", "width", "file_name", "image_id".
82 | outputs: the outputs of a COCO model. It is a list of dicts with key
83 | "instances" that contains :class:`Instances`.
84 | """
85 | for input, output in zip(inputs, outputs):
86 | prediction = {"image_id": input["image_id"]}
87 |
88 | if "instances" in output:
89 | instances = output["instances"].to(self._cpu_device)
90 | prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
91 | if "proposals" in output:
92 | prediction["proposals"] = output["proposals"].to(self._cpu_device)
93 | if len(prediction) > 1:
94 | self._predictions.append(prediction)
--------------------------------------------------------------------------------
/sparseinst/config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
2 |
3 | from detectron2.config import CfgNode as CN
4 |
5 | def add_sparse_inst_config(cfg):
6 |
7 | cfg.MODEL.DEVICE = 'cuda'
8 | cfg.MODEL.MASK_ON = True
9 | # [SparseInst]
10 | cfg.MODEL.SPARSE_INST = CN()
11 |
12 | # parameters for inference
13 | cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005
14 | cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45
15 | cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100
16 |
17 | # [Encoder]
18 | cfg.MODEL.SPARSE_INST.ENCODER = CN()
19 | cfg.MODEL.SPARSE_INST.ENCODER.NAME = "FPNEncoder"
20 | cfg.MODEL.SPARSE_INST.ENCODER.NORM = ""
21 | cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES = ["res3", "res4", "res5"]
22 | cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS = 256
23 |
24 | # [Decoder]
25 | cfg.MODEL.SPARSE_INST.DECODER = CN()
26 | cfg.MODEL.SPARSE_INST.DECODER.NAME = "BaseIAMDecoder"
27 | cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100
28 | cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80
29 | # kernels for mask features
30 | cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128
31 | # upsample factor for output masks
32 | cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2.0
33 | cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False
34 | cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4
35 | # decoder.inst_branch
36 | cfg.MODEL.SPARSE_INST.DECODER.INST = CN()
37 | cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256
38 | cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4
39 | # decoder.mask_branch
40 | cfg.MODEL.SPARSE_INST.DECODER.MASK = CN()
41 | cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256
42 | cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4
43 |
44 | # [Loss]
45 | cfg.MODEL.SPARSE_INST.LOSS = CN()
46 | cfg.MODEL.SPARSE_INST.LOSS.NAME = "SparseInstCriterion"
47 | cfg.MODEL.SPARSE_INST.LOSS.ITEMS = ("labels", "masks")
48 | # loss weight
49 | cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT = 2.0
50 | cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT = 5.0
51 | cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT = 2.0
52 | # iou-aware objectness loss weight
53 | cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT = 1.0
54 |
55 | # [Matcher]
56 | cfg.MODEL.SPARSE_INST.MATCHER = CN()
57 | cfg.MODEL.SPARSE_INST.MATCHER.NAME = "SparseInstMatcher"
58 | cfg.MODEL.SPARSE_INST.MATCHER.ALPHA = 0.8
59 | cfg.MODEL.SPARSE_INST.MATCHER.BETA = 0.2
60 |
61 | # [Optimizer]
62 | cfg.SOLVER.OPTIMIZER = "ADAMW"
63 | cfg.SOLVER.BACKBONE_MULTIPLIER = 1.0
64 | cfg.SOLVER.AMSGRAD = False
65 |
66 | # [Dataset mapper]
67 | cfg.MODEL.SPARSE_INST.DATASET_MAPPER = "SparseInstDatasetMapper"
68 |
69 | # [Pyramid Vision Transformer]
70 | cfg.MODEL.PVT = CN()
71 | cfg.MODEL.PVT.NAME = "b1"
72 | cfg.MODEL.PVT.OUT_FEATURES = ["p2", "p3", "p4"]
73 | cfg.MODEL.PVT.LINEAR = False
74 |
75 | cfg.MODEL.CSPNET = CN()
76 | cfg.MODEL.CSPNET.NAME = "darknet53"
77 | cfg.MODEL.CSPNET.NORM = ""
78 | # (csp-)darknet: csp1, csp2, csp3, csp4
79 | cfg.MODEL.CSPNET.OUT_FEATURES = ["csp1", "csp2", "csp3", "csp4"]
80 |
81 |
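# A minimal sketch of the usual detectron2 wiring; the yaml file below is one
# of the configs shipped in configs/ and is used here purely as an example.
def _example_cfg():
    from detectron2.config import get_cfg
    cfg = get_cfg()
    add_sparse_inst_config(cfg)
    cfg.merge_from_file("configs/sparse_inst_r50_giam.yaml")
    cfg.freeze()
    return cfg
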
--------------------------------------------------------------------------------
/sparseinst/d2_predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | import atexit
3 | import bisect
4 | import multiprocessing as mp
5 | from collections import deque
6 | import cv2
7 | import torch
8 |
9 | from detectron2.data import MetadataCatalog
10 | from detectron2.engine.defaults import DefaultPredictor
11 | from detectron2.utils.video_visualizer import VideoVisualizer
12 | from detectron2.utils.visualizer import ColorMode, Visualizer
13 |
14 |
15 | class VisualizationDemo(object):
16 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
17 | """
18 | Args:
19 | cfg (CfgNode):
20 | instance_mode (ColorMode):
21 | parallel (bool): whether to run the model in different processes from visualization.
22 | Useful since the visualization logic can be slow.
23 | """
24 | self.img_format = cfg.INPUT.FORMAT
25 | self.metadata = MetadataCatalog.get(
26 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
27 | )
28 | self.cpu_device = torch.device("cpu")
29 | self.instance_mode = instance_mode
30 |
31 | self.parallel = parallel
32 | if parallel:
33 | num_gpu = torch.cuda.device_count()
34 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
35 | else:
36 | self.predictor = DefaultPredictor(cfg)
37 |
38 | def run_on_image(self, image, confidence_threshold):
39 | """
40 | Args:
41 | image (np.ndarray): an image of shape (H, W, C) (in BGR order).
42 | This is the format used by OpenCV.
43 |
44 | Returns:
45 | predictions (dict): the output of the model.
46 | vis_output (VisImage): the visualized image output.
47 | """
48 | vis_output = None
49 | predictions = self.predictor(image)
50 | visualizer = Visualizer(image, self.metadata,
51 | instance_mode=self.instance_mode)
52 | if "panoptic_seg" in predictions:
53 | panoptic_seg, segments_info = predictions["panoptic_seg"]
54 | vis_output = visualizer.draw_panoptic_seg_predictions(
55 | panoptic_seg.to(self.cpu_device), segments_info
56 | )
57 | else:
58 | if "sem_seg" in predictions:
59 | vis_output = visualizer.draw_sem_seg(
60 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
61 | )
62 | if "instances" in predictions:
63 | instances = predictions["instances"].to(self.cpu_device)
64 | instances = instances[instances.scores > confidence_threshold]
65 | predictions["instances"] = instances
66 | vis_output = visualizer.draw_instance_predictions(
67 | predictions=instances)
68 |
69 | return predictions, vis_output
70 |
71 | def _frame_from_video(self, video):
72 | while video.isOpened():
73 | success, frame = video.read()
74 | if success:
75 | yield frame
76 | else:
77 | break
78 |
79 | def run_on_video(self, video, confidence_threshold):
80 | """
81 | Visualizes predictions on frames of the input video.
82 |
83 | Args:
84 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
85 | either a webcam or a video file.
86 |
87 | Yields:
88 | ndarray: BGR visualizations of each video frame.
89 | """
90 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
91 |
92 | def process_predictions(frame, predictions):
93 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
94 | if "panoptic_seg" in predictions:
95 | panoptic_seg, segments_info = predictions["panoptic_seg"]
96 | vis_frame = video_visualizer.draw_panoptic_seg_predictions(
97 | frame, panoptic_seg.to(self.cpu_device), segments_info
98 | )
99 | elif "instances" in predictions:
100 | predictions = predictions["instances"].to(self.cpu_device)
101 | predictions = predictions[predictions.scores >
102 | confidence_threshold]
103 | vis_frame = video_visualizer.draw_instance_predictions(
104 | frame, predictions)
105 | elif "sem_seg" in predictions:
106 | vis_frame = video_visualizer.draw_sem_seg(
107 | frame, predictions["sem_seg"].argmax(
108 | dim=0).to(self.cpu_device)
109 | )
110 |
111 | # Converts Matplotlib RGB format to OpenCV BGR format
112 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
113 | return vis_frame
114 |
115 | frame_gen = self._frame_from_video(video)
116 | if self.parallel:
117 | buffer_size = self.predictor.default_buffer_size
118 |
119 | frame_data = deque()
120 |
121 | for cnt, frame in enumerate(frame_gen):
122 | frame_data.append(frame)
123 | self.predictor.put(frame)
124 |
125 | if cnt >= buffer_size:
126 | frame = frame_data.popleft()
127 | predictions = self.predictor.get()
128 | yield process_predictions(frame, predictions)
129 |
130 | while len(frame_data):
131 | frame = frame_data.popleft()
132 | predictions = self.predictor.get()
133 | yield process_predictions(frame, predictions)
134 | else:
135 | for frame in frame_gen:
136 | yield process_predictions(frame, self.predictor(frame))
137 |
138 |
139 | class AsyncPredictor:
140 | """
141 | A predictor that runs the model asynchronously, possibly on >1 GPUs.
142 |     Because rendering the visualization takes a considerable amount of time,
143 | this helps improve throughput a little bit when rendering videos.
144 | """
145 |
146 | class _StopToken:
147 | pass
148 |
149 | class _PredictWorker(mp.Process):
150 | def __init__(self, cfg, task_queue, result_queue):
151 | self.cfg = cfg
152 | self.task_queue = task_queue
153 | self.result_queue = result_queue
154 | super().__init__()
155 |
156 | def run(self):
157 | predictor = DefaultPredictor(self.cfg)
158 |
159 | while True:
160 | task = self.task_queue.get()
161 | if isinstance(task, AsyncPredictor._StopToken):
162 | break
163 | idx, data = task
164 | result = predictor(data)
165 | self.result_queue.put((idx, result))
166 |
167 | def __init__(self, cfg, num_gpus: int = 1):
168 | """
169 | Args:
170 | cfg (CfgNode):
171 | num_gpus (int): if 0, will run on CPU
172 | """
173 | num_workers = max(num_gpus, 1)
174 | self.task_queue = mp.Queue(maxsize=num_workers * 3)
175 | self.result_queue = mp.Queue(maxsize=num_workers * 3)
176 | self.procs = []
177 | for gpuid in range(max(num_gpus, 1)):
178 | cfg = cfg.clone()
179 | cfg.defrost()
180 | cfg.MODEL.DEVICE = "cuda:{}".format(
181 | gpuid) if num_gpus > 0 else "cpu"
182 | self.procs.append(
183 | AsyncPredictor._PredictWorker(
184 | cfg, self.task_queue, self.result_queue)
185 | )
186 |
187 | self.put_idx = 0
188 | self.get_idx = 0
189 | self.result_rank = []
190 | self.result_data = []
191 |
192 | for p in self.procs:
193 | p.start()
194 | atexit.register(self.shutdown)
195 |
196 | def put(self, image):
197 | self.put_idx += 1
198 | self.task_queue.put((self.put_idx, image))
199 |
200 | def get(self):
201 | self.get_idx += 1 # the index needed for this request
202 | if len(self.result_rank) and self.result_rank[0] == self.get_idx:
203 | res = self.result_data[0]
204 | del self.result_data[0], self.result_rank[0]
205 | return res
206 |
207 | while True:
208 | # make sure the results are returned in the correct order
209 | idx, res = self.result_queue.get()
210 | if idx == self.get_idx:
211 | return res
212 | insert = bisect.bisect(self.result_rank, idx)
213 | self.result_rank.insert(insert, idx)
214 | self.result_data.insert(insert, res)
215 |
216 | def __len__(self):
217 | return self.put_idx - self.get_idx
218 |
219 | def __call__(self, image):
220 | self.put(image)
221 | return self.get()
222 |
223 | def shutdown(self):
224 | for _ in self.procs:
225 | self.task_queue.put(AsyncPredictor._StopToken())
226 |
227 | @property
228 | def default_buffer_size(self):
229 | return len(self.procs) * 5
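

# Usage sketch (cfg is an already-built config; the path and threshold are
# illustrative):
#   demo = VisualizationDemo(cfg, parallel=False)
#   img = cv2.imread("input/input_image/horses.jpg")
#   predictions, vis = demo.run_on_image(img, confidence_threshold=0.4)
#   cv2.imwrite("results/result.png", vis.get_image()[:, :, ::-1])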
--------------------------------------------------------------------------------
/sparseinst/dataset_mapper.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import logging
3 | import numpy as np
4 | import torch
5 |
6 |
7 | from detectron2.data import detection_utils as utils
8 | from detectron2.data import transforms as T
9 |
10 | """
11 | This file contains the default mapping that's applied to "dataset dicts".
12 | """
13 |
14 | __all__ = ["SparseInstDatasetMapper"]
15 |
16 |
17 | def build_transform_gen(cfg, is_train):
18 | """
19 | Create a list of default :class:`Augmentation` from config.
20 | Now it includes resizing and flipping.
21 | Returns:
22 | list[Augmentation]
23 | """
24 | augmentation = []
25 |
26 | if is_train:
27 | min_size = cfg.INPUT.MIN_SIZE_TRAIN
28 | max_size = cfg.INPUT.MAX_SIZE_TRAIN
29 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
30 | else:
31 | min_size = cfg.INPUT.MIN_SIZE_TEST
32 | max_size = cfg.INPUT.MAX_SIZE_TEST
33 | sample_style = "choice"
34 | if is_train and cfg.INPUT.RANDOM_FLIP != "none":
35 | augmentation.append(
36 | T.RandomFlip(
37 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
38 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
39 | )
40 | )
41 | if is_train:
42 | augmentation.append(
43 | T.ResizeShortestEdge(min_size, max_size, sample_style)
44 | )
45 | return augmentation
46 |
47 |
48 | class SparseInstDatasetMapper:
49 | """
50 | A callable which takes a dataset dict in Detectron2 Dataset format,
51 |     and maps it into a format used by the model.
52 | This is the default callable to be used to map your dataset dict into training data.
53 | You may need to follow it to implement your own one for customized logic,
54 | such as a different way to read or transform images.
55 | See :doc:`/tutorials/data_loading` for details.
56 | The callable currently does the following:
57 | 1. Read the image from "file_name"
58 |     2. Apply cropping/geometric transforms to the image and annotations
59 |     3. Prepare data and annotations as Tensors and :class:`Instances`
60 | """
61 | # @classmethod
62 |
63 | def __init__(self, cfg, is_train: bool = True):
64 | augs = build_transform_gen(cfg, is_train)
65 | self.default_aug = T.AugmentationList(augs)
66 | if cfg.INPUT.CROP.ENABLED and is_train:
67 | crop_gen = [
68 | T.ResizeShortestEdge([400, 500, 600], sample_style='choice'),
69 | T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)
70 | ]
71 | recompute_boxes = cfg.MODEL.MASK_ON
72 | augs = augs[:-1] + crop_gen + augs[-1:]
73 | self.crop_aug = T.AugmentationList(augs)
74 | else:
75 | self.crop_aug = None
76 | recompute_boxes = False
77 |
78 | # self.augs = augs
79 | self.is_train = is_train
80 | self.image_format = cfg.INPUT.FORMAT
81 | self.use_instance_mask = cfg.MODEL.MASK_ON
82 | self.instance_mask_format = cfg.INPUT.MASK_FORMAT
83 | self.recompute_boxes = recompute_boxes
84 |
85 | logger = logging.getLogger(__name__)
86 | mode = "training" if is_train else "inference"
87 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augs}")
88 |
89 | def __call__(self, dataset_dict):
90 | """
91 | Args:
92 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
93 | Returns:
94 | dict: a format that builtin models in detectron2 accept
95 | """
96 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
97 | # USER: Write your own image loading if it's not from a file
98 | image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
99 | utils.check_image_size(dataset_dict, image)
100 |
101 | # USER: Remove if you don't do semantic/panoptic segmentation.
102 | if "sem_seg_file_name" in dataset_dict:
103 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
104 | else:
105 | sem_seg_gt = None
106 |
107 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
108 |
109 | if self.crop_aug is None:
110 | transforms = self.default_aug(aug_input)
111 | else:
112 | if np.random.rand() > 0.5:
113 | transforms = self.crop_aug(aug_input)
114 | else:
115 | transforms = self.default_aug(aug_input)
116 | # transforms = self.augmentations(aug_input)
117 | image, sem_seg_gt = aug_input.image, aug_input.sem_seg
118 |
119 | image_shape = image.shape[:2] # h, w
120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
122 | # Therefore it's important to use torch.Tensor.
123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
124 | if sem_seg_gt is not None:
125 | dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
126 |
127 | if not self.is_train:
128 | # USER: Modify this if you want to keep them for some reason.
129 | dataset_dict.pop("annotations", None)
130 | dataset_dict.pop("sem_seg_file_name", None)
131 | return dataset_dict
132 |
133 | if "annotations" in dataset_dict:
134 | # USER: Modify this if you want to keep them for some reason.
135 | for anno in dataset_dict["annotations"]:
136 | anno.pop("keypoints", None)
137 | if not self.use_instance_mask:
138 | anno.pop("segmentation", None)
139 |
140 | # USER: Implement additional transformations if you have other types of data
141 | annos = [
142 | utils.transform_instance_annotations(
143 | obj, transforms, image_shape)
144 | for obj in dataset_dict.pop("annotations")
145 | if obj.get("iscrowd", 0) == 0
146 | ]
147 | instances = utils.annotations_to_instances(
148 | annos, image_shape, mask_format=self.instance_mask_format
149 | )
150 |
151 | # After transforms such as cropping are applied, the bounding box may no longer
152 | # tightly bound the object. As an example, imagine a triangle object
153 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
154 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
155 | # the intersection of original bounding box and the cropping box.
156 | if self.recompute_boxes:
157 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
158 | dataset_dict["instances"] = utils.filter_empty_instances(instances)
159 | return dataset_dict
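

# Sketch of how the mapper is typically plugged into detectron2's data loading
# (cfg is assumed to be a fully populated config; names follow detectron2's
# standard API):
#   from detectron2.data import build_detection_train_loader
#   mapper = SparseInstDatasetMapper(cfg, is_train=True)
#   train_loader = build_detection_train_loader(cfg, mapper=mapper)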
--------------------------------------------------------------------------------
/sparseinst/decoder.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | from torch.nn import init
5 | import torch.nn.functional as F
6 | from torch.utils.checkpoint import checkpoint
7 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill
8 |
9 | from detectron2.utils.registry import Registry
10 | from detectron2.layers import Conv2d
11 |
12 | SPARSE_INST_DECODER_REGISTRY = Registry("SPARSE_INST_DECODER")
13 | SPARSE_INST_DECODER_REGISTRY.__doc__ = "registry for SparseInst decoder"
14 |
15 | def _make_stack_3x3_convs(num_convs, in_channels, out_channels):
16 | convs = []
17 | for _ in range(num_convs):
18 | convs.append(
19 | Conv2d(in_channels, out_channels, 3, padding=1))
20 | convs.append(nn.ReLU(True))
21 | in_channels = out_channels
22 | return nn.Sequential(*convs)
23 |
24 |
25 | class InstanceBranch(nn.Module):
26 |
27 | def __init__(self, cfg, in_channels):
28 | super().__init__()
29 | # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM
30 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM
31 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS
32 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS
33 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
34 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
35 |
36 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
37 | # iam prediction, a simple conv
38 | self.iam_conv = nn.Conv2d(dim, num_masks, 3, padding=1)
39 |
40 | # outputs
41 | self.cls_score = nn.Linear(dim, self.num_classes)
42 | self.mask_kernel = nn.Linear(dim, kernel_dim)
43 | self.objectness = nn.Linear(dim, 1)
44 |
45 | self.prior_prob = 0.01
46 | self._init_weights()
47 |
48 | def _init_weights(self):
49 | for m in self.inst_convs.modules():
50 | if isinstance(m, nn.Conv2d):
51 | c2_msra_fill(m)
52 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob)
53 | for module in [self.iam_conv, self.cls_score]:
54 | init.constant_(module.bias, bias_value)
55 | init.normal_(self.iam_conv.weight, std=0.01)
56 | init.normal_(self.cls_score.weight, std=0.01)
57 |
58 | init.normal_(self.mask_kernel.weight, std=0.01)
59 | init.constant_(self.mask_kernel.bias, 0.0)
60 |
61 | def forward(self, features):
62 | # instance features (x4 convs)
63 | features = self.inst_convs(features)
64 | # predict instance activation maps
65 | iam = self.iam_conv(features)
66 | iam_prob = iam.sigmoid()
67 |
68 | B, N = iam_prob.shape[:2]
69 | C = features.size(1)
70 | # BxNxHxW -> BxNx(HW)
71 | iam_prob = iam_prob.view(B, N, -1)
72 | # aggregate features: BxCxHxW -> Bx(HW)xC
73 | inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1))
74 | normalizer = iam_prob.sum(-1).clamp(min=1e-6)
75 | inst_features = inst_features / normalizer[:, :, None]
76 | # predict classification & segmentation kernel & objectness
77 | pred_logits = self.cls_score(inst_features)
78 | pred_kernel = self.mask_kernel(inst_features)
79 | pred_scores = self.objectness(inst_features)
80 | return pred_logits, pred_kernel, pred_scores, iam
81 |
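# Shape walk-through for the aggregation above (using the default config
# values, so the numbers are illustrative):
#   features: Bx(256+2)xHxW -> inst_convs -> Bx256xHxW
#   iam:      Bx100xHxW, flattened to iam_prob of shape Bx100x(H*W)
#   bmm(iam_prob, features^T) -> Bx100x256, one feature vector per instance
#   heads:    pred_logits Bx100x80, pred_kernel Bx100x128, pred_scores Bx100x1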
82 |
83 | class MaskBranch(nn.Module):
84 |
85 | def __init__(self, cfg, in_channels):
86 | super().__init__()
87 | dim = cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM
88 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS
89 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
90 | self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
91 | self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1)
92 | self._init_weights()
93 |
94 | def _init_weights(self):
95 | for m in self.mask_convs.modules():
96 | if isinstance(m, nn.Conv2d):
97 | c2_msra_fill(m)
98 | c2_msra_fill(self.projection)
99 |
100 | def forward(self, features):
101 | # mask features (x4 convs)
102 | features = self.mask_convs(features)
103 | # features = checkpoint(self.mask_convs,features)
104 | return self.projection(features)
105 |
106 |
107 | @SPARSE_INST_DECODER_REGISTRY.register()
108 | class BaseIAMDecoder(nn.Module):
109 |
110 | def __init__(self, cfg):
111 | super().__init__()
112 | # add 2 for coordinates
113 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2
114 |
115 | self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR
116 | self.output_iam = cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM
117 |
118 | self.inst_branch = InstanceBranch(cfg, in_channels)
119 | self.mask_branch = MaskBranch(cfg, in_channels)
120 |
121 | @torch.no_grad()
122 | def compute_coordinates(self, x):
123 |         h, w = x.size(2), x.size(3)
124 |         # build a normalized coordinate grid over [-1, 1] for the two
125 |         # extra coordinate channels; the explicit int() casts keep
126 |         # torch.linspace traceable when h/w come back as tensors
127 |         # (e.g. during ONNX export)
128 |         start = int(-1)
129 |         end = int(1)
130 |         steps_y = int(h)
131 |         steps_x = int(w)
132 |
133 |         y_loc = torch.linspace(start, end, steps_y, device=x.device)
134 |         x_loc = torch.linspace(start, end, steps_x, device=x.device)
135 | #y_loc = torch.arange(-1, 1+(2/h), 2/(h-1), device=x.device)
136 | #x_loc = torch.arange(-1, 1+(2/w), 2/(w-1), device=x.device)
137 | y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
138 | y_loc = y_loc.expand([x.shape[0], 1, -1, -1])
139 | x_loc = x_loc.expand([x.shape[0], 1, -1, -1])
140 | locations = torch.cat([x_loc, y_loc], 1)
141 | return locations.to(x)
142 |
143 | def forward(self, features):
144 | coord_features = self.compute_coordinates(features)
145 | features = torch.cat([coord_features, features], dim=1)
146 | pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features)
147 | mask_features = self.mask_branch(features)
148 |
149 | N = pred_kernel.shape[1]
150 | # mask_features: BxCxHxW
151 | B, C, H, W = mask_features.shape
152 | pred_masks = torch.bmm(pred_kernel, mask_features.view(B, C, H * W)).view(B, N, H, W)
153 |
154 | pred_masks = F.interpolate(
155 | pred_masks, scale_factor=self.scale_factor,
156 | mode='bilinear', align_corners=False)
157 |
158 | output = {
159 | "pred_logits": pred_logits,
160 | "pred_masks": pred_masks,
161 | "pred_scores": pred_scores,
162 | }
163 |
164 | if self.output_iam:
165 | iam = F.interpolate(iam, scale_factor=self.scale_factor,
166 | mode='bilinear', align_corners=False)
167 | output['pred_iam'] = iam
168 |
169 | return output
170 |
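# In shapes (with the defaults): pred_kernel Bx100x128 is multiplied with the
# mask features viewed as Bx128x(H*W), giving pred_masks Bx100xHxW, which are
# then bilinearly upsampled by SCALE_FACTOR (2.0 by default).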
171 |
172 | class GroupInstanceBranch(nn.Module):
173 |
174 | def __init__(self, cfg, in_channels):
175 | super().__init__()
176 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM
177 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS
178 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS
179 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
180 | self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS
181 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
182 |
183 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
184 | # iam prediction, a group conv
185 | expand_dim = dim * self.num_groups
186 | self.iam_conv = nn.Conv2d(dim, num_masks * self.num_groups, 3, padding=1, groups=self.num_groups)
187 | # outputs
188 | self.fc = nn.Linear(expand_dim, expand_dim)
189 |
190 | self.cls_score = nn.Linear(expand_dim, self.num_classes)
191 | self.mask_kernel = nn.Linear(expand_dim, kernel_dim)
192 | self.objectness = nn.Linear(expand_dim, 1)
193 |
194 | self.prior_prob = 0.01
195 | self._init_weights()
196 |
197 | def _init_weights(self):
198 | for m in self.inst_convs.modules():
199 | if isinstance(m, nn.Conv2d):
200 | c2_msra_fill(m)
201 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob)
202 | for module in [self.iam_conv, self.cls_score]:
203 | init.constant_(module.bias, bias_value)
204 | init.normal_(self.iam_conv.weight, std=0.01)
205 | init.normal_(self.cls_score.weight, std=0.01)
206 |
207 | init.normal_(self.mask_kernel.weight, std=0.01)
208 | init.constant_(self.mask_kernel.bias, 0.0)
209 | c2_xavier_fill(self.fc)
210 |
211 | def forward(self, features):
212 | # instance features (x4 convs)
213 | features = self.inst_convs(features)
214 | # predict instance activation maps
215 | iam = self.iam_conv(features)
216 | iam_prob = iam.sigmoid()
217 |
218 | B, N = iam_prob.shape[:2]
219 | C = features.size(1)
220 | # BxNxHxW -> BxNx(HW)
221 | iam_prob = iam_prob.view(B, N, -1)
222 | # aggregate features: BxCxHxW -> Bx(HW)xC
223 | inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1))
224 | normalizer = iam_prob.sum(-1).clamp(min=1e-6)
225 | inst_features = inst_features / normalizer[:, :, None]
226 |
227 | inst_features = inst_features.reshape(
228 | B, 4, N // 4, -1).transpose(1, 2).reshape(B, N // 4, -1)
229 |
230 | inst_features = F.relu_(self.fc(inst_features))
231 | # predict classification & segmentation kernel & objectness
232 | pred_logits = self.cls_score(inst_features)
233 | pred_kernel = self.mask_kernel(inst_features)
234 | pred_scores = self.objectness(inst_features)
235 | return pred_logits, pred_kernel, pred_scores, iam
236 |
237 |
238 |
239 | @SPARSE_INST_DECODER_REGISTRY.register()
240 | class GroupIAMDecoder(BaseIAMDecoder):
241 |
242 | def __init__(self, cfg):
243 | super().__init__(cfg)
244 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2
245 | self.inst_branch = GroupInstanceBranch(cfg, in_channels)
246 |
247 |
248 |
249 | def build_sparse_inst_decoder(cfg):
250 | name = cfg.MODEL.SPARSE_INST.DECODER.NAME
251 | return SPARSE_INST_DECODER_REGISTRY.get(name)(cfg)
252 |
--------------------------------------------------------------------------------
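
Note: the decoder forward above builds each instance mask as a batched matrix product between its predicted kernel and the shared mask features, then upsamples the result. A minimal standalone sketch with dummy shapes (not taken from the repository):

    import torch
    import torch.nn.functional as F

    # B images, N instance kernels of dimension C, mask features at H x W
    B, N, C, H, W = 2, 100, 128, 80, 80
    pred_kernel = torch.randn(B, N, C)        # one kernel per predicted instance
    mask_features = torch.randn(B, C, H, W)   # shared per-image mask feature map

    # each mask is the dot product of its kernel with every spatial feature vector
    pred_masks = torch.bmm(pred_kernel, mask_features.view(B, C, H * W)).view(B, N, H, W)
    pred_masks = F.interpolate(pred_masks, scale_factor=2.0, mode='bilinear', align_corners=False)
    print(pred_masks.shape)  # torch.Size([2, 100, 160, 160])
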
/sparseinst/encoder.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | import numpy as np
5 | import torch.nn.functional as F
6 |
7 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill
8 |
9 | from detectron2.utils.registry import Registry
10 | from detectron2.layers import Conv2d
11 |
12 | SPARSE_INST_ENCODER_REGISTRY = Registry("SPARSE_INST_ENCODER")
13 | SPARSE_INST_ENCODER_REGISTRY.__doc__ = "registry for SparseInst encoder"
14 |
15 |
16 | class PyramidPoolingModule(nn.Module):
17 |
18 | def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)):
19 | super().__init__()
20 | self.stages = []
21 | self.stages = nn.ModuleList(
22 | [self._make_stage(in_channels, channels, size) for size in sizes]
23 | )
24 | self.bottleneck = Conv2d(
25 | in_channels + len(sizes) * channels, in_channels, 1)
26 |
27 | def _make_stage(self, features, out_features, size):
28 | # prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
29 | stride = np.floor(10/size).astype(np.int32)
30 | kernel = 10-(size-1)*stride
31 |         # fixed kernel/stride (assumes a 10x10 feature map) instead of AdaptiveAvgPool2d, for ONNX/TensorRT export
32 |         prior = torch.nn.AvgPool2d(kernel_size=kernel, stride=stride)
33 | conv = Conv2d(features, out_features, 1)
34 | return nn.Sequential(prior, conv)
35 |
36 | def forward(self, feats):
37 | h, w = feats.size(2), feats.size(3)
38 | priors = [F.interpolate(input=F.relu_(stage(feats)), size=(
39 | h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats]
40 | out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
41 | return out
42 |
43 |
44 | @SPARSE_INST_ENCODER_REGISTRY.register()
45 | class InstanceContextEncoder(nn.Module):
46 | """
47 | Instance Context Encoder
48 | 1. construct feature pyramids from ResNet
49 | 2. enlarge receptive fields (ppm)
50 | 3. multi-scale fusion
51 | """
52 |
53 | def __init__(self, cfg, input_shape):
54 | super().__init__()
55 | self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS
56 | self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES
57 | # self.norm = cfg.MODEL.SPARSE_INST.ENCODER.NORM
58 | # depthwise = cfg.MODEL.SPARSE_INST.ENCODER.DEPTHWISE
59 | self.in_channels = [input_shape[f].channels for f in self.in_features]
60 | # self.using_bias = self.norm == ""
61 | fpn_laterals = []
62 | fpn_outputs = []
63 | # groups = self.num_channels if depthwise else 1
64 | for in_channel in reversed(self.in_channels):
65 | lateral_conv = Conv2d(in_channel, self.num_channels, 1)
66 | output_conv = Conv2d(self.num_channels, self.num_channels, 3, padding=1)
67 | c2_xavier_fill(lateral_conv)
68 | c2_xavier_fill(output_conv)
69 | fpn_laterals.append(lateral_conv)
70 | fpn_outputs.append(output_conv)
71 | self.fpn_laterals = nn.ModuleList(fpn_laterals)
72 | self.fpn_outputs = nn.ModuleList(fpn_outputs)
73 | # ppm
74 | self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4)
75 | # final fusion
76 | self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1)
77 | c2_msra_fill(self.fusion)
78 |
79 | def forward(self, features):
80 | features = [features[f] for f in self.in_features]
81 | features = features[::-1]
82 | prev_features = self.ppm(self.fpn_laterals[0](features[0]))
83 | outputs = [self.fpn_outputs[0](prev_features)]
84 | for feature, lat_conv, output_conv in zip(features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:]):
85 | lat_features = lat_conv(feature)
86 | top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode='nearest')
87 | prev_features = lat_features + top_down_features
88 | outputs.insert(0, output_conv(prev_features))
89 | size = outputs[0].shape[2:]
90 | features = [
91 | outputs[0]] + [F.interpolate(x, size, mode='bilinear', align_corners=False) for x in outputs[1:]]
92 | features = self.fusion(torch.cat(features, dim=1))
93 | return features
94 |
95 |
96 | def build_sparse_inst_encoder(cfg, input_shape):
97 | name = cfg.MODEL.SPARSE_INST.ENCODER.NAME
98 | return SPARSE_INST_ENCODER_REGISTRY.get(name)(cfg, input_shape)
--------------------------------------------------------------------------------
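
Note: `_make_stage` in `PyramidPoolingModule` replaces `nn.AdaptiveAvgPool2d` with a plain `AvgPool2d` whose kernel and stride are derived for a fixed 10x10 feature map, presumably to keep the module exportable to ONNX/TensorRT. A small check, under that fixed-input assumption, that the derived pooling yields the intended output sizes:

    import numpy as np
    import torch

    x = torch.randn(1, 256, 10, 10)  # assumed fixed-size feature map
    for size in (1, 2, 3, 6):
        stride = int(np.floor(10 / size))
        kernel = 10 - (size - 1) * stride
        out = torch.nn.AvgPool2d(kernel_size=kernel, stride=stride)(x)
        print(size, tuple(out.shape[-2:]))  # (1, 1), (2, 2), (3, 3), (6, 6)
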
/sparseinst/input.ppm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/input.ppm
--------------------------------------------------------------------------------
/sparseinst/loss.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.cuda.amp import autocast
7 | from scipy.optimize import linear_sum_assignment
8 | from fvcore.nn import sigmoid_focal_loss_jit
9 |
10 | from detectron2.utils.registry import Registry
11 |
12 | from .utils import nested_masks_from_list, is_dist_avail_and_initialized, get_world_size
13 |
14 | SPARSE_INST_MATCHER_REGISTRY = Registry("SPARSE_INST_MATCHER")
15 | SPARSE_INST_MATCHER_REGISTRY.__doc__ = "Matcher for SparseInst"
16 | SPARSE_INST_CRITERION_REGISTRY = Registry("SPARSE_INST_CRITERION")
17 | SPARSE_INST_CRITERION_REGISTRY.__doc__ = "Criterion for SparseInst"
18 |
19 |
20 | def compute_mask_iou(inputs, targets):
21 | inputs = inputs.sigmoid()
22 | # thresholding
23 | binarized_inputs = (inputs >= 0.4).float()
24 | targets = (targets > 0.5).float()
25 | intersection = (binarized_inputs * targets).sum(-1)
26 | union = targets.sum(-1) + binarized_inputs.sum(-1) - intersection
27 | score = intersection / (union + 1e-6)
28 | return score
29 |
30 |
31 | def dice_score(inputs, targets):
32 | inputs = inputs.sigmoid()
33 | numerator = 2 * torch.matmul(inputs, targets.t())
34 | denominator = (
35 | inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1)
36 | score = numerator / (denominator + 1e-4)
37 | return score
38 |
39 |
40 | def dice_loss(inputs, targets, reduction='sum'):
41 | inputs = inputs.sigmoid()
42 | assert inputs.shape == targets.shape
43 | numerator = 2 * (inputs * targets).sum(1)
44 | denominator = (inputs * inputs).sum(-1) + (targets * targets).sum(-1)
45 | loss = 1 - (numerator) / (denominator + 1e-4)
46 | if reduction == 'none':
47 | return loss
48 | return loss.sum()
49 |
50 |
51 | @SPARSE_INST_CRITERION_REGISTRY.register()
52 | class SparseInstCriterion(nn.Module):
53 |     # This part is partially derived from: https://github.com/facebookresearch/detr/blob/main/models/detr.py
54 |
55 | def __init__(self, cfg, matcher):
56 | super().__init__()
57 | self.matcher = matcher
58 | self.losses = cfg.MODEL.SPARSE_INST.LOSS.ITEMS
59 | self.weight_dict = self.get_weight_dict(cfg)
60 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
61 |
62 | def get_weight_dict(self, cfg):
63 | losses = ("loss_ce", "loss_mask", "loss_dice", "loss_objectness")
64 | weight_dict = {}
65 | ce_weight = cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT
66 | mask_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT
67 | dice_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT
68 | objectness_weight = cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT
69 |
70 | weight_dict = dict(
71 | zip(losses, (ce_weight, mask_weight, dice_weight, objectness_weight)))
72 | return weight_dict
73 |
74 | def _get_src_permutation_idx(self, indices):
75 | # permute predictions following indices
76 | batch_idx = torch.cat([torch.full_like(src, i)
77 | for i, (src, _) in enumerate(indices)])
78 | src_idx = torch.cat([src for (src, _) in indices])
79 | return batch_idx, src_idx
80 |
81 | def _get_tgt_permutation_idx(self, indices):
82 | # permute targets following indices
83 | batch_idx = torch.cat([torch.full_like(tgt, i)
84 | for i, (_, tgt) in enumerate(indices)])
85 | tgt_idx = torch.cat([tgt for (_, tgt) in indices])
86 | return batch_idx, tgt_idx
87 |
88 | def loss_labels(self, outputs, targets, indices, num_instances, input_shape=None):
89 | assert "pred_logits" in outputs
90 | src_logits = outputs['pred_logits']
91 | idx = self._get_src_permutation_idx(indices)
92 | target_classes_o = torch.cat([t["labels"][J]
93 | for t, (_, J) in zip(targets, indices)])
94 | target_classes = torch.full(src_logits.shape[:2], self.num_classes,
95 | dtype=torch.int64, device=src_logits.device)
96 | target_classes[idx] = target_classes_o
97 |
98 | src_logits = src_logits.flatten(0, 1)
99 | # prepare one_hot target.
100 | target_classes = target_classes.flatten(0, 1)
101 | pos_inds = torch.nonzero(
102 | target_classes != self.num_classes, as_tuple=True)[0]
103 | labels = torch.zeros_like(src_logits)
104 | labels[pos_inds, target_classes[pos_inds]] = 1
105 | # comp focal loss.
106 | class_loss = sigmoid_focal_loss_jit(
107 | src_logits,
108 | labels,
109 | alpha=0.25,
110 | gamma=2.0,
111 | reduction="sum",
112 | ) / num_instances
113 | losses = {'loss_ce': class_loss}
114 | return losses
115 |
116 | def loss_masks_with_iou_objectness(self, outputs, targets, indices, num_instances, input_shape):
117 | src_idx = self._get_src_permutation_idx(indices)
118 | tgt_idx = self._get_tgt_permutation_idx(indices)
119 | # Bx100xHxW
120 | assert "pred_masks" in outputs
121 | assert "pred_scores" in outputs
122 | src_iou_scores = outputs["pred_scores"]
123 | src_masks = outputs["pred_masks"]
124 | with torch.no_grad():
125 | target_masks, _ = nested_masks_from_list(
126 | [t["masks"].tensor for t in targets], input_shape).decompose()
127 | num_masks = [len(t["masks"]) for t in targets]
128 | target_masks = target_masks.to(src_masks)
129 | if len(target_masks) == 0:
130 | losses = {
131 | "loss_dice": src_masks.sum() * 0.0,
132 | "loss_mask": src_masks.sum() * 0.0,
133 | "loss_objectness": src_iou_scores.sum() * 0.0
134 | }
135 | return losses
136 |
137 | src_masks = src_masks[src_idx]
138 | target_masks = F.interpolate(
139 | target_masks[:, None], size=src_masks.shape[-2:], mode='bilinear', align_corners=False).squeeze(1)
140 |
141 | src_masks = src_masks.flatten(1)
142 | # FIXME: tgt_idx
143 | mix_tgt_idx = torch.zeros_like(tgt_idx[1])
144 | cum_sum = 0
145 | for num_mask in num_masks:
146 | mix_tgt_idx[cum_sum: cum_sum + num_mask] = cum_sum
147 | cum_sum += num_mask
148 | mix_tgt_idx += tgt_idx[1]
149 |
150 | target_masks = target_masks[mix_tgt_idx].flatten(1)
151 |
152 | with torch.no_grad():
153 | ious = compute_mask_iou(src_masks, target_masks)
154 |
155 | tgt_iou_scores = ious
156 | src_iou_scores = src_iou_scores[src_idx]
157 | tgt_iou_scores = tgt_iou_scores.flatten(0)
158 | src_iou_scores = src_iou_scores.flatten(0)
159 |
160 | losses = {
161 | "loss_objectness": F.binary_cross_entropy_with_logits(src_iou_scores, tgt_iou_scores, reduction='mean'),
162 | "loss_dice": dice_loss(src_masks, target_masks) / num_instances,
163 | "loss_mask": F.binary_cross_entropy_with_logits(src_masks, target_masks, reduction='mean')
164 | }
165 | return losses
166 |
167 | def get_loss(self, loss, outputs, targets, indices, num_instances, **kwargs):
168 | loss_map = {
169 | "labels": self.loss_labels,
170 | "masks": self.loss_masks_with_iou_objectness,
171 | }
172 | if loss == "loss_objectness":
173 | # NOTE: loss_objectness will be calculated in `loss_masks_with_iou_objectness`
174 | return {}
175 | assert loss in loss_map
176 | return loss_map[loss](outputs, targets, indices, num_instances, **kwargs)
177 |
178 | def forward(self, outputs, targets, input_shape):
179 |
180 | outputs_without_aux = {k: v for k,
181 | v in outputs.items() if k != 'aux_outputs'}
182 |
183 | # Retrieve the matching between the outputs of the last layer and the targets
184 | indices = self.matcher(outputs_without_aux, targets, input_shape)
185 |         # Compute the average number of target instances across all nodes, for normalization purposes
186 | num_instances = sum(len(t["labels"]) for t in targets)
187 | num_instances = torch.as_tensor(
188 | [num_instances], dtype=torch.float, device=next(iter(outputs.values())).device)
189 | if is_dist_avail_and_initialized():
190 | torch.distributed.all_reduce(num_instances)
191 | num_instances = torch.clamp(
192 | num_instances / get_world_size(), min=1).item()
193 | # Compute all the requested losses
194 | losses = {}
195 | for loss in self.losses:
196 | losses.update(self.get_loss(loss, outputs, targets, indices,
197 | num_instances, input_shape=input_shape))
198 |
199 | for k in losses.keys():
200 | if k in self.weight_dict:
201 | losses[k] *= self.weight_dict[k]
202 |
203 | return losses
204 |
205 |
206 | @SPARSE_INST_MATCHER_REGISTRY.register()
207 | class SparseInstMatcherV1(nn.Module):
208 |
209 | def __init__(self, cfg):
210 | super().__init__()
211 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA
212 | self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA
213 | self.mask_score = dice_score
214 |
215 | @torch.no_grad()
216 | def forward(self, outputs, targets, input_shape):
217 | B, N, H, W = outputs["pred_masks"].shape
218 | pred_masks = outputs['pred_masks']
219 | pred_logits = outputs['pred_logits'].sigmoid()
220 |
221 | indices = []
222 |
223 | for i in range(B):
224 | tgt_ids = targets[i]["labels"]
225 | # no annotations
226 | if tgt_ids.shape[0] == 0:
227 | indices.append((torch.as_tensor([]),
228 | torch.as_tensor([])))
229 | continue
230 |
231 | tgt_masks = targets[i]['masks'].tensor.to(pred_masks)
232 | pred_logit = pred_logits[i]
233 | out_masks = pred_masks[i]
234 |
235 |             # align ground-truth masks with the predicted masks:
236 |             # (1) pad the GT masks into a zero tensor of the
237 |             #     network input size (input_shape)
238 |             # (2) downsample to the output mask size (0.25x input)
239 | ori_h, ori_w = tgt_masks.size(1), tgt_masks.size(2)
240 | tgt_masks_ = torch.zeros(
241 | (1, tgt_masks.size(0), input_shape[0], input_shape[1])).to(pred_masks)
242 | tgt_masks_[0, :, :ori_h, :ori_w] = tgt_masks
243 | tgt_masks = F.interpolate(
244 | tgt_masks_, size=out_masks.shape[-2:], mode='bilinear', align_corners=False)[0]
245 |
246 | # compute dice score and classification score
247 | tgt_masks = tgt_masks.flatten(1)
248 | out_masks = out_masks.flatten(1)
249 |
250 | mask_score = self.mask_score(out_masks, tgt_masks)
251 | # Nx(Number of gts)
252 | matching_prob = pred_logit[:, tgt_ids]
253 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta)
254 | # hungarian matching
255 | inds = linear_sum_assignment(C.cpu(), maximize=True)
256 | indices.append(inds)
257 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
258 |
259 |
260 | @SPARSE_INST_MATCHER_REGISTRY.register()
261 | class SparseInstMatcher(nn.Module):
262 |
263 | def __init__(self, cfg):
264 | super().__init__()
265 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA
266 | self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA
267 | self.mask_score = dice_score
268 |
269 | def forward(self, outputs, targets, input_shape):
270 | with torch.no_grad():
271 | B, N, H, W = outputs["pred_masks"].shape
272 | pred_masks = outputs['pred_masks']
273 | pred_logits = outputs['pred_logits'].sigmoid()
274 |
275 | tgt_ids = torch.cat([v["labels"] for v in targets])
276 |
277 | if tgt_ids.shape[0] == 0:
278 | return [(torch.as_tensor([]).to(pred_logits), torch.as_tensor([]).to(pred_logits))] * B
279 | tgt_masks, _ = nested_masks_from_list(
280 | [t["masks"].tensor for t in targets], input_shape).decompose()
281 | device = pred_masks.device
282 | tgt_masks = tgt_masks.to(pred_masks)
283 |
284 | tgt_masks = F.interpolate(
285 | tgt_masks[:, None], size=pred_masks.shape[-2:], mode="bilinear", align_corners=False).squeeze(1)
286 |
287 | pred_masks = pred_masks.view(B * N, -1)
288 | tgt_masks = tgt_masks.flatten(1)
289 | with autocast(enabled=False):
290 | pred_masks = pred_masks.float()
291 | tgt_masks = tgt_masks.float()
292 | pred_logits = pred_logits.float()
293 | mask_score = self.mask_score(pred_masks, tgt_masks)
294 | # Nx(Number of gts)
295 | matching_prob = pred_logits.view(B * N, -1)[:, tgt_ids]
296 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta)
297 |
298 | C = C.view(B, N, -1).cpu()
299 | # hungarian matching
300 | sizes = [len(v["masks"]) for v in targets]
301 | indices = [linear_sum_assignment(c[i], maximize=True)
302 | for i, c in enumerate(C.split(sizes, -1))]
303 | indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(
304 | j, dtype=torch.int64)) for i, j in indices]
305 | return indices
306 |
307 |
308 | def build_sparse_inst_matcher(cfg):
309 | name = cfg.MODEL.SPARSE_INST.MATCHER.NAME
310 | return SPARSE_INST_MATCHER_REGISTRY.get(name)(cfg)
311 |
312 |
313 | def build_sparse_inst_criterion(cfg):
314 | matcher = build_sparse_inst_matcher(cfg)
315 | name = cfg.MODEL.SPARSE_INST.LOSS.NAME
316 | return SPARSE_INST_CRITERION_REGISTRY.get(name)(cfg, matcher)
317 |
--------------------------------------------------------------------------------
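
Note: the matchers above score every (prediction, ground-truth) pair with C = dice^alpha * p_class^beta and solve a Hungarian assignment on that cost. A self-contained sketch with dummy data (dice_score restated from above; alpha/beta are placeholder values, the real ones come from MODEL.SPARSE_INST.MATCHER):

    import torch
    from scipy.optimize import linear_sum_assignment

    def dice_score(inputs, targets):
        # pairwise dice between N predicted masks and M target masks (flattened)
        inputs = inputs.sigmoid()
        numerator = 2 * torch.matmul(inputs, targets.t())
        denominator = (inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1)
        return numerator / (denominator + 1e-4)

    N, M, HW, num_classes = 100, 5, 64 * 64, 80
    pred_masks = torch.randn(N, HW)                # mask logits per predicted instance
    pred_probs = torch.rand(N, num_classes)        # class probabilities (post-sigmoid)
    tgt_masks = (torch.rand(M, HW) > 0.5).float()  # binary ground-truth masks
    tgt_ids = torch.randint(0, num_classes, (M,))  # ground-truth class ids

    alpha, beta = 0.8, 0.2  # placeholder weights
    C = dice_score(pred_masks, tgt_masks) ** alpha * pred_probs[:, tgt_ids] ** beta
    rows, cols = linear_sum_assignment(C.numpy(), maximize=True)
    print(list(zip(rows.tolist(), cols.tolist())))  # one prediction index per target
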
/sparseinst/sparseinst.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from detectron2.modeling import build_backbone
8 | from detectron2.structures import ImageList, Instances, BitMasks
9 | from detectron2.modeling import META_ARCH_REGISTRY, build_backbone
10 | import numpy as np
11 | from .encoder import build_sparse_inst_encoder
12 | from .decoder import build_sparse_inst_decoder
13 | from .loss import build_sparse_inst_criterion
14 | from .utils import nested_tensor_from_tensor_list
15 |
16 | __all__ = ["SparseInst"]
17 |
18 |
19 | @torch.jit.script
20 | def rescoring_mask(scores, mask_pred, masks):
21 | mask_pred_ = mask_pred.float()
22 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]).double() + 1e-6).float())
23 |
24 |
25 | @META_ARCH_REGISTRY.register()
26 | class SparseInst(nn.Module):
27 |
28 | def __init__(self, cfg):
29 | super().__init__()
30 |
31 | # move to target device
32 | self.device = torch.device(cfg.MODEL.DEVICE)
33 | self.use_cp = True
34 |
35 |         print("Device in use:", torch.cuda.get_device_name())
36 | # backbone
37 | self.backbone = build_backbone(cfg)
38 | self.size_divisibility = self.backbone.size_divisibility
39 | output_shape = self.backbone.output_shape()
40 |
41 | # encoder & decoder
42 | self.encoder = build_sparse_inst_encoder(cfg, output_shape)
43 | self.decoder = build_sparse_inst_decoder(cfg)
44 |
45 | # matcher & loss (matcher is built in loss)
46 | self.criterion = build_sparse_inst_criterion(cfg)
47 |
48 | # data and preprocessing
49 | self.mask_format = cfg.INPUT.MASK_FORMAT
50 |
51 | self.pixel_mean = torch.Tensor(
52 | cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
53 | self.pixel_std = torch.Tensor(
54 | cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
55 | # self.normalizer = lambda x: (x - pixel_mean) / pixel_std
56 |
57 | # inference
58 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD
59 | self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD
60 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS
61 |
62 | def normalizer(self, image):
63 | image = (image - self.pixel_mean) / self.pixel_std
64 | return image
65 |
66 | def preprocess_inputs(self, batched_inputs):
67 | images = [x["image"].to(self.device) for x in batched_inputs]
68 | images = [self.normalizer(x) for x in images]
69 | images = ImageList.from_tensors(images, 32)
70 | return images
71 |
72 | def prepare_targets(self, targets):
73 | new_targets = []
74 | for targets_per_image in targets:
75 | target = {}
76 | gt_classes = targets_per_image.gt_classes
77 | target["labels"] = gt_classes.to(self.device)
78 | h, w = targets_per_image.image_size
79 | if not targets_per_image.has('gt_masks'):
80 | gt_masks = BitMasks(torch.empty(0, h, w))
81 | else:
82 | gt_masks = targets_per_image.gt_masks
83 | if self.mask_format == "polygon":
84 | if len(gt_masks.polygons) == 0:
85 | gt_masks = BitMasks(torch.empty(0, h, w))
86 | else:
87 | gt_masks = BitMasks.from_polygon_masks(
88 | gt_masks.polygons, h, w)
89 |
90 | target["masks"] = gt_masks.to(self.device)
91 | new_targets.append(target)
92 |
93 | return new_targets
94 |
95 | def forward(self, batched_inputs):
96 | images = self.preprocess_inputs(batched_inputs)
97 | if isinstance(images, (list, torch.Tensor)):
98 | images = nested_tensor_from_tensor_list(images)
99 | max_shape = images.tensor.shape[2:]
100 | features = self.backbone(images.tensor)
101 | features = self.encoder(features)
102 | output = self.decoder(features)
103 |
104 | if self.training:
105 | gt_instances = [x["instances"].to(
106 | self.device) for x in batched_inputs]
107 | targets = self.prepare_targets(gt_instances)
108 | losses = self.criterion(output, targets, max_shape)
109 | return losses
110 | else:
111 | results = self.inference(
112 | output, batched_inputs, max_shape, images.image_sizes)
113 | processed_results = [{"instances": r} for r in results]
114 | return processed_results
115 |
116 | def forward_test_3(self, images):
117 | # images = self.preprocess_inputs(batched_inputs)
118 | # if isinstance(images, (list, torch.Tensor)):
119 | # images = nested_tensor_from_tensor_list(images)
120 | max_shape = images.shape[2:]
121 | # forward
122 | # if self.use_cp:
123 | # features = self.backbone(images.tensor)
124 | # features = checkpoint(self.encoder,features)
125 | # output = self.decoder(features)
126 | # else:
127 | features = self.backbone(images)
128 | features = self.encoder(features)
129 | output = self.decoder(features)
130 |
131 |         if self.training:
132 |             # this export/benchmark path takes a plain image tensor and has no
133 |             # batched_inputs, so the training branch cannot be run from here
134 |             raise RuntimeError(
135 |                 "forward_test_3 is an inference-only path; use forward() for training")
136 | else:
137 | results = self.inference_test_3(output, images)
138 | # import pdb; pdb.set_trace()
139 | # processed_results = [{"instances": r} for r in results]
140 |
141 | out_scores = torch.cat([r.scores.unsqueeze(0) for r in results], dim=0)
142 | out_pred_classes = torch.cat([r.pred_classes.unsqueeze(0) for r in results], dim=0)
143 | out_pred_masks = torch.cat([r.pred_masks for r in results], dim=0)
144 | return (out_scores, out_pred_classes, out_pred_masks)
145 |
146 |
147 | def inference(self, output, batched_inputs, max_shape, image_sizes):
148 | # max_detections = self.max_detections
149 | results = []
150 | pred_scores = output["pred_logits"].sigmoid()
151 | pred_masks = output["pred_masks"].sigmoid()
152 | pred_objectness = output["pred_scores"].sigmoid()
153 | pred_scores = torch.sqrt(pred_scores * pred_objectness)
154 | for _, (scores_per_image, mask_pred_per_image, batched_input, img_shape) in enumerate(zip(
155 | pred_scores, pred_masks, batched_inputs, image_sizes)):
156 |
157 | ori_shape = (batched_input["height"], batched_input["width"])
158 | result = Instances(ori_shape)
159 | # max/argmax
160 | scores, labels = scores_per_image.max(dim=-1)
161 | # cls threshold
162 | keep = scores > self.cls_threshold
163 | scores = scores[keep]
164 | labels = labels[keep]
165 | mask_pred_per_image = mask_pred_per_image[keep]
166 | if scores.size(0) == 0:
167 | result.scores = scores
168 | result.pred_classes = labels
169 | results.append(result)
170 | continue
171 |
172 | h, w = img_shape
173 | # rescoring mask using maskness
174 | scores = rescoring_mask(
175 | scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image)
176 | # upsample the masks to the original resolution:
177 | # (1) upsampling the masks to the padded inputs, remove the padding area
178 | # (2) upsampling/downsampling the masks to the original sizes
179 |
180 | mask_pred_per_image = F.interpolate(
181 | mask_pred_per_image.unsqueeze(1), size=max_shape, mode="bilinear", align_corners=False)[:, :, :h, :w]
182 | mask_pred_per_image = F.interpolate(
183 | mask_pred_per_image, size=ori_shape, mode='bilinear', align_corners=False).squeeze(1)
184 |
185 | mask_pred = mask_pred_per_image > self.mask_threshold
186 | # fix the bug for visualization
187 | # mask_pred = BitMasks(mask_pred)
188 |
189 | # using Detectron2 Instances to store the final results
190 | result.pred_masks = mask_pred
191 | result.scores = scores
192 | result.pred_classes = labels
193 | results.append(result)
194 | return results
195 |
196 | def inference_test_3(self, output, images):
197 | # max_detections = self.max_detections
198 | results = []
199 | pred_scores = output["pred_logits"].sigmoid()
200 | pred_masks = output["pred_masks"].sigmoid()
201 | pred_objectness = output["pred_scores"].sigmoid()
202 | pred_scores = torch.sqrt(pred_scores * pred_objectness)
203 |
204 | for _, (scores_per_image, mask_pred_per_image, image) in enumerate(zip(
205 | pred_scores, pred_masks, images)):
206 |
207 | shape = image.shape[1:]
208 | result = Instances(shape)
209 |
210 | scores, labels = scores_per_image.max(dim=-1)
211 |
212 | if scores.size(0) == 0:
213 | result.scores = scores
214 | result.pred_classes = labels
215 | results.append(result)
216 | continue
217 |
218 | h, w = shape
219 | # rescoring mask using maskness
220 | scores = rescoring_mask(scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image)
221 | # using Detectron2 Instances to store the final results
222 |
223 | result.pred_masks = mask_pred_per_image #mask_pred
224 | result.scores = scores
225 | result.pred_classes = labels
226 | results.append(result)
227 |
228 | return results
229 |
--------------------------------------------------------------------------------
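
Note: at inference the class scores are rescaled by "maskness", the mean soft-mask value inside the thresholded mask, which down-weights detections whose masks are mostly low-confidence. A minimal sketch of `rescoring_mask` on dummy data:

    import torch

    def rescoring_mask(scores, mask_pred, masks):
        # scale each score by the mean soft-mask value inside the binary mask
        mask_pred_ = mask_pred.float()
        return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6))

    scores = torch.tensor([0.9, 0.7])
    masks = torch.rand(2, 4, 4)                        # soft masks in [0, 1]
    print(rescoring_mask(scores, masks > 0.45, masks))  # rescored scores, never above the inputs
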
/sparseinst/utils.py:
--------------------------------------------------------------------------------
1 |
2 | from typing import Optional, List
3 |
4 | import torch
5 | from torch import Tensor
6 | import torch.distributed as dist
7 | import torch.nn.functional as F
8 | import torchvision
9 |
10 |
11 | def _max_by_axis(the_list):
12 | # type: (List[List[int]]) -> List[int]
13 | maxes = the_list[0]
14 | for sublist in the_list[1:]:
15 | for index, item in enumerate(sublist):
16 | maxes[index] = max(maxes[index], item)
17 | return maxes
18 |
19 |
20 | class NestedTensor(object):
21 | def __init__(self, tensors, mask: Optional[Tensor]):
22 | self.tensors = tensors
23 | self.mask = mask
24 |
25 | def to(self, device):
26 | cast_tensor = self.tensors.to(device)
27 | mask = self.mask
28 | if mask is not None:
29 | assert mask is not None
30 | cast_mask = mask.to(device)
31 | else:
32 | cast_mask = None
33 | return NestedTensor(cast_tensor, cast_mask)
34 |
35 | def decompose(self):
36 | return self.tensors, self.mask
37 |
38 | def __repr__(self):
39 | return str(self.tensors)
40 |
41 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
42 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
43 |
44 |
45 | @torch.jit.unused
46 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
47 | max_size = []
48 | for i in range(tensor_list[0].dim()):
49 | max_size_i = torch.max(torch.stack([img.shape[i]
50 | for img in tensor_list]).to(torch.float32)).to(torch.int64)
51 | max_size.append(max_size_i)
52 | max_size = tuple(max_size)
53 |
54 | # work around for
55 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
56 | # m[: img.shape[1], :img.shape[2]] = False
57 | # which is not yet supported in onnx
58 | padded_imgs = []
59 | padded_masks = []
60 | for img in tensor_list:
61 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
62 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
63 | padded_imgs.append(padded_img)
64 |
65 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
66 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
67 | padded_masks.append(padded_mask.to(torch.bool))
68 |
69 | tensor = torch.stack(padded_imgs)
70 | mask = torch.stack(padded_masks)
71 |
72 | return NestedTensor(tensor, mask=mask)
73 |
74 |
75 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
76 | # TODO make this more general
77 | if tensor_list[0].ndim == 3:
78 | if torchvision._is_tracing():
79 | # nested_tensor_from_tensor_list() does not export well to ONNX
80 | # call _onnx_nested_tensor_from_tensor_list() instead
81 | return _onnx_nested_tensor_from_tensor_list(tensor_list)
82 |
83 | # TODO make it support different-sized images
84 | max_size = _max_by_axis([list(img.shape) for img in tensor_list])
85 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
86 | batch_shape = [len(tensor_list)] + max_size
87 | b, c, h, w = batch_shape
88 | dtype = tensor_list[0].dtype
89 | device = tensor_list[0].device
90 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
91 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
92 | for img, pad_img, m in zip(tensor_list, tensor, mask):
93 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
94 | m[: img.shape[1], :img.shape[2]] = False
95 | else:
96 | raise ValueError('not supported')
97 | return NestedTensor(tensor, mask)
98 |
99 |
100 | def nested_masks_from_list(tensor_list: List[Tensor], input_shape=None):
101 | if tensor_list[0].ndim == 3:
102 | dim_size = sum([img.shape[0] for img in tensor_list])
103 | if input_shape is None:
104 | max_size = _max_by_axis([list(img.shape[-2:]) for img in tensor_list])
105 | else:
106 | max_size = [input_shape[0], input_shape[1]]
107 | batch_shape = [dim_size] + max_size
108 | # b, h, w = batch_shape
109 | dtype = tensor_list[0].dtype
110 | device = tensor_list[0].device
111 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
112 | mask = torch.zeros(batch_shape, dtype=torch.bool, device=device)
113 | idx = 0
114 | for img in tensor_list:
115 | c = img.shape[0]
116 | c_ = idx + c
117 | tensor[idx: c_, :img.shape[1], : img.shape[2]].copy_(img)
118 | mask[idx: c_, :img.shape[1], :img.shape[2]] = True
119 | idx = c_
120 | else:
121 | raise ValueError('not supported')
122 | return NestedTensor(tensor, mask)
123 |
124 |
125 | def is_dist_avail_and_initialized():
126 | if not dist.is_available():
127 | return False
128 | if not dist.is_initialized():
129 | return False
130 | return True
131 |
132 |
133 | def get_world_size():
134 | if not is_dist_avail_and_initialized():
135 | return 1
136 | return dist.get_world_size()
137 |
138 |
139 | def aligned_bilinear(tensor, factor):
140 | # borrowed from Adelaidet: https://github1s.com/aim-uofa/AdelaiDet/blob/HEAD/adet/utils/comm.py
141 | assert tensor.dim() == 4
142 | assert factor >= 1
143 | assert int(factor) == factor
144 |
145 | if factor == 1:
146 | return tensor
147 |
148 | h, w = tensor.size()[2:]
149 | tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
150 | oh = factor * h + 1
151 | ow = factor * w + 1
152 | tensor = F.interpolate(
153 | tensor, size=(oh, ow),
154 | mode='bilinear',
155 | align_corners=True
156 | )
157 | tensor = F.pad(
158 | tensor, pad=(factor // 2, 0, factor // 2, 0),
159 | mode="replicate"
160 | )
161 |
162 | return tensor[:, :, :oh - 1, :ow - 1]
163 |
--------------------------------------------------------------------------------
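
Note: `nested_tensor_from_tensor_list` batches images of different sizes by zero-padding them to a common (max H, max W) and returns a padding mask alongside. A quick usage sketch (assumes the repository is on PYTHONPATH):

    import torch
    from sparseinst.utils import nested_tensor_from_tensor_list

    imgs = [torch.rand(3, 480, 600), torch.rand(3, 512, 576)]
    nested = nested_tensor_from_tensor_list(imgs)
    print(nested.tensors.shape)  # torch.Size([2, 3, 512, 600]) -- zero-padded batch
    print(nested.mask.shape)     # torch.Size([2, 512, 600])    -- True where padding was added
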
/test.py:
--------------------------------------------------------------------------------
1 | print("It works")
2 | print("okok")
--------------------------------------------------------------------------------
/test_net.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.cuda.amp import autocast
8 |
9 | from detectron2.config import get_cfg
10 | from detectron2.modeling import build_backbone
11 | from detectron2.checkpoint import DetectionCheckpointer
12 | from detectron2.structures import ImageList, Instances, BitMasks
13 | from detectron2.engine import default_argument_parser, default_setup
14 | from detectron2.data import build_detection_test_loader
15 | from detectron2.evaluation import COCOEvaluator, print_csv_format
16 |
17 | from sparseinst import build_sparse_inst_encoder, build_sparse_inst_decoder, add_sparse_inst_config
18 | from sparseinst import COCOMaskEvaluator
19 |
20 |
21 | device = torch.device('cuda:0')
22 | dtype = torch.float32
23 |
24 | __all__ = ["SparseInst"]
25 |
26 | pixel_mean = torch.Tensor([123.675, 116.280, 103.530]).to(device).view(3, 1, 1)
27 | pixel_std = torch.Tensor([58.395, 57.120, 57.375]).to(device).view(3, 1, 1)
28 |
29 |
30 | @torch.jit.script
31 | def normalizer(x, mean, std): return (x - mean) / std
32 |
33 |
34 | def synchronize():
35 | torch.cuda.synchronize()
36 |
37 |
38 | def process_batched_inputs(batched_inputs):
39 | images = [x["image"].to(device) for x in batched_inputs]
40 | images = [normalizer(x, pixel_mean, pixel_std) for x in images]
41 | images = ImageList.from_tensors(images, 32)
42 | ori_size = (batched_inputs[0]["height"], batched_inputs[0]["width"])
43 | return images.tensor, images.image_sizes[0], ori_size
44 |
45 |
46 | @torch.jit.script
47 | def rescoring_mask(scores, mask_pred, masks):
48 | mask_pred_ = mask_pred.float()
49 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6))
50 |
51 |
52 | class SparseInst(nn.Module):
53 |
54 | def __init__(self, cfg):
55 |
56 | super().__init__()
57 |
58 | self.device = torch.device(cfg.MODEL.DEVICE)
59 | # backbone
60 | self.backbone = build_backbone(cfg)
61 | self.size_divisibility = self.backbone.size_divisibility
62 |
63 | output_shape = self.backbone.output_shape()
64 |
65 | self.encoder = build_sparse_inst_encoder(cfg, output_shape)
66 | self.decoder = build_sparse_inst_decoder(cfg)
67 |
68 | self.to(self.device)
69 |
70 | # inference
71 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD
72 | self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD
73 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS
74 | self.mask_format = cfg.INPUT.MASK_FORMAT
75 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
76 |
77 | def forward(self, image, resized_size, ori_size):
78 | max_size = image.shape[2:]
79 | features = self.backbone(image)
80 | features = self.encoder(features)
81 | output = self.decoder(features)
82 | result = self.inference_single(
83 | output, resized_size, max_size, ori_size)
84 | return result
85 |
86 | def inference_single(self, outputs, img_shape, pad_shape, ori_shape):
 87 |         """
 88 |         Inference for a single sample.
 89 |         Args:
 90 |             outputs (dict): decoder outputs with "pred_logits" [1xNxC],
 91 |                 "pred_scores" [1xNx1] and "pred_masks" [1xNxHxW]
 92 |             img_shape (tuple): (h1, w1), size of the resized image before padding
 93 |             pad_shape (tuple): (h2, w2), size of the padded resized image, h1*w1 <= h2*w2
 94 |             ori_shape (tuple): (h3, w3), original image size
 95 |         """
96 | result = Instances(ori_shape)
97 | # scoring
98 | pred_logits = outputs["pred_logits"][0].sigmoid()
99 | pred_scores = outputs["pred_scores"][0].sigmoid().squeeze()
100 | pred_masks = outputs["pred_masks"][0].sigmoid()
101 | # obtain scores
102 | scores, labels = pred_logits.max(dim=-1)
103 | # remove by thresholding
104 | keep = scores > self.cls_threshold
105 | scores = torch.sqrt(scores[keep] * pred_scores[keep])
106 | labels = labels[keep]
107 | pred_masks = pred_masks[keep]
108 |
109 | if scores.size(0) == 0:
110 | return None
111 | scores = rescoring_mask(scores, pred_masks > 0.45, pred_masks)
112 | h, w = img_shape
113 | # resize masks
114 | pred_masks = F.interpolate(pred_masks.unsqueeze(1), size=pad_shape,
115 | mode="bilinear", align_corners=False)[:, :, :h, :w]
116 | pred_masks = F.interpolate(pred_masks, size=ori_shape, mode='bilinear',
117 | align_corners=False).squeeze(1)
118 | mask_pred = pred_masks > self.mask_threshold
119 |
120 | mask_pred = BitMasks(mask_pred)
121 | result.pred_masks = mask_pred
122 | result.scores = scores
123 | result.pred_classes = labels
124 | return result
125 |
126 |
127 | def test_sparseinst_speed(cfg, fp16=True):
128 | device = torch.device('cuda:0')
129 |
130 | model = SparseInst(cfg)
131 | model.eval()
132 | model.to(device)
133 | print(model)
134 | size = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST)
135 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
136 | cfg.MODEL.WEIGHTS, resume=False)
137 |
138 |     torch.backends.cudnn.enabled = True
139 | torch.backends.cudnn.benchmark = True
140 |
141 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
142 |
143 | evaluator = COCOMaskEvaluator(
144 | cfg.DATASETS.TEST[0], ("segm",), False, output_folder)
145 | evaluator.reset()
146 | model.to(device)
147 | model.eval()
148 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
149 | durations = []
150 |
151 | with autocast(enabled=fp16):
152 | with torch.no_grad():
153 | for idx, inputs in enumerate(data_loader):
154 | images, resized_size, ori_size = process_batched_inputs(inputs)
155 | synchronize()
156 | start_time = time.perf_counter()
157 | output = model(images, resized_size, ori_size)
158 | print(len(output))
159 | print(output)
160 | synchronize()
161 | end = time.perf_counter() - start_time
162 |
163 | durations.append(end)
164 | if idx % 100 == 0:
165 | print("process: [{}/{}] fps: {:.3f}".format(idx,
166 |                               len(data_loader), 1 / np.mean(durations[100:] or durations)))
167 | evaluator.process(inputs, [{"instances": output}])
168 | # evaluate
169 | results = evaluator.evaluate()
170 | print_csv_format(results)
171 |
172 | latency = np.mean(durations[100:])
173 | fps = 1 / latency
174 | print("speed: {:.4f}s FPS: {:.2f}".format(latency, fps))
175 |
176 |
177 | def setup(args):
178 | """
179 | Create configs and perform basic setups.
180 | """
181 | cfg = get_cfg()
182 | add_sparse_inst_config(cfg)
183 | cfg.merge_from_file(args.config_file)
184 | cfg.merge_from_list(args.opts)
185 | cfg.freeze()
186 | default_setup(cfg, args)
187 | return cfg
188 |
189 |
190 | if __name__ == '__main__':
191 |
192 | args = default_argument_parser()
193 | args.add_argument("--fp16", action="store_true",
194 | help="support fp16 for inference")
195 | args = args.parse_args()
196 | print("Command Line Args:", args)
197 | cfg = setup(args)
198 | test_sparseinst_speed(cfg, fp16=args.fp16)
199 |
--------------------------------------------------------------------------------
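
Note: `inference_single` resizes masks in two stages: first up to the padded input size (cropping away the padding), then to the original image size. A sketch of that resizing with dummy shapes:

    import torch
    import torch.nn.functional as F

    pred_masks = torch.rand(3, 160, 160)              # N soft masks at 1/4 of the padded input
    pad_shape, img_shape, ori_shape = (640, 640), (640, 480), (720, 540)
    h, w = img_shape
    masks = F.interpolate(pred_masks.unsqueeze(1), size=pad_shape,
                          mode="bilinear", align_corners=False)[:, :, :h, :w]
    masks = F.interpolate(masks, size=ori_shape, mode="bilinear",
                          align_corners=False).squeeze(1)
    print(masks.shape)  # torch.Size([3, 720, 540])
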
/train_net.py:
--------------------------------------------------------------------------------
1 | import os
2 | import itertools
3 | import time
4 | from typing import Any, Dict, List, Set
5 |
6 | import torch
7 | from torch import optim
8 |
9 | import detectron2.utils.comm as comm
10 | from detectron2.checkpoint import DetectionCheckpointer
11 | from detectron2.config import get_cfg
12 | from detectron2.utils.logger import setup_logger
13 | from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetMapper
14 | from detectron2.engine import AutogradProfiler, DefaultTrainer, default_argument_parser, default_setup, launch
15 | from detectron2.evaluation import COCOEvaluator, verify_results
16 | from detectron2.solver.build import maybe_add_gradient_clipping
17 | from detectron2.evaluation import (
18 | CityscapesInstanceEvaluator,
19 | CityscapesSemSegEvaluator,
20 | COCOEvaluator,
21 | COCOPanopticEvaluator,
22 | DatasetEvaluators,
23 | LVISEvaluator,
24 | PascalVOCDetectionEvaluator,
25 | SemSegEvaluator,
26 | verify_results,
27 | )
28 |
29 | from sparseinst import add_sparse_inst_config, COCOMaskEvaluator
30 |
31 |
32 | class Trainer(DefaultTrainer):
33 |
34 | @classmethod
35 | def build_evaluator(cls, cfg, dataset_name, output_folder=None):
36 | """
37 | Create evaluator(s) for a given dataset.
38 | This uses the special metadata "evaluator_type" associated with each builtin dataset.
39 | For your own dataset, you can simply create an evaluator manually in your
40 | script and do not have to worry about the hacky if-else logic here.
41 | """
42 | if output_folder is None:
43 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
44 | evaluator_list = []
45 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
46 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
47 | evaluator_list.append(
48 | SemSegEvaluator(
49 | dataset_name,
50 | distributed=True,
51 | num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
52 | ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
53 | output_dir=output_folder,
54 | )
55 | )
56 | if evaluator_type in ["coco", "coco_panoptic_seg"]:
57 | evaluator_list.append(COCOMaskEvaluator(dataset_name, ("segm", ), True, output_folder))
58 | if evaluator_type == "coco_panoptic_seg":
59 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
60 | if evaluator_type == "cityscapes_instance":
61 | assert (
62 | torch.cuda.device_count() >= comm.get_rank()
63 |             ), "CityscapesEvaluator currently does not work with multiple machines."
64 | return CityscapesInstanceEvaluator(dataset_name)
65 | if evaluator_type == "cityscapes_sem_seg":
66 | assert (
67 | torch.cuda.device_count() >= comm.get_rank()
68 |             ), "CityscapesEvaluator currently does not work with multiple machines."
69 | return CityscapesSemSegEvaluator(dataset_name)
70 | elif evaluator_type == "pascal_voc":
71 | return PascalVOCDetectionEvaluator(dataset_name)
72 | elif evaluator_type == "lvis":
73 | return LVISEvaluator(dataset_name, cfg, True, output_folder)
74 | if len(evaluator_list) == 0:
75 | raise NotImplementedError(
76 | "no Evaluator for the dataset {} with the type {}".format(
77 | dataset_name, evaluator_type
78 | )
79 | )
80 | elif len(evaluator_list) == 1:
81 | return evaluator_list[0]
82 | return DatasetEvaluators(evaluator_list)
83 |
84 | @classmethod
85 | def build_optimizer(cls, cfg, model):
86 | params: List[Dict[str, Any]] = []
87 | memo: Set[torch.nn.parameter.Parameter] = set()
88 | for key, value in model.named_parameters(recurse=True):
89 | if not value.requires_grad:
90 | continue
91 | # Avoid duplicating parameters
92 | if value in memo:
93 | continue
94 | memo.add(value)
95 | lr = cfg.SOLVER.BASE_LR
96 | weight_decay = cfg.SOLVER.WEIGHT_DECAY
97 | if "backbone" in key:
98 | lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER
99 | # for transformer
100 | if "patch_embed" in key or "cls_token" in key:
101 | weight_decay = 0.0
102 | if "norm" in key:
103 | weight_decay = 0.0
104 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
105 |
106 | def maybe_add_full_model_gradient_clipping(optim): # optim: the optimizer class
107 | # detectron2 doesn't have full model gradient clipping now
108 | clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
109 | enable = (
110 | cfg.SOLVER.CLIP_GRADIENTS.ENABLED
111 | and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
112 | and clip_norm_val > 0.0
113 | )
114 |
115 | class FullModelGradientClippingOptimizer(optim):
116 | def step(self, closure=None):
117 | all_params = itertools.chain(*[x["params"] for x in self.param_groups])
118 | torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
119 | super().step(closure=closure)
120 |
121 | return FullModelGradientClippingOptimizer if enable else optim
122 |
123 | optimizer_type = cfg.SOLVER.OPTIMIZER
124 | if optimizer_type == "SGD":
125 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
126 | params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
127 | )
128 | elif optimizer_type == "ADAMW":
129 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
130 | params, cfg.SOLVER.BASE_LR, amsgrad=cfg.SOLVER.AMSGRAD
131 | )
132 | else:
133 | raise NotImplementedError(f"no optimizer type {optimizer_type}")
134 | if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
135 | optimizer = maybe_add_gradient_clipping(cfg, optimizer)
136 | return optimizer
137 |
138 | @classmethod
139 | def build_train_loader(cls, cfg):
140 | if cfg.MODEL.SPARSE_INST.DATASET_MAPPER == "SparseInstDatasetMapper":
141 | from sparseinst import SparseInstDatasetMapper
142 | mapper = SparseInstDatasetMapper(cfg, is_train=True)
143 | else:
144 | mapper = None
145 | return build_detection_train_loader(cfg, mapper=mapper)
146 |
147 |
148 | def setup(args):
149 | """
150 | Create configs and perform basic setups.
151 | """
152 | cfg = get_cfg()
153 | add_sparse_inst_config(cfg)
154 | cfg.merge_from_file(args.config_file)
155 | cfg.merge_from_list(args.opts)
156 | cfg.freeze()
157 | default_setup(cfg, args)
158 | # Setup logger for "sparseinst" module
159 | setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="sparseinst")
160 | return cfg
161 |
162 |
163 | def main(args):
164 | cfg = setup(args)
165 |
166 | if args.eval_only:
167 | model = Trainer.build_model(cfg)
168 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
169 | cfg.MODEL.WEIGHTS, resume=args.resume)
170 | res = Trainer.test(cfg, model)
171 | if comm.is_main_process():
172 | verify_results(cfg, res)
173 | return res
174 |
175 | trainer = Trainer(cfg)
176 | trainer.resume_or_load(resume=args.resume)
177 | return trainer.train()
178 |
179 |
180 | if __name__ == "__main__":
181 | args = default_argument_parser().parse_args()
182 | print("Command Line Args:", args)
183 | launch(
184 | main,
185 | args.num_gpus,
186 | num_machines=args.num_machines,
187 | machine_rank=args.machine_rank,
188 | dist_url=args.dist_url,
189 | args=(args,),
190 | )
191 |
--------------------------------------------------------------------------------
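
Note: `Trainer.build_optimizer` builds one parameter group per parameter so that backbone weights use BASE_LR * BACKBONE_MULTIPLIER, and transformer patch_embed/cls_token as well as norm weights skip weight decay. A minimal sketch of that grouping on a toy model (the numeric values are placeholders, not the repository's config defaults):

    from collections import OrderedDict
    import torch

    base_lr, weight_decay, backbone_multiplier = 5e-5, 0.05, 1.0  # placeholders
    model = torch.nn.Sequential(OrderedDict([
        ("backbone", torch.nn.Conv2d(3, 8, 3)),
        ("norm", torch.nn.BatchNorm2d(8)),
    ]))
    params = []
    for name, p in model.named_parameters():
        lr, wd = base_lr, weight_decay
        if "backbone" in name:
            lr *= backbone_multiplier
            if "patch_embed" in name or "cls_token" in name:  # transformer backbones
                wd = 0.0
        if "norm" in name:
            wd = 0.0
        params.append({"params": [p], "lr": lr, "weight_decay": wd})
    optimizer = torch.optim.AdamW(params, base_lr)
    print([(g["lr"], g["weight_decay"]) for g in optimizer.param_groups])
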