├── .gitattributes ├── LICENCE ├── README.md ├── assets ├── animate.gif ├── banner.gif ├── figures │ ├── 000000006471.jpg │ └── 000000014439.jpg └── sparseinst.png ├── configs ├── Base-SparseInst.yaml ├── Sparse_Inst_r50_giam_onnx.yaml ├── sparse_inst_cspdarknet53_giam.yaml ├── sparse_inst_darknet53_giam.yaml ├── sparse_inst_pvt_b1_giam.yaml ├── sparse_inst_pvt_b2_li_giam.yaml ├── sparse_inst_r101_dcn_giam.yaml ├── sparse_inst_r101_giam.yaml ├── sparse_inst_r50_base.yaml ├── sparse_inst_r50_dcn_giam_aug.yaml ├── sparse_inst_r50_giam.yaml ├── sparse_inst_r50_giam_aug.yaml ├── sparse_inst_r50_giam_fp16.yaml ├── sparse_inst_r50_giam_soft.yaml ├── sparse_inst_r50vd_base.yaml ├── sparse_inst_r50vd_dcn_giam.yaml ├── sparse_inst_r50vd_dcn_giam_aug.yaml ├── sparse_inst_r50vd_giam.yaml └── sparse_inst_r50vd_giam_aug.yaml ├── convert_onnx.py ├── convert_tensorrt.py ├── datasets ├── prepare_ade20k_sem_seg.py ├── prepare_cocofied_lvis.py ├── prepare_for_tests.sh └── prepare_panoptic_fpn.py ├── demo.py ├── engine ├── __pycache__ │ └── defaults.cpython-36.pyc └── defaults.py ├── eval_tensorrt_onnx.py ├── input └── input_image │ ├── 640x640.jpg │ ├── cup.jpg │ ├── femme.jpg │ ├── homme.jpg │ ├── horses.jpg │ ├── image1.jpg │ ├── input.jpg │ ├── results.png │ ├── skate.jpg │ └── turkish_coffee.jpg ├── onnx └── __pycache__ │ └── image_processing.cpython-36.pyc ├── output ├── mnist.tar.gz ├── mnist │ ├── model.onnx │ ├── test_data_set_0 │ │ ├── input_0.pb │ │ └── output_0.pb │ ├── test_data_set_1 │ │ ├── input_0.pb │ │ └── output_0.pb │ └── test_data_set_2 │ │ ├── input_0.pb │ │ └── output_0.pb └── sparse_inst_r50_giam │ ├── config.yaml │ └── log.txt ├── results ├── 640_result.jpg ├── result_onnx.png └── result_tensorrt.png ├── sparseinst ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── caffe2sparseinst.cpython-36.pyc │ ├── coco_evaluation.cpython-36.pyc │ ├── config.cpython-36.pyc │ ├── d2_predictor.cpython-36.pyc │ ├── dataset_mapper.cpython-36.pyc │ ├── decoder.cpython-36.pyc │ ├── encoder.cpython-36.pyc │ ├── loss.cpython-36.pyc │ ├── sparseinst.cpython-36.pyc │ └── utils.cpython-36.pyc ├── backbones │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── cspnet.cpython-36.pyc │ │ ├── pvt.cpython-36.pyc │ │ └── resnet.cpython-36.pyc │ ├── cspnet.py │ ├── pvt.py │ └── resnet.py ├── caffe2sparseinst.py ├── coco_evaluation.py ├── config.py ├── d2_predictor.py ├── dataset_mapper.py ├── decoder.py ├── encoder.py ├── input.ppm ├── loss.py ├── sparseinst.py └── utils.py ├── test.py ├── test_net.py └── train_net.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Hust Visual Learning Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | 
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SparseInst_TensorRT 2 | **This repository implements the real-time instance segmentation algorithm SparseInst with TensorRT and ONNX.** 3 | 4 | ## Some remarks 5 | - This repository is built on top of the **hustvl/SparseInst** repository (https://github.com/hustvl/SparseInst.git); for additional information about the installation of SparseInst, refer to the original repository. 6 | - This project is built upon the excellent detectron2 framework; you should install detectron2 first, please check the official installation guide for more details (https://github.com/facebookresearch/detectron2.git). 7 | - For commands other than TensorRT and ONNX inference (e.g. test_net.py), please refer to the initial repository. 8 | - If you face any problem at parsing time, don't hesitate to open an issue (or drop a :star: if there aren't any). _**If you have compatibility problems, use the model weights uploaded in the table below and go directly to the testing section.**_ 9 | - Be aware that, in order to parse the model to ONNX and TensorRT, the files sparseinst.py, encoder.py and decoder.py have been slightly modified; don't forget to check the modifications if you come from the initial repository. 10 | 11 | 12 | ## Prerequisites 13 | <details>
14 | <summary>Click me</summary> 15 | 16 | - Install PyTorch (1.10.0) and TorchVision (0.11.1) 17 | ``` 18 | pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 19 | 20 | # If another torch version is needed, pin it explicitly, e.g. torch==1.11.0+cu102 21 | ``` 22 | - Install CUDA (10.2) and cuDNN (8.0.0): https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=WSL-Ubuntu&target_version=2.0&target_type=deb_local 23 | 24 | - For WSL-Ubuntu: 25 | ``` 26 | sudo wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-wsl-ubuntu.pin 27 | sudo mv cuda-wsl-ubuntu.pin /etc/apt/preferences.d/cuda-repository-pin-600 28 | sudo wget https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb 29 | 30 | sudo dpkg -i cuda-repo-wsl-ubuntu-11-7-local_11.7.1-1_amd64.deb 31 | sudo cp /var/cuda-repo-wsl-ubuntu-11-7-local/cuda-96193861-keyring.gpg /usr/share/keyrings/ 32 | sudo apt-get update 33 | sudo apt-get -y install cuda 34 | ``` 35 | 36 | - Install TensorRT (8.0.1.6); if you are using an NVIDIA edge device, TensorRT should already be installed 37 | ``` 38 | python3 -m pip install --upgrade setuptools pip 39 | python3 -m pip install nvidia-pyindex 40 | python3 -m pip install --upgrade nvidia-tensorrt 41 | 42 | # Verify the installation with: assert tensorrt.Builder(tensorrt.Logger()) -- see the sanity-check snippet below 43 | ``` 44 | - Install ONNX and ONNX Runtime 45 | ``` 46 | pip install onnxruntime-gpu 47 | pip install onnxruntime 48 | pip install numpy protobuf==4.21.5 49 | pip install onnx 50 | ``` 51 | - Install all the other packages needed to run the original SparseInst algorithm (this should already be done if you have installed Detectron2) 52 | 53 | </details>
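To make sure TensorRT and ONNX Runtime are usable before converting any model, you can run a small sanity check such as the one below. This is only a minimal sketch; the reported versions and providers depend on your setup:

```
import tensorrt
import onnx
import onnxruntime

print("TensorRT     :", tensorrt.__version__)
print("ONNX         :", onnx.__version__)
print("ONNX Runtime :", onnxruntime.__version__)
print("ORT providers:", onnxruntime.get_available_providers())

# Creating a TensorRT builder fails if the CUDA / TensorRT installation is broken.
assert tensorrt.Builder(tensorrt.Logger())
print("TensorRT builder created successfully.")
```

If the GPU is visible to ONNX Runtime, `CUDAExecutionProvider` (or `TensorrtExecutionProvider`) should appear in the provider list.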
54 | 55 | ## Models and Results for the TensorRT and ONNX inference scripts 56 | 57 | The inference speed of PyTorch, ONNX and TensorRT has been compared and is shown in the table below. SparseInst running with TensorRT achieves roughly 3 times faster inference than SparseInst running with PyTorch, and lowering the input size of the image can lead to a decent real-time speed. 58 | The TensorRT and ONNX models are built from the first PyTorch weights listed in the table below: SparseInst R-50 G-IAM. A minimal sketch of how such per-image speeds can be measured is given after the table. 59 | 60 | *Note: all the computations have been done on an NVIDIA Jetson TX2 with JetPack 4.6. Further tests will be done on an NVIDIA RTX 2070.* 61 | 62 | 
63 | 64 | | Model | Input Size | Inference Speed | Weights | 65 | | :--- | :---: | :---: | ---: | 66 | | PyTorch | 640 | 1.71 FPS | [model](https://drive.google.com/file/d/130gyxYT6r9j5Nwp5nCo_wthYPuTwa9c4/view?usp=sharing) | 67 | | TensorRT | 320 | 20.32 FPS | [model](https://drive.google.com/file/d/17-eBWVrpnwv0ueeDsEmAqSKlNh3If4AI/view?usp=sharing) | 68 | | TensorRT | 640 | 6.00 FPS | [model](https://drive.google.com/file/d/1Kh97LZNzsuBJTeDVXwRKx8CiX7CeMI3v/view?usp=sharing) | 69 | | ONNX | 320 | 0.22 FPS | [model](https://drive.google.com/file/d/1H6YH3YUPaA4vO3IyIGaZNAkGBsU9xHCH/view?usp=sharing) | 70 | | ONNX | 640 | 0.03 FPS | [model](https://drive.google.com/file/d/1GEoQssyJ9MZRnEISiatF_tREpdGAnSjk/view?usp=sharing) | 71 | 72 | 73 | 74 | *(example results image)* 79 | 80 | 
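As a rough illustration of how per-image FPS values like those above can be measured, here is a generic timing sketch. It is not the benchmarking code of this repository, and `predict` is a placeholder for any single-image inference callable (PyTorch predictor, ONNX Runtime session wrapper or TensorRT engine wrapper):

```
import time

def measure_fps(predict, image, warmup=5, iterations=50):
    # Warm-up runs (CUDA context creation, memory allocation, caching, ...)
    for _ in range(warmup):
        predict(image)
    # When timing GPU code, make sure the work has finished before reading the
    # clock (e.g. torch.cuda.synchronize() for PyTorch), otherwise the numbers
    # mostly reflect kernel launch time.
    start = time.perf_counter()
    for _ in range(iterations):
        predict(image)
    elapsed = time.perf_counter() - start
    return iterations / elapsed

# Hypothetical usage:
# image = cv2.imread("input/input_image/640x640.jpg")
# print(f"{measure_fps(predictor, image):.2f} FPS")
```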
81 | 82 | ## Building the ONNX model: 83 | 84 | To convert the model from PyTorch to ONNX, run the following command. The arguments can be left at their default values; just check that the config path and the model weights path are correctly set up. 85 | ``` 86 | python3 convert_onnx.py   # default arguments; check convert_onnx.py for the exact flags to override 87 | ``` 88 | 89 | ## Building the TensorRT model: 90 | 91 | To convert the model from ONNX to TensorRT, run the following command. The arguments can be left at their default values. If you have any problem while parsing the model to TensorRT, don't hesitate to ask. 92 | ``` 93 | 0, "Please specify a directory with args.output" 120 | out_filename = args.output 121 | visualized_output.save(out_filename) 122 | else: 123 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 124 | cv2.imshow( 125 | WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 126 | if cv2.waitKey(0) == 27: 127 | break # esc to quit 128 | elif args.webcam: 129 | assert args.input is None, "Cannot have both --input and --webcam!" 130 | assert args.output is None, "output not yet supported with --webcam!" 131 | cam = cv2.VideoCapture(0) 132 | for vis in tqdm.tqdm(demo.run_on_video(cam, args.confidence_threshold)): 133 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 134 | cv2.imshow(WINDOW_NAME, vis) 135 | if cv2.waitKey(1) == 27: 136 | break # esc to quit 137 | cam.release() 138 | cv2.destroyAllWindows() 139 | elif args.video_input: 140 | video = cv2.VideoCapture(args.video_input) 141 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 142 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 143 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 144 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 145 | basename = os.path.basename(args.video_input) 146 | 147 | if args.output: 148 | if os.path.isdir(args.output): 149 | output_fname = os.path.join(args.output, basename) 150 | output_fname = os.path.splitext(output_fname)[0] + ".mkv" 151 | else: 152 | output_fname = args.output 153 | assert not os.path.isfile(output_fname), output_fname 154 | output_file = cv2.VideoWriter( 155 | filename=output_fname, 156 | # some installation of opencv may not support x264 (due to its license), 157 | # you can try other format (e.g. MPEG) 158 | fourcc=cv2.VideoWriter_fourcc(*"mp4v"), 159 | fps=float(frames_per_second), 160 | frameSize=(width, height), 161 | isColor=True, 162 | ) 163 | assert os.path.isfile(args.video_input) 164 | for vis_frame in tqdm.tqdm(demo.run_on_video(video, args.confidence_threshold), total=num_frames): 165 | if args.output: 166 | output_file.write(vis_frame) 167 | else: 168 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 169 | cv2.imshow(basename, vis_frame) 170 | if cv2.waitKey(1) == 27: 171 | break # esc to quit 172 | video.release() 173 | if args.output: 174 | output_file.release() 175 | else: 176 | cv2.destroyAllWindows() 177 | -------------------------------------------------------------------------------- /engine/__pycache__/defaults.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/engine/__pycache__/defaults.cpython-36.pyc -------------------------------------------------------------------------------- /engine/defaults.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | 4 | """ 5 | This file contains components with some default boilerplate logic user may need 6 | in training / testing. They will not work for everyone, but many users may find them useful. 7 | 8 | The behavior of functions/classes in this file is subject to change, 9 | since they are meant to represent the "common default behavior" people need in their projects. 10 | """ 11 | import numpy as np 12 | import argparse 13 | import logging 14 | import os 15 | import sys 16 | import weakref 17 | from collections import OrderedDict 18 | from typing import Optional 19 | import torch 20 | from fvcore.nn.precise_bn import get_bn_modules 21 | from omegaconf import OmegaConf 22 | from torch.nn.parallel import DistributedDataParallel 23 | 24 | import detectron2.data.transforms as T 25 | from detectron2.checkpoint import DetectionCheckpointer 26 | from detectron2.config import CfgNode, LazyConfig 27 | from detectron2.data import ( 28 | MetadataCatalog, 29 | build_detection_test_loader, 30 | build_detection_train_loader, 31 | ) 32 | from detectron2.evaluation import ( 33 | DatasetEvaluator, 34 | inference_on_dataset, 35 | print_csv_format, 36 | verify_results, 37 | ) 38 | from detectron2.modeling import build_model 39 | from detectron2.solver import build_lr_scheduler, build_optimizer 40 | from detectron2.utils import comm 41 | from detectron2.utils.collect_env import collect_env_info 42 | from detectron2.utils.env import seed_all_rng 43 | from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter 44 | from detectron2.utils.file_io import PathManager 45 | from detectron2.utils.logger import setup_logger 46 | 47 | 48 | 49 | __all__ = [ 50 | "DefaultPredictor", 51 | ] 52 | 53 | class DefaultPredictor: 54 | 55 | 56 | def __init__(self, cfg): 57 | self.cfg = cfg.clone() # cfg can be modified by model 58 | self.model = build_model(self.cfg) 59 | self.model.eval() 60 | if len(cfg.DATASETS.TEST): 61 | self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) 62 | 63 | checkpointer = DetectionCheckpointer(self.model) 64 | checkpointer.load(cfg.MODEL.WEIGHTS) 65 | 66 | self.aug = T.ResizeShortestEdge( 67 | [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST 68 | ) 69 | 70 | self.input_format = cfg.INPUT.FORMAT 71 | assert self.input_format in ["RGB", "BGR"], self.input_format 72 | 73 | def __call__(self, original_image): 74 | """ 75 | Args: 76 | original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). 77 | 78 | Returns: 79 | predictions (dict): 80 | the output of the model for one image only. 81 | See :doc:`/tutorials/models` for details about the format. 82 | """ 83 | with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 84 | # Apply pre-processing to image. 
85 | if self.input_format == "RGB": 86 | # whether the model expects BGR inputs or RGB 87 | original_image = original_image[:, :, ::-1] 88 | height, width = original_image.shape[:2] 89 | image = self.aug.get_transform(original_image).apply_image(original_image) 90 | image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) 91 | print("shape of image", np.shape(image)) 92 | print("image defaults", image) 93 | inputs = {"image": image, "height": height, "width": width} 94 | predictions = self.model([inputs])[0] 95 | 96 | return predictions 97 | 98 | -------------------------------------------------------------------------------- /input/input_image/640x640.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/640x640.jpg -------------------------------------------------------------------------------- /input/input_image/cup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/cup.jpg -------------------------------------------------------------------------------- /input/input_image/femme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/femme.jpg -------------------------------------------------------------------------------- /input/input_image/homme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/homme.jpg -------------------------------------------------------------------------------- /input/input_image/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/horses.jpg -------------------------------------------------------------------------------- /input/input_image/image1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/image1.jpg -------------------------------------------------------------------------------- /input/input_image/input.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/input.jpg -------------------------------------------------------------------------------- /input/input_image/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/results.png -------------------------------------------------------------------------------- /input/input_image/skate.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/skate.jpg 
-------------------------------------------------------------------------------- /input/input_image/turkish_coffee.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/input/input_image/turkish_coffee.jpg -------------------------------------------------------------------------------- /onnx/__pycache__/image_processing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/onnx/__pycache__/image_processing.cpython-36.pyc -------------------------------------------------------------------------------- /output/mnist.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist.tar.gz -------------------------------------------------------------------------------- /output/mnist/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/model.onnx -------------------------------------------------------------------------------- /output/mnist/test_data_set_0/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_0/input_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_0/output_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_0/output_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_1/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_1/input_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_1/output_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_1/output_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_2/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_2/input_0.pb -------------------------------------------------------------------------------- /output/mnist/test_data_set_2/output_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/output/mnist/test_data_set_2/output_0.pb -------------------------------------------------------------------------------- 
/output/sparse_inst_r50_giam/config.yaml: -------------------------------------------------------------------------------- 1 | CUDNN_BENCHMARK: false 2 | DATALOADER: 3 | ASPECT_RATIO_GROUPING: true 4 | FILTER_EMPTY_ANNOTATIONS: true 5 | NUM_WORKERS: 4 6 | REPEAT_THRESHOLD: 0.0 7 | SAMPLER_TRAIN: TrainingSampler 8 | DATASETS: 9 | PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000 10 | PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000 11 | PROPOSAL_FILES_TEST: [] 12 | PROPOSAL_FILES_TRAIN: [] 13 | TEST: 14 | - coco_2017_val 15 | TRAIN: 16 | - coco_2017_train 17 | GLOBAL: 18 | HACK: 1.0 19 | INPUT: 20 | CROP: 21 | ENABLED: false 22 | SIZE: 23 | - 0.9 24 | - 0.9 25 | TYPE: relative_range 26 | FORMAT: RGB 27 | MASK_FORMAT: bitmask 28 | MAX_SIZE_TEST: 853 29 | MAX_SIZE_TRAIN: 853 30 | MIN_SIZE_TEST: 512 31 | MIN_SIZE_TRAIN: 32 | - 416 33 | - 448 34 | - 480 35 | - 512 36 | - 544 37 | - 576 38 | - 608 39 | - 640 40 | MIN_SIZE_TRAIN_SAMPLING: choice 41 | RANDOM_FLIP: horizontal 42 | MODEL: 43 | ANCHOR_GENERATOR: 44 | ANGLES: 45 | - - -90 46 | - 0 47 | - 90 48 | ASPECT_RATIOS: 49 | - - 0.5 50 | - 1.0 51 | - 2.0 52 | NAME: DefaultAnchorGenerator 53 | OFFSET: 0.0 54 | SIZES: 55 | - - 32 56 | - 64 57 | - 128 58 | - 256 59 | - 512 60 | BACKBONE: 61 | FREEZE_AT: 0 62 | NAME: build_resnet_backbone 63 | CSPNET: 64 | NAME: darknet53 65 | NORM: '' 66 | OUT_FEATURES: 67 | - csp1 68 | - csp2 69 | - csp3 70 | - csp4 71 | DEVICE: cuda 72 | FPN: 73 | FUSE_TYPE: sum 74 | IN_FEATURES: [] 75 | NORM: '' 76 | OUT_CHANNELS: 256 77 | KEYPOINT_ON: false 78 | LOAD_PROPOSALS: false 79 | MASK_ON: true 80 | META_ARCHITECTURE: SparseInst 81 | PANOPTIC_FPN: 82 | COMBINE: 83 | ENABLED: true 84 | INSTANCES_CONFIDENCE_THRESH: 0.5 85 | OVERLAP_THRESH: 0.5 86 | STUFF_AREA_LIMIT: 4096 87 | INSTANCE_LOSS_WEIGHT: 1.0 88 | PIXEL_MEAN: 89 | - 123.675 90 | - 116.28 91 | - 103.53 92 | PIXEL_STD: 93 | - 58.395 94 | - 57.12 95 | - 57.375 96 | PROPOSAL_GENERATOR: 97 | MIN_SIZE: 0 98 | NAME: RPN 99 | PVT: 100 | LINEAR: false 101 | NAME: b1 102 | OUT_FEATURES: 103 | - p2 104 | - p3 105 | - p4 106 | RESNETS: 107 | DEFORM_MODULATED: false 108 | DEFORM_NUM_GROUPS: 1 109 | DEFORM_ON_PER_STAGE: 110 | - false 111 | - false 112 | - false 113 | - false 114 | DEPTH: 50 115 | NORM: FrozenBN 116 | NUM_GROUPS: 1 117 | OUT_FEATURES: 118 | - res3 119 | - res4 120 | - res5 121 | RES2_OUT_CHANNELS: 256 122 | RES5_DILATION: 1 123 | STEM_OUT_CHANNELS: 64 124 | STRIDE_IN_1X1: false 125 | WIDTH_PER_GROUP: 64 126 | RETINANET: 127 | BBOX_REG_LOSS_TYPE: smooth_l1 128 | BBOX_REG_WEIGHTS: 129 | - 1.0 130 | - 1.0 131 | - 1.0 132 | - 1.0 133 | FOCAL_LOSS_ALPHA: 0.25 134 | FOCAL_LOSS_GAMMA: 2.0 135 | IN_FEATURES: 136 | - p3 137 | - p4 138 | - p5 139 | - p6 140 | - p7 141 | IOU_LABELS: 142 | - 0 143 | - -1 144 | - 1 145 | IOU_THRESHOLDS: 146 | - 0.4 147 | - 0.5 148 | NMS_THRESH_TEST: 0.5 149 | NORM: '' 150 | NUM_CLASSES: 80 151 | NUM_CONVS: 4 152 | PRIOR_PROB: 0.01 153 | SCORE_THRESH_TEST: 0.05 154 | SMOOTH_L1_LOSS_BETA: 0.1 155 | TOPK_CANDIDATES_TEST: 1000 156 | ROI_BOX_CASCADE_HEAD: 157 | BBOX_REG_WEIGHTS: 158 | - - 10.0 159 | - 10.0 160 | - 5.0 161 | - 5.0 162 | - - 20.0 163 | - 20.0 164 | - 10.0 165 | - 10.0 166 | - - 30.0 167 | - 30.0 168 | - 15.0 169 | - 15.0 170 | IOUS: 171 | - 0.5 172 | - 0.6 173 | - 0.7 174 | ROI_BOX_HEAD: 175 | BBOX_REG_LOSS_TYPE: smooth_l1 176 | BBOX_REG_LOSS_WEIGHT: 1.0 177 | BBOX_REG_WEIGHTS: 178 | - 10.0 179 | - 10.0 180 | - 5.0 181 | - 5.0 182 | CLS_AGNOSTIC_BBOX_REG: false 183 | CONV_DIM: 256 184 | FC_DIM: 1024 185 | NAME: '' 186 | NORM: '' 187 | NUM_CONV: 0 188 | 
NUM_FC: 0 189 | POOLER_RESOLUTION: 14 190 | POOLER_SAMPLING_RATIO: 0 191 | POOLER_TYPE: ROIAlignV2 192 | SMOOTH_L1_BETA: 0.0 193 | TRAIN_ON_PRED_BOXES: false 194 | ROI_HEADS: 195 | BATCH_SIZE_PER_IMAGE: 512 196 | IN_FEATURES: 197 | - res4 198 | IOU_LABELS: 199 | - 0 200 | - 1 201 | IOU_THRESHOLDS: 202 | - 0.5 203 | NAME: Res5ROIHeads 204 | NMS_THRESH_TEST: 0.5 205 | NUM_CLASSES: 80 206 | POSITIVE_FRACTION: 0.25 207 | PROPOSAL_APPEND_GT: true 208 | SCORE_THRESH_TEST: 0.05 209 | ROI_KEYPOINT_HEAD: 210 | CONV_DIMS: 211 | - 512 212 | - 512 213 | - 512 214 | - 512 215 | - 512 216 | - 512 217 | - 512 218 | - 512 219 | LOSS_WEIGHT: 1.0 220 | MIN_KEYPOINTS_PER_IMAGE: 1 221 | NAME: KRCNNConvDeconvUpsampleHead 222 | NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true 223 | NUM_KEYPOINTS: 17 224 | POOLER_RESOLUTION: 14 225 | POOLER_SAMPLING_RATIO: 0 226 | POOLER_TYPE: ROIAlignV2 227 | ROI_MASK_HEAD: 228 | CLS_AGNOSTIC_MASK: false 229 | CONV_DIM: 256 230 | NAME: MaskRCNNConvUpsampleHead 231 | NORM: '' 232 | NUM_CONV: 0 233 | POOLER_RESOLUTION: 14 234 | POOLER_SAMPLING_RATIO: 0 235 | POOLER_TYPE: ROIAlignV2 236 | RPN: 237 | BATCH_SIZE_PER_IMAGE: 256 238 | BBOX_REG_LOSS_TYPE: smooth_l1 239 | BBOX_REG_LOSS_WEIGHT: 1.0 240 | BBOX_REG_WEIGHTS: 241 | - 1.0 242 | - 1.0 243 | - 1.0 244 | - 1.0 245 | BOUNDARY_THRESH: -1 246 | CONV_DIMS: 247 | - -1 248 | HEAD_NAME: StandardRPNHead 249 | IN_FEATURES: 250 | - res4 251 | IOU_LABELS: 252 | - 0 253 | - -1 254 | - 1 255 | IOU_THRESHOLDS: 256 | - 0.3 257 | - 0.7 258 | LOSS_WEIGHT: 1.0 259 | NMS_THRESH: 0.7 260 | POSITIVE_FRACTION: 0.5 261 | POST_NMS_TOPK_TEST: 1000 262 | POST_NMS_TOPK_TRAIN: 2000 263 | PRE_NMS_TOPK_TEST: 6000 264 | PRE_NMS_TOPK_TRAIN: 12000 265 | SMOOTH_L1_BETA: 0.0 266 | SEM_SEG_HEAD: 267 | COMMON_STRIDE: 4 268 | CONVS_DIM: 128 269 | IGNORE_VALUE: 255 270 | IN_FEATURES: 271 | - p2 272 | - p3 273 | - p4 274 | - p5 275 | LOSS_WEIGHT: 1.0 276 | NAME: SemSegFPNHead 277 | NORM: GN 278 | NUM_CLASSES: 54 279 | SPARSE_INST: 280 | CLS_THRESHOLD: 0.005 281 | DATASET_MAPPER: SparseInstDatasetMapper 282 | DECODER: 283 | GROUPS: 4 284 | INST: 285 | CONVS: 4 286 | DIM: 256 287 | KERNEL_DIM: 128 288 | MASK: 289 | CONVS: 4 290 | DIM: 256 291 | NAME: GroupIAMDecoder 292 | NUM_CLASSES: 80 293 | NUM_MASKS: 100 294 | OUTPUT_IAM: false 295 | SCALE_FACTOR: 2.0 296 | ENCODER: 297 | IN_FEATURES: 298 | - res3 299 | - res4 300 | - res5 301 | NAME: InstanceContextEncoder 302 | NORM: '' 303 | NUM_CHANNELS: 256 304 | LOSS: 305 | CLASS_WEIGHT: 2.0 306 | ITEMS: 307 | - labels 308 | - masks 309 | MASK_DICE_WEIGHT: 2.0 310 | MASK_PIXEL_WEIGHT: 5.0 311 | NAME: SparseInstCriterion 312 | OBJECTNESS_WEIGHT: 1.0 313 | MASK_THRESHOLD: 0.45 314 | MATCHER: 315 | ALPHA: 0.8 316 | BETA: 0.2 317 | NAME: SparseInstMatcher 318 | MAX_DETECTIONS: 100 319 | WEIGHTS: sparse_inst_r50_giam_aug_2b7d68.pth 320 | OUTPUT_DIR: output/sparse_inst_r50_giam 321 | SEED: -1 322 | SOLVER: 323 | AMP: 324 | ENABLED: false 325 | AMSGRAD: false 326 | BACKBONE_MULTIPLIER: 1.0 327 | BASE_LR: 5.0e-05 328 | BIAS_LR_FACTOR: 1.0 329 | CHECKPOINT_PERIOD: 5000 330 | CLIP_GRADIENTS: 331 | CLIP_TYPE: value 332 | CLIP_VALUE: 1.0 333 | ENABLED: false 334 | NORM_TYPE: 2.0 335 | GAMMA: 0.1 336 | IMS_PER_BATCH: 64 337 | LR_SCHEDULER_NAME: WarmupMultiStepLR 338 | MAX_ITER: 270000 339 | MOMENTUM: 0.9 340 | NESTEROV: false 341 | OPTIMIZER: ADAMW 342 | REFERENCE_WORLD_SIZE: 0 343 | STEPS: 344 | - 210000 345 | - 250000 346 | WARMUP_FACTOR: 0.001 347 | WARMUP_ITERS: 1000 348 | WARMUP_METHOD: linear 349 | WEIGHT_DECAY: 0.05 350 | 
WEIGHT_DECAY_BIAS: null 351 | WEIGHT_DECAY_NORM: 0.0 352 | TEST: 353 | AUG: 354 | ENABLED: false 355 | FLIP: true 356 | MAX_SIZE: 4000 357 | MIN_SIZES: 358 | - 400 359 | - 500 360 | - 600 361 | - 700 362 | - 800 363 | - 900 364 | - 1000 365 | - 1100 366 | - 1200 367 | DETECTIONS_PER_IMAGE: 100 368 | EVAL_PERIOD: 7330 369 | EXPECTED_RESULTS: [] 370 | KEYPOINT_OKS_SIGMAS: [] 371 | PRECISE_BN: 372 | ENABLED: false 373 | NUM_ITER: 200 374 | VERSION: 2 375 | VIS_PERIOD: 0 376 | -------------------------------------------------------------------------------- /results/640_result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/640_result.jpg -------------------------------------------------------------------------------- /results/result_onnx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/result_onnx.png -------------------------------------------------------------------------------- /results/result_tensorrt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/results/result_tensorrt.png -------------------------------------------------------------------------------- /sparseinst/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparseinst import SparseInst 2 | from .encoder import build_sparse_inst_encoder 3 | from .decoder import build_sparse_inst_decoder 4 | from .config import add_sparse_inst_config 5 | from .loss import build_sparse_inst_criterion 6 | from .dataset_mapper import SparseInstDatasetMapper 7 | from .coco_evaluation import COCOMaskEvaluator 8 | from .backbones import build_resnet_vd_backbone, build_pyramid_vision_transformer 9 | from .d2_predictor import VisualizationDemo 10 | -------------------------------------------------------------------------------- /sparseinst/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/caffe2sparseinst.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/caffe2sparseinst.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/coco_evaluation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/coco_evaluation.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/d2_predictor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/d2_predictor.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/dataset_mapper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/dataset_mapper.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/decoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/decoder.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/encoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/encoder.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/loss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/loss.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/sparseinst.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/sparseinst.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import build_resnet_vd_backbone 2 | from .pvt import build_pyramid_vision_transformer 3 | from .cspnet import build_cspnet_backbone -------------------------------------------------------------------------------- /sparseinst/backbones/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
/sparseinst/backbones/__pycache__/cspnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/cspnet.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/__pycache__/pvt.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/pvt.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/__pycache__/resnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/backbones/__pycache__/resnet.cpython-36.pyc -------------------------------------------------------------------------------- /sparseinst/backbones/cspnet.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from timm.models.layers import ConvBnAct, DropPath, AvgPool2dSame, create_attn 7 | 8 | 9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d 10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 11 | 12 | 13 | model_cfgs = dict( 14 | cspresnet50=dict( 15 | stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'), 16 | stage=dict( 17 | out_chs=(128, 256, 512, 1024), 18 | depth=(3, 3, 5, 2), 19 | stride=(1,) + (2,) * 3, 20 | exp_ratio=(2.,) * 4, 21 | bottle_ratio=(0.5,) * 4, 22 | block_ratio=(1.,) * 4, 23 | cross_linear=True, 24 | ) 25 | ), 26 | cspresnet50d=dict( 27 | stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'), 28 | stage=dict( 29 | out_chs=(128, 256, 512, 1024), 30 | depth=(3, 3, 5, 2), 31 | stride=(1,) + (2,) * 3, 32 | exp_ratio=(2.,) * 4, 33 | bottle_ratio=(0.5,) * 4, 34 | block_ratio=(1.,) * 4, 35 | cross_linear=True, 36 | ) 37 | ), 38 | cspresnet50w=dict( 39 | stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'), 40 | stage=dict( 41 | out_chs=(256, 512, 1024, 2048), 42 | depth=(3, 3, 5, 2), 43 | stride=(1,) + (2,) * 3, 44 | exp_ratio=(1.,) * 4, 45 | bottle_ratio=(0.25,) * 4, 46 | block_ratio=(0.5,) * 4, 47 | cross_linear=True, 48 | ) 49 | ), 50 | cspresnext50=dict( 51 | stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'), 52 | stage=dict( 53 | out_chs=(256, 512, 1024, 2048), 54 | depth=(3, 3, 5, 2), 55 | stride=(1,) + (2,) * 3, 56 | groups=(32,) * 4, 57 | exp_ratio=(1.,) * 4, 58 | bottle_ratio=(1.,) * 4, 59 | block_ratio=(0.5,) * 4, 60 | cross_linear=True, 61 | ) 62 | ), 63 | cspdarknet53=dict( 64 | stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''), 65 | stage=dict( 66 | out_chs=(64, 128, 256, 512, 1024), 67 | depth=(1, 2, 8, 8, 4), 68 | stride=(2,) * 5, 69 | exp_ratio=(2.,) + (1.,) * 4, 70 | bottle_ratio=(0.5,) + (1.0,) * 4, 71 | block_ratio=(1.,) + (0.5,) * 4, 72 | down_growth=True, 73 | ) 74 | ), 75 | darknet53=dict( 76 | stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''), 77 | stage=dict( 78 | out_chs=(64, 128, 256, 512, 1024), 79 | depth=(1, 2, 8, 8, 4), 80 | stride=(2,) * 5, 81 | bottle_ratio=(0.5,) * 5, 82 | block_ratio=(1.,) * 5, 83 | ) 84 | ) 85 | ) 86 | 87 | 88 | def create_stem( 
89 | in_chans=3, out_chs=32, kernel_size=3, stride=2, pool='', 90 | act_layer=None, norm_layer=None, aa_layer=None): 91 | stem = nn.Sequential() 92 | if not isinstance(out_chs, (tuple, list)): 93 | out_chs = [out_chs] 94 | assert len(out_chs) 95 | in_c = in_chans 96 | for i, out_c in enumerate(out_chs): 97 | conv_name = f'conv{i + 1}' 98 | stem.add_module(conv_name, ConvBnAct( 99 | in_c, out_c, kernel_size, stride=stride if i == 0 else 1, 100 | act_layer=act_layer, norm_layer=norm_layer)) 101 | in_c = out_c 102 | last_conv = conv_name 103 | if pool: 104 | if aa_layer is not None: 105 | stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) 106 | stem.add_module('aa', aa_layer(channels=in_c, stride=2)) 107 | else: 108 | stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) 109 | return stem, dict(num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv])) 110 | 111 | 112 | class ResBottleneck(nn.Module): 113 | """ ResNe(X)t Bottleneck Block 114 | """ 115 | 116 | def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.25, groups=1, 117 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_last=False, 118 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): 119 | super(ResBottleneck, self).__init__() 120 | mid_chs = int(round(out_chs * bottle_ratio)) 121 | ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, 122 | aa_layer=aa_layer, drop_block=drop_block) 123 | 124 | self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) 125 | self.conv2 = ConvBnAct(mid_chs, mid_chs, kernel_size=3, 126 | dilation=dilation, groups=groups, **ckwargs) 127 | self.attn2 = create_attn(attn_layer, channels=mid_chs) if not attn_last else None 128 | self.conv3 = ConvBnAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs) 129 | self.attn3 = create_attn(attn_layer, channels=out_chs) if attn_last else None 130 | self.drop_path = drop_path 131 | self.act3 = act_layer(inplace=True) 132 | 133 | def zero_init_last_bn(self): 134 | nn.init.zeros_(self.conv3.bn.weight) 135 | 136 | def forward(self, x): 137 | shortcut = x 138 | x = self.conv1(x) 139 | x = self.conv2(x) 140 | if self.attn2 is not None: 141 | x = self.attn2(x) 142 | x = self.conv3(x) 143 | if self.attn3 is not None: 144 | x = self.attn3(x) 145 | if self.drop_path is not None: 146 | x = self.drop_path(x) 147 | x = x + shortcut 148 | # FIXME partial shortcut needed if first block handled as per original, not used for my current impl 149 | #x[:, :shortcut.size(1)] += shortcut 150 | x = self.act3(x) 151 | return x 152 | 153 | 154 | class DarkBlock(nn.Module): 155 | """ DarkNet Block 156 | """ 157 | 158 | def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.5, groups=1, 159 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None, 160 | drop_block=None, drop_path=None): 161 | super(DarkBlock, self).__init__() 162 | mid_chs = int(round(out_chs * bottle_ratio)) 163 | ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, 164 | aa_layer=aa_layer, drop_block=drop_block) 165 | self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) 166 | self.conv2 = ConvBnAct(mid_chs, out_chs, kernel_size=3, 167 | dilation=dilation, groups=groups, **ckwargs) 168 | self.attn = create_attn(attn_layer, channels=out_chs) 169 | self.drop_path = drop_path 170 | 171 | def zero_init_last_bn(self): 172 | nn.init.zeros_(self.conv2.bn.weight) 173 | 174 | def forward(self, x): 175 | shortcut = x 176 | x = self.conv1(x) 177 | x = self.conv2(x) 178 | if 
self.attn is not None: 179 | x = self.attn(x) 180 | if self.drop_path is not None: 181 | x = self.drop_path(x) 182 | x = x + shortcut 183 | return x 184 | 185 | 186 | class CrossStage(nn.Module): 187 | """Cross Stage.""" 188 | 189 | def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., exp_ratio=1., 190 | groups=1, first_dilation=None, down_growth=False, cross_linear=False, block_dpr=None, 191 | block_fn=ResBottleneck, **block_kwargs): 192 | super(CrossStage, self).__init__() 193 | first_dilation = first_dilation or dilation 194 | down_chs = out_chs if down_growth else in_chs # grow downsample channels to output channels 195 | exp_chs = int(round(out_chs * exp_ratio)) 196 | block_out_chs = int(round(out_chs * block_ratio)) 197 | conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), 198 | norm_layer=block_kwargs.get('norm_layer')) 199 | 200 | if stride != 1 or first_dilation != dilation: 201 | self.conv_down = ConvBnAct( 202 | in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups, 203 | aa_layer=block_kwargs.get('aa_layer', None), **conv_kwargs) 204 | prev_chs = down_chs 205 | else: 206 | self.conv_down = None 207 | prev_chs = in_chs 208 | 209 | # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also, 210 | # there is also special case for the first stage for some of the model that results in uneven split 211 | # across the two paths. I did it this way for simplicity for now. 212 | self.conv_exp = ConvBnAct(prev_chs, exp_chs, kernel_size=1, 213 | apply_act=not cross_linear, **conv_kwargs) 214 | prev_chs = exp_chs // 2 # output of conv_exp is always split in two 215 | 216 | self.blocks = nn.Sequential() 217 | for i in range(depth): 218 | drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None 219 | self.blocks.add_module(str(i), block_fn( 220 | prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs)) 221 | prev_chs = block_out_chs 222 | 223 | # transition convs 224 | self.conv_transition_b = ConvBnAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs) 225 | self.conv_transition = ConvBnAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs) 226 | 227 | def forward(self, x): 228 | if self.conv_down is not None: 229 | x = self.conv_down(x) 230 | x = self.conv_exp(x) 231 | split = x.shape[1] // 2 232 | xs, xb = x[:, :split], x[:, split:] 233 | xb = self.blocks(xb) 234 | xb = self.conv_transition_b(xb).contiguous() 235 | out = self.conv_transition(torch.cat([xs, xb], dim=1)) 236 | return out 237 | 238 | 239 | class DarkStage(nn.Module): 240 | """DarkNet stage.""" 241 | 242 | def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., groups=1, 243 | first_dilation=None, block_fn=ResBottleneck, block_dpr=None, **block_kwargs): 244 | super(DarkStage, self).__init__() 245 | first_dilation = first_dilation or dilation 246 | 247 | self.conv_down = ConvBnAct( 248 | in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups, 249 | act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'), 250 | aa_layer=block_kwargs.get('aa_layer', None)) 251 | 252 | prev_chs = out_chs 253 | block_out_chs = int(round(out_chs * block_ratio)) 254 | self.blocks = nn.Sequential() 255 | for i in range(depth): 256 | drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None 257 | self.blocks.add_module(str(i), block_fn( 258 | prev_chs, block_out_chs, 
dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs)) 259 | prev_chs = block_out_chs 260 | 261 | def forward(self, x): 262 | x = self.conv_down(x) 263 | x = self.blocks(x) 264 | return x 265 | 266 | 267 | def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32, drop_path_rate=0.): 268 | # get per stage args for stage and containing blocks, calculate strides to meet target output_stride 269 | num_stages = len(cfg['depth']) 270 | if 'groups' not in cfg: 271 | cfg['groups'] = (1,) * num_stages 272 | if 'down_growth' in cfg and not isinstance(cfg['down_growth'], (list, tuple)): 273 | cfg['down_growth'] = (cfg['down_growth'],) * num_stages 274 | if 'cross_linear' in cfg and not isinstance(cfg['cross_linear'], (list, tuple)): 275 | cfg['cross_linear'] = (cfg['cross_linear'],) * num_stages 276 | cfg['block_dpr'] = [None] * num_stages if not drop_path_rate else \ 277 | [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg['depth'])).split(cfg['depth'])] 278 | stage_strides = [] 279 | stage_dilations = [] 280 | stage_first_dilations = [] 281 | dilation = 1 282 | for cfg_stride in cfg['stride']: 283 | stage_first_dilations.append(dilation) 284 | if curr_stride >= output_stride: 285 | dilation *= cfg_stride 286 | stride = 1 287 | else: 288 | stride = cfg_stride 289 | curr_stride *= stride 290 | stage_strides.append(stride) 291 | stage_dilations.append(dilation) 292 | cfg['stride'] = stage_strides 293 | cfg['dilation'] = stage_dilations 294 | cfg['first_dilation'] = stage_first_dilations 295 | stage_args = [dict(zip(cfg.keys(), values)) for values in zip(*cfg.values())] 296 | return stage_args 297 | 298 | 299 | class CSPNet(Backbone): 300 | """Cross Stage Partial base model. 301 | 302 | Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929 303 | Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks 304 | 305 | NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the 306 | darknet impl. I did it this way for simplicity and less special cases. 
307 | """ 308 | 309 | def __init__(self, cfg, in_chans=3, output_stride=32, global_pool='avg', drop_rate=0., 310 | act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_path_rate=0., 311 | zero_init_last_bn=True, stage_fn=CrossStage, block_fn=ResBottleneck, out_features=None): 312 | super().__init__() 313 | self.drop_rate = drop_rate 314 | assert output_stride in (8, 16, 32) 315 | layer_args = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer) 316 | 317 | # Construct the stem 318 | self.stem, stem_feat_info = create_stem(in_chans, **cfg['stem'], **layer_args) 319 | self.feature_info = [stem_feat_info] 320 | prev_chs = stem_feat_info['num_chs'] 321 | curr_stride = stem_feat_info['reduction'] # reduction does not include pool 322 | if cfg['stem']['pool']: 323 | curr_stride *= 2 324 | 325 | # Construct the stages 326 | per_stage_args = _cfg_to_stage_args( 327 | cfg['stage'], curr_stride=curr_stride, output_stride=output_stride, drop_path_rate=drop_path_rate) 328 | self.stages = nn.Sequential() 329 | out_channels = [] 330 | out_strides = [] 331 | for i, sa in enumerate(per_stage_args): 332 | self.stages.add_module( 333 | str(i), stage_fn(prev_chs, **sa, **layer_args, block_fn=block_fn)) 334 | prev_chs = sa['out_chs'] 335 | curr_stride *= sa['stride'] 336 | self.feature_info += [dict(num_chs=prev_chs, 337 | reduction=curr_stride, module=f'stages.{i}')] 338 | out_channels.append(prev_chs) 339 | out_strides.append(curr_stride) 340 | 341 | for m in self.modules(): 342 | if isinstance(m, nn.Conv2d): 343 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 344 | elif isinstance(m, nn.BatchNorm2d): 345 | nn.init.ones_(m.weight) 346 | nn.init.zeros_(m.bias) 347 | elif isinstance(m, nn.Linear): 348 | nn.init.normal_(m.weight, mean=0.0, std=0.01) 349 | nn.init.zeros_(m.bias) 350 | if zero_init_last_bn: 351 | for m in self.modules(): 352 | if hasattr(m, 'zero_init_last_bn'): 353 | m.zero_init_last_bn() 354 | 355 | # cspdarknet: csp1, csp2, csp3, csp4 356 | # cspresnet: csp0, csp1, csp2, csp3 357 | out_features_names = ["csp{}".format(i) for i in range(len(per_stage_args))] 358 | self._out_feature_strides = dict(zip(out_features_names, out_strides)) 359 | self._out_feature_channels = dict(zip(out_features_names, out_channels)) 360 | if out_features is None: 361 | self._out_features = out_features_names 362 | else: 363 | self._out_features = out_features 364 | 365 | def output_shape(self): 366 | return { 367 | name: ShapeSpec( 368 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 369 | ) 370 | for name in self._out_features 371 | } 372 | 373 | def size_divisibility(self): 374 | return 32 375 | 376 | def forward(self, x): 377 | x = self.stem(x) 378 | outputs = {} 379 | for i, stage in enumerate(self.stages): 380 | name = f"csp{i}" 381 | x = stage(x) 382 | if name in self._out_features: 383 | outputs[name] = x 384 | return outputs 385 | 386 | 387 | @BACKBONE_REGISTRY.register() 388 | def build_cspnet_backbone(cfg, input_shape=None): 389 | 390 | cspnet_name = cfg.MODEL.CSPNET.NAME 391 | norm_name = cfg.MODEL.CSPNET.NORM 392 | out_features = cfg.MODEL.CSPNET.OUT_FEATURES 393 | # DarkNet53 doesn't have batch norm 394 | if norm_name == "FrozenBN": 395 | norm = FrozenBatchNorm2d 396 | elif norm_name == "SyncBN": 397 | from detectron2.layers import NaiveSyncBatchNorm 398 | norm = NaiveSyncBatchNorm 399 | else: 400 | norm = nn.BatchNorm2d 401 | 402 | assert cspnet_name in ["cspresnet50", "cspresnet50d", "cspresnet50w", 403 | 
"cspresnext50", "cspdarknet53", "darknet53"] 404 | 405 | model_cfg = model_cfgs[cspnet_name] 406 | 407 | if "darknet" in cspnet_name: 408 | block_fn = DarkBlock 409 | else: 410 | block_fn = ResBottleneck 411 | 412 | if cspnet_name == "darknet53": 413 | stage_fn = DarkStage 414 | else: 415 | stage_fn = CrossStage 416 | 417 | model = CSPNet( 418 | model_cfg, 419 | in_chans=input_shape.channels, 420 | norm_layer=norm, 421 | stage_fn=stage_fn, 422 | block_fn=block_fn, 423 | out_features=out_features) 424 | return model 425 | -------------------------------------------------------------------------------- /sparseinst/backbones/pvt.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from functools import partial 6 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 7 | from detectron2.layers import ShapeSpec 8 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 9 | 10 | 11 | class Mlp(nn.Module): 12 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False): 13 | super().__init__() 14 | out_features = out_features or in_features 15 | hidden_features = hidden_features or in_features 16 | self.fc1 = nn.Linear(in_features, hidden_features) 17 | self.dwconv = DWConv(hidden_features) 18 | self.act = act_layer() 19 | self.fc2 = nn.Linear(hidden_features, out_features) 20 | self.drop = nn.Dropout(drop) 21 | self.linear = linear 22 | if self.linear: 23 | self.relu = nn.ReLU(inplace=True) 24 | self.apply(self._init_weights) 25 | 26 | def _init_weights(self, m): 27 | if isinstance(m, nn.Linear): 28 | trunc_normal_(m.weight, std=.02) 29 | if isinstance(m, nn.Linear) and m.bias is not None: 30 | nn.init.constant_(m.bias, 0) 31 | elif isinstance(m, nn.LayerNorm): 32 | nn.init.constant_(m.bias, 0) 33 | nn.init.constant_(m.weight, 1.0) 34 | elif isinstance(m, nn.Conv2d): 35 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 36 | fan_out //= m.groups 37 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 38 | if m.bias is not None: 39 | m.bias.data.zero_() 40 | 41 | def forward(self, x, H, W): 42 | x = self.fc1(x) 43 | if self.linear: 44 | x = self.relu(x) 45 | x = self.dwconv(x, H, W) 46 | x = self.act(x) 47 | x = self.drop(x) 48 | x = self.fc2(x) 49 | x = self.drop(x) 50 | return x 51 | 52 | 53 | class Attention(nn.Module): 54 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False): 55 | super().__init__() 56 | assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
57 | 58 | self.dim = dim 59 | self.num_heads = num_heads 60 | head_dim = dim // num_heads 61 | self.scale = qk_scale or head_dim ** -0.5 62 | 63 | self.q = nn.Linear(dim, dim, bias=qkv_bias) 64 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) 65 | self.attn_drop = nn.Dropout(attn_drop) 66 | self.proj = nn.Linear(dim, dim) 67 | self.proj_drop = nn.Dropout(proj_drop) 68 | 69 | self.linear = linear 70 | self.sr_ratio = sr_ratio 71 | if not linear: 72 | if sr_ratio > 1: 73 | self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) 74 | self.norm = nn.LayerNorm(dim) 75 | else: 76 | self.pool = nn.AdaptiveAvgPool2d(7) 77 | self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1) 78 | self.norm = nn.LayerNorm(dim) 79 | self.act = nn.GELU() 80 | self.apply(self._init_weights) 81 | 82 | def _init_weights(self, m): 83 | if isinstance(m, nn.Linear): 84 | trunc_normal_(m.weight, std=.02) 85 | if isinstance(m, nn.Linear) and m.bias is not None: 86 | nn.init.constant_(m.bias, 0) 87 | elif isinstance(m, nn.LayerNorm): 88 | nn.init.constant_(m.bias, 0) 89 | nn.init.constant_(m.weight, 1.0) 90 | elif isinstance(m, nn.Conv2d): 91 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 92 | fan_out //= m.groups 93 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 94 | if m.bias is not None: 95 | m.bias.data.zero_() 96 | 97 | def forward(self, x, H, W): 98 | B, N, C = x.shape 99 | q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) 100 | 101 | if not self.linear: 102 | if self.sr_ratio > 1: 103 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 104 | x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) 105 | x_ = self.norm(x_) 106 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 107 | else: 108 | kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 109 | else: 110 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 111 | x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1) 112 | x_ = self.norm(x_) 113 | x_ = self.act(x_) 114 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 115 | k, v = kv[0], kv[1] 116 | 117 | attn = (q @ k.transpose(-2, -1)) * self.scale 118 | attn = attn.softmax(dim=-1) 119 | attn = self.attn_drop(attn) 120 | 121 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 122 | x = self.proj(x) 123 | x = self.proj_drop(x) 124 | 125 | return x 126 | 127 | 128 | class Block(nn.Module): 129 | 130 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., 131 | drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False): 132 | super().__init__() 133 | self.norm1 = norm_layer(dim) 134 | self.attn = Attention( 135 | dim, 136 | num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, 137 | attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear) 138 | # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here 139 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 140 | self.norm2 = norm_layer(dim) 141 | mlp_hidden_dim = int(dim * mlp_ratio) 142 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear) 143 | 144 | self.apply(self._init_weights) 145 | 146 | def _init_weights(self, m): 147 | if isinstance(m, nn.Linear): 148 | trunc_normal_(m.weight, std=.02) 149 | if isinstance(m, nn.Linear) and m.bias is not None: 150 | nn.init.constant_(m.bias, 0) 151 | elif isinstance(m, nn.LayerNorm): 152 | nn.init.constant_(m.bias, 0) 153 | nn.init.constant_(m.weight, 1.0) 154 | elif isinstance(m, nn.Conv2d): 155 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 156 | fan_out //= m.groups 157 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 158 | if m.bias is not None: 159 | m.bias.data.zero_() 160 | 161 | def forward(self, x, H, W): 162 | x = x + self.drop_path(self.attn(self.norm1(x), H, W)) 163 | x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) 164 | 165 | return x 166 | 167 | 168 | class OverlapPatchEmbed(nn.Module): 169 | """ Image to Patch Embedding 170 | """ 171 | 172 | def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768): 173 | super().__init__() 174 | img_size = to_2tuple(img_size) 175 | patch_size = to_2tuple(patch_size) 176 | 177 | self.img_size = img_size 178 | self.patch_size = patch_size 179 | self.H, self.W = img_size[0] // stride, img_size[1] // stride 180 | self.num_patches = self.H * self.W 181 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, 182 | padding=(patch_size[0] // 2, patch_size[1] // 2)) 183 | self.norm = nn.LayerNorm(embed_dim) 184 | 185 | self.apply(self._init_weights) 186 | 187 | def _init_weights(self, m): 188 | if isinstance(m, nn.Linear): 189 | trunc_normal_(m.weight, std=.02) 190 | if isinstance(m, nn.Linear) and m.bias is not None: 191 | nn.init.constant_(m.bias, 0) 192 | elif isinstance(m, nn.LayerNorm): 193 | nn.init.constant_(m.bias, 0) 194 | nn.init.constant_(m.weight, 1.0) 195 | elif isinstance(m, nn.Conv2d): 196 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 197 | fan_out //= m.groups 198 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 199 | if m.bias is not None: 200 | m.bias.data.zero_() 201 | 202 | def forward(self, x): 203 | x = self.proj(x) 204 | _, _, H, W = x.shape 205 | x = x.flatten(2).transpose(1, 2) 206 | x = self.norm(x) 207 | 208 | return x, H, W 209 | 210 | 211 | class PyramidVisionTransformerV2(Backbone): 212 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dims=[64, 128, 256, 512], 213 | num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., 214 | attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, depths=[3, 4, 6, 3], 215 | sr_ratios=[8, 4, 2, 1], num_stages=4, linear=False, out_features=None): 216 | super().__init__() 217 | self.depths = depths 218 | self.num_stages = num_stages 219 | self.linear = linear 220 | 221 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule 222 | cur = 0 223 | 224 | for i in range(num_stages): 225 | patch_embed = OverlapPatchEmbed(img_size=img_size if i == 0 else img_size // (2 ** (i + 1)), 226 | patch_size=7 if i == 0 else 3, 227 | stride=4 if i == 0 else 2, 228 | in_chans=in_chans if i == 0 else embed_dims[i - 1], 229 | embed_dim=embed_dims[i]) 230 | 231 | block = nn.ModuleList([Block( 232 | dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], 
qkv_bias=qkv_bias, 233 | qk_scale=qk_scale, 234 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer, 235 | sr_ratio=sr_ratios[i], linear=linear) 236 | for j in range(depths[i])]) 237 | norm = norm_layer(embed_dims[i]) 238 | cur += depths[i] 239 | 240 | setattr(self, f"patch_embed{i + 1}", patch_embed) 241 | setattr(self, f"block{i + 1}", block) 242 | setattr(self, f"norm{i + 1}", norm) 243 | 244 | out_features_names = ["p1", "p2", "p3", "p4"] 245 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32])) 246 | self._out_feature_channels = dict(zip(out_features_names, embed_dims)) 247 | if out_features is None: 248 | self._out_features = out_features_names 249 | else: 250 | self._out_features = out_features 251 | self.out_features_names = out_features_names 252 | self.apply(self._init_weights) 253 | 254 | def _init_weights(self, m): 255 | if isinstance(m, nn.Linear): 256 | trunc_normal_(m.weight, std=.02) 257 | if isinstance(m, nn.Linear) and m.bias is not None: 258 | nn.init.constant_(m.bias, 0) 259 | elif isinstance(m, nn.LayerNorm): 260 | nn.init.constant_(m.bias, 0) 261 | nn.init.constant_(m.weight, 1.0) 262 | elif isinstance(m, nn.Conv2d): 263 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 264 | fan_out //= m.groups 265 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 266 | if m.bias is not None: 267 | m.bias.data.zero_() 268 | 269 | def freeze_patch_emb(self): 270 | self.patch_embed1.requires_grad = False 271 | 272 | @torch.jit.ignore 273 | def no_weight_decay(self): 274 | return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better 275 | 276 | 277 | def output_shape(self): 278 | return { 279 | name: ShapeSpec( 280 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 281 | ) 282 | for name in self._out_features 283 | } 284 | 285 | def size_divisibility(self): 286 | return 32 287 | 288 | 289 | def forward(self, x): 290 | B = x.shape[0] 291 | outputs = {} 292 | 293 | for i in range(self.num_stages): 294 | patch_embed = getattr(self, f"patch_embed{i + 1}") 295 | block = getattr(self, f"block{i + 1}") 296 | norm = getattr(self, f"norm{i + 1}") 297 | x, H, W = patch_embed(x) 298 | for blk in block: 299 | x = blk(x, H, W) 300 | x = norm(x) 301 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 302 | if self.out_features_names[i] in self._out_features: 303 | outputs[self.out_features_names[i]] = x 304 | return outputs 305 | 306 | 307 | class DWConv(nn.Module): 308 | def __init__(self, dim=768): 309 | super(DWConv, self).__init__() 310 | self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) 311 | 312 | def forward(self, x, H, W): 313 | B, N, C = x.shape 314 | x = x.transpose(1, 2).view(B, C, H, W) 315 | x = self.dwconv(x) 316 | x = x.flatten(2).transpose(1, 2) 317 | 318 | return x 319 | 320 | 321 | def _conv_filter(state_dict, patch_size=16): 322 | """ convert patch embedding weight from manual patchify + linear proj to conv""" 323 | out_dict = {} 324 | for k, v in state_dict.items(): 325 | if 'patch_embed.proj.weight' in k: 326 | v = v.reshape((v.shape[0], 3, patch_size, patch_size)) 327 | out_dict[k] = v 328 | 329 | return out_dict 330 | 331 | 332 | @BACKBONE_REGISTRY.register() 333 | def build_pyramid_vision_transformer(cfg, input_shape): 334 | name = cfg.MODEL.PVT.NAME 335 | linear = cfg.MODEL.PVT.LINEAR 336 | out_features = cfg.MODEL.PVT.OUT_FEATURES 337 | 338 | if linear: 339 | name = "b2" 340 | 341 | if name == 
"b0": 342 | embed_dims=[32, 64, 160, 256] 343 | else: 344 | embed_dims=[64, 128, 320, 512] 345 | 346 | depths = { 347 | "b0": [2, 2, 2, 2], 348 | "b1": [2, 2, 2, 2], 349 | "b2": [3, 4, 6, 3], 350 | "b3": [3, 4, 18, 3], 351 | "b4": [3, 8, 27, 3], 352 | "b5": [3, 6, 40, 3] 353 | } 354 | 355 | if name == "b5": 356 | mlp_ratios = [4, 4, 4, 4] 357 | else: 358 | mlp_ratios = [8, 8, 4, 4] 359 | 360 | in_channels = input_shape.channels 361 | 362 | return PyramidVisionTransformerV2( 363 | patch_size=4, 364 | depths=depths[name], 365 | in_chans=in_channels, 366 | embed_dims=embed_dims, 367 | num_heads=[1, 2, 5, 8], 368 | mlp_ratios=mlp_ratios, 369 | drop_rate=0.0, 370 | drop_path_rate=0.1, 371 | sr_ratios=[8, 4, 2, 1], 372 | qkv_bias=True, 373 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 374 | out_features=out_features, 375 | linear=linear 376 | ) 377 | 378 | -------------------------------------------------------------------------------- /sparseinst/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 3 | 4 | import math 5 | import torch.nn as nn 6 | from timm.models.resnet import BasicBlock, Bottleneck 7 | from timm.models.layers import DropBlock2d, DropPath, AvgPool2dSame 8 | 9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d 10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 11 | from detectron2.layers import NaiveSyncBatchNorm, DeformConv 12 | 13 | 14 | def get_padding(kernel_size, stride, dilation=1): 15 | padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 16 | return padding 17 | 18 | 19 | """ 20 | inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, 21 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, 22 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None 23 | """ 24 | 25 | 26 | class DeformableBottleneck(nn.Module): 27 | expansion = 4 28 | 29 | def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, 30 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, 31 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): 32 | super().__init__() 33 | 34 | width = int(math.floor(planes * (base_width / 64)) * cardinality) 35 | first_planes = width // reduce_first 36 | outplanes = planes * self.expansion 37 | first_dilation = first_dilation or dilation 38 | # use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) 39 | 40 | self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False) 41 | self.bn1 = norm_layer(first_planes) 42 | self.act1 = act_layer(inplace=True) 43 | 44 | self.conv2_offset = nn.Conv2d( 45 | first_planes, 46 | 18, 47 | kernel_size=3, 48 | stride=stride, 49 | padding=first_dilation, 50 | dilation=first_dilation 51 | ) 52 | self.conv2 = DeformConv( 53 | first_planes, 54 | width, 55 | kernel_size=3, 56 | stride=stride, 57 | padding=first_dilation, 58 | bias=False, 59 | dilation=first_dilation, 60 | ) 61 | 62 | self.bn2 = norm_layer(width) 63 | self.act2 = act_layer(inplace=True) 64 | # self.aa = aa_layer(channels=width, stride=stride) if use_aa else None 65 | 66 | self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False) 67 | self.bn3 = norm_layer(outplanes) 68 | 69 | # self.se = create_attn(attn_layer, outplanes) 70 | 71 | self.act3 = act_layer(inplace=True) 72 | self.downsample = downsample 73 | 
self.stride = stride 74 | self.dilation = dilation 75 | # self.drop_block = drop_block 76 | # self.drop_path = drop_path 77 | 78 | nn.init.constant_(self.conv2_offset.weight, 0) 79 | nn.init.constant_(self.conv2_offset.bias, 0) 80 | 81 | def zero_init_last_bn(self): 82 | nn.init.zeros_(self.bn3.weight) 83 | 84 | def forward(self, x): 85 | shortcut = x 86 | 87 | x = self.conv1(x) 88 | x = self.bn1(x) 89 | 90 | x = self.act1(x) 91 | 92 | offset = self.conv2_offset(x) 93 | x = self.conv2(x, offset) 94 | x = self.bn2(x) 95 | x = self.act2(x) 96 | 97 | x = self.conv3(x) 98 | x = self.bn3(x) 99 | 100 | if self.downsample is not None: 101 | shortcut = self.downsample(shortcut) 102 | x += shortcut 103 | x = self.act3(x) 104 | 105 | return x 106 | 107 | 108 | BLOCK_TYPE = { 109 | "basic": BasicBlock, 110 | "bottleneck": Bottleneck, 111 | "deform_bottleneck": DeformableBottleneck 112 | } 113 | 114 | 115 | def downsample_conv( 116 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): 117 | norm_layer = norm_layer or nn.BatchNorm2d 118 | kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size 119 | first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1 120 | p = get_padding(kernel_size, stride, first_dilation) 121 | 122 | return nn.Sequential(*[ 123 | nn.Conv2d( 124 | in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False), 125 | norm_layer(out_channels) 126 | ]) 127 | 128 | 129 | def downsample_avg( 130 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): 131 | norm_layer = norm_layer or nn.BatchNorm2d 132 | avg_stride = stride if dilation == 1 else 1 133 | if stride == 1 and dilation == 1: 134 | pool = nn.Identity() 135 | else: 136 | avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d 137 | pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) 138 | 139 | return nn.Sequential(*[ 140 | pool, 141 | nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False), 142 | norm_layer(out_channels) 143 | ]) 144 | 145 | 146 | def drop_blocks(drop_block_rate=0.): 147 | return [ 148 | None, None, 149 | DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None, 150 | DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None] 151 | 152 | 153 | def make_blocks( 154 | stage_block, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, 155 | down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): 156 | stages = [] 157 | feature_info = [] 158 | net_num_blocks = sum(block_repeats) 159 | net_block_idx = 0 160 | net_stride = 4 161 | dilation = prev_dilation = 1 162 | for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): 163 | # choose block_fn through the BLOCK_TYPE 164 | block_fn = BLOCK_TYPE[stage_block[stage_idx]] 165 | 166 | stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it 167 | stride = 1 if stage_idx == 0 else 2 168 | if net_stride >= output_stride: 169 | dilation *= stride 170 | stride = 1 171 | else: 172 | net_stride *= stride 173 | 174 | downsample = None 175 | if stride != 1 or inplanes != planes * block_fn.expansion: 176 | down_kwargs = dict( 177 | in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size, 178 | stride=stride, dilation=dilation, first_dilation=prev_dilation, 
norm_layer=kwargs.get('norm_layer')) 179 | downsample = downsample_avg( 180 | **down_kwargs) if avg_down else downsample_conv(**down_kwargs) 181 | 182 | block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs) 183 | blocks = [] 184 | for block_idx in range(num_blocks): 185 | downsample = downsample if block_idx == 0 else None 186 | stride = stride if block_idx == 0 else 1 187 | block_dpr = drop_path_rate * net_block_idx / \ 188 | (net_num_blocks - 1) # stochastic depth linear decay rule 189 | blocks.append(block_fn( 190 | inplanes, planes, stride, downsample, first_dilation=prev_dilation, 191 | drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs)) 192 | prev_dilation = dilation 193 | inplanes = planes * block_fn.expansion 194 | net_block_idx += 1 195 | 196 | stages.append((stage_name, nn.Sequential(*blocks))) 197 | feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name)) 198 | 199 | return stages, feature_info 200 | 201 | 202 | class ResNet(Backbone): 203 | """ResNet / ResNeXt / SE-ResNeXt / SE-Net 204 | 205 | This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that 206 | * have > 1 stride in the 3x3 conv layer of bottleneck 207 | * have conv-bn-act ordering 208 | 209 | This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s 210 | variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the 211 | 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. 212 | 213 | ResNet variants (the same modifications can be used in SE/ResNeXt models as well): 214 | * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b 215 | * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) 216 | * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample 217 | * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample 218 | * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) 219 | * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample 220 | * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample 221 | 222 | ResNeXt 223 | * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths 224 | * same c,d, e, s variants as ResNet can be enabled 225 | 226 | SE-ResNeXt 227 | * normal - 7x7 stem, stem_width = 64 228 | * same c, d, e, s variants as ResNet can be enabled 229 | 230 | SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, 231 | reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block 232 | 233 | Parameters 234 | ---------- 235 | block : Block 236 | Class for the residual block. Options are BasicBlockGl, BottleneckGl. 237 | layers : list of int 238 | Numbers of layers in each block 239 | num_classes : int, default 1000 240 | Number of classification classes. 241 | in_chans : int, default 3 242 | Number of input (color) channels. 243 | cardinality : int, default 1 244 | Number of convolution groups for 3x3 conv in Bottleneck. 245 | base_width : int, default 64 246 | Factor determining bottleneck channels. 
`planes * base_width / 64 * cardinality` 247 | stem_width : int, default 64 248 | Number of channels in stem convolutions 249 | stem_type : str, default '' 250 | The type of stem: 251 | * '', default - a single 7x7 conv with a width of stem_width 252 | * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2 253 | * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2 254 | block_reduce_first: int, default 1 255 | Reduction factor for first convolution output width of residual blocks, 256 | 1 for all archs except senets, where 2 257 | down_kernel_size: int, default 1 258 | Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for senets 259 | avg_down : bool, default False 260 | Whether to use average pooling for projection skip connection between stages/downsample. 261 | output_stride : int, default 32 262 | Set the output stride of the network, 32, 16, or 8. Typically used in segmentation. 263 | act_layer : nn.Module, activation layer 264 | norm_layer : nn.Module, normalization layer 265 | aa_layer : nn.Module, anti-aliasing layer 266 | drop_rate : float, default 0. 267 | Dropout probability before classifier, for training 268 | global_pool : str, default 'avg' 269 | Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax' 270 | """ 271 | 272 | def __init__(self, block_types, layers, in_chans=3, 273 | cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False, 274 | output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False, 275 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0., 276 | drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, out_features=None): 277 | block_args = block_args or dict() 278 | assert output_stride in (8, 16, 32) 279 | # self.num_classes = num_classes 280 | self.drop_rate = drop_rate 281 | super(ResNet, self).__init__() 282 | 283 | # Stem 284 | deep_stem = 'deep' in stem_type 285 | inplanes = stem_width * 2 if deep_stem else 64 286 | if deep_stem: 287 | stem_chs = (stem_width, stem_width) 288 | if 'tiered' in stem_type: 289 | stem_chs = (3 * (stem_width // 4), stem_width) 290 | self.conv1 = nn.Sequential(*[ 291 | nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False), 292 | norm_layer(stem_chs[0]), 293 | act_layer(inplace=True), 294 | nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False), 295 | norm_layer(stem_chs[1]), 296 | act_layer(inplace=True), 297 | nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)]) 298 | else: 299 | self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7, 300 | stride=2, padding=3, bias=False) 301 | self.bn1 = norm_layer(inplanes) 302 | self.act1 = act_layer(inplace=True) 303 | self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] 304 | 305 | # Stem Pooling 306 | if replace_stem_pool: 307 | self.maxpool = nn.Sequential(*filter(None, [ 308 | nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False), 309 | aa_layer(channels=inplanes, stride=2) if aa_layer else None, 310 | norm_layer(inplanes), 311 | act_layer(inplace=True) 312 | ])) 313 | else: 314 | if aa_layer is not None: 315 | self.maxpool = nn.Sequential(*[ 316 | nn.MaxPool2d(kernel_size=3, stride=1, padding=1), 317 | aa_layer(channels=inplanes, stride=2)]) 318 | else: 319 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 320 | 321 | # Feature Blocks 322 | channels = 
[64, 128, 256, 512] 323 | stage_modules, stage_feature_info = make_blocks( 324 | block_types, channels, layers, inplanes, cardinality=cardinality, base_width=base_width, 325 | output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, 326 | down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, 327 | drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) 328 | for stage in stage_modules: 329 | self.add_module(*stage) # layer1, layer2, etc 330 | self.feature_info.extend(stage_feature_info) 331 | 332 | for n, m in self.named_modules(): 333 | if isinstance(m, nn.BatchNorm2d): 334 | nn.init.constant_(m.weight, 1.) 335 | nn.init.constant_(m.bias, 0.) 336 | if zero_init_last_bn: 337 | for m in self.modules(): 338 | if hasattr(m, 'zero_init_last_bn'): 339 | m.zero_init_last_bn() 340 | 341 | out_features_names = ["res2", "res3", "res4", "res5"] 342 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32])) 343 | self._out_feature_channels = dict( 344 | zip(out_features_names, [x * BLOCK_TYPE[block_types[0]].expansion for x in [64, 128, 256, 512]])) 345 | if out_features is None: 346 | self._out_features = out_features_names 347 | else: 348 | self._out_features = out_features 349 | 350 | def output_shape(self): 351 | return { 352 | name: ShapeSpec( 353 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 354 | ) 355 | for name in self._out_features 356 | } 357 | 358 | def size_divisibility(self): 359 | return 32 360 | 361 | def forward(self, x): 362 | x = self.conv1(x) 363 | x = self.bn1(x) 364 | x = self.act1(x) 365 | x = self.maxpool(x) 366 | outputs = {} 367 | x = self.layer1(x) 368 | # outputs["res2"] = x 369 | x = self.layer2(x) 370 | outputs["res3"] = x 371 | x = self.layer3(x) 372 | outputs["res4"] = x 373 | x = self.layer4(x) 374 | outputs["res5"] = x 375 | return outputs 376 | 377 | 378 | @BACKBONE_REGISTRY.register() 379 | def build_resnet_vd_backbone(cfg, input_shape): 380 | 381 | depth = cfg.MODEL.RESNETS.DEPTH 382 | norm_name = cfg.MODEL.RESNETS.NORM 383 | if norm_name == "FrozenBN": 384 | norm = FrozenBatchNorm2d 385 | elif norm_name == "SyncBN": 386 | norm = NaiveSyncBatchNorm 387 | else: 388 | norm = nn.BatchNorm2d 389 | if depth == 50: 390 | layers = [3, 4, 6, 3] 391 | elif depth == 101: 392 | layers = [3, 4, 23, 3] 393 | else: 394 | raise NotImplementedError() 395 | 396 | stage_blocks = [] 397 | use_deformable = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE 398 | for idx in range(4): 399 | if use_deformable[idx]: 400 | stage_blocks.append("deform_bottleneck") 401 | else: 402 | stage_blocks.append("bottleneck") 403 | 404 | model = ResNet(stage_blocks, layers, stem_type="deep", 405 | stem_width=32, avg_down=True, norm_layer=norm) 406 | return model 407 | -------------------------------------------------------------------------------- /sparseinst/caffe2sparseinst.py: -------------------------------------------------------------------------------- 1 | from detectron2.export.caffe2_modeling import * 2 | from sparseinst import SparseInst 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | 6 | class Caffe2SparseInst(Caffe2MetaArch): 7 | def __init__(self, cfg, torch_model): 8 | assert isinstance(torch_model, SparseInst) 9 | # torch_model.backbone.size_divisibility = 32 10 | super().__init__(cfg, torch_model) 11 | self.torch_model = torch_model 12 | self.pixel_mean = self.torch_model.pixel_mean/255 13 | self.pixel_std = 
self.torch_model.pixel_std/255 14 | 15 | def get_caffe2_inputs(self, batched_inputs): 16 | inputs = super().get_caffe2_inputs(batched_inputs) 17 | return inputs[0]/255 18 | 19 | def encode_additional_info(self, predict_net, init_net): 20 | pass 21 | 22 | def normalizer(self, image): 23 | image = (image - self.pixel_mean) / self.pixel_std 24 | return image 25 | 26 | @mock_torch_nn_functional_interpolate() 27 | def forward(self, inputs): 28 | images = self.normalizer(inputs) 29 | images = ImageList.from_tensors([images], 32)[0] 30 | # forward 31 | features = self.torch_model.backbone(images) 32 | features = self.torch_model.encoder(features) 33 | output = self.torch_model.decoder(features) 34 | pred_scores = output["pred_logits"].sigmoid() 35 | pred_masks = output["pred_masks"].sigmoid() 36 | pred_objectness = output["pred_scores"].sigmoid() 37 | pred_scores2 = torch.sqrt(pred_scores * pred_objectness) 38 | 39 | # scores, masks = np.squeeze(pred_scores2), np.squeeze(pred_masks) 40 | # keep = torch.argmax(scores, axis=1) 41 | # masks = [masks[label, :, :] for i, label in enumerate(keep) if scores[i, label] > 0.35] 42 | # fig = plt.figure() 43 | # num_masks = len(masks) 44 | # for i, mask in enumerate(masks, 1): 45 | # fig.add_subplot(1, num_masks, i) 46 | # plt.imshow(mask.data.cpu()) 47 | # plt.show() 48 | # plt.ion() 49 | 50 | # return 51 | 52 | return pred_scores2, pred_masks 53 | 54 | @staticmethod 55 | def get_outputs_converter(predict_net, init_net): 56 | pass 57 | 58 | 59 | META_ARCH_CAFFE2_EXPORT_TYPE_MAP['SparseInst'] = Caffe2SparseInst -------------------------------------------------------------------------------- /sparseinst/coco_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycocotools.mask as mask_util 3 | from detectron2.structures import BoxMode 4 | from detectron2.evaluation import COCOEvaluator 5 | 6 | 7 | def instances_to_coco_json(instances, img_id): 8 | """ 9 | Dump an "Instances" object to a COCO-format json that's used for evaluation. 10 | 11 | Args: 12 | instances (Instances): 13 | img_id (int): the image id 14 | 15 | Returns: 16 | list[dict]: list of json annotations in COCO format. 17 | """ 18 | num_instance = len(instances) 19 | if num_instance == 0: 20 | return [] 21 | 22 | # NOTE: pure instance segmentation 23 | has_box = instances.has("pred_boxes") 24 | if has_box: 25 | boxes = instances.pred_boxes.tensor.numpy() 26 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 27 | boxes = boxes.tolist() 28 | 29 | scores = instances.scores.tolist() 30 | classes = instances.pred_classes.tolist() 31 | 32 | has_mask = instances.has("pred_masks") 33 | if has_mask: 34 | # use RLE to encode the masks, because they are too large and takes memory 35 | # since this evaluator stores outputs of the entire dataset 36 | rles = [ 37 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 38 | for mask in instances.pred_masks 39 | ] 40 | for rle in rles: 41 | # "counts" is an array encoded by mask_util as a byte-stream. Python3's 42 | # json writer which always produces strings cannot serialize a bytestream 43 | # unless you decode it. Thankfully, utf-8 works out (which is also what 44 | # the pycocotools/_mask.pyx does). 
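# --- Illustrative aside (not part of the original coco_evaluation.py) --------
# Minimal demonstration of the bytes-vs-JSON issue described in the comment
# above, using the same pycocotools call as this loop (toy mask, invented
# sizes); the decode on the next line is the actual per-instance fix.
import numpy as np
import pycocotools.mask as mask_util

toy_mask = np.zeros((8, 8), dtype=np.uint8)
toy_mask[2:5, 3:6] = 1
toy_rle = mask_util.encode(np.asfortranarray(toy_mask))
print(type(toy_rle["counts"]))                         # <class 'bytes'>
toy_rle["counts"] = toy_rle["counts"].decode("utf-8")  # now JSON-serializable
# ------------------------------------------------------------------------------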
45 | rle["counts"] = rle["counts"].decode("utf-8") 46 | 47 | has_keypoints = instances.has("pred_keypoints") 48 | if has_keypoints: 49 | keypoints = instances.pred_keypoints 50 | 51 | results = [] 52 | for k in range(num_instance): 53 | result = { 54 | "image_id": img_id, 55 | "category_id": classes[k], 56 | "score": scores[k], 57 | } 58 | if has_box: 59 | result["bbox"] = boxes[k] 60 | if has_mask: 61 | result["segmentation"] = rles[k] 62 | if has_keypoints: 63 | # In COCO annotations, 64 | # keypoints coordinates are pixel indices. 65 | # However our predictions are floating point coordinates. 66 | # Therefore we subtract 0.5 to be consistent with the annotation format. 67 | # This is the inverse of data loading logic in `datasets/coco.py`. 68 | keypoints[k][:, :2] -= 0.5 69 | result["keypoints"] = keypoints[k].flatten().tolist() 70 | results.append(result) 71 | return results 72 | 73 | 74 | class COCOMaskEvaluator(COCOEvaluator): 75 | 76 | def process(self, inputs, outputs): 77 | """ 78 | Args: 79 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 80 | It is a list of dict. Each dict corresponds to an image and 81 | contains keys like "height", "width", "file_name", "image_id". 82 | outputs: the outputs of a COCO model. It is a list of dicts with key 83 | "instances" that contains :class:`Instances`. 84 | """ 85 | for input, output in zip(inputs, outputs): 86 | prediction = {"image_id": input["image_id"]} 87 | 88 | if "instances" in output: 89 | instances = output["instances"].to(self._cpu_device) 90 | prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) 91 | if "proposals" in output: 92 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 93 | if len(prediction) > 1: 94 | self._predictions.append(prediction) -------------------------------------------------------------------------------- /sparseinst/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. 
All Rights Reserved 2 | 3 | from detectron2.config import CfgNode as CN 4 | 5 | def add_sparse_inst_config(cfg): 6 | 7 | cfg.MODEL.DEVICE = 'cuda' 8 | cfg.MODEL.MASK_ON = True 9 | # [SparseInst] 10 | cfg.MODEL.SPARSE_INST = CN() 11 | 12 | # parameters for inference 13 | cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005 14 | cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45 15 | cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100 16 | 17 | # [Encoder] 18 | cfg.MODEL.SPARSE_INST.ENCODER = CN() 19 | cfg.MODEL.SPARSE_INST.ENCODER.NAME = "FPNEncoder" 20 | cfg.MODEL.SPARSE_INST.ENCODER.NORM = "" 21 | cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES = ["res3", "res4", "res5"] 22 | cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS = 256 23 | 24 | # [Decoder] 25 | cfg.MODEL.SPARSE_INST.DECODER = CN() 26 | cfg.MODEL.SPARSE_INST.DECODER.NAME = "BaseIAMDecoder" 27 | cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100 28 | cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80 29 | # kernels for mask features 30 | cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128 31 | # upsample factor for output masks 32 | cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2.0 33 | cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False 34 | cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4 35 | # decoder.inst_branch 36 | cfg.MODEL.SPARSE_INST.DECODER.INST = CN() 37 | cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256 38 | cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4 39 | # decoder.mask_branch 40 | cfg.MODEL.SPARSE_INST.DECODER.MASK = CN() 41 | cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256 42 | cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4 43 | 44 | # [Loss] 45 | cfg.MODEL.SPARSE_INST.LOSS = CN() 46 | cfg.MODEL.SPARSE_INST.LOSS.NAME = "SparseInstCriterion" 47 | cfg.MODEL.SPARSE_INST.LOSS.ITEMS = ("labels", "masks") 48 | # loss weight 49 | cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT = 2.0 50 | cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT = 5.0 51 | cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT = 2.0 52 | # iou-aware objectness loss weight 53 | cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT = 1.0 54 | 55 | # [Matcher] 56 | cfg.MODEL.SPARSE_INST.MATCHER = CN() 57 | cfg.MODEL.SPARSE_INST.MATCHER.NAME = "SparseInstMatcher" 58 | cfg.MODEL.SPARSE_INST.MATCHER.ALPHA = 0.8 59 | cfg.MODEL.SPARSE_INST.MATCHER.BETA = 0.2 60 | 61 | # [Optimizer] 62 | cfg.SOLVER.OPTIMIZER = "ADAMW" 63 | cfg.SOLVER.BACKBONE_MULTIPLIER = 1.0 64 | cfg.SOLVER.AMSGRAD = False 65 | 66 | # [Dataset mapper] 67 | cfg.MODEL.SPARSE_INST.DATASET_MAPPER = "SparseInstDatasetMapper" 68 | 69 | # [Pyramid Vision Transformer] 70 | cfg.MODEL.PVT = CN() 71 | cfg.MODEL.PVT.NAME = "b1" 72 | cfg.MODEL.PVT.OUT_FEATURES = ["p2", "p3", "p4"] 73 | cfg.MODEL.PVT.LINEAR = False 74 | 75 | cfg.MODEL.CSPNET = CN() 76 | cfg.MODEL.CSPNET.NAME = "darknet53" 77 | cfg.MODEL.CSPNET.NORM = "" 78 | # (csp-)darknet: csp1, csp2, csp3, csp4 79 | cfg.MODEL.CSPNET.OUT_FEATURES = ["csp1", "csp2", "csp3", "csp4"] 80 | 81 | -------------------------------------------------------------------------------- /sparseinst/d2_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import atexit 3 | import bisect 4 | import multiprocessing as mp 5 | from collections import deque 6 | import cv2 7 | import torch 8 | 9 | from detectron2.data import MetadataCatalog 10 | from detectron2.engine.defaults import DefaultPredictor 11 | from detectron2.utils.video_visualizer import VideoVisualizer 12 | from detectron2.utils.visualizer import ColorMode, Visualizer 13 | 14 | 15 | class VisualizationDemo(object): 16 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 17 | """ 18 | Args: 19 | cfg (CfgNode): 20 | instance_mode (ColorMode): 21 | parallel (bool): whether to run the model in different processes from visualization. 22 | Useful since the visualization logic can be slow. 23 | """ 24 | self.img_format = cfg.INPUT.FORMAT 25 | self.metadata = MetadataCatalog.get( 26 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 27 | ) 28 | self.cpu_device = torch.device("cpu") 29 | self.instance_mode = instance_mode 30 | 31 | self.parallel = parallel 32 | if parallel: 33 | num_gpu = torch.cuda.device_count() 34 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 35 | else: 36 | self.predictor = DefaultPredictor(cfg) 37 | 38 | def run_on_image(self, image, confidence_threshold): 39 | """ 40 | Args: 41 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 42 | This is the format used by OpenCV. 43 | 44 | Returns: 45 | predictions (dict): the output of the model. 46 | vis_output (VisImage): the visualized image output. 47 | """ 48 | vis_output = None 49 | predictions = self.predictor(image) 50 | visualizer = Visualizer(image, self.metadata, 51 | instance_mode=self.instance_mode) 52 | if "panoptic_seg" in predictions: 53 | panoptic_seg, segments_info = predictions["panoptic_seg"] 54 | vis_output = visualizer.draw_panoptic_seg_predictions( 55 | panoptic_seg.to(self.cpu_device), segments_info 56 | ) 57 | else: 58 | if "sem_seg" in predictions: 59 | vis_output = visualizer.draw_sem_seg( 60 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 61 | ) 62 | if "instances" in predictions: 63 | instances = predictions["instances"].to(self.cpu_device) 64 | instances = instances[instances.scores > confidence_threshold] 65 | predictions["instances"] = instances 66 | vis_output = visualizer.draw_instance_predictions( 67 | predictions=instances) 68 | 69 | return predictions, vis_output 70 | 71 | def _frame_from_video(self, video): 72 | while video.isOpened(): 73 | success, frame = video.read() 74 | if success: 75 | yield frame 76 | else: 77 | break 78 | 79 | def run_on_video(self, video, confidence_threshold): 80 | """ 81 | Visualizes predictions on frames of the input video. 82 | 83 | Args: 84 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 85 | either a webcam or a video file. 86 | 87 | Yields: 88 | ndarray: BGR visualizations of each video frame. 
89 | """ 90 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 91 | 92 | def process_predictions(frame, predictions): 93 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) 94 | if "panoptic_seg" in predictions: 95 | panoptic_seg, segments_info = predictions["panoptic_seg"] 96 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 97 | frame, panoptic_seg.to(self.cpu_device), segments_info 98 | ) 99 | elif "instances" in predictions: 100 | predictions = predictions["instances"].to(self.cpu_device) 101 | predictions = predictions[predictions.scores > 102 | confidence_threshold] 103 | vis_frame = video_visualizer.draw_instance_predictions( 104 | frame, predictions) 105 | elif "sem_seg" in predictions: 106 | vis_frame = video_visualizer.draw_sem_seg( 107 | frame, predictions["sem_seg"].argmax( 108 | dim=0).to(self.cpu_device) 109 | ) 110 | 111 | # Converts Matplotlib RGB format to OpenCV BGR format 112 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 113 | return vis_frame 114 | 115 | frame_gen = self._frame_from_video(video) 116 | if self.parallel: 117 | buffer_size = self.predictor.default_buffer_size 118 | 119 | frame_data = deque() 120 | 121 | for cnt, frame in enumerate(frame_gen): 122 | frame_data.append(frame) 123 | self.predictor.put(frame) 124 | 125 | if cnt >= buffer_size: 126 | frame = frame_data.popleft() 127 | predictions = self.predictor.get() 128 | yield process_predictions(frame, predictions) 129 | 130 | while len(frame_data): 131 | frame = frame_data.popleft() 132 | predictions = self.predictor.get() 133 | yield process_predictions(frame, predictions) 134 | else: 135 | for frame in frame_gen: 136 | yield process_predictions(frame, self.predictor(frame)) 137 | 138 | 139 | class AsyncPredictor: 140 | """ 141 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 142 | Because rendering the visualization takes considerably amount of time, 143 | this helps improve throughput a little bit when rendering videos. 
144 | """ 145 | 146 | class _StopToken: 147 | pass 148 | 149 | class _PredictWorker(mp.Process): 150 | def __init__(self, cfg, task_queue, result_queue): 151 | self.cfg = cfg 152 | self.task_queue = task_queue 153 | self.result_queue = result_queue 154 | super().__init__() 155 | 156 | def run(self): 157 | predictor = DefaultPredictor(self.cfg) 158 | 159 | while True: 160 | task = self.task_queue.get() 161 | if isinstance(task, AsyncPredictor._StopToken): 162 | break 163 | idx, data = task 164 | result = predictor(data) 165 | self.result_queue.put((idx, result)) 166 | 167 | def __init__(self, cfg, num_gpus: int = 1): 168 | """ 169 | Args: 170 | cfg (CfgNode): 171 | num_gpus (int): if 0, will run on CPU 172 | """ 173 | num_workers = max(num_gpus, 1) 174 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 175 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 176 | self.procs = [] 177 | for gpuid in range(max(num_gpus, 1)): 178 | cfg = cfg.clone() 179 | cfg.defrost() 180 | cfg.MODEL.DEVICE = "cuda:{}".format( 181 | gpuid) if num_gpus > 0 else "cpu" 182 | self.procs.append( 183 | AsyncPredictor._PredictWorker( 184 | cfg, self.task_queue, self.result_queue) 185 | ) 186 | 187 | self.put_idx = 0 188 | self.get_idx = 0 189 | self.result_rank = [] 190 | self.result_data = [] 191 | 192 | for p in self.procs: 193 | p.start() 194 | atexit.register(self.shutdown) 195 | 196 | def put(self, image): 197 | self.put_idx += 1 198 | self.task_queue.put((self.put_idx, image)) 199 | 200 | def get(self): 201 | self.get_idx += 1 # the index needed for this request 202 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 203 | res = self.result_data[0] 204 | del self.result_data[0], self.result_rank[0] 205 | return res 206 | 207 | while True: 208 | # make sure the results are returned in the correct order 209 | idx, res = self.result_queue.get() 210 | if idx == self.get_idx: 211 | return res 212 | insert = bisect.bisect(self.result_rank, idx) 213 | self.result_rank.insert(insert, idx) 214 | self.result_data.insert(insert, res) 215 | 216 | def __len__(self): 217 | return self.put_idx - self.get_idx 218 | 219 | def __call__(self, image): 220 | self.put(image) 221 | return self.get() 222 | 223 | def shutdown(self): 224 | for _ in self.procs: 225 | self.task_queue.put(AsyncPredictor._StopToken()) 226 | 227 | @property 228 | def default_buffer_size(self): 229 | return len(self.procs) * 5 -------------------------------------------------------------------------------- /sparseinst/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import numpy as np 4 | import torch 5 | 6 | 7 | from detectron2.data import detection_utils as utils 8 | from detectron2.data import transforms as T 9 | 10 | """ 11 | This file contains the default mapping that's applied to "dataset dicts". 12 | """ 13 | 14 | __all__ = ["SparseInstDatasetMapper"] 15 | 16 | 17 | def build_transform_gen(cfg, is_train): 18 | """ 19 | Create a list of default :class:`Augmentation` from config. 20 | Now it includes resizing and flipping. 
21 | Returns: 22 | list[Augmentation] 23 | """ 24 | augmentation = [] 25 | 26 | if is_train: 27 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 28 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 29 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 30 | else: 31 | min_size = cfg.INPUT.MIN_SIZE_TEST 32 | max_size = cfg.INPUT.MAX_SIZE_TEST 33 | sample_style = "choice" 34 | if is_train and cfg.INPUT.RANDOM_FLIP != "none": 35 | augmentation.append( 36 | T.RandomFlip( 37 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 38 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 39 | ) 40 | ) 41 | if is_train: 42 | augmentation.append( 43 | T.ResizeShortestEdge(min_size, max_size, sample_style) 44 | ) 45 | return augmentation 46 | 47 | 48 | class SparseInstDatasetMapper: 49 | """ 50 | A callable which takes a dataset dict in Detectron2 Dataset format, 51 | and map it into a format used by the model. 52 | This is the default callable to be used to map your dataset dict into training data. 53 | You may need to follow it to implement your own one for customized logic, 54 | such as a different way to read or transform images. 55 | See :doc:`/tutorials/data_loading` for details. 56 | The callable currently does the following: 57 | 1. Read the image from "file_name" 58 | 2. Applies cropping/geometric transforms to the image and annotations 59 | 3. Prepare data and annotations to Tensor and :class:`Instances` 60 | """ 61 | # @classmethod 62 | 63 | def __init__(self, cfg, is_train: bool = True): 64 | augs = build_transform_gen(cfg, is_train) 65 | self.default_aug = T.AugmentationList(augs) 66 | if cfg.INPUT.CROP.ENABLED and is_train: 67 | crop_gen = [ 68 | T.ResizeShortestEdge([400, 500, 600], sample_style='choice'), 69 | T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) 70 | ] 71 | recompute_boxes = cfg.MODEL.MASK_ON 72 | augs = augs[:-1] + crop_gen + augs[-1:] 73 | self.crop_aug = T.AugmentationList(augs) 74 | else: 75 | self.crop_aug = None 76 | recompute_boxes = False 77 | 78 | # self.augs = augs 79 | self.is_train = is_train 80 | self.image_format = cfg.INPUT.FORMAT 81 | self.use_instance_mask = cfg.MODEL.MASK_ON 82 | self.instance_mask_format = cfg.INPUT.MASK_FORMAT 83 | self.recompute_boxes = recompute_boxes 84 | 85 | logger = logging.getLogger(__name__) 86 | mode = "training" if is_train else "inference" 87 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augs}") 88 | 89 | def __call__(self, dataset_dict): 90 | """ 91 | Args: 92 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 93 | Returns: 94 | dict: a format that builtin models in detectron2 accept 95 | """ 96 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 97 | # USER: Write your own image loading if it's not from a file 98 | image = utils.read_image(dataset_dict["file_name"], format=self.image_format) 99 | utils.check_image_size(dataset_dict, image) 100 | 101 | # USER: Remove if you don't do semantic/panoptic segmentation. 
102 | if "sem_seg_file_name" in dataset_dict: 103 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2) 104 | else: 105 | sem_seg_gt = None 106 | 107 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 108 | 109 | if self.crop_aug is None: 110 | transforms = self.default_aug(aug_input) 111 | else: 112 | if np.random.rand() > 0.5: 113 | transforms = self.crop_aug(aug_input) 114 | else: 115 | transforms = self.default_aug(aug_input) 116 | # transforms = self.augmentations(aug_input) 117 | image, sem_seg_gt = aug_input.image, aug_input.sem_seg 118 | 119 | image_shape = image.shape[:2] # h, w 120 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 121 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 122 | # Therefore it's important to use torch.Tensor. 123 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 124 | if sem_seg_gt is not None: 125 | dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) 126 | 127 | if not self.is_train: 128 | # USER: Modify this if you want to keep them for some reason. 129 | dataset_dict.pop("annotations", None) 130 | dataset_dict.pop("sem_seg_file_name", None) 131 | return dataset_dict 132 | 133 | if "annotations" in dataset_dict: 134 | # USER: Modify this if you want to keep them for some reason. 135 | for anno in dataset_dict["annotations"]: 136 | anno.pop("keypoints", None) 137 | if not self.use_instance_mask: 138 | anno.pop("segmentation", None) 139 | 140 | # USER: Implement additional transformations if you have other types of data 141 | annos = [ 142 | utils.transform_instance_annotations( 143 | obj, transforms, image_shape) 144 | for obj in dataset_dict.pop("annotations") 145 | if obj.get("iscrowd", 0) == 0 146 | ] 147 | instances = utils.annotations_to_instances( 148 | annos, image_shape, mask_format=self.instance_mask_format 149 | ) 150 | 151 | # After transforms such as cropping are applied, the bounding box may no longer 152 | # tightly bound the object. As an example, imagine a triangle object 153 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 154 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 155 | # the intersection of original bounding box and the cropping box. 
156 | if self.recompute_boxes: 157 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 158 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 159 | return dataset_dict -------------------------------------------------------------------------------- /sparseinst/decoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import init 5 | import torch.nn.functional as F 6 | from torch.utils.checkpoint import checkpoint 7 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill 8 | 9 | from detectron2.utils.registry import Registry 10 | from detectron2.layers import Conv2d 11 | 12 | SPARSE_INST_DECODER_REGISTRY = Registry("SPARSE_INST_DECODER") 13 | SPARSE_INST_DECODER_REGISTRY.__doc__ = "registry for SparseInst decoder" 14 | 15 | def _make_stack_3x3_convs(num_convs, in_channels, out_channels): 16 | convs = [] 17 | for _ in range(num_convs): 18 | convs.append( 19 | Conv2d(in_channels, out_channels, 3, padding=1)) 20 | convs.append(nn.ReLU(True)) 21 | in_channels = out_channels 22 | return nn.Sequential(*convs) 23 | 24 | 25 | class InstanceBranch(nn.Module): 26 | 27 | def __init__(self, cfg, in_channels): 28 | super().__init__() 29 | # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM 30 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 31 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 32 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 33 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 34 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 35 | 36 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 37 | # iam prediction, a simple conv 38 | self.iam_conv = nn.Conv2d(dim, num_masks, 3, padding=1) 39 | 40 | # outputs 41 | self.cls_score = nn.Linear(dim, self.num_classes) 42 | self.mask_kernel = nn.Linear(dim, kernel_dim) 43 | self.objectness = nn.Linear(dim, 1) 44 | 45 | self.prior_prob = 0.01 46 | self._init_weights() 47 | 48 | def _init_weights(self): 49 | for m in self.inst_convs.modules(): 50 | if isinstance(m, nn.Conv2d): 51 | c2_msra_fill(m) 52 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) 53 | for module in [self.iam_conv, self.cls_score]: 54 | init.constant_(module.bias, bias_value) 55 | init.normal_(self.iam_conv.weight, std=0.01) 56 | init.normal_(self.cls_score.weight, std=0.01) 57 | 58 | init.normal_(self.mask_kernel.weight, std=0.01) 59 | init.constant_(self.mask_kernel.bias, 0.0) 60 | 61 | def forward(self, features): 62 | # instance features (x4 convs) 63 | features = self.inst_convs(features) 64 | # predict instance activation maps 65 | iam = self.iam_conv(features) 66 | iam_prob = iam.sigmoid() 67 | 68 | B, N = iam_prob.shape[:2] 69 | C = features.size(1) 70 | # BxNxHxW -> BxNx(HW) 71 | iam_prob = iam_prob.view(B, N, -1) 72 | # aggregate features: BxCxHxW -> Bx(HW)xC 73 | inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1)) 74 | normalizer = iam_prob.sum(-1).clamp(min=1e-6) 75 | inst_features = inst_features / normalizer[:, :, None] 76 | # predict classification & segmentation kernel & objectness 77 | pred_logits = self.cls_score(inst_features) 78 | pred_kernel = self.mask_kernel(inst_features) 79 | pred_scores = self.objectness(inst_features) 80 | return pred_logits, pred_kernel, pred_scores, iam 81 | 82 | 83 | class MaskBranch(nn.Module): 84 | 85 | def __init__(self, cfg, in_channels): 86 | super().__init__() 87 | dim = 
cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM 88 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS 89 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 90 | self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 91 | self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1) 92 | self._init_weights() 93 | 94 | def _init_weights(self): 95 | for m in self.mask_convs.modules(): 96 | if isinstance(m, nn.Conv2d): 97 | c2_msra_fill(m) 98 | c2_msra_fill(self.projection) 99 | 100 | def forward(self, features): 101 | # mask features (x4 convs) 102 | features = self.mask_convs(features) 103 | # features = checkpoint(self.mask_convs,features) 104 | return self.projection(features) 105 | 106 | 107 | @SPARSE_INST_DECODER_REGISTRY.register() 108 | class BaseIAMDecoder(nn.Module): 109 | 110 | def __init__(self, cfg): 111 | super().__init__() 112 | # add 2 for coordinates 113 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 114 | 115 | self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR 116 | self.output_iam = cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM 117 | 118 | self.inst_branch = InstanceBranch(cfg, in_channels) 119 | self.mask_branch = MaskBranch(cfg, in_channels) 120 | 121 | @torch.no_grad() 122 | def compute_coordinates(self, x): 123 | h, w = x.size(2), x.size(3) 124 | input_1 = -1 125 | input_1 = int(input_1) 126 | input_3 = 1 127 | input_3= int(input_3) 128 | input_2 = h 129 | input_2= int(input_2) 130 | input_4 = w 131 | input_4= int(input_4) 132 | 133 | y_loc = torch.linspace(input_1, input_3, input_2, device=x.device) 134 | x_loc = torch.linspace(input_1, input_3, input_4, device=x.device) 135 | #y_loc = torch.arange(-1, 1+(2/h), 2/(h-1), device=x.device) 136 | #x_loc = torch.arange(-1, 1+(2/w), 2/(w-1), device=x.device) 137 | y_loc, x_loc = torch.meshgrid(y_loc, x_loc) 138 | y_loc = y_loc.expand([x.shape[0], 1, -1, -1]) 139 | x_loc = x_loc.expand([x.shape[0], 1, -1, -1]) 140 | locations = torch.cat([x_loc, y_loc], 1) 141 | return locations.to(x) 142 | 143 | def forward(self, features): 144 | coord_features = self.compute_coordinates(features) 145 | features = torch.cat([coord_features, features], dim=1) 146 | pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features) 147 | mask_features = self.mask_branch(features) 148 | 149 | N = pred_kernel.shape[1] 150 | # mask_features: BxCxHxW 151 | B, C, H, W = mask_features.shape 152 | pred_masks = torch.bmm(pred_kernel, mask_features.view(B, C, H * W)).view(B, N, H, W) 153 | 154 | pred_masks = F.interpolate( 155 | pred_masks, scale_factor=self.scale_factor, 156 | mode='bilinear', align_corners=False) 157 | 158 | output = { 159 | "pred_logits": pred_logits, 160 | "pred_masks": pred_masks, 161 | "pred_scores": pred_scores, 162 | } 163 | 164 | if self.output_iam: 165 | iam = F.interpolate(iam, scale_factor=self.scale_factor, 166 | mode='bilinear', align_corners=False) 167 | output['pred_iam'] = iam 168 | 169 | return output 170 | 171 | 172 | class GroupInstanceBranch(nn.Module): 173 | 174 | def __init__(self, cfg, in_channels): 175 | super().__init__() 176 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 177 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 178 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 179 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 180 | self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS 181 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 182 | 183 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 184 | # iam 
prediction, a group conv 185 | expand_dim = dim * self.num_groups 186 | self.iam_conv = nn.Conv2d(dim, num_masks * self.num_groups, 3, padding=1, groups=self.num_groups) 187 | # outputs 188 | self.fc = nn.Linear(expand_dim, expand_dim) 189 | 190 | self.cls_score = nn.Linear(expand_dim, self.num_classes) 191 | self.mask_kernel = nn.Linear(expand_dim, kernel_dim) 192 | self.objectness = nn.Linear(expand_dim, 1) 193 | 194 | self.prior_prob = 0.01 195 | self._init_weights() 196 | 197 | def _init_weights(self): 198 | for m in self.inst_convs.modules(): 199 | if isinstance(m, nn.Conv2d): 200 | c2_msra_fill(m) 201 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) 202 | for module in [self.iam_conv, self.cls_score]: 203 | init.constant_(module.bias, bias_value) 204 | init.normal_(self.iam_conv.weight, std=0.01) 205 | init.normal_(self.cls_score.weight, std=0.01) 206 | 207 | init.normal_(self.mask_kernel.weight, std=0.01) 208 | init.constant_(self.mask_kernel.bias, 0.0) 209 | c2_xavier_fill(self.fc) 210 | 211 | def forward(self, features): 212 | # instance features (x4 convs) 213 | features = self.inst_convs(features) 214 | # predict instance activation maps 215 | iam = self.iam_conv(features) 216 | iam_prob = iam.sigmoid() 217 | 218 | B, N = iam_prob.shape[:2] 219 | C = features.size(1) 220 | # BxNxHxW -> BxNx(HW) 221 | iam_prob = iam_prob.view(B, N, -1) 222 | # aggregate features: BxCxHxW -> Bx(HW)xC 223 | inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1)) 224 | normalizer = iam_prob.sum(-1).clamp(min=1e-6) 225 | inst_features = inst_features / normalizer[:, :, None] 226 | 227 | inst_features = inst_features.reshape( 228 | B, 4, N // 4, -1).transpose(1, 2).reshape(B, N // 4, -1) 229 | 230 | inst_features = F.relu_(self.fc(inst_features)) 231 | # predict classification & segmentation kernel & objectness 232 | pred_logits = self.cls_score(inst_features) 233 | pred_kernel = self.mask_kernel(inst_features) 234 | pred_scores = self.objectness(inst_features) 235 | return pred_logits, pred_kernel, pred_scores, iam 236 | 237 | 238 | 239 | @SPARSE_INST_DECODER_REGISTRY.register() 240 | class GroupIAMDecoder(BaseIAMDecoder): 241 | 242 | def __init__(self, cfg): 243 | super().__init__(cfg) 244 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 245 | self.inst_branch = GroupInstanceBranch(cfg, in_channels) 246 | 247 | 248 | 249 | def build_sparse_inst_decoder(cfg): 250 | name = cfg.MODEL.SPARSE_INST.DECODER.NAME 251 | return SPARSE_INST_DECODER_REGISTRY.get(name)(cfg) 252 | -------------------------------------------------------------------------------- /sparseinst/encoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | import torch.nn.functional as F 6 | 7 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill 8 | 9 | from detectron2.utils.registry import Registry 10 | from detectron2.layers import Conv2d 11 | 12 | SPARSE_INST_ENCODER_REGISTRY = Registry("SPARSE_INST_ENCODER") 13 | SPARSE_INST_ENCODER_REGISTRY.__doc__ = "registry for SparseInst decoder" 14 | 15 | 16 | class PyramidPoolingModule(nn.Module): 17 | 18 | def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)): 19 | super().__init__() 20 | self.stages = [] 21 | self.stages = nn.ModuleList( 22 | [self._make_stage(in_channels, channels, size) for size in sizes] 23 | ) 24 | self.bottleneck = Conv2d( 25 | in_channels + len(sizes) * channels, in_channels, 
1) 26 | 27 | def _make_stage(self, features, out_features, size): 28 | # prior = nn.AdaptiveAvgPool2d(output_size=(size, size)) 29 | stride = np.floor(10/size).astype(np.int32) 30 | kernel = 10-(size-1)*stride 31 | # print(size, stride, kernel) 32 | prior = torch.nn.AvgPool2d(kernel_size=kernel,stride=stride) 33 | conv = Conv2d(features, out_features, 1) 34 | return nn.Sequential(prior, conv) 35 | 36 | def forward(self, feats): 37 | h, w = feats.size(2), feats.size(3) 38 | priors = [F.interpolate(input=F.relu_(stage(feats)), size=( 39 | h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats] 40 | out = F.relu_(self.bottleneck(torch.cat(priors, 1))) 41 | return out 42 | 43 | 44 | @SPARSE_INST_ENCODER_REGISTRY.register() 45 | class InstanceContextEncoder(nn.Module): 46 | """ 47 | Instance Context Encoder 48 | 1. construct feature pyramids from ResNet 49 | 2. enlarge receptive fields (ppm) 50 | 3. multi-scale fusion 51 | """ 52 | 53 | def __init__(self, cfg, input_shape): 54 | super().__init__() 55 | self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS 56 | self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES 57 | # self.norm = cfg.MODEL.SPARSE_INST.ENCODER.NORM 58 | # depthwise = cfg.MODEL.SPARSE_INST.ENCODER.DEPTHWISE 59 | self.in_channels = [input_shape[f].channels for f in self.in_features] 60 | # self.using_bias = self.norm == "" 61 | fpn_laterals = [] 62 | fpn_outputs = [] 63 | # groups = self.num_channels if depthwise else 1 64 | for in_channel in reversed(self.in_channels): 65 | lateral_conv = Conv2d(in_channel, self.num_channels, 1) 66 | output_conv = Conv2d(self.num_channels, self.num_channels, 3, padding=1) 67 | c2_xavier_fill(lateral_conv) 68 | c2_xavier_fill(output_conv) 69 | fpn_laterals.append(lateral_conv) 70 | fpn_outputs.append(output_conv) 71 | self.fpn_laterals = nn.ModuleList(fpn_laterals) 72 | self.fpn_outputs = nn.ModuleList(fpn_outputs) 73 | # ppm 74 | self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4) 75 | # final fusion 76 | self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1) 77 | c2_msra_fill(self.fusion) 78 | 79 | def forward(self, features): 80 | features = [features[f] for f in self.in_features] 81 | features = features[::-1] 82 | prev_features = self.ppm(self.fpn_laterals[0](features[0])) 83 | outputs = [self.fpn_outputs[0](prev_features)] 84 | for feature, lat_conv, output_conv in zip(features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:]): 85 | lat_features = lat_conv(feature) 86 | top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode='nearest') 87 | prev_features = lat_features + top_down_features 88 | outputs.insert(0, output_conv(prev_features)) 89 | size = outputs[0].shape[2:] 90 | features = [ 91 | outputs[0]] + [F.interpolate(x, size, mode='bilinear', align_corners=False) for x in outputs[1:]] 92 | features = self.fusion(torch.cat(features, dim=1)) 93 | return features 94 | 95 | 96 | def build_sparse_inst_encoder(cfg, input_shape): 97 | name = cfg.MODEL.SPARSE_INST.ENCODER.NAME 98 | return SPARSE_INST_ENCODER_REGISTRY.get(name)(cfg, input_shape) -------------------------------------------------------------------------------- /sparseinst/input.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leandro-svg/SparseInst_TensorRT/eeb43a82f9690d57f212fc81ac1f1c50445239b6/sparseinst/input.ppm -------------------------------------------------------------------------------- 
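A minimal sketch of how the encoder above slots between the backbone and the decoder (assumptions: the script is run from the repository root, add_sparse_inst_config has populated the SPARSE_INST config keys, and the modules are randomly initialised since no checkpoint is loaded):

import torch
from detectron2.config import get_cfg
from detectron2.modeling import build_backbone
from sparseinst import add_sparse_inst_config, build_sparse_inst_encoder, build_sparse_inst_decoder

cfg = get_cfg()
add_sparse_inst_config(cfg)
cfg.merge_from_file("configs/sparse_inst_r50_giam.yaml")  # any config from configs/ works here

backbone = build_backbone(cfg)
encoder = build_sparse_inst_encoder(cfg, backbone.output_shape())
decoder = build_sparse_inst_decoder(cfg)

x = torch.randn(1, 3, 640, 640)   # padded, normalized input batch (H and W divisible by 32)
feats = backbone(x)               # dict of feature maps keyed by ENCODER.IN_FEATURES
feats = encoder(feats)            # single fused map with ENCODER.NUM_CHANNELS channels
out = decoder(feats)              # {"pred_logits", "pred_masks", "pred_scores", ...}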
/sparseinst/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.cuda.amp import autocast 7 | from scipy.optimize import linear_sum_assignment 8 | from fvcore.nn import sigmoid_focal_loss_jit 9 | 10 | from detectron2.utils.registry import Registry 11 | 12 | from .utils import nested_masks_from_list, is_dist_avail_and_initialized, get_world_size 13 | 14 | SPARSE_INST_MATCHER_REGISTRY = Registry("SPARSE_INST_MATCHER") 15 | SPARSE_INST_MATCHER_REGISTRY.__doc__ = "Matcher for SparseInst" 16 | SPARSE_INST_CRITERION_REGISTRY = Registry("SPARSE_INST_CRITERION") 17 | SPARSE_INST_CRITERION_REGISTRY.__doc__ = "Criterion for SparseInst" 18 | 19 | 20 | def compute_mask_iou(inputs, targets): 21 | inputs = inputs.sigmoid() 22 | # thresholding 23 | binarized_inputs = (inputs >= 0.4).float() 24 | targets = (targets > 0.5).float() 25 | intersection = (binarized_inputs * targets).sum(-1) 26 | union = targets.sum(-1) + binarized_inputs.sum(-1) - intersection 27 | score = intersection / (union + 1e-6) 28 | return score 29 | 30 | 31 | def dice_score(inputs, targets): 32 | inputs = inputs.sigmoid() 33 | numerator = 2 * torch.matmul(inputs, targets.t()) 34 | denominator = ( 35 | inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1) 36 | score = numerator / (denominator + 1e-4) 37 | return score 38 | 39 | 40 | def dice_loss(inputs, targets, reduction='sum'): 41 | inputs = inputs.sigmoid() 42 | assert inputs.shape == targets.shape 43 | numerator = 2 * (inputs * targets).sum(1) 44 | denominator = (inputs * inputs).sum(-1) + (targets * targets).sum(-1) 45 | loss = 1 - (numerator) / (denominator + 1e-4) 46 | if reduction == 'none': 47 | return loss 48 | return loss.sum() 49 | 50 | 51 | @SPARSE_INST_CRITERION_REGISTRY.register() 52 | class SparseInstCriterion(nn.Module): 53 | # This part is partially derivated from: https://github.com/facebookresearch/detr/blob/main/models/detr.py 54 | 55 | def __init__(self, cfg, matcher): 56 | super().__init__() 57 | self.matcher = matcher 58 | self.losses = cfg.MODEL.SPARSE_INST.LOSS.ITEMS 59 | self.weight_dict = self.get_weight_dict(cfg) 60 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 61 | 62 | def get_weight_dict(self, cfg): 63 | losses = ("loss_ce", "loss_mask", "loss_dice", "loss_objectness") 64 | weight_dict = {} 65 | ce_weight = cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT 66 | mask_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT 67 | dice_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT 68 | objectness_weight = cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT 69 | 70 | weight_dict = dict( 71 | zip(losses, (ce_weight, mask_weight, dice_weight, objectness_weight))) 72 | return weight_dict 73 | 74 | def _get_src_permutation_idx(self, indices): 75 | # permute predictions following indices 76 | batch_idx = torch.cat([torch.full_like(src, i) 77 | for i, (src, _) in enumerate(indices)]) 78 | src_idx = torch.cat([src for (src, _) in indices]) 79 | return batch_idx, src_idx 80 | 81 | def _get_tgt_permutation_idx(self, indices): 82 | # permute targets following indices 83 | batch_idx = torch.cat([torch.full_like(tgt, i) 84 | for i, (_, tgt) in enumerate(indices)]) 85 | tgt_idx = torch.cat([tgt for (_, tgt) in indices]) 86 | return batch_idx, tgt_idx 87 | 88 | def loss_labels(self, outputs, targets, indices, num_instances, input_shape=None): 89 | 
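# classification loss: scatter the matched ground-truth classes into a one-hot
# target of the same shape as pred_logits, then apply sigmoid focal loss
# (alpha=0.25, gamma=2.0) normalized by the number of target instances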
assert "pred_logits" in outputs 90 | src_logits = outputs['pred_logits'] 91 | idx = self._get_src_permutation_idx(indices) 92 | target_classes_o = torch.cat([t["labels"][J] 93 | for t, (_, J) in zip(targets, indices)]) 94 | target_classes = torch.full(src_logits.shape[:2], self.num_classes, 95 | dtype=torch.int64, device=src_logits.device) 96 | target_classes[idx] = target_classes_o 97 | 98 | src_logits = src_logits.flatten(0, 1) 99 | # prepare one_hot target. 100 | target_classes = target_classes.flatten(0, 1) 101 | pos_inds = torch.nonzero( 102 | target_classes != self.num_classes, as_tuple=True)[0] 103 | labels = torch.zeros_like(src_logits) 104 | labels[pos_inds, target_classes[pos_inds]] = 1 105 | # comp focal loss. 106 | class_loss = sigmoid_focal_loss_jit( 107 | src_logits, 108 | labels, 109 | alpha=0.25, 110 | gamma=2.0, 111 | reduction="sum", 112 | ) / num_instances 113 | losses = {'loss_ce': class_loss} 114 | return losses 115 | 116 | def loss_masks_with_iou_objectness(self, outputs, targets, indices, num_instances, input_shape): 117 | src_idx = self._get_src_permutation_idx(indices) 118 | tgt_idx = self._get_tgt_permutation_idx(indices) 119 | # Bx100xHxW 120 | assert "pred_masks" in outputs 121 | assert "pred_scores" in outputs 122 | src_iou_scores = outputs["pred_scores"] 123 | src_masks = outputs["pred_masks"] 124 | with torch.no_grad(): 125 | target_masks, _ = nested_masks_from_list( 126 | [t["masks"].tensor for t in targets], input_shape).decompose() 127 | num_masks = [len(t["masks"]) for t in targets] 128 | target_masks = target_masks.to(src_masks) 129 | if len(target_masks) == 0: 130 | losses = { 131 | "loss_dice": src_masks.sum() * 0.0, 132 | "loss_mask": src_masks.sum() * 0.0, 133 | "loss_objectness": src_iou_scores.sum() * 0.0 134 | } 135 | return losses 136 | 137 | src_masks = src_masks[src_idx] 138 | target_masks = F.interpolate( 139 | target_masks[:, None], size=src_masks.shape[-2:], mode='bilinear', align_corners=False).squeeze(1) 140 | 141 | src_masks = src_masks.flatten(1) 142 | # FIXME: tgt_idx 143 | mix_tgt_idx = torch.zeros_like(tgt_idx[1]) 144 | cum_sum = 0 145 | for num_mask in num_masks: 146 | mix_tgt_idx[cum_sum: cum_sum + num_mask] = cum_sum 147 | cum_sum += num_mask 148 | mix_tgt_idx += tgt_idx[1] 149 | 150 | target_masks = target_masks[mix_tgt_idx].flatten(1) 151 | 152 | with torch.no_grad(): 153 | ious = compute_mask_iou(src_masks, target_masks) 154 | 155 | tgt_iou_scores = ious 156 | src_iou_scores = src_iou_scores[src_idx] 157 | tgt_iou_scores = tgt_iou_scores.flatten(0) 158 | src_iou_scores = src_iou_scores.flatten(0) 159 | 160 | losses = { 161 | "loss_objectness": F.binary_cross_entropy_with_logits(src_iou_scores, tgt_iou_scores, reduction='mean'), 162 | "loss_dice": dice_loss(src_masks, target_masks) / num_instances, 163 | "loss_mask": F.binary_cross_entropy_with_logits(src_masks, target_masks, reduction='mean') 164 | } 165 | return losses 166 | 167 | def get_loss(self, loss, outputs, targets, indices, num_instances, **kwargs): 168 | loss_map = { 169 | "labels": self.loss_labels, 170 | "masks": self.loss_masks_with_iou_objectness, 171 | } 172 | if loss == "loss_objectness": 173 | # NOTE: loss_objectness will be calculated in `loss_masks_with_iou_objectness` 174 | return {} 175 | assert loss in loss_map 176 | return loss_map[loss](outputs, targets, indices, num_instances, **kwargs) 177 | 178 | def forward(self, outputs, targets, input_shape): 179 | 180 | outputs_without_aux = {k: v for k, 181 | v in outputs.items() if k != 'aux_outputs'} 182 | 183 | 
# Retrieve the matching between the outputs of the last layer and the targets 184 | indices = self.matcher(outputs_without_aux, targets, input_shape) 185 | # Compute the average number of target boxes accross all nodes, for normalization purposes 186 | num_instances = sum(len(t["labels"]) for t in targets) 187 | num_instances = torch.as_tensor( 188 | [num_instances], dtype=torch.float, device=next(iter(outputs.values())).device) 189 | if is_dist_avail_and_initialized(): 190 | torch.distributed.all_reduce(num_instances) 191 | num_instances = torch.clamp( 192 | num_instances / get_world_size(), min=1).item() 193 | # Compute all the requested losses 194 | losses = {} 195 | for loss in self.losses: 196 | losses.update(self.get_loss(loss, outputs, targets, indices, 197 | num_instances, input_shape=input_shape)) 198 | 199 | for k in losses.keys(): 200 | if k in self.weight_dict: 201 | losses[k] *= self.weight_dict[k] 202 | 203 | return losses 204 | 205 | 206 | @SPARSE_INST_MATCHER_REGISTRY.register() 207 | class SparseInstMatcherV1(nn.Module): 208 | 209 | def __init__(self, cfg): 210 | super().__init__() 211 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA 212 | self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA 213 | self.mask_score = dice_score 214 | 215 | @torch.no_grad() 216 | def forward(self, outputs, targets, input_shape): 217 | B, N, H, W = outputs["pred_masks"].shape 218 | pred_masks = outputs['pred_masks'] 219 | pred_logits = outputs['pred_logits'].sigmoid() 220 | 221 | indices = [] 222 | 223 | for i in range(B): 224 | tgt_ids = targets[i]["labels"] 225 | # no annotations 226 | if tgt_ids.shape[0] == 0: 227 | indices.append((torch.as_tensor([]), 228 | torch.as_tensor([]))) 229 | continue 230 | 231 | tgt_masks = targets[i]['masks'].tensor.to(pred_masks) 232 | pred_logit = pred_logits[i] 233 | out_masks = pred_masks[i] 234 | 235 | # upsampling: 236 | # (1) padding/ 237 | # (2) upsampling to 1x input size (input_shape) 238 | # (3) downsampling to 0.25x input size (output mask size) 239 | ori_h, ori_w = tgt_masks.size(1), tgt_masks.size(2) 240 | tgt_masks_ = torch.zeros( 241 | (1, tgt_masks.size(0), input_shape[0], input_shape[1])).to(pred_masks) 242 | tgt_masks_[0, :, :ori_h, :ori_w] = tgt_masks 243 | tgt_masks = F.interpolate( 244 | tgt_masks_, size=out_masks.shape[-2:], mode='bilinear', align_corners=False)[0] 245 | 246 | # compute dice score and classification score 247 | tgt_masks = tgt_masks.flatten(1) 248 | out_masks = out_masks.flatten(1) 249 | 250 | mask_score = self.mask_score(out_masks, tgt_masks) 251 | # Nx(Number of gts) 252 | matching_prob = pred_logit[:, tgt_ids] 253 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta) 254 | # hungarian matching 255 | inds = linear_sum_assignment(C.cpu(), maximize=True) 256 | indices.append(inds) 257 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 258 | 259 | 260 | @SPARSE_INST_MATCHER_REGISTRY.register() 261 | class SparseInstMatcher(nn.Module): 262 | 263 | def __init__(self, cfg): 264 | super().__init__() 265 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA 266 | self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA 267 | self.mask_score = dice_score 268 | 269 | def forward(self, outputs, targets, input_shape): 270 | with torch.no_grad(): 271 | B, N, H, W = outputs["pred_masks"].shape 272 | pred_masks = outputs['pred_masks'] 273 | pred_logits = outputs['pred_logits'].sigmoid() 274 | 275 | tgt_ids = torch.cat([v["labels"] for v in targets]) 276 | 277 | if tgt_ids.shape[0] == 
0: 278 | return [(torch.as_tensor([]).to(pred_logits), torch.as_tensor([]).to(pred_logits))] * B 279 | tgt_masks, _ = nested_masks_from_list( 280 | [t["masks"].tensor for t in targets], input_shape).decompose() 281 | device = pred_masks.device 282 | tgt_masks = tgt_masks.to(pred_masks) 283 | 284 | tgt_masks = F.interpolate( 285 | tgt_masks[:, None], size=pred_masks.shape[-2:], mode="bilinear", align_corners=False).squeeze(1) 286 | 287 | pred_masks = pred_masks.view(B * N, -1) 288 | tgt_masks = tgt_masks.flatten(1) 289 | with autocast(enabled=False): 290 | pred_masks = pred_masks.float() 291 | tgt_masks = tgt_masks.float() 292 | pred_logits = pred_logits.float() 293 | mask_score = self.mask_score(pred_masks, tgt_masks) 294 | # Nx(Number of gts) 295 | matching_prob = pred_logits.view(B * N, -1)[:, tgt_ids] 296 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta) 297 | 298 | C = C.view(B, N, -1).cpu() 299 | # hungarian matching 300 | sizes = [len(v["masks"]) for v in targets] 301 | indices = [linear_sum_assignment(c[i], maximize=True) 302 | for i, c in enumerate(C.split(sizes, -1))] 303 | indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor( 304 | j, dtype=torch.int64)) for i, j in indices] 305 | return indices 306 | 307 | 308 | def build_sparse_inst_matcher(cfg): 309 | name = cfg.MODEL.SPARSE_INST.MATCHER.NAME 310 | return SPARSE_INST_MATCHER_REGISTRY.get(name)(cfg) 311 | 312 | 313 | def build_sparse_inst_criterion(cfg): 314 | matcher = build_sparse_inst_matcher(cfg) 315 | name = cfg.MODEL.SPARSE_INST.LOSS.NAME 316 | return SPARSE_INST_CRITERION_REGISTRY.get(name)(cfg, matcher) 317 | -------------------------------------------------------------------------------- /sparseinst/sparseinst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. 
All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from detectron2.modeling import build_backbone 8 | from detectron2.structures import ImageList, Instances, BitMasks 9 | from detectron2.modeling import META_ARCH_REGISTRY 10 | import numpy as np 11 | from .encoder import build_sparse_inst_encoder 12 | from .decoder import build_sparse_inst_decoder 13 | from .loss import build_sparse_inst_criterion 14 | from .utils import nested_tensor_from_tensor_list 15 | 16 | __all__ = ["SparseInst"] 17 | 18 | 19 | @torch.jit.script 20 | def rescoring_mask(scores, mask_pred, masks): 21 | mask_pred_ = mask_pred.float() 22 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]).double() + 1e-6).float()) 23 | 24 | 25 | @META_ARCH_REGISTRY.register() 26 | class SparseInst(nn.Module): 27 | 28 | def __init__(self, cfg): 29 | super().__init__() 30 | 31 | # move to target device 32 | self.device = torch.device(cfg.MODEL.DEVICE) 33 | self.use_cp = True  # only referenced by the commented-out checkpointing path in forward_test_3 34 | 35 | print("Device in use:", torch.cuda.get_device_name()) 36 | # backbone 37 | self.backbone = build_backbone(cfg) 38 | self.size_divisibility = self.backbone.size_divisibility 39 | output_shape = self.backbone.output_shape() 40 | 41 | # encoder & decoder 42 | self.encoder = build_sparse_inst_encoder(cfg, output_shape) 43 | self.decoder = build_sparse_inst_decoder(cfg) 44 | 45 | # matcher & loss (matcher is built in loss) 46 | self.criterion = build_sparse_inst_criterion(cfg) 47 | 48 | # data and preprocessing 49 | self.mask_format = cfg.INPUT.MASK_FORMAT 50 | 51 | self.pixel_mean = torch.Tensor( 52 | cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 53 | self.pixel_std = torch.Tensor( 54 | cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 55 | # self.normalizer = lambda x: (x - pixel_mean) / pixel_std 56 | 57 | # inference 58 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD 59 | self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD 60 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS 61 | 62 | def normalizer(self, image): 63 | image = (image - self.pixel_mean) / self.pixel_std 64 | return image 65 | 66 | def preprocess_inputs(self, batched_inputs): 67 | images = [x["image"].to(self.device) for x in batched_inputs] 68 | images = [self.normalizer(x) for x in images] 69 | images = ImageList.from_tensors(images, 32) 70 | return images 71 | 72 | def prepare_targets(self, targets): 73 | new_targets = [] 74 | for targets_per_image in targets: 75 | target = {} 76 | gt_classes = targets_per_image.gt_classes 77 | target["labels"] = gt_classes.to(self.device) 78 | h, w = targets_per_image.image_size 79 | if not targets_per_image.has('gt_masks'): 80 | gt_masks = BitMasks(torch.empty(0, h, w)) 81 | else: 82 | gt_masks = targets_per_image.gt_masks 83 | if self.mask_format == "polygon": 84 | if len(gt_masks.polygons) == 0: 85 | gt_masks = BitMasks(torch.empty(0, h, w)) 86 | else: 87 | gt_masks = BitMasks.from_polygon_masks( 88 | gt_masks.polygons, h, w) 89 | 90 | target["masks"] = gt_masks.to(self.device) 91 | new_targets.append(target) 92 | 93 | return new_targets 94 | 95 | def forward(self, batched_inputs): 96 | images = self.preprocess_inputs(batched_inputs) 97 | if isinstance(images, (list, torch.Tensor)): 98 | images = nested_tensor_from_tensor_list(images) 99 | max_shape = images.tensor.shape[2:] 100 | features = self.backbone(images.tensor) 101 | features = self.encoder(features) 102 | output = self.decoder(features) 103 | 104 | if 
self.training: 105 | gt_instances = [x["instances"].to( 106 | self.device) for x in batched_inputs] 107 | targets = self.prepare_targets(gt_instances) 108 | losses = self.criterion(output, targets, max_shape) 109 | return losses 110 | else: 111 | results = self.inference( 112 | output, batched_inputs, max_shape, images.image_sizes) 113 | processed_results = [{"instances": r} for r in results] 114 | return processed_results 115 | 116 | def forward_test_3(self, images): 117 | # images = self.preprocess_inputs(batched_inputs) 118 | # if isinstance(images, (list, torch.Tensor)): 119 | # images = nested_tensor_from_tensor_list(images) 120 | max_shape = images.shape[2:] 121 | # forward 122 | # if self.use_cp: 123 | # features = self.backbone(images.tensor) 124 | # features = checkpoint(self.encoder,features) 125 | # output = self.decoder(features) 126 | # else: 127 | features = self.backbone(images) 128 | features = self.encoder(features) 129 | output = self.decoder(features) 130 | 131 | if self.training: 132 | # this entry point takes a plain image tensor and has no targets, 133 | # so training is not supported here; use forward() instead 134 | raise NotImplementedError( 135 | "forward_test_3 is inference-only; use SparseInst.forward() for training") 136 | else: 137 | results = self.inference_test_3(output, images) 138 | # import pdb; pdb.set_trace() 139 | # processed_results = [{"instances": r} for r in results] 140 | 141 | out_scores = torch.cat([r.scores.unsqueeze(0) for r in results], dim=0) 142 | out_pred_classes = torch.cat([r.pred_classes.unsqueeze(0) for r in results], dim=0) 143 | out_pred_masks = torch.cat([r.pred_masks for r in results], dim=0) 144 | return (out_scores, out_pred_classes, out_pred_masks) 145 | 146 | 147 | def inference(self, output, batched_inputs, max_shape, image_sizes): 148 | # max_detections = self.max_detections 149 | results = [] 150 | pred_scores = output["pred_logits"].sigmoid() 151 | pred_masks = output["pred_masks"].sigmoid() 152 | pred_objectness = output["pred_scores"].sigmoid() 153 | pred_scores = torch.sqrt(pred_scores * pred_objectness) 154 | for _, (scores_per_image, mask_pred_per_image, batched_input, img_shape) in enumerate(zip( 155 | pred_scores, pred_masks, batched_inputs, image_sizes)): 156 | 157 | ori_shape = (batched_input["height"], batched_input["width"]) 158 | result = Instances(ori_shape) 159 | # max/argmax 160 | scores, labels = scores_per_image.max(dim=-1) 161 | # cls threshold 162 | keep = scores > self.cls_threshold 163 | scores = scores[keep] 164 | labels = labels[keep] 165 | mask_pred_per_image = mask_pred_per_image[keep] 166 | if scores.size(0) == 0: 167 | result.scores = scores 168 | result.pred_classes = labels 169 | results.append(result) 170 | continue 171 | 172 | h, w = img_shape 173 | # rescoring mask using maskness 174 | scores = rescoring_mask( 175 | scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image) 176 | # upsample the masks to the original resolution: 177 | # (1) upsampling the masks to the padded inputs, remove the padding area 178 | # (2) upsampling/downsampling the masks to the original sizes 179 | 180 | mask_pred_per_image = F.interpolate( 181 | mask_pred_per_image.unsqueeze(1), size=max_shape, mode="bilinear", align_corners=False)[:, :, :h, :w] 182 | mask_pred_per_image = F.interpolate( 183 | mask_pred_per_image, size=ori_shape, mode='bilinear', align_corners=False).squeeze(1) 184 | 185 | mask_pred = mask_pred_per_image > self.mask_threshold 186 | # fix the bug for visualization 187 | # mask_pred = 
BitMasks(mask_pred) 188 | 189 | # using Detectron2 Instances to store the final results 190 | result.pred_masks = mask_pred 191 | result.scores = scores 192 | result.pred_classes = labels 193 | results.append(result) 194 | return results 195 | 196 | def inference_test_3(self, output, images): 197 | # max_detections = self.max_detections 198 | results = [] 199 | pred_scores = output["pred_logits"].sigmoid() 200 | pred_masks = output["pred_masks"].sigmoid() 201 | pred_objectness = output["pred_scores"].sigmoid() 202 | pred_scores = torch.sqrt(pred_scores * pred_objectness) 203 | 204 | for _, (scores_per_image, mask_pred_per_image, image) in enumerate(zip( 205 | pred_scores, pred_masks, images)): 206 | 207 | shape = image.shape[1:] 208 | result = Instances(shape) 209 | 210 | scores, labels = scores_per_image.max(dim=-1) 211 | 212 | if scores.size(0) == 0: 213 | result.scores = scores 214 | result.pred_classes = labels 215 | results.append(result) 216 | continue 217 | 218 | h, w = shape 219 | # rescoring mask using maskness 220 | scores = rescoring_mask(scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image) 221 | # using Detectron2 Instances to store the final results 222 | 223 | result.pred_masks = mask_pred_per_image #mask_pred 224 | result.scores = scores 225 | result.pred_classes = labels 226 | results.append(result) 227 | 228 | return results 229 | -------------------------------------------------------------------------------- /sparseinst/utils.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Optional, List 3 | 4 | import torch 5 | from torch import Tensor 6 | import torch.distributed as dist 7 | import torch.nn.functional as F 8 | import torchvision 9 | 10 | 11 | def _max_by_axis(the_list): 12 | # type: (List[List[int]]) -> List[int] 13 | maxes = the_list[0] 14 | for sublist in the_list[1:]: 15 | for index, item in enumerate(sublist): 16 | maxes[index] = max(maxes[index], item) 17 | return maxes 18 | 19 | 20 | class NestedTensor(object): 21 | def __init__(self, tensors, mask: Optional[Tensor]): 22 | self.tensors = tensors 23 | self.mask = mask 24 | 25 | def to(self, device): 26 | cast_tensor = self.tensors.to(device) 27 | mask = self.mask 28 | if mask is not None: 29 | assert mask is not None 30 | cast_mask = mask.to(device) 31 | else: 32 | cast_mask = None 33 | return NestedTensor(cast_tensor, cast_mask) 34 | 35 | def decompose(self): 36 | return self.tensors, self.mask 37 | 38 | def __repr__(self): 39 | return str(self.tensors) 40 | 41 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 42 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
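# It computes the per-dimension max size with tensor ops and pads each image and
# mask via F.pad, avoiding the in-place copy_()/boolean-mask assignment used in
# the eager path below, which ONNX tracing cannot export.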
43 | 44 | 45 | @torch.jit.unused 46 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 47 | max_size = [] 48 | for i in range(tensor_list[0].dim()): 49 | max_size_i = torch.max(torch.stack([img.shape[i] 50 | for img in tensor_list]).to(torch.float32)).to(torch.int64) 51 | max_size.append(max_size_i) 52 | max_size = tuple(max_size) 53 | 54 | # work around for 55 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 56 | # m[: img.shape[1], :img.shape[2]] = False 57 | # which is not yet supported in onnx 58 | padded_imgs = [] 59 | padded_masks = [] 60 | for img in tensor_list: 61 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 62 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 63 | padded_imgs.append(padded_img) 64 | 65 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 66 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 67 | padded_masks.append(padded_mask.to(torch.bool)) 68 | 69 | tensor = torch.stack(padded_imgs) 70 | mask = torch.stack(padded_masks) 71 | 72 | return NestedTensor(tensor, mask=mask) 73 | 74 | 75 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 76 | # TODO make this more general 77 | if tensor_list[0].ndim == 3: 78 | if torchvision._is_tracing(): 79 | # nested_tensor_from_tensor_list() does not export well to ONNX 80 | # call _onnx_nested_tensor_from_tensor_list() instead 81 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 82 | 83 | # TODO make it support different-sized images 84 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 85 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 86 | batch_shape = [len(tensor_list)] + max_size 87 | b, c, h, w = batch_shape 88 | dtype = tensor_list[0].dtype 89 | device = tensor_list[0].device 90 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 91 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 92 | for img, pad_img, m in zip(tensor_list, tensor, mask): 93 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 94 | m[: img.shape[1], :img.shape[2]] = False 95 | else: 96 | raise ValueError('not supported') 97 | return NestedTensor(tensor, mask) 98 | 99 | 100 | def nested_masks_from_list(tensor_list: List[Tensor], input_shape=None): 101 | if tensor_list[0].ndim == 3: 102 | dim_size = sum([img.shape[0] for img in tensor_list]) 103 | if input_shape is None: 104 | max_size = _max_by_axis([list(img.shape[-2:]) for img in tensor_list]) 105 | else: 106 | max_size = [input_shape[0], input_shape[1]] 107 | batch_shape = [dim_size] + max_size 108 | # b, h, w = batch_shape 109 | dtype = tensor_list[0].dtype 110 | device = tensor_list[0].device 111 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 112 | mask = torch.zeros(batch_shape, dtype=torch.bool, device=device) 113 | idx = 0 114 | for img in tensor_list: 115 | c = img.shape[0] 116 | c_ = idx + c 117 | tensor[idx: c_, :img.shape[1], : img.shape[2]].copy_(img) 118 | mask[idx: c_, :img.shape[1], :img.shape[2]] = True 119 | idx = c_ 120 | else: 121 | raise ValueError('not supported') 122 | return NestedTensor(tensor, mask) 123 | 124 | 125 | def is_dist_avail_and_initialized(): 126 | if not dist.is_available(): 127 | return False 128 | if not dist.is_initialized(): 129 | return False 130 | return True 131 | 132 | 133 | def get_world_size(): 134 | if not is_dist_avail_and_initialized(): 135 | return 1 
136 | return dist.get_world_size() 137 | 138 | 139 | def aligned_bilinear(tensor, factor): 140 | # borrowed from Adelaidet: https://github1s.com/aim-uofa/AdelaiDet/blob/HEAD/adet/utils/comm.py 141 | assert tensor.dim() == 4 142 | assert factor >= 1 143 | assert int(factor) == factor 144 | 145 | if factor == 1: 146 | return tensor 147 | 148 | h, w = tensor.size()[2:] 149 | tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") 150 | oh = factor * h + 1 151 | ow = factor * w + 1 152 | tensor = F.interpolate( 153 | tensor, size=(oh, ow), 154 | mode='bilinear', 155 | align_corners=True 156 | ) 157 | tensor = F.pad( 158 | tensor, pad=(factor // 2, 0, factor // 2, 0), 159 | mode="replicate" 160 | ) 161 | 162 | return tensor[:, :, :oh - 1, :ow - 1] 163 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | print("It works") 2 | print("okok") -------------------------------------------------------------------------------- /test_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.cuda.amp import autocast 8 | 9 | from detectron2.config import get_cfg 10 | from detectron2.modeling import build_backbone 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.structures import ImageList, Instances, BitMasks 13 | from detectron2.engine import default_argument_parser, default_setup 14 | from detectron2.data import build_detection_test_loader 15 | from detectron2.evaluation import COCOEvaluator, print_csv_format 16 | 17 | from sparseinst import build_sparse_inst_encoder, build_sparse_inst_decoder, add_sparse_inst_config 18 | from sparseinst import COCOMaskEvaluator 19 | 20 | 21 | device = torch.device('cuda:0') 22 | dtype = torch.float32 23 | 24 | __all__ = ["SparseInst"] 25 | 26 | pixel_mean = torch.Tensor([123.675, 116.280, 103.530]).to(device).view(3, 1, 1) 27 | pixel_std = torch.Tensor([58.395, 57.120, 57.375]).to(device).view(3, 1, 1) 28 | 29 | 30 | @torch.jit.script 31 | def normalizer(x, mean, std): return (x - mean) / std 32 | 33 | 34 | def synchronize(): 35 | torch.cuda.synchronize() 36 | 37 | 38 | def process_batched_inputs(batched_inputs): 39 | images = [x["image"].to(device) for x in batched_inputs] 40 | images = [normalizer(x, pixel_mean, pixel_std) for x in images] 41 | images = ImageList.from_tensors(images, 32) 42 | ori_size = (batched_inputs[0]["height"], batched_inputs[0]["width"]) 43 | return images.tensor, images.image_sizes[0], ori_size 44 | 45 | 46 | @torch.jit.script 47 | def rescoring_mask(scores, mask_pred, masks): 48 | mask_pred_ = mask_pred.float() 49 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6)) 50 | 51 | 52 | class SparseInst(nn.Module): 53 | 54 | def __init__(self, cfg): 55 | 56 | super().__init__() 57 | 58 | self.device = torch.device(cfg.MODEL.DEVICE) 59 | # backbone 60 | self.backbone = build_backbone(cfg) 61 | self.size_divisibility = self.backbone.size_divisibility 62 | 63 | output_shape = self.backbone.output_shape() 64 | 65 | self.encoder = build_sparse_inst_encoder(cfg, output_shape) 66 | self.decoder = build_sparse_inst_decoder(cfg) 67 | 68 | self.to(self.device) 69 | 70 | # inference 71 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD 72 | self.mask_threshold = 
cfg.MODEL.SPARSE_INST.MASK_THRESHOLD 73 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS 74 | self.mask_format = cfg.INPUT.MASK_FORMAT 75 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 76 | 77 | def forward(self, image, resized_size, ori_size): 78 | max_size = image.shape[2:] 79 | features = self.backbone(image) 80 | features = self.encoder(features) 81 | output = self.decoder(features) 82 | result = self.inference_single( 83 | output, resized_size, max_size, ori_size) 84 | return result 85 | 86 | def inference_single(self, outputs, img_shape, pad_shape, ori_shape): 87 | """ 88 | inference for only one sample 89 | Args: 90 | scores (tensor): [NxC] 91 | masks (tensor): [NxHxW] 92 | img_shape (list): (h1, w1), image after resized 93 | pad_shape (list): (h2, w2), padded resized image 94 | ori_shape (list): (h3, w3), original shape h3*w3 < h1*w1 < h2*w2 95 | """ 96 | result = Instances(ori_shape) 97 | # scoring 98 | pred_logits = outputs["pred_logits"][0].sigmoid() 99 | pred_scores = outputs["pred_scores"][0].sigmoid().squeeze() 100 | pred_masks = outputs["pred_masks"][0].sigmoid() 101 | # obtain scores 102 | scores, labels = pred_logits.max(dim=-1) 103 | # remove by thresholding 104 | keep = scores > self.cls_threshold 105 | scores = torch.sqrt(scores[keep] * pred_scores[keep]) 106 | labels = labels[keep] 107 | pred_masks = pred_masks[keep] 108 | 109 | if scores.size(0) == 0: 110 | return None 111 | scores = rescoring_mask(scores, pred_masks > 0.45, pred_masks) 112 | h, w = img_shape 113 | # resize masks 114 | pred_masks = F.interpolate(pred_masks.unsqueeze(1), size=pad_shape, 115 | mode="bilinear", align_corners=False)[:, :, :h, :w] 116 | pred_masks = F.interpolate(pred_masks, size=ori_shape, mode='bilinear', 117 | align_corners=False).squeeze(1) 118 | mask_pred = pred_masks > self.mask_threshold 119 | 120 | mask_pred = BitMasks(mask_pred) 121 | result.pred_masks = mask_pred 122 | result.scores = scores 123 | result.pred_classes = labels 124 | return result 125 | 126 | 127 | def test_sparseinst_speed(cfg, fp16=True): 128 | device = torch.device('cuda:0') 129 | 130 | model = SparseInst(cfg) 131 | model.eval() 132 | model.to(device) 133 | print(model) 134 | size = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) 135 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 136 | cfg.MODEL.WEIGHTS, resume=False) 137 | 138 | torch.backends.cudnn.enable = True 139 | torch.backends.cudnn.benchmark = True 140 | 141 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 142 | 143 | evaluator = COCOMaskEvaluator( 144 | cfg.DATASETS.TEST[0], ("segm",), False, output_folder) 145 | evaluator.reset() 146 | model.to(device) 147 | model.eval() 148 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 149 | durations = [] 150 | 151 | with autocast(enabled=fp16): 152 | with torch.no_grad(): 153 | for idx, inputs in enumerate(data_loader): 154 | images, resized_size, ori_size = process_batched_inputs(inputs) 155 | synchronize() 156 | start_time = time.perf_counter() 157 | output = model(images, resized_size, ori_size) 158 | print(len(output)) 159 | print(output) 160 | synchronize() 161 | end = time.perf_counter() - start_time 162 | 163 | durations.append(end) 164 | if idx % 100 == 0: 165 | print("process: [{}/{}] fps: {:.3f}".format(idx, 166 | len(data_loader), 1/np.mean(durations[100:]))) 167 | evaluator.process(inputs, [{"instances": output}]) 168 | # evaluate 169 | results = evaluator.evaluate() 170 | print_csv_format(results) 171 | 172 | 
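# average over all but the first 100 iterations, which are treated as warm-up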
latency = np.mean(durations[100:]) 173 | fps = 1 / latency 174 | print("speed: {:.4f}s FPS: {:.2f}".format(latency, fps)) 175 | 176 | 177 | def setup(args): 178 | """ 179 | Create configs and perform basic setups. 180 | """ 181 | cfg = get_cfg() 182 | add_sparse_inst_config(cfg) 183 | cfg.merge_from_file(args.config_file) 184 | cfg.merge_from_list(args.opts) 185 | cfg.freeze() 186 | default_setup(cfg, args) 187 | return cfg 188 | 189 | 190 | if __name__ == '__main__': 191 | 192 | args = default_argument_parser() 193 | args.add_argument("--fp16", action="store_true", 194 | help="support fp16 for inference") 195 | args = args.parse_args() 196 | print("Command Line Args:", args) 197 | cfg = setup(args) 198 | test_sparseinst_speed(cfg, fp16=args.fp16) 199 | -------------------------------------------------------------------------------- /train_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import itertools 3 | import time 4 | from typing import Any, Dict, List, Set 5 | 6 | import torch 7 | from torch import optim 8 | 9 | import detectron2.utils.comm as comm 10 | from detectron2.checkpoint import DetectionCheckpointer 11 | from detectron2.config import get_cfg 12 | from detectron2.utils.logger import setup_logger 13 | from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetMapper 14 | from detectron2.engine import AutogradProfiler, DefaultTrainer, default_argument_parser, default_setup, launch 15 | from detectron2.evaluation import COCOEvaluator, verify_results 16 | from detectron2.solver.build import maybe_add_gradient_clipping 17 | from detectron2.evaluation import ( 18 | CityscapesInstanceEvaluator, 19 | CityscapesSemSegEvaluator, 20 | COCOEvaluator, 21 | COCOPanopticEvaluator, 22 | DatasetEvaluators, 23 | LVISEvaluator, 24 | PascalVOCDetectionEvaluator, 25 | SemSegEvaluator, 26 | verify_results, 27 | ) 28 | 29 | from sparseinst import add_sparse_inst_config, COCOMaskEvaluator 30 | 31 | 32 | class Trainer(DefaultTrainer): 33 | 34 | @classmethod 35 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 36 | """ 37 | Create evaluator(s) for a given dataset. 38 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 39 | For your own dataset, you can simply create an evaluator manually in your 40 | script and do not have to worry about the hacky if-else logic here. 41 | """ 42 | if output_folder is None: 43 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 44 | evaluator_list = [] 45 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 46 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 47 | evaluator_list.append( 48 | SemSegEvaluator( 49 | dataset_name, 50 | distributed=True, 51 | num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 52 | ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 53 | output_dir=output_folder, 54 | ) 55 | ) 56 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 57 | evaluator_list.append(COCOMaskEvaluator(dataset_name, ("segm", ), True, output_folder)) 58 | if evaluator_type == "coco_panoptic_seg": 59 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 60 | if evaluator_type == "cityscapes_instance": 61 | assert ( 62 | torch.cuda.device_count() >= comm.get_rank() 63 | ), "CityscapesEvaluator currently do not work with multiple machines." 
64 | return CityscapesInstanceEvaluator(dataset_name) 65 | if evaluator_type == "cityscapes_sem_seg": 66 | assert ( 67 | torch.cuda.device_count() >= comm.get_rank() 68 | ), "CityscapesEvaluator currently do not work with multiple machines." 69 | return CityscapesSemSegEvaluator(dataset_name) 70 | elif evaluator_type == "pascal_voc": 71 | return PascalVOCDetectionEvaluator(dataset_name) 72 | elif evaluator_type == "lvis": 73 | return LVISEvaluator(dataset_name, cfg, True, output_folder) 74 | if len(evaluator_list) == 0: 75 | raise NotImplementedError( 76 | "no Evaluator for the dataset {} with the type {}".format( 77 | dataset_name, evaluator_type 78 | ) 79 | ) 80 | elif len(evaluator_list) == 1: 81 | return evaluator_list[0] 82 | return DatasetEvaluators(evaluator_list) 83 | 84 | @classmethod 85 | def build_optimizer(cls, cfg, model): 86 | params: List[Dict[str, Any]] = [] 87 | memo: Set[torch.nn.parameter.Parameter] = set() 88 | for key, value in model.named_parameters(recurse=True): 89 | if not value.requires_grad: 90 | continue 91 | # Avoid duplicating parameters 92 | if value in memo: 93 | continue 94 | memo.add(value) 95 | lr = cfg.SOLVER.BASE_LR 96 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 97 | if "backbone" in key: 98 | lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER 99 | # for transformer 100 | if "patch_embed" in key or "cls_token" in key: 101 | weight_decay = 0.0 102 | if "norm" in key: 103 | weight_decay = 0.0 104 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 105 | 106 | def maybe_add_full_model_gradient_clipping(optim): # optim: the optimizer class 107 | # detectron2 doesn't have full model gradient clipping now 108 | clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE 109 | enable = ( 110 | cfg.SOLVER.CLIP_GRADIENTS.ENABLED 111 | and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" 112 | and clip_norm_val > 0.0 113 | ) 114 | 115 | class FullModelGradientClippingOptimizer(optim): 116 | def step(self, closure=None): 117 | all_params = itertools.chain(*[x["params"] for x in self.param_groups]) 118 | torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) 119 | super().step(closure=closure) 120 | 121 | return FullModelGradientClippingOptimizer if enable else optim 122 | 123 | optimizer_type = cfg.SOLVER.OPTIMIZER 124 | if optimizer_type == "SGD": 125 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)( 126 | params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM 127 | ) 128 | elif optimizer_type == "ADAMW": 129 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)( 130 | params, cfg.SOLVER.BASE_LR, amsgrad=cfg.SOLVER.AMSGRAD 131 | ) 132 | else: 133 | raise NotImplementedError(f"no optimizer type {optimizer_type}") 134 | if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": 135 | optimizer = maybe_add_gradient_clipping(cfg, optimizer) 136 | return optimizer 137 | 138 | @classmethod 139 | def build_train_loader(cls, cfg): 140 | if cfg.MODEL.SPARSE_INST.DATASET_MAPPER == "SparseInstDatasetMapper": 141 | from sparseinst import SparseInstDatasetMapper 142 | mapper = SparseInstDatasetMapper(cfg, is_train=True) 143 | else: 144 | mapper = None 145 | return build_detection_train_loader(cfg, mapper=mapper) 146 | 147 | 148 | def setup(args): 149 | """ 150 | Create configs and perform basic setups. 
151 | """ 152 | cfg = get_cfg() 153 | add_sparse_inst_config(cfg) 154 | cfg.merge_from_file(args.config_file) 155 | cfg.merge_from_list(args.opts) 156 | cfg.freeze() 157 | default_setup(cfg, args) 158 | # Setup logger for "sparseinst" module 159 | setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="sparseinst") 160 | return cfg 161 | 162 | 163 | def main(args): 164 | cfg = setup(args) 165 | 166 | if args.eval_only: 167 | model = Trainer.build_model(cfg) 168 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 169 | cfg.MODEL.WEIGHTS, resume=args.resume) 170 | res = Trainer.test(cfg, model) 171 | if comm.is_main_process(): 172 | verify_results(cfg, res) 173 | return res 174 | 175 | trainer = Trainer(cfg) 176 | trainer.resume_or_load(resume=args.resume) 177 | return trainer.train() 178 | 179 | 180 | if __name__ == "__main__": 181 | args = default_argument_parser().parse_args() 182 | print("Command Line Args:", args) 183 | launch( 184 | main, 185 | args.num_gpus, 186 | num_machines=args.num_machines, 187 | machine_rank=args.machine_rank, 188 | dist_url=args.dist_url, 189 | args=(args,), 190 | ) 191 | --------------------------------------------------------------------------------
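For reference, a minimal sketch of plain-PyTorch inference with the SparseInst meta-architecture registered above (assumptions: a CUDA device is available, since the constructor calls torch.cuda.get_device_name(); the checkpoint path is a placeholder; the batched-input dict follows the detectron2 convention used by SparseInst.forward):

import torch
from detectron2.config import get_cfg
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from sparseinst import add_sparse_inst_config

cfg = get_cfg()
add_sparse_inst_config(cfg)
cfg.merge_from_file("configs/sparse_inst_r50_giam.yaml")
cfg.MODEL.WEIGHTS = "path/to/sparse_inst_r50_giam.pth"  # placeholder checkpoint path

model = build_model(cfg)                         # builds the registered SparseInst meta-arch on cfg.MODEL.DEVICE
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
model.eval()

# CHW uint8 image tensor; "height"/"width" are used to resize the masks back
image = torch.randint(0, 256, (3, 640, 640), dtype=torch.uint8)
with torch.no_grad():
    predictions = model([{"image": image, "height": 640, "width": 640}])
instances = predictions[0]["instances"]          # Instances with scores, pred_classes, pred_masks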