├── assets ├── banner.gif ├── animate.gif ├── sparseinst.png └── figures │ ├── 000000006471.jpg │ └── 000000014439.jpg ├── configs ├── sparse_inst_r50_giam.yaml ├── sparse_inst_r101_giam.yaml ├── sparse_inst_r50_giam_fp16.yaml ├── sparse_inst_r50_base.yaml ├── sparse_inst_r50_giam_soft.yaml ├── sparse_inst_r50vd_giam.yaml ├── sparse_inst_r101_dcn_giam.yaml ├── sparse_inst_r50_giam_aug.yaml ├── sparse_inst_r50vd_base.yaml ├── sparse_inst_darknet53_giam.yaml ├── sparse_inst_r50vd_dcn_giam.yaml ├── sparse_inst_pvt_b1_giam.yaml ├── sparse_inst_r50_dcn_giam_aug.yaml ├── sparse_inst_r50vd_giam_aug.yaml ├── sparse_inst_pvt_b2_li_giam.yaml ├── sparse_inst_cspdarknet53_giam.yaml ├── sparse_inst_r50vd_dcn_giam_aug.yaml └── Base-SparseInst.yaml ├── sparseinst ├── backbones │ ├── __init__.py │ ├── pvt.py │ ├── resnet.py │ └── cspnet.py ├── __init__.py ├── config.py ├── coco_evaluation.py ├── encoder.py ├── utils.py ├── dataset_mapper.py ├── sparseinst.py ├── d2_predictor.py ├── decoder.py └── loss.py ├── mindspore ├── sparseinst │ ├── __init__.py │ ├── config.py │ ├── encoder.py │ ├── resnet.py │ ├── sparseinst.py │ └── decoder.py ├── README.md ├── dict.py └── test.py ├── .gitignore ├── LICENCE ├── Dockerfile ├── onnx └── convert_onnx.py ├── demo.py ├── tools ├── get_flops.py ├── test_net.py └── train_net.py └── README.md /assets/banner.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/banner.gif -------------------------------------------------------------------------------- /assets/animate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/animate.gif -------------------------------------------------------------------------------- /assets/sparseinst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/sparseinst.png -------------------------------------------------------------------------------- /assets/figures/000000006471.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/figures/000000006471.jpg -------------------------------------------------------------------------------- /assets/figures/000000014439.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/figures/000000014439.jpg -------------------------------------------------------------------------------- /configs/sparse_inst_r50_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/R-50.pkl" 4 | OUTPUT_DIR: "output/sparse_inst_r50_giam" -------------------------------------------------------------------------------- /sparseinst/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import build_resnet_vd_backbone 2 | from .pvt import build_pyramid_vision_transformer 3 | from .cspnet import build_cspnet_backbone -------------------------------------------------------------------------------- /configs/sparse_inst_r101_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: 
"pretrained_models/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | OUTPUT_DIR: "output/sparse_inst_r101_giam" -------------------------------------------------------------------------------- /configs/sparse_inst_r50_giam_fp16.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/R-50.pkl" 4 | SOLVER: 5 | AMP: 6 | ENABLED: True 7 | OUTPUT_DIR: "output/sparse_inst_r50_giam_fp16" -------------------------------------------------------------------------------- /configs/sparse_inst_r50_base.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/R-50.pkl" 4 | SPARSE_INST: 5 | DECODER: 6 | NAME: "BaseIAMDecoder" 7 | OUTPUT_DIR: "output/sparse_inst_r50_base" -------------------------------------------------------------------------------- /mindspore/sparseinst/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparseinst import SparseInst 2 | from .config import cfg,update_config 3 | from .resnet import build_resnet50 4 | from .encoder import InstanceContextEncoder 5 | from .decoder import GroupIAMDecoder -------------------------------------------------------------------------------- /configs/sparse_inst_r50_giam_soft.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/R-50.pkl" 4 | SPARSE_INST: 5 | DECODER: 6 | NAME: "GroupIAMSoftDecoder" 7 | OUTPUT_DIR: "output/sparse_inst_r50_giam_soft" -------------------------------------------------------------------------------- /configs/sparse_inst_r50vd_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth" 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_vd_backbone" 7 | OUTPUT_DIR: "output/sparse_inst_r50vd_giam" 8 | -------------------------------------------------------------------------------- /configs/sparse_inst_r101_dcn_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/R-101.pkl" 4 | RESNETS: 5 | DEPTH: 101 6 | DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5 7 | OUTPUT_DIR: "output/sparse_inst_r101_dcn_giam" -------------------------------------------------------------------------------- /configs/sparse_inst_r50_giam_aug.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/R-50.pkl" 4 | INPUT: 5 | CROP: 6 | ENABLED: True 7 | TYPE: "absolute_range" 8 | SIZE: (384, 600) 9 | MASK_FORMAT: "polygon" 10 | OUTPUT_DIR: "output/sparse_inst_r50_giam_aug" -------------------------------------------------------------------------------- /configs/sparse_inst_r50vd_base.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth" 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_vd_backbone" 7 | SPARSE_INST: 8 | DECODER: 9 | NAME: "BaseIAMDecoder" 10 | OUTPUT_DIR: "output/sparse_inst_r50_base" 
-------------------------------------------------------------------------------- /configs/sparse_inst_darknet53_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "" 4 | BACKBONE: 5 | NAME: "build_cspnet_backbone" 6 | SPARSE_INST: 7 | ENCODER: 8 | IN_FEATURES: ["csp2", "csp3", "csp4"] 9 | CSPNET: 10 | NAME: "darknet53" 11 | OUT_FEATURES: ["csp2", "csp3", "csp4"] 12 | OUTPUT_DIR: "output/sparse_inst_darknet53_giam" -------------------------------------------------------------------------------- /configs/sparse_inst_r50vd_dcn_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth" 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_vd_backbone" 7 | RESNETS: 8 | DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5 9 | OUTPUT_DIR: "output/sparse_inst_r50vd_dcn_giam" 10 | 11 | -------------------------------------------------------------------------------- /configs/sparse_inst_pvt_b1_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/pvt_v2_b1.pth" 4 | BACKBONE: 5 | NAME: "build_pyramid_vision_transformer" 6 | SPARSE_INST: 7 | ENCODER: 8 | IN_FEATURES: ["p2", "p3", "p4"] 9 | PVT: 10 | NAME: "b1" 11 | OUT_FEATURES: ["p2", "p3", "p4"] 12 | OUTPUT_DIR: "output/sparse_inst_pvt_b1_giam" -------------------------------------------------------------------------------- /configs/sparse_inst_r50_dcn_giam_aug.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/R-50.pkl" 4 | RESNETS: 5 | DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5 6 | INPUT: 7 | CROP: 8 | ENABLED: True 9 | TYPE: "absolute_range" 10 | SIZE: (384, 600) 11 | MASK_FORMAT: "polygon" 12 | OUTPUT_DIR: "output/sparse_inst_r50_dcn_giam_aug" -------------------------------------------------------------------------------- /configs/sparse_inst_r50vd_giam_aug.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth" 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_vd_backbone" 7 | INPUT: 8 | CROP: 9 | ENABLED: True 10 | TYPE: "absolute_range" 11 | SIZE: (384, 600) 12 | MASK_FORMAT: "polygon" 13 | OUTPUT_DIR: "output/sparse_inst_r50vd_giam_aug" 14 | -------------------------------------------------------------------------------- /configs/sparse_inst_pvt_b2_li_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/pvt_v2_b2_li.pth" 4 | BACKBONE: 5 | NAME: "build_pyramid_vision_transformer" 6 | SPARSE_INST: 7 | ENCODER: 8 | IN_FEATURES: ["p2", "p3", "p4"] 9 | PVT: 10 | NAME: "b2" 11 | LINEAR: True 12 | OUT_FEATURES: ["p2", "p3", "p4"] 13 | OUTPUT_DIR: "output/sparse_inst_pvt_b2_linear_giam" -------------------------------------------------------------------------------- /configs/sparse_inst_cspdarknet53_giam.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: 
"pretrained_models/cspdarknet53_ra_256-d05c7c21.pth" 4 | BACKBONE: 5 | NAME: "build_cspnet_backbone" 6 | SPARSE_INST: 7 | ENCODER: 8 | IN_FEATURES: ["csp2", "csp3", "csp4"] 9 | DECODER: 10 | NAME: "GroupIAMSoftDecoder" 11 | CSPNET: 12 | NAME: "cspdarknet53" 13 | OUT_FEATURES: ["csp2", "csp3", "csp4"] 14 | OUTPUT_DIR: "output/sparse_inst_cspdarknet53_giam" -------------------------------------------------------------------------------- /sparseinst/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparseinst import SparseInst 2 | from .encoder import build_sparse_inst_encoder 3 | from .decoder import build_sparse_inst_decoder 4 | from .config import add_sparse_inst_config 5 | from .loss import build_sparse_inst_criterion 6 | from .dataset_mapper import SparseInstDatasetMapper 7 | from .coco_evaluation import COCOMaskEvaluator 8 | from .backbones import build_resnet_vd_backbone, build_pyramid_vision_transformer 9 | from .d2_predictor import VisualizationDemo 10 | -------------------------------------------------------------------------------- /configs/sparse_inst_r50vd_dcn_giam_aug.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-SparseInst.yaml" 2 | MODEL: 3 | WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth" 4 | BACKBONE: 5 | FREEZE_AT: 0 6 | NAME: "build_resnet_vd_backbone" 7 | RESNETS: 8 | DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5 9 | INPUT: 10 | CROP: 11 | ENABLED: True 12 | TYPE: "absolute_range" 13 | SIZE: (384, 600) 14 | MASK_FORMAT: "polygon" 15 | OUTPUT_DIR: "output/sparse_inst_r50vd_dcn_giam_aug" 16 | 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output dir 2 | output 3 | output* 4 | instant_test_output 5 | inference_test_output 6 | 7 | 8 | *.png 9 | *.json 10 | *.diff 11 | 12 | # compilation and distribution 13 | __pycache__ 14 | _ext 15 | *.pyc 16 | *.pyd 17 | *.so 18 | detectron2.egg-info/ 19 | build/ 20 | dist/ 21 | wheels/ 22 | 23 | # pytorch/python/numpy formats 24 | *.pth 25 | *.pkl 26 | *.npy 27 | 28 | # ipython/jupyter notebooks 29 | *.ipynb 30 | **/.ipynb_checkpoints/ 31 | 32 | # Editor temporaries 33 | *.swn 34 | *.swo 35 | *.swp 36 | *~ 37 | 38 | # editor settings 39 | .idea 40 | .vscode 41 | _darcs 42 | 43 | # project dirs 44 | /detectron2/model_zoo/configs 45 | /datasets/* 46 | !/datasets/*.* 47 | /projects/*/datasets 48 | /models 49 | 50 | # mac file 51 | .DS_Store -------------------------------------------------------------------------------- /mindspore/README.md: -------------------------------------------------------------------------------- 1 | # SparseInst on MindSpore 2 | 3 | ## Installation 4 | 5 | 1. create python 3.8 environment 6 | ```bash 7 | conda create -n sparseinst-ms python=3.8 8 | ``` 9 | 2. activate the new environment 10 | ```bash 11 | conda activate sparseinst-ms 12 | ``` 13 | 14 | 3. install mindspore 15 | ``` bash 16 | pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/1.8.1/MindSpore/gpu/x86_64/cuda-11.1/mindspore_gpu-1.8.1-cp38-cp38-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://pypi.tuna.tsinghua.edu.cn/simple 17 | ``` 18 | 19 | 4. 
install dependencies 20 | ```bash 21 | pip install mindvision pycocotools opencv-python numpy yacs 22 | ``` 23 | 24 | ## Model 25 | 26 | We provide the basic SparseInst-R50-GIAM in [BaiduPan](https://pan.baidu.com/s/1ZmZ6nqZrwt4ALYP1B2kdCA?pwd=7xsb). 27 | 28 | ## Demo 29 | 30 | ```bash 31 | python test.py --config /path/to/your/checkpoint --image_name /path/to/your/image --visualize 32 | ``` 33 | 34 | The results will be saved in ./image_name/ 35 | -------------------------------------------------------------------------------- /configs/Base-SparseInst.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "SparseInst" 3 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 4 | PIXEL_MEAN: [123.675, 116.280, 103.530] 5 | PIXEL_STD: [58.395, 57.120, 57.375] 6 | BACKBONE: 7 | FREEZE_AT: 0 8 | NAME: "build_resnet_backbone" 9 | RESNETS: 10 | NORM: "FrozenBN" 11 | DEPTH: 50 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res3", "res4", "res5"] 14 | SPARSE_INST: 15 | ENCODER: 16 | NAME: "InstanceContextEncoder" 17 | DECODER: 18 | NAME: "GroupIAMDecoder" 19 | DATASETS: 20 | TRAIN: ("coco_2017_train",) 21 | TEST: ("coco_2017_val",) 22 | SOLVER: 23 | IMS_PER_BATCH: 64 24 | BASE_LR: 0.00005 25 | STEPS: (210000, 250000) 26 | MAX_ITER: 270000 27 | WEIGHT_DECAY: 0.05 28 | INPUT: 29 | MIN_SIZE_TRAIN: (416, 448, 480, 512, 544, 576, 608, 640) 30 | MAX_SIZE_TRAIN: 853 31 | MIN_SIZE_TEST: 640 32 | MAX_SIZE_TEST: 853 33 | FORMAT: "RGB" 34 | MASK_FORMAT: "bitmask" 35 | TEST: 36 | EVAL_PERIOD: 7330 37 | DATALOADER: 38 | NUM_WORKERS: 6 39 | VERSION: 2 40 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Hust Visual Learning Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
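

As a rough orientation (not stated in the repository), the SOLVER block in `Base-SparseInst.yaml` above translates into the following approximate schedule on `coco_2017_train`, assuming the usual ~118k training images:

```python
# Back-of-the-envelope reading of Base-SparseInst.yaml's SOLVER/TEST settings.
ims_per_batch = 64             # SOLVER.IMS_PER_BATCH
max_iter = 270_000             # SOLVER.MAX_ITER
lr_steps = (210_000, 250_000)  # SOLVER.STEPS (learning-rate drops)
eval_period = 7_330            # TEST.EVAL_PERIOD
coco_train_images = 118_287    # approximate size of coco_2017_train

epochs = max_iter * ims_per_batch / coco_train_images                    # ~146 epochs
drop_epochs = [round(s * ims_per_batch / coco_train_images) for s in lr_steps]  # ~114, ~135
num_evals = max_iter // eval_period                                      # ~36 evaluations
print(f"~{epochs:.0f} epochs, LR drops near epochs {drop_epochs}, {num_evals} evals")
```
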
-------------------------------------------------------------------------------- /mindspore/dict.py: -------------------------------------------------------------------------------- 1 | id2category={1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck',9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 15: 'bench',16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra',25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee',35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove',41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 46: 'wine glass', 47: 'cup',48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange',56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'couch',64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse',75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink',82: 'refrigerator', 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier',90: 'toothbrush'} -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel 2 | LABEL Service="SparseInstanceActivation" 3 | 4 | ENV TZ=Europe/Moscow 5 | ENV DETECTRON_TAG=v0.3 6 | ARG DEBIAN_FRONTEND=noninteractive 7 | 8 | RUN apt-key del 7fa2af80 && \ 9 | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \ 10 | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub 11 | RUN apt update && apt install vim git g++ python3-tk ffmpeg libsm6 libxext6 -y 12 | 13 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 14 | python3 -m pip install --no-cache-dir opencv-python opencv-contrib-python scipy 15 | 16 | WORKDIR /workspace 17 | RUN git clone https://github.com/facebookresearch/detectron2.git && \ 18 | cd detectron2/ && git checkout tags/${DETECTRON_TAG} && python3 setup.py build develop 19 | 20 | RUN python3 -m pip uninstall -y iopath fvcore portalocker yacs && \ 21 | python3 -m pip install --no-cache-dir iopath fvcore portalocker yacs timm pyyaml==5.1 shapely 22 | 23 | RUN git clone https://github.com/hustvl/SparseInst 24 | WORKDIR /workspace/SparseInst 25 | RUN ln -s /usr/bin/python3 /usr/bin/python 26 | 27 | ENTRYPOINT bash 28 | -------------------------------------------------------------------------------- /mindspore/sparseinst/config.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | import os 3 | 4 | 5 | def update_config(cfg, args): 6 | cfg.defrost() 7 | cfg.merge_from_file(args.cfg) 8 | cfg.freeze() 9 | return cfg 10 | 11 | cfg = CN() 12 | 13 | cfg.MODEL=CN() 14 | cfg.MODEL.SPARSE_INST=CN() 15 | cfg.MODEL.SPARSE_INST.ENCODER=CN() 16 | cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS=256 17 | cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES=['res3','res4','res5'] 18 | 19 | cfg.MODEL.SPARSE_INST.DECODER=CN() 20 | cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100 21 | 
cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80 22 | cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128 23 | cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2 24 | cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False 25 | cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4 26 | 27 | cfg.MODEL.SPARSE_INST.DECODER.INST=CN() 28 | cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256 29 | cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4 30 | 31 | cfg.MODEL.SPARSE_INST.DECODER.MASK=CN() 32 | cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256 33 | cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4 34 | 35 | cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005 36 | cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45 37 | cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100 38 | 39 | cfg.MODEL.PIXEL_MEAN=[123.675, 116.280, 103.530] 40 | cfg.MODEL.PIXEL_STD=[58.395, 57.120, 57.375] 41 | -------------------------------------------------------------------------------- /sparseinst/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 2 | 3 | from detectron2.config import CfgNode as CN 4 | 5 | def add_sparse_inst_config(cfg): 6 | 7 | cfg.MODEL.DEVICE = 'cuda' 8 | cfg.MODEL.MASK_ON = True 9 | # [SparseInst] 10 | cfg.MODEL.SPARSE_INST = CN() 11 | 12 | # parameters for inference 13 | cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005 14 | cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45 15 | cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100 16 | 17 | # [Encoder] 18 | cfg.MODEL.SPARSE_INST.ENCODER = CN() 19 | cfg.MODEL.SPARSE_INST.ENCODER.NAME = "FPNPPMEncoder" 20 | cfg.MODEL.SPARSE_INST.ENCODER.NORM = "" 21 | cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES = ["res3", "res4", "res5"] 22 | cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS = 256 23 | 24 | # [Decoder] 25 | cfg.MODEL.SPARSE_INST.DECODER = CN() 26 | cfg.MODEL.SPARSE_INST.DECODER.NAME = "BaseIAMDecoder" 27 | cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100 28 | cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80 29 | # kernels for mask features 30 | cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128 31 | # upsample factor for output masks 32 | cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2.0 33 | cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False 34 | cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4 35 | # decoder.inst_branch 36 | cfg.MODEL.SPARSE_INST.DECODER.INST = CN() 37 | cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256 38 | cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4 39 | # decoder.mask_branch 40 | cfg.MODEL.SPARSE_INST.DECODER.MASK = CN() 41 | cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256 42 | cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4 43 | 44 | # [Loss] 45 | cfg.MODEL.SPARSE_INST.LOSS = CN() 46 | cfg.MODEL.SPARSE_INST.LOSS.NAME = "SparseInstCriterion" 47 | cfg.MODEL.SPARSE_INST.LOSS.ITEMS = ("labels", "masks") 48 | # loss weight 49 | cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT = 2.0 50 | cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT = 5.0 51 | cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT = 2.0 52 | # iou-aware objectness loss weight 53 | cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT = 1.0 54 | 55 | # [Matcher] 56 | cfg.MODEL.SPARSE_INST.MATCHER = CN() 57 | cfg.MODEL.SPARSE_INST.MATCHER.NAME = "SparseInstMatcher" 58 | cfg.MODEL.SPARSE_INST.MATCHER.ALPHA = 0.8 59 | cfg.MODEL.SPARSE_INST.MATCHER.BETA = 0.2 60 | 61 | # [Optimizer] 62 | cfg.SOLVER.OPTIMIZER = "ADAMW" 63 | cfg.SOLVER.BACKBONE_MULTIPLIER = 1.0 64 | cfg.SOLVER.AMSGRAD = False 65 | 66 | # [Dataset mapper] 67 | cfg.MODEL.SPARSE_INST.DATASET_MAPPER = "SparseInstDatasetMapper" 68 | 69 | # [Pyramid Vision 
Transformer] 70 | cfg.MODEL.PVT = CN() 71 | cfg.MODEL.PVT.NAME = "b1" 72 | cfg.MODEL.PVT.OUT_FEATURES = ["p2", "p3", "p4"] 73 | cfg.MODEL.PVT.LINEAR = False 74 | 75 | cfg.MODEL.CSPNET = CN() 76 | cfg.MODEL.CSPNET.NAME = "darknet53" 77 | cfg.MODEL.CSPNET.NORM = "" 78 | # (csp-)darknet: csp1, csp2, csp3, csp4 79 | cfg.MODEL.CSPNET.OUT_FEATURES = ["csp1", "csp2", "csp3", "csp4"] 80 | 81 | -------------------------------------------------------------------------------- /mindspore/sparseinst/encoder.py: -------------------------------------------------------------------------------- 1 | import mindspore 2 | from mindspore import Tensor 3 | import mindspore.nn as nn 4 | from mindspore.nn import Conv2d 5 | import mindspore.ops as ops 6 | 7 | __all__=["InstanceContextEncoder"] 8 | 9 | 10 | class PyramidPoolingModule(nn.Cell): 11 | def __init__(self,in_channels,channels=512,sizes=(1,2,3,6)): 12 | super().__init__() 13 | self.stages=[] 14 | self.stages=nn.CellList([self._make_stage(in_channels,channels,size) for size in sizes]) 15 | self.bottleneck=Conv2d(in_channels+len(sizes)*channels,in_channels,1,has_bias=False) 16 | 17 | def _make_stage(self,features,out_features,size): 18 | prior=nn.AdaptiveAvgPool2d(output_size=(size,size)) 19 | conv=nn.Conv2d(features,out_features,1,has_bias=True) 20 | return nn.SequentialCell(prior,conv) 21 | 22 | def construct(self,feats): 23 | h, w = feats.shape[2], feats.shape[3] 24 | 25 | prior=[ops.ResizeBilinear((h,w))(ops.ReLU()(stage(feats))) for stage in self.stages]+[feats] 26 | out=ops.ReLU()(self.bottleneck(ops.Concat(axis=1)(prior))) 27 | return out 28 | 29 | 30 | 31 | class InstanceContextEncoder(nn.Cell): 32 | def __init__(self,cfg,input_shape): 33 | super().__init__() 34 | self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS #256 35 | self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES #[‘res3','res4','res5'] 36 | # self.norm = cfg.MODEL.SPARSE_INST.ENCODER.NORM 37 | # depthwise = cfg.MODEL.SPARSE_INST.ENCODER.DEPTHWISE 38 | self.in_channels = [input_shape[f] for f in self.in_features] 39 | # self.using_bias = self.norm == "" 40 | fpn_laterals = [] 41 | fpn_outputs = [] 42 | # groups = self.num_channels if depthwise else 1 43 | for in_channel in reversed(self.in_channels): 44 | lateral_conv = nn.Conv2d(in_channel, self.num_channels, 1,has_bias=True) 45 | output_conv = nn.Conv2d(self.num_channels, self.num_channels, 3,has_bias=True) 46 | fpn_laterals.append(lateral_conv) 47 | fpn_outputs.append(output_conv) 48 | self.fpn_laterals = nn.CellList(fpn_laterals) 49 | self.fpn_outputs = nn.CellList(fpn_outputs) 50 | # ppm 51 | self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4) 52 | # final fusion 53 | self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1,has_bias=True) 54 | 55 | 56 | def construct(self, features): #features:dict 57 | features = [features[f] for f in self.in_features] 58 | features = features[::-1] 59 | prev_features = self.ppm(self.fpn_laterals[0](features[0])) 60 | outputs = [self.fpn_outputs[0](prev_features)] 61 | 62 | for feature, lat_conv, output_conv in zip(features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:]): 63 | lat_features = lat_conv(feature) 64 | 65 | h,w=prev_features.shape[2],prev_features.shape[3] 66 | top_down_features = ops.ResizeNearestNeighbor(size=(h*2,w*2))(prev_features)### 67 | 68 | prev_features = lat_features + top_down_features 69 | outputs.insert(0, output_conv(prev_features)) 70 | 71 | size = outputs[0].shape[2:] 72 | features = [outputs[0]] + 
[ops.ResizeBilinear(size)(x) for x in outputs[1:]] 73 | 74 | features = self.fusion(ops.Concat(axis=1)(features)) 75 | return features 76 | -------------------------------------------------------------------------------- /mindspore/sparseinst/resnet.py: -------------------------------------------------------------------------------- 1 | import mindspore 2 | import mindspore.ops as ops 3 | import mindspore.nn as nn 4 | from typing import Type, Union, List, Optional 5 | from mindvision.classification.models import ResidualBlock,ResidualBlockBase 6 | from mindvision.engine.class_factory import ClassFactory, ModuleType 7 | from mindvision.classification.models.blocks import ConvNormActivation 8 | from collections import OrderedDict 9 | 10 | 11 | class ResNet(nn.Cell): 12 | """ 13 | ResNet architecture. 14 | 15 | Args: 16 | block (Type[Union[ResidualBlockBase, ResidualBlock]]): THe block for network. 17 | layer_nums (list): The numbers of block in different layers. 18 | group (int): The number of Group convolutions. Default: 1. 19 | base_width (int): The width of per group. Default: 64. 20 | norm (nn.Cell, optional): The module specifying the normalization layer to use. Default: None. 21 | 22 | Inputs: 23 | - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`. 24 | """ 25 | def __init__(self, 26 | block: Type[Union[ResidualBlockBase, ResidualBlock]], 27 | layer_nums: List[int], 28 | group: int = 1, 29 | base_width: int = 64, 30 | norm: Optional[nn.Cell] = None 31 | ) -> None: 32 | super(ResNet, self).__init__() 33 | self.output_shape={} 34 | if not norm: 35 | norm = nn.BatchNorm2d 36 | self.norm = norm 37 | self.in_channels = 64 38 | self.group = group 39 | self.base_with = base_width 40 | self.stem=OrderedDict() 41 | conv1 = ConvNormActivation( 42 | 3, self.in_channels, kernel_size=7, stride=2, norm=norm) 43 | self.stem['conv1']=conv1 44 | self.stem['maxpool2d']=nn.MaxPool2d(kernel_size=3,stride=2,pad_mode='same') 45 | self.stem=nn.SequentialCell(self.stem) 46 | self.res2 = self._make_layer(block, 64, layer_nums[0]) 47 | self.res3 = self._make_layer(block, 128, layer_nums[1], stride=2) 48 | self.res4 = self._make_layer(block, 256, layer_nums[2], stride=2) 49 | self.res5 = self._make_layer(block, 512, layer_nums[3], stride=2) 50 | 51 | def _make_layer(self, 52 | block: Type[Union[ResidualBlockBase, ResidualBlock]], 53 | channel: int, 54 | block_nums: int, 55 | stride: int = 1 56 | ): 57 | 58 | down_sample = None 59 | 60 | if stride != 1 or self.in_channels != self.in_channels * block.expansion: 61 | down_sample = ConvNormActivation( 62 | self.in_channels, 63 | channel * block.expansion, 64 | kernel_size=1, 65 | stride=stride, 66 | norm=self.norm, 67 | activation=None) 68 | layers = [] 69 | layers.append( 70 | block( 71 | self.in_channels, 72 | channel, 73 | stride=stride, 74 | down_sample=down_sample, 75 | group=self.group, 76 | base_width=self.base_with, 77 | norm=self.norm 78 | ) 79 | ) 80 | self.in_channels = channel * block.expansion 81 | 82 | for _ in range(1, block_nums): 83 | layers.append( 84 | block( 85 | self.in_channels, 86 | channel, 87 | group=self.group, 88 | base_width=self.base_with, 89 | norm=self.norm 90 | ) 91 | ) 92 | 93 | return nn.SequentialCell(layers) 94 | 95 | def output_channel(self): 96 | output_channel={'res3':512,'res4':1024,'res5':2048} 97 | return output_channel 98 | 99 | def construct(self, x): 100 | output={} 101 | x = self.stem(x) 102 | 103 | x = self.res2(x) 104 | x = self.res3(x) 105 | output['res3']=x 106 | x = self.res4(x) 107 | 
output['res4']=x 108 | x = self.res5(x) 109 | output['res5']=x 110 | 111 | return output 112 | 113 | 114 | 115 | def build_resnet50(): 116 | return ResNet(ResidualBlock,[3,4,6,3]) 117 | 118 | -------------------------------------------------------------------------------- /sparseinst/coco_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycocotools.mask as mask_util 3 | from detectron2.structures import BoxMode 4 | from detectron2.evaluation import COCOEvaluator 5 | 6 | 7 | def instances_to_coco_json(instances, img_id): 8 | """ 9 | Dump an "Instances" object to a COCO-format json that's used for evaluation. 10 | 11 | Args: 12 | instances (Instances): 13 | img_id (int): the image id 14 | 15 | Returns: 16 | list[dict]: list of json annotations in COCO format. 17 | """ 18 | num_instance = len(instances) 19 | if num_instance == 0: 20 | return [] 21 | 22 | # NOTE: pure instance segmentation 23 | has_box = instances.has("pred_boxes") 24 | if has_box: 25 | boxes = instances.pred_boxes.tensor.numpy() 26 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 27 | boxes = boxes.tolist() 28 | 29 | scores = instances.scores.tolist() 30 | classes = instances.pred_classes.tolist() 31 | 32 | has_mask = instances.has("pred_masks") 33 | if has_mask: 34 | # use RLE to encode the masks, because they are too large and takes memory 35 | # since this evaluator stores outputs of the entire dataset 36 | rles = [ 37 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 38 | for mask in instances.pred_masks 39 | ] 40 | for rle in rles: 41 | # "counts" is an array encoded by mask_util as a byte-stream. Python3's 42 | # json writer which always produces strings cannot serialize a bytestream 43 | # unless you decode it. Thankfully, utf-8 works out (which is also what 44 | # the pycocotools/_mask.pyx does). 45 | rle["counts"] = rle["counts"].decode("utf-8") 46 | 47 | has_keypoints = instances.has("pred_keypoints") 48 | if has_keypoints: 49 | keypoints = instances.pred_keypoints 50 | 51 | results = [] 52 | for k in range(num_instance): 53 | result = { 54 | "image_id": img_id, 55 | "category_id": classes[k], 56 | "score": scores[k], 57 | } 58 | if has_box: 59 | result["bbox"] = boxes[k] 60 | if has_mask: 61 | result["segmentation"] = rles[k] 62 | if has_keypoints: 63 | # In COCO annotations, 64 | # keypoints coordinates are pixel indices. 65 | # However our predictions are floating point coordinates. 66 | # Therefore we subtract 0.5 to be consistent with the annotation format. 67 | # This is the inverse of data loading logic in `datasets/coco.py`. 68 | keypoints[k][:, :2] -= 0.5 69 | result["keypoints"] = keypoints[k].flatten().tolist() 70 | results.append(result) 71 | return results 72 | 73 | 74 | class COCOMaskEvaluator(COCOEvaluator): 75 | 76 | def process(self, inputs, outputs): 77 | """ 78 | Args: 79 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 80 | It is a list of dict. Each dict corresponds to an image and 81 | contains keys like "height", "width", "file_name", "image_id". 82 | outputs: the outputs of a COCO model. It is a list of dicts with key 83 | "instances" that contains :class:`Instances`. 
84 | """ 85 | for input, output in zip(inputs, outputs): 86 | prediction = {"image_id": input["image_id"]} 87 | 88 | if "instances" in output: 89 | instances = output["instances"].to(self._cpu_device) 90 | prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) 91 | if "proposals" in output: 92 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 93 | if len(prediction) > 1: 94 | self._predictions.append(prediction) -------------------------------------------------------------------------------- /sparseinst/encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill 9 | 10 | from detectron2.utils.registry import Registry 11 | from detectron2.layers import Conv2d 12 | 13 | SPARSE_INST_ENCODER_REGISTRY = Registry("SPARSE_INST_ENCODER") 14 | SPARSE_INST_ENCODER_REGISTRY.__doc__ = "registry for SparseInst decoder" 15 | 16 | 17 | class PyramidPoolingModule(nn.Module): 18 | 19 | def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)): 20 | super().__init__() 21 | self.stages = [] 22 | self.stages = nn.ModuleList( 23 | [self._make_stage(in_channels, channels, size) for size in sizes] 24 | ) 25 | self.bottleneck = Conv2d( 26 | in_channels + len(sizes) * channels, in_channels, 1) 27 | 28 | def _make_stage(self, features, out_features, size): 29 | prior = nn.AdaptiveAvgPool2d(output_size=(size, size)) 30 | conv = Conv2d(features, out_features, 1) 31 | return nn.Sequential(prior, conv) 32 | 33 | def forward(self, feats): 34 | h, w = feats.size(2), feats.size(3) 35 | priors = [F.interpolate(input=F.relu_(stage(feats)), size=( 36 | h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats] 37 | out = F.relu_(self.bottleneck(torch.cat(priors, 1))) 38 | return out 39 | 40 | 41 | 42 | @SPARSE_INST_ENCODER_REGISTRY.register() 43 | class InstanceContextEncoder(nn.Module): 44 | """ 45 | Instance Context Encoder 46 | 1. construct feature pyramids from ResNet 47 | 2. enlarge receptive fields (ppm) 48 | 3. 
multi-scale fusion 49 | """ 50 | 51 | def __init__(self, cfg, input_shape): 52 | super().__init__() 53 | self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS 54 | self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES 55 | self.in_channels = [input_shape[f].channels for f in self.in_features] 56 | fpn_laterals = [] 57 | fpn_outputs = [] 58 | for in_channel in reversed(self.in_channels): 59 | lateral_conv = Conv2d(in_channel, self.num_channels, 1) 60 | output_conv = Conv2d(self.num_channels, self.num_channels, 3, padding=1) 61 | c2_xavier_fill(lateral_conv) 62 | c2_xavier_fill(output_conv) 63 | fpn_laterals.append(lateral_conv) 64 | fpn_outputs.append(output_conv) 65 | self.fpn_laterals = nn.ModuleList(fpn_laterals) 66 | self.fpn_outputs = nn.ModuleList(fpn_outputs) 67 | # ppm 68 | self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4) 69 | # final fusion 70 | self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1) 71 | c2_msra_fill(self.fusion) 72 | 73 | def forward(self, features): 74 | features = [features[f] for f in self.in_features] 75 | features = features[::-1] 76 | prev_features = self.ppm(self.fpn_laterals[0](features[0])) 77 | outputs = [self.fpn_outputs[0](prev_features)] 78 | for feature, lat_conv, output_conv in zip(features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:]): 79 | lat_features = lat_conv(feature) 80 | top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode='nearest') 81 | prev_features = lat_features + top_down_features 82 | outputs.insert(0, output_conv(prev_features)) 83 | size = outputs[0].shape[2:] 84 | features = [ 85 | outputs[0]] + [F.interpolate(x, size, mode='bilinear', align_corners=False) for x in outputs[1:]] 86 | features = self.fusion(torch.cat(features, dim=1)) 87 | return features 88 | 89 | 90 | def build_sparse_inst_encoder(cfg, input_shape): 91 | name = cfg.MODEL.SPARSE_INST.ENCODER.NAME 92 | return SPARSE_INST_ENCODER_REGISTRY.get(name)(cfg, input_shape) 93 | -------------------------------------------------------------------------------- /mindspore/sparseinst/sparseinst.py: -------------------------------------------------------------------------------- 1 | import mindspore 2 | import mindspore.nn as nn 3 | import mindspore.ops as ops 4 | from mindspore import Tensor 5 | import numpy as np 6 | import cv2 7 | 8 | from .resnet import build_resnet50 9 | from .encoder import InstanceContextEncoder 10 | from .decoder import GroupIAMDecoder 11 | 12 | 13 | __all__=["SparseInst"] 14 | 15 | def rescoring_mask(scores, mask_pred, masks): 16 | mask_pred_ = mask_pred.astype('float32') 17 | return scores * ((masks * mask_pred_).sum(axis=(1, 2)) / (mask_pred_.sum(axis=(1, 2)) + 1e-6)) 18 | 19 | class SparseInst(nn.Cell): 20 | def __init__(self,cfg,is_train=False): 21 | super().__init__() 22 | 23 | self.backbone=build_resnet50() 24 | self.encoder=InstanceContextEncoder(cfg,self.backbone.output_channel()) 25 | self.decoder=GroupIAMDecoder(cfg) 26 | 27 | self.pixel_mean=Tensor(cfg.MODEL.PIXEL_MEAN).view((3,1,1)) 28 | self.pixel_std=Tensor(cfg.MODEL.PIXEL_STD).view((3,1,1)) 29 | 30 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD 31 | self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD 32 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS 33 | 34 | self.training=is_train 35 | 36 | def normalizer(self, image): 37 | image=(image-self.pixel_mean)/self.pixel_std 38 | return image 39 | 40 | def padding(self,image,size_divisibility=32,pad_value=0.0): 41 | 
h,w=image.shape[2],image.shape[3] 42 | bottom=(h//size_divisibility+1)*size_divisibility-h 43 | right=(w//size_divisibility+1)*size_divisibility-w 44 | return ops.Pad(((0,0),(0,0),(0,bottom),(0,right)))(image) 45 | 46 | def preprocess_inputs(self,batched_inputs): 47 | images=self.padding(self.normalizer(batched_inputs)) 48 | return images 49 | 50 | def construct(self,batched_inputs): 51 | 52 | #input :Tensor(N,C,H,W) 53 | #output = { 54 | #"pred_logits": pred_logits, 55 | #"pred_masks": pred_masks, 56 | #"pred_scores": pred_scores, 57 | #} 58 | image_sizes=[batched_inputs['image'].shape[2:]] 59 | images=self.preprocess_inputs(batched_inputs['image']) 60 | max_shape=images.shape[2:] 61 | features=self.backbone(images) 62 | features=self.encoder(features) 63 | output=self.decoder(features) 64 | if self.training: 65 | return output 66 | else: 67 | results=self.inference(output,[batched_inputs],max_shape,image_sizes) 68 | processed_results=[{'instances':r} for r in results] 69 | return processed_results 70 | 71 | 72 | def inference(self,output,batched_inputs,max_shape,image_sizes): 73 | results = [] 74 | pred_scores = ops.Sigmoid()(output["pred_logits"]) 75 | pred_masks = ops.Sigmoid()(output["pred_masks"]) 76 | pred_objectness = ops.Sigmoid()(output["pred_scores"]) 77 | pred_scores = ops.Sqrt()(pred_scores * pred_objectness) 78 | for _, (scores_per_image, mask_pred_per_image, batched_input, img_shape) in enumerate(zip(pred_scores, pred_masks, batched_inputs, image_sizes)): 79 | 80 | labels,scores = ops.max(scores_per_image,axis=-1) 81 | keep = scores > self.cls_threshold 82 | scores = ops.masked_select(scores,keep) 83 | labels = ops.masked_select(labels,keep) 84 | n,h,w=mask_pred_per_image.shape 85 | mask_pred_per_image = ops.masked_select(mask_pred_per_image,keep.view(n,1,1)).view(-1,h,w) 86 | result={} 87 | if scores.shape[0]==0: 88 | result['scores']=scores 89 | result['category_id']=labels 90 | results.append(result) 91 | continue 92 | h,w=img_shape 93 | ori_shape=batched_input['image_size'] 94 | scores = rescoring_mask(scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image) 95 | mask_pred_per_image=ops.interpolate(ops.ExpandDims()(mask_pred_per_image,1),sizes=max_shape,mode='bilinear') 96 | mask_pred_per_image=mask_pred_per_image.asnumpy() 97 | mask_pred_per_image=mask_pred_per_image[:,:,:h,:w] 98 | mask_pred_per_image=Tensor(mask_pred_per_image) 99 | 100 | mask_pred_per_image=ops.interpolate(mask_pred_per_image,sizes=ori_shape,mode='bilinear') 101 | mask_pred_per_image=ops.squeeze(mask_pred_per_image,axis=1) 102 | mask_pred=mask_pred_per_image>self.mask_threshold 103 | mask_pred=mask_pred.astype('uint8') 104 | 105 | result['segmentation'] = mask_pred 106 | result['scores'] = scores 107 | result['category_id'] = labels 108 | results.append(result) 109 | 110 | return results -------------------------------------------------------------------------------- /onnx/convert_onnx.py: -------------------------------------------------------------------------------- 1 | import math 2 | import argparse 3 | 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from detectron2.layers import Conv2d 9 | from detectron2.utils.logger import setup_logger 10 | from detectron2.modeling import build_model 11 | from detectron2.checkpoint import DetectionCheckpointer 12 | from detectron2.config import get_cfg 13 | 14 | from sparseinst import add_sparse_inst_config 15 | 16 | 17 | class PyramidPoolingModuleONNX(nn.Module): 18 | 19 | def __init__(self, 
in_channels, channels, input_size, pool_sizes=(1, 2, 3, 6)): 20 | super().__init__() 21 | self.stages = [] 22 | self.stages = nn.ModuleList( 23 | [self._make_stage(in_channels, channels, input_size, pool_size) 24 | for pool_size in pool_sizes] 25 | ) 26 | self.bottleneck = Conv2d( 27 | in_channels + len(pool_sizes) * channels, in_channels, 1) 28 | 29 | def _make_stage(self, features, out_features, input_size, pool_size): 30 | stride_y = math.floor((input_size[0] / pool_size)) 31 | stride_x = math.floor((input_size[1] / pool_size)) 32 | kernel_y = input_size[0] - (pool_size - 1) * stride_y 33 | kernel_x = input_size[1] - (pool_size - 1) * stride_x 34 | prior = nn.AvgPool2d(kernel_size=( 35 | kernel_y, kernel_x), stride=(stride_y, stride_x)) 36 | conv = Conv2d(features, out_features, 1) 37 | return nn.Sequential(prior, conv) 38 | 39 | def forward(self, feats): 40 | h, w = feats.size(2), feats.size(3) 41 | priors = [F.interpolate( 42 | input=F.relu_(stage(feats)), size=(h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats] 43 | out = F.relu_(self.bottleneck(torch.cat(priors, 1))) 44 | return out 45 | 46 | 47 | def main(): 48 | parser = argparse.ArgumentParser( 49 | description="Export model to the onnx format") 50 | parser.add_argument( 51 | "--config-file", 52 | default="configs/sparse_inst_r50_giam.yaml", 53 | metavar="FILE", 54 | help="path to config file", 55 | ) 56 | parser.add_argument('--width', default=640, type=int) 57 | parser.add_argument('--height', default=640, type=int) 58 | parser.add_argument('--level', default=0, type=int) 59 | parser.add_argument( 60 | "--output", 61 | default="output/sparseinst.onnx", 62 | metavar="FILE", 63 | help="path to the output onnx file", 64 | ) 65 | parser.add_argument( 66 | "--opts", 67 | help="Modify config options using the command-line 'KEY VALUE' pairs", 68 | default=[], 69 | nargs=argparse.REMAINDER, 70 | ) 71 | 72 | cfg = get_cfg() 73 | add_sparse_inst_config(cfg) 74 | args = parser.parse_args() 75 | cfg.merge_from_file(args.config_file) 76 | cfg.merge_from_list(args.opts) 77 | 78 | # norm for ONNX: change FrozenBN back to BN 79 | cfg.MODEL.BACKBONE.FREEZE_AT = 0 80 | cfg.MODEL.RESNETS.NORM = "BN" 81 | 82 | cfg.freeze() 83 | 84 | output_dir = cfg.OUTPUT_DIR 85 | logger = setup_logger(output=output_dir) 86 | logger.info(cfg) 87 | 88 | height = args.height 89 | width = args.width 90 | 91 | model = build_model(cfg) 92 | num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS 93 | onnx_ppm = PyramidPoolingModuleONNX( 94 | num_channels, num_channels // 4, (height // 32, width // 32)) 95 | model.encoder.ppm = onnx_ppm 96 | model.to(cfg.MODEL.DEVICE) 97 | logger.info("Model:\n{}".format(model)) 98 | 99 | checkpointer = DetectionCheckpointer(model) 100 | _ = checkpointer.load(cfg.MODEL.WEIGHTS) 101 | logger.info("load Model:\n{}".format(cfg.MODEL.WEIGHTS)) 102 | 103 | input_names = ["input_image"] 104 | dummy_input = torch.zeros((1, 3, height, width)).to(cfg.MODEL.DEVICE) 105 | output_names = ["scores", "masks"] 106 | 107 | model.forward = model.forward_test 108 | 109 | torch.onnx.export( 110 | model, 111 | dummy_input, 112 | args.output, 113 | verbose=True, 114 | input_names=input_names, 115 | output_names=output_names, 116 | keep_initializers_as_inputs=False, 117 | opset_version=12, 118 | ) 119 | 120 | logger.info("Done. 
The onnx model is saved into {}.".format(args.output)) 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /sparseinst/utils.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Optional, List 3 | 4 | import torch 5 | from torch import Tensor 6 | import torch.distributed as dist 7 | import torch.nn.functional as F 8 | import torchvision 9 | 10 | 11 | def _max_by_axis(the_list): 12 | # type: (List[List[int]]) -> List[int] 13 | maxes = the_list[0] 14 | for sublist in the_list[1:]: 15 | for index, item in enumerate(sublist): 16 | maxes[index] = max(maxes[index], item) 17 | return maxes 18 | 19 | 20 | class NestedTensor(object): 21 | def __init__(self, tensors, mask: Optional[Tensor]): 22 | self.tensors = tensors 23 | self.mask = mask 24 | 25 | def to(self, device): 26 | cast_tensor = self.tensors.to(device) 27 | mask = self.mask 28 | if mask is not None: 29 | assert mask is not None 30 | cast_mask = mask.to(device) 31 | else: 32 | cast_mask = None 33 | return NestedTensor(cast_tensor, cast_mask) 34 | 35 | def decompose(self): 36 | return self.tensors, self.mask 37 | 38 | def __repr__(self): 39 | return str(self.tensors) 40 | 41 | # _onnx_nested_tensor_from_tensor_list() is an implementation of 42 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing. 43 | 44 | 45 | @torch.jit.unused 46 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: 47 | max_size = [] 48 | for i in range(tensor_list[0].dim()): 49 | max_size_i = torch.max(torch.stack([img.shape[i] 50 | for img in tensor_list]).to(torch.float32)).to(torch.int64) 51 | max_size.append(max_size_i) 52 | max_size = tuple(max_size) 53 | 54 | # work around for 55 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 56 | # m[: img.shape[1], :img.shape[2]] = False 57 | # which is not yet supported in onnx 58 | padded_imgs = [] 59 | padded_masks = [] 60 | for img in tensor_list: 61 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 62 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 63 | padded_imgs.append(padded_img) 64 | 65 | m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) 66 | padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) 67 | padded_masks.append(padded_mask.to(torch.bool)) 68 | 69 | tensor = torch.stack(padded_imgs) 70 | mask = torch.stack(padded_masks) 71 | 72 | return NestedTensor(tensor, mask=mask) 73 | 74 | 75 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 76 | # TODO make this more general 77 | if tensor_list[0].ndim == 3: 78 | if torchvision._is_tracing(): 79 | # nested_tensor_from_tensor_list() does not export well to ONNX 80 | # call _onnx_nested_tensor_from_tensor_list() instead 81 | return _onnx_nested_tensor_from_tensor_list(tensor_list) 82 | 83 | # TODO make it support different-sized images 84 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 85 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 86 | batch_shape = [len(tensor_list)] + max_size 87 | b, c, h, w = batch_shape 88 | dtype = tensor_list[0].dtype 89 | device = tensor_list[0].device 90 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 91 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 92 | for img, pad_img, m in zip(tensor_list, tensor, mask): 93 
| pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 94 | m[: img.shape[1], :img.shape[2]] = False 95 | else: 96 | raise ValueError('not supported') 97 | return NestedTensor(tensor, mask) 98 | 99 | 100 | def nested_masks_from_list(tensor_list: List[Tensor], input_shape=None): 101 | if tensor_list[0].ndim == 3: 102 | dim_size = sum([img.shape[0] for img in tensor_list]) 103 | if input_shape is None: 104 | max_size = _max_by_axis([list(img.shape[-2:]) for img in tensor_list]) 105 | else: 106 | max_size = [input_shape[0], input_shape[1]] 107 | batch_shape = [dim_size] + max_size 108 | # b, h, w = batch_shape 109 | dtype = tensor_list[0].dtype 110 | device = tensor_list[0].device 111 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 112 | mask = torch.zeros(batch_shape, dtype=torch.bool, device=device) 113 | idx = 0 114 | for img in tensor_list: 115 | c = img.shape[0] 116 | c_ = idx + c 117 | tensor[idx: c_, :img.shape[1], : img.shape[2]].copy_(img) 118 | mask[idx: c_, :img.shape[1], :img.shape[2]] = True 119 | idx = c_ 120 | else: 121 | raise ValueError('not supported') 122 | return NestedTensor(tensor, mask) 123 | 124 | 125 | def is_dist_avail_and_initialized(): 126 | if not dist.is_available(): 127 | return False 128 | if not dist.is_initialized(): 129 | return False 130 | return True 131 | 132 | 133 | def get_world_size(): 134 | if not is_dist_avail_and_initialized(): 135 | return 1 136 | return dist.get_world_size() 137 | 138 | 139 | def aligned_bilinear(tensor, factor): 140 | # borrowed from Adelaidet: https://github1s.com/aim-uofa/AdelaiDet/blob/HEAD/adet/utils/comm.py 141 | assert tensor.dim() == 4 142 | assert factor >= 1 143 | assert int(factor) == factor 144 | 145 | if factor == 1: 146 | return tensor 147 | 148 | h, w = tensor.size()[2:] 149 | tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") 150 | oh = factor * h + 1 151 | ow = factor * w + 1 152 | tensor = F.interpolate( 153 | tensor, size=(oh, ow), 154 | mode='bilinear', 155 | align_corners=True 156 | ) 157 | tensor = F.pad( 158 | tensor, pad=(factor // 2, 0, factor // 2, 0), 159 | mode="replicate" 160 | ) 161 | 162 | return tensor[:, :, :oh - 1, :ow - 1] 163 | -------------------------------------------------------------------------------- /mindspore/test.py: -------------------------------------------------------------------------------- 1 | import mindspore 2 | import argparse 3 | import numpy as np 4 | from sparseinst import SparseInst, cfg 5 | import cv2 6 | import os 7 | import json 8 | from mindspore import Tensor, ops 9 | from pycocotools.coco import COCO 10 | from pycocotools.cocoeval import COCOeval 11 | import tqdm 12 | from dict import id2category 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | # general 17 | #parser.add_argument('--cfg',help='experiment configure file name',required=True,type=str) 18 | parser.add_argument('--checkpoint',help="checkpoint path",type=str) 19 | parser.add_argument('--json_save_path',help="result json scve path",required=False,type=str) 20 | parser.add_argument("--visualize",action="store_true",help="Run or not.") 21 | parser.add_argument('--image_name',help="image to visual",required=False,type=str) 22 | parser.add_argument('--coco_path',help="coco to path",required=False,type=str) 23 | parser.add_argument('--dir_path',help="coco to visual",required=False,type=str) 24 | args = parser.parse_args() 25 | return args 26 | 27 | 28 | def load_net(path): 29 | param_dict = mindspore.load_checkpoint(path) 30 | net = SparseInst(cfg) 
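    # load_checkpoint above returns a name -> Parameter dict; load_param_into_net
    # below copies those weights into the freshly built SparseInst cell in place
    # before it is wrapped in mindspore.Model for prediction.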
31 | mindspore.load_param_into_net(net, param_dict) 32 | model=mindspore.Model(network=net) 33 | return model 34 | 35 | def resize_img(img,short_length=640,long_length=864): 36 | h,w=img.shape[2:] 37 | image_size=(h,w) 38 | 39 | if h>w: 40 | h=int(h/w*short_length) 41 | if h>long_length: 42 | w=int(w/h*long_length) 43 | h=long_length 44 | else: 45 | w=short_length 46 | else: 47 | w=int(w/h*short_length) 48 | if w>long_length: 49 | h=int(h/w*long_length) 50 | w=long_length 51 | else: 52 | h=short_length 53 | img=ops.interpolate(img,sizes=(h,w),mode='bilinear') 54 | return {'image':img,'image_size':image_size} ########## 55 | 56 | def read_img(name): 57 | image=cv2.imread(name) 58 | _image=image.copy() 59 | image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB) 60 | image=Tensor(image).astype('float32') 61 | image=ops.transpose(image,(2,0,1)) 62 | image=ops.expand_dims(image,0) 63 | return image,_image 64 | 65 | class Dataset: 66 | def __init__(self,coco_path,dir_path,short_length=640,long_length=864,visualize=True): 67 | self.short_length=short_length 68 | self.long_length=long_length 69 | self.visualize=visualize 70 | self.coco = COCO(coco_path) 71 | self.ids = list(self.coco.imgs.keys()) 72 | self.dir=dir_path 73 | 74 | def __len__(self): 75 | return len(self.ids) 76 | 77 | def _get_image_path(self, file_name): 78 | images_dir=self.dir 79 | return os.path.join(images_dir, file_name) 80 | 81 | def __getitem__(self,index): 82 | coco=self.coco 83 | img_id=self.ids[index] 84 | file_name=coco.loadImgs(img_id)[0]['file_name'] 85 | file_name=self._get_image_path(file_name) 86 | image,ori_image=read_img(file_name) 87 | image=resize_img(image) 88 | return {'image':image,'ori_image':ori_image,'image_id':self.ids[index]} 89 | 90 | 91 | class Evaluator: 92 | def __init__(self, coco_path): 93 | self.coco = COCO(coco_path) 94 | 95 | def evaluate(self, res_file): 96 | coco_dt = self.coco.loadRes(res_file) 97 | coco_eval = COCOeval(self.coco, coco_dt, "segm") 98 | coco_eval.evaluate() 99 | coco_eval.accumulate() 100 | coco_eval.summarize() 101 | info_str = [] 102 | stats_names = ['AP', 'Ap .5', 'AP .75','AP (M)', 'AP (L)', 'AR', 'AR .5', 'AR .75', 'AR (M)', 'AR (L)'] 103 | for ind, name in enumerate(stats_names): 104 | info_str.append((name, coco_eval.stats[ind])) 105 | return info_str 106 | 107 | 108 | def read_names(path): 109 | files = os.listdir(path) 110 | files=[os.path.join(path,name) for name in files] 111 | return files 112 | 113 | def visualization(masks,image,name,path): 114 | masks=[mask*255 for mask in masks] 115 | h,w=masks[0].shape 116 | path=path+name+'/' 117 | if not os.path.exists(path): 118 | os.mkdir(path) 119 | _=[cv2.imwrite(path+'image_mask'+str(i)+".jpg",((mask.reshape(h,w,1).astype(np.float32)/255.0)*image.astype(np.float32)).astype(np.uint8)) for i,mask in enumerate(masks)] 120 | _=[cv2.imwrite(path+'mask'+str(i)+'.jpg',mask) for i,mask in enumerate(masks)] 121 | 122 | class Runner: 123 | def __init__(self,dataset,model,visualize=True): 124 | self.dataset=dataset 125 | self.model=model 126 | self.visualize=visualize 127 | self.dict=list(id2category.keys()) 128 | def __call__(self,idx): 129 | input=self.dataset[idx] 130 | ori_image=input['ori_image'] 131 | image_id=input['image_id'] 132 | input=input['image'] 133 | output=self.model.predict(input)[0]['instances'] 134 | if 'pred_masks' in output.keys(): 135 | output['segmentation'] = output['segmentation'].asnumpy() 136 | if not self.visualize: 137 | output['']=self.mask2rle(output) 138 | output['scores'] = 
output['scores'].asnumpy().astype(float).tolist()
139 | output['category_id'] = output['category_id'].asnumpy().astype(int).tolist()
140 | output['category_id']=[self.dict[i] for i in output['category_id']]
141 | output['image_id']=int(image_id)
142 | del input
143 | if not self.visualize:
144 | del ori_image
145 | all_pred=[]
146 | for i,mask in enumerate(output['segmentation']):
147 | all_pred.append({'image_id':image_id,'category_id':output['category_id'][i],'segmentation':mask,'score':output['scores'][i]})
148 | del output
149 | return all_pred
150 | else:
151 | output['ori_image']=ori_image
152 | return output
153 | def mask2rle(self,outputs):
154 | masks=outputs['pred_masks']
155 | masks=[mask for mask in masks]
156 | def f(img):
157 | '''
158 | img: numpy array, 1 - mask, 0 - background
159 | Returns run length as a formatted string
160 | '''
161 | pixels = img.T.flatten()
162 | pixels = np.concatenate([[0], pixels, [0]])
163 | runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
164 | runs[1::2] -= runs[::2]
165 | return ' '.join(str(x) for x in runs)
166 | rle=[f(mask) for mask in masks]
167 | return rle
168 |
169 |
170 | def main():
171 | args = parse_args()
172 | mindspore.set_context(mode=mindspore.PYNATIVE_MODE)
173 | model=load_net(args.checkpoint)
174 | if args.visualize:
175 | image,ori_image=read_img(args.image_name)
176 | image=resize_img(image)
177 | dataset=[{'image':image,'ori_image':ori_image,'image_id':args.image_name.split('/')[-1].split('.')[0]}]
178 | else:
179 | dataset=Dataset(args.coco_path,args.dir_path,visualize=args.visualize)
180 | runner=Runner(dataset=dataset,model=model,visualize=args.visualize)
181 | results=[]
182 | for i in range(len(dataset)):
183 | results.append(runner(i))
184 | print(i)
185 | if args.visualize:
186 | _=[visualization(res['pred_masks'],res['ori_image'],res['image_id'],'./') for res in results]
187 | else:
188 | res_file=os.path.join(args.json_save_path, "segment_coco_results.json")
189 | json.dump(results, open(res_file, 'w'))
190 | eva=Evaluator(args.coco_path)
191 | info_str=eva.evaluate(res_file)
192 |
193 |
194 | if __name__=="__main__":
195 | main()
196 |
-------------------------------------------------------------------------------- /demo.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved 2 | import argparse 3 | import glob 4 | import multiprocessing as mp 5 | import os 6 | import time 7 | import cv2 8 | import tqdm 9 | 10 | from detectron2.config import get_cfg 11 | from detectron2.data.detection_utils import read_image 12 | from detectron2.utils.logger import setup_logger 13 | 14 | from sparseinst import VisualizationDemo, add_sparse_inst_config 15 | 16 | 17 | # constants 18 | WINDOW_NAME = "COCO detections" 19 | 20 | 21 | def setup_cfg(args): 22 | # load config from file and command-line arguments 23 | cfg = get_cfg() 24 | add_sparse_inst_config(cfg) 25 | cfg.merge_from_file(args.config_file) 26 | cfg.merge_from_list(args.opts) 27 | # Set score_threshold for builtin models 28 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold 29 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold 30 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold 31 | cfg.freeze() 32 | return cfg 33 | 34 | 35 | def get_parser(): 36 | parser = argparse.ArgumentParser( 37 | description="Detectron2 demo for builtin models") 38 | parser.add_argument( 39 | "--config-file", 40 | default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", 41 | metavar="FILE", 42 | help="path to config file", 43 | ) 44 | parser.add_argument("--webcam", action="store_true", 45 | help="Take inputs from webcam.") 46 | parser.add_argument("--video-input", help="Path to video file.") 47 | parser.add_argument( 48 | "--input", 49 | nargs="+", 50 | help="A list of space separated input images; " 51 | "or a single glob pattern such as 'directory/*.jpg'", 52 | ) 53 | parser.add_argument( 54 | "--output", 55 | help="A file or directory to save output visualizations. " 56 | "If not given, will show output in an OpenCV window.", 57 | ) 58 | 59 | parser.add_argument( 60 | "--confidence-threshold", 61 | type=float, 62 | default=0.5, 63 | help="Minimum score for instance predictions to be shown", 64 | ) 65 | parser.add_argument( 66 | "--opts", 67 | help="Modify config options using the command-line 'KEY VALUE' pairs", 68 | default=[], 69 | nargs=argparse.REMAINDER, 70 | ) 71 | return parser 72 | 73 | 74 | if __name__ == "__main__": 75 | mp.set_start_method("spawn", force=True) 76 | args = get_parser().parse_args() 77 | setup_logger(name="fvcore") 78 | logger = setup_logger() 79 | logger.info("Arguments: " + str(args)) 80 | 81 | cfg = setup_cfg(args) 82 | 83 | demo = VisualizationDemo(cfg) 84 | 85 | if args.input: 86 | if len(args.input) == 1: 87 | args.input = glob.glob(os.path.expanduser(args.input[0])) 88 | assert args.input, "The input path(s) was not found" 89 | for path in tqdm.tqdm(args.input, disable=not args.output): 90 | # use PIL, to be consistent with evaluation 91 | # img = read_image(path, format="BGR") 92 | # OneNet uses RGB input as default 93 | img = read_image(path, format="RGB") 94 | start_time = time.time() 95 | predictions, visualized_output = demo.run_on_image( 96 | img, args.confidence_threshold) 97 | logger.info( 98 | "{}: {} in {:.2f}s".format( 99 | path, 100 | "detected {} instances".format( 101 | len(predictions["instances"])) 102 | if "instances" in predictions 103 | else "finished", 104 | time.time() - start_time, 105 | ) 106 | ) 107 | 108 | if args.output: 109 | if os.path.isdir(args.output): 110 | assert os.path.isdir(args.output), args.output 111 | out_filename = os.path.join( 112 | args.output, os.path.basename(path)) 113 | else: 114 | assert len( 115 | args.output) > 0, "Please specify a directory with 
args.output" 116 | out_filename = args.output 117 | visualized_output.save(out_filename) 118 | else: 119 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 120 | cv2.imshow( 121 | WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) 122 | if cv2.waitKey(0) == 27: 123 | break # esc to quit 124 | elif args.webcam: 125 | assert args.input is None, "Cannot have both --input and --webcam!" 126 | assert args.output is None, "output not yet supported with --webcam!" 127 | cam = cv2.VideoCapture(0) 128 | for vis in tqdm.tqdm(demo.run_on_video(cam, args.confidence_threshold)): 129 | cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) 130 | cv2.imshow(WINDOW_NAME, vis) 131 | if cv2.waitKey(1) == 27: 132 | break # esc to quit 133 | cam.release() 134 | cv2.destroyAllWindows() 135 | elif args.video_input: 136 | video = cv2.VideoCapture(args.video_input) 137 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) 138 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) 139 | frames_per_second = video.get(cv2.CAP_PROP_FPS) 140 | num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) 141 | basename = os.path.basename(args.video_input) 142 | 143 | if args.output: 144 | if os.path.isdir(args.output): 145 | output_fname = os.path.join(args.output, basename) 146 | output_fname = os.path.splitext(output_fname)[0] + ".mkv" 147 | else: 148 | output_fname = args.output 149 | assert not os.path.isfile(output_fname), output_fname 150 | output_file = cv2.VideoWriter( 151 | filename=output_fname, 152 | # some installation of opencv may not support x264 (due to its license), 153 | # you can try other format (e.g. MPEG) 154 | fourcc=cv2.VideoWriter_fourcc(*"x264"), 155 | fps=float(frames_per_second), 156 | frameSize=(width, height), 157 | isColor=True, 158 | ) 159 | assert os.path.isfile(args.video_input) 160 | for vis_frame in tqdm.tqdm(demo.run_on_video(video, args.confidence_threshold), total=num_frames): 161 | if args.output: 162 | output_file.write(vis_frame) 163 | else: 164 | cv2.namedWindow(basename, cv2.WINDOW_NORMAL) 165 | cv2.imshow(basename, vis_frame) 166 | if cv2.waitKey(1) == 27: 167 | break # esc to quit 168 | video.release() 169 | if args.output: 170 | output_file.release() 171 | else: 172 | cv2.destroyAllWindows() 173 | -------------------------------------------------------------------------------- /sparseinst/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import copy 3 | import logging 4 | import numpy as np 5 | import torch 6 | 7 | 8 | from detectron2.data import detection_utils as utils 9 | from detectron2.data import transforms as T 10 | 11 | """ 12 | This file contains the default mapping that's applied to "dataset dicts". 13 | """ 14 | 15 | __all__ = ["SparseInstDatasetMapper"] 16 | 17 | 18 | def build_transform_gen(cfg, is_train): 19 | """ 20 | Create a list of default :class:`Augmentation` from config. 21 | Now it includes resizing and flipping. 
22 | 23 | Returns: 24 | list[Augmentation] 25 | """ 26 | augmentation = [] 27 | 28 | if is_train: 29 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 30 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 31 | sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING 32 | else: 33 | min_size = cfg.INPUT.MIN_SIZE_TEST 34 | max_size = cfg.INPUT.MAX_SIZE_TEST 35 | sample_style = "choice" 36 | if is_train and cfg.INPUT.RANDOM_FLIP != "none": 37 | augmentation.append( 38 | T.RandomFlip( 39 | horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", 40 | vertical=cfg.INPUT.RANDOM_FLIP == "vertical", 41 | ) 42 | ) 43 | if is_train: 44 | # 800,1333, 0.6 45 | # 600, 1000 46 | # aspect ratio fixed 47 | augmentation.append( 48 | T.ResizeShortestEdge(min_size, max_size, sample_style) 49 | ) 50 | return augmentation 51 | 52 | 53 | class SparseInstDatasetMapper: 54 | """ 55 | A callable which takes a dataset dict in Detectron2 Dataset format, 56 | and map it into a format used by the model. 57 | 58 | This is the default callable to be used to map your dataset dict into training data. 59 | You may need to follow it to implement your own one for customized logic, 60 | such as a different way to read or transform images. 61 | See :doc:`/tutorials/data_loading` for details. 62 | 63 | The callable currently does the following: 64 | 65 | 1. Read the image from "file_name" 66 | 2. Applies cropping/geometric transforms to the image and annotations 67 | 3. Prepare data and annotations to Tensor and :class:`Instances` 68 | """ 69 | # @classmethod 70 | 71 | def __init__(self, cfg, is_train: bool = True): 72 | augs = build_transform_gen(cfg, is_train) 73 | self.default_aug = T.AugmentationList(augs) 74 | if cfg.INPUT.CROP.ENABLED and is_train: 75 | crop_gen = [ 76 | T.ResizeShortestEdge([400, 500, 600], sample_style='choice'), 77 | T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) 78 | ] 79 | recompute_boxes = cfg.MODEL.MASK_ON 80 | augs = augs[:-1] + crop_gen + augs[-1:] 81 | self.crop_aug = T.AugmentationList(augs) 82 | else: 83 | self.crop_aug = None 84 | recompute_boxes = False 85 | 86 | # self.augs = augs 87 | self.is_train = is_train 88 | self.image_format = cfg.INPUT.FORMAT 89 | self.use_instance_mask = cfg.MODEL.MASK_ON 90 | self.instance_mask_format = cfg.INPUT.MASK_FORMAT 91 | self.recompute_boxes = recompute_boxes 92 | 93 | logger = logging.getLogger(__name__) 94 | mode = "training" if is_train else "inference" 95 | logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augs}") 96 | 97 | def __call__(self, dataset_dict): 98 | """ 99 | Args: 100 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 101 | 102 | Returns: 103 | dict: a format that builtin models in detectron2 accept 104 | """ 105 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 106 | # USER: Write your own image loading if it's not from a file 107 | image = utils.read_image(dataset_dict["file_name"], format=self.image_format) 108 | utils.check_image_size(dataset_dict, image) 109 | 110 | # USER: Remove if you don't do semantic/panoptic segmentation. 
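To make the crop branch of the mapper's `__init__` easier to follow, here is an illustrative-only reconstruction of the two augmentation pipelines it keeps when `INPUT.CROP.ENABLED` is True; the numeric values are placeholders standing in for the `cfg.INPUT.*` settings, not the repository's defaults.

```python
# Illustrative only: placeholder values stand in for cfg.INPUT.* settings.
from detectron2.data import transforms as T

flip = T.RandomFlip(horizontal=True, vertical=False)
resize = T.ResizeShortestEdge((416, 448, 480, 512, 544, 576, 608, 640), 864, "choice")

default_aug = T.AugmentationList([flip, resize])
crop_aug = T.AugmentationList([
    flip,
    T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
    T.RandomCrop("absolute_range", (384, 600)),   # cfg.INPUT.CROP.TYPE / cfg.INPUT.CROP.SIZE
    resize,                                       # i.e. augs[:-1] + crop_gen + augs[-1:]
])
# __call__ then draws one of the two pipelines with probability 0.5 each.
```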
111 | if "sem_seg_file_name" in dataset_dict: 112 | sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2) 113 | else: 114 | sem_seg_gt = None 115 | 116 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 117 | 118 | if self.crop_aug is None: 119 | transforms = self.default_aug(aug_input) 120 | else: 121 | if np.random.rand() > 0.5: 122 | transforms = self.crop_aug(aug_input) 123 | else: 124 | transforms = self.default_aug(aug_input) 125 | # transforms = self.augmentations(aug_input) 126 | image, sem_seg_gt = aug_input.image, aug_input.sem_seg 127 | 128 | image_shape = image.shape[:2] # h, w 129 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 130 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 131 | # Therefore it's important to use torch.Tensor. 132 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 133 | if sem_seg_gt is not None: 134 | dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) 135 | 136 | if not self.is_train: 137 | # USER: Modify this if you want to keep them for some reason. 138 | dataset_dict.pop("annotations", None) 139 | dataset_dict.pop("sem_seg_file_name", None) 140 | return dataset_dict 141 | 142 | if "annotations" in dataset_dict: 143 | # USER: Modify this if you want to keep them for some reason. 144 | for anno in dataset_dict["annotations"]: 145 | anno.pop("keypoints", None) 146 | if not self.use_instance_mask: 147 | anno.pop("segmentation", None) 148 | 149 | # USER: Implement additional transformations if you have other types of data 150 | annos = [ 151 | utils.transform_instance_annotations( 152 | obj, transforms, image_shape) 153 | for obj in dataset_dict.pop("annotations") 154 | if obj.get("iscrowd", 0) == 0 155 | ] 156 | instances = utils.annotations_to_instances( 157 | annos, image_shape, mask_format=self.instance_mask_format 158 | ) 159 | 160 | # After transforms such as cropping are applied, the bounding box may no longer 161 | # tightly bound the object. As an example, imagine a triangle object 162 | # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight 163 | # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to 164 | # the intersection of original bounding box and the cropping box. 
165 | if self.recompute_boxes: 166 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 167 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 168 | return dataset_dict 169 | -------------------------------------------------------------------------------- /mindspore/sparseinst/decoder.py: -------------------------------------------------------------------------------- 1 | import mindspore 2 | from mindspore import Tensor 3 | import mindspore.nn as nn 4 | import mindspore.ops as ops 5 | from mindspore.nn import Conv2d 6 | 7 | 8 | __all__=["BaseIAMDecoder","GroupIAMDecoder"] 9 | 10 | def _make_stack_3x3_convs(num_convs, in_channels, out_channels): 11 | convs = [] 12 | for _ in range(num_convs): 13 | convs.append( 14 | nn.Conv2d(in_channels, out_channels, 3, has_bias=True)) 15 | convs.append(nn.ReLU()) 16 | in_channels = out_channels 17 | return nn.SequentialCell(*convs) 18 | 19 | 20 | class MaskBranch(nn.Cell): 21 | 22 | def __init__(self, cfg, in_channels): 23 | super().__init__() 24 | dim = cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM#256 25 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS 26 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 27 | self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 28 | self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1,has_bias=True) 29 | 30 | def construct(self, features): 31 | # mask features (x4 convs) 32 | features = self.mask_convs(features) 33 | return self.projection(features) 34 | 35 | 36 | 37 | class InstanceBranch(nn.Cell): 38 | 39 | def __init__(self, cfg, in_channels): 40 | super().__init__() 41 | # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM 42 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 43 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 44 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 45 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 46 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 47 | 48 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 49 | # iam prediction, a simple conv 50 | self.iam_conv = nn.Conv2d(dim, num_masks, 3, has_bias=True) 51 | 52 | # outputs 53 | self.cls_score = nn.Dense(dim, self.num_classes) 54 | self.mask_kernel = nn.Dense(dim, kernel_dim) 55 | self.objectness = nn.Dense(dim, 1) 56 | 57 | 58 | def construct(self, features): 59 | # instance features (x4 convs) 60 | features = self.inst_convs(features) 61 | # predict instance activation maps 62 | iam = self.iam_conv(features) 63 | iam_prob = ops.Sigmoid()(iam) 64 | 65 | B, N = iam_prob.shape[:2] 66 | C = features.shape[1] 67 | # BxNxHxW -> BxNx(HW) 68 | iam_prob = iam_prob.view((B, N, -1)) 69 | # aggregate features: BxCxHxW -> Bx(HW)xC 70 | inst_features=ops.BatchMatMul(transpose_b=True)(iam_prob,features.view((B, C, -1))) 71 | normalizer = ops.clip_by_value(iam_prob.sum(-1),clip_value_min=Tensor(1e-6,mindspore.float32)) 72 | inst_features = inst_features / normalizer[:, :, None] 73 | # predict classification & segmentation kernel & objectness 74 | pred_logits = self.cls_score(inst_features) 75 | pred_kernel = self.mask_kernel(inst_features) 76 | pred_scores = self.objectness(inst_features) 77 | return pred_logits, pred_kernel, pred_scores, iam 78 | 79 | 80 | class BaseIAMDecoder(nn.Cell): 81 | 82 | def __init__(self, cfg): 83 | super().__init__() 84 | # add 2 for coordinates 85 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 86 | 87 | self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR 88 | self.output_iam = 
cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM 89 | 90 | self.resize=nn.ResizeBilinear() 91 | 92 | self.inst_branch = InstanceBranch(cfg, in_channels) 93 | self.mask_branch = MaskBranch(cfg, in_channels) 94 | 95 | 96 | def compute_coordinates(self, x): 97 | h, w = x.shape[2], x.shape[3] 98 | start=Tensor(-1,mindspore.float32) 99 | stop=Tensor(1,mindspore.float32) 100 | y_loc = ops.linspace(start,stop, h) 101 | x_loc = ops.linspace(start,stop, w) 102 | y_loc, x_loc = ops.meshgrid((y_loc, x_loc),indexing='ij') 103 | y_loc=ops.broadcast_to(y_loc,(x.shape[0],1,-1,-1)) 104 | x_loc=ops.broadcast_to(x_loc,(x.shape[0],1,-1,-1)) 105 | locations=ops.concat((x_loc,y_loc),axis=1) 106 | return locations.astype('float32') 107 | 108 | def construct(self, features): 109 | coord_features = self.compute_coordinates(features) 110 | features=ops.concat((coord_features,features),axis=1) 111 | pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features) 112 | mask_features = self.mask_branch(features) 113 | 114 | N = pred_kernel.shape[1] 115 | # mask_features: BxCxHxW 116 | B, C, H, W = mask_features.shape 117 | pred_masks=ops.BatchMatMul()(pred_kernel,mask_features.view((B,C,H*W))).view((B,N,H,W)) 118 | 119 | 120 | pred_masks=self.resize(pred_masks,scale_factor=self.scale_factor) 121 | output = { 122 | "pred_logits": pred_logits, 123 | "pred_masks": pred_masks, 124 | "pred_scores": pred_scores, 125 | } 126 | 127 | if self.output_iam: 128 | iam=self.resize(iam,scale_factor=self.scale_factor) 129 | output['pred_iam'] = iam 130 | 131 | return output 132 | 133 | 134 | 135 | class GroupInstanceBranch(nn.Cell): 136 | 137 | def __init__(self, cfg, in_channels): 138 | super().__init__() 139 | # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM 140 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 141 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 142 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 143 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 144 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 145 | self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS 146 | 147 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 148 | # iam prediction, a simple conv 149 | expand_dim = dim * self.num_groups 150 | self.iam_conv = nn.Conv2d(dim, num_masks * self.num_groups, 3, group=self.num_groups,has_bias=True) 151 | 152 | # outputs 153 | self.fc = nn.Dense(expand_dim, expand_dim) 154 | self.cls_score = nn.Dense(expand_dim, self.num_classes) 155 | self.mask_kernel = nn.Dense(expand_dim, kernel_dim) 156 | self.objectness = nn.Dense(expand_dim, 1) 157 | 158 | 159 | def construct(self, features): 160 | # instance features (x4 convs) 161 | features = self.inst_convs(features) 162 | # predict instance activation maps 163 | iam = self.iam_conv(features) 164 | iam_prob = ops.Sigmoid()(iam) 165 | 166 | B, N = iam_prob.shape[:2] 167 | C = features.shape[1] 168 | # BxNxHxW -> BxNx(HW) 169 | iam_prob = iam_prob.view((B, N, -1)) 170 | # aggregate features: BxCxHxW -> Bx(HW)xC 171 | inst_features=ops.BatchMatMul(transpose_b=True)(iam_prob,features.view((B, C, -1))) 172 | normalizer = ops.clip_by_value(iam_prob.sum(-1),clip_value_min=Tensor(1e-6,mindspore.float32)) 173 | inst_features = inst_features / normalizer[:, :, None] 174 | 175 | inst_features=ops.reshape(ops.Transpose()(ops.reshape(inst_features,(B,4,N//4,-1)),(0,2,1,3)),(B,N//4,-1)) 176 | inst_features=ops.ReLU()(self.fc(inst_features)) 177 | # predict classification & segmentation kernel & objectness 178 | pred_logits = 
self.cls_score(inst_features) 179 | pred_kernel = self.mask_kernel(inst_features) 180 | pred_scores = self.objectness(inst_features) 181 | return pred_logits, pred_kernel, pred_scores, iam 182 | 183 | 184 | 185 | class GroupIAMDecoder(BaseIAMDecoder): 186 | def __init__(self, cfg): 187 | super().__init__(cfg) 188 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 189 | self.inst_branch = GroupInstanceBranch(cfg, in_channels) 190 | 191 | 192 | -------------------------------------------------------------------------------- /tools/get_flops.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import typing 3 | from typing import Dict, List, Counter, Any 4 | import logging 5 | import numpy as np 6 | from collections import Counter 7 | import tqdm 8 | from fvcore.nn import flop_count_table # can also try flop_count_str 9 | from fvcore.nn.jit_handles import conv_flop_jit, Handle, get_shape, conv_flop_count 10 | from detectron2.checkpoint import DetectionCheckpointer 11 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate 12 | from detectron2.data import build_detection_test_loader 13 | from detectron2.engine import default_argument_parser 14 | from detectron2.modeling import build_model 15 | from detectron2.utils.analysis import ( 16 | FlopCountAnalysis, 17 | activation_count_operators, 18 | parameter_count_table, 19 | TracingAdapter 20 | ) 21 | 22 | from detectron2.utils.logger import setup_logger 23 | sys.path.append(".") 24 | from sparseinst import add_sparse_inst_config 25 | 26 | logger = logging.getLogger("detectron2") 27 | 28 | 29 | def dconv_flop_jit(inputs: List[Any], outputs: List[Any]) -> typing.Counter[str]: 30 | """ 31 | Count flops for convolution. 32 | """ 33 | # Inputs of Convolution should be a list of length 12 or 13. They represent: 34 | # 0) input tensor, 1) convolution filter, 2) bias, 3) stride, 4) padding, 35 | # 5) dilation, 6) transposed, 7) out_pad, 8) groups, 9) benchmark_cudnn, 36 | # 10) deterministic_cudnn and 11) user_enabled_cudnn. 37 | # starting with #40737 it will be 12) user_enabled_tf32 38 | # assert len(inputs) == 12 or len(inputs) == 13, len(inputs) 39 | x, _, w = inputs[:3] 40 | x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) 41 | return Counter({"conv": conv_flop_count(x_shape, w_shape, out_shape)}) 42 | 43 | 44 | _NEW_SUPPORTED_OPS: Dict[str, Handle] = { 45 | "prim::PythonOp._DeformConv": dconv_flop_jit, 46 | } 47 | 48 | 49 | class MyFlopCountAnalysis(FlopCountAnalysis): 50 | """ 51 | Same as :class:`fvcore.nn.FlopCountAnalysis`, but supports detectron2 models. 52 | """ 53 | 54 | def __init__(self, model, inputs): 55 | """ 56 | Args: 57 | model (nn.Module): 58 | inputs (Any): inputs of the given model. Does not have to be tuple of tensors. 
59 | """ 60 | wrapper = TracingAdapter(model, inputs, allow_non_tensor=True) 61 | super().__init__(wrapper, wrapper.flattened_inputs) 62 | self.set_op_handle(**_NEW_SUPPORTED_OPS) 63 | 64 | 65 | def setup(args): 66 | if args.config_file.endswith(".yaml"): 67 | cfg = get_cfg() 68 | add_sparse_inst_config(cfg) 69 | cfg.merge_from_file(args.config_file) 70 | print(cfg.MODEL.WEIGHTS) 71 | cfg.DATALOADER.NUM_WORKERS = 0 72 | cfg.merge_from_list(args.opts) 73 | cfg.freeze() 74 | else: 75 | cfg = LazyConfig.load(args.config_file) 76 | cfg = LazyConfig.apply_overrides(cfg, args.opts) 77 | setup_logger(name="fvcore") 78 | setup_logger() 79 | return cfg 80 | 81 | 82 | def do_flop(cfg): 83 | if isinstance(cfg, CfgNode): 84 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 85 | model = build_model(cfg) 86 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 87 | else: 88 | data_loader = instantiate(cfg.dataloader.test) 89 | model = instantiate(cfg.model) 90 | model.to(cfg.train.device) 91 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 92 | model.eval() 93 | 94 | counts = Counter() 95 | total_flops = [] 96 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 97 | flops = MyFlopCountAnalysis(model, data) 98 | if idx > 0: 99 | flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False) 100 | counts += flops.by_operator() 101 | total_flops.append(flops.total()) 102 | # print(flops.unsupported_ops()) 103 | 104 | logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) 105 | logger.info( 106 | "Average GFlops for each type of operators:\n" 107 | + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) 108 | ) 109 | logger.info( 110 | "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) 111 | ) 112 | 113 | 114 | def do_activation(cfg): 115 | if isinstance(cfg, CfgNode): 116 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 117 | model = build_model(cfg) 118 | DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) 119 | else: 120 | data_loader = instantiate(cfg.dataloader.test) 121 | model = instantiate(cfg.model) 122 | model.to(cfg.train.device) 123 | DetectionCheckpointer(model).load(cfg.train.init_checkpoint) 124 | model.eval() 125 | 126 | counts = Counter() 127 | total_activations = [] 128 | for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa 129 | count = activation_count_operators(model, data) 130 | counts += count 131 | total_activations.append(sum(count.values())) 132 | logger.info( 133 | "(Million) Activations for Each Type of Operators:\n" 134 | + str([(k, v / idx) for k, v in counts.items()]) 135 | ) 136 | logger.info( 137 | "Total (Million) Activations: {}±{}".format( 138 | np.mean(total_activations), np.std(total_activations) 139 | ) 140 | ) 141 | 142 | 143 | def do_parameter(cfg): 144 | if isinstance(cfg, CfgNode): 145 | model = build_model(cfg) 146 | else: 147 | model = instantiate(cfg.model) 148 | logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) 149 | 150 | 151 | def do_structure(cfg): 152 | if isinstance(cfg, CfgNode): 153 | model = build_model(cfg) 154 | else: 155 | model = instantiate(cfg.model) 156 | logger.info("Model Structure:\n" + str(model)) 157 | 158 | 159 | if __name__ == "__main__": 160 | parser = default_argument_parser( 161 | epilog=""" 162 | Examples: 163 | 164 | To show parameters of a model: 165 | $ ./analyze_model.py --tasks parameter \\ 166 | --config-file 
../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml 167 | 168 | Flops and activations are data-dependent, therefore inputs and model weights 169 | are needed to count them: 170 | 171 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\ 172 | --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ 173 | MODEL.WEIGHTS /path/to/model.pkl 174 | """ 175 | ) 176 | parser.add_argument( 177 | "--tasks", 178 | choices=["flop", "activation", "parameter", "structure"], 179 | required=True, 180 | nargs="+", 181 | ) 182 | parser.add_argument( 183 | "-n", 184 | "--num-inputs", 185 | default=100, 186 | type=int, 187 | help="number of inputs used to compute statistics for flops/activations, " 188 | "both are data dependent.", 189 | ) 190 | args = parser.parse_args() 191 | assert not args.eval_only 192 | assert args.num_gpus == 1 193 | cfg = setup(args) 194 | 195 | for task in args.tasks: 196 | { 197 | "flop": do_flop, 198 | "activation": do_activation, 199 | "parameter": do_parameter, 200 | "structure": do_structure, 201 | }[task](cfg) 202 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.cuda.amp import autocast 9 | 10 | from detectron2.config import get_cfg 11 | from detectron2.modeling import build_backbone 12 | from detectron2.checkpoint import DetectionCheckpointer 13 | from detectron2.structures import ImageList, Instances, BitMasks 14 | from detectron2.engine import default_argument_parser, default_setup 15 | from detectron2.data import build_detection_test_loader 16 | from detectron2.evaluation import COCOEvaluator, print_csv_format 17 | 18 | sys.path.append(".") 19 | from sparseinst import build_sparse_inst_encoder, build_sparse_inst_decoder, add_sparse_inst_config 20 | from sparseinst import COCOMaskEvaluator 21 | 22 | 23 | device = torch.device('cuda:0') 24 | dtype = torch.float32 25 | 26 | __all__ = ["SparseInst"] 27 | 28 | pixel_mean = torch.Tensor([123.675, 116.280, 103.530]).to(device).view(3, 1, 1) 29 | pixel_std = torch.Tensor([58.395, 57.120, 57.375]).to(device).view(3, 1, 1) 30 | 31 | 32 | @torch.jit.script 33 | def normalizer(x, mean, std): return (x - mean) / std 34 | 35 | 36 | def synchronize(): 37 | torch.cuda.synchronize() 38 | 39 | 40 | def process_batched_inputs(batched_inputs): 41 | images = [x["image"].to(device) for x in batched_inputs] 42 | images = [normalizer(x, pixel_mean, pixel_std) for x in images] 43 | images = ImageList.from_tensors(images, 32) 44 | ori_size = (batched_inputs[0]["height"], batched_inputs[0]["width"]) 45 | return images.tensor, images.image_sizes[0], ori_size 46 | 47 | 48 | @torch.jit.script 49 | def rescoring_mask(scores, mask_pred, masks): 50 | mask_pred_ = mask_pred.float() 51 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6)) 52 | 53 | 54 | class SparseInst(nn.Module): 55 | 56 | def __init__(self, cfg): 57 | 58 | super().__init__() 59 | 60 | self.device = torch.device(cfg.MODEL.DEVICE) 61 | # backbone 62 | self.backbone = build_backbone(cfg) 63 | self.size_divisibility = self.backbone.size_divisibility 64 | 65 | output_shape = self.backbone.output_shape() 66 | 67 | self.encoder = build_sparse_inst_encoder(cfg, output_shape) 68 | self.decoder = build_sparse_inst_decoder(cfg) 69 | 70 | 
self.to(self.device) 71 | 72 | # inference 73 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD 74 | self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD 75 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS 76 | self.mask_format = cfg.INPUT.MASK_FORMAT 77 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 78 | 79 | def forward(self, image, resized_size, ori_size): 80 | max_size = image.shape[2:] 81 | features = self.backbone(image) 82 | features = self.encoder(features) 83 | output = self.decoder(features) 84 | result = self.inference_single( 85 | output, resized_size, max_size, ori_size) 86 | return result 87 | 88 | def inference_single(self, outputs, img_shape, pad_shape, ori_shape): 89 | """ 90 | inference for only one sample 91 | Args: 92 | scores (tensor): [NxC] 93 | masks (tensor): [NxHxW] 94 | img_shape (list): (h1, w1), image after resized 95 | pad_shape (list): (h2, w2), padded resized image 96 | ori_shape (list): (h3, w3), original shape h3*w3 < h1*w1 < h2*w2 97 | """ 98 | result = Instances(ori_shape) 99 | # scoring 100 | pred_logits = outputs["pred_logits"][0].sigmoid() 101 | pred_scores = outputs["pred_scores"][0].sigmoid().squeeze() 102 | pred_masks = outputs["pred_masks"][0].sigmoid() 103 | # obtain scores 104 | scores, labels = pred_logits.max(dim=-1) 105 | # remove by thresholding 106 | keep = scores > self.cls_threshold 107 | scores = torch.sqrt(scores[keep] * pred_scores[keep]) 108 | labels = labels[keep] 109 | pred_masks = pred_masks[keep] 110 | 111 | if scores.size(0) == 0: 112 | return None 113 | scores = rescoring_mask(scores, pred_masks > 0.45, pred_masks) 114 | h, w = img_shape 115 | # resize masks 116 | pred_masks = F.interpolate(pred_masks.unsqueeze(1), size=pad_shape, 117 | mode="bilinear", align_corners=False)[:, :, :h, :w] 118 | pred_masks = F.interpolate(pred_masks, size=ori_shape, mode='bilinear', 119 | align_corners=False).squeeze(1) 120 | mask_pred = pred_masks > self.mask_threshold 121 | 122 | mask_pred = BitMasks(mask_pred) 123 | result.pred_masks = mask_pred 124 | result.scores = scores 125 | result.pred_classes = labels 126 | return result 127 | 128 | 129 | def test_sparseinst_speed(cfg, fp16=False): 130 | device = torch.device('cuda:0') 131 | 132 | model = SparseInst(cfg) 133 | model.eval() 134 | model.to(device) 135 | print(model) 136 | size = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) 137 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 138 | cfg.MODEL.WEIGHTS, resume=False) 139 | 140 | torch.backends.cudnn.enable = True 141 | torch.backends.cudnn.benchmark = False 142 | 143 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 144 | 145 | evaluator = COCOMaskEvaluator( 146 | cfg.DATASETS.TEST[0], ("segm",), False, output_folder) 147 | evaluator.reset() 148 | model.to(device) 149 | model.eval() 150 | data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) 151 | durations = [] 152 | 153 | with autocast(enabled=fp16): 154 | with torch.no_grad(): 155 | for idx, inputs in enumerate(data_loader): 156 | images, resized_size, ori_size = process_batched_inputs(inputs) 157 | synchronize() 158 | start_time = time.perf_counter() 159 | output = model(images, resized_size, ori_size) 160 | synchronize() 161 | end = time.perf_counter() - start_time 162 | 163 | durations.append(end) 164 | if idx % 1000 == 0: 165 | print("process: [{}/{}] fps: {:.3f}".format(idx, 166 | len(data_loader), 1/np.mean(durations[100:]))) 167 | evaluator.process(inputs, [{"instances": output}]) 168 | # 
evaluate 169 | results = evaluator.evaluate() 170 | print_csv_format(results) 171 | 172 | latency = np.mean(durations[100:]) 173 | fps = 1 / latency 174 | print("speed: {:.4f}s FPS: {:.2f}".format(latency, fps)) 175 | 176 | 177 | def setup(args): 178 | """ 179 | Create configs and perform basic setups. 180 | """ 181 | cfg = get_cfg() 182 | add_sparse_inst_config(cfg) 183 | cfg.merge_from_file(args.config_file) 184 | cfg.merge_from_list(args.opts) 185 | cfg.freeze() 186 | default_setup(cfg, args) 187 | return cfg 188 | 189 | 190 | if __name__ == '__main__': 191 | 192 | args = default_argument_parser() 193 | args.add_argument("--fp16", action="store_true", 194 | help="support fp16 for inference") 195 | args = args.parse_args() 196 | print("Command Line Args:", args) 197 | cfg = setup(args) 198 | test_sparseinst_speed(cfg, fp16=args.fp16) 199 | -------------------------------------------------------------------------------- /sparseinst/sparseinst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from detectron2.modeling import build_backbone 8 | from detectron2.structures import ImageList, Instances, BitMasks 9 | from detectron2.modeling import META_ARCH_REGISTRY, build_backbone 10 | 11 | from .encoder import build_sparse_inst_encoder 12 | from .decoder import build_sparse_inst_decoder 13 | from .loss import build_sparse_inst_criterion 14 | from .utils import nested_tensor_from_tensor_list 15 | 16 | __all__ = ["SparseInst"] 17 | 18 | 19 | @torch.jit.script 20 | def rescoring_mask(scores, mask_pred, masks): 21 | mask_pred_ = mask_pred.float() 22 | return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6)) 23 | 24 | 25 | @META_ARCH_REGISTRY.register() 26 | class SparseInst(nn.Module): 27 | 28 | def __init__(self, cfg): 29 | super().__init__() 30 | 31 | # move to target device 32 | self.device = torch.device(cfg.MODEL.DEVICE) 33 | 34 | # backbone 35 | self.backbone = build_backbone(cfg) 36 | self.size_divisibility = self.backbone.size_divisibility 37 | output_shape = self.backbone.output_shape() 38 | 39 | # encoder & decoder 40 | self.encoder = build_sparse_inst_encoder(cfg, output_shape) 41 | self.decoder = build_sparse_inst_decoder(cfg) 42 | 43 | # matcher & loss (matcher is built in loss) 44 | self.criterion = build_sparse_inst_criterion(cfg) 45 | 46 | # data and preprocessing 47 | self.mask_format = cfg.INPUT.MASK_FORMAT 48 | 49 | self.pixel_mean = torch.Tensor( 50 | cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 51 | self.pixel_std = torch.Tensor( 52 | cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 53 | # self.normalizer = lambda x: (x - pixel_mean) / pixel_std 54 | 55 | # inference 56 | self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD 57 | self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD 58 | self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS 59 | 60 | def normalizer(self, image): 61 | image = (image - self.pixel_mean) / self.pixel_std 62 | return image 63 | 64 | def preprocess_inputs(self, batched_inputs): 65 | images = [x["image"].to(self.device) for x in batched_inputs] 66 | images = [self.normalizer(x) for x in images] 67 | images = ImageList.from_tensors(images, 32) 68 | return images 69 | 70 | def prepare_targets(self, targets): 71 | new_targets = [] 72 | for targets_per_image in targets: 73 | target = {} 74 | gt_classes = 
targets_per_image.gt_classes 75 | target["labels"] = gt_classes.to(self.device) 76 | h, w = targets_per_image.image_size 77 | if not targets_per_image.has('gt_masks'): 78 | gt_masks = BitMasks(torch.empty(0, h, w)) 79 | else: 80 | gt_masks = targets_per_image.gt_masks 81 | if self.mask_format == "polygon": 82 | if len(gt_masks.polygons) == 0: 83 | gt_masks = BitMasks(torch.empty(0, h, w)) 84 | else: 85 | gt_masks = BitMasks.from_polygon_masks( 86 | gt_masks.polygons, h, w) 87 | 88 | target["masks"] = gt_masks.to(self.device) 89 | new_targets.append(target) 90 | 91 | return new_targets 92 | 93 | def forward(self, batched_inputs): 94 | images = self.preprocess_inputs(batched_inputs) 95 | if isinstance(images, (list, torch.Tensor)): 96 | images = nested_tensor_from_tensor_list(images) 97 | max_shape = images.tensor.shape[2:] 98 | # forward 99 | features = self.backbone(images.tensor) 100 | features = self.encoder(features) 101 | output = self.decoder(features) 102 | 103 | if self.training: 104 | gt_instances = [x["instances"].to( 105 | self.device) for x in batched_inputs] 106 | targets = self.prepare_targets(gt_instances) 107 | losses = self.criterion(output, targets, max_shape) 108 | return losses 109 | else: 110 | results = self.inference( 111 | output, batched_inputs, max_shape, images.image_sizes) 112 | processed_results = [{"instances": r} for r in results] 113 | return processed_results 114 | 115 | def forward_test(self, images): 116 | # for inference, onnx, tensorrt 117 | # input images: BxCxHxW, fixed, need padding size 118 | # normalize 119 | images = (images - self.pixel_mean[None]) / self.pixel_std[None] 120 | features = self.backbone(images) 121 | features = self.encoder(features) 122 | output = self.decoder(features) 123 | 124 | pred_scores = output["pred_logits"].sigmoid() 125 | pred_masks = output["pred_masks"].sigmoid() 126 | pred_objectness = output["pred_scores"].sigmoid() 127 | pred_scores = torch.sqrt(pred_scores * pred_objectness) 128 | pred_masks = F.interpolate( 129 | pred_masks, scale_factor=4.0, mode="bilinear", align_corners=False) 130 | return pred_scores, pred_masks 131 | 132 | def inference(self, output, batched_inputs, max_shape, image_sizes): 133 | # max_detections = self.max_detections 134 | results = [] 135 | pred_scores = output["pred_logits"].sigmoid() 136 | pred_masks = output["pred_masks"].sigmoid() 137 | pred_objectness = output["pred_scores"].sigmoid() 138 | pred_scores = torch.sqrt(pred_scores * pred_objectness) 139 | 140 | for _, (scores_per_image, mask_pred_per_image, batched_input, img_shape) in enumerate(zip( 141 | pred_scores, pred_masks, batched_inputs, image_sizes)): 142 | 143 | ori_shape = (batched_input["height"], batched_input["width"]) 144 | result = Instances(ori_shape) 145 | # max/argmax 146 | scores, labels = scores_per_image.max(dim=-1) 147 | # cls threshold 148 | keep = scores > self.cls_threshold 149 | scores = scores[keep] 150 | labels = labels[keep] 151 | mask_pred_per_image = mask_pred_per_image[keep] 152 | 153 | if scores.size(0) == 0: 154 | result.scores = scores 155 | result.pred_classes = labels 156 | results.append(result) 157 | continue 158 | 159 | h, w = img_shape 160 | # rescoring mask using maskness 161 | scores = rescoring_mask( 162 | scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image) 163 | 164 | # upsample the masks to the original resolution: 165 | # (1) upsampling the masks to the padded inputs, remove the padding area 166 | # (2) upsampling/downsampling the masks to the original sizes 167 | 
mask_pred_per_image = F.interpolate( 168 | mask_pred_per_image.unsqueeze(1), size=max_shape, mode="bilinear", align_corners=False)[:, :, :h, :w] 169 | mask_pred_per_image = F.interpolate( 170 | mask_pred_per_image, size=ori_shape, mode='bilinear', align_corners=False).squeeze(1) 171 | 172 | mask_pred = mask_pred_per_image > self.mask_threshold 173 | # fix the bug for visualization 174 | # mask_pred = BitMasks(mask_pred) 175 | 176 | # using Detectron2 Instances to store the final results 177 | result.pred_masks = mask_pred 178 | result.scores = scores 179 | result.pred_classes = labels 180 | results.append(result) 181 | 182 | return results 183 | -------------------------------------------------------------------------------- /tools/train_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import itertools 4 | from typing import Any, Dict, List, Set 5 | import torch 6 | 7 | import detectron2.utils.comm as comm 8 | from detectron2.checkpoint import DetectionCheckpointer 9 | from detectron2.config import get_cfg 10 | from detectron2.utils.logger import setup_logger 11 | from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetMapper 12 | from detectron2.engine import AutogradProfiler, DefaultTrainer, default_argument_parser, default_setup, launch 13 | from detectron2.evaluation import COCOEvaluator, verify_results 14 | from detectron2.solver.build import maybe_add_gradient_clipping 15 | from detectron2.evaluation import ( 16 | CityscapesInstanceEvaluator, 17 | CityscapesSemSegEvaluator, 18 | COCOEvaluator, 19 | COCOPanopticEvaluator, 20 | DatasetEvaluators, 21 | LVISEvaluator, 22 | PascalVOCDetectionEvaluator, 23 | SemSegEvaluator, 24 | verify_results, 25 | ) 26 | 27 | sys.path.append(".") 28 | from sparseinst import add_sparse_inst_config, COCOMaskEvaluator 29 | 30 | 31 | class Trainer(DefaultTrainer): 32 | 33 | @classmethod 34 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 35 | """ 36 | Create evaluator(s) for a given dataset. 37 | This uses the special metadata "evaluator_type" associated with each builtin dataset. 38 | For your own dataset, you can simply create an evaluator manually in your 39 | script and do not have to worry about the hacky if-else logic here. 40 | """ 41 | if output_folder is None: 42 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 43 | evaluator_list = [] 44 | evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type 45 | if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: 46 | evaluator_list.append( 47 | SemSegEvaluator( 48 | dataset_name, 49 | distributed=True, 50 | num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 51 | ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 52 | output_dir=output_folder, 53 | ) 54 | ) 55 | if evaluator_type in ["coco", "coco_panoptic_seg"]: 56 | evaluator_list.append(COCOMaskEvaluator(dataset_name, ("segm", ), True, output_folder)) 57 | if evaluator_type == "coco_panoptic_seg": 58 | evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) 59 | if evaluator_type == "cityscapes_instance": 60 | assert ( 61 | torch.cuda.device_count() >= comm.get_rank() 62 | ), "CityscapesEvaluator currently do not work with multiple machines." 63 | return CityscapesInstanceEvaluator(dataset_name) 64 | if evaluator_type == "cityscapes_sem_seg": 65 | assert ( 66 | torch.cuda.device_count() >= comm.get_rank() 67 | ), "CityscapesEvaluator currently do not work with multiple machines." 
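A small self-contained illustration of the rescoring step used at inference time in sparseinst/sparseinst.py (and tools/test_net.py): each class score is scaled by the mean mask probability inside the binarized mask, which down-weights instances whose soft masks are diffuse. The numbers below are made up for the example, and `rescoring_mask` is adapted (without the `torch.jit.script` decorator) from the code above.

```python
# Standalone illustration with made-up numbers.
import torch

def rescoring_mask(scores, mask_pred, masks):
    mask_pred_ = mask_pred.float()
    return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6))

soft_masks = torch.tensor([[[0.90, 0.80],
                            [0.10, 0.05]]])        # one instance, 2x2 soft mask
scores = torch.tensor([0.70])                      # its classification score
binary = soft_masks > 0.45                         # binarize with a fixed threshold
print(rescoring_mask(scores, binary, soft_masks))  # 0.70 * (0.90 + 0.80) / 2 ≈ 0.595
```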
68 | return CityscapesSemSegEvaluator(dataset_name) 69 | elif evaluator_type == "pascal_voc": 70 | return PascalVOCDetectionEvaluator(dataset_name) 71 | elif evaluator_type == "lvis": 72 | return LVISEvaluator(dataset_name, cfg, True, output_folder) 73 | if len(evaluator_list) == 0: 74 | raise NotImplementedError( 75 | "no Evaluator for the dataset {} with the type {}".format( 76 | dataset_name, evaluator_type 77 | ) 78 | ) 79 | elif len(evaluator_list) == 1: 80 | return evaluator_list[0] 81 | return DatasetEvaluators(evaluator_list) 82 | 83 | @classmethod 84 | def build_optimizer(cls, cfg, model): 85 | params: List[Dict[str, Any]] = [] 86 | memo: Set[torch.nn.parameter.Parameter] = set() 87 | for key, value in model.named_parameters(recurse=True): 88 | if not value.requires_grad: 89 | continue 90 | # Avoid duplicating parameters 91 | if value in memo: 92 | continue 93 | memo.add(value) 94 | lr = cfg.SOLVER.BASE_LR 95 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 96 | if "backbone" in key: 97 | lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER 98 | # for transformer 99 | if "patch_embed" in key or "cls_token" in key: 100 | weight_decay = 0.0 101 | if "norm" in key: 102 | weight_decay = 0.0 103 | params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] 104 | 105 | def maybe_add_full_model_gradient_clipping(optim): # optim: the optimizer class 106 | # detectron2 doesn't have full model gradient clipping now 107 | clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE 108 | enable = ( 109 | cfg.SOLVER.CLIP_GRADIENTS.ENABLED 110 | and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" 111 | and clip_norm_val > 0.0 112 | ) 113 | 114 | class FullModelGradientClippingOptimizer(optim): 115 | def step(self, closure=None): 116 | all_params = itertools.chain(*[x["params"] for x in self.param_groups]) 117 | torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) 118 | super().step(closure=closure) 119 | 120 | return FullModelGradientClippingOptimizer if enable else optim 121 | 122 | optimizer_type = cfg.SOLVER.OPTIMIZER 123 | if optimizer_type == "SGD": 124 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)( 125 | params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM 126 | ) 127 | elif optimizer_type == "ADAMW": 128 | optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)( 129 | params, cfg.SOLVER.BASE_LR, amsgrad=cfg.SOLVER.AMSGRAD 130 | ) 131 | else: 132 | raise NotImplementedError(f"no optimizer type {optimizer_type}") 133 | if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": 134 | optimizer = maybe_add_gradient_clipping(cfg, optimizer) 135 | return optimizer 136 | 137 | @classmethod 138 | def build_train_loader(cls, cfg): 139 | if cfg.MODEL.SPARSE_INST.DATASET_MAPPER == "SparseInstDatasetMapper": 140 | from sparseinst import SparseInstDatasetMapper 141 | mapper = SparseInstDatasetMapper(cfg, is_train=True) 142 | else: 143 | mapper = None 144 | return build_detection_train_loader(cfg, mapper=mapper) 145 | 146 | 147 | def setup(args): 148 | """ 149 | Create configs and perform basic setups. 
150 | """ 151 | cfg = get_cfg() 152 | add_sparse_inst_config(cfg) 153 | cfg.merge_from_file(args.config_file) 154 | cfg.merge_from_list(args.opts) 155 | cfg.freeze() 156 | default_setup(cfg, args) 157 | # Setup logger for "sparseinst" module 158 | setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="sparseinst") 159 | return cfg 160 | 161 | 162 | def main(args): 163 | cfg = setup(args) 164 | 165 | if args.eval_only: 166 | model = Trainer.build_model(cfg) 167 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 168 | cfg.MODEL.WEIGHTS, resume=args.resume) 169 | res = Trainer.test(cfg, model) 170 | if comm.is_main_process(): 171 | verify_results(cfg, res) 172 | return res 173 | 174 | trainer = Trainer(cfg) 175 | trainer.resume_or_load(resume=args.resume) 176 | return trainer.train() 177 | 178 | 179 | if __name__ == "__main__": 180 | args = default_argument_parser().parse_args() 181 | print("Command Line Args:", args) 182 | launch( 183 | main, 184 | args.num_gpus, 185 | num_machines=args.num_machines, 186 | machine_rank=args.machine_rank, 187 | dist_url=args.dist_url, 188 | args=(args,), 189 | ) 190 | -------------------------------------------------------------------------------- /sparseinst/d2_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import atexit 3 | import bisect 4 | import multiprocessing as mp 5 | from collections import deque 6 | import cv2 7 | import torch 8 | 9 | from detectron2.data import MetadataCatalog 10 | from detectron2.engine.defaults import DefaultPredictor 11 | from detectron2.utils.video_visualizer import VideoVisualizer 12 | from detectron2.utils.visualizer import ColorMode, Visualizer 13 | 14 | 15 | class VisualizationDemo(object): 16 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 17 | """ 18 | Args: 19 | cfg (CfgNode): 20 | instance_mode (ColorMode): 21 | parallel (bool): whether to run the model in different processes from visualization. 22 | Useful since the visualization logic can be slow. 23 | """ 24 | self.img_format = cfg.INPUT.FORMAT 25 | self.metadata = MetadataCatalog.get( 26 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 27 | ) 28 | self.cpu_device = torch.device("cpu") 29 | self.instance_mode = instance_mode 30 | 31 | self.parallel = parallel 32 | if parallel: 33 | num_gpu = torch.cuda.device_count() 34 | self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) 35 | else: 36 | self.predictor = DefaultPredictor(cfg) 37 | 38 | def run_on_image(self, image, confidence_threshold): 39 | """ 40 | Args: 41 | image (np.ndarray): an image of shape (H, W, C) (in BGR order). 42 | This is the format used by OpenCV. 43 | 44 | Returns: 45 | predictions (dict): the output of the model. 46 | vis_output (VisImage): the visualized image output. 
47 | """ 48 | vis_output = None 49 | predictions = self.predictor(image) 50 | visualizer = Visualizer(image, self.metadata, 51 | instance_mode=self.instance_mode) 52 | if "panoptic_seg" in predictions: 53 | panoptic_seg, segments_info = predictions["panoptic_seg"] 54 | vis_output = visualizer.draw_panoptic_seg_predictions( 55 | panoptic_seg.to(self.cpu_device), segments_info 56 | ) 57 | else: 58 | if "sem_seg" in predictions: 59 | vis_output = visualizer.draw_sem_seg( 60 | predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) 61 | ) 62 | if "instances" in predictions: 63 | instances = predictions["instances"].to(self.cpu_device) 64 | instances = instances[instances.scores > confidence_threshold] 65 | predictions["instances"] = instances 66 | vis_output = visualizer.draw_instance_predictions( 67 | predictions=instances) 68 | 69 | return predictions, vis_output 70 | 71 | def _frame_from_video(self, video): 72 | while video.isOpened(): 73 | success, frame = video.read() 74 | if success: 75 | yield frame 76 | else: 77 | break 78 | 79 | def run_on_video(self, video, confidence_threshold): 80 | """ 81 | Visualizes predictions on frames of the input video. 82 | 83 | Args: 84 | video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be 85 | either a webcam or a video file. 86 | 87 | Yields: 88 | ndarray: BGR visualizations of each video frame. 89 | """ 90 | video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) 91 | 92 | def process_predictions(frame, predictions): 93 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) 94 | if "panoptic_seg" in predictions: 95 | panoptic_seg, segments_info = predictions["panoptic_seg"] 96 | vis_frame = video_visualizer.draw_panoptic_seg_predictions( 97 | frame, panoptic_seg.to(self.cpu_device), segments_info 98 | ) 99 | elif "instances" in predictions: 100 | predictions = predictions["instances"].to(self.cpu_device) 101 | predictions = predictions[predictions.scores > 102 | confidence_threshold] 103 | vis_frame = video_visualizer.draw_instance_predictions( 104 | frame, predictions) 105 | elif "sem_seg" in predictions: 106 | vis_frame = video_visualizer.draw_sem_seg( 107 | frame, predictions["sem_seg"].argmax( 108 | dim=0).to(self.cpu_device) 109 | ) 110 | 111 | # Converts Matplotlib RGB format to OpenCV BGR format 112 | vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) 113 | return vis_frame 114 | 115 | frame_gen = self._frame_from_video(video) 116 | if self.parallel: 117 | buffer_size = self.predictor.default_buffer_size 118 | 119 | frame_data = deque() 120 | 121 | for cnt, frame in enumerate(frame_gen): 122 | frame_data.append(frame) 123 | self.predictor.put(frame) 124 | 125 | if cnt >= buffer_size: 126 | frame = frame_data.popleft() 127 | predictions = self.predictor.get() 128 | yield process_predictions(frame, predictions) 129 | 130 | while len(frame_data): 131 | frame = frame_data.popleft() 132 | predictions = self.predictor.get() 133 | yield process_predictions(frame, predictions) 134 | else: 135 | for frame in frame_gen: 136 | yield process_predictions(frame, self.predictor(frame)) 137 | 138 | 139 | class AsyncPredictor: 140 | """ 141 | A predictor that runs the model asynchronously, possibly on >1 GPUs. 142 | Because rendering the visualization takes considerably amount of time, 143 | this helps improve throughput a little bit when rendering videos. 
144 | """ 145 | 146 | class _StopToken: 147 | pass 148 | 149 | class _PredictWorker(mp.Process): 150 | def __init__(self, cfg, task_queue, result_queue): 151 | self.cfg = cfg 152 | self.task_queue = task_queue 153 | self.result_queue = result_queue 154 | super().__init__() 155 | 156 | def run(self): 157 | predictor = DefaultPredictor(self.cfg) 158 | 159 | while True: 160 | task = self.task_queue.get() 161 | if isinstance(task, AsyncPredictor._StopToken): 162 | break 163 | idx, data = task 164 | result = predictor(data) 165 | self.result_queue.put((idx, result)) 166 | 167 | def __init__(self, cfg, num_gpus: int = 1): 168 | """ 169 | Args: 170 | cfg (CfgNode): 171 | num_gpus (int): if 0, will run on CPU 172 | """ 173 | num_workers = max(num_gpus, 1) 174 | self.task_queue = mp.Queue(maxsize=num_workers * 3) 175 | self.result_queue = mp.Queue(maxsize=num_workers * 3) 176 | self.procs = [] 177 | for gpuid in range(max(num_gpus, 1)): 178 | cfg = cfg.clone() 179 | cfg.defrost() 180 | cfg.MODEL.DEVICE = "cuda:{}".format( 181 | gpuid) if num_gpus > 0 else "cpu" 182 | self.procs.append( 183 | AsyncPredictor._PredictWorker( 184 | cfg, self.task_queue, self.result_queue) 185 | ) 186 | 187 | self.put_idx = 0 188 | self.get_idx = 0 189 | self.result_rank = [] 190 | self.result_data = [] 191 | 192 | for p in self.procs: 193 | p.start() 194 | atexit.register(self.shutdown) 195 | 196 | def put(self, image): 197 | self.put_idx += 1 198 | self.task_queue.put((self.put_idx, image)) 199 | 200 | def get(self): 201 | self.get_idx += 1 # the index needed for this request 202 | if len(self.result_rank) and self.result_rank[0] == self.get_idx: 203 | res = self.result_data[0] 204 | del self.result_data[0], self.result_rank[0] 205 | return res 206 | 207 | while True: 208 | # make sure the results are returned in the correct order 209 | idx, res = self.result_queue.get() 210 | if idx == self.get_idx: 211 | return res 212 | insert = bisect.bisect(self.result_rank, idx) 213 | self.result_rank.insert(insert, idx) 214 | self.result_data.insert(insert, res) 215 | 216 | def __len__(self): 217 | return self.put_idx - self.get_idx 218 | 219 | def __call__(self, image): 220 | self.put(image) 221 | return self.get() 222 | 223 | def shutdown(self): 224 | for _ in self.procs: 225 | self.task_queue.put(AsyncPredictor._StopToken()) 226 | 227 | @property 228 | def default_buffer_size(self): 229 | return len(self.procs) * 5 230 | -------------------------------------------------------------------------------- /sparseinst/decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. 
All Rights Reserved 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn import init 7 | import torch.nn.functional as F 8 | 9 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill 10 | 11 | from detectron2.utils.registry import Registry 12 | from detectron2.layers import Conv2d 13 | from sparseinst.encoder import SPARSE_INST_ENCODER_REGISTRY 14 | 15 | SPARSE_INST_DECODER_REGISTRY = Registry("SPARSE_INST_DECODER") 16 | SPARSE_INST_DECODER_REGISTRY.__doc__ = "registry for SparseInst decoder" 17 | 18 | 19 | def _make_stack_3x3_convs(num_convs, in_channels, out_channels): 20 | convs = [] 21 | for _ in range(num_convs): 22 | convs.append( 23 | Conv2d(in_channels, out_channels, 3, padding=1)) 24 | convs.append(nn.ReLU(True)) 25 | in_channels = out_channels 26 | return nn.Sequential(*convs) 27 | 28 | 29 | class InstanceBranch(nn.Module): 30 | 31 | def __init__(self, cfg, in_channels): 32 | super().__init__() 33 | # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM 34 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 35 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 36 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 37 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 38 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 39 | 40 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 41 | # iam prediction, a simple conv 42 | self.iam_conv = nn.Conv2d(dim, num_masks, 3, padding=1) 43 | 44 | # outputs 45 | self.cls_score = nn.Linear(dim, self.num_classes) 46 | self.mask_kernel = nn.Linear(dim, kernel_dim) 47 | self.objectness = nn.Linear(dim, 1) 48 | 49 | self.prior_prob = 0.01 50 | self._init_weights() 51 | 52 | def _init_weights(self): 53 | for m in self.inst_convs.modules(): 54 | if isinstance(m, nn.Conv2d): 55 | c2_msra_fill(m) 56 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) 57 | for module in [self.iam_conv, self.cls_score]: 58 | init.constant_(module.bias, bias_value) 59 | init.normal_(self.iam_conv.weight, std=0.01) 60 | init.normal_(self.cls_score.weight, std=0.01) 61 | 62 | init.normal_(self.mask_kernel.weight, std=0.01) 63 | init.constant_(self.mask_kernel.bias, 0.0) 64 | 65 | def forward(self, features): 66 | # instance features (x4 convs) 67 | features = self.inst_convs(features) 68 | # predict instance activation maps 69 | iam = self.iam_conv(features) 70 | iam_prob = iam.sigmoid() 71 | 72 | B, N = iam_prob.shape[:2] 73 | C = features.size(1) 74 | # BxNxHxW -> BxNx(HW) 75 | iam_prob = iam_prob.view(B, N, -1) 76 | normalizer = iam_prob.sum(-1).clamp(min=1e-6) 77 | iam_prob = iam_prob / normalizer[:, :, None] 78 | # aggregate features: BxCxHxW -> Bx(HW)xC 79 | inst_features = torch.bmm( 80 | iam_prob, features.view(B, C, -1).permute(0, 2, 1)) 81 | # predict classification & segmentation kernel & objectness 82 | pred_logits = self.cls_score(inst_features) 83 | pred_kernel = self.mask_kernel(inst_features) 84 | pred_scores = self.objectness(inst_features) 85 | return pred_logits, pred_kernel, pred_scores, iam 86 | 87 | 88 | class MaskBranch(nn.Module): 89 | 90 | def __init__(self, cfg, in_channels): 91 | super().__init__() 92 | dim = cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM 93 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS 94 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 95 | self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 96 | self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1) 97 | self._init_weights() 98 | 99 | def _init_weights(self): 100 | for m 
in self.mask_convs.modules(): 101 | if isinstance(m, nn.Conv2d): 102 | c2_msra_fill(m) 103 | c2_msra_fill(self.projection) 104 | 105 | def forward(self, features): 106 | # mask features (x4 convs) 107 | features = self.mask_convs(features) 108 | return self.projection(features) 109 | 110 | 111 | @SPARSE_INST_DECODER_REGISTRY.register() 112 | class BaseIAMDecoder(nn.Module): 113 | 114 | def __init__(self, cfg): 115 | super().__init__() 116 | # add 2 for coordinates 117 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 118 | 119 | self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR 120 | self.output_iam = cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM 121 | 122 | self.inst_branch = InstanceBranch(cfg, in_channels) 123 | self.mask_branch = MaskBranch(cfg, in_channels) 124 | 125 | @torch.no_grad() 126 | def compute_coordinates_linspace(self, x): 127 | # linspace is not supported in ONNX 128 | h, w = x.size(2), x.size(3) 129 | y_loc = torch.linspace(-1, 1, h, device=x.device) 130 | x_loc = torch.linspace(-1, 1, w, device=x.device) 131 | y_loc, x_loc = torch.meshgrid(y_loc, x_loc) 132 | y_loc = y_loc.expand([x.shape[0], 1, -1, -1]) 133 | x_loc = x_loc.expand([x.shape[0], 1, -1, -1]) 134 | locations = torch.cat([x_loc, y_loc], 1) 135 | return locations.to(x) 136 | 137 | @torch.no_grad() 138 | def compute_coordinates(self, x): 139 | h, w = x.size(2), x.size(3) 140 | y_loc = -1.0 + 2.0 * torch.arange(h, device=x.device) / (h - 1) 141 | x_loc = -1.0 + 2.0 * torch.arange(w, device=x.device) / (w - 1) 142 | y_loc, x_loc = torch.meshgrid(y_loc, x_loc) 143 | y_loc = y_loc.expand([x.shape[0], 1, -1, -1]) 144 | x_loc = x_loc.expand([x.shape[0], 1, -1, -1]) 145 | locations = torch.cat([x_loc, y_loc], 1) 146 | return locations.to(x) 147 | 148 | def forward(self, features): 149 | coord_features = self.compute_coordinates(features) 150 | features = torch.cat([coord_features, features], dim=1) 151 | pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features) 152 | mask_features = self.mask_branch(features) 153 | 154 | N = pred_kernel.shape[1] 155 | # mask_features: BxCxHxW 156 | B, C, H, W = mask_features.shape 157 | pred_masks = torch.bmm(pred_kernel, mask_features.view( 158 | B, C, H * W)).view(B, N, H, W) 159 | 160 | pred_masks = F.interpolate( 161 | pred_masks, scale_factor=self.scale_factor, 162 | mode='bilinear', align_corners=False) 163 | 164 | output = { 165 | "pred_logits": pred_logits, 166 | "pred_masks": pred_masks, 167 | "pred_scores": pred_scores, 168 | } 169 | 170 | if self.output_iam: 171 | iam = F.interpolate(iam, scale_factor=self.scale_factor, 172 | mode='bilinear', align_corners=False) 173 | output['pred_iam'] = iam 174 | 175 | return output 176 | 177 | 178 | class GroupInstanceBranch(nn.Module): 179 | 180 | def __init__(self, cfg, in_channels): 181 | super().__init__() 182 | dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM 183 | num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS 184 | num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS 185 | kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM 186 | self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS 187 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 188 | 189 | self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) 190 | # iam prediction, a group conv 191 | expand_dim = dim * self.num_groups 192 | self.iam_conv = nn.Conv2d( 193 | dim, num_masks * self.num_groups, 3, padding=1, groups=self.num_groups) 194 | # outputs 195 | self.fc = nn.Linear(expand_dim, expand_dim) 196 | 197 | 
self.cls_score = nn.Linear(expand_dim, self.num_classes) 198 | self.mask_kernel = nn.Linear(expand_dim, kernel_dim) 199 | self.objectness = nn.Linear(expand_dim, 1) 200 | 201 | self.prior_prob = 0.01 202 | self._init_weights() 203 | 204 | def _init_weights(self): 205 | for m in self.inst_convs.modules(): 206 | if isinstance(m, nn.Conv2d): 207 | c2_msra_fill(m) 208 | bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) 209 | for module in [self.iam_conv, self.cls_score]: 210 | init.constant_(module.bias, bias_value) 211 | init.normal_(self.iam_conv.weight, std=0.01) 212 | init.normal_(self.cls_score.weight, std=0.01) 213 | 214 | init.normal_(self.mask_kernel.weight, std=0.01) 215 | init.constant_(self.mask_kernel.bias, 0.0) 216 | c2_xavier_fill(self.fc) 217 | 218 | def forward(self, features): 219 | # instance features (x4 convs) 220 | features = self.inst_convs(features) 221 | # predict instance activation maps 222 | iam = self.iam_conv(features) 223 | iam_prob = iam.sigmoid() 224 | 225 | B, N = iam_prob.shape[:2] 226 | C = features.size(1) 227 | # BxNxHxW -> BxNx(HW) 228 | iam_prob = iam_prob.view(B, N, -1) 229 | normalizer = iam_prob.sum(-1).clamp(min=1e-6) 230 | iam_prob = iam_prob / normalizer[:, :, None] 231 | 232 | # aggregate features: BxCxHxW -> Bx(HW)xC 233 | inst_features = torch.bmm( 234 | iam_prob, features.view(B, C, -1).permute(0, 2, 1)) 235 | 236 | inst_features = inst_features.reshape( 237 | B, 4, N // self.num_groups, -1).transpose(1, 2).reshape(B, N // self.num_groups, -1) 238 | 239 | inst_features = F.relu_(self.fc(inst_features)) 240 | # predict classification & segmentation kernel & objectness 241 | pred_logits = self.cls_score(inst_features) 242 | pred_kernel = self.mask_kernel(inst_features) 243 | pred_scores = self.objectness(inst_features) 244 | return pred_logits, pred_kernel, pred_scores, iam 245 | 246 | 247 | @SPARSE_INST_DECODER_REGISTRY.register() 248 | class GroupIAMDecoder(BaseIAMDecoder): 249 | 250 | def __init__(self, cfg): 251 | super().__init__(cfg) 252 | in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 253 | self.inst_branch = GroupInstanceBranch(cfg, in_channels) 254 | 255 | 256 | class GroupInstanceSoftBranch(GroupInstanceBranch): 257 | 258 | def __init__(self, cfg, in_channels): 259 | super().__init__(cfg, in_channels) 260 | self.softmax_bias = nn.Parameter(torch.ones([1, ])) 261 | 262 | def forward(self, features): 263 | # instance features (x4 convs) 264 | features = self.inst_convs(features) 265 | # predict instance activation maps 266 | iam = self.iam_conv(features) 267 | 268 | B, N = iam.shape[:2] 269 | C = features.size(1) 270 | # BxNxHxW -> BxNx(HW) 271 | iam_prob = F.softmax(iam.view(B, N, -1) + self.softmax_bias, dim=-1) 272 | # aggregate features: BxCxHxW -> Bx(HW)xC 273 | inst_features = torch.bmm( 274 | iam_prob, features.view(B, C, -1).permute(0, 2, 1)) 275 | 276 | inst_features = inst_features.reshape( 277 | B, self.num_groups, N // self.num_groups, -1).transpose(1, 2).reshape(B, N // self.num_groups, -1) 278 | 279 | inst_features = F.relu_(self.fc(inst_features)) 280 | # predict classification & segmentation kernel & objectness 281 | pred_logits = self.cls_score(inst_features) 282 | pred_kernel = self.mask_kernel(inst_features) 283 | pred_scores = self.objectness(inst_features) 284 | return pred_logits, pred_kernel, pred_scores, iam 285 | 286 | 287 | @SPARSE_INST_DECODER_REGISTRY.register() 288 | class GroupIAMSoftDecoder(BaseIAMDecoder): 289 | 290 | def __init__(self, cfg): 291 | super().__init__(cfg) 292 | 
in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 293 | self.inst_branch = GroupInstanceSoftBranch(cfg, in_channels) 294 | 295 | 296 | def build_sparse_inst_decoder(cfg): 297 | name = cfg.MODEL.SPARSE_INST.DECODER.NAME 298 | return SPARSE_INST_DECODER_REGISTRY.get(name)(cfg) 299 | -------------------------------------------------------------------------------- /sparseinst/loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.cuda.amp import autocast 7 | from scipy.optimize import linear_sum_assignment 8 | from fvcore.nn import sigmoid_focal_loss_jit 9 | 10 | from detectron2.utils.registry import Registry 11 | 12 | from .utils import nested_masks_from_list, is_dist_avail_and_initialized, get_world_size 13 | 14 | SPARSE_INST_MATCHER_REGISTRY = Registry("SPARSE_INST_MATCHER") 15 | SPARSE_INST_MATCHER_REGISTRY.__doc__ = "Matcher for SparseInst" 16 | SPARSE_INST_CRITERION_REGISTRY = Registry("SPARSE_INST_CRITERION") 17 | SPARSE_INST_CRITERION_REGISTRY.__doc__ = "Criterion for SparseInst" 18 | 19 | 20 | def compute_mask_iou(inputs, targets): 21 | inputs = inputs.sigmoid() 22 | # thresholding 23 | binarized_inputs = (inputs >= 0.4).float() 24 | targets = (targets > 0.5).float() 25 | intersection = (binarized_inputs * targets).sum(-1) 26 | union = targets.sum(-1) + binarized_inputs.sum(-1) - intersection 27 | score = intersection / (union + 1e-6) 28 | return score 29 | 30 | 31 | def dice_score(inputs, targets): 32 | inputs = inputs.sigmoid() 33 | numerator = 2 * torch.matmul(inputs, targets.t()) 34 | denominator = ( 35 | inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1) 36 | score = numerator / (denominator + 1e-4) 37 | return score 38 | 39 | 40 | def dice_loss(inputs, targets, reduction='sum'): 41 | inputs = inputs.sigmoid() 42 | assert inputs.shape == targets.shape 43 | numerator = 2 * (inputs * targets).sum(1) 44 | denominator = (inputs * inputs).sum(-1) + (targets * targets).sum(-1) 45 | loss = 1 - (numerator) / (denominator + 1e-4) 46 | if reduction == 'none': 47 | return loss 48 | return loss.sum() 49 | 50 | 51 | @SPARSE_INST_CRITERION_REGISTRY.register() 52 | class SparseInstCriterion(nn.Module): 53 | # This part is partially derivated from: https://github.com/facebookresearch/detr/blob/main/models/detr.py 54 | 55 | def __init__(self, cfg, matcher): 56 | super().__init__() 57 | self.matcher = matcher 58 | self.losses = cfg.MODEL.SPARSE_INST.LOSS.ITEMS 59 | self.weight_dict = self.get_weight_dict(cfg) 60 | self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES 61 | 62 | def get_weight_dict(self, cfg): 63 | losses = ("loss_ce", "loss_mask", "loss_dice", "loss_objectness") 64 | weight_dict = {} 65 | ce_weight = cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT 66 | mask_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT 67 | dice_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT 68 | objectness_weight = cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT 69 | 70 | weight_dict = dict( 71 | zip(losses, (ce_weight, mask_weight, dice_weight, objectness_weight))) 72 | return weight_dict 73 | 74 | def _get_src_permutation_idx(self, indices): 75 | # permute predictions following indices 76 | batch_idx = torch.cat([torch.full_like(src, i) 77 | for i, (src, _) in enumerate(indices)]) 78 | src_idx = torch.cat([src for (src, _) in indices]) 79 | return batch_idx, src_idx 80 | 81 
| def _get_tgt_permutation_idx(self, indices): 82 | # permute targets following indices 83 | batch_idx = torch.cat([torch.full_like(tgt, i) 84 | for i, (_, tgt) in enumerate(indices)]) 85 | tgt_idx = torch.cat([tgt for (_, tgt) in indices]) 86 | return batch_idx, tgt_idx 87 | 88 | def loss_labels(self, outputs, targets, indices, num_instances, input_shape=None): 89 | assert "pred_logits" in outputs 90 | src_logits = outputs['pred_logits'] 91 | idx = self._get_src_permutation_idx(indices) 92 | target_classes_o = torch.cat([t["labels"][J] 93 | for t, (_, J) in zip(targets, indices)]) 94 | target_classes = torch.full(src_logits.shape[:2], self.num_classes, 95 | dtype=torch.int64, device=src_logits.device) 96 | target_classes[idx] = target_classes_o 97 | 98 | src_logits = src_logits.flatten(0, 1) 99 | # prepare one_hot target. 100 | target_classes = target_classes.flatten(0, 1) 101 | pos_inds = torch.nonzero( 102 | target_classes != self.num_classes, as_tuple=True)[0] 103 | labels = torch.zeros_like(src_logits) 104 | labels[pos_inds, target_classes[pos_inds]] = 1 105 | # comp focal loss. 106 | class_loss = sigmoid_focal_loss_jit( 107 | src_logits, 108 | labels, 109 | alpha=0.25, 110 | gamma=2.0, 111 | reduction="sum", 112 | ) / num_instances 113 | losses = {'loss_ce': class_loss} 114 | return losses 115 | 116 | def loss_masks_with_iou_objectness(self, outputs, targets, indices, num_instances, input_shape): 117 | src_idx = self._get_src_permutation_idx(indices) 118 | tgt_idx = self._get_tgt_permutation_idx(indices) 119 | # Bx100xHxW 120 | assert "pred_masks" in outputs 121 | assert "pred_scores" in outputs 122 | src_iou_scores = outputs["pred_scores"] 123 | src_masks = outputs["pred_masks"] 124 | with torch.no_grad(): 125 | target_masks, _ = nested_masks_from_list( 126 | [t["masks"].tensor for t in targets], input_shape).decompose() 127 | num_masks = [len(t["masks"]) for t in targets] 128 | target_masks = target_masks.to(src_masks) 129 | if len(target_masks) == 0: 130 | losses = { 131 | "loss_dice": src_masks.sum() * 0.0, 132 | "loss_mask": src_masks.sum() * 0.0, 133 | "loss_objectness": src_iou_scores.sum() * 0.0 134 | } 135 | return losses 136 | 137 | src_masks = src_masks[src_idx] 138 | target_masks = F.interpolate( 139 | target_masks[:, None], size=src_masks.shape[-2:], mode='bilinear', align_corners=False).squeeze(1) 140 | 141 | src_masks = src_masks.flatten(1) 142 | # FIXME: tgt_idx 143 | mix_tgt_idx = torch.zeros_like(tgt_idx[1]) 144 | cum_sum = 0 145 | for num_mask in num_masks: 146 | mix_tgt_idx[cum_sum: cum_sum + num_mask] = cum_sum 147 | cum_sum += num_mask 148 | mix_tgt_idx += tgt_idx[1] 149 | 150 | target_masks = target_masks[mix_tgt_idx].flatten(1) 151 | 152 | with torch.no_grad(): 153 | ious = compute_mask_iou(src_masks, target_masks) 154 | 155 | tgt_iou_scores = ious 156 | src_iou_scores = src_iou_scores[src_idx] 157 | tgt_iou_scores = tgt_iou_scores.flatten(0) 158 | src_iou_scores = src_iou_scores.flatten(0) 159 | 160 | losses = { 161 | "loss_objectness": F.binary_cross_entropy_with_logits(src_iou_scores, tgt_iou_scores, reduction='mean'), 162 | "loss_dice": dice_loss(src_masks, target_masks) / num_instances, 163 | "loss_mask": F.binary_cross_entropy_with_logits(src_masks, target_masks, reduction='mean') 164 | } 165 | return losses 166 | 167 | def get_loss(self, loss, outputs, targets, indices, num_instances, **kwargs): 168 | loss_map = { 169 | "labels": self.loss_labels, 170 | "masks": self.loss_masks_with_iou_objectness, 171 | } 172 | if loss == "loss_objectness": 173 | # 
NOTE: loss_objectness will be calculated in `loss_masks_with_iou_objectness` 174 | return {} 175 | assert loss in loss_map 176 | return loss_map[loss](outputs, targets, indices, num_instances, **kwargs) 177 | 178 | def forward(self, outputs, targets, input_shape): 179 | 180 | outputs_without_aux = {k: v for k, 181 | v in outputs.items() if k != 'aux_outputs'} 182 | 183 | # Retrieve the matching between the outputs of the last layer and the targets 184 | indices = self.matcher(outputs_without_aux, targets, input_shape) 185 | # Compute the average number of target boxes accross all nodes, for normalization purposes 186 | num_instances = sum(len(t["labels"]) for t in targets) 187 | num_instances = torch.as_tensor( 188 | [num_instances], dtype=torch.float, device=next(iter(outputs.values())).device) 189 | if is_dist_avail_and_initialized(): 190 | torch.distributed.all_reduce(num_instances) 191 | num_instances = torch.clamp( 192 | num_instances / get_world_size(), min=1).item() 193 | # Compute all the requested losses 194 | losses = {} 195 | for loss in self.losses: 196 | losses.update(self.get_loss(loss, outputs, targets, indices, 197 | num_instances, input_shape=input_shape)) 198 | 199 | for k in losses.keys(): 200 | if k in self.weight_dict: 201 | losses[k] *= self.weight_dict[k] 202 | 203 | return losses 204 | 205 | 206 | @SPARSE_INST_MATCHER_REGISTRY.register() 207 | class SparseInstMatcherV1(nn.Module): 208 | 209 | def __init__(self, cfg): 210 | super().__init__() 211 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA 212 | self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA 213 | self.mask_score = dice_score 214 | 215 | @torch.no_grad() 216 | def forward(self, outputs, targets, input_shape): 217 | B, N, H, W = outputs["pred_masks"].shape 218 | pred_masks = outputs['pred_masks'] 219 | pred_logits = outputs['pred_logits'].sigmoid() 220 | 221 | indices = [] 222 | 223 | for i in range(B): 224 | tgt_ids = targets[i]["labels"] 225 | # no annotations 226 | if tgt_ids.shape[0] == 0: 227 | indices.append((torch.as_tensor([]), 228 | torch.as_tensor([]))) 229 | continue 230 | 231 | tgt_masks = targets[i]['masks'].tensor.to(pred_masks) 232 | pred_logit = pred_logits[i] 233 | out_masks = pred_masks[i] 234 | 235 | # upsampling: 236 | # (1) padding/ 237 | # (2) upsampling to 1x input size (input_shape) 238 | # (3) downsampling to 0.25x input size (output mask size) 239 | ori_h, ori_w = tgt_masks.size(1), tgt_masks.size(2) 240 | tgt_masks_ = torch.zeros( 241 | (1, tgt_masks.size(0), input_shape[0], input_shape[1])).to(pred_masks) 242 | tgt_masks_[0, :, :ori_h, :ori_w] = tgt_masks 243 | tgt_masks = F.interpolate( 244 | tgt_masks_, size=out_masks.shape[-2:], mode='bilinear', align_corners=False)[0] 245 | 246 | # compute dice score and classification score 247 | tgt_masks = tgt_masks.flatten(1) 248 | out_masks = out_masks.flatten(1) 249 | 250 | mask_score = self.mask_score(out_masks, tgt_masks) 251 | # Nx(Number of gts) 252 | matching_prob = pred_logit[:, tgt_ids] 253 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta) 254 | # hungarian matching 255 | inds = linear_sum_assignment(C.cpu(), maximize=True) 256 | indices.append(inds) 257 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 258 | 259 | 260 | @SPARSE_INST_MATCHER_REGISTRY.register() 261 | class SparseInstMatcher(nn.Module): 262 | 263 | def __init__(self, cfg): 264 | super().__init__() 265 | self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA 266 | self.beta = 
cfg.MODEL.SPARSE_INST.MATCHER.BETA 267 | self.mask_score = dice_score 268 | 269 | def forward(self, outputs, targets, input_shape): 270 | with torch.no_grad(): 271 | B, N, H, W = outputs["pred_masks"].shape 272 | pred_masks = outputs['pred_masks'] 273 | pred_logits = outputs['pred_logits'].sigmoid() 274 | 275 | tgt_ids = torch.cat([v["labels"] for v in targets]) 276 | 277 | if tgt_ids.shape[0] == 0: 278 | return [(torch.as_tensor([]).to(pred_logits), torch.as_tensor([]).to(pred_logits))] * B 279 | tgt_masks, _ = nested_masks_from_list( 280 | [t["masks"].tensor for t in targets], input_shape).decompose() 281 | device = pred_masks.device 282 | tgt_masks = tgt_masks.to(pred_masks) 283 | 284 | tgt_masks = F.interpolate( 285 | tgt_masks[:, None], size=pred_masks.shape[-2:], mode="bilinear", align_corners=False).squeeze(1) 286 | 287 | pred_masks = pred_masks.view(B * N, -1) 288 | tgt_masks = tgt_masks.flatten(1) 289 | with autocast(enabled=False): 290 | pred_masks = pred_masks.float() 291 | tgt_masks = tgt_masks.float() 292 | pred_logits = pred_logits.float() 293 | mask_score = self.mask_score(pred_masks, tgt_masks) 294 | # Nx(Number of gts) 295 | matching_prob = pred_logits.view(B * N, -1)[:, tgt_ids] 296 | C = (mask_score ** self.alpha) * (matching_prob ** self.beta) 297 | 298 | C = C.view(B, N, -1).cpu() 299 | # hungarian matching 300 | sizes = [len(v["masks"]) for v in targets] 301 | indices = [linear_sum_assignment(c[i], maximize=True) 302 | for i, c in enumerate(C.split(sizes, -1))] 303 | indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor( 304 | j, dtype=torch.int64)) for i, j in indices] 305 | return indices 306 | 307 | 308 | def build_sparse_inst_matcher(cfg): 309 | name = cfg.MODEL.SPARSE_INST.MATCHER.NAME 310 | return SPARSE_INST_MATCHER_REGISTRY.get(name)(cfg) 311 | 312 | 313 | def build_sparse_inst_criterion(cfg): 314 | matcher = build_sparse_inst_matcher(cfg) 315 | name = cfg.MODEL.SPARSE_INST.LOSS.NAME 316 | return SPARSE_INST_CRITERION_REGISTRY.get(name)(cfg, matcher) 317 | -------------------------------------------------------------------------------- /sparseinst/backbones/pvt.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from functools import partial 6 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_ 7 | from detectron2.layers import ShapeSpec 8 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 9 | 10 | 11 | class Mlp(nn.Module): 12 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False): 13 | super().__init__() 14 | out_features = out_features or in_features 15 | hidden_features = hidden_features or in_features 16 | self.fc1 = nn.Linear(in_features, hidden_features) 17 | self.dwconv = DWConv(hidden_features) 18 | self.act = act_layer() 19 | self.fc2 = nn.Linear(hidden_features, out_features) 20 | self.drop = nn.Dropout(drop) 21 | self.linear = linear 22 | if self.linear: 23 | self.relu = nn.ReLU(inplace=True) 24 | self.apply(self._init_weights) 25 | 26 | def _init_weights(self, m): 27 | if isinstance(m, nn.Linear): 28 | trunc_normal_(m.weight, std=.02) 29 | if isinstance(m, nn.Linear) and m.bias is not None: 30 | nn.init.constant_(m.bias, 0) 31 | elif isinstance(m, nn.LayerNorm): 32 | nn.init.constant_(m.bias, 0) 33 | nn.init.constant_(m.weight, 1.0) 34 | elif isinstance(m, nn.Conv2d): 35 | fan_out = m.kernel_size[0] * m.kernel_size[1] 
* m.out_channels 36 | fan_out //= m.groups 37 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 38 | if m.bias is not None: 39 | m.bias.data.zero_() 40 | 41 | def forward(self, x, H, W): 42 | x = self.fc1(x) 43 | if self.linear: 44 | x = self.relu(x) 45 | x = self.dwconv(x, H, W) 46 | x = self.act(x) 47 | x = self.drop(x) 48 | x = self.fc2(x) 49 | x = self.drop(x) 50 | return x 51 | 52 | 53 | class Attention(nn.Module): 54 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False): 55 | super().__init__() 56 | assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 57 | 58 | self.dim = dim 59 | self.num_heads = num_heads 60 | head_dim = dim // num_heads 61 | self.scale = qk_scale or head_dim ** -0.5 62 | 63 | self.q = nn.Linear(dim, dim, bias=qkv_bias) 64 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) 65 | self.attn_drop = nn.Dropout(attn_drop) 66 | self.proj = nn.Linear(dim, dim) 67 | self.proj_drop = nn.Dropout(proj_drop) 68 | 69 | self.linear = linear 70 | self.sr_ratio = sr_ratio 71 | if not linear: 72 | if sr_ratio > 1: 73 | self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) 74 | self.norm = nn.LayerNorm(dim) 75 | else: 76 | self.pool = nn.AdaptiveAvgPool2d(7) 77 | self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1) 78 | self.norm = nn.LayerNorm(dim) 79 | self.act = nn.GELU() 80 | self.apply(self._init_weights) 81 | 82 | def _init_weights(self, m): 83 | if isinstance(m, nn.Linear): 84 | trunc_normal_(m.weight, std=.02) 85 | if isinstance(m, nn.Linear) and m.bias is not None: 86 | nn.init.constant_(m.bias, 0) 87 | elif isinstance(m, nn.LayerNorm): 88 | nn.init.constant_(m.bias, 0) 89 | nn.init.constant_(m.weight, 1.0) 90 | elif isinstance(m, nn.Conv2d): 91 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 92 | fan_out //= m.groups 93 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 94 | if m.bias is not None: 95 | m.bias.data.zero_() 96 | 97 | def forward(self, x, H, W): 98 | B, N, C = x.shape 99 | q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) 100 | 101 | if not self.linear: 102 | if self.sr_ratio > 1: 103 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 104 | x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) 105 | x_ = self.norm(x_) 106 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 107 | else: 108 | kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 109 | else: 110 | x_ = x.permute(0, 2, 1).reshape(B, C, H, W) 111 | x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1) 112 | x_ = self.norm(x_) 113 | x_ = self.act(x_) 114 | kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 115 | k, v = kv[0], kv[1] 116 | 117 | attn = (q @ k.transpose(-2, -1)) * self.scale 118 | attn = attn.softmax(dim=-1) 119 | attn = self.attn_drop(attn) 120 | 121 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 122 | x = self.proj(x) 123 | x = self.proj_drop(x) 124 | 125 | return x 126 | 127 | 128 | class Block(nn.Module): 129 | 130 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., 131 | drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False): 132 | super().__init__() 133 | self.norm1 = norm_layer(dim) 134 | self.attn = Attention( 135 | dim, 136 | num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, 137 | 
attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear) 138 | # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here 139 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 140 | self.norm2 = norm_layer(dim) 141 | mlp_hidden_dim = int(dim * mlp_ratio) 142 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear) 143 | 144 | self.apply(self._init_weights) 145 | 146 | def _init_weights(self, m): 147 | if isinstance(m, nn.Linear): 148 | trunc_normal_(m.weight, std=.02) 149 | if isinstance(m, nn.Linear) and m.bias is not None: 150 | nn.init.constant_(m.bias, 0) 151 | elif isinstance(m, nn.LayerNorm): 152 | nn.init.constant_(m.bias, 0) 153 | nn.init.constant_(m.weight, 1.0) 154 | elif isinstance(m, nn.Conv2d): 155 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 156 | fan_out //= m.groups 157 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 158 | if m.bias is not None: 159 | m.bias.data.zero_() 160 | 161 | def forward(self, x, H, W): 162 | x = x + self.drop_path(self.attn(self.norm1(x), H, W)) 163 | x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) 164 | 165 | return x 166 | 167 | 168 | class OverlapPatchEmbed(nn.Module): 169 | """ Image to Patch Embedding 170 | """ 171 | 172 | def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768): 173 | super().__init__() 174 | img_size = to_2tuple(img_size) 175 | patch_size = to_2tuple(patch_size) 176 | 177 | self.img_size = img_size 178 | self.patch_size = patch_size 179 | self.H, self.W = img_size[0] // stride, img_size[1] // stride 180 | self.num_patches = self.H * self.W 181 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, 182 | padding=(patch_size[0] // 2, patch_size[1] // 2)) 183 | self.norm = nn.LayerNorm(embed_dim) 184 | 185 | self.apply(self._init_weights) 186 | 187 | def _init_weights(self, m): 188 | if isinstance(m, nn.Linear): 189 | trunc_normal_(m.weight, std=.02) 190 | if isinstance(m, nn.Linear) and m.bias is not None: 191 | nn.init.constant_(m.bias, 0) 192 | elif isinstance(m, nn.LayerNorm): 193 | nn.init.constant_(m.bias, 0) 194 | nn.init.constant_(m.weight, 1.0) 195 | elif isinstance(m, nn.Conv2d): 196 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 197 | fan_out //= m.groups 198 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 199 | if m.bias is not None: 200 | m.bias.data.zero_() 201 | 202 | def forward(self, x): 203 | x = self.proj(x) 204 | _, _, H, W = x.shape 205 | x = x.flatten(2).transpose(1, 2) 206 | x = self.norm(x) 207 | 208 | return x, H, W 209 | 210 | 211 | class PyramidVisionTransformerV2(Backbone): 212 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dims=[64, 128, 256, 512], 213 | num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., 214 | attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, depths=[3, 4, 6, 3], 215 | sr_ratios=[8, 4, 2, 1], num_stages=4, linear=False, out_features=None): 216 | super().__init__() 217 | self.depths = depths 218 | self.num_stages = num_stages 219 | self.linear = linear 220 | 221 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule 222 | cur = 0 223 | 224 | for i in range(num_stages): 225 | patch_embed = OverlapPatchEmbed(img_size=img_size if i == 0 else img_size // (2 ** (i + 1)), 226 | patch_size=7 if i == 0 else 3, 227 | stride=4 if i == 0 
else 2, 228 | in_chans=in_chans if i == 0 else embed_dims[i - 1], 229 | embed_dim=embed_dims[i]) 230 | 231 | block = nn.ModuleList([Block( 232 | dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias, 233 | qk_scale=qk_scale, 234 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer, 235 | sr_ratio=sr_ratios[i], linear=linear) 236 | for j in range(depths[i])]) 237 | norm = norm_layer(embed_dims[i]) 238 | cur += depths[i] 239 | 240 | setattr(self, f"patch_embed{i + 1}", patch_embed) 241 | setattr(self, f"block{i + 1}", block) 242 | setattr(self, f"norm{i + 1}", norm) 243 | 244 | out_features_names = ["p1", "p2", "p3", "p4"] 245 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32])) 246 | self._out_feature_channels = dict(zip(out_features_names, embed_dims)) 247 | if out_features is None: 248 | self._out_features = out_features_names 249 | else: 250 | self._out_features = out_features 251 | self.out_features_names = out_features_names 252 | self.apply(self._init_weights) 253 | 254 | def _init_weights(self, m): 255 | if isinstance(m, nn.Linear): 256 | trunc_normal_(m.weight, std=.02) 257 | if isinstance(m, nn.Linear) and m.bias is not None: 258 | nn.init.constant_(m.bias, 0) 259 | elif isinstance(m, nn.LayerNorm): 260 | nn.init.constant_(m.bias, 0) 261 | nn.init.constant_(m.weight, 1.0) 262 | elif isinstance(m, nn.Conv2d): 263 | fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 264 | fan_out //= m.groups 265 | m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) 266 | if m.bias is not None: 267 | m.bias.data.zero_() 268 | 269 | def freeze_patch_emb(self): 270 | self.patch_embed1.requires_grad = False 271 | 272 | @torch.jit.ignore 273 | def no_weight_decay(self): 274 | return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better 275 | 276 | 277 | def output_shape(self): 278 | return { 279 | name: ShapeSpec( 280 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 281 | ) 282 | for name in self._out_features 283 | } 284 | 285 | def size_divisibility(self): 286 | return 32 287 | 288 | 289 | def forward(self, x): 290 | B = x.shape[0] 291 | outputs = {} 292 | 293 | for i in range(self.num_stages): 294 | patch_embed = getattr(self, f"patch_embed{i + 1}") 295 | block = getattr(self, f"block{i + 1}") 296 | norm = getattr(self, f"norm{i + 1}") 297 | x, H, W = patch_embed(x) 298 | for blk in block: 299 | x = blk(x, H, W) 300 | x = norm(x) 301 | x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() 302 | if self.out_features_names[i] in self._out_features: 303 | outputs[self.out_features_names[i]] = x 304 | return outputs 305 | 306 | 307 | class DWConv(nn.Module): 308 | def __init__(self, dim=768): 309 | super(DWConv, self).__init__() 310 | self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) 311 | 312 | def forward(self, x, H, W): 313 | B, N, C = x.shape 314 | x = x.transpose(1, 2).view(B, C, H, W) 315 | x = self.dwconv(x) 316 | x = x.flatten(2).transpose(1, 2) 317 | 318 | return x 319 | 320 | 321 | def _conv_filter(state_dict, patch_size=16): 322 | """ convert patch embedding weight from manual patchify + linear proj to conv""" 323 | out_dict = {} 324 | for k, v in state_dict.items(): 325 | if 'patch_embed.proj.weight' in k: 326 | v = v.reshape((v.shape[0], 3, patch_size, patch_size)) 327 | out_dict[k] = v 328 | 329 | return out_dict 330 | 331 | 332 | @BACKBONE_REGISTRY.register() 333 | def 
build_pyramid_vision_transformer(cfg, input_shape): 334 | name = cfg.MODEL.PVT.NAME 335 | linear = cfg.MODEL.PVT.LINEAR 336 | out_features = cfg.MODEL.PVT.OUT_FEATURES 337 | 338 | if linear: 339 | name = "b2" 340 | 341 | if name == "b0": 342 | embed_dims=[32, 64, 160, 256] 343 | else: 344 | embed_dims=[64, 128, 320, 512] 345 | 346 | depths = { 347 | "b0": [2, 2, 2, 2], 348 | "b1": [2, 2, 2, 2], 349 | "b2": [3, 4, 6, 3], 350 | "b3": [3, 4, 18, 3], 351 | "b4": [3, 8, 27, 3], 352 | "b5": [3, 6, 40, 3] 353 | } 354 | 355 | if name == "b5": 356 | mlp_ratios = [4, 4, 4, 4] 357 | else: 358 | mlp_ratios = [8, 8, 4, 4] 359 | 360 | in_channels = input_shape.channels 361 | 362 | return PyramidVisionTransformerV2( 363 | patch_size=4, 364 | depths=depths[name], 365 | in_chans=in_channels, 366 | embed_dims=embed_dims, 367 | num_heads=[1, 2, 5, 8], 368 | mlp_ratios=mlp_ratios, 369 | drop_rate=0.0, 370 | drop_path_rate=0.1, 371 | sr_ratios=[8, 4, 2, 1], 372 | qkv_bias=True, 373 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 374 | out_features=out_features, 375 | linear=linear 376 | ) 377 | 378 | -------------------------------------------------------------------------------- /sparseinst/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved 3 | 4 | import math 5 | import torch.nn as nn 6 | from timm.models.resnet import BasicBlock, Bottleneck 7 | from timm.models.layers import DropBlock2d, DropPath, AvgPool2dSame 8 | 9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d 10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY 11 | from detectron2.layers import NaiveSyncBatchNorm, DeformConv 12 | 13 | 14 | def get_padding(kernel_size, stride, dilation=1): 15 | padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 16 | return padding 17 | 18 | 19 | """ 20 | inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, 21 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, 22 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None 23 | """ 24 | 25 | 26 | class DeformableBottleneck(nn.Module): 27 | expansion = 4 28 | 29 | def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, 30 | reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, 31 | attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): 32 | super().__init__() 33 | 34 | width = int(math.floor(planes * (base_width / 64)) * cardinality) 35 | first_planes = width // reduce_first 36 | outplanes = planes * self.expansion 37 | first_dilation = first_dilation or dilation 38 | # use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) 39 | 40 | self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False) 41 | self.bn1 = norm_layer(first_planes) 42 | self.act1 = act_layer(inplace=True) 43 | 44 | self.conv2_offset = nn.Conv2d( 45 | first_planes, 46 | 18, 47 | kernel_size=3, 48 | stride=stride, 49 | padding=first_dilation, 50 | dilation=first_dilation 51 | ) 52 | self.conv2 = DeformConv( 53 | first_planes, 54 | width, 55 | kernel_size=3, 56 | stride=stride, 57 | padding=first_dilation, 58 | bias=False, 59 | dilation=first_dilation, 60 | ) 61 | 62 | self.bn2 = norm_layer(width) 63 | self.act2 = act_layer(inplace=True) 64 | # self.aa = aa_layer(channels=width, stride=stride) if use_aa else None 65 | 66 | self.conv3 = 
nn.Conv2d(width, outplanes, kernel_size=1, bias=False) 67 | self.bn3 = norm_layer(outplanes) 68 | 69 | # self.se = create_attn(attn_layer, outplanes) 70 | 71 | self.act3 = act_layer(inplace=True) 72 | self.downsample = downsample 73 | self.stride = stride 74 | self.dilation = dilation 75 | # self.drop_block = drop_block 76 | # self.drop_path = drop_path 77 | 78 | nn.init.constant_(self.conv2_offset.weight, 0) 79 | nn.init.constant_(self.conv2_offset.bias, 0) 80 | 81 | def zero_init_last_bn(self): 82 | nn.init.zeros_(self.bn3.weight) 83 | 84 | def forward(self, x): 85 | shortcut = x 86 | 87 | x = self.conv1(x) 88 | x = self.bn1(x) 89 | 90 | x = self.act1(x) 91 | 92 | offset = self.conv2_offset(x) 93 | x = self.conv2(x, offset) 94 | x = self.bn2(x) 95 | x = self.act2(x) 96 | 97 | x = self.conv3(x) 98 | x = self.bn3(x) 99 | 100 | if self.downsample is not None: 101 | shortcut = self.downsample(shortcut) 102 | x += shortcut 103 | x = self.act3(x) 104 | 105 | return x 106 | 107 | 108 | BLOCK_TYPE = { 109 | "basic": BasicBlock, 110 | "bottleneck": Bottleneck, 111 | "deform_bottleneck": DeformableBottleneck 112 | } 113 | 114 | 115 | def downsample_conv( 116 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): 117 | norm_layer = norm_layer or nn.BatchNorm2d 118 | kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size 119 | first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1 120 | p = get_padding(kernel_size, stride, first_dilation) 121 | 122 | return nn.Sequential(*[ 123 | nn.Conv2d( 124 | in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False), 125 | norm_layer(out_channels) 126 | ]) 127 | 128 | 129 | def downsample_avg( 130 | in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): 131 | norm_layer = norm_layer or nn.BatchNorm2d 132 | avg_stride = stride if dilation == 1 else 1 133 | if stride == 1 and dilation == 1: 134 | pool = nn.Identity() 135 | else: 136 | avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d 137 | pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) 138 | 139 | return nn.Sequential(*[ 140 | pool, 141 | nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False), 142 | norm_layer(out_channels) 143 | ]) 144 | 145 | 146 | def drop_blocks(drop_block_rate=0.): 147 | return [ 148 | None, None, 149 | DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None, 150 | DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None] 151 | 152 | 153 | def make_blocks( 154 | stage_block, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, 155 | down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): 156 | stages = [] 157 | feature_info = [] 158 | net_num_blocks = sum(block_repeats) 159 | net_block_idx = 0 160 | net_stride = 4 161 | dilation = prev_dilation = 1 162 | for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): 163 | # choose block_fn through the BLOCK_TYPE 164 | block_fn = BLOCK_TYPE[stage_block[stage_idx]] 165 | 166 | stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it 167 | stride = 1 if stage_idx == 0 else 2 168 | if net_stride >= output_stride: 169 | dilation *= stride 170 | stride = 1 171 | else: 172 | net_stride *= stride 173 | 174 | downsample = None 175 | if stride != 1 or inplanes 
!= planes * block_fn.expansion: 176 | down_kwargs = dict( 177 | in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size, 178 | stride=stride, dilation=dilation, first_dilation=prev_dilation, norm_layer=kwargs.get('norm_layer')) 179 | downsample = downsample_avg( 180 | **down_kwargs) if avg_down else downsample_conv(**down_kwargs) 181 | 182 | block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs) 183 | blocks = [] 184 | for block_idx in range(num_blocks): 185 | downsample = downsample if block_idx == 0 else None 186 | stride = stride if block_idx == 0 else 1 187 | block_dpr = drop_path_rate * net_block_idx / \ 188 | (net_num_blocks - 1) # stochastic depth linear decay rule 189 | blocks.append(block_fn( 190 | inplanes, planes, stride, downsample, first_dilation=prev_dilation, 191 | drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs)) 192 | prev_dilation = dilation 193 | inplanes = planes * block_fn.expansion 194 | net_block_idx += 1 195 | 196 | stages.append((stage_name, nn.Sequential(*blocks))) 197 | feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name)) 198 | 199 | return stages, feature_info 200 | 201 | 202 | class ResNet(Backbone): 203 | """ResNet / ResNeXt / SE-ResNeXt / SE-Net 204 | 205 | This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that 206 | * have > 1 stride in the 3x3 conv layer of bottleneck 207 | * have conv-bn-act ordering 208 | 209 | This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s 210 | variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the 211 | 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. 212 | 213 | ResNet variants (the same modifications can be used in SE/ResNeXt models as well): 214 | * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b 215 | * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) 216 | * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample 217 | * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample 218 | * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) 219 | * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample 220 | * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample 221 | 222 | ResNeXt 223 | * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths 224 | * same c,d, e, s variants as ResNet can be enabled 225 | 226 | SE-ResNeXt 227 | * normal - 7x7 stem, stem_width = 64 228 | * same c, d, e, s variants as ResNet can be enabled 229 | 230 | SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, 231 | reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block 232 | 233 | Parameters 234 | ---------- 235 | block : Block 236 | Class for the residual block. Options are BasicBlockGl, BottleneckGl. 237 | layers : list of int 238 | Numbers of layers in each block 239 | num_classes : int, default 1000 240 | Number of classification classes. 241 | in_chans : int, default 3 242 | Number of input (color) channels. 243 | cardinality : int, default 1 244 | Number of convolution groups for 3x3 conv in Bottleneck. 
245 | base_width : int, default 64 246 | Factor determining bottleneck channels. `planes * base_width / 64 * cardinality` 247 | stem_width : int, default 64 248 | Number of channels in stem convolutions 249 | stem_type : str, default '' 250 | The type of stem: 251 | * '', default - a single 7x7 conv with a width of stem_width 252 | * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2 253 | * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2 254 | block_reduce_first: int, default 1 255 | Reduction factor for first convolution output width of residual blocks, 256 | 1 for all archs except senets, where 2 257 | down_kernel_size: int, default 1 258 | Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for senets 259 | avg_down : bool, default False 260 | Whether to use average pooling for projection skip connection between stages/downsample. 261 | output_stride : int, default 32 262 | Set the output stride of the network, 32, 16, or 8. Typically used in segmentation. 263 | act_layer : nn.Module, activation layer 264 | norm_layer : nn.Module, normalization layer 265 | aa_layer : nn.Module, anti-aliasing layer 266 | drop_rate : float, default 0. 267 | Dropout probability before classifier, for training 268 | global_pool : str, default 'avg' 269 | Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax' 270 | """ 271 | 272 | def __init__(self, block_types, layers, in_chans=3, 273 | cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False, 274 | output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False, 275 | act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0., 276 | drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, out_features=None): 277 | block_args = block_args or dict() 278 | assert output_stride in (8, 16, 32) 279 | # self.num_classes = num_classes 280 | self.drop_rate = drop_rate 281 | super(ResNet, self).__init__() 282 | 283 | # Stem 284 | deep_stem = 'deep' in stem_type 285 | inplanes = stem_width * 2 if deep_stem else 64 286 | if deep_stem: 287 | stem_chs = (stem_width, stem_width) 288 | if 'tiered' in stem_type: 289 | stem_chs = (3 * (stem_width // 4), stem_width) 290 | self.conv1 = nn.Sequential(*[ 291 | nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False), 292 | norm_layer(stem_chs[0]), 293 | act_layer(inplace=True), 294 | nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False), 295 | norm_layer(stem_chs[1]), 296 | act_layer(inplace=True), 297 | nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)]) 298 | else: 299 | self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7, 300 | stride=2, padding=3, bias=False) 301 | self.bn1 = norm_layer(inplanes) 302 | self.act1 = act_layer(inplace=True) 303 | self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] 304 | 305 | # Stem Pooling 306 | if replace_stem_pool: 307 | self.maxpool = nn.Sequential(*filter(None, [ 308 | nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False), 309 | aa_layer(channels=inplanes, stride=2) if aa_layer else None, 310 | norm_layer(inplanes), 311 | act_layer(inplace=True) 312 | ])) 313 | else: 314 | if aa_layer is not None: 315 | self.maxpool = nn.Sequential(*[ 316 | nn.MaxPool2d(kernel_size=3, stride=1, padding=1), 317 | aa_layer(channels=inplanes, stride=2)]) 318 | else: 319 | self.maxpool = 
nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 320 | 321 | # Feature Blocks 322 | channels = [64, 128, 256, 512] 323 | stage_modules, stage_feature_info = make_blocks( 324 | block_types, channels, layers, inplanes, cardinality=cardinality, base_width=base_width, 325 | output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, 326 | down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, 327 | drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) 328 | for stage in stage_modules: 329 | self.add_module(*stage) # layer1, layer2, etc 330 | self.feature_info.extend(stage_feature_info) 331 | 332 | for n, m in self.named_modules(): 333 | if isinstance(m, nn.BatchNorm2d): 334 | nn.init.constant_(m.weight, 1.) 335 | nn.init.constant_(m.bias, 0.) 336 | if zero_init_last_bn: 337 | for m in self.modules(): 338 | if hasattr(m, 'zero_init_last_bn'): 339 | m.zero_init_last_bn() 340 | 341 | out_features_names = ["res2", "res3", "res4", "res5"] 342 | self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32])) 343 | self._out_feature_channels = dict( 344 | zip(out_features_names, [x * BLOCK_TYPE[block_types[0]].expansion for x in [64, 128, 256, 512]])) 345 | if out_features is None: 346 | self._out_features = out_features_names 347 | else: 348 | self._out_features = out_features 349 | 350 | def output_shape(self): 351 | return { 352 | name: ShapeSpec( 353 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 354 | ) 355 | for name in self._out_features 356 | } 357 | 358 | def size_divisibility(self): 359 | return 32 360 | 361 | def forward(self, x): 362 | x = self.conv1(x) 363 | x = self.bn1(x) 364 | x = self.act1(x) 365 | x = self.maxpool(x) 366 | outputs = {} 367 | x = self.layer1(x) 368 | # outputs["res2"] = x 369 | x = self.layer2(x) 370 | outputs["res3"] = x 371 | x = self.layer3(x) 372 | outputs["res4"] = x 373 | x = self.layer4(x) 374 | outputs["res5"] = x 375 | return outputs 376 | 377 | 378 | @BACKBONE_REGISTRY.register() 379 | def build_resnet_vd_backbone(cfg, input_shape): 380 | 381 | depth = cfg.MODEL.RESNETS.DEPTH 382 | norm_name = cfg.MODEL.RESNETS.NORM 383 | if norm_name == "FrozenBN": 384 | norm = FrozenBatchNorm2d 385 | elif norm_name == "SyncBN": 386 | norm = NaiveSyncBatchNorm 387 | else: 388 | norm = nn.BatchNorm2d 389 | if depth == 50: 390 | layers = [3, 4, 6, 3] 391 | elif depth == 101: 392 | layers = [3, 4, 23, 3] 393 | else: 394 | raise NotImplementedError() 395 | 396 | stage_blocks = [] 397 | use_deformable = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE 398 | for idx in range(4): 399 | if use_deformable[idx]: 400 | stage_blocks.append("deform_bottleneck") 401 | else: 402 | stage_blocks.append("bottleneck") 403 | 404 | model = ResNet(stage_blocks, layers, stem_type="deep", 405 | stem_width=32, avg_down=True, norm_layer=norm) 406 | return model 407 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
Visualization results (SparseInst-R50-GIAM)
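For reference, the bipartite matching used by `SparseInstMatcher` in `sparseinst/loss.py` above can be exercised in isolation. The snippet below is a minimal, self-contained sketch using random tensors; the shapes and the `alpha`/`beta` values are illustrative assumptions rather than values read from the configs, and `dice_score` is copied from the loss file so the example needs only `torch` and `scipy`.

```python
# Standalone sketch of the SparseInst matching cost, mirroring SparseInstMatcher in
# sparseinst/loss.py: C = dice_score ** alpha * class_prob ** beta, then Hungarian matching.
# Shapes and alpha/beta below are assumed, illustrative values.
import torch
from scipy.optimize import linear_sum_assignment


def dice_score(inputs, targets):
    # inputs: (N, HW) mask logits, targets: (G, HW) binary masks -> (N, G) pairwise dice scores
    inputs = inputs.sigmoid()
    numerator = 2 * torch.matmul(inputs, targets.t())
    denominator = (inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1)
    return numerator / (denominator + 1e-4)


N, G, HW = 100, 3, 32 * 32                  # predictions, ground-truth instances, flattened mask size
pred_masks = torch.randn(N, HW)             # predicted mask logits
tgt_masks = (torch.rand(G, HW) > 0.5).float()
matching_prob = torch.rand(N, G)            # sigmoid class scores gathered at the G target labels

alpha, beta = 0.8, 0.2                      # stand-ins for MATCHER.ALPHA / MATCHER.BETA
C = dice_score(pred_masks, tgt_masks) ** alpha * matching_prob ** beta

# Hungarian matching: one prediction index per ground-truth instance, maximizing the combined score
pred_idx, tgt_idx = linear_sum_assignment(C.numpy(), maximize=True)
print(list(zip(pred_idx.tolist(), tgt_idx.tolist())))
```

In the criterion itself, the resulting (prediction, target) index pairs are what `_get_src_permutation_idx` and `_get_tgt_permutation_idx` consume before the classification, dice, mask, and objectness losses are computed.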