├── assets
    ├── banner.gif
    ├── animate.gif
    ├── sparseinst.png
    └── figures
    │   ├── 000000006471.jpg
    │   └── 000000014439.jpg
├── configs
    ├── sparse_inst_r50_giam.yaml
    ├── sparse_inst_r101_giam.yaml
    ├── sparse_inst_r50_giam_fp16.yaml
    ├── sparse_inst_r50_base.yaml
    ├── sparse_inst_r50_giam_soft.yaml
    ├── sparse_inst_r50vd_giam.yaml
    ├── sparse_inst_r101_dcn_giam.yaml
    ├── sparse_inst_r50_giam_aug.yaml
    ├── sparse_inst_r50vd_base.yaml
    ├── sparse_inst_darknet53_giam.yaml
    ├── sparse_inst_r50vd_dcn_giam.yaml
    ├── sparse_inst_pvt_b1_giam.yaml
    ├── sparse_inst_r50_dcn_giam_aug.yaml
    ├── sparse_inst_r50vd_giam_aug.yaml
    ├── sparse_inst_pvt_b2_li_giam.yaml
    ├── sparse_inst_cspdarknet53_giam.yaml
    ├── sparse_inst_r50vd_dcn_giam_aug.yaml
    └── Base-SparseInst.yaml
├── sparseinst
    ├── backbones
    │   ├── __init__.py
    │   ├── pvt.py
    │   ├── resnet.py
    │   └── cspnet.py
    ├── __init__.py
    ├── config.py
    ├── coco_evaluation.py
    ├── encoder.py
    ├── utils.py
    ├── dataset_mapper.py
    ├── sparseinst.py
    ├── d2_predictor.py
    ├── decoder.py
    └── loss.py
├── mindspore
    ├── sparseinst
    │   ├── __init__.py
    │   ├── config.py
    │   ├── encoder.py
    │   ├── resnet.py
    │   ├── sparseinst.py
    │   └── decoder.py
    ├── README.md
    ├── dict.py
    └── test.py
├── .gitignore
├── LICENCE
├── Dockerfile
├── onnx
    └── convert_onnx.py
├── demo.py
├── tools
    ├── get_flops.py
    ├── test_net.py
    └── train_net.py
└── README.md


/assets/banner.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/banner.gif


--------------------------------------------------------------------------------
/assets/animate.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/animate.gif


--------------------------------------------------------------------------------
/assets/sparseinst.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/sparseinst.png


--------------------------------------------------------------------------------
/assets/figures/000000006471.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/figures/000000006471.jpg


--------------------------------------------------------------------------------
/assets/figures/000000014439.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hustvl/SparseInst/HEAD/assets/figures/000000014439.jpg


--------------------------------------------------------------------------------
/configs/sparse_inst_r50_giam.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "Base-SparseInst.yaml"
2 | MODEL:
3 |   WEIGHTS: "pretrained_models/R-50.pkl"
4 | OUTPUT_DIR: "output/sparse_inst_r50_giam"


--------------------------------------------------------------------------------
/sparseinst/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | from .resnet import build_resnet_vd_backbone
2 | from .pvt import build_pyramid_vision_transformer
3 | from .cspnet import build_cspnet_backbone


--------------------------------------------------------------------------------
/configs/sparse_inst_r101_giam.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "Base-SparseInst.yaml"
2 | MODEL:
3 |   WEIGHTS: "pretrained_models/R-101.pkl"
4 |   RESNETS:
5 |     DEPTH: 101
6 | OUTPUT_DIR: "output/sparse_inst_r101_giam"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50_giam_fp16.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "Base-SparseInst.yaml"
2 | MODEL:
3 |   WEIGHTS: "pretrained_models/R-50.pkl"
4 | SOLVER:
5 |   AMP:
6 |     ENABLED: True
7 | OUTPUT_DIR: "output/sparse_inst_r50_giam_fp16"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50_base.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "Base-SparseInst.yaml"
2 | MODEL:
3 |   WEIGHTS: "pretrained_models/R-50.pkl"
4 |   SPARSE_INST:
5 |     DECODER:
6 |       NAME: "BaseIAMDecoder"
7 | OUTPUT_DIR: "output/sparse_inst_r50_base"


--------------------------------------------------------------------------------
/mindspore/sparseinst/__init__.py:
--------------------------------------------------------------------------------
1 | from .sparseinst import SparseInst
2 | from .config import cfg,update_config
3 | from .resnet import build_resnet50
4 | from .encoder import InstanceContextEncoder
5 | from .decoder import GroupIAMDecoder


--------------------------------------------------------------------------------
/configs/sparse_inst_r50_giam_soft.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "Base-SparseInst.yaml"
2 | MODEL:
3 |   WEIGHTS: "pretrained_models/R-50.pkl"
4 |   SPARSE_INST:
5 |     DECODER:
6 |       NAME: "GroupIAMSoftDecoder"
7 | OUTPUT_DIR: "output/sparse_inst_r50_giam_soft"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50vd_giam.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "Base-SparseInst.yaml"
2 | MODEL:
3 |   WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth"
4 |   BACKBONE:
5 |     FREEZE_AT: 0
6 |     NAME: "build_resnet_vd_backbone"
7 | OUTPUT_DIR: "output/sparse_inst_r50vd_giam"
8 | 


--------------------------------------------------------------------------------
/configs/sparse_inst_r101_dcn_giam.yaml:
--------------------------------------------------------------------------------
1 | _BASE_: "Base-SparseInst.yaml"
2 | MODEL:
3 |   WEIGHTS: "pretrained_models/R-101.pkl"
4 |   RESNETS:
5 |     DEPTH: 101
6 |     DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5
7 | OUTPUT_DIR: "output/sparse_inst_r101_dcn_giam"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50_giam_aug.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/R-50.pkl"
 4 | INPUT:
 5 |   CROP:
 6 |     ENABLED: True
 7 |     TYPE: "absolute_range"
 8 |     SIZE: (384, 600)
 9 |   MASK_FORMAT: "polygon"
10 | OUTPUT_DIR: "output/sparse_inst_r50_giam_aug"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50vd_base.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth"
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_vd_backbone"
 7 |   SPARSE_INST:
 8 |     DECODER:
 9 |       NAME: "BaseIAMDecoder"
10 | OUTPUT_DIR: "output/sparse_inst_r50_base"


--------------------------------------------------------------------------------
/configs/sparse_inst_darknet53_giam.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: ""
 4 |   BACKBONE:
 5 |     NAME: "build_cspnet_backbone"
 6 |   SPARSE_INST:
 7 |     ENCODER:
 8 |       IN_FEATURES: ["csp2", "csp3", "csp4"]
 9 |   CSPNET:
10 |     NAME: "darknet53"
11 |     OUT_FEATURES: ["csp2", "csp3", "csp4"]
12 | OUTPUT_DIR: "output/sparse_inst_darknet53_giam"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50vd_dcn_giam.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth"
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_vd_backbone"
 7 |   RESNETS:
 8 |     DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5
 9 | OUTPUT_DIR: "output/sparse_inst_r50vd_dcn_giam"
10 |   
11 | 


--------------------------------------------------------------------------------
/configs/sparse_inst_pvt_b1_giam.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/pvt_v2_b1.pth"
 4 |   BACKBONE:
 5 |     NAME: "build_pyramid_vision_transformer"
 6 |   SPARSE_INST:
 7 |     ENCODER:
 8 |       IN_FEATURES: ["p2", "p3", "p4"]
 9 |   PVT:
10 |     NAME: "b1"
11 |     OUT_FEATURES: ["p2", "p3", "p4"]
12 | OUTPUT_DIR: "output/sparse_inst_pvt_b1_giam"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50_dcn_giam_aug.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/R-50.pkl"
 4 |   RESNETS:
 5 |     DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5
 6 | INPUT:
 7 |   CROP:
 8 |     ENABLED: True
 9 |     TYPE: "absolute_range"
10 |     SIZE: (384, 600)
11 |   MASK_FORMAT: "polygon"
12 | OUTPUT_DIR: "output/sparse_inst_r50_dcn_giam_aug"


--------------------------------------------------------------------------------
/configs/sparse_inst_r50vd_giam_aug.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth"
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_vd_backbone"
 7 | INPUT:
 8 |   CROP:
 9 |     ENABLED: True
10 |     TYPE: "absolute_range"
11 |     SIZE: (384, 600)
12 |   MASK_FORMAT: "polygon"
13 | OUTPUT_DIR: "output/sparse_inst_r50vd_giam_aug"
14 | 


--------------------------------------------------------------------------------
/configs/sparse_inst_pvt_b2_li_giam.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/pvt_v2_b2_li.pth"
 4 |   BACKBONE:
 5 |     NAME: "build_pyramid_vision_transformer"
 6 |   SPARSE_INST:
 7 |     ENCODER:
 8 |       IN_FEATURES: ["p2", "p3", "p4"]
 9 |   PVT:
10 |     NAME: "b2"
11 |     LINEAR: True
12 |     OUT_FEATURES: ["p2", "p3", "p4"]
13 | OUTPUT_DIR: "output/sparse_inst_pvt_b2_linear_giam"


--------------------------------------------------------------------------------
/configs/sparse_inst_cspdarknet53_giam.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/cspdarknet53_ra_256-d05c7c21.pth"
 4 |   BACKBONE:
 5 |     NAME: "build_cspnet_backbone"
 6 |   SPARSE_INST:
 7 |     ENCODER:
 8 |       IN_FEATURES: ["csp2", "csp3", "csp4"]
 9 |     DECODER:
10 |       NAME: "GroupIAMSoftDecoder"
11 |   CSPNET:
12 |     NAME: "cspdarknet53"
13 |     OUT_FEATURES: ["csp2", "csp3", "csp4"]
14 | OUTPUT_DIR: "output/sparse_inst_cspdarknet53_giam"


--------------------------------------------------------------------------------
/sparseinst/__init__.py:
--------------------------------------------------------------------------------
 1 | from .sparseinst import SparseInst
 2 | from .encoder import build_sparse_inst_encoder
 3 | from .decoder import build_sparse_inst_decoder
 4 | from .config import add_sparse_inst_config
 5 | from .loss import build_sparse_inst_criterion
 6 | from .dataset_mapper import SparseInstDatasetMapper
 7 | from .coco_evaluation import COCOMaskEvaluator
 8 | from .backbones import build_resnet_vd_backbone, build_pyramid_vision_transformer
 9 | from .d2_predictor import VisualizationDemo
10 | 


--------------------------------------------------------------------------------
/configs/sparse_inst_r50vd_dcn_giam_aug.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-SparseInst.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "pretrained_models/resnet50d_ra2-464e36ba.pth"
 4 |   BACKBONE:
 5 |     FREEZE_AT: 0
 6 |     NAME: "build_resnet_vd_backbone"
 7 |   RESNETS:
 8 |     DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5
 9 | INPUT:
10 |   CROP:
11 |     ENABLED: True
12 |     TYPE: "absolute_range"
13 |     SIZE: (384, 600)
14 |   MASK_FORMAT: "polygon"
15 | OUTPUT_DIR: "output/sparse_inst_r50vd_dcn_giam_aug"
16 |   
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # output dir
 2 | output
 3 | output*
 4 | instant_test_output
 5 | inference_test_output
 6 | 
 7 | 
 8 | *.png
 9 | *.json
10 | *.diff
11 | 
12 | # compilation and distribution
13 | __pycache__
14 | _ext
15 | *.pyc
16 | *.pyd
17 | *.so
18 | detectron2.egg-info/
19 | build/
20 | dist/
21 | wheels/
22 | 
23 | # pytorch/python/numpy formats
24 | *.pth
25 | *.pkl
26 | *.npy
27 | 
28 | # ipython/jupyter notebooks
29 | *.ipynb
30 | **/.ipynb_checkpoints/
31 | 
32 | # Editor temporaries
33 | *.swn
34 | *.swo
35 | *.swp
36 | *~
37 | 
38 | # editor settings
39 | .idea
40 | .vscode
41 | _darcs
42 | 
43 | # project dirs
44 | /detectron2/model_zoo/configs
45 | /datasets/*
46 | !/datasets/*.*
47 | /projects/*/datasets
48 | /models
49 | 
50 | # mac file
51 | .DS_Store


--------------------------------------------------------------------------------
/mindspore/README.md:
--------------------------------------------------------------------------------
 1 | # SparseInst on MindSpore
 2 | 
 3 | ##  Installation  
 4 | 
 5 | 1. create python 3.8 environment  
 6 | ```bash
 7 | conda create -n sparseinst-ms python=3.8  
 8 | ```
 9 | 2. activate the new environment  
10 | ```bash
11 | conda activate sparseinst-ms    
12 | ```
13 | 
14 | 3. install mindspore   
15 | ``` bash
16 | pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/1.8.1/MindSpore/gpu/x86_64/cuda-11.1/mindspore_gpu-1.8.1-cp38-cp38-linux_x86_64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://pypi.tuna.tsinghua.edu.cn/simple 
17 | ``` 
18 | 
19 | 4. install dependencies   
20 | ```bash
21 | pip install mindvision pycocotools opencv-python numpy yacs   
22 | ```
23 | 
24 | ##  Model
25 | 
26 | We provide the basic SparseInst-R50-GIAM in [BaiduPan](https://pan.baidu.com/s/1ZmZ6nqZrwt4ALYP1B2kdCA?pwd=7xsb).
27 | 
28 | ##  Demo
29 | 
30 | ```bash
31 | python test.py --config /path/to/your/checkpoint  --image_name /path/to/your/image --visualize  
32 | ```
33 | 
34 | The results will be saved in ./image_name/
35 | 


--------------------------------------------------------------------------------
/configs/Base-SparseInst.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   META_ARCHITECTURE: "SparseInst"
 3 |   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
 4 |   PIXEL_MEAN: [123.675, 116.280, 103.530]
 5 |   PIXEL_STD: [58.395, 57.120, 57.375]
 6 |   BACKBONE:
 7 |     FREEZE_AT: 0
 8 |     NAME: "build_resnet_backbone"
 9 |   RESNETS:
10 |     NORM: "FrozenBN"
11 |     DEPTH: 50
12 |     STRIDE_IN_1X1: False
13 |     OUT_FEATURES: ["res3", "res4", "res5"]
14 |   SPARSE_INST:
15 |     ENCODER:
16 |       NAME: "InstanceContextEncoder"
17 |     DECODER:
18 |       NAME: "GroupIAMDecoder"
19 | DATASETS:
20 |   TRAIN: ("coco_2017_train",)
21 |   TEST:  ("coco_2017_val",)
22 | SOLVER:
23 |   IMS_PER_BATCH: 64
24 |   BASE_LR: 0.00005
25 |   STEPS: (210000, 250000)
26 |   MAX_ITER: 270000
27 |   WEIGHT_DECAY: 0.05
28 | INPUT:
29 |   MIN_SIZE_TRAIN: (416, 448, 480, 512, 544, 576, 608, 640)
30 |   MAX_SIZE_TRAIN: 853
31 |   MIN_SIZE_TEST: 640
32 |   MAX_SIZE_TEST: 853
33 |   FORMAT: "RGB"
34 |   MASK_FORMAT: "bitmask"
35 | TEST:
36 |   EVAL_PERIOD: 7330
37 | DATALOADER:
38 |   NUM_WORKERS: 6
39 | VERSION: 2
40 | 


--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Hust Visual Learning Team
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/mindspore/dict.py:
--------------------------------------------------------------------------------
1 | id2category={1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck',9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 13: 'stop sign', 14: 'parking meter', 15: 'bench',16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra',25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee',35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove',41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 46: 'wine glass', 47: 'cup',48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange',56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'couch',64: 'potted plant', 65: 'bed', 67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse',75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven', 80: 'toaster', 81: 'sink',82: 'refrigerator', 84: 'book', 85: 'clock', 86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier',90: 'toothbrush'}


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-devel
 2 | LABEL Service="SparseInstanceActivation"
 3 | 
 4 | ENV TZ=Europe/Moscow
 5 | ENV DETECTRON_TAG=v0.3
 6 | ARG DEBIAN_FRONTEND=noninteractive
 7 | 
 8 | RUN apt-key del 7fa2af80 && \
 9 |     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
10 |     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
11 | RUN apt update && apt install vim git g++ python3-tk ffmpeg libsm6 libxext6 -y
12 | 
13 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
14 |     python3 -m pip install --no-cache-dir opencv-python opencv-contrib-python scipy
15 | 
16 | WORKDIR /workspace
17 | RUN git clone https://github.com/facebookresearch/detectron2.git && \
18 |     cd detectron2/ && git checkout tags/${DETECTRON_TAG} && python3 setup.py build develop
19 | 
20 | RUN python3 -m pip uninstall -y iopath fvcore portalocker yacs && \
21 |     python3 -m pip install --no-cache-dir iopath fvcore portalocker yacs timm pyyaml==5.1 shapely
22 | 
23 | RUN git clone https://github.com/hustvl/SparseInst
24 | WORKDIR /workspace/SparseInst
25 | RUN ln -s /usr/bin/python3 /usr/bin/python
26 | 
27 | ENTRYPOINT bash
28 | 


--------------------------------------------------------------------------------
/mindspore/sparseinst/config.py:
--------------------------------------------------------------------------------
 1 | from yacs.config import CfgNode as CN
 2 | import os
 3 | 
 4 | 
 5 | def update_config(cfg, args):
 6 | 	cfg.defrost()
 7 | 	cfg.merge_from_file(args.cfg)
 8 | 	cfg.freeze()
 9 | 	return cfg
10 | 
11 | cfg = CN()
12 | 
13 | cfg.MODEL=CN()
14 | cfg.MODEL.SPARSE_INST=CN()
15 | cfg.MODEL.SPARSE_INST.ENCODER=CN()
16 | cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS=256
17 | cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES=['res3','res4','res5']
18 | 
19 | cfg.MODEL.SPARSE_INST.DECODER=CN()
20 | cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100
21 | cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80
22 | cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128
23 | cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2
24 | cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False
25 | cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4
26 | 
27 | cfg.MODEL.SPARSE_INST.DECODER.INST=CN()
28 | cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256
29 | cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4
30 | 
31 | cfg.MODEL.SPARSE_INST.DECODER.MASK=CN()
32 | cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256
33 | cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4
34 | 
35 | cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005
36 | cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45
37 | cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100
38 | 
39 | cfg.MODEL.PIXEL_MEAN=[123.675, 116.280, 103.530]
40 | cfg.MODEL.PIXEL_STD=[58.395, 57.120, 57.375]
41 | 


--------------------------------------------------------------------------------
/sparseinst/config.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
 2 | 
 3 | from detectron2.config import CfgNode as CN
 4 | 
 5 | def add_sparse_inst_config(cfg):
 6 | 
 7 |     cfg.MODEL.DEVICE = 'cuda'
 8 |     cfg.MODEL.MASK_ON = True
 9 |     # [SparseInst]
10 |     cfg.MODEL.SPARSE_INST = CN()
11 | 
12 |     # parameters for inference
13 |     cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005
14 |     cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45
15 |     cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100
16 | 
17 |     # [Encoder]
18 |     cfg.MODEL.SPARSE_INST.ENCODER = CN()
19 |     cfg.MODEL.SPARSE_INST.ENCODER.NAME = "FPNPPMEncoder"
20 |     cfg.MODEL.SPARSE_INST.ENCODER.NORM = ""
21 |     cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES = ["res3", "res4", "res5"]
22 |     cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS = 256
23 | 
24 |     # [Decoder]
25 |     cfg.MODEL.SPARSE_INST.DECODER = CN()
26 |     cfg.MODEL.SPARSE_INST.DECODER.NAME = "BaseIAMDecoder"
27 |     cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100
28 |     cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80
29 |     # kernels for mask features
30 |     cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128
31 |     # upsample factor for output masks
32 |     cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2.0
33 |     cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False
34 |     cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4    
35 |     # decoder.inst_branch
36 |     cfg.MODEL.SPARSE_INST.DECODER.INST = CN()
37 |     cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256
38 |     cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4
39 |     # decoder.mask_branch
40 |     cfg.MODEL.SPARSE_INST.DECODER.MASK = CN()
41 |     cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256
42 |     cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4
43 | 
44 |     # [Loss]
45 |     cfg.MODEL.SPARSE_INST.LOSS = CN()
46 |     cfg.MODEL.SPARSE_INST.LOSS.NAME = "SparseInstCriterion"
47 |     cfg.MODEL.SPARSE_INST.LOSS.ITEMS = ("labels", "masks")
48 |     # loss weight
49 |     cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT = 2.0
50 |     cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT = 5.0
51 |     cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT = 2.0
52 |     # iou-aware objectness loss weight
53 |     cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT = 1.0
54 | 
55 |     # [Matcher]
56 |     cfg.MODEL.SPARSE_INST.MATCHER = CN()
57 |     cfg.MODEL.SPARSE_INST.MATCHER.NAME = "SparseInstMatcher"
58 |     cfg.MODEL.SPARSE_INST.MATCHER.ALPHA = 0.8
59 |     cfg.MODEL.SPARSE_INST.MATCHER.BETA = 0.2
60 | 
61 |     # [Optimizer]
62 |     cfg.SOLVER.OPTIMIZER = "ADAMW"
63 |     cfg.SOLVER.BACKBONE_MULTIPLIER = 1.0
64 |     cfg.SOLVER.AMSGRAD = False
65 | 
66 |     # [Dataset mapper]
67 |     cfg.MODEL.SPARSE_INST.DATASET_MAPPER = "SparseInstDatasetMapper"
68 | 
69 |     # [Pyramid Vision Transformer]
70 |     cfg.MODEL.PVT = CN()
71 |     cfg.MODEL.PVT.NAME = "b1"
72 |     cfg.MODEL.PVT.OUT_FEATURES = ["p2", "p3", "p4"]
73 |     cfg.MODEL.PVT.LINEAR = False
74 | 
75 |     cfg.MODEL.CSPNET = CN()
76 |     cfg.MODEL.CSPNET.NAME = "darknet53"
77 |     cfg.MODEL.CSPNET.NORM = ""
78 |     # (csp-)darknet: csp1, csp2, csp3, csp4
79 |     cfg.MODEL.CSPNET.OUT_FEATURES = ["csp1", "csp2", "csp3", "csp4"]
80 | 
81 | 


--------------------------------------------------------------------------------
/mindspore/sparseinst/encoder.py:
--------------------------------------------------------------------------------
 1 | import mindspore
 2 | from mindspore import Tensor
 3 | import mindspore.nn as nn
 4 | from mindspore.nn import Conv2d
 5 | import mindspore.ops as ops
 6 | 
 7 | __all__=["InstanceContextEncoder"]
 8 | 
 9 | 
10 | class PyramidPoolingModule(nn.Cell):
11 | 	def __init__(self,in_channels,channels=512,sizes=(1,2,3,6)):
12 | 		super().__init__()
13 | 		self.stages=[]
14 | 		self.stages=nn.CellList([self._make_stage(in_channels,channels,size) for size in sizes])
15 | 		self.bottleneck=Conv2d(in_channels+len(sizes)*channels,in_channels,1,has_bias=False)
16 | 
17 | 	def _make_stage(self,features,out_features,size):
18 | 		prior=nn.AdaptiveAvgPool2d(output_size=(size,size))
19 | 		conv=nn.Conv2d(features,out_features,1,has_bias=True)
20 | 		return nn.SequentialCell(prior,conv)
21 | 
22 | 	def construct(self,feats):
23 | 		h, w = feats.shape[2], feats.shape[3]
24 | 
25 | 		prior=[ops.ResizeBilinear((h,w))(ops.ReLU()(stage(feats))) for stage in self.stages]+[feats]
26 | 		out=ops.ReLU()(self.bottleneck(ops.Concat(axis=1)(prior)))
27 | 		return out
28 | 
29 | 
30 | 
31 | class InstanceContextEncoder(nn.Cell):
32 | 	def __init__(self,cfg,input_shape):
33 | 		super().__init__()
34 | 		self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS  #256
35 | 		self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES  #[‘res3','res4','res5']
36 | 		# self.norm = cfg.MODEL.SPARSE_INST.ENCODER.NORM
37 | 		# depthwise = cfg.MODEL.SPARSE_INST.ENCODER.DEPTHWISE
38 | 		self.in_channels = [input_shape[f] for f in self.in_features]
39 | 		# self.using_bias = self.norm == ""
40 | 		fpn_laterals = []
41 | 		fpn_outputs = []
42 | 		# groups = self.num_channels if depthwise else 1
43 | 		for in_channel in reversed(self.in_channels):
44 | 			lateral_conv = nn.Conv2d(in_channel, self.num_channels, 1,has_bias=True)
45 | 			output_conv = nn.Conv2d(self.num_channels, self.num_channels, 3,has_bias=True)
46 | 			fpn_laterals.append(lateral_conv)
47 | 			fpn_outputs.append(output_conv)
48 | 		self.fpn_laterals = nn.CellList(fpn_laterals)
49 | 		self.fpn_outputs = nn.CellList(fpn_outputs)
50 | 		# ppm
51 | 		self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4)
52 | 		# final fusion
53 | 		self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1,has_bias=True)
54 | 
55 | 
56 | 	def construct(self, features): #features:dict
57 | 		features = [features[f] for f in self.in_features]
58 | 		features = features[::-1]
59 | 		prev_features = self.ppm(self.fpn_laterals[0](features[0]))
60 | 		outputs = [self.fpn_outputs[0](prev_features)]
61 | 
62 | 		for feature, lat_conv, output_conv in zip(features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:]):
63 | 			lat_features = lat_conv(feature)
64 | 
65 | 			h,w=prev_features.shape[2],prev_features.shape[3]
66 | 			top_down_features = ops.ResizeNearestNeighbor(size=(h*2,w*2))(prev_features)###
67 | 
68 | 			prev_features = lat_features + top_down_features
69 | 			outputs.insert(0, output_conv(prev_features))
70 | 
71 | 		size = outputs[0].shape[2:]
72 | 		features = [outputs[0]] + [ops.ResizeBilinear(size)(x) for x in outputs[1:]]
73 | 
74 | 		features = self.fusion(ops.Concat(axis=1)(features))
75 | 		return features
76 | 


--------------------------------------------------------------------------------
/mindspore/sparseinst/resnet.py:
--------------------------------------------------------------------------------
  1 | import mindspore
  2 | import mindspore.ops as ops
  3 | import mindspore.nn as nn
  4 | from typing import Type, Union, List, Optional
  5 | from mindvision.classification.models import ResidualBlock,ResidualBlockBase
  6 | from mindvision.engine.class_factory import ClassFactory, ModuleType
  7 | from mindvision.classification.models.blocks import ConvNormActivation
  8 | from collections import OrderedDict
  9 | 
 10 | 
 11 | class ResNet(nn.Cell):
 12 | 	"""
 13 | 	ResNet architecture.
 14 | 
 15 | 	Args:
 16 | 		block (Type[Union[ResidualBlockBase, ResidualBlock]]): THe block for network.
 17 | 		layer_nums (list): The numbers of block in different layers.
 18 | 		group (int): The number of Group convolutions. Default: 1.
 19 | 		base_width (int): The width of per group. Default: 64.
 20 | 		norm (nn.Cell, optional): The module specifying the normalization layer to use. Default: None.
 21 | 
 22 | 	Inputs:
 23 | 		- **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
 24 | 	"""
 25 | 	def __init__(self,
 26 | 				block: Type[Union[ResidualBlockBase, ResidualBlock]],
 27 | 				layer_nums: List[int],
 28 | 				group: int = 1,
 29 | 				base_width: int = 64,
 30 | 				norm: Optional[nn.Cell] = None
 31 | 				) -> None:
 32 | 		super(ResNet, self).__init__()
 33 | 		self.output_shape={}
 34 | 		if not norm:
 35 | 			norm = nn.BatchNorm2d
 36 | 		self.norm = norm
 37 | 		self.in_channels = 64
 38 | 		self.group = group
 39 | 		self.base_with = base_width
 40 | 		self.stem=OrderedDict()
 41 | 		conv1 = ConvNormActivation(
 42 | 			3, self.in_channels, kernel_size=7, stride=2, norm=norm)
 43 | 		self.stem['conv1']=conv1
 44 | 		self.stem['maxpool2d']=nn.MaxPool2d(kernel_size=3,stride=2,pad_mode='same')
 45 | 		self.stem=nn.SequentialCell(self.stem)
 46 | 		self.res2 = self._make_layer(block, 64, layer_nums[0])
 47 | 		self.res3 = self._make_layer(block, 128, layer_nums[1], stride=2)
 48 | 		self.res4 = self._make_layer(block, 256, layer_nums[2], stride=2)
 49 | 		self.res5 = self._make_layer(block, 512, layer_nums[3], stride=2)
 50 | 
 51 | 	def _make_layer(self,
 52 | 					block: Type[Union[ResidualBlockBase, ResidualBlock]],
 53 | 					channel: int,
 54 | 					block_nums: int,
 55 | 					stride: int = 1
 56 | 					):
 57 | 
 58 | 		down_sample = None
 59 | 
 60 | 		if stride != 1 or self.in_channels != self.in_channels * block.expansion:
 61 | 			down_sample = ConvNormActivation(
 62 | 				self.in_channels,
 63 | 				channel * block.expansion,
 64 | 				kernel_size=1,
 65 | 				stride=stride,
 66 | 				norm=self.norm,
 67 | 				activation=None)
 68 | 		layers = []
 69 | 		layers.append(
 70 | 			block(
 71 | 				self.in_channels,
 72 | 				channel,
 73 | 				stride=stride,
 74 | 				down_sample=down_sample,
 75 | 				group=self.group,
 76 | 				base_width=self.base_with,
 77 | 				norm=self.norm
 78 | 			)
 79 | 		)
 80 | 		self.in_channels = channel * block.expansion
 81 | 
 82 | 		for _ in range(1, block_nums):
 83 | 			layers.append(
 84 | 				block(
 85 | 					self.in_channels,
 86 | 					channel,
 87 | 					group=self.group,
 88 | 					base_width=self.base_with,
 89 | 					norm=self.norm
 90 | 				)
 91 | 			)
 92 | 
 93 | 		return nn.SequentialCell(layers)
 94 | 
 95 | 	def output_channel(self):
 96 | 		output_channel={'res3':512,'res4':1024,'res5':2048}
 97 | 		return output_channel
 98 | 
 99 | 	def construct(self, x):
100 | 		output={}
101 | 		x = self.stem(x)
102 | 
103 | 		x = self.res2(x)
104 | 		x = self.res3(x)
105 | 		output['res3']=x
106 | 		x = self.res4(x)
107 | 		output['res4']=x
108 | 		x = self.res5(x)
109 | 		output['res5']=x
110 | 
111 | 		return output
112 | 
113 | 
114 | 
115 | def build_resnet50():
116 | 	return ResNet(ResidualBlock,[3,4,6,3])
117 | 
118 | 


--------------------------------------------------------------------------------
/sparseinst/coco_evaluation.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pycocotools.mask as mask_util
 3 | from detectron2.structures import BoxMode
 4 | from detectron2.evaluation import COCOEvaluator
 5 | 
 6 | 
 7 | def instances_to_coco_json(instances, img_id):
 8 |     """
 9 |     Dump an "Instances" object to a COCO-format json that's used for evaluation.
10 | 
11 |     Args:
12 |         instances (Instances):
13 |         img_id (int): the image id
14 | 
15 |     Returns:
16 |         list[dict]: list of json annotations in COCO format.
17 |     """
18 |     num_instance = len(instances)
19 |     if num_instance == 0:
20 |         return []
21 | 
22 |     # NOTE: pure instance segmentation
23 |     has_box = instances.has("pred_boxes")
24 |     if has_box:
25 |         boxes = instances.pred_boxes.tensor.numpy()
26 |         boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
27 |         boxes = boxes.tolist()
28 | 
29 |     scores = instances.scores.tolist()
30 |     classes = instances.pred_classes.tolist()
31 | 
32 |     has_mask = instances.has("pred_masks")
33 |     if has_mask:
34 |         # use RLE to encode the masks, because they are too large and takes memory
35 |         # since this evaluator stores outputs of the entire dataset
36 |         rles = [
37 |             mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
38 |             for mask in instances.pred_masks
39 |         ]
40 |         for rle in rles:
41 |             # "counts" is an array encoded by mask_util as a byte-stream. Python3's
42 |             # json writer which always produces strings cannot serialize a bytestream
43 |             # unless you decode it. Thankfully, utf-8 works out (which is also what
44 |             # the pycocotools/_mask.pyx does).
45 |             rle["counts"] = rle["counts"].decode("utf-8")
46 | 
47 |     has_keypoints = instances.has("pred_keypoints")
48 |     if has_keypoints:
49 |         keypoints = instances.pred_keypoints
50 | 
51 |     results = []
52 |     for k in range(num_instance):
53 |         result = {
54 |             "image_id": img_id,
55 |             "category_id": classes[k],
56 |             "score": scores[k],
57 |         }
58 |         if has_box:
59 |             result["bbox"] = boxes[k]
60 |         if has_mask:
61 |             result["segmentation"] = rles[k]
62 |         if has_keypoints:
63 |             # In COCO annotations,
64 |             # keypoints coordinates are pixel indices.
65 |             # However our predictions are floating point coordinates.
66 |             # Therefore we subtract 0.5 to be consistent with the annotation format.
67 |             # This is the inverse of data loading logic in `datasets/coco.py`.
68 |             keypoints[k][:, :2] -= 0.5
69 |             result["keypoints"] = keypoints[k].flatten().tolist()
70 |         results.append(result)
71 |     return results
72 | 
73 | 
74 | class COCOMaskEvaluator(COCOEvaluator):
75 | 
76 |     def process(self, inputs, outputs):
77 |         """
78 |         Args:
79 |             inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
80 |                 It is a list of dict. Each dict corresponds to an image and
81 |                 contains keys like "height", "width", "file_name", "image_id".
82 |             outputs: the outputs of a COCO model. It is a list of dicts with key
83 |                 "instances" that contains :class:`Instances`.
84 |         """
85 |         for input, output in zip(inputs, outputs):
86 |             prediction = {"image_id": input["image_id"]}
87 | 
88 |             if "instances" in output:
89 |                 instances = output["instances"].to(self._cpu_device)
90 |                 prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
91 |             if "proposals" in output:
92 |                 prediction["proposals"] = output["proposals"].to(self._cpu_device)
93 |             if len(prediction) > 1:
94 |                 self._predictions.append(prediction)


--------------------------------------------------------------------------------
/sparseinst/encoder.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
 2 | 
 3 | import math
 4 | import torch
 5 | import torch.nn as nn
 6 | import torch.nn.functional as F
 7 | 
 8 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill
 9 | 
10 | from detectron2.utils.registry import Registry
11 | from detectron2.layers import Conv2d
12 | 
13 | SPARSE_INST_ENCODER_REGISTRY = Registry("SPARSE_INST_ENCODER")
14 | SPARSE_INST_ENCODER_REGISTRY.__doc__ = "registry for SparseInst decoder"
15 | 
16 | 
17 | class PyramidPoolingModule(nn.Module):
18 | 
19 |     def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)):
20 |         super().__init__()
21 |         self.stages = []
22 |         self.stages = nn.ModuleList(
23 |             [self._make_stage(in_channels, channels, size) for size in sizes]
24 |         )
25 |         self.bottleneck = Conv2d(
26 |             in_channels + len(sizes) * channels, in_channels, 1)
27 | 
28 |     def _make_stage(self, features, out_features, size):
29 |         prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
30 |         conv = Conv2d(features, out_features, 1)
31 |         return nn.Sequential(prior, conv)
32 | 
33 |     def forward(self, feats):
34 |         h, w = feats.size(2), feats.size(3)
35 |         priors = [F.interpolate(input=F.relu_(stage(feats)), size=(
36 |             h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats]
37 |         out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
38 |         return out
39 | 
40 | 
41 | 
42 | @SPARSE_INST_ENCODER_REGISTRY.register()
43 | class InstanceContextEncoder(nn.Module):
44 |     """ 
45 |     Instance Context Encoder
46 |     1. construct feature pyramids from ResNet
47 |     2. enlarge receptive fields (ppm)
48 |     3. multi-scale fusion 
49 |     """
50 | 
51 |     def __init__(self, cfg, input_shape):
52 |         super().__init__()
53 |         self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS
54 |         self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES
55 |         self.in_channels = [input_shape[f].channels for f in self.in_features]
56 |         fpn_laterals = []
57 |         fpn_outputs = []
58 |         for in_channel in reversed(self.in_channels):
59 |             lateral_conv = Conv2d(in_channel, self.num_channels, 1)
60 |             output_conv = Conv2d(self.num_channels, self.num_channels, 3, padding=1)
61 |             c2_xavier_fill(lateral_conv)
62 |             c2_xavier_fill(output_conv)
63 |             fpn_laterals.append(lateral_conv)
64 |             fpn_outputs.append(output_conv)
65 |         self.fpn_laterals = nn.ModuleList(fpn_laterals)
66 |         self.fpn_outputs = nn.ModuleList(fpn_outputs)
67 |         # ppm
68 |         self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4)
69 |         # final fusion
70 |         self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1)
71 |         c2_msra_fill(self.fusion)
72 | 
73 |     def forward(self, features):
74 |         features = [features[f] for f in self.in_features]
75 |         features = features[::-1]
76 |         prev_features = self.ppm(self.fpn_laterals[0](features[0]))
77 |         outputs = [self.fpn_outputs[0](prev_features)]
78 |         for feature, lat_conv, output_conv in zip(features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:]):
79 |             lat_features = lat_conv(feature)
80 |             top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode='nearest')
81 |             prev_features = lat_features + top_down_features
82 |             outputs.insert(0, output_conv(prev_features))
83 |         size = outputs[0].shape[2:]
84 |         features = [
85 |             outputs[0]] + [F.interpolate(x, size, mode='bilinear', align_corners=False) for x in outputs[1:]]
86 |         features = self.fusion(torch.cat(features, dim=1))
87 |         return features
88 | 
89 | 
90 | def build_sparse_inst_encoder(cfg, input_shape):
91 |     name = cfg.MODEL.SPARSE_INST.ENCODER.NAME
92 |     return SPARSE_INST_ENCODER_REGISTRY.get(name)(cfg, input_shape)
93 | 


--------------------------------------------------------------------------------
/mindspore/sparseinst/sparseinst.py:
--------------------------------------------------------------------------------
  1 | import mindspore
  2 | import mindspore.nn as nn
  3 | import mindspore.ops as ops
  4 | from mindspore import Tensor
  5 | import numpy as np
  6 | import cv2
  7 | 
  8 | from .resnet import build_resnet50
  9 | from .encoder import InstanceContextEncoder
 10 | from .decoder import GroupIAMDecoder
 11 | 
 12 | 
 13 | __all__=["SparseInst"]
 14 | 
 15 | def rescoring_mask(scores, mask_pred, masks):
 16 | 	mask_pred_ = mask_pred.astype('float32')
 17 | 	return scores * ((masks * mask_pred_).sum(axis=(1, 2)) / (mask_pred_.sum(axis=(1, 2)) + 1e-6))
 18 | 
 19 | class SparseInst(nn.Cell):
 20 | 	def __init__(self,cfg,is_train=False):
 21 | 		super().__init__()
 22 | 
 23 | 		self.backbone=build_resnet50()
 24 | 		self.encoder=InstanceContextEncoder(cfg,self.backbone.output_channel())
 25 | 		self.decoder=GroupIAMDecoder(cfg)
 26 | 
 27 | 		self.pixel_mean=Tensor(cfg.MODEL.PIXEL_MEAN).view((3,1,1))
 28 | 		self.pixel_std=Tensor(cfg.MODEL.PIXEL_STD).view((3,1,1))
 29 | 
 30 | 		self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD
 31 | 		self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD
 32 | 		self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS
 33 | 
 34 | 		self.training=is_train
 35 | 
 36 | 	def normalizer(self, image):
 37 | 		image=(image-self.pixel_mean)/self.pixel_std
 38 | 		return image
 39 | 
 40 | 	def padding(self,image,size_divisibility=32,pad_value=0.0):
 41 | 		h,w=image.shape[2],image.shape[3]
 42 | 		bottom=(h//size_divisibility+1)*size_divisibility-h
 43 | 		right=(w//size_divisibility+1)*size_divisibility-w
 44 | 		return ops.Pad(((0,0),(0,0),(0,bottom),(0,right)))(image)
 45 | 
 46 | 	def preprocess_inputs(self,batched_inputs):
 47 | 		images=self.padding(self.normalizer(batched_inputs))
 48 | 		return images
 49 | 
 50 | 	def construct(self,batched_inputs):
 51 | 
 52 | 			#input :Tensor(N,C,H,W)
 53 | 			#output = {
 54 | 			#"pred_logits": pred_logits,
 55 | 			#"pred_masks": pred_masks,
 56 | 			#"pred_scores": pred_scores,
 57 | 		#}
 58 | 		image_sizes=[batched_inputs['image'].shape[2:]]
 59 | 		images=self.preprocess_inputs(batched_inputs['image'])
 60 | 		max_shape=images.shape[2:]
 61 | 		features=self.backbone(images)
 62 | 		features=self.encoder(features)
 63 | 		output=self.decoder(features)
 64 | 		if self.training:
 65 | 			return output
 66 | 		else:
 67 | 			results=self.inference(output,[batched_inputs],max_shape,image_sizes)
 68 | 			processed_results=[{'instances':r} for r in results]
 69 | 			return processed_results
 70 | 
 71 | 
 72 | 	def inference(self,output,batched_inputs,max_shape,image_sizes):
 73 | 		results = []
 74 | 		pred_scores = ops.Sigmoid()(output["pred_logits"])
 75 | 		pred_masks = ops.Sigmoid()(output["pred_masks"])
 76 | 		pred_objectness = ops.Sigmoid()(output["pred_scores"])
 77 | 		pred_scores = ops.Sqrt()(pred_scores * pred_objectness)
 78 | 		for _, (scores_per_image, mask_pred_per_image, batched_input, img_shape) in enumerate(zip(pred_scores, pred_masks, batched_inputs, image_sizes)):
 79 | 
 80 | 			labels,scores  = ops.max(scores_per_image,axis=-1)
 81 | 			keep = scores > self.cls_threshold
 82 | 			scores = ops.masked_select(scores,keep)
 83 | 			labels = ops.masked_select(labels,keep)
 84 | 			n,h,w=mask_pred_per_image.shape
 85 | 			mask_pred_per_image = ops.masked_select(mask_pred_per_image,keep.view(n,1,1)).view(-1,h,w)
 86 | 			result={}
 87 | 			if scores.shape[0]==0:
 88 | 				result['scores']=scores
 89 | 				result['category_id']=labels
 90 | 				results.append(result)
 91 | 				continue
 92 | 			h,w=img_shape
 93 | 			ori_shape=batched_input['image_size']
 94 | 			scores = rescoring_mask(scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image)
 95 | 			mask_pred_per_image=ops.interpolate(ops.ExpandDims()(mask_pred_per_image,1),sizes=max_shape,mode='bilinear')
 96 | 			mask_pred_per_image=mask_pred_per_image.asnumpy()
 97 | 			mask_pred_per_image=mask_pred_per_image[:,:,:h,:w]
 98 | 			mask_pred_per_image=Tensor(mask_pred_per_image)
 99 | 
100 | 			mask_pred_per_image=ops.interpolate(mask_pred_per_image,sizes=ori_shape,mode='bilinear')
101 | 			mask_pred_per_image=ops.squeeze(mask_pred_per_image,axis=1)
102 | 			mask_pred=mask_pred_per_image>self.mask_threshold
103 | 			mask_pred=mask_pred.astype('uint8')
104 | 
105 | 			result['segmentation'] = mask_pred
106 | 			result['scores'] = scores
107 | 			result['category_id'] = labels
108 | 			results.append(result)
109 | 
110 | 		return results


--------------------------------------------------------------------------------
/onnx/convert_onnx.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import argparse
  3 | 
  4 | import torch
  5 | from torch import nn
  6 | from torch.nn import functional as F
  7 | 
  8 | from detectron2.layers import Conv2d
  9 | from detectron2.utils.logger import setup_logger
 10 | from detectron2.modeling import build_model
 11 | from detectron2.checkpoint import DetectionCheckpointer
 12 | from detectron2.config import get_cfg
 13 | 
 14 | from sparseinst import add_sparse_inst_config
 15 | 
 16 | 
 17 | class PyramidPoolingModuleONNX(nn.Module):
 18 | 
 19 |     def __init__(self, in_channels, channels, input_size, pool_sizes=(1, 2, 3, 6)):
 20 |         super().__init__()
 21 |         self.stages = []
 22 |         self.stages = nn.ModuleList(
 23 |             [self._make_stage(in_channels, channels, input_size, pool_size)
 24 |              for pool_size in pool_sizes]
 25 |         )
 26 |         self.bottleneck = Conv2d(
 27 |             in_channels + len(pool_sizes) * channels, in_channels, 1)
 28 | 
 29 |     def _make_stage(self, features, out_features, input_size, pool_size):
 30 |         stride_y = math.floor((input_size[0] / pool_size))
 31 |         stride_x = math.floor((input_size[1] / pool_size))
 32 |         kernel_y = input_size[0] - (pool_size - 1) * stride_y
 33 |         kernel_x = input_size[1] - (pool_size - 1) * stride_x
 34 |         prior = nn.AvgPool2d(kernel_size=(
 35 |             kernel_y, kernel_x), stride=(stride_y, stride_x))
 36 |         conv = Conv2d(features, out_features, 1)
 37 |         return nn.Sequential(prior, conv)
 38 | 
 39 |     def forward(self, feats):
 40 |         h, w = feats.size(2), feats.size(3)
 41 |         priors = [F.interpolate(
 42 |             input=F.relu_(stage(feats)), size=(h, w), mode='bilinear', align_corners=False) for stage in self.stages] + [feats]
 43 |         out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
 44 |         return out
 45 | 
 46 | 
 47 | def main():
 48 |     parser = argparse.ArgumentParser(
 49 |         description="Export model to the onnx format")
 50 |     parser.add_argument(
 51 |         "--config-file",
 52 |         default="configs/sparse_inst_r50_giam.yaml",
 53 |         metavar="FILE",
 54 |         help="path to config file",
 55 |     )
 56 |     parser.add_argument('--width', default=640, type=int)
 57 |     parser.add_argument('--height', default=640, type=int)
 58 |     parser.add_argument('--level', default=0, type=int)
 59 |     parser.add_argument(
 60 |         "--output",
 61 |         default="output/sparseinst.onnx",
 62 |         metavar="FILE",
 63 |         help="path to the output onnx file",
 64 |     )
 65 |     parser.add_argument(
 66 |         "--opts",
 67 |         help="Modify config options using the command-line 'KEY VALUE' pairs",
 68 |         default=[],
 69 |         nargs=argparse.REMAINDER,
 70 |     )
 71 | 
 72 |     cfg = get_cfg()
 73 |     add_sparse_inst_config(cfg)
 74 |     args = parser.parse_args()
 75 |     cfg.merge_from_file(args.config_file)
 76 |     cfg.merge_from_list(args.opts)
 77 | 
 78 |     # norm for ONNX: change FrozenBN back to BN
 79 |     cfg.MODEL.BACKBONE.FREEZE_AT = 0
 80 |     cfg.MODEL.RESNETS.NORM = "BN"
 81 | 
 82 |     cfg.freeze()
 83 | 
 84 |     output_dir = cfg.OUTPUT_DIR
 85 |     logger = setup_logger(output=output_dir)
 86 |     logger.info(cfg)
 87 | 
 88 |     height = args.height
 89 |     width = args.width
 90 | 
 91 |     model = build_model(cfg)
 92 |     num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS
 93 |     onnx_ppm = PyramidPoolingModuleONNX(
 94 |         num_channels, num_channels // 4, (height // 32, width // 32))
 95 |     model.encoder.ppm = onnx_ppm
 96 |     model.to(cfg.MODEL.DEVICE)
 97 |     logger.info("Model:\n{}".format(model))
 98 | 
 99 |     checkpointer = DetectionCheckpointer(model)
100 |     _ = checkpointer.load(cfg.MODEL.WEIGHTS)
101 |     logger.info("load Model:\n{}".format(cfg.MODEL.WEIGHTS))
102 | 
103 |     input_names = ["input_image"]
104 |     dummy_input = torch.zeros((1, 3, height, width)).to(cfg.MODEL.DEVICE)
105 |     output_names = ["scores", "masks"]
106 | 
107 |     model.forward = model.forward_test
108 | 
109 |     torch.onnx.export(
110 |         model,
111 |         dummy_input,
112 |         args.output,
113 |         verbose=True,
114 |         input_names=input_names,
115 |         output_names=output_names,
116 |         keep_initializers_as_inputs=False,
117 |         opset_version=12,
118 |     )
119 | 
120 |     logger.info("Done. The onnx model is saved into {}.".format(args.output))
121 | 
122 | 
123 | if __name__ == "__main__":
124 |     main()
125 | 


--------------------------------------------------------------------------------
/sparseinst/utils.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from typing import Optional, List
  3 | 
  4 | import torch
  5 | from torch import Tensor
  6 | import torch.distributed as dist
  7 | import torch.nn.functional as F
  8 | import torchvision
  9 | 
 10 | 
 11 | def _max_by_axis(the_list):
 12 |     # type: (List[List[int]]) -> List[int]
 13 |     maxes = the_list[0]
 14 |     for sublist in the_list[1:]:
 15 |         for index, item in enumerate(sublist):
 16 |             maxes[index] = max(maxes[index], item)
 17 |     return maxes
 18 | 
 19 | 
 20 | class NestedTensor(object):
 21 |     def __init__(self, tensors, mask: Optional[Tensor]):
 22 |         self.tensors = tensors
 23 |         self.mask = mask
 24 | 
 25 |     def to(self, device):
 26 |         cast_tensor = self.tensors.to(device)
 27 |         mask = self.mask
 28 |         if mask is not None:
 29 |             assert mask is not None
 30 |             cast_mask = mask.to(device)
 31 |         else:
 32 |             cast_mask = None
 33 |         return NestedTensor(cast_tensor, cast_mask)
 34 | 
 35 |     def decompose(self):
 36 |         return self.tensors, self.mask
 37 | 
 38 |     def __repr__(self):
 39 |         return str(self.tensors)
 40 | 
 41 | # _onnx_nested_tensor_from_tensor_list() is an implementation of
 42 | # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
 43 | 
 44 | 
 45 | @torch.jit.unused
 46 | def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
 47 |     max_size = []
 48 |     for i in range(tensor_list[0].dim()):
 49 |         max_size_i = torch.max(torch.stack([img.shape[i]
 50 |                                             for img in tensor_list]).to(torch.float32)).to(torch.int64)
 51 |         max_size.append(max_size_i)
 52 |     max_size = tuple(max_size)
 53 | 
 54 |     # work around for
 55 |     # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
 56 |     # m[: img.shape[1], :img.shape[2]] = False
 57 |     # which is not yet supported in onnx
 58 |     padded_imgs = []
 59 |     padded_masks = []
 60 |     for img in tensor_list:
 61 |         padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
 62 |         padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
 63 |         padded_imgs.append(padded_img)
 64 | 
 65 |         m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
 66 |         padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
 67 |         padded_masks.append(padded_mask.to(torch.bool))
 68 | 
 69 |     tensor = torch.stack(padded_imgs)
 70 |     mask = torch.stack(padded_masks)
 71 | 
 72 |     return NestedTensor(tensor, mask=mask)
 73 | 
 74 | 
 75 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
 76 |     # TODO make this more general
 77 |     if tensor_list[0].ndim == 3:
 78 |         if torchvision._is_tracing():
 79 |             # nested_tensor_from_tensor_list() does not export well to ONNX
 80 |             # call _onnx_nested_tensor_from_tensor_list() instead
 81 |             return _onnx_nested_tensor_from_tensor_list(tensor_list)
 82 | 
 83 |         # TODO make it support different-sized images
 84 |         max_size = _max_by_axis([list(img.shape) for img in tensor_list])
 85 |         # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
 86 |         batch_shape = [len(tensor_list)] + max_size
 87 |         b, c, h, w = batch_shape
 88 |         dtype = tensor_list[0].dtype
 89 |         device = tensor_list[0].device
 90 |         tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
 91 |         mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
 92 |         for img, pad_img, m in zip(tensor_list, tensor, mask):
 93 |             pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
 94 |             m[: img.shape[1], :img.shape[2]] = False
 95 |     else:
 96 |         raise ValueError('not supported')
 97 |     return NestedTensor(tensor, mask)
 98 | 
 99 | 
100 | def nested_masks_from_list(tensor_list: List[Tensor], input_shape=None):
101 |     if tensor_list[0].ndim == 3:
102 |         dim_size = sum([img.shape[0] for img in tensor_list])
103 |         if input_shape is None:
104 |             max_size = _max_by_axis([list(img.shape[-2:]) for img in tensor_list])
105 |         else:
106 |             max_size = [input_shape[0], input_shape[1]]
107 |         batch_shape = [dim_size] + max_size
108 |         # b, h, w = batch_shape
109 |         dtype = tensor_list[0].dtype
110 |         device = tensor_list[0].device
111 |         tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
112 |         mask = torch.zeros(batch_shape, dtype=torch.bool, device=device)
113 |         idx = 0
114 |         for img in tensor_list:
115 |             c = img.shape[0]
116 |             c_ = idx + c
117 |             tensor[idx: c_, :img.shape[1], : img.shape[2]].copy_(img)
118 |             mask[idx: c_, :img.shape[1], :img.shape[2]] = True
119 |             idx = c_
120 |     else:
121 |         raise ValueError('not supported')
122 |     return NestedTensor(tensor, mask)
123 | 
124 | 
125 | def is_dist_avail_and_initialized():
126 |     if not dist.is_available():
127 |         return False
128 |     if not dist.is_initialized():
129 |         return False
130 |     return True
131 | 
132 | 
133 | def get_world_size():
134 |     if not is_dist_avail_and_initialized():
135 |         return 1
136 |     return dist.get_world_size()
137 | 
138 | 
139 | def aligned_bilinear(tensor, factor):
140 |     # borrowed from Adelaidet: https://github1s.com/aim-uofa/AdelaiDet/blob/HEAD/adet/utils/comm.py
141 |     assert tensor.dim() == 4
142 |     assert factor >= 1
143 |     assert int(factor) == factor
144 | 
145 |     if factor == 1:
146 |         return tensor
147 | 
148 |     h, w = tensor.size()[2:]
149 |     tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
150 |     oh = factor * h + 1
151 |     ow = factor * w + 1
152 |     tensor = F.interpolate(
153 |         tensor, size=(oh, ow),
154 |         mode='bilinear',
155 |         align_corners=True
156 |     )
157 |     tensor = F.pad(
158 |         tensor, pad=(factor // 2, 0, factor // 2, 0),
159 |         mode="replicate"
160 |     )
161 | 
162 |     return tensor[:, :, :oh - 1, :ow - 1]
163 | 


--------------------------------------------------------------------------------
/mindspore/test.py:
--------------------------------------------------------------------------------
  1 | import mindspore
  2 | import argparse
  3 | import numpy as np
  4 | from sparseinst import SparseInst, cfg
  5 | import cv2
  6 | import os
  7 | import json
  8 | from mindspore import Tensor, ops
  9 | from pycocotools.coco import COCO
 10 | from pycocotools.cocoeval import COCOeval
 11 | import tqdm
 12 | from dict import id2category
 13 | 
 14 | def parse_args():
 15 | 	parser = argparse.ArgumentParser()
 16 |     # general
 17 | 	#parser.add_argument('--cfg',help='experiment configure file name',required=True,type=str)
 18 | 	parser.add_argument('--checkpoint',help="checkpoint path",type=str)
 19 | 	parser.add_argument('--json_save_path',help="result json scve path",required=False,type=str)
 20 | 	parser.add_argument("--visualize",action="store_true",help="Run or not.")
 21 | 	parser.add_argument('--image_name',help="image to visual",required=False,type=str)
 22 | 	parser.add_argument('--coco_path',help="coco to path",required=False,type=str)
 23 | 	parser.add_argument('--dir_path',help="coco to visual",required=False,type=str)
 24 | 	args = parser.parse_args()
 25 | 	return args
 26 | 
 27 | 
 28 | def load_net(path):
 29 | 	param_dict = mindspore.load_checkpoint(path)
 30 | 	net = SparseInst(cfg)
 31 | 	mindspore.load_param_into_net(net, param_dict)
 32 | 	model=mindspore.Model(network=net)
 33 | 	return model
 34 | 
 35 | def resize_img(img,short_length=640,long_length=864):
 36 | 	h,w=img.shape[2:]
 37 | 	image_size=(h,w)
 38 | 
 39 | 	if h>w:
 40 | 		h=int(h/w*short_length)
 41 | 		if h>long_length:
 42 | 			w=int(w/h*long_length)
 43 | 			h=long_length
 44 | 		else:
 45 | 			w=short_length
 46 | 	else:
 47 | 		w=int(w/h*short_length)
 48 | 		if w>long_length:
 49 | 			h=int(h/w*long_length)
 50 | 			w=long_length
 51 | 		else:
 52 | 			h=short_length
 53 | 	img=ops.interpolate(img,sizes=(h,w),mode='bilinear')
 54 | 	return {'image':img,'image_size':image_size}  ##########
 55 | 
 56 | def read_img(name):
 57 | 	image=cv2.imread(name)
 58 | 	_image=image.copy()
 59 | 	image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
 60 | 	image=Tensor(image).astype('float32')
 61 | 	image=ops.transpose(image,(2,0,1))
 62 | 	image=ops.expand_dims(image,0)
 63 | 	return image,_image
 64 | 
 65 | class Dataset:
 66 | 	def __init__(self,coco_path,dir_path,short_length=640,long_length=864,visualize=True):
 67 | 		self.short_length=short_length
 68 | 		self.long_length=long_length
 69 | 		self.visualize=visualize
 70 | 		self.coco = COCO(coco_path)
 71 | 		self.ids = list(self.coco.imgs.keys())
 72 | 		self.dir=dir_path
 73 | 
 74 | 	def __len__(self):
 75 | 		return len(self.ids)
 76 | 
 77 | 	def _get_image_path(self, file_name):
 78 | 		images_dir=self.dir
 79 | 		return os.path.join(images_dir, file_name)
 80 | 
 81 | 	def __getitem__(self,index):
 82 | 		coco=self.coco
 83 | 		img_id=self.ids[index]
 84 | 		file_name=coco.loadImgs(img_id)[0]['file_name']
 85 | 		file_name=self._get_image_path(file_name)
 86 | 		image,ori_image=read_img(file_name)
 87 | 		image=resize_img(image)
 88 | 		return {'image':image,'ori_image':ori_image,'image_id':self.ids[index]}
 89 | 
 90 | 
 91 | class Evaluator:
 92 | 	def __init__(self, coco_path):
 93 | 		self.coco = COCO(coco_path)
 94 | 
 95 | 	def evaluate(self, res_file):
 96 | 		coco_dt = self.coco.loadRes(res_file)
 97 | 		coco_eval = COCOeval(self.coco, coco_dt, "segm")
 98 | 		coco_eval.evaluate()
 99 | 		coco_eval.accumulate()
100 | 		coco_eval.summarize()
101 | 		info_str = []
102 | 		stats_names = ['AP', 'Ap .5', 'AP .75','AP (M)', 'AP (L)', 'AR', 'AR .5', 'AR .75', 'AR (M)', 'AR (L)']
103 | 		for ind, name in enumerate(stats_names):
104 | 			info_str.append((name, coco_eval.stats[ind]))
105 | 		return info_str
106 | 
107 | 
108 | def read_names(path):
109 | 	files = os.listdir(path)
110 | 	files=[os.path.join(path,name) for name in files]
111 | 	return files
112 | 
113 | def visualization(masks,image,name,path):
114 | 	masks=[mask*255 for mask in masks]
115 | 	h,w=masks[0].shape
116 | 	path=path+name+'/'
117 | 	if not os.path.exists(path):
118 | 		os.mkdir(path)
119 | 	_=[cv2.imwrite(path+'image_mask'+str(i)+".jpg",((mask.reshape(h,w,1).astype(np.float32)/255.0)*image.astype(np.float32)).astype(np.uint8)) for i,mask in enumerate(masks)]
120 | 	_=[cv2.imwrite(path+'mask'+str(i)+'.jpg',mask) for i,mask in enumerate(masks)]
121 | 
122 | class Runner:
123 | 	def __init__(self,dataset,model,visualize=True):
124 | 		self.dataset=dataset
125 | 		self.model=model
126 | 		self.visualize=visualize
127 | 		self.dict=list(id2category.keys())
128 | 	def __call__(self,idx):
129 | 		input=self.dataset[idx]
130 | 		ori_image=input['ori_image']
131 | 		image_id=input['image_id']
132 | 		input=input['image']
133 | 		output=self.model.predict(input)[0]['instances']
134 | 		if 'pred_masks' in output.keys():
135 | 			output['segmentation'] = output['segmentation'].asnumpy()
136 | 			if not self.visualize:
137 | 				output['']=self.mask2rle(output)
138 | 		output['scores'] = output['scores'].asnumpy().astype(float).tolist()
139 | 		output['category_id'] = output['category_id'].asnumpy().astype(int).tolist()
140 | 		output['category_id']=[self.dict[i] for i in output['category_id']]
141 | 		output['image_id']=int(image_id)
142 | 		del input
143 | 		if not self.visualize:
144 | 			del ori_image
145 | 			all_pred=[]
146 | 			for i,mask in enumerate(output['segmentation']):
147 | 				all_pred.append({'image_id':image_id,'category_id':output['category_id'][i],'segmentation':mask,'score':output['scores'][i]})
148 | 			del output
149 | 			return all_pred #################
150 | 		else:
151 | 			output['ori_image']=ori_image
152 | 			return output
153 | 	def mask2rle(self,outputs):
154 | 		masks=outputs['pred_masks']
155 | 		masks=[mask for mask in masks]
156 | 		def f(img):
157 | 			'''
158 | 			img: numpy array, 1 - mask, 0 - background
159 | 			Returns run length as string formated
160 | 			'''
161 | 			pixels= img.T.flatten()
162 | 			pixels = np.concatenate([[0], pixels, [0]])
163 | 			runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
164 | 			runs[1::2] -= runs[::2]
165 | 			return ' '.join(str(x) for x in runs)
166 | 		rle=[f(mask) for mask in masks]
167 | 		return rle
168 | 
169 | 
170 | def main():
171 | 	args = parse_args()
172 | 	mindspore.set_context(mode=mindspore.PYNATIVE_MODE)
173 | 	model=load_net(args.checkpoint)
174 | 	if args.visualize:
175 | 		image,ori_image=read_img(args.image_name)
176 | 		image=resize_img(image)
177 | 		dataset=[{'image':image,'ori_image':ori_image,'image_id':args.image_name.split('/')[-1].split('.')[0]}]
178 | 	else:
179 | 		dataset=Dataset(args.coco_path,args.dir_path,visualize=args.visualize)
180 | 	runner=Runner(dataset=dataset,model=model,visualize=args.visualize)
181 | 	results=[]
182 | 	for i in range(len(dataset)):
183 | 		results.append(runner(i))
184 | 		print(i)
185 | 	if args.visualize:
186 | 		_=[visualization(res['pred_masks'],res['ori_image'],res['image_id'],'./') for res in results]
187 | 	else:
188 | 		res_file=os.path.join(args.json_save_path, "segment_coco_results.json")
189 | 		json.dump(results, open(res_file, 'w'))
190 | 		eva=Evaluator(args.coco_path)
191 | 		info_str=eva.evaluate(res_file)
192 | 
193 | 
194 | if __name__=="__main__":
195 | 	main()
196 | 	


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | import argparse
  3 | import glob
  4 | import multiprocessing as mp
  5 | import os
  6 | import time
  7 | import cv2
  8 | import tqdm
  9 | 
 10 | from detectron2.config import get_cfg
 11 | from detectron2.data.detection_utils import read_image
 12 | from detectron2.utils.logger import setup_logger
 13 | 
 14 | from sparseinst import VisualizationDemo, add_sparse_inst_config
 15 | 
 16 | 
 17 | # constants
 18 | WINDOW_NAME = "COCO detections"
 19 | 
 20 | 
 21 | def setup_cfg(args):
 22 |     # load config from file and command-line arguments
 23 |     cfg = get_cfg()
 24 |     add_sparse_inst_config(cfg)
 25 |     cfg.merge_from_file(args.config_file)
 26 |     cfg.merge_from_list(args.opts)
 27 |     # Set score_threshold for builtin models
 28 |     cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold
 29 |     cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold
 30 |     cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
 31 |     cfg.freeze()
 32 |     return cfg
 33 | 
 34 | 
 35 | def get_parser():
 36 |     parser = argparse.ArgumentParser(
 37 |         description="Detectron2 demo for builtin models")
 38 |     parser.add_argument(
 39 |         "--config-file",
 40 |         default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
 41 |         metavar="FILE",
 42 |         help="path to config file",
 43 |     )
 44 |     parser.add_argument("--webcam", action="store_true",
 45 |                         help="Take inputs from webcam.")
 46 |     parser.add_argument("--video-input", help="Path to video file.")
 47 |     parser.add_argument(
 48 |         "--input",
 49 |         nargs="+",
 50 |         help="A list of space separated input images; "
 51 |         "or a single glob pattern such as 'directory/*.jpg'",
 52 |     )
 53 |     parser.add_argument(
 54 |         "--output",
 55 |         help="A file or directory to save output visualizations. "
 56 |         "If not given, will show output in an OpenCV window.",
 57 |     )
 58 | 
 59 |     parser.add_argument(
 60 |         "--confidence-threshold",
 61 |         type=float,
 62 |         default=0.5,
 63 |         help="Minimum score for instance predictions to be shown",
 64 |     )
 65 |     parser.add_argument(
 66 |         "--opts",
 67 |         help="Modify config options using the command-line 'KEY VALUE' pairs",
 68 |         default=[],
 69 |         nargs=argparse.REMAINDER,
 70 |     )
 71 |     return parser
 72 | 
 73 | 
 74 | if __name__ == "__main__":
 75 |     mp.set_start_method("spawn", force=True)
 76 |     args = get_parser().parse_args()
 77 |     setup_logger(name="fvcore")
 78 |     logger = setup_logger()
 79 |     logger.info("Arguments: " + str(args))
 80 | 
 81 |     cfg = setup_cfg(args)
 82 | 
 83 |     demo = VisualizationDemo(cfg)
 84 | 
 85 |     if args.input:
 86 |         if len(args.input) == 1:
 87 |             args.input = glob.glob(os.path.expanduser(args.input[0]))
 88 |             assert args.input, "The input path(s) was not found"
 89 |         for path in tqdm.tqdm(args.input, disable=not args.output):
 90 |             # use PIL, to be consistent with evaluation
 91 |             #             img = read_image(path, format="BGR")
 92 |             # OneNet uses RGB input as default
 93 |             img = read_image(path, format="RGB")
 94 |             start_time = time.time()
 95 |             predictions, visualized_output = demo.run_on_image(
 96 |                 img, args.confidence_threshold)
 97 |             logger.info(
 98 |                 "{}: {} in {:.2f}s".format(
 99 |                     path,
100 |                     "detected {} instances".format(
101 |                         len(predictions["instances"]))
102 |                     if "instances" in predictions
103 |                     else "finished",
104 |                     time.time() - start_time,
105 |                 )
106 |             )
107 | 
108 |             if args.output:
109 |                 if os.path.isdir(args.output):
110 |                     assert os.path.isdir(args.output), args.output
111 |                     out_filename = os.path.join(
112 |                         args.output, os.path.basename(path))
113 |                 else:
114 |                     assert len(
115 |                         args.output) > 0, "Please specify a directory with args.output"
116 |                     out_filename = args.output
117 |                 visualized_output.save(out_filename)
118 |             else:
119 |                 cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
120 |                 cv2.imshow(
121 |                     WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
122 |                 if cv2.waitKey(0) == 27:
123 |                     break  # esc to quit
124 |     elif args.webcam:
125 |         assert args.input is None, "Cannot have both --input and --webcam!"
126 |         assert args.output is None, "output not yet supported with --webcam!"
127 |         cam = cv2.VideoCapture(0)
128 |         for vis in tqdm.tqdm(demo.run_on_video(cam, args.confidence_threshold)):
129 |             cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
130 |             cv2.imshow(WINDOW_NAME, vis)
131 |             if cv2.waitKey(1) == 27:
132 |                 break  # esc to quit
133 |         cam.release()
134 |         cv2.destroyAllWindows()
135 |     elif args.video_input:
136 |         video = cv2.VideoCapture(args.video_input)
137 |         width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
138 |         height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
139 |         frames_per_second = video.get(cv2.CAP_PROP_FPS)
140 |         num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
141 |         basename = os.path.basename(args.video_input)
142 | 
143 |         if args.output:
144 |             if os.path.isdir(args.output):
145 |                 output_fname = os.path.join(args.output, basename)
146 |                 output_fname = os.path.splitext(output_fname)[0] + ".mkv"
147 |             else:
148 |                 output_fname = args.output
149 |             assert not os.path.isfile(output_fname), output_fname
150 |             output_file = cv2.VideoWriter(
151 |                 filename=output_fname,
152 |                 # some installation of opencv may not support x264 (due to its license),
153 |                 # you can try other format (e.g. MPEG)
154 |                 fourcc=cv2.VideoWriter_fourcc(*"x264"),
155 |                 fps=float(frames_per_second),
156 |                 frameSize=(width, height),
157 |                 isColor=True,
158 |             )
159 |         assert os.path.isfile(args.video_input)
160 |         for vis_frame in tqdm.tqdm(demo.run_on_video(video, args.confidence_threshold), total=num_frames):
161 |             if args.output:
162 |                 output_file.write(vis_frame)
163 |             else:
164 |                 cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
165 |                 cv2.imshow(basename, vis_frame)
166 |                 if cv2.waitKey(1) == 27:
167 |                     break  # esc to quit
168 |         video.release()
169 |         if args.output:
170 |             output_file.release()
171 |         else:
172 |             cv2.destroyAllWindows()
173 | 


--------------------------------------------------------------------------------
/sparseinst/dataset_mapper.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | import copy
  3 | import logging
  4 | import numpy as np
  5 | import torch
  6 | 
  7 | 
  8 | from detectron2.data import detection_utils as utils
  9 | from detectron2.data import transforms as T
 10 | 
 11 | """
 12 | This file contains the default mapping that's applied to "dataset dicts".
 13 | """
 14 | 
 15 | __all__ = ["SparseInstDatasetMapper"]
 16 | 
 17 | 
 18 | def build_transform_gen(cfg, is_train):
 19 |     """
 20 |     Create a list of default :class:`Augmentation` from config.
 21 |     Now it includes resizing and flipping.
 22 | 
 23 |     Returns:
 24 |         list[Augmentation]
 25 |     """
 26 |     augmentation = []
 27 | 
 28 |     if is_train:
 29 |         min_size = cfg.INPUT.MIN_SIZE_TRAIN
 30 |         max_size = cfg.INPUT.MAX_SIZE_TRAIN
 31 |         sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
 32 |     else:
 33 |         min_size = cfg.INPUT.MIN_SIZE_TEST
 34 |         max_size = cfg.INPUT.MAX_SIZE_TEST
 35 |         sample_style = "choice"
 36 |     if is_train and cfg.INPUT.RANDOM_FLIP != "none":
 37 |         augmentation.append(
 38 |             T.RandomFlip(
 39 |                 horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
 40 |                 vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
 41 |             )
 42 |         )
 43 |     if is_train:
 44 |         # 800,1333, 0.6
 45 |         # 600, 1000
 46 |         # aspect ratio fixed
 47 |         augmentation.append(
 48 |             T.ResizeShortestEdge(min_size, max_size, sample_style)
 49 |         )
 50 |     return augmentation
 51 | 
 52 | 
 53 | class SparseInstDatasetMapper:
 54 |     """
 55 |     A callable which takes a dataset dict in Detectron2 Dataset format,
 56 |     and map it into a format used by the model.
 57 | 
 58 |     This is the default callable to be used to map your dataset dict into training data.
 59 |     You may need to follow it to implement your own one for customized logic,
 60 |     such as a different way to read or transform images.
 61 |     See :doc:`/tutorials/data_loading` for details.
 62 | 
 63 |     The callable currently does the following:
 64 | 
 65 |     1. Read the image from "file_name"
 66 |     2. Applies cropping/geometric transforms to the image and annotations
 67 |     3. Prepare data and annotations to Tensor and :class:`Instances`
 68 |     """
 69 |     # @classmethod
 70 | 
 71 |     def __init__(self, cfg, is_train: bool = True):
 72 |         augs = build_transform_gen(cfg, is_train)
 73 |         self.default_aug = T.AugmentationList(augs)
 74 |         if cfg.INPUT.CROP.ENABLED and is_train:
 75 |             crop_gen = [
 76 |                 T.ResizeShortestEdge([400, 500, 600], sample_style='choice'),
 77 |                 T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)
 78 |             ]
 79 |             recompute_boxes = cfg.MODEL.MASK_ON
 80 |             augs = augs[:-1] + crop_gen + augs[-1:]
 81 |             self.crop_aug = T.AugmentationList(augs)
 82 |         else:
 83 |             self.crop_aug = None
 84 |             recompute_boxes = False
 85 | 
 86 |         # self.augs = augs
 87 |         self.is_train = is_train
 88 |         self.image_format = cfg.INPUT.FORMAT
 89 |         self.use_instance_mask = cfg.MODEL.MASK_ON
 90 |         self.instance_mask_format = cfg.INPUT.MASK_FORMAT
 91 |         self.recompute_boxes = recompute_boxes
 92 | 
 93 |         logger = logging.getLogger(__name__)
 94 |         mode = "training" if is_train else "inference"
 95 |         logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augs}")
 96 | 
 97 |     def __call__(self, dataset_dict):
 98 |         """
 99 |         Args:
100 |             dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
101 | 
102 |         Returns:
103 |             dict: a format that builtin models in detectron2 accept
104 |         """
105 |         dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
106 |         # USER: Write your own image loading if it's not from a file
107 |         image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
108 |         utils.check_image_size(dataset_dict, image)
109 | 
110 |         # USER: Remove if you don't do semantic/panoptic segmentation.
111 |         if "sem_seg_file_name" in dataset_dict:
112 |             sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
113 |         else:
114 |             sem_seg_gt = None
115 | 
116 |         aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
117 | 
118 |         if self.crop_aug is None:
119 |             transforms = self.default_aug(aug_input)
120 |         else:
121 |             if np.random.rand() > 0.5:
122 |                 transforms = self.crop_aug(aug_input)
123 |             else:
124 |                 transforms = self.default_aug(aug_input)
125 |         # transforms = self.augmentations(aug_input)
126 |         image, sem_seg_gt = aug_input.image, aug_input.sem_seg
127 | 
128 |         image_shape = image.shape[:2]  # h, w
129 |         # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
130 |         # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
131 |         # Therefore it's important to use torch.Tensor.
132 |         dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
133 |         if sem_seg_gt is not None:
134 |             dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
135 | 
136 |         if not self.is_train:
137 |             # USER: Modify this if you want to keep them for some reason.
138 |             dataset_dict.pop("annotations", None)
139 |             dataset_dict.pop("sem_seg_file_name", None)
140 |             return dataset_dict
141 | 
142 |         if "annotations" in dataset_dict:
143 |             # USER: Modify this if you want to keep them for some reason.
144 |             for anno in dataset_dict["annotations"]:
145 |                 anno.pop("keypoints", None)
146 |                 if not self.use_instance_mask:
147 |                     anno.pop("segmentation", None)
148 | 
149 |             # USER: Implement additional transformations if you have other types of data
150 |             annos = [
151 |                 utils.transform_instance_annotations(
152 |                     obj, transforms, image_shape)
153 |                 for obj in dataset_dict.pop("annotations")
154 |                 if obj.get("iscrowd", 0) == 0
155 |             ]
156 |             instances = utils.annotations_to_instances(
157 |                 annos, image_shape, mask_format=self.instance_mask_format
158 |             )
159 | 
160 |             # After transforms such as cropping are applied, the bounding box may no longer
161 |             # tightly bound the object. As an example, imagine a triangle object
162 |             # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
163 |             # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
164 |             # the intersection of original bounding box and the cropping box.
165 |             if self.recompute_boxes:
166 |                 instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
167 |             dataset_dict["instances"] = utils.filter_empty_instances(instances)
168 |         return dataset_dict
169 | 


--------------------------------------------------------------------------------
/mindspore/sparseinst/decoder.py:
--------------------------------------------------------------------------------
  1 | import mindspore
  2 | from mindspore import Tensor
  3 | import mindspore.nn as nn
  4 | import mindspore.ops as ops
  5 | from mindspore.nn import Conv2d
  6 | 
  7 | 
  8 | __all__=["BaseIAMDecoder","GroupIAMDecoder"]
  9 | 
 10 | def _make_stack_3x3_convs(num_convs, in_channels, out_channels):
 11 | 	convs = []
 12 | 	for _ in range(num_convs):
 13 | 		convs.append(
 14 | 			nn.Conv2d(in_channels, out_channels, 3, has_bias=True))
 15 | 		convs.append(nn.ReLU())
 16 | 		in_channels = out_channels
 17 | 	return nn.SequentialCell(*convs)
 18 | 
 19 | 
 20 | class MaskBranch(nn.Cell):
 21 | 
 22 | 	def __init__(self, cfg, in_channels):
 23 | 		super().__init__()
 24 | 		dim = cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM#256
 25 | 		num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS
 26 | 		kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
 27 | 		self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
 28 | 		self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1,has_bias=True)
 29 | 
 30 | 	def construct(self, features):
 31 |  		# mask features (x4 convs)
 32 | 		features = self.mask_convs(features)
 33 | 		return self.projection(features)
 34 | 
 35 | 
 36 | 
 37 | class InstanceBranch(nn.Cell):
 38 | 
 39 | 	def __init__(self, cfg, in_channels):
 40 | 		super().__init__()
 41 | 		# norm = cfg.MODEL.SPARSE_INST.DECODER.NORM
 42 | 		dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM
 43 | 		num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS
 44 | 		num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS
 45 | 		kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
 46 | 		self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
 47 | 
 48 | 		self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
 49 | 		# iam prediction, a simple conv
 50 | 		self.iam_conv = nn.Conv2d(dim, num_masks, 3, has_bias=True)
 51 | 
 52 | 		 # outputs
 53 | 		self.cls_score = nn.Dense(dim, self.num_classes)
 54 | 		self.mask_kernel = nn.Dense(dim, kernel_dim)
 55 | 		self.objectness = nn.Dense(dim, 1)
 56 | 
 57 | 
 58 | 	def construct(self, features):
 59 | 		# instance features (x4 convs)
 60 | 		features = self.inst_convs(features)
 61 | 		# predict instance activation maps
 62 | 		iam = self.iam_conv(features)
 63 | 		iam_prob = ops.Sigmoid()(iam)
 64 | 
 65 | 		B, N = iam_prob.shape[:2]
 66 | 		C = features.shape[1]
 67 | 		# BxNxHxW -> BxNx(HW)
 68 | 		iam_prob = iam_prob.view((B, N, -1))
 69 | 		# aggregate features: BxCxHxW -> Bx(HW)xC
 70 | 		inst_features=ops.BatchMatMul(transpose_b=True)(iam_prob,features.view((B, C, -1)))
 71 | 		normalizer = ops.clip_by_value(iam_prob.sum(-1),clip_value_min=Tensor(1e-6,mindspore.float32))
 72 | 		inst_features = inst_features / normalizer[:, :, None]
 73 | 		# predict classification & segmentation kernel & objectness
 74 | 		pred_logits = self.cls_score(inst_features)
 75 | 		pred_kernel = self.mask_kernel(inst_features)
 76 | 		pred_scores = self.objectness(inst_features)
 77 | 		return pred_logits, pred_kernel, pred_scores, iam
 78 | 
 79 | 
 80 | class BaseIAMDecoder(nn.Cell):
 81 | 
 82 | 	def __init__(self, cfg):
 83 | 		super().__init__()
 84 | 		# add 2 for coordinates
 85 | 		in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2
 86 | 
 87 | 		self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR
 88 | 		self.output_iam = cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM
 89 | 
 90 | 		self.resize=nn.ResizeBilinear()
 91 | 		
 92 | 		self.inst_branch = InstanceBranch(cfg, in_channels)
 93 | 		self.mask_branch = MaskBranch(cfg, in_channels)
 94 | 
 95 | 
 96 | 	def compute_coordinates(self, x):
 97 | 		h, w = x.shape[2], x.shape[3]
 98 | 		start=Tensor(-1,mindspore.float32)
 99 | 		stop=Tensor(1,mindspore.float32)
100 | 		y_loc = ops.linspace(start,stop, h)
101 | 		x_loc = ops.linspace(start,stop, w)
102 | 		y_loc, x_loc = ops.meshgrid((y_loc, x_loc),indexing='ij')
103 | 		y_loc=ops.broadcast_to(y_loc,(x.shape[0],1,-1,-1))
104 | 		x_loc=ops.broadcast_to(x_loc,(x.shape[0],1,-1,-1))
105 | 		locations=ops.concat((x_loc,y_loc),axis=1)
106 | 		return locations.astype('float32')
107 |     
108 | 	def construct(self, features):
109 | 		coord_features = self.compute_coordinates(features)
110 | 		features=ops.concat((coord_features,features),axis=1)
111 | 		pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features)
112 | 		mask_features = self.mask_branch(features)
113 | 
114 | 		N = pred_kernel.shape[1]
115 | 		# mask_features: BxCxHxW
116 | 		B, C, H, W = mask_features.shape
117 | 		pred_masks=ops.BatchMatMul()(pred_kernel,mask_features.view((B,C,H*W))).view((B,N,H,W))
118 | 
119 | 
120 | 		pred_masks=self.resize(pred_masks,scale_factor=self.scale_factor)
121 | 		output = {
122 | 			"pred_logits": pred_logits,
123 | 			"pred_masks": pred_masks,
124 | 			"pred_scores": pred_scores,
125 | 		}
126 | 
127 | 		if self.output_iam:
128 | 			iam=self.resize(iam,scale_factor=self.scale_factor)
129 | 			output['pred_iam'] = iam
130 | 
131 | 		return output
132 | 
133 | 
134 | 
135 | class GroupInstanceBranch(nn.Cell):
136 | 
137 | 	def __init__(self, cfg, in_channels):
138 | 		super().__init__()
139 | 		# norm = cfg.MODEL.SPARSE_INST.DECODER.NORM
140 | 		dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM
141 | 		num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS
142 | 		num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS
143 | 		kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
144 | 		self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
145 | 		self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS
146 | 
147 | 		self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
148 | 		# iam prediction, a simple conv
149 | 		expand_dim = dim * self.num_groups
150 | 		self.iam_conv = nn.Conv2d(dim, num_masks * self.num_groups, 3, group=self.num_groups,has_bias=True)
151 | 
152 | 		 # outputs
153 | 		self.fc = nn.Dense(expand_dim, expand_dim)
154 | 		self.cls_score = nn.Dense(expand_dim, self.num_classes)
155 | 		self.mask_kernel = nn.Dense(expand_dim, kernel_dim)
156 | 		self.objectness = nn.Dense(expand_dim, 1)
157 | 
158 | 
159 | 	def construct(self, features):
160 | 		# instance features (x4 convs)
161 | 		features = self.inst_convs(features)
162 | 		# predict instance activation maps
163 | 		iam = self.iam_conv(features)
164 | 		iam_prob = ops.Sigmoid()(iam)
165 | 
166 | 		B, N = iam_prob.shape[:2]
167 | 		C = features.shape[1]
168 | 		# BxNxHxW -> BxNx(HW)
169 | 		iam_prob = iam_prob.view((B, N, -1))
170 | 		# aggregate features: BxCxHxW -> Bx(HW)xC
171 | 		inst_features=ops.BatchMatMul(transpose_b=True)(iam_prob,features.view((B, C, -1)))
172 | 		normalizer = ops.clip_by_value(iam_prob.sum(-1),clip_value_min=Tensor(1e-6,mindspore.float32))
173 | 		inst_features = inst_features / normalizer[:, :, None]
174 | 
175 | 		inst_features=ops.reshape(ops.Transpose()(ops.reshape(inst_features,(B,4,N//4,-1)),(0,2,1,3)),(B,N//4,-1))
176 | 		inst_features=ops.ReLU()(self.fc(inst_features))
177 | 		# predict classification & segmentation kernel & objectness
178 | 		pred_logits = self.cls_score(inst_features)
179 | 		pred_kernel = self.mask_kernel(inst_features)
180 | 		pred_scores = self.objectness(inst_features)
181 | 		return pred_logits, pred_kernel, pred_scores, iam
182 | 
183 | 
184 | 
185 | class GroupIAMDecoder(BaseIAMDecoder):
186 | 	def __init__(self, cfg):
187 | 		super().__init__(cfg)
188 | 		in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2
189 | 		self.inst_branch = GroupInstanceBranch(cfg, in_channels)
190 | 
191 | 
192 | 


--------------------------------------------------------------------------------
/tools/get_flops.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import typing
  3 | from typing import Dict, List, Counter, Any
  4 | import logging
  5 | import numpy as np
  6 | from collections import Counter
  7 | import tqdm
  8 | from fvcore.nn import flop_count_table  # can also try flop_count_str
  9 | from fvcore.nn.jit_handles import conv_flop_jit, Handle, get_shape, conv_flop_count
 10 | from detectron2.checkpoint import DetectionCheckpointer
 11 | from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
 12 | from detectron2.data import build_detection_test_loader
 13 | from detectron2.engine import default_argument_parser
 14 | from detectron2.modeling import build_model
 15 | from detectron2.utils.analysis import (
 16 |     FlopCountAnalysis,
 17 |     activation_count_operators,
 18 |     parameter_count_table,
 19 |     TracingAdapter
 20 | )
 21 | 
 22 | from detectron2.utils.logger import setup_logger
 23 | sys.path.append(".")
 24 | from sparseinst import add_sparse_inst_config
 25 | 
 26 | logger = logging.getLogger("detectron2")
 27 | 
 28 | 
 29 | def dconv_flop_jit(inputs: List[Any], outputs: List[Any]) -> typing.Counter[str]:
 30 |     """
 31 |     Count flops for convolution.
 32 |     """
 33 |     # Inputs of Convolution should be a list of length 12 or 13. They represent:
 34 |     # 0) input tensor, 1) convolution filter, 2) bias, 3) stride, 4) padding,
 35 |     # 5) dilation, 6) transposed, 7) out_pad, 8) groups, 9) benchmark_cudnn,
 36 |     # 10) deterministic_cudnn and 11) user_enabled_cudnn.
 37 |     # starting with #40737 it will be 12) user_enabled_tf32
 38 |     # assert len(inputs) == 12 or len(inputs) == 13, len(inputs)
 39 |     x, _, w = inputs[:3]
 40 |     x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0]))
 41 |     return Counter({"conv": conv_flop_count(x_shape, w_shape, out_shape)})
 42 | 
 43 | 
 44 | _NEW_SUPPORTED_OPS: Dict[str, Handle] = {
 45 |     "prim::PythonOp._DeformConv": dconv_flop_jit,
 46 | }
 47 | 
 48 | 
 49 | class MyFlopCountAnalysis(FlopCountAnalysis):
 50 |     """
 51 |     Same as :class:`fvcore.nn.FlopCountAnalysis`, but supports detectron2 models.
 52 |     """
 53 | 
 54 |     def __init__(self, model, inputs):
 55 |         """
 56 |         Args:
 57 |             model (nn.Module):
 58 |             inputs (Any): inputs of the given model. Does not have to be tuple of tensors.
 59 |         """
 60 |         wrapper = TracingAdapter(model, inputs, allow_non_tensor=True)
 61 |         super().__init__(wrapper, wrapper.flattened_inputs)
 62 |         self.set_op_handle(**_NEW_SUPPORTED_OPS)
 63 | 
 64 | 
 65 | def setup(args):
 66 |     if args.config_file.endswith(".yaml"):
 67 |         cfg = get_cfg()
 68 |         add_sparse_inst_config(cfg)
 69 |         cfg.merge_from_file(args.config_file)
 70 |         print(cfg.MODEL.WEIGHTS)
 71 |         cfg.DATALOADER.NUM_WORKERS = 0
 72 |         cfg.merge_from_list(args.opts)
 73 |         cfg.freeze()
 74 |     else:
 75 |         cfg = LazyConfig.load(args.config_file)
 76 |         cfg = LazyConfig.apply_overrides(cfg, args.opts)
 77 |     setup_logger(name="fvcore")
 78 |     setup_logger()
 79 |     return cfg
 80 | 
 81 | 
 82 | def do_flop(cfg):
 83 |     if isinstance(cfg, CfgNode):
 84 |         data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
 85 |         model = build_model(cfg)
 86 |         DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
 87 |     else:
 88 |         data_loader = instantiate(cfg.dataloader.test)
 89 |         model = instantiate(cfg.model)
 90 |         model.to(cfg.train.device)
 91 |         DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
 92 |     model.eval()
 93 | 
 94 |     counts = Counter()
 95 |     total_flops = []
 96 |     for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
 97 |         flops = MyFlopCountAnalysis(model, data)
 98 |         if idx > 0:
 99 |             flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False)
100 |         counts += flops.by_operator()
101 |         total_flops.append(flops.total())
102 |         # print(flops.unsupported_ops())
103 | 
104 |     logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops))
105 |     logger.info(
106 |         "Average GFlops for each type of operators:\n"
107 |         + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()])
108 |     )
109 |     logger.info(
110 |         "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9)
111 |     )
112 | 
113 | 
114 | def do_activation(cfg):
115 |     if isinstance(cfg, CfgNode):
116 |         data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
117 |         model = build_model(cfg)
118 |         DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
119 |     else:
120 |         data_loader = instantiate(cfg.dataloader.test)
121 |         model = instantiate(cfg.model)
122 |         model.to(cfg.train.device)
123 |         DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
124 |     model.eval()
125 | 
126 |     counts = Counter()
127 |     total_activations = []
128 |     for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
129 |         count = activation_count_operators(model, data)
130 |         counts += count
131 |         total_activations.append(sum(count.values()))
132 |     logger.info(
133 |         "(Million) Activations for Each Type of Operators:\n"
134 |         + str([(k, v / idx) for k, v in counts.items()])
135 |     )
136 |     logger.info(
137 |         "Total (Million) Activations: {}±{}".format(
138 |             np.mean(total_activations), np.std(total_activations)
139 |         )
140 |     )
141 | 
142 | 
143 | def do_parameter(cfg):
144 |     if isinstance(cfg, CfgNode):
145 |         model = build_model(cfg)
146 |     else:
147 |         model = instantiate(cfg.model)
148 |     logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5))
149 | 
150 | 
151 | def do_structure(cfg):
152 |     if isinstance(cfg, CfgNode):
153 |         model = build_model(cfg)
154 |     else:
155 |         model = instantiate(cfg.model)
156 |     logger.info("Model Structure:\n" + str(model))
157 | 
158 | 
159 | if __name__ == "__main__":
160 |     parser = default_argument_parser(
161 |         epilog="""
162 | Examples:
163 | 
164 | To show parameters of a model:
165 | $ ./analyze_model.py --tasks parameter \\
166 |     --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
167 | 
168 | Flops and activations are data-dependent, therefore inputs and model weights
169 | are needed to count them:
170 | 
171 | $ ./analyze_model.py --num-inputs 100 --tasks flop \\
172 |     --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
173 |     MODEL.WEIGHTS /path/to/model.pkl
174 | """
175 |     )
176 |     parser.add_argument(
177 |         "--tasks",
178 |         choices=["flop", "activation", "parameter", "structure"],
179 |         required=True,
180 |         nargs="+",
181 |     )
182 |     parser.add_argument(
183 |         "-n",
184 |         "--num-inputs",
185 |         default=100,
186 |         type=int,
187 |         help="number of inputs used to compute statistics for flops/activations, "
188 |         "both are data dependent.",
189 |     )
190 |     args = parser.parse_args()
191 |     assert not args.eval_only
192 |     assert args.num_gpus == 1
193 |     cfg = setup(args)
194 |     
195 |     for task in args.tasks:
196 |         {
197 |             "flop": do_flop,
198 |             "activation": do_activation,
199 |             "parameter": do_parameter,
200 |             "structure": do_structure,
201 |         }[task](cfg)
202 | 


--------------------------------------------------------------------------------
/tools/test_net.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import time
  4 | import numpy as np
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | from torch.cuda.amp import autocast
  9 | 
 10 | from detectron2.config import get_cfg
 11 | from detectron2.modeling import build_backbone
 12 | from detectron2.checkpoint import DetectionCheckpointer
 13 | from detectron2.structures import ImageList, Instances, BitMasks
 14 | from detectron2.engine import default_argument_parser, default_setup
 15 | from detectron2.data import build_detection_test_loader
 16 | from detectron2.evaluation import COCOEvaluator, print_csv_format
 17 | 
 18 | sys.path.append(".")
 19 | from sparseinst import build_sparse_inst_encoder, build_sparse_inst_decoder, add_sparse_inst_config
 20 | from sparseinst import COCOMaskEvaluator
 21 | 
 22 | 
 23 | device = torch.device('cuda:0')
 24 | dtype = torch.float32
 25 | 
 26 | __all__ = ["SparseInst"]
 27 | 
 28 | pixel_mean = torch.Tensor([123.675, 116.280, 103.530]).to(device).view(3, 1, 1)
 29 | pixel_std = torch.Tensor([58.395, 57.120, 57.375]).to(device).view(3, 1, 1)
 30 | 
 31 | 
 32 | @torch.jit.script
 33 | def normalizer(x, mean, std): return (x - mean) / std
 34 | 
 35 | 
 36 | def synchronize():
 37 |     torch.cuda.synchronize()
 38 | 
 39 | 
 40 | def process_batched_inputs(batched_inputs):
 41 |     images = [x["image"].to(device) for x in batched_inputs]
 42 |     images = [normalizer(x, pixel_mean, pixel_std) for x in images]
 43 |     images = ImageList.from_tensors(images, 32)
 44 |     ori_size = (batched_inputs[0]["height"], batched_inputs[0]["width"])
 45 |     return images.tensor, images.image_sizes[0], ori_size
 46 | 
 47 | 
 48 | @torch.jit.script
 49 | def rescoring_mask(scores, mask_pred, masks):
 50 |     mask_pred_ = mask_pred.float()
 51 |     return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6))
 52 | 
 53 | 
 54 | class SparseInst(nn.Module):
 55 | 
 56 |     def __init__(self, cfg):
 57 | 
 58 |         super().__init__()
 59 | 
 60 |         self.device = torch.device(cfg.MODEL.DEVICE)
 61 |         # backbone
 62 |         self.backbone = build_backbone(cfg)
 63 |         self.size_divisibility = self.backbone.size_divisibility
 64 | 
 65 |         output_shape = self.backbone.output_shape()
 66 | 
 67 |         self.encoder = build_sparse_inst_encoder(cfg, output_shape)
 68 |         self.decoder = build_sparse_inst_decoder(cfg)
 69 | 
 70 |         self.to(self.device)
 71 | 
 72 |         # inference
 73 |         self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD
 74 |         self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD
 75 |         self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS
 76 |         self.mask_format = cfg.INPUT.MASK_FORMAT
 77 |         self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
 78 | 
 79 |     def forward(self, image, resized_size, ori_size):
 80 |         max_size = image.shape[2:]
 81 |         features = self.backbone(image)
 82 |         features = self.encoder(features)
 83 |         output = self.decoder(features)
 84 |         result = self.inference_single(
 85 |             output, resized_size, max_size, ori_size)
 86 |         return result
 87 | 
 88 |     def inference_single(self, outputs, img_shape, pad_shape, ori_shape):
 89 |         """
 90 |         inference for only one sample
 91 |         Args:
 92 |             scores (tensor): [NxC]
 93 |             masks (tensor): [NxHxW]
 94 |             img_shape (list): (h1, w1), image after resized
 95 |             pad_shape (list): (h2, w2), padded resized image
 96 |             ori_shape (list): (h3, w3), original shape h3*w3 < h1*w1 < h2*w2
 97 |         """
 98 |         result = Instances(ori_shape)
 99 |         # scoring
100 |         pred_logits = outputs["pred_logits"][0].sigmoid()
101 |         pred_scores = outputs["pred_scores"][0].sigmoid().squeeze()
102 |         pred_masks = outputs["pred_masks"][0].sigmoid()
103 |         # obtain scores
104 |         scores, labels = pred_logits.max(dim=-1)
105 |         # remove by thresholding
106 |         keep = scores > self.cls_threshold
107 |         scores = torch.sqrt(scores[keep] * pred_scores[keep])
108 |         labels = labels[keep]
109 |         pred_masks = pred_masks[keep]
110 | 
111 |         if scores.size(0) == 0:
112 |             return None
113 |         scores = rescoring_mask(scores, pred_masks > 0.45, pred_masks)
114 |         h, w = img_shape
115 |         # resize masks
116 |         pred_masks = F.interpolate(pred_masks.unsqueeze(1), size=pad_shape,
117 |                                    mode="bilinear", align_corners=False)[:, :, :h, :w]
118 |         pred_masks = F.interpolate(pred_masks, size=ori_shape, mode='bilinear',
119 |                                    align_corners=False).squeeze(1)
120 |         mask_pred = pred_masks > self.mask_threshold
121 | 
122 |         mask_pred = BitMasks(mask_pred)
123 |         result.pred_masks = mask_pred
124 |         result.scores = scores
125 |         result.pred_classes = labels
126 |         return result
127 | 
128 | 
129 | def test_sparseinst_speed(cfg, fp16=False):
130 |     device = torch.device('cuda:0')
131 | 
132 |     model = SparseInst(cfg)
133 |     model.eval()
134 |     model.to(device)
135 |     print(model)
136 |     size = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST)
137 |     DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
138 |         cfg.MODEL.WEIGHTS, resume=False)
139 | 
140 |     torch.backends.cudnn.enable = True
141 |     torch.backends.cudnn.benchmark = False
142 | 
143 |     output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
144 | 
145 |     evaluator = COCOMaskEvaluator(
146 |         cfg.DATASETS.TEST[0], ("segm",), False, output_folder)
147 |     evaluator.reset()
148 |     model.to(device)
149 |     model.eval()
150 |     data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
151 |     durations = []
152 | 
153 |     with autocast(enabled=fp16):
154 |         with torch.no_grad():
155 |             for idx, inputs in enumerate(data_loader):
156 |                 images, resized_size, ori_size = process_batched_inputs(inputs)
157 |                 synchronize()
158 |                 start_time = time.perf_counter()
159 |                 output = model(images, resized_size, ori_size)
160 |                 synchronize()
161 |                 end = time.perf_counter() - start_time
162 | 
163 |                 durations.append(end)
164 |                 if idx % 1000 == 0:
165 |                     print("process: [{}/{}] fps: {:.3f}".format(idx,
166 |                                                                 len(data_loader), 1/np.mean(durations[100:])))
167 |                 evaluator.process(inputs, [{"instances": output}])
168 |     # evaluate
169 |     results = evaluator.evaluate()
170 |     print_csv_format(results)
171 | 
172 |     latency = np.mean(durations[100:])
173 |     fps = 1 / latency
174 |     print("speed: {:.4f}s FPS: {:.2f}".format(latency, fps))
175 | 
176 | 
177 | def setup(args):
178 |     """
179 |     Create configs and perform basic setups.
180 |     """
181 |     cfg = get_cfg()
182 |     add_sparse_inst_config(cfg)
183 |     cfg.merge_from_file(args.config_file)
184 |     cfg.merge_from_list(args.opts)
185 |     cfg.freeze()
186 |     default_setup(cfg, args)
187 |     return cfg
188 | 
189 | 
190 | if __name__ == '__main__':
191 | 
192 |     args = default_argument_parser()
193 |     args.add_argument("--fp16", action="store_true",
194 |                       help="support fp16 for inference")
195 |     args = args.parse_args()
196 |     print("Command Line Args:", args)
197 |     cfg = setup(args)
198 |     test_sparseinst_speed(cfg, fp16=args.fp16)
199 | 


--------------------------------------------------------------------------------
/sparseinst/sparseinst.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | 
  7 | from detectron2.modeling import build_backbone
  8 | from detectron2.structures import ImageList, Instances, BitMasks
  9 | from detectron2.modeling import META_ARCH_REGISTRY, build_backbone
 10 | 
 11 | from .encoder import build_sparse_inst_encoder
 12 | from .decoder import build_sparse_inst_decoder
 13 | from .loss import build_sparse_inst_criterion
 14 | from .utils import nested_tensor_from_tensor_list
 15 | 
 16 | __all__ = ["SparseInst"]
 17 | 
 18 | 
 19 | @torch.jit.script
 20 | def rescoring_mask(scores, mask_pred, masks):
 21 |     mask_pred_ = mask_pred.float()
 22 |     return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6))
 23 | 
 24 | 
 25 | @META_ARCH_REGISTRY.register()
 26 | class SparseInst(nn.Module):
 27 | 
 28 |     def __init__(self, cfg):
 29 |         super().__init__()
 30 | 
 31 |         # move to target device
 32 |         self.device = torch.device(cfg.MODEL.DEVICE)
 33 | 
 34 |         # backbone
 35 |         self.backbone = build_backbone(cfg)
 36 |         self.size_divisibility = self.backbone.size_divisibility
 37 |         output_shape = self.backbone.output_shape()
 38 | 
 39 |         # encoder & decoder
 40 |         self.encoder = build_sparse_inst_encoder(cfg, output_shape)
 41 |         self.decoder = build_sparse_inst_decoder(cfg)
 42 | 
 43 |         # matcher & loss (matcher is built in loss)
 44 |         self.criterion = build_sparse_inst_criterion(cfg)
 45 | 
 46 |         # data and preprocessing
 47 |         self.mask_format = cfg.INPUT.MASK_FORMAT
 48 | 
 49 |         self.pixel_mean = torch.Tensor(
 50 |             cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
 51 |         self.pixel_std = torch.Tensor(
 52 |             cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
 53 |         # self.normalizer = lambda x: (x - pixel_mean) / pixel_std
 54 | 
 55 |         # inference
 56 |         self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD
 57 |         self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD
 58 |         self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS
 59 | 
 60 |     def normalizer(self, image):
 61 |         image = (image - self.pixel_mean) / self.pixel_std
 62 |         return image
 63 | 
 64 |     def preprocess_inputs(self, batched_inputs):
 65 |         images = [x["image"].to(self.device) for x in batched_inputs]
 66 |         images = [self.normalizer(x) for x in images]
 67 |         images = ImageList.from_tensors(images, 32)
 68 |         return images
 69 | 
 70 |     def prepare_targets(self, targets):
 71 |         new_targets = []
 72 |         for targets_per_image in targets:
 73 |             target = {}
 74 |             gt_classes = targets_per_image.gt_classes
 75 |             target["labels"] = gt_classes.to(self.device)
 76 |             h, w = targets_per_image.image_size
 77 |             if not targets_per_image.has('gt_masks'):
 78 |                 gt_masks = BitMasks(torch.empty(0, h, w))
 79 |             else:
 80 |                 gt_masks = targets_per_image.gt_masks
 81 |                 if self.mask_format == "polygon":
 82 |                     if len(gt_masks.polygons) == 0:
 83 |                         gt_masks = BitMasks(torch.empty(0, h, w))
 84 |                     else:
 85 |                         gt_masks = BitMasks.from_polygon_masks(
 86 |                             gt_masks.polygons, h, w)
 87 | 
 88 |             target["masks"] = gt_masks.to(self.device)
 89 |             new_targets.append(target)
 90 | 
 91 |         return new_targets
 92 | 
 93 |     def forward(self, batched_inputs):
 94 |         images = self.preprocess_inputs(batched_inputs)
 95 |         if isinstance(images, (list, torch.Tensor)):
 96 |             images = nested_tensor_from_tensor_list(images)
 97 |         max_shape = images.tensor.shape[2:]
 98 |         # forward
 99 |         features = self.backbone(images.tensor)
100 |         features = self.encoder(features)
101 |         output = self.decoder(features)
102 | 
103 |         if self.training:
104 |             gt_instances = [x["instances"].to(
105 |                 self.device) for x in batched_inputs]
106 |             targets = self.prepare_targets(gt_instances)
107 |             losses = self.criterion(output, targets, max_shape)
108 |             return losses
109 |         else:
110 |             results = self.inference(
111 |                 output, batched_inputs, max_shape, images.image_sizes)
112 |             processed_results = [{"instances": r} for r in results]
113 |             return processed_results
114 | 
115 |     def forward_test(self, images):
116 |         # for inference, onnx, tensorrt
117 |         # input images: BxCxHxW, fixed, need padding size
118 |         # normalize
119 |         images = (images - self.pixel_mean[None]) / self.pixel_std[None]
120 |         features = self.backbone(images)
121 |         features = self.encoder(features)
122 |         output = self.decoder(features)
123 | 
124 |         pred_scores = output["pred_logits"].sigmoid()
125 |         pred_masks = output["pred_masks"].sigmoid()
126 |         pred_objectness = output["pred_scores"].sigmoid()
127 |         pred_scores = torch.sqrt(pred_scores * pred_objectness)
128 |         pred_masks = F.interpolate(
129 |             pred_masks, scale_factor=4.0, mode="bilinear", align_corners=False)
130 |         return pred_scores, pred_masks
131 | 
132 |     def inference(self, output, batched_inputs, max_shape, image_sizes):
133 |         # max_detections = self.max_detections
134 |         results = []
135 |         pred_scores = output["pred_logits"].sigmoid()
136 |         pred_masks = output["pred_masks"].sigmoid()
137 |         pred_objectness = output["pred_scores"].sigmoid()
138 |         pred_scores = torch.sqrt(pred_scores * pred_objectness)
139 | 
140 |         for _, (scores_per_image, mask_pred_per_image, batched_input, img_shape) in enumerate(zip(
141 |                 pred_scores, pred_masks, batched_inputs, image_sizes)):
142 | 
143 |             ori_shape = (batched_input["height"], batched_input["width"])
144 |             result = Instances(ori_shape)
145 |             # max/argmax
146 |             scores, labels = scores_per_image.max(dim=-1)
147 |             # cls threshold
148 |             keep = scores > self.cls_threshold
149 |             scores = scores[keep]
150 |             labels = labels[keep]
151 |             mask_pred_per_image = mask_pred_per_image[keep]
152 | 
153 |             if scores.size(0) == 0:
154 |                 result.scores = scores
155 |                 result.pred_classes = labels
156 |                 results.append(result)
157 |                 continue
158 | 
159 |             h, w = img_shape
160 |             # rescoring mask using maskness
161 |             scores = rescoring_mask(
162 |                 scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image)
163 | 
164 |             # upsample the masks to the original resolution:
165 |             # (1) upsampling the masks to the padded inputs, remove the padding area
166 |             # (2) upsampling/downsampling the masks to the original sizes
167 |             mask_pred_per_image = F.interpolate(
168 |                 mask_pred_per_image.unsqueeze(1), size=max_shape, mode="bilinear", align_corners=False)[:, :, :h, :w]
169 |             mask_pred_per_image = F.interpolate(
170 |                 mask_pred_per_image, size=ori_shape, mode='bilinear', align_corners=False).squeeze(1)
171 | 
172 |             mask_pred = mask_pred_per_image > self.mask_threshold
173 |             # fix the bug for visualization
174 |             # mask_pred = BitMasks(mask_pred)
175 | 
176 |             # using Detectron2 Instances to store the final results
177 |             result.pred_masks = mask_pred
178 |             result.scores = scores
179 |             result.pred_classes = labels
180 |             results.append(result)
181 | 
182 |         return results
183 | 


--------------------------------------------------------------------------------
/tools/train_net.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import itertools
  4 | from typing import Any, Dict, List, Set
  5 | import torch
  6 | 
  7 | import detectron2.utils.comm as comm
  8 | from detectron2.checkpoint import DetectionCheckpointer
  9 | from detectron2.config import get_cfg
 10 | from detectron2.utils.logger import setup_logger
 11 | from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetMapper
 12 | from detectron2.engine import AutogradProfiler, DefaultTrainer, default_argument_parser, default_setup, launch
 13 | from detectron2.evaluation import COCOEvaluator, verify_results
 14 | from detectron2.solver.build import maybe_add_gradient_clipping
 15 | from detectron2.evaluation import (
 16 |     CityscapesInstanceEvaluator,
 17 |     CityscapesSemSegEvaluator,
 18 |     COCOEvaluator,
 19 |     COCOPanopticEvaluator,
 20 |     DatasetEvaluators,
 21 |     LVISEvaluator,
 22 |     PascalVOCDetectionEvaluator,
 23 |     SemSegEvaluator,
 24 |     verify_results,
 25 | )
 26 | 
 27 | sys.path.append(".")
 28 | from sparseinst import add_sparse_inst_config, COCOMaskEvaluator
 29 | 
 30 | 
 31 | class Trainer(DefaultTrainer):
 32 | 
 33 |     @classmethod
 34 |     def build_evaluator(cls, cfg, dataset_name, output_folder=None):
 35 |         """
 36 |         Create evaluator(s) for a given dataset.
 37 |         This uses the special metadata "evaluator_type" associated with each builtin dataset.
 38 |         For your own dataset, you can simply create an evaluator manually in your
 39 |         script and do not have to worry about the hacky if-else logic here.
 40 |         """
 41 |         if output_folder is None:
 42 |             output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
 43 |         evaluator_list = []
 44 |         evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
 45 |         if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
 46 |             evaluator_list.append(
 47 |                 SemSegEvaluator(
 48 |                     dataset_name,
 49 |                     distributed=True,
 50 |                     num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
 51 |                     ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
 52 |                     output_dir=output_folder,
 53 |                 )
 54 |             )
 55 |         if evaluator_type in ["coco", "coco_panoptic_seg"]:
 56 |             evaluator_list.append(COCOMaskEvaluator(dataset_name, ("segm", ), True, output_folder))
 57 |         if evaluator_type == "coco_panoptic_seg":
 58 |             evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
 59 |         if evaluator_type == "cityscapes_instance":
 60 |             assert (
 61 |                 torch.cuda.device_count() >= comm.get_rank()
 62 |             ), "CityscapesEvaluator currently do not work with multiple machines."
 63 |             return CityscapesInstanceEvaluator(dataset_name)
 64 |         if evaluator_type == "cityscapes_sem_seg":
 65 |             assert (
 66 |                 torch.cuda.device_count() >= comm.get_rank()
 67 |             ), "CityscapesEvaluator currently do not work with multiple machines."
 68 |             return CityscapesSemSegEvaluator(dataset_name)
 69 |         elif evaluator_type == "pascal_voc":
 70 |             return PascalVOCDetectionEvaluator(dataset_name)
 71 |         elif evaluator_type == "lvis":
 72 |             return LVISEvaluator(dataset_name, cfg, True, output_folder)
 73 |         if len(evaluator_list) == 0:
 74 |             raise NotImplementedError(
 75 |                 "no Evaluator for the dataset {} with the type {}".format(
 76 |                     dataset_name, evaluator_type
 77 |                 )
 78 |             )
 79 |         elif len(evaluator_list) == 1:
 80 |             return evaluator_list[0]
 81 |         return DatasetEvaluators(evaluator_list)
 82 | 
 83 |     @classmethod
 84 |     def build_optimizer(cls, cfg, model):
 85 |         params: List[Dict[str, Any]] = []
 86 |         memo: Set[torch.nn.parameter.Parameter] = set()
 87 |         for key, value in model.named_parameters(recurse=True):
 88 |             if not value.requires_grad:
 89 |                 continue
 90 |             # Avoid duplicating parameters
 91 |             if value in memo:
 92 |                 continue
 93 |             memo.add(value)
 94 |             lr = cfg.SOLVER.BASE_LR
 95 |             weight_decay = cfg.SOLVER.WEIGHT_DECAY
 96 |             if "backbone" in key:
 97 |                 lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER
 98 |             # for transformer
 99 |             if "patch_embed" in key or "cls_token" in key:
100 |                 weight_decay = 0.0
101 |             if "norm" in key:
102 |                 weight_decay = 0.0
103 |             params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
104 | 
105 |         def maybe_add_full_model_gradient_clipping(optim):  # optim: the optimizer class
106 |             # detectron2 doesn't have full  model gradient clipping now
107 |             clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
108 |             enable = (
109 |                 cfg.SOLVER.CLIP_GRADIENTS.ENABLED
110 |                 and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
111 |                 and clip_norm_val > 0.0
112 |             )
113 | 
114 |             class FullModelGradientClippingOptimizer(optim):
115 |                 def step(self, closure=None):
116 |                     all_params = itertools.chain(*[x["params"] for x in self.param_groups])
117 |                     torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
118 |                     super().step(closure=closure)
119 | 
120 |             return FullModelGradientClippingOptimizer if enable else optim
121 | 
122 |         optimizer_type = cfg.SOLVER.OPTIMIZER
123 |         if optimizer_type == "SGD":
124 |             optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
125 |                 params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
126 |             )
127 |         elif optimizer_type == "ADAMW":
128 |             optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
129 |                 params, cfg.SOLVER.BASE_LR, amsgrad=cfg.SOLVER.AMSGRAD
130 |             )
131 |         else:
132 |             raise NotImplementedError(f"no optimizer type {optimizer_type}")
133 |         if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
134 |             optimizer = maybe_add_gradient_clipping(cfg, optimizer)
135 |         return optimizer
136 | 
137 |     @classmethod
138 |     def build_train_loader(cls, cfg):
139 |         if cfg.MODEL.SPARSE_INST.DATASET_MAPPER == "SparseInstDatasetMapper":
140 |             from sparseinst import SparseInstDatasetMapper
141 |             mapper = SparseInstDatasetMapper(cfg, is_train=True)
142 |         else:
143 |             mapper = None
144 |         return build_detection_train_loader(cfg, mapper=mapper)
145 | 
146 | 
147 | def setup(args):
148 |     """
149 |     Create configs and perform basic setups.
150 |     """
151 |     cfg = get_cfg()
152 |     add_sparse_inst_config(cfg)
153 |     cfg.merge_from_file(args.config_file)
154 |     cfg.merge_from_list(args.opts)
155 |     cfg.freeze()
156 |     default_setup(cfg, args)
157 |     # Setup logger for "sparseinst" module
158 |     setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="sparseinst")
159 |     return cfg
160 | 
161 | 
162 | def main(args):
163 |     cfg = setup(args)
164 | 
165 |     if args.eval_only:
166 |         model = Trainer.build_model(cfg)
167 |         DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
168 |             cfg.MODEL.WEIGHTS, resume=args.resume)
169 |         res = Trainer.test(cfg, model)
170 |         if comm.is_main_process():
171 |             verify_results(cfg, res)
172 |         return res
173 | 
174 |     trainer = Trainer(cfg)
175 |     trainer.resume_or_load(resume=args.resume)
176 |     return trainer.train()
177 | 
178 | 
179 | if __name__ == "__main__":
180 |     args = default_argument_parser().parse_args()
181 |     print("Command Line Args:", args)
182 |     launch(
183 |         main,
184 |         args.num_gpus,
185 |         num_machines=args.num_machines,
186 |         machine_rank=args.machine_rank,
187 |         dist_url=args.dist_url,
188 |         args=(args,),
189 |     )
190 | 


--------------------------------------------------------------------------------
/sparseinst/d2_predictor.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  2 | import atexit
  3 | import bisect
  4 | import multiprocessing as mp
  5 | from collections import deque
  6 | import cv2
  7 | import torch
  8 | 
  9 | from detectron2.data import MetadataCatalog
 10 | from detectron2.engine.defaults import DefaultPredictor
 11 | from detectron2.utils.video_visualizer import VideoVisualizer
 12 | from detectron2.utils.visualizer import ColorMode, Visualizer
 13 | 
 14 | 
 15 | class VisualizationDemo(object):
 16 |     def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
 17 |         """
 18 |         Args:
 19 |             cfg (CfgNode):
 20 |             instance_mode (ColorMode):
 21 |             parallel (bool): whether to run the model in different processes from visualization.
 22 |                 Useful since the visualization logic can be slow.
 23 |         """
 24 |         self.img_format = cfg.INPUT.FORMAT
 25 |         self.metadata = MetadataCatalog.get(
 26 |             cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
 27 |         )
 28 |         self.cpu_device = torch.device("cpu")
 29 |         self.instance_mode = instance_mode
 30 | 
 31 |         self.parallel = parallel
 32 |         if parallel:
 33 |             num_gpu = torch.cuda.device_count()
 34 |             self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
 35 |         else:
 36 |             self.predictor = DefaultPredictor(cfg)
 37 | 
 38 |     def run_on_image(self, image, confidence_threshold):
 39 |         """
 40 |         Args:
 41 |             image (np.ndarray): an image of shape (H, W, C) (in BGR order).
 42 |                 This is the format used by OpenCV.
 43 | 
 44 |         Returns:
 45 |             predictions (dict): the output of the model.
 46 |             vis_output (VisImage): the visualized image output.
 47 |         """
 48 |         vis_output = None
 49 |         predictions = self.predictor(image)
 50 |         visualizer = Visualizer(image, self.metadata,
 51 |                                 instance_mode=self.instance_mode)
 52 |         if "panoptic_seg" in predictions:
 53 |             panoptic_seg, segments_info = predictions["panoptic_seg"]
 54 |             vis_output = visualizer.draw_panoptic_seg_predictions(
 55 |                 panoptic_seg.to(self.cpu_device), segments_info
 56 |             )
 57 |         else:
 58 |             if "sem_seg" in predictions:
 59 |                 vis_output = visualizer.draw_sem_seg(
 60 |                     predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
 61 |                 )
 62 |             if "instances" in predictions:
 63 |                 instances = predictions["instances"].to(self.cpu_device)
 64 |                 instances = instances[instances.scores > confidence_threshold]
 65 |                 predictions["instances"] = instances
 66 |                 vis_output = visualizer.draw_instance_predictions(
 67 |                     predictions=instances)
 68 | 
 69 |         return predictions, vis_output
 70 | 
 71 |     def _frame_from_video(self, video):
 72 |         while video.isOpened():
 73 |             success, frame = video.read()
 74 |             if success:
 75 |                 yield frame
 76 |             else:
 77 |                 break
 78 | 
 79 |     def run_on_video(self, video, confidence_threshold):
 80 |         """
 81 |         Visualizes predictions on frames of the input video.
 82 | 
 83 |         Args:
 84 |             video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
 85 |                 either a webcam or a video file.
 86 | 
 87 |         Yields:
 88 |             ndarray: BGR visualizations of each video frame.
 89 |         """
 90 |         video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
 91 | 
 92 |         def process_predictions(frame, predictions):
 93 |             frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
 94 |             if "panoptic_seg" in predictions:
 95 |                 panoptic_seg, segments_info = predictions["panoptic_seg"]
 96 |                 vis_frame = video_visualizer.draw_panoptic_seg_predictions(
 97 |                     frame, panoptic_seg.to(self.cpu_device), segments_info
 98 |                 )
 99 |             elif "instances" in predictions:
100 |                 predictions = predictions["instances"].to(self.cpu_device)
101 |                 predictions = predictions[predictions.scores >
102 |                                           confidence_threshold]
103 |                 vis_frame = video_visualizer.draw_instance_predictions(
104 |                     frame, predictions)
105 |             elif "sem_seg" in predictions:
106 |                 vis_frame = video_visualizer.draw_sem_seg(
107 |                     frame, predictions["sem_seg"].argmax(
108 |                         dim=0).to(self.cpu_device)
109 |                 )
110 | 
111 |             # Converts Matplotlib RGB format to OpenCV BGR format
112 |             vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
113 |             return vis_frame
114 | 
115 |         frame_gen = self._frame_from_video(video)
116 |         if self.parallel:
117 |             buffer_size = self.predictor.default_buffer_size
118 | 
119 |             frame_data = deque()
120 | 
121 |             for cnt, frame in enumerate(frame_gen):
122 |                 frame_data.append(frame)
123 |                 self.predictor.put(frame)
124 | 
125 |                 if cnt >= buffer_size:
126 |                     frame = frame_data.popleft()
127 |                     predictions = self.predictor.get()
128 |                     yield process_predictions(frame, predictions)
129 | 
130 |             while len(frame_data):
131 |                 frame = frame_data.popleft()
132 |                 predictions = self.predictor.get()
133 |                 yield process_predictions(frame, predictions)
134 |         else:
135 |             for frame in frame_gen:
136 |                 yield process_predictions(frame, self.predictor(frame))
137 | 
138 | 
139 | class AsyncPredictor:
140 |     """
141 |     A predictor that runs the model asynchronously, possibly on >1 GPUs.
142 |     Because rendering the visualization takes considerably amount of time,
143 |     this helps improve throughput a little bit when rendering videos.
144 |     """
145 | 
146 |     class _StopToken:
147 |         pass
148 | 
149 |     class _PredictWorker(mp.Process):
150 |         def __init__(self, cfg, task_queue, result_queue):
151 |             self.cfg = cfg
152 |             self.task_queue = task_queue
153 |             self.result_queue = result_queue
154 |             super().__init__()
155 | 
156 |         def run(self):
157 |             predictor = DefaultPredictor(self.cfg)
158 | 
159 |             while True:
160 |                 task = self.task_queue.get()
161 |                 if isinstance(task, AsyncPredictor._StopToken):
162 |                     break
163 |                 idx, data = task
164 |                 result = predictor(data)
165 |                 self.result_queue.put((idx, result))
166 | 
167 |     def __init__(self, cfg, num_gpus: int = 1):
168 |         """
169 |         Args:
170 |             cfg (CfgNode):
171 |             num_gpus (int): if 0, will run on CPU
172 |         """
173 |         num_workers = max(num_gpus, 1)
174 |         self.task_queue = mp.Queue(maxsize=num_workers * 3)
175 |         self.result_queue = mp.Queue(maxsize=num_workers * 3)
176 |         self.procs = []
177 |         for gpuid in range(max(num_gpus, 1)):
178 |             cfg = cfg.clone()
179 |             cfg.defrost()
180 |             cfg.MODEL.DEVICE = "cuda:{}".format(
181 |                 gpuid) if num_gpus > 0 else "cpu"
182 |             self.procs.append(
183 |                 AsyncPredictor._PredictWorker(
184 |                     cfg, self.task_queue, self.result_queue)
185 |             )
186 | 
187 |         self.put_idx = 0
188 |         self.get_idx = 0
189 |         self.result_rank = []
190 |         self.result_data = []
191 | 
192 |         for p in self.procs:
193 |             p.start()
194 |         atexit.register(self.shutdown)
195 | 
196 |     def put(self, image):
197 |         self.put_idx += 1
198 |         self.task_queue.put((self.put_idx, image))
199 | 
200 |     def get(self):
201 |         self.get_idx += 1  # the index needed for this request
202 |         if len(self.result_rank) and self.result_rank[0] == self.get_idx:
203 |             res = self.result_data[0]
204 |             del self.result_data[0], self.result_rank[0]
205 |             return res
206 | 
207 |         while True:
208 |             # make sure the results are returned in the correct order
209 |             idx, res = self.result_queue.get()
210 |             if idx == self.get_idx:
211 |                 return res
212 |             insert = bisect.bisect(self.result_rank, idx)
213 |             self.result_rank.insert(insert, idx)
214 |             self.result_data.insert(insert, res)
215 | 
216 |     def __len__(self):
217 |         return self.put_idx - self.get_idx
218 | 
219 |     def __call__(self, image):
220 |         self.put(image)
221 |         return self.get()
222 | 
223 |     def shutdown(self):
224 |         for _ in self.procs:
225 |             self.task_queue.put(AsyncPredictor._StopToken())
226 | 
227 |     @property
228 |     def default_buffer_size(self):
229 |         return len(self.procs) * 5
230 | 


--------------------------------------------------------------------------------
/sparseinst/decoder.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
  2 | 
  3 | import math
  4 | import torch
  5 | import torch.nn as nn
  6 | from torch.nn import init
  7 | import torch.nn.functional as F
  8 | 
  9 | from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill
 10 | 
 11 | from detectron2.utils.registry import Registry
 12 | from detectron2.layers import Conv2d
 13 | from sparseinst.encoder import SPARSE_INST_ENCODER_REGISTRY
 14 | 
 15 | SPARSE_INST_DECODER_REGISTRY = Registry("SPARSE_INST_DECODER")
 16 | SPARSE_INST_DECODER_REGISTRY.__doc__ = "registry for SparseInst decoder"
 17 | 
 18 | 
 19 | def _make_stack_3x3_convs(num_convs, in_channels, out_channels):
 20 |     convs = []
 21 |     for _ in range(num_convs):
 22 |         convs.append(
 23 |             Conv2d(in_channels, out_channels, 3, padding=1))
 24 |         convs.append(nn.ReLU(True))
 25 |         in_channels = out_channels
 26 |     return nn.Sequential(*convs)
 27 | 
 28 | 
 29 | class InstanceBranch(nn.Module):
 30 | 
 31 |     def __init__(self, cfg, in_channels):
 32 |         super().__init__()
 33 |         # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM
 34 |         dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM
 35 |         num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS
 36 |         num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS
 37 |         kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
 38 |         self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
 39 | 
 40 |         self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
 41 |         # iam prediction, a simple conv
 42 |         self.iam_conv = nn.Conv2d(dim, num_masks, 3, padding=1)
 43 | 
 44 |         # outputs
 45 |         self.cls_score = nn.Linear(dim, self.num_classes)
 46 |         self.mask_kernel = nn.Linear(dim, kernel_dim)
 47 |         self.objectness = nn.Linear(dim, 1)
 48 | 
 49 |         self.prior_prob = 0.01
 50 |         self._init_weights()
 51 | 
 52 |     def _init_weights(self):
 53 |         for m in self.inst_convs.modules():
 54 |             if isinstance(m, nn.Conv2d):
 55 |                 c2_msra_fill(m)
 56 |         bias_value = -math.log((1 - self.prior_prob) / self.prior_prob)
 57 |         for module in [self.iam_conv, self.cls_score]:
 58 |             init.constant_(module.bias, bias_value)
 59 |         init.normal_(self.iam_conv.weight, std=0.01)
 60 |         init.normal_(self.cls_score.weight, std=0.01)
 61 | 
 62 |         init.normal_(self.mask_kernel.weight, std=0.01)
 63 |         init.constant_(self.mask_kernel.bias, 0.0)
 64 | 
 65 |     def forward(self, features):
 66 |         # instance features (x4 convs)
 67 |         features = self.inst_convs(features)
 68 |         # predict instance activation maps
 69 |         iam = self.iam_conv(features)
 70 |         iam_prob = iam.sigmoid()
 71 | 
 72 |         B, N = iam_prob.shape[:2]
 73 |         C = features.size(1)
 74 |         # BxNxHxW -> BxNx(HW)
 75 |         iam_prob = iam_prob.view(B, N, -1)
 76 |         normalizer = iam_prob.sum(-1).clamp(min=1e-6)
 77 |         iam_prob = iam_prob / normalizer[:, :, None]
 78 |         # aggregate features: BxCxHxW -> Bx(HW)xC
 79 |         inst_features = torch.bmm(
 80 |             iam_prob, features.view(B, C, -1).permute(0, 2, 1))
 81 |         # predict classification & segmentation kernel & objectness
 82 |         pred_logits = self.cls_score(inst_features)
 83 |         pred_kernel = self.mask_kernel(inst_features)
 84 |         pred_scores = self.objectness(inst_features)
 85 |         return pred_logits, pred_kernel, pred_scores, iam
 86 | 
 87 | 
 88 | class MaskBranch(nn.Module):
 89 | 
 90 |     def __init__(self, cfg, in_channels):
 91 |         super().__init__()
 92 |         dim = cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM
 93 |         num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS
 94 |         kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
 95 |         self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
 96 |         self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1)
 97 |         self._init_weights()
 98 | 
 99 |     def _init_weights(self):
100 |         for m in self.mask_convs.modules():
101 |             if isinstance(m, nn.Conv2d):
102 |                 c2_msra_fill(m)
103 |         c2_msra_fill(self.projection)
104 | 
105 |     def forward(self, features):
106 |         # mask features (x4 convs)
107 |         features = self.mask_convs(features)
108 |         return self.projection(features)
109 | 
110 | 
111 | @SPARSE_INST_DECODER_REGISTRY.register()
112 | class BaseIAMDecoder(nn.Module):
113 | 
114 |     def __init__(self, cfg):
115 |         super().__init__()
116 |         # add 2 for coordinates
117 |         in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2
118 | 
119 |         self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR
120 |         self.output_iam = cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM
121 | 
122 |         self.inst_branch = InstanceBranch(cfg, in_channels)
123 |         self.mask_branch = MaskBranch(cfg, in_channels)
124 | 
125 |     @torch.no_grad()
126 |     def compute_coordinates_linspace(self, x):
127 |         # linspace is not supported in ONNX
128 |         h, w = x.size(2), x.size(3)
129 |         y_loc = torch.linspace(-1, 1, h, device=x.device)
130 |         x_loc = torch.linspace(-1, 1, w, device=x.device)
131 |         y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
132 |         y_loc = y_loc.expand([x.shape[0], 1, -1, -1])
133 |         x_loc = x_loc.expand([x.shape[0], 1, -1, -1])
134 |         locations = torch.cat([x_loc, y_loc], 1)
135 |         return locations.to(x)
136 | 
137 |     @torch.no_grad()
138 |     def compute_coordinates(self, x):
139 |         h, w = x.size(2), x.size(3)
140 |         y_loc = -1.0 + 2.0 * torch.arange(h, device=x.device) / (h - 1)
141 |         x_loc = -1.0 + 2.0 * torch.arange(w, device=x.device) / (w - 1)
142 |         y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
143 |         y_loc = y_loc.expand([x.shape[0], 1, -1, -1])
144 |         x_loc = x_loc.expand([x.shape[0], 1, -1, -1])
145 |         locations = torch.cat([x_loc, y_loc], 1)
146 |         return locations.to(x)
147 | 
148 |     def forward(self, features):
149 |         coord_features = self.compute_coordinates(features)
150 |         features = torch.cat([coord_features, features], dim=1)
151 |         pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features)
152 |         mask_features = self.mask_branch(features)
153 | 
154 |         N = pred_kernel.shape[1]
155 |         # mask_features: BxCxHxW
156 |         B, C, H, W = mask_features.shape
157 |         pred_masks = torch.bmm(pred_kernel, mask_features.view(
158 |             B, C, H * W)).view(B, N, H, W)
159 | 
160 |         pred_masks = F.interpolate(
161 |             pred_masks, scale_factor=self.scale_factor,
162 |             mode='bilinear', align_corners=False)
163 | 
164 |         output = {
165 |             "pred_logits": pred_logits,
166 |             "pred_masks": pred_masks,
167 |             "pred_scores": pred_scores,
168 |         }
169 | 
170 |         if self.output_iam:
171 |             iam = F.interpolate(iam, scale_factor=self.scale_factor,
172 |                                 mode='bilinear', align_corners=False)
173 |             output['pred_iam'] = iam
174 | 
175 |         return output
176 | 
177 | 
178 | class GroupInstanceBranch(nn.Module):
179 | 
180 |     def __init__(self, cfg, in_channels):
181 |         super().__init__()
182 |         dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM
183 |         num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS
184 |         num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS
185 |         kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM
186 |         self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS
187 |         self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
188 | 
189 |         self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim)
190 |         # iam prediction, a group conv
191 |         expand_dim = dim * self.num_groups
192 |         self.iam_conv = nn.Conv2d(
193 |             dim, num_masks * self.num_groups, 3, padding=1, groups=self.num_groups)
194 |         # outputs
195 |         self.fc = nn.Linear(expand_dim, expand_dim)
196 | 
197 |         self.cls_score = nn.Linear(expand_dim, self.num_classes)
198 |         self.mask_kernel = nn.Linear(expand_dim, kernel_dim)
199 |         self.objectness = nn.Linear(expand_dim, 1)
200 | 
201 |         self.prior_prob = 0.01
202 |         self._init_weights()
203 | 
204 |     def _init_weights(self):
205 |         for m in self.inst_convs.modules():
206 |             if isinstance(m, nn.Conv2d):
207 |                 c2_msra_fill(m)
208 |         bias_value = -math.log((1 - self.prior_prob) / self.prior_prob)
209 |         for module in [self.iam_conv, self.cls_score]:
210 |             init.constant_(module.bias, bias_value)
211 |         init.normal_(self.iam_conv.weight, std=0.01)
212 |         init.normal_(self.cls_score.weight, std=0.01)
213 | 
214 |         init.normal_(self.mask_kernel.weight, std=0.01)
215 |         init.constant_(self.mask_kernel.bias, 0.0)
216 |         c2_xavier_fill(self.fc)
217 | 
218 |     def forward(self, features):
219 |         # instance features (x4 convs)
220 |         features = self.inst_convs(features)
221 |         # predict instance activation maps
222 |         iam = self.iam_conv(features)
223 |         iam_prob = iam.sigmoid()
224 | 
225 |         B, N = iam_prob.shape[:2]
226 |         C = features.size(1)
227 |         # BxNxHxW -> BxNx(HW)
228 |         iam_prob = iam_prob.view(B, N, -1)
229 |         normalizer = iam_prob.sum(-1).clamp(min=1e-6)
230 |         iam_prob = iam_prob / normalizer[:, :, None]
231 | 
232 |         # aggregate features: BxCxHxW -> Bx(HW)xC
233 |         inst_features = torch.bmm(
234 |             iam_prob, features.view(B, C, -1).permute(0, 2, 1))
235 | 
236 |         inst_features = inst_features.reshape(
237 |             B, 4, N // self.num_groups, -1).transpose(1, 2).reshape(B, N // self.num_groups, -1)
238 | 
239 |         inst_features = F.relu_(self.fc(inst_features))
240 |         # predict classification & segmentation kernel & objectness
241 |         pred_logits = self.cls_score(inst_features)
242 |         pred_kernel = self.mask_kernel(inst_features)
243 |         pred_scores = self.objectness(inst_features)
244 |         return pred_logits, pred_kernel, pred_scores, iam
245 | 
246 | 
247 | @SPARSE_INST_DECODER_REGISTRY.register()
248 | class GroupIAMDecoder(BaseIAMDecoder):
249 | 
250 |     def __init__(self, cfg):
251 |         super().__init__(cfg)
252 |         in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2
253 |         self.inst_branch = GroupInstanceBranch(cfg, in_channels)
254 | 
255 | 
256 | class GroupInstanceSoftBranch(GroupInstanceBranch):
257 | 
258 |     def __init__(self, cfg, in_channels):
259 |         super().__init__(cfg, in_channels)
260 |         self.softmax_bias = nn.Parameter(torch.ones([1, ]))
261 | 
262 |     def forward(self, features):
263 |         # instance features (x4 convs)
264 |         features = self.inst_convs(features)
265 |         # predict instance activation maps
266 |         iam = self.iam_conv(features)
267 | 
268 |         B, N = iam.shape[:2]
269 |         C = features.size(1)
270 |         # BxNxHxW -> BxNx(HW)
271 |         iam_prob = F.softmax(iam.view(B, N, -1) + self.softmax_bias, dim=-1)
272 |         # aggregate features: BxCxHxW -> Bx(HW)xC
273 |         inst_features = torch.bmm(
274 |             iam_prob, features.view(B, C, -1).permute(0, 2, 1))
275 | 
276 |         inst_features = inst_features.reshape(
277 |             B, self.num_groups, N // self.num_groups, -1).transpose(1, 2).reshape(B, N // self.num_groups, -1)
278 | 
279 |         inst_features = F.relu_(self.fc(inst_features))
280 |         # predict classification & segmentation kernel & objectness
281 |         pred_logits = self.cls_score(inst_features)
282 |         pred_kernel = self.mask_kernel(inst_features)
283 |         pred_scores = self.objectness(inst_features)
284 |         return pred_logits, pred_kernel, pred_scores, iam
285 | 
286 | 
287 | @SPARSE_INST_DECODER_REGISTRY.register()
288 | class GroupIAMSoftDecoder(BaseIAMDecoder):
289 | 
290 |     def __init__(self, cfg):
291 |         super().__init__(cfg)
292 |         in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2
293 |         self.inst_branch = GroupInstanceSoftBranch(cfg, in_channels)
294 | 
295 | 
296 | def build_sparse_inst_decoder(cfg):
297 |     name = cfg.MODEL.SPARSE_INST.DECODER.NAME
298 |     return SPARSE_INST_DECODER_REGISTRY.get(name)(cfg)
299 | 


--------------------------------------------------------------------------------
/sparseinst/loss.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | from torch.cuda.amp import autocast
  7 | from scipy.optimize import linear_sum_assignment
  8 | from fvcore.nn import sigmoid_focal_loss_jit
  9 | 
 10 | from detectron2.utils.registry import Registry
 11 | 
 12 | from .utils import nested_masks_from_list, is_dist_avail_and_initialized, get_world_size
 13 | 
 14 | SPARSE_INST_MATCHER_REGISTRY = Registry("SPARSE_INST_MATCHER")
 15 | SPARSE_INST_MATCHER_REGISTRY.__doc__ = "Matcher for SparseInst"
 16 | SPARSE_INST_CRITERION_REGISTRY = Registry("SPARSE_INST_CRITERION")
 17 | SPARSE_INST_CRITERION_REGISTRY.__doc__ = "Criterion for SparseInst"
 18 | 
 19 | 
 20 | def compute_mask_iou(inputs, targets):
 21 |     inputs = inputs.sigmoid()
 22 |     # thresholding
 23 |     binarized_inputs = (inputs >= 0.4).float()
 24 |     targets = (targets > 0.5).float()
 25 |     intersection = (binarized_inputs * targets).sum(-1)
 26 |     union = targets.sum(-1) + binarized_inputs.sum(-1) - intersection
 27 |     score = intersection / (union + 1e-6)
 28 |     return score
 29 | 
 30 | 
 31 | def dice_score(inputs, targets):
 32 |     inputs = inputs.sigmoid()
 33 |     numerator = 2 * torch.matmul(inputs, targets.t())
 34 |     denominator = (
 35 |         inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1)
 36 |     score = numerator / (denominator + 1e-4)
 37 |     return score
 38 | 
 39 | 
 40 | def dice_loss(inputs, targets, reduction='sum'):
 41 |     inputs = inputs.sigmoid()
 42 |     assert inputs.shape == targets.shape
 43 |     numerator = 2 * (inputs * targets).sum(1)
 44 |     denominator = (inputs * inputs).sum(-1) + (targets * targets).sum(-1)
 45 |     loss = 1 - (numerator) / (denominator + 1e-4)
 46 |     if reduction == 'none':
 47 |         return loss
 48 |     return loss.sum()
 49 | 
 50 | 
 51 | @SPARSE_INST_CRITERION_REGISTRY.register()
 52 | class SparseInstCriterion(nn.Module):
 53 |     # This part is partially derivated from: https://github.com/facebookresearch/detr/blob/main/models/detr.py
 54 | 
 55 |     def __init__(self, cfg, matcher):
 56 |         super().__init__()
 57 |         self.matcher = matcher
 58 |         self.losses = cfg.MODEL.SPARSE_INST.LOSS.ITEMS
 59 |         self.weight_dict = self.get_weight_dict(cfg)
 60 |         self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES
 61 | 
 62 |     def get_weight_dict(self, cfg):
 63 |         losses = ("loss_ce", "loss_mask", "loss_dice", "loss_objectness")
 64 |         weight_dict = {}
 65 |         ce_weight = cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT
 66 |         mask_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT
 67 |         dice_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT
 68 |         objectness_weight = cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT
 69 | 
 70 |         weight_dict = dict(
 71 |             zip(losses, (ce_weight, mask_weight, dice_weight, objectness_weight)))
 72 |         return weight_dict
 73 | 
 74 |     def _get_src_permutation_idx(self, indices):
 75 |         # permute predictions following indices
 76 |         batch_idx = torch.cat([torch.full_like(src, i)
 77 |                               for i, (src, _) in enumerate(indices)])
 78 |         src_idx = torch.cat([src for (src, _) in indices])
 79 |         return batch_idx, src_idx
 80 | 
 81 |     def _get_tgt_permutation_idx(self, indices):
 82 |         # permute targets following indices
 83 |         batch_idx = torch.cat([torch.full_like(tgt, i)
 84 |                               for i, (_, tgt) in enumerate(indices)])
 85 |         tgt_idx = torch.cat([tgt for (_, tgt) in indices])
 86 |         return batch_idx, tgt_idx
 87 | 
 88 |     def loss_labels(self, outputs, targets, indices, num_instances, input_shape=None):
 89 |         assert "pred_logits" in outputs
 90 |         src_logits = outputs['pred_logits']
 91 |         idx = self._get_src_permutation_idx(indices)
 92 |         target_classes_o = torch.cat([t["labels"][J]
 93 |                                      for t, (_, J) in zip(targets, indices)])
 94 |         target_classes = torch.full(src_logits.shape[:2], self.num_classes,
 95 |                                     dtype=torch.int64, device=src_logits.device)
 96 |         target_classes[idx] = target_classes_o
 97 | 
 98 |         src_logits = src_logits.flatten(0, 1)
 99 |         # prepare one_hot target.
100 |         target_classes = target_classes.flatten(0, 1)
101 |         pos_inds = torch.nonzero(
102 |             target_classes != self.num_classes, as_tuple=True)[0]
103 |         labels = torch.zeros_like(src_logits)
104 |         labels[pos_inds, target_classes[pos_inds]] = 1
105 |         # comp focal loss.
106 |         class_loss = sigmoid_focal_loss_jit(
107 |             src_logits,
108 |             labels,
109 |             alpha=0.25,
110 |             gamma=2.0,
111 |             reduction="sum",
112 |         ) / num_instances
113 |         losses = {'loss_ce': class_loss}
114 |         return losses
115 | 
116 |     def loss_masks_with_iou_objectness(self, outputs, targets, indices, num_instances, input_shape):
117 |         src_idx = self._get_src_permutation_idx(indices)
118 |         tgt_idx = self._get_tgt_permutation_idx(indices)
119 |         # Bx100xHxW
120 |         assert "pred_masks" in outputs
121 |         assert "pred_scores" in outputs
122 |         src_iou_scores = outputs["pred_scores"]
123 |         src_masks = outputs["pred_masks"]
124 |         with torch.no_grad():
125 |             target_masks, _ = nested_masks_from_list(
126 |                 [t["masks"].tensor for t in targets], input_shape).decompose()
127 |         num_masks = [len(t["masks"]) for t in targets]
128 |         target_masks = target_masks.to(src_masks)
129 |         if len(target_masks) == 0:
130 |             losses = {
131 |                 "loss_dice": src_masks.sum() * 0.0,
132 |                 "loss_mask": src_masks.sum() * 0.0,
133 |                 "loss_objectness": src_iou_scores.sum() * 0.0
134 |             }
135 |             return losses
136 | 
137 |         src_masks = src_masks[src_idx]
138 |         target_masks = F.interpolate(
139 |             target_masks[:, None], size=src_masks.shape[-2:], mode='bilinear', align_corners=False).squeeze(1)
140 | 
141 |         src_masks = src_masks.flatten(1)
142 |         # FIXME: tgt_idx
143 |         mix_tgt_idx = torch.zeros_like(tgt_idx[1])
144 |         cum_sum = 0
145 |         for num_mask in num_masks:
146 |             mix_tgt_idx[cum_sum: cum_sum + num_mask] = cum_sum
147 |             cum_sum += num_mask
148 |         mix_tgt_idx += tgt_idx[1]
149 | 
150 |         target_masks = target_masks[mix_tgt_idx].flatten(1)
151 | 
152 |         with torch.no_grad():
153 |             ious = compute_mask_iou(src_masks, target_masks)
154 | 
155 |         tgt_iou_scores = ious
156 |         src_iou_scores = src_iou_scores[src_idx]
157 |         tgt_iou_scores = tgt_iou_scores.flatten(0)
158 |         src_iou_scores = src_iou_scores.flatten(0)
159 | 
160 |         losses = {
161 |             "loss_objectness": F.binary_cross_entropy_with_logits(src_iou_scores, tgt_iou_scores, reduction='mean'),
162 |             "loss_dice": dice_loss(src_masks, target_masks) / num_instances,
163 |             "loss_mask": F.binary_cross_entropy_with_logits(src_masks, target_masks, reduction='mean')
164 |         }
165 |         return losses
166 | 
167 |     def get_loss(self, loss, outputs, targets, indices, num_instances, **kwargs):
168 |         loss_map = {
169 |             "labels": self.loss_labels,
170 |             "masks": self.loss_masks_with_iou_objectness,
171 |         }
172 |         if loss == "loss_objectness":
173 |             # NOTE: loss_objectness will be calculated in `loss_masks_with_iou_objectness`
174 |             return {}
175 |         assert loss in loss_map
176 |         return loss_map[loss](outputs, targets, indices, num_instances, **kwargs)
177 | 
178 |     def forward(self, outputs, targets, input_shape):
179 | 
180 |         outputs_without_aux = {k: v for k,
181 |                                v in outputs.items() if k != 'aux_outputs'}
182 | 
183 |         # Retrieve the matching between the outputs of the last layer and the targets
184 |         indices = self.matcher(outputs_without_aux, targets, input_shape)
185 |         # Compute the average number of target boxes accross all nodes, for normalization purposes
186 |         num_instances = sum(len(t["labels"]) for t in targets)
187 |         num_instances = torch.as_tensor(
188 |             [num_instances], dtype=torch.float, device=next(iter(outputs.values())).device)
189 |         if is_dist_avail_and_initialized():
190 |             torch.distributed.all_reduce(num_instances)
191 |         num_instances = torch.clamp(
192 |             num_instances / get_world_size(), min=1).item()
193 |         # Compute all the requested losses
194 |         losses = {}
195 |         for loss in self.losses:
196 |             losses.update(self.get_loss(loss, outputs, targets, indices,
197 |                                         num_instances, input_shape=input_shape))
198 | 
199 |         for k in losses.keys():
200 |             if k in self.weight_dict:
201 |                 losses[k] *= self.weight_dict[k]
202 | 
203 |         return losses
204 | 
205 | 
206 | @SPARSE_INST_MATCHER_REGISTRY.register()
207 | class SparseInstMatcherV1(nn.Module):
208 | 
209 |     def __init__(self, cfg):
210 |         super().__init__()
211 |         self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA
212 |         self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA
213 |         self.mask_score = dice_score
214 | 
215 |     @torch.no_grad()
216 |     def forward(self, outputs, targets, input_shape):
217 |         B, N, H, W = outputs["pred_masks"].shape
218 |         pred_masks = outputs['pred_masks']
219 |         pred_logits = outputs['pred_logits'].sigmoid()
220 | 
221 |         indices = []
222 | 
223 |         for i in range(B):
224 |             tgt_ids = targets[i]["labels"]
225 |             # no annotations
226 |             if tgt_ids.shape[0] == 0:
227 |                 indices.append((torch.as_tensor([]),
228 |                                 torch.as_tensor([])))
229 |                 continue
230 | 
231 |             tgt_masks = targets[i]['masks'].tensor.to(pred_masks)
232 |             pred_logit = pred_logits[i]
233 |             out_masks = pred_masks[i]
234 | 
235 |             # upsampling:
236 |             # (1) padding/
237 |             # (2) upsampling to 1x input size (input_shape)
238 |             # (3) downsampling to 0.25x input size (output mask size)
239 |             ori_h, ori_w = tgt_masks.size(1), tgt_masks.size(2)
240 |             tgt_masks_ = torch.zeros(
241 |                 (1, tgt_masks.size(0), input_shape[0], input_shape[1])).to(pred_masks)
242 |             tgt_masks_[0, :, :ori_h, :ori_w] = tgt_masks
243 |             tgt_masks = F.interpolate(
244 |                 tgt_masks_, size=out_masks.shape[-2:], mode='bilinear', align_corners=False)[0]
245 | 
246 |             # compute dice score and classification score
247 |             tgt_masks = tgt_masks.flatten(1)
248 |             out_masks = out_masks.flatten(1)
249 | 
250 |             mask_score = self.mask_score(out_masks, tgt_masks)
251 |             # Nx(Number of gts)
252 |             matching_prob = pred_logit[:, tgt_ids]
253 |             C = (mask_score ** self.alpha) * (matching_prob ** self.beta)
254 |             # hungarian matching
255 |             inds = linear_sum_assignment(C.cpu(), maximize=True)
256 |             indices.append(inds)
257 |         return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
258 | 
259 | 
260 | @SPARSE_INST_MATCHER_REGISTRY.register()
261 | class SparseInstMatcher(nn.Module):
262 | 
263 |     def __init__(self, cfg):
264 |         super().__init__()
265 |         self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA
266 |         self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA
267 |         self.mask_score = dice_score
268 | 
269 |     def forward(self, outputs, targets, input_shape):
270 |         with torch.no_grad():
271 |             B, N, H, W = outputs["pred_masks"].shape
272 |             pred_masks = outputs['pred_masks']
273 |             pred_logits = outputs['pred_logits'].sigmoid()
274 | 
275 |             tgt_ids = torch.cat([v["labels"] for v in targets])
276 | 
277 |             if tgt_ids.shape[0] == 0:
278 |                 return [(torch.as_tensor([]).to(pred_logits), torch.as_tensor([]).to(pred_logits))] * B
279 |             tgt_masks, _ = nested_masks_from_list(
280 |                 [t["masks"].tensor for t in targets], input_shape).decompose()
281 |             device = pred_masks.device
282 |             tgt_masks = tgt_masks.to(pred_masks)
283 | 
284 |             tgt_masks = F.interpolate(
285 |                 tgt_masks[:, None], size=pred_masks.shape[-2:], mode="bilinear", align_corners=False).squeeze(1)
286 | 
287 |             pred_masks = pred_masks.view(B * N, -1)
288 |             tgt_masks = tgt_masks.flatten(1)
289 |             with autocast(enabled=False):
290 |                 pred_masks = pred_masks.float()
291 |                 tgt_masks = tgt_masks.float()
292 |                 pred_logits = pred_logits.float()
293 |                 mask_score = self.mask_score(pred_masks, tgt_masks)
294 |                 # Nx(Number of gts)
295 |                 matching_prob = pred_logits.view(B * N, -1)[:, tgt_ids]
296 |                 C = (mask_score ** self.alpha) * (matching_prob ** self.beta)
297 | 
298 |             C = C.view(B, N, -1).cpu()
299 |             # hungarian matching
300 |             sizes = [len(v["masks"]) for v in targets]
301 |             indices = [linear_sum_assignment(c[i], maximize=True)
302 |                        for i, c in enumerate(C.split(sizes, -1))]
303 |             indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(
304 |                 j, dtype=torch.int64)) for i, j in indices]
305 |             return indices
306 | 
307 | 
308 | def build_sparse_inst_matcher(cfg):
309 |     name = cfg.MODEL.SPARSE_INST.MATCHER.NAME
310 |     return SPARSE_INST_MATCHER_REGISTRY.get(name)(cfg)
311 | 
312 | 
313 | def build_sparse_inst_criterion(cfg):
314 |     matcher = build_sparse_inst_matcher(cfg)
315 |     name = cfg.MODEL.SPARSE_INST.LOSS.NAME
316 |     return SPARSE_INST_CRITERION_REGISTRY.get(name)(cfg, matcher)
317 | 


--------------------------------------------------------------------------------
/sparseinst/backbones/pvt.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | from functools import partial
  6 | from timm.models.layers import DropPath, to_2tuple, trunc_normal_
  7 | from detectron2.layers import ShapeSpec
  8 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY
  9 | 
 10 | 
 11 | class Mlp(nn.Module):
 12 |     def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
 13 |         super().__init__()
 14 |         out_features = out_features or in_features
 15 |         hidden_features = hidden_features or in_features
 16 |         self.fc1 = nn.Linear(in_features, hidden_features)
 17 |         self.dwconv = DWConv(hidden_features)
 18 |         self.act = act_layer()
 19 |         self.fc2 = nn.Linear(hidden_features, out_features)
 20 |         self.drop = nn.Dropout(drop)
 21 |         self.linear = linear
 22 |         if self.linear:
 23 |             self.relu = nn.ReLU(inplace=True)
 24 |         self.apply(self._init_weights)
 25 | 
 26 |     def _init_weights(self, m):
 27 |         if isinstance(m, nn.Linear):
 28 |             trunc_normal_(m.weight, std=.02)
 29 |             if isinstance(m, nn.Linear) and m.bias is not None:
 30 |                 nn.init.constant_(m.bias, 0)
 31 |         elif isinstance(m, nn.LayerNorm):
 32 |             nn.init.constant_(m.bias, 0)
 33 |             nn.init.constant_(m.weight, 1.0)
 34 |         elif isinstance(m, nn.Conv2d):
 35 |             fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
 36 |             fan_out //= m.groups
 37 |             m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
 38 |             if m.bias is not None:
 39 |                 m.bias.data.zero_()
 40 | 
 41 |     def forward(self, x, H, W):
 42 |         x = self.fc1(x)
 43 |         if self.linear:
 44 |             x = self.relu(x)
 45 |         x = self.dwconv(x, H, W)
 46 |         x = self.act(x)
 47 |         x = self.drop(x)
 48 |         x = self.fc2(x)
 49 |         x = self.drop(x)
 50 |         return x
 51 | 
 52 | 
 53 | class Attention(nn.Module):
 54 |     def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False):
 55 |         super().__init__()
 56 |         assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
 57 | 
 58 |         self.dim = dim
 59 |         self.num_heads = num_heads
 60 |         head_dim = dim // num_heads
 61 |         self.scale = qk_scale or head_dim ** -0.5
 62 | 
 63 |         self.q = nn.Linear(dim, dim, bias=qkv_bias)
 64 |         self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
 65 |         self.attn_drop = nn.Dropout(attn_drop)
 66 |         self.proj = nn.Linear(dim, dim)
 67 |         self.proj_drop = nn.Dropout(proj_drop)
 68 | 
 69 |         self.linear = linear
 70 |         self.sr_ratio = sr_ratio
 71 |         if not linear:
 72 |             if sr_ratio > 1:
 73 |                 self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
 74 |                 self.norm = nn.LayerNorm(dim)
 75 |         else:
 76 |             self.pool = nn.AdaptiveAvgPool2d(7)
 77 |             self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
 78 |             self.norm = nn.LayerNorm(dim)
 79 |             self.act = nn.GELU()
 80 |         self.apply(self._init_weights)
 81 | 
 82 |     def _init_weights(self, m):
 83 |         if isinstance(m, nn.Linear):
 84 |             trunc_normal_(m.weight, std=.02)
 85 |             if isinstance(m, nn.Linear) and m.bias is not None:
 86 |                 nn.init.constant_(m.bias, 0)
 87 |         elif isinstance(m, nn.LayerNorm):
 88 |             nn.init.constant_(m.bias, 0)
 89 |             nn.init.constant_(m.weight, 1.0)
 90 |         elif isinstance(m, nn.Conv2d):
 91 |             fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
 92 |             fan_out //= m.groups
 93 |             m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
 94 |             if m.bias is not None:
 95 |                 m.bias.data.zero_()
 96 | 
 97 |     def forward(self, x, H, W):
 98 |         B, N, C = x.shape
 99 |         q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
100 | 
101 |         if not self.linear:
102 |             if self.sr_ratio > 1:
103 |                 x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
104 |                 x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
105 |                 x_ = self.norm(x_)
106 |                 kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
107 |             else:
108 |                 kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
109 |         else:
110 |             x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
111 |             x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1)
112 |             x_ = self.norm(x_)
113 |             x_ = self.act(x_)
114 |             kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
115 |         k, v = kv[0], kv[1]
116 | 
117 |         attn = (q @ k.transpose(-2, -1)) * self.scale
118 |         attn = attn.softmax(dim=-1)
119 |         attn = self.attn_drop(attn)
120 | 
121 |         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
122 |         x = self.proj(x)
123 |         x = self.proj_drop(x)
124 | 
125 |         return x
126 | 
127 | 
128 | class Block(nn.Module):
129 | 
130 |     def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
131 |                  drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
132 |         super().__init__()
133 |         self.norm1 = norm_layer(dim)
134 |         self.attn = Attention(
135 |             dim,
136 |             num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
137 |             attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
138 |         # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
139 |         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
140 |         self.norm2 = norm_layer(dim)
141 |         mlp_hidden_dim = int(dim * mlp_ratio)
142 |         self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear)
143 | 
144 |         self.apply(self._init_weights)
145 | 
146 |     def _init_weights(self, m):
147 |         if isinstance(m, nn.Linear):
148 |             trunc_normal_(m.weight, std=.02)
149 |             if isinstance(m, nn.Linear) and m.bias is not None:
150 |                 nn.init.constant_(m.bias, 0)
151 |         elif isinstance(m, nn.LayerNorm):
152 |             nn.init.constant_(m.bias, 0)
153 |             nn.init.constant_(m.weight, 1.0)
154 |         elif isinstance(m, nn.Conv2d):
155 |             fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
156 |             fan_out //= m.groups
157 |             m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
158 |             if m.bias is not None:
159 |                 m.bias.data.zero_()
160 | 
161 |     def forward(self, x, H, W):
162 |         x = x + self.drop_path(self.attn(self.norm1(x), H, W))
163 |         x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
164 | 
165 |         return x
166 | 
167 | 
168 | class OverlapPatchEmbed(nn.Module):
169 |     """ Image to Patch Embedding
170 |     """
171 | 
172 |     def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
173 |         super().__init__()
174 |         img_size = to_2tuple(img_size)
175 |         patch_size = to_2tuple(patch_size)
176 | 
177 |         self.img_size = img_size
178 |         self.patch_size = patch_size
179 |         self.H, self.W = img_size[0] // stride, img_size[1] // stride
180 |         self.num_patches = self.H * self.W
181 |         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
182 |                               padding=(patch_size[0] // 2, patch_size[1] // 2))
183 |         self.norm = nn.LayerNorm(embed_dim)
184 | 
185 |         self.apply(self._init_weights)
186 | 
187 |     def _init_weights(self, m):
188 |         if isinstance(m, nn.Linear):
189 |             trunc_normal_(m.weight, std=.02)
190 |             if isinstance(m, nn.Linear) and m.bias is not None:
191 |                 nn.init.constant_(m.bias, 0)
192 |         elif isinstance(m, nn.LayerNorm):
193 |             nn.init.constant_(m.bias, 0)
194 |             nn.init.constant_(m.weight, 1.0)
195 |         elif isinstance(m, nn.Conv2d):
196 |             fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
197 |             fan_out //= m.groups
198 |             m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
199 |             if m.bias is not None:
200 |                 m.bias.data.zero_()
201 | 
202 |     def forward(self, x):
203 |         x = self.proj(x)
204 |         _, _, H, W = x.shape
205 |         x = x.flatten(2).transpose(1, 2)
206 |         x = self.norm(x)
207 | 
208 |         return x, H, W
209 | 
210 | 
211 | class PyramidVisionTransformerV2(Backbone):
212 |     def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dims=[64, 128, 256, 512],
213 |                  num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
214 |                  attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, depths=[3, 4, 6, 3],
215 |                  sr_ratios=[8, 4, 2, 1], num_stages=4, linear=False, out_features=None):
216 |         super().__init__()
217 |         self.depths = depths
218 |         self.num_stages = num_stages
219 |         self.linear = linear
220 | 
221 |         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
222 |         cur = 0
223 | 
224 |         for i in range(num_stages):
225 |             patch_embed = OverlapPatchEmbed(img_size=img_size if i == 0 else img_size // (2 ** (i + 1)),
226 |                                             patch_size=7 if i == 0 else 3,
227 |                                             stride=4 if i == 0 else 2,
228 |                                             in_chans=in_chans if i == 0 else embed_dims[i - 1],
229 |                                             embed_dim=embed_dims[i])
230 | 
231 |             block = nn.ModuleList([Block(
232 |                 dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
233 |                 qk_scale=qk_scale,
234 |                 drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer,
235 |                 sr_ratio=sr_ratios[i], linear=linear)
236 |                 for j in range(depths[i])])
237 |             norm = norm_layer(embed_dims[i])
238 |             cur += depths[i]
239 | 
240 |             setattr(self, f"patch_embed{i + 1}", patch_embed)
241 |             setattr(self, f"block{i + 1}", block)
242 |             setattr(self, f"norm{i + 1}", norm)
243 |         
244 |         out_features_names = ["p1", "p2", "p3", "p4"]
245 |         self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32]))
246 |         self._out_feature_channels = dict(zip(out_features_names, embed_dims))
247 |         if out_features is None:
248 |             self._out_features = out_features_names
249 |         else:
250 |             self._out_features = out_features
251 |         self.out_features_names = out_features_names
252 |         self.apply(self._init_weights)
253 | 
254 |     def _init_weights(self, m):
255 |         if isinstance(m, nn.Linear):
256 |             trunc_normal_(m.weight, std=.02)
257 |             if isinstance(m, nn.Linear) and m.bias is not None:
258 |                 nn.init.constant_(m.bias, 0)
259 |         elif isinstance(m, nn.LayerNorm):
260 |             nn.init.constant_(m.bias, 0)
261 |             nn.init.constant_(m.weight, 1.0)
262 |         elif isinstance(m, nn.Conv2d):
263 |             fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
264 |             fan_out //= m.groups
265 |             m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
266 |             if m.bias is not None:
267 |                 m.bias.data.zero_()
268 | 
269 |     def freeze_patch_emb(self):
270 |         self.patch_embed1.requires_grad = False
271 | 
272 |     @torch.jit.ignore
273 |     def no_weight_decay(self):
274 |         return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better
275 | 
276 | 
277 |     def output_shape(self):
278 |         return {
279 |             name: ShapeSpec(
280 |                 channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
281 |             )
282 |             for name in self._out_features
283 |         }
284 | 
285 |     def size_divisibility(self):
286 |         return 32
287 | 
288 | 
289 |     def forward(self, x):
290 |         B = x.shape[0]
291 |         outputs = {}
292 | 
293 |         for i in range(self.num_stages):
294 |             patch_embed = getattr(self, f"patch_embed{i + 1}")
295 |             block = getattr(self, f"block{i + 1}")
296 |             norm = getattr(self, f"norm{i + 1}")
297 |             x, H, W = patch_embed(x)
298 |             for blk in block:
299 |                 x = blk(x, H, W)
300 |             x = norm(x)
301 |             x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
302 |             if self.out_features_names[i] in self._out_features:
303 |                 outputs[self.out_features_names[i]] = x
304 |         return outputs
305 | 
306 | 
307 | class DWConv(nn.Module):
308 |     def __init__(self, dim=768):
309 |         super(DWConv, self).__init__()
310 |         self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
311 | 
312 |     def forward(self, x, H, W):
313 |         B, N, C = x.shape
314 |         x = x.transpose(1, 2).view(B, C, H, W)
315 |         x = self.dwconv(x)
316 |         x = x.flatten(2).transpose(1, 2)
317 | 
318 |         return x
319 | 
320 | 
321 | def _conv_filter(state_dict, patch_size=16):
322 |     """ convert patch embedding weight from manual patchify + linear proj to conv"""
323 |     out_dict = {}
324 |     for k, v in state_dict.items():
325 |         if 'patch_embed.proj.weight' in k:
326 |             v = v.reshape((v.shape[0], 3, patch_size, patch_size))
327 |         out_dict[k] = v
328 | 
329 |     return out_dict
330 | 
331 | 
332 | @BACKBONE_REGISTRY.register()
333 | def build_pyramid_vision_transformer(cfg, input_shape):
334 |     name = cfg.MODEL.PVT.NAME
335 |     linear = cfg.MODEL.PVT.LINEAR
336 |     out_features = cfg.MODEL.PVT.OUT_FEATURES
337 | 
338 |     if linear:
339 |         name = "b2"
340 | 
341 |     if name == "b0":
342 |         embed_dims=[32, 64, 160, 256]
343 |     else:
344 |         embed_dims=[64, 128, 320, 512]
345 |     
346 |     depths = {
347 |         "b0": [2, 2, 2, 2],
348 |         "b1": [2, 2, 2, 2],
349 |         "b2": [3, 4, 6, 3],
350 |         "b3": [3, 4, 18, 3],
351 |         "b4": [3, 8, 27, 3],
352 |         "b5": [3, 6, 40, 3]
353 |     }
354 | 
355 |     if name == "b5":
356 |         mlp_ratios = [4, 4, 4, 4]
357 |     else:
358 |         mlp_ratios = [8, 8, 4, 4]
359 | 
360 |     in_channels = input_shape.channels
361 | 
362 |     return PyramidVisionTransformerV2(
363 |         patch_size=4,
364 |         depths=depths[name],
365 |         in_chans=in_channels,
366 |         embed_dims=embed_dims,
367 |         num_heads=[1, 2, 5, 8],
368 |         mlp_ratios=mlp_ratios,
369 |         drop_rate=0.0,
370 |         drop_path_rate=0.1,
371 |         sr_ratios=[8, 4, 2, 1],
372 |         qkv_bias=True,
373 |         norm_layer=partial(nn.LayerNorm, eps=1e-6),
374 |         out_features=out_features,
375 |         linear=linear
376 |     )
377 | 
378 | 


--------------------------------------------------------------------------------
/sparseinst/backbones/resnet.py:
--------------------------------------------------------------------------------
  1 | #
  2 | #  Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved
  3 | 
  4 | import math
  5 | import torch.nn as nn
  6 | from timm.models.resnet import BasicBlock, Bottleneck
  7 | from timm.models.layers import DropBlock2d, DropPath, AvgPool2dSame
  8 | 
  9 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d
 10 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY
 11 | from detectron2.layers import NaiveSyncBatchNorm, DeformConv
 12 | 
 13 | 
 14 | def get_padding(kernel_size, stride, dilation=1):
 15 |     padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
 16 |     return padding
 17 | 
 18 | 
 19 | """
 20 | inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
 21 |                  reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
 22 |                  attn_layer=None, aa_layer=None, drop_block=None, drop_path=None
 23 | """
 24 | 
 25 | 
 26 | class DeformableBottleneck(nn.Module):
 27 |     expansion = 4
 28 | 
 29 |     def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
 30 |                  reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
 31 |                  attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
 32 |         super().__init__()
 33 | 
 34 |         width = int(math.floor(planes * (base_width / 64)) * cardinality)
 35 |         first_planes = width // reduce_first
 36 |         outplanes = planes * self.expansion
 37 |         first_dilation = first_dilation or dilation
 38 |         # use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation)
 39 | 
 40 |         self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False)
 41 |         self.bn1 = norm_layer(first_planes)
 42 |         self.act1 = act_layer(inplace=True)
 43 | 
 44 |         self.conv2_offset = nn.Conv2d(
 45 |             first_planes,
 46 |             18,
 47 |             kernel_size=3,
 48 |             stride=stride,
 49 |             padding=first_dilation,
 50 |             dilation=first_dilation
 51 |         )
 52 |         self.conv2 = DeformConv(
 53 |             first_planes,
 54 |             width,
 55 |             kernel_size=3,
 56 |             stride=stride,
 57 |             padding=first_dilation,
 58 |             bias=False,
 59 |             dilation=first_dilation,
 60 |         )
 61 | 
 62 |         self.bn2 = norm_layer(width)
 63 |         self.act2 = act_layer(inplace=True)
 64 |         # self.aa = aa_layer(channels=width, stride=stride) if use_aa else None
 65 | 
 66 |         self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False)
 67 |         self.bn3 = norm_layer(outplanes)
 68 | 
 69 |         # self.se = create_attn(attn_layer, outplanes)
 70 | 
 71 |         self.act3 = act_layer(inplace=True)
 72 |         self.downsample = downsample
 73 |         self.stride = stride
 74 |         self.dilation = dilation
 75 |         # self.drop_block = drop_block
 76 |         # self.drop_path = drop_path
 77 | 
 78 |         nn.init.constant_(self.conv2_offset.weight, 0)
 79 |         nn.init.constant_(self.conv2_offset.bias, 0)
 80 | 
 81 |     def zero_init_last_bn(self):
 82 |         nn.init.zeros_(self.bn3.weight)
 83 | 
 84 |     def forward(self, x):
 85 |         shortcut = x
 86 | 
 87 |         x = self.conv1(x)
 88 |         x = self.bn1(x)
 89 | 
 90 |         x = self.act1(x)
 91 | 
 92 |         offset = self.conv2_offset(x)
 93 |         x = self.conv2(x, offset)
 94 |         x = self.bn2(x)
 95 |         x = self.act2(x)
 96 | 
 97 |         x = self.conv3(x)
 98 |         x = self.bn3(x)
 99 | 
100 |         if self.downsample is not None:
101 |             shortcut = self.downsample(shortcut)
102 |         x += shortcut
103 |         x = self.act3(x)
104 | 
105 |         return x
106 | 
107 | 
108 | BLOCK_TYPE = {
109 |     "basic": BasicBlock,
110 |     "bottleneck": Bottleneck,
111 |     "deform_bottleneck": DeformableBottleneck
112 | }
113 | 
114 | 
115 | def downsample_conv(
116 |         in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
117 |     norm_layer = norm_layer or nn.BatchNorm2d
118 |     kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size
119 |     first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1
120 |     p = get_padding(kernel_size, stride, first_dilation)
121 | 
122 |     return nn.Sequential(*[
123 |         nn.Conv2d(
124 |             in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False),
125 |         norm_layer(out_channels)
126 |     ])
127 | 
128 | 
129 | def downsample_avg(
130 |         in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
131 |     norm_layer = norm_layer or nn.BatchNorm2d
132 |     avg_stride = stride if dilation == 1 else 1
133 |     if stride == 1 and dilation == 1:
134 |         pool = nn.Identity()
135 |     else:
136 |         avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
137 |         pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
138 | 
139 |     return nn.Sequential(*[
140 |         pool,
141 |         nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False),
142 |         norm_layer(out_channels)
143 |     ])
144 | 
145 | 
146 | def drop_blocks(drop_block_rate=0.):
147 |     return [
148 |         None, None,
149 |         DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None,
150 |         DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None]
151 | 
152 | 
153 | def make_blocks(
154 |         stage_block, channels, block_repeats, inplanes, reduce_first=1, output_stride=32,
155 |         down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs):
156 |     stages = []
157 |     feature_info = []
158 |     net_num_blocks = sum(block_repeats)
159 |     net_block_idx = 0
160 |     net_stride = 4
161 |     dilation = prev_dilation = 1
162 |     for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))):
163 |         # choose block_fn through the BLOCK_TYPE
164 |         block_fn = BLOCK_TYPE[stage_block[stage_idx]]
165 | 
166 |         stage_name = f'layer{stage_idx + 1}'  # never liked this name, but weight compat requires it
167 |         stride = 1 if stage_idx == 0 else 2
168 |         if net_stride >= output_stride:
169 |             dilation *= stride
170 |             stride = 1
171 |         else:
172 |             net_stride *= stride
173 | 
174 |         downsample = None
175 |         if stride != 1 or inplanes != planes * block_fn.expansion:
176 |             down_kwargs = dict(
177 |                 in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size,
178 |                 stride=stride, dilation=dilation, first_dilation=prev_dilation, norm_layer=kwargs.get('norm_layer'))
179 |             downsample = downsample_avg(
180 |                 **down_kwargs) if avg_down else downsample_conv(**down_kwargs)
181 | 
182 |         block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs)
183 |         blocks = []
184 |         for block_idx in range(num_blocks):
185 |             downsample = downsample if block_idx == 0 else None
186 |             stride = stride if block_idx == 0 else 1
187 |             block_dpr = drop_path_rate * net_block_idx / \
188 |                 (net_num_blocks - 1)  # stochastic depth linear decay rule
189 |             blocks.append(block_fn(
190 |                 inplanes, planes, stride, downsample, first_dilation=prev_dilation,
191 |                 drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs))
192 |             prev_dilation = dilation
193 |             inplanes = planes * block_fn.expansion
194 |             net_block_idx += 1
195 | 
196 |         stages.append((stage_name, nn.Sequential(*blocks)))
197 |         feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name))
198 | 
199 |     return stages, feature_info
200 | 
201 | 
202 | class ResNet(Backbone):
203 |     """ResNet / ResNeXt / SE-ResNeXt / SE-Net
204 | 
205 |     This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that
206 |       * have > 1 stride in the 3x3 conv layer of bottleneck
207 |       * have conv-bn-act ordering
208 | 
209 |     This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s
210 |     variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the
211 |     'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default.
212 | 
213 |     ResNet variants (the same modifications can be used in SE/ResNeXt models as well):
214 |       * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b
215 |       * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64)
216 |       * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample
217 |       * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample
218 |       * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128)
219 |       * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample
220 |       * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample
221 | 
222 |     ResNeXt
223 |       * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths
224 |       * same c,d, e, s variants as ResNet can be enabled
225 | 
226 |     SE-ResNeXt
227 |       * normal - 7x7 stem, stem_width = 64
228 |       * same c, d, e, s variants as ResNet can be enabled
229 | 
230 |     SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64,
231 |         reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block
232 | 
233 |     Parameters
234 |     ----------
235 |     block : Block
236 |         Class for the residual block. Options are BasicBlockGl, BottleneckGl.
237 |     layers : list of int
238 |         Numbers of layers in each block
239 |     num_classes : int, default 1000
240 |         Number of classification classes.
241 |     in_chans : int, default 3
242 |         Number of input (color) channels.
243 |     cardinality : int, default 1
244 |         Number of convolution groups for 3x3 conv in Bottleneck.
245 |     base_width : int, default 64
246 |         Factor determining bottleneck channels. `planes * base_width / 64 * cardinality`
247 |     stem_width : int, default 64
248 |         Number of channels in stem convolutions
249 |     stem_type : str, default ''
250 |         The type of stem:
251 |           * '', default - a single 7x7 conv with a width of stem_width
252 |           * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2
253 |           * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2
254 |     block_reduce_first: int, default 1
255 |         Reduction factor for first convolution output width of residual blocks,
256 |         1 for all archs except senets, where 2
257 |     down_kernel_size: int, default 1
258 |         Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for senets
259 |     avg_down : bool, default False
260 |         Whether to use average pooling for projection skip connection between stages/downsample.
261 |     output_stride : int, default 32
262 |         Set the output stride of the network, 32, 16, or 8. Typically used in segmentation.
263 |     act_layer : nn.Module, activation layer
264 |     norm_layer : nn.Module, normalization layer
265 |     aa_layer : nn.Module, anti-aliasing layer
266 |     drop_rate : float, default 0.
267 |         Dropout probability before classifier, for training
268 |     global_pool : str, default 'avg'
269 |         Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
270 |     """
271 | 
272 |     def __init__(self, block_types, layers, in_chans=3,
273 |                  cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False,
274 |                  output_stride=32, block_reduce_first=1, down_kernel_size=1, avg_down=False,
275 |                  act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_rate=0.0, drop_path_rate=0.,
276 |                  drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None, out_features=None):
277 |         block_args = block_args or dict()
278 |         assert output_stride in (8, 16, 32)
279 |         # self.num_classes = num_classes
280 |         self.drop_rate = drop_rate
281 |         super(ResNet, self).__init__()
282 | 
283 |         # Stem
284 |         deep_stem = 'deep' in stem_type
285 |         inplanes = stem_width * 2 if deep_stem else 64
286 |         if deep_stem:
287 |             stem_chs = (stem_width, stem_width)
288 |             if 'tiered' in stem_type:
289 |                 stem_chs = (3 * (stem_width // 4), stem_width)
290 |             self.conv1 = nn.Sequential(*[
291 |                 nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False),
292 |                 norm_layer(stem_chs[0]),
293 |                 act_layer(inplace=True),
294 |                 nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False),
295 |                 norm_layer(stem_chs[1]),
296 |                 act_layer(inplace=True),
297 |                 nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)])
298 |         else:
299 |             self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7,
300 |                                    stride=2, padding=3, bias=False)
301 |         self.bn1 = norm_layer(inplanes)
302 |         self.act1 = act_layer(inplace=True)
303 |         self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')]
304 | 
305 |         # Stem Pooling
306 |         if replace_stem_pool:
307 |             self.maxpool = nn.Sequential(*filter(None, [
308 |                 nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False),
309 |                 aa_layer(channels=inplanes, stride=2) if aa_layer else None,
310 |                 norm_layer(inplanes),
311 |                 act_layer(inplace=True)
312 |             ]))
313 |         else:
314 |             if aa_layer is not None:
315 |                 self.maxpool = nn.Sequential(*[
316 |                     nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
317 |                     aa_layer(channels=inplanes, stride=2)])
318 |             else:
319 |                 self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
320 | 
321 |         # Feature Blocks
322 |         channels = [64, 128, 256, 512]
323 |         stage_modules, stage_feature_info = make_blocks(
324 |             block_types, channels, layers, inplanes, cardinality=cardinality, base_width=base_width,
325 |             output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down,
326 |             down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer,
327 |             drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args)
328 |         for stage in stage_modules:
329 |             self.add_module(*stage)  # layer1, layer2, etc
330 |         self.feature_info.extend(stage_feature_info)
331 | 
332 |         for n, m in self.named_modules():
333 |             if isinstance(m, nn.BatchNorm2d):
334 |                 nn.init.constant_(m.weight, 1.)
335 |                 nn.init.constant_(m.bias, 0.)
336 |         if zero_init_last_bn:
337 |             for m in self.modules():
338 |                 if hasattr(m, 'zero_init_last_bn'):
339 |                     m.zero_init_last_bn()
340 | 
341 |         out_features_names = ["res2", "res3", "res4", "res5"]
342 |         self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32]))
343 |         self._out_feature_channels = dict(
344 |             zip(out_features_names, [x * BLOCK_TYPE[block_types[0]].expansion for x in [64, 128, 256, 512]]))
345 |         if out_features is None:
346 |             self._out_features = out_features_names
347 |         else:
348 |             self._out_features = out_features
349 | 
350 |     def output_shape(self):
351 |         return {
352 |             name: ShapeSpec(
353 |                 channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
354 |             )
355 |             for name in self._out_features
356 |         }
357 | 
358 |     def size_divisibility(self):
359 |         return 32
360 | 
361 |     def forward(self, x):
362 |         x = self.conv1(x)
363 |         x = self.bn1(x)
364 |         x = self.act1(x)
365 |         x = self.maxpool(x)
366 |         outputs = {}
367 |         x = self.layer1(x)
368 |         # outputs["res2"] = x
369 |         x = self.layer2(x)
370 |         outputs["res3"] = x
371 |         x = self.layer3(x)
372 |         outputs["res4"] = x
373 |         x = self.layer4(x)
374 |         outputs["res5"] = x
375 |         return outputs
376 | 
377 | 
378 | @BACKBONE_REGISTRY.register()
379 | def build_resnet_vd_backbone(cfg, input_shape):
380 | 
381 |     depth = cfg.MODEL.RESNETS.DEPTH
382 |     norm_name = cfg.MODEL.RESNETS.NORM
383 |     if norm_name == "FrozenBN":
384 |         norm = FrozenBatchNorm2d
385 |     elif norm_name == "SyncBN":
386 |         norm = NaiveSyncBatchNorm
387 |     else:
388 |         norm = nn.BatchNorm2d
389 |     if depth == 50:
390 |         layers = [3, 4, 6, 3]
391 |     elif depth == 101:
392 |         layers = [3, 4, 23, 3]
393 |     else:
394 |         raise NotImplementedError()
395 | 
396 |     stage_blocks = []
397 |     use_deformable = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
398 |     for idx in range(4):
399 |         if use_deformable[idx]:
400 |             stage_blocks.append("deform_bottleneck")
401 |         else:
402 |             stage_blocks.append("bottleneck")
403 | 
404 |     model = ResNet(stage_blocks, layers, stem_type="deep",
405 |                    stem_width=32, avg_down=True, norm_layer=norm)
406 |     return model
407 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | <img src="assets/banner.gif">
  3 | <br>
  4 | <br>
  5 | Tianheng Cheng, <a href="https://xwcv.github.io/">Xinggang Wang</a><sup><span>&#8224;</span></sup>, Shaoyu Chen, Wenqiang Zhang, <a href="https://scholar.google.com/citations?user=pCY-bikAAAAJ&hl=zh-CN">Qian Zhang</a>, <a href="https://scholar.google.com/citations?user=IyyEKyIAAAAJ&hl=zh-CN">Chang Huang</a>, <a href="https://zhaoxiangzhang.net/">Zhaoxiang Zhang</a>, <a href="http://eic.hust.edu.cn/professor/liuwenyu/"> Wenyu Liu</a>
  6 | </br>
  7 | (<span>&#8224;</span>: corresponding author)
  8 | 
  9 | <!-- <div><a href="">[Project Page]</a>(comming soon)</div>  -->
 10 | <div>
 11 | <a href="https://arxiv.org/abs/2203.12827">[arXiv paper]</a>
 12 | <a href="https://openaccess.thecvf.com/content/CVPR2022/papers/Cheng_Sparse_Instance_Activation_for_Real-Time_Instance_Segmentation_CVPR_2022_paper.pdf">[CVPR paper]</a>
 13 | <a href="https://drive.google.com/file/d/1xhqQvQ0YVCHd8XQxnCVqef75Hey7kI-d/view?usp=sharing">[slides]</a>
 14 | </div>
 15 | </div>
 16 | 
 17 | 
 18 | 
 19 | ## Highlights 
 20 | 
 21 | <div align="center">
 22 | <img src="assets/animate.gif">
 23 | <br>
 24 | <br>
 25 | <div>
 26 | 
 27 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sparse-instance-activation-for-real-time/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=sparse-instance-activation-for-real-time)
 28 | </div>
 29 | </div>
 30 | 
 31 | 
 32 | 
 33 | * SparseInst presents a new object representation method, *i.e.*, Instance Activation Maps (IAM), to adaptively highlight informative regions of objects for recognition.
 34 | * SparseInst is a simple, efficient, and fully convolutional framework without non-maximum suppression (NMS) or sorting, and easy to deploy!
 35 | * SparseInst achieves good trade-off between speed and accuracy, *e.g.*, 37.9 AP and 40 FPS with 608x input.
 36 | 
 37 | 
 38 | 
 39 | ## Updates
 40 | 
 41 | `This project is under active development, please stay tuned!` &#9749;
 42 | 
 43 | * `[2022-10-31]`: We release the models & weights for the [`CSP-DarkNet53`](configs/sparse_inst_cspdarknet53_giam.yaml) backbone. Which is a strong baseline with highly-competitve inference speed and accuracy.
 44 | 
 45 | * `[2022-10-19]`: We provide the implementation and inference code based on [MindSpore](https://www.mindspore.cn/), a nice and efficient Deep Learning framework. Thanks [Ruiqi Wang](https://github.com/RuiqiWang00) for this kind contribution!
 46 | 
 47 | * `[2022-8-9]`: We provide the FLOPs counter [`get_flops.py`](./tools/get_flops.py) to obtain the FLOPs/Parameters of SparseInst. This update also includes some bugfixs.
 48 | 
 49 | * `[2022-7-17]`: `Faster`&#128640;:  SparseInst now supports [training and inference with **FP16**](https://github.com/hustvl/SparseInst#-sparseinst-with-fp16). Inference with FP16 improves the speed by **30\%**. `Robust`: we replace the `Sigmoid + Norm` with [`Softmax`](configs/sparse_inst_r50_giam_softmax.yaml) for numerical stability, especially for ONNX. `Easy-to-Use`: we provide the [script](./onnx/convert_onnx.py) for exporting SparseInst to ONNX models.
 50 | 
 51 | * `[2022-4-29]`: We fix the **common issue** about the visualization `demo.py`, *e.g.,* `ValueError: GenericMask cannot handle ...`. 
 52 | 
 53 | * `[2022-4-7]`: We provide the `demo` code for visualization and inference on images. Besides, we have added more backbones for SparseInst, including [ResNet-101](https://arxiv.org/abs/1512.03385), [CSPDarkNet](https://arxiv.org/pdf/2004.10934v1.pdf), and [PvTv2](https://arxiv.org/abs/2102.12122). We are still supporting more backbones.
 54 | 
 55 | * `[2022-3-25]`: We have released the code and models for SparseInst! 
 56 | 
 57 |  
 58 | 
 59 | ## Overview
 60 | **SparseInst** is a conceptually novel, efficient, and fully convolutional framework for real-time instance segmentation.
 61 | In contrast to region boxes or anchors (centers), SparseInst adopts a sparse set of **instance activation maps** as object representation, to highlight informative regions for each foreground objects.
 62 | Then it obtains the instance-level features by aggregating features according to the highlighted regions for recognition and segmentation.
 63 | The bipartite matching compels the instance activation maps to predict objects in a one-to-one style, thus avoiding non-maximum suppression (NMS) in post-processing. Owing to the simple yet effective designs with instance activation maps, SparseInst has extremely fast inference speed and achieves **40 FPS** and **37.9 AP** on COCO (NVIDIA 2080Ti), significantly outperforms the counter parts in terms of speed and accuracy.
 64 | 
 65 | 
 66 | <center>
 67 | <img src="./assets/sparseinst.png">
 68 | </center>
 69 | 
 70 | 
 71 | ## Models
 72 | 
 73 | We provide two versions of SparseInst, *i.e.*, the basic IAM (3x3 convolution) and the Group IAM (G-IAM for short), with different backbones.
 74 | All models are trained on MS-COCO *train2017*.
 75 | 
 76 | #### Fast models
 77 | 
 78 | | model | backbone | input | aug | AP<sup>val</sup> |  AP  | FPS | weights |
 79 | | :---- | :------  | :---: | :-: |:--------------: | :--: | :-: | :-----: |
 80 | | [SparseInst](configs/sparse_inst_r50_base.yaml) | [R-50](https://drive.google.com/file/d/1Ee6nPXlj1eewAnooYtoPtLzbRp_mDxfB/view?usp=sharing) | 640 | &#x2718; | 32.8 | 33.2 | 44.3 | [model](https://drive.google.com/file/d/12RQLHD5EZKIOvlqW3avUCeYjFG1NPKDy/view?usp=sharing) |
 81 | | [SparseInst](sparse_inst_r50vd_base.yaml) | [R-50-vd](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 640 | &#x2718; | 34.1 | 34.5 | 42.6 | [model](https://drive.google.com/file/d/1fjPFy35X2iJu3tYwVdAq4Bel82PfH5kx/view?usp=sharing)|
 82 | | [SparseInst (G-IAM)](configs/sparse_inst_r50_giam.yaml) | [R-50](https://drive.google.com/file/d/1Ee6nPXlj1eewAnooYtoPtLzbRp_mDxfB/view?usp=sharing) | 608 | &#x2718; | 33.4 | 34.0 | 44.6 | [model](https://drive.google.com/file/d/1pXU7Dsa1L7nUiLU9ULG2F6Pl5m5NEguL/view?usp=sharing) |
 83 | | [SparseInst (G-IAM, Softmax)](configs/sparse_inst_r50_giam_soft.yaml) | [R-50](https://drive.google.com/file/d/1Ee6nPXlj1eewAnooYtoPtLzbRp_mDxfB/view?usp=sharing) | 608 | &#x2718; | 33.6 | - | 44.6 | [model](https://drive.google.com/file/d/1doterrG89SjmLxDyU8IhLYRGxVH69sR2/view?usp=sharing) |
 84 | | [SparseInst (G-IAM)](configs/sparse_inst_r50_giam_aug.yaml) | [R-50](https://drive.google.com/file/d/1Ee6nPXlj1eewAnooYtoPtLzbRp_mDxfB/view?usp=sharing) | 608 | &#10003; | 34.2 | 34.7 | 44.6 | [model](https://drive.google.com/file/d/1MK8rO3qtA7vN9KVSBdp0VvZHCNq8-bvz/view?usp=sharing) |
 85 | | [SparseInst (G-IAM)](configs/sparse_inst_r50_dcn_giam_aug.yaml) | [R-50-DCN](https://drive.google.com/file/d/1Ee6nPXlj1eewAnooYtoPtLzbRp_mDxfB/view?usp=sharing) | 608 | &#10003;| 36.4 | 36.8 | 41.6 | [model](https://drive.google.com/file/d/1qxdLRRHbIWEwRYn-NPPeCCk6fhBjc946/view?usp=sharing) |
 86 | | [SparseInst (G-IAM)](configs/sparse_inst_r50vd_giam_aug.yaml) | [R-50-vd](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 608 | &#10003;| 35.6 | 36.1 | 42.8| [model](https://drive.google.com/file/d/1dlamg7ych_BdWpPUCuiBXbwE0SXpsfGx/view?usp=sharing) |
 87 | | [SparseInst (G-IAM)](configs/sparse_inst_r50vd_dcn_giam_aug.yaml) | [R-50-vd-DCN](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 608 | &#10003; | 37.4 | 37.9 | 40.0  | [model](https://drive.google.com/file/d/1clYPdCNrDNZLbmlAEJ7wjsrOLn1igOpT/view?usp=sharing)|
 88 | | [SparseInst (G-IAM)](configs/sparse_inst_r50vd_dcn_giam_aug.yaml) | [R-50-vd-DCN](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 640 | &#10003; | 37.7 | 38.1 | 39.3 |  [model](https://drive.google.com/file/d/1clYPdCNrDNZLbmlAEJ7wjsrOLn1igOpT/view?usp=sharing)| 
 89 | 
 90 | #### SparseInst with other backbones
 91 | 
 92 | | model | backbone | input | AP<sup>val</sup> |  AP  | FPS | weights |
 93 | | :---- | :------ | :---: | :--------------: | :--: | :-: | :-----: |
 94 | | SparseInst (G-IAM) | [CSPDarkNet](configs/sparse_inst_cspdarknet53_giam.yaml) | 640 | 35.1 | -| - | [model](https://drive.google.com/file/d/1rcUJWUbusM216Zbtmo_xB774jdjb3qSt/view?usp=sharing) |
 95 | 
 96 | #### Larger models
 97 | 
 98 | | model | backbone | input | aug  | AP<sup>val</sup> |  AP  | FPS | weights |
 99 | | :---- | :------ | :---: | :---: | :--------------: | :--: | :-: | :-----: |
100 | | [SparseInst (G-IAM)](configs/sparse_inst_r101_giam.yaml) | [R-101](https://drive.google.com/file/d/1-6ZBvC55unwuHvGn-Xf4xuy2Qr1vC7Zo/view?usp=sharing) | 640 | &#x2718; | 34.9 | 35.5 | - | [model](https://drive.google.com/file/d/1EZZck-UNfom652iyDhdaGYbxS0MrO__z/view?usp=sharing)|
101 | | [SparseInst (G-IAM)](configs/sparse_inst_r101_dcn_giam.yaml) | [R-101-DCN](https://drive.google.com/file/d/1-6ZBvC55unwuHvGn-Xf4xuy2Qr1vC7Zo/view?usp=sharing) | 640 | &#x2718; | 36.4 | 36.9 | - | [model](https://drive.google.com/file/d/1shkFvyBmDlWRxl1ActD6VfZJTJYBGBjv/view?usp=sharing) |
102 | 
103 | #### SparseInst with Vision Transformers
104 | 
105 | | model | backbone | input | aug | AP<sup>val</sup> |  AP  | FPS | weights |
106 | | :---- | :------ | :---: | :---: | :--------------: | :--: | :-: | :-----: |
107 | | [SparseInst (G-IAM)](configs/sparse_inst_pvt_b1_giam.yaml) | [PVTv2-B1](https://drive.google.com/file/d/1B7JTO0WqyhFn7nvUlRf6qKQrFzTnRWDC/view?usp=sharing) | 640 |  &#x2718; | 35.3 | 36.0 | 33.5 (48.9<sup>&#x021A1;</sup>)| [model](https://drive.google.com/file/d/13l9JgTz3sF6j3vSVHOOhAYJnCf-QuNe_/view?usp=sharing) |
108 | | [SparseInst (G-IAM)](configs/sparse_inst_pvt_b2_li_giam.yaml) | [PVTv2-B2-li](https://drive.google.com/file/d/1YhjCH4FZa9ekWUqa-JovEfAR2wuUXEtQ/view?usp=sharing) | 640 |  &#x2718; | 37.2 | 38.2 | 26.5 | [model](https://drive.google.com/file/d/1DFxQnFg_UL6kmMoNC4StUKo79RXVHyNF/view?usp=sharing) |
109 | 
110 | <sup>&#x021A1;</sup>: measured on RTX 3090.
111 | 
112 | 
113 | **Note:** 
114 | * **We will continue adding more models** including more efficient convolutional networks, vision transformers, and larger models for high performance and high speed, please stay tuned &#128513;!
115 | * Inference speeds are measured on one NVIDIA 2080Ti unless specified.
116 | * We haven't adopt TensorRT or other tools to accelerate the inference of SparseInst. However, we are working on it now and will provide support for ONNX, TensorRT, MindSpore, [Blade](https://github.com/alibaba/BladeDISC), and other frameworks as soon as possible!
117 | * AP denotes AP evaluated on MS-COCO *test-dev2017*
118 | * *input* denotes the shorter side of the input, *e.g.*, 512x864 and 608x864, we keep the aspect ratio of the input and the longer side is no more than 864.
119 | * The inference speed might slightly change on different machines (2080 Ti) and different versions of detectron (we mainly use [v0.3](https://github.com/facebookresearch/detectron2/tree/v0.3)). If the change is sharp, e.g., > 5ms, please feel free to contact us.
120 | * For `aug` (augmentation), we only adopt the simple random crop (crop size: [384, 600]) provided by detectron2.
121 | * We adopt `weight decay=5e-2` as default setting, which is slightly different from the original paper.
122 | * **[Weights on BaiduPan]**: we also provide trained models on BaiduPan: [ShareLink](https://pan.baidu.com/s/1tot7Wcoi4J1xh8ZS7VikZg) (password: lkdo).
123 | 
124 | ## Installation and Prerequisites
125 | 
126 | This project is built upon the excellent framework [detectron2](https://github.com/facebookreseach/detectron2), and you should install detectron2 first, please check [official installation guide](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) for more details.
127 | 
128 | **Updates:** SparseInst works well on [detectron2-v0.6](https://github.com/facebookresearch/detectron2/tree/v0.6). 
129 | 
130 | **Note:** previously, we mainly use [v0.3](https://github.com/facebookresearch/detectron2/tree/v0.3) of detectron2 for experiments and evaluations. Besides, we also test our code on the newest version [v0.6](https://github.com/facebookresearch/detectron2/tree/v0.6). If you find some bugs or incompatibility problems of higher version of detectron2, please feel free to raise a issue!
131 | 
132 | Install the detectron2:
133 | 
134 | ```bash
135 | git clone https://github.com/facebookresearch/detectron2.git
136 | # if you swith to a specific version, e.g., v0.3 (recommended) or v0.6
137 | git checkout tags/v0.6
138 | # build detectron2
139 | python setup.py build develop
140 | ```
141 | 
142 | ## Getting Start
143 | 
144 | 
145 | ### &#128293; SparseInst with FP16
146 | 
147 | SparseInst with FP16 achieves 30% faster inference speed and saves much training memory, we provide some comparisons about the memory, inference speed, and training speed in the below table.
148 | 
149 | |  FP16 | train mem.(log) | train mem.(`nvidia-smi`) | train speed | infer. speed | 
150 | | :---: | :-------------: | :----------------------: | :---------: | :----------: |
151 | | &#x2718; | 6.0G | 10.5G | 0.8690s/iter | 52.17 FPS |
152 | | &#10003; | 3.9G | 6.8G  | 0.6949s/iter | 67.57 FPS |
153 | 
154 | Note: statistics are measured on NVIDIA 3090. With FP16, we have faster training speed and can also increase the batch size for better performance.
155 | 
156 | * Training with FP16: enable FP16 is simple, you only need to enable `SOLVER.AMP.ENABLED=True`, or add this configuration to the config file.
157 | 
158 | ```bash
159 | python tools/train_net.py --config-file configs/sparse_inst_r50_giam_fp16.yaml --num-gpus 8 SOLVER.AMP.ENABLED True
160 | ```
161 | 
162 | * Testing with FP16: enable FP16 for inference by adding `--fp16`.
163 | 
164 | ```bash
165 | python tools/test_net.py --config-file configs/sparse_inst_r50_giam_fp16.yaml --fp16 MODEL.WEIGHTS model_final.pth 
166 | ```
167 | 
168 | ### Testing SparseInst
169 | 
170 | Before testing, you should specify the config file `<CONFIG>` and the model weights `<MODEL-PATH>`. In addition, you can change the input size by setting the `INPUT.MIN_SIZE_TEST` in both config file or commandline.
171 | 
172 | * [Performance Evaluation] To obtain the evaluation results, *e.g.*, mask AP on COCO, you can run:
173 | 
174 | ```bash
175 | python tools/train_net.py --config-file <CONFIG> --num-gpus <GPUS> --eval MODEL.WEIGHTS <MODEL-PATH>
176 | # example:
177 | python tools/train_net.py --config-file configs/sparse_inst_r50_giam.yaml --num-gpus 8 --eval MODEL.WEIGHTS sparse_inst_r50_giam_aug_2b7d68.pth
178 | ```
179 | 
180 | * [Inference Speed] To obtain the inference speed (FPS) on one GPU device, you can run:
181 | 
182 | ```bash
183 | python tools/test_net.py --config-file <CONFIG> MODEL.WEIGHTS <MODEL-PATH> INPUT.MIN_SIZE_TEST 512
184 | # example:
185 | python tools/test_net.py --config-file configs/sparse_inst_r50_giam.yaml MODEL.WEIGHTS sparse_inst_r50_giam_aug_2b7d68.pth INPUT.MIN_SIZE_TEST 512
186 | ```
187 | 
188 | **Note:** 
189 | * The [`tools/test_net.py`](./tools/test_net.py) only supports **1 GPU** and **1 image per batch** for measuring inference speed.
190 | * The inference time consists of the *pure forward time* and the *post-processing time*. While the evaluation processing, data loading, and pre-processing for wrappers (*e.g.*, ImageList) are not included.
191 | * `COCOMaskEvaluator` is modified from [`COCOEvaluator`](https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/coco_evaluation.py) for evaluating mask-only results.
192 | 
193 | ### FLOPs and Parameters
194 | 
195 | The [`get_flops.py`](tools/get_flops.py) is built based on `detectron2` and `fvcore`. 
196 | 
197 | ```bash
198 | python tools/get_flops.py --config-file <CONFIG> --tasks parameter flop
199 | ```
200 | 
201 | ### Visualizing Images with SparseInst
202 | 
203 | To inference or visualize the segmentation results on your images, you can run:
204 | 
205 | ```bash
206 | python demo.py --config-file <CONFIG> --input <IMAGE-PATH> --output results --opts MODEL.WEIGHTS <MODEL-PATH>
207 | # example
208 | python demo.py --config-file configs/sparse_inst_r50_giam.yaml --input datasets/coco/val2017/* --output results --opt MODEL.WEIGHTS sparse_inst_r50_giam_aug_2b7d68.pth INPUT.MIN_SIZE_TEST 512
209 | ```
210 | * Besides, the `demo.py` also supports inference on video (`--video-input`), camera (`--webcam`). For inference on video, you might refer to [issue #9](https://github.com/hustvl/SparseInst/issues/9) to avoid someerrors.
211 | * `--opts` supports modifications to the config-file, *e.g.,* `INPUT.MIN_SIZE_TEST 512`.
212 | * `--input` can be single image or a folder of images, *e.g.,* `xxx/*`.
213 | * If `--output` is not specified, a popup window will show the visualization results for each image.
214 | * Lowering the `confidence-threshold` will show more instances but with more false positives.
215 | 
216 | <div>
217 | <table align="center">
218 | <td><img src="assets/figures/000000006471.jpg" height=200></td>
219 | <td><img src="assets/figures/000000014439.jpg" height=200></td>
220 | </table>
221 | <span><p align="center">Visualization results (SparseInst-R50-GIAM)</p></span>
222 | </div>
223 | 
224 | 
225 | ### Training SparseInst
226 | 
227 | To train the SparseInst model on COCO dataset with 8 GPUs. 8 GPUs are required for the training. If you only have 4 GPUs or GPU memory is limited, it doesn't matter and you can reduce the batch size through `SOLVER.IMS_PER_BATCH` or reduce the input size. If you adjust the batch size, learning schedule should be adjusted according to the linear scaling rule.
228 | 
229 | ```bash
230 | python tools/train_net.py --config-file <CONFIG> --num-gpus 8 
231 | # example
232 | python tools/train_net.py --config-file configs/sparse_inst_r50vd_dcn_giam_aug.yaml --num-gpus 8
233 | ```
234 | 
235 | 
236 | <!-- ### ONNX Export -->
237 | 
238 | 
239 | ### Custom Training of SparseInst
240 | 
241 | 1. We suggest you convert your custom datasets into the `COCO` format, which enables the usage of the default dataset mappers and loaders. You may find more details in the [official guide of detectron2](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html#register-a-coco-format-dataset).
242 | 2. You need to check whether `NUM_CLASSES` and `NUM_MASKS` should be changed according to your scenarios or tasks.
243 | 3. Change the configurations accordingly.
244 | 4. After finishing the above procedures, you can easily train SparseInst by `train_net.py`.
245 | 
246 | 
247 | ## Acknowledgements
248 | 
249 | SparseInst is based on [detectron2](https://github.com/facebookresearch/detectron2), [OneNet](https://github.com/PeizeSun/OneNet), [DETR](https://github.com/facebookresearch/detr), and [timm](https://github.com/rwightman/pytorch-image-models), and we sincerely thanks for their code and contribution to the community!
250 | 
251 | 
252 | ## Citing SparseInst
253 | 
254 | If you find SparseInst is useful in your research or applications, please consider giving us a star &#127775; and citing SparseInst by the following BibTeX entry.
255 | 
256 | ```BibTeX
257 | @inproceedings{Cheng2022SparseInst,
258 |   title     =   {Sparse Instance Activation for Real-Time Instance Segmentation},
259 |   author    =   {Cheng, Tianheng and Wang, Xinggang and Chen, Shaoyu and Zhang, Wenqiang and Zhang, Qian and Huang, Chang and Zhang, Zhaoxiang and Liu, Wenyu},
260 |   booktitle =   {Proc. IEEE Conf. Computer Vision and Pattern Recognition (CVPR)},
261 |   year      =   {2022}
262 | }
263 | 
264 | ```
265 | 
266 | 
267 | ## License
268 | 
269 | SparseInst is released under the [MIT Licence](LICENCE).
270 | 


--------------------------------------------------------------------------------
/sparseinst/backbones/cspnet.py:
--------------------------------------------------------------------------------
  1 | from functools import partial
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | 
  6 | from timm.models.layers import create_conv2d, create_act_layer
  7 | from timm.models.layers import DropPath, AvgPool2dSame, create_attn
  8 | 
  9 | 
 10 | from detectron2.layers import ShapeSpec, FrozenBatchNorm2d
 11 | from detectron2.modeling import Backbone, BACKBONE_REGISTRY
 12 | 
 13 | 
 14 | model_cfgs = dict(
 15 |     cspresnet50=dict(
 16 |         stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'),
 17 |         stage=dict(
 18 |             out_chs=(128, 256, 512, 1024),
 19 |             depth=(3, 3, 5, 2),
 20 |             stride=(1,) + (2,) * 3,
 21 |             exp_ratio=(2.,) * 4,
 22 |             bottle_ratio=(0.5,) * 4,
 23 |             block_ratio=(1.,) * 4,
 24 |             cross_linear=True,
 25 |         )
 26 |     ),
 27 |     cspresnet50d=dict(
 28 |         stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'),
 29 |         stage=dict(
 30 |             out_chs=(128, 256, 512, 1024),
 31 |             depth=(3, 3, 5, 2),
 32 |             stride=(1,) + (2,) * 3,
 33 |             exp_ratio=(2.,) * 4,
 34 |             bottle_ratio=(0.5,) * 4,
 35 |             block_ratio=(1.,) * 4,
 36 |             cross_linear=True,
 37 |         )
 38 |     ),
 39 |     cspresnet50w=dict(
 40 |         stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'),
 41 |         stage=dict(
 42 |             out_chs=(256, 512, 1024, 2048),
 43 |             depth=(3, 3, 5, 2),
 44 |             stride=(1,) + (2,) * 3,
 45 |             exp_ratio=(1.,) * 4,
 46 |             bottle_ratio=(0.25,) * 4,
 47 |             block_ratio=(0.5,) * 4,
 48 |             cross_linear=True,
 49 |         )
 50 |     ),
 51 |     cspresnext50=dict(
 52 |         stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'),
 53 |         stage=dict(
 54 |             out_chs=(256, 512, 1024, 2048),
 55 |             depth=(3, 3, 5, 2),
 56 |             stride=(1,) + (2,) * 3,
 57 |             groups=(32,) * 4,
 58 |             exp_ratio=(1.,) * 4,
 59 |             bottle_ratio=(1.,) * 4,
 60 |             block_ratio=(0.5,) * 4,
 61 |             cross_linear=True,
 62 |         )
 63 |     ),
 64 |     cspdarknet53=dict(
 65 |         stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''),
 66 |         stage=dict(
 67 |             out_chs=(64, 128, 256, 512, 1024),
 68 |             depth=(1, 2, 8, 8, 4),
 69 |             stride=(2,) * 5,
 70 |             exp_ratio=(2.,) + (1.,) * 4,
 71 |             bottle_ratio=(0.5,) + (1.0,) * 4,
 72 |             block_ratio=(1.,) + (0.5,) * 4,
 73 |             down_growth=True,
 74 |         )
 75 |     ),
 76 |     darknet53=dict(
 77 |         stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''),
 78 |         stage=dict(
 79 |             out_chs=(64, 128, 256, 512, 1024),
 80 |             depth=(1, 2, 8, 8, 4),
 81 |             stride=(2,) * 5,
 82 |             bottle_ratio=(0.5,) * 5,
 83 |             block_ratio=(1.,) * 5,
 84 |         )
 85 |     )
 86 | )
 87 | 
 88 | class ConvBnAct(nn.Module):
 89 |     def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1,
 90 |                  bias=False, apply_act=True, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, aa_layer=None,
 91 |                  drop_block=None):
 92 |         super(ConvBnAct, self).__init__()
 93 |         use_aa = aa_layer is not None
 94 | 
 95 |         self.conv = create_conv2d(
 96 |             in_channels, out_channels, kernel_size, stride=1 if use_aa else stride,
 97 |             padding=padding, dilation=dilation, groups=groups, bias=bias)
 98 | 
 99 |         # NOTE for backwards compatibility with models that use separate norm and act layer definitions
100 |         self.bn = norm_layer(out_channels)
101 |         self.act = act_layer()
102 |         self.aa = aa_layer(
103 |             channels=out_channels) if stride == 2 and use_aa else None
104 | 
105 |     @property
106 |     def in_channels(self):
107 |         return self.conv.in_channels
108 | 
109 |     @property
110 |     def out_channels(self):
111 |         return self.conv.out_channels
112 | 
113 |     def forward(self, x):
114 |         x = self.conv(x)
115 |         x = self.bn(x)
116 |         x = self.act(x)
117 |         if self.aa is not None:
118 |             x = self.aa(x)
119 |         return x
120 | 
121 | 
122 | def create_stem(
123 |         in_chans=3, out_chs=32, kernel_size=3, stride=2, pool='',
124 |         act_layer=None, norm_layer=None, aa_layer=None):
125 |     stem = nn.Sequential()
126 |     if not isinstance(out_chs, (tuple, list)):
127 |         out_chs = [out_chs]
128 |     assert len(out_chs)
129 |     in_c = in_chans
130 |     for i, out_c in enumerate(out_chs):
131 |         conv_name = f'conv{i + 1}'
132 |         stem.add_module(conv_name, ConvBnAct(
133 |             in_c, out_c, kernel_size, stride=stride if i == 0 else 1,
134 |             act_layer=act_layer, norm_layer=norm_layer))
135 |         in_c = out_c
136 |         last_conv = conv_name
137 |     if pool:
138 |         if aa_layer is not None:
139 |             stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
140 |             stem.add_module('aa', aa_layer(channels=in_c, stride=2))
141 |         else:
142 |             stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
143 |     return stem, dict(num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv]))
144 | 
145 | 
146 | class ResBottleneck(nn.Module):
147 |     """ ResNe(X)t Bottleneck Block
148 |     """
149 | 
150 |     def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.25, groups=1,
151 |                  act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_last=False,
152 |                  attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
153 |         super(ResBottleneck, self).__init__()
154 |         mid_chs = int(round(out_chs * bottle_ratio))
155 |         ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer,
156 |                        aa_layer=aa_layer, drop_block=drop_block)
157 | 
158 |         self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
159 |         self.conv2 = ConvBnAct(mid_chs, mid_chs, kernel_size=3,
160 |                                dilation=dilation, groups=groups, **ckwargs)
161 |         self.attn2 = create_attn(attn_layer, channels=mid_chs) if not attn_last else None
162 |         self.conv3 = ConvBnAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs)
163 |         self.attn3 = create_attn(attn_layer, channels=out_chs) if attn_last else None
164 |         self.drop_path = drop_path
165 |         self.act3 = act_layer(inplace=True)
166 | 
167 |     def zero_init_last_bn(self):
168 |         nn.init.zeros_(self.conv3.bn.weight)
169 | 
170 |     def forward(self, x):
171 |         shortcut = x
172 |         x = self.conv1(x)
173 |         x = self.conv2(x)
174 |         if self.attn2 is not None:
175 |             x = self.attn2(x)
176 |         x = self.conv3(x)
177 |         if self.attn3 is not None:
178 |             x = self.attn3(x)
179 |         if self.drop_path is not None:
180 |             x = self.drop_path(x)
181 |         x = x + shortcut
182 |         # FIXME partial shortcut needed if first block handled as per original, not used for my current impl
183 |         #x[:, :shortcut.size(1)] += shortcut
184 |         x = self.act3(x)
185 |         return x
186 | 
187 | 
188 | class DarkBlock(nn.Module):
189 |     """ DarkNet Block
190 |     """
191 | 
192 |     def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.5, groups=1,
193 |                  act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None,
194 |                  drop_block=None, drop_path=None):
195 |         super(DarkBlock, self).__init__()
196 |         mid_chs = int(round(out_chs * bottle_ratio))
197 |         ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer,
198 |                        aa_layer=aa_layer, drop_block=drop_block)
199 |         self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
200 |         self.conv2 = ConvBnAct(mid_chs, out_chs, kernel_size=3,
201 |                                dilation=dilation, groups=groups, **ckwargs)
202 |         self.attn = create_attn(attn_layer, channels=out_chs)
203 |         self.drop_path = drop_path
204 | 
205 |     def zero_init_last_bn(self):
206 |         nn.init.zeros_(self.conv2.bn.weight)
207 | 
208 |     def forward(self, x):
209 |         shortcut = x
210 |         x = self.conv1(x)
211 |         x = self.conv2(x)
212 |         if self.attn is not None:
213 |             x = self.attn(x)
214 |         if self.drop_path is not None:
215 |             x = self.drop_path(x)
216 |         x = x + shortcut
217 |         return x
218 | 
219 | 
220 | class CrossStage(nn.Module):
221 |     """Cross Stage."""
222 | 
223 |     def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., exp_ratio=1.,
224 |                  groups=1, first_dilation=None, down_growth=False, cross_linear=False, block_dpr=None,
225 |                  block_fn=ResBottleneck, **block_kwargs):
226 |         super(CrossStage, self).__init__()
227 |         first_dilation = first_dilation or dilation
228 |         down_chs = out_chs if down_growth else in_chs  # grow downsample channels to output channels
229 |         exp_chs = int(round(out_chs * exp_ratio))
230 |         block_out_chs = int(round(out_chs * block_ratio))
231 |         conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'),
232 |                            norm_layer=block_kwargs.get('norm_layer'))
233 | 
234 |         if stride != 1 or first_dilation != dilation:
235 |             self.conv_down = ConvBnAct(
236 |                 in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
237 |                 aa_layer=block_kwargs.get('aa_layer', None), **conv_kwargs)
238 |             prev_chs = down_chs
239 |         else:
240 |             self.conv_down = None
241 |             prev_chs = in_chs
242 | 
243 |         # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also,
244 |         # there is also special case for the first stage for some of the model that results in uneven split
245 |         # across the two paths. I did it this way for simplicity for now.
246 |         self.conv_exp = ConvBnAct(prev_chs, exp_chs, kernel_size=1,
247 |                                   apply_act=not cross_linear, **conv_kwargs)
248 |         prev_chs = exp_chs // 2  # output of conv_exp is always split in two
249 | 
250 |         self.blocks = nn.Sequential()
251 |         for i in range(depth):
252 |             drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
253 |             self.blocks.add_module(str(i), block_fn(
254 |                 prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
255 |             prev_chs = block_out_chs
256 | 
257 |         # transition convs
258 |         self.conv_transition_b = ConvBnAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs)
259 |         self.conv_transition = ConvBnAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)
260 | 
261 |     def forward(self, x):
262 |         if self.conv_down is not None:
263 |             x = self.conv_down(x)
264 |         x = self.conv_exp(x)
265 |         split = x.shape[1] // 2
266 |         xs, xb = x[:, :split], x[:, split:]
267 |         xb = self.blocks(xb)
268 |         xb = self.conv_transition_b(xb).contiguous()
269 |         out = self.conv_transition(torch.cat([xs, xb], dim=1))
270 |         return out
271 | 
272 | 
273 | class DarkStage(nn.Module):
274 |     """DarkNet stage."""
275 | 
276 |     def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., groups=1,
277 |                  first_dilation=None, block_fn=ResBottleneck, block_dpr=None, **block_kwargs):
278 |         super(DarkStage, self).__init__()
279 |         first_dilation = first_dilation or dilation
280 | 
281 |         self.conv_down = ConvBnAct(
282 |             in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
283 |             act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'),
284 |             aa_layer=block_kwargs.get('aa_layer', None))
285 | 
286 |         prev_chs = out_chs
287 |         block_out_chs = int(round(out_chs * block_ratio))
288 |         self.blocks = nn.Sequential()
289 |         for i in range(depth):
290 |             drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
291 |             self.blocks.add_module(str(i), block_fn(
292 |                 prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
293 |             prev_chs = block_out_chs
294 | 
295 |     def forward(self, x):
296 |         x = self.conv_down(x)
297 |         x = self.blocks(x)
298 |         return x
299 | 
300 | 
301 | def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32, drop_path_rate=0.):
302 |     # get per stage args for stage and containing blocks, calculate strides to meet target output_stride
303 |     num_stages = len(cfg['depth'])
304 |     if 'groups' not in cfg:
305 |         cfg['groups'] = (1,) * num_stages
306 |     if 'down_growth' in cfg and not isinstance(cfg['down_growth'], (list, tuple)):
307 |         cfg['down_growth'] = (cfg['down_growth'],) * num_stages
308 |     if 'cross_linear' in cfg and not isinstance(cfg['cross_linear'], (list, tuple)):
309 |         cfg['cross_linear'] = (cfg['cross_linear'],) * num_stages
310 |     cfg['block_dpr'] = [None] * num_stages if not drop_path_rate else \
311 |         [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg['depth'])).split(cfg['depth'])]
312 |     stage_strides = []
313 |     stage_dilations = []
314 |     stage_first_dilations = []
315 |     dilation = 1
316 |     for cfg_stride in cfg['stride']:
317 |         stage_first_dilations.append(dilation)
318 |         if curr_stride >= output_stride:
319 |             dilation *= cfg_stride
320 |             stride = 1
321 |         else:
322 |             stride = cfg_stride
323 |             curr_stride *= stride
324 |         stage_strides.append(stride)
325 |         stage_dilations.append(dilation)
326 |     cfg['stride'] = stage_strides
327 |     cfg['dilation'] = stage_dilations
328 |     cfg['first_dilation'] = stage_first_dilations
329 |     stage_args = [dict(zip(cfg.keys(), values)) for values in zip(*cfg.values())]
330 |     return stage_args
331 | 
332 | 
333 | class CSPNet(Backbone):
334 |     """Cross Stage Partial base model.
335 | 
336 |     Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
337 |     Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks
338 | 
339 |     NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the
340 |     darknet impl. I did it this way for simplicity and less special cases.
341 |     """
342 | 
343 |     def __init__(self, cfg, in_chans=3, output_stride=32, global_pool='avg', drop_rate=0.,
344 |                  act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_path_rate=0.,
345 |                  zero_init_last_bn=True, stage_fn=CrossStage, block_fn=ResBottleneck, out_features=None):
346 |         super().__init__()
347 |         self.drop_rate = drop_rate
348 |         assert output_stride in (8, 16, 32)
349 |         layer_args = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer)
350 | 
351 |         # Construct the stem
352 |         self.stem, stem_feat_info = create_stem(in_chans, **cfg['stem'], **layer_args)
353 |         self.feature_info = [stem_feat_info]
354 |         prev_chs = stem_feat_info['num_chs']
355 |         curr_stride = stem_feat_info['reduction']  # reduction does not include pool
356 |         if cfg['stem']['pool']:
357 |             curr_stride *= 2
358 | 
359 |         # Construct the stages
360 |         per_stage_args = _cfg_to_stage_args(
361 |             cfg['stage'], curr_stride=curr_stride, output_stride=output_stride, drop_path_rate=drop_path_rate)
362 |         self.stages = nn.Sequential()
363 |         out_channels = []
364 |         out_strides = []
365 |         for i, sa in enumerate(per_stage_args):
366 |             self.stages.add_module(
367 |                 str(i), stage_fn(prev_chs, **sa, **layer_args, block_fn=block_fn))
368 |             prev_chs = sa['out_chs']
369 |             curr_stride *= sa['stride']
370 |             self.feature_info += [dict(num_chs=prev_chs,
371 |                                        reduction=curr_stride, module=f'stages.{i}')]
372 |             out_channels.append(prev_chs)
373 |             out_strides.append(curr_stride)
374 | 
375 |         for m in self.modules():
376 |             if isinstance(m, nn.Conv2d):
377 |                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
378 |             elif isinstance(m, nn.BatchNorm2d):
379 |                 nn.init.ones_(m.weight)
380 |                 nn.init.zeros_(m.bias)
381 |             elif isinstance(m, nn.Linear):
382 |                 nn.init.normal_(m.weight, mean=0.0, std=0.01)
383 |                 nn.init.zeros_(m.bias)
384 |         if zero_init_last_bn:
385 |             for m in self.modules():
386 |                 if hasattr(m, 'zero_init_last_bn'):
387 |                     m.zero_init_last_bn()
388 | 
389 |         # cspdarknet: csp1, csp2, csp3, csp4
390 |         # cspresnet: csp0, csp1, csp2, csp3
391 |         out_features_names = ["csp{}".format(i) for i in range(len(per_stage_args))]
392 |         self._out_feature_strides = dict(zip(out_features_names, out_strides))
393 |         self._out_feature_channels = dict(zip(out_features_names, out_channels))
394 |         if out_features is None:
395 |             self._out_features = out_features_names
396 |         else:
397 |             self._out_features = out_features
398 | 
399 |     def output_shape(self):
400 |         return {
401 |             name: ShapeSpec(
402 |                 channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
403 |             )
404 |             for name in self._out_features
405 |         }
406 | 
407 |     def size_divisibility(self):
408 |         return 32
409 | 
410 |     def forward(self, x):
411 |         x = self.stem(x)
412 |         outputs = {}
413 |         for i, stage in enumerate(self.stages):
414 |             name = f"csp{i}"
415 |             x = stage(x)
416 |             if name in self._out_features:
417 |                 outputs[name] = x
418 |         return outputs
419 | 
420 | 
421 | @BACKBONE_REGISTRY.register()
422 | def build_cspnet_backbone(cfg, input_shape=None):
423 | 
424 |     cspnet_name = cfg.MODEL.CSPNET.NAME
425 |     norm_name = cfg.MODEL.CSPNET.NORM
426 |     out_features = cfg.MODEL.CSPNET.OUT_FEATURES
427 |     # DarkNet53 doesn't have batch norm
428 |     if norm_name == "FrozenBN":
429 |         norm = FrozenBatchNorm2d
430 |     elif norm_name == "SyncBN":
431 |         norm = nn.SyncBatchNorm
432 |     else:
433 |         norm = nn.BatchNorm2d
434 | 
435 |     assert cspnet_name in ["cspresnet50", "cspresnet50d", "cspresnet50w",
436 |                            "cspresnext50", "cspdarknet53", "darknet53"]
437 | 
438 |     model_cfg = model_cfgs[cspnet_name]
439 | 
440 |     if "darknet" in cspnet_name:
441 |         block_fn = DarkBlock
442 |     else:
443 |         block_fn = ResBottleneck
444 | 
445 |     if cspnet_name == "darknet53":
446 |         stage_fn = DarkStage
447 |     else:
448 |         stage_fn = CrossStage
449 | 
450 |     model = CSPNet(
451 |         model_cfg,
452 |         in_chans=input_shape.channels,
453 |         norm_layer=norm,
454 |         stage_fn=stage_fn,
455 |         block_fn=block_fn,
456 |         out_features=out_features)
457 |     return model
458 | 


--------------------------------------------------------------------------------