├── .gitignore ├── src │ ├── configs │ │ ├── CondInst_P0_1x.yaml │ │ ├── CondInst_fine_tuning_30k.yaml │ │ └── point_selection_from_CondInst.yaml │ ├── register_point_annotations.py │ ├── train_net_point.py │ └── condinst │ ├── dynamic_mask_head.py │ ├── standard │ │ ├── dynamic_mask_head.py │ │ └── condinst.py │ ├── Entropy │ │ ├── dynamic_mask_head.py │ │ └── condinst.py │ └── condinst.py ├── scripts │ ├── prepare.sh │ ├── random.py │ ├── initialization.py │ └── entropy.py ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /src/configs/CondInst_P0_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../MS_R_50_1x.yaml" 2 | 3 | MODEL: 4 | BOXINST: 5 | POINT_LOSS_WEIGHT: 0.1 6 | DATASETS: 7 | TRAIN: ("coco_2017_train_points_n1",) 8 | INPUT: 9 | POINT_SUP: True -------------------------------------------------------------------------------- /src/configs/CondInst_fine_tuning_30k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../MS_R_50_1x.yaml" 2 | 3 | MODEL: 4 | WEIGHTS: "" 5 | BOXINST: 6 | POINT_LOSS_WEIGHT: 0.1 7 | DATASETS: 8 | TRAIN: ("coco_2017_train_points_n1",) 9 | INPUT: 10 | POINT_SUP: True 11 | SOLVER: 12 | IMS_PER_BATCH: 16 13 | BASE_LR: 0.01 14 | STEPS: (10000, 20000) 15 | MAX_ITER: 30000 -------------------------------------------------------------------------------- /src/configs/point_selection_from_CondInst.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "../MS_R_50_1x.yaml" 2 | 3 | MODEL: 4 | WEIGHTS: "" 5 | SOLVER: 6 | IMS_PER_BATCH: 8 7 | BASE_LR: 0.0 # stop updating the model 8 | MAX_ITER: 14800 # 14800*8 = 118400 ≈ len(train2017) (118287 images) 9 | INPUT: 10 | MIN_SIZE_TRAIN: (800,) 11 | RANDOM_FLIP: none 12 | HFLIP_TRAIN: False 13 | 14 | # Currently, the point selection process is implemented by 15 | # forwarding the model separately after training; this 16 | # process could be accelerated by selecting during the last epoch. -------------------------------------------------------------------------------- /scripts/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export ROOT_PATH=your_root_path # modify (no spaces around "=" in bash) 4 | cd $ROOT_PATH 5 | 6 | # copy source files 7 | cp APIS/src/train_net_point.py AdelaiDet/tools/ 8 | cp APIS/src/register_point_annotations.py detectron2/projects/PointSup/point_sup/ 9 | cp -r APIS/src/condinst/* AdelaiDet/adet/modeling/condinst/ # -r so the standard/ and Entropy/ subdirectories are copied too 10 | mkdir -p AdelaiDet/configs/CondInst/APIS 11 | cp APIS/src/configs/* AdelaiDet/configs/CondInst/APIS/ 12 | 13 | cd detectron2 && export DETECTRON2_DATASETS=$ROOT_PATH/AdelaiDet/datasets && cd ..
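# the datasets dir above assumes detectron2's standard COCO layout, e.g. (illustrative):
#   $DETECTRON2_DATASETS/coco/annotations/instances_train2017.json
#   $DETECTRON2_DATASETS/coco/train2017/  $DETECTRON2_DATASETS/coco/val2017/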
14 | 15 | # generate random points (see PointSup for details) 16 | python detectron2/projects/PointSup/tools/prepare_coco_point_annotations_without_masks.py 10 17 | 18 | mkdir -p AdelaiDet/models -------------------------------------------------------------------------------- /scripts/random.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.chdir('./AdelaiDet') # os.system('cd ...') would not change the cwd of this process 4 | 5 | # step=2 is P1 in the paper 6 | for step in range(2, 11): 7 | pre_step = step - 1 8 | 9 | ######################################################################################################### 10 | # train with the selected points 11 | ######################################################################################################### 12 | 13 | strs = (f'OMP_NUM_THREADS=1 python tools/train_net_point.py \ 14 | --config-file configs/CondInst/APIS/CondInst_fine_tuning_30k.yaml \ 15 | --num-gpus 8 \ 16 | DATASETS.TRAIN "\'coco_2017_train_points_n{step}_random\'," \ 17 | MODEL.WEIGHTS models/model_{pre_step}.pth \ 18 | MODEL.BOXINST.POINT_LOSS_WEIGHT 1.0 \ 19 | OUTPUT_DIR training_dir/random_logs_n{step}') 20 | os.system(strs) 21 | os.system(f'cp training_dir/random_logs_n{step}/model_final.pth models/model_{step}.pth') -------------------------------------------------------------------------------- /scripts/initialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import copy 4 | 5 | os.chdir('./AdelaiDet') # os.system('cd ...') would not change the cwd of this process 6 | 7 | # generate random points for p1~p9 8 | # p(i) should be a subset of p(j) if i < j -------------------------------------------------------------------------------- /src/condinst/dynamic_mask_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from torch import nn 4 | 5 | from detectron2.utils.events import get_event_storage 6 | 7 | from adet.utils.comm import compute_locations, aligned_bilinear 8 | 9 | from detectron2.projects.point_rend.point_features import point_sample 10 | from detectron2.projects.point_rend.point_head import roi_mask_point_loss 11 | 12 | from detectron2.layers import cat 13 | 14 | SMOOTH = 1e-6 15 | def get_mask_iou(outputs, labels): 16 | outputs = (outputs>0.5).clone().detach().long() 17 | labels = labels.clone().detach().long() 18 | intersection = (outputs & labels).float().sum((2, 3)) 19 | union = (outputs | labels).float().sum((2, 3)) 20 | iou = (intersection + SMOOTH) / (union + SMOOTH) 21 | return iou.mean() 22 | 23 | # from detectron2.projects.point_sup.point_utils import get_point_coords_from_point_annotation 24 | def get_point_coords_from_point_annotation(instances, gt_inds, image_wh): 25 | # re-designed for condinst, image_wh (width, height) should be the padded size 26 | gt_point_coords = [] 27 | gt_point_labels = [] 28 | 29 | for per_im in instances: 30 | gt_point_coords.append(per_im.gt_point_coords.to(torch.float32) / image_wh) 31 | gt_point_labels.append(per_im.gt_point_labels.to(torch.float32).clone()) 32 | 33 | gt_point_coords = torch.cat(gt_point_coords) 34 | gt_point_labels = torch.cat(gt_point_labels) 35 | 36 | gt_point_coords = gt_point_coords[gt_inds] 37 | gt_point_labels = gt_point_labels[gt_inds] 38 | 39 | return gt_point_coords, gt_point_labels 40 | 41 | 42 | def compute_project_term(mask_scores, gt_bitmasks): 43 | mask_losses_y = dice_coefficient( 44 | mask_scores.max(dim=2, keepdim=True)[0], 45 | gt_bitmasks.max(dim=2, keepdim=True)[0] 46 | ) 47 | mask_losses_x = dice_coefficient( 48 | mask_scores.max(dim=3, keepdim=True)[0], 49 | gt_bitmasks.max(dim=3, keepdim=True)[0] 50 | ) 51 | return (mask_losses_x + mask_losses_y).mean() 52 | 53 | 54 | def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation): 55 | assert mask_logits.dim() == 4 56 | 57 | log_fg_prob = F.logsigmoid(mask_logits) 58 | log_bg_prob = F.logsigmoid(-mask_logits) 59 | 60 | from adet.modeling.condinst.condinst import unfold_wo_center 61 | log_fg_prob_unfold = unfold_wo_center( 62 | log_fg_prob, kernel_size=pairwise_size, 63 | dilation=pairwise_dilation 64 | ) 65 | log_bg_prob_unfold = unfold_wo_center( 66 | log_bg_prob,
kernel_size=pairwise_size, 67 | dilation=pairwise_dilation 68 | ) 69 | 70 | # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) 71 | # we compute the probability in log space to avoid numerical instability 72 | log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold 73 | log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold 74 | 75 | max_ = torch.max(log_same_fg_prob, log_same_bg_prob) 76 | log_same_prob = torch.log( 77 | torch.exp(log_same_fg_prob - max_) + 78 | torch.exp(log_same_bg_prob - max_) 79 | ) + max_ 80 | 81 | # loss = -log(prob) 82 | return -log_same_prob[:, 0] 83 | 84 | 85 | def dice_coefficient(x, target): 86 | eps = 1e-5 87 | n_inst = x.size(0) 88 | x = x.reshape(n_inst, -1) 89 | target = target.reshape(n_inst, -1) 90 | intersection = (x * target).sum(dim=1) 91 | union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps 92 | loss = 1. - (2 * intersection / union) 93 | return loss 94 | 95 | 96 | def parse_dynamic_params(params, channels, weight_nums, bias_nums): 97 | assert params.dim() == 2 98 | assert len(weight_nums) == len(bias_nums) 99 | assert params.size(1) == sum(weight_nums) + sum(bias_nums) 100 | 101 | num_insts = params.size(0) 102 | num_layers = len(weight_nums) 103 | 104 | params_splits = list(torch.split_with_sizes( 105 | params, weight_nums + bias_nums, dim=1 106 | )) 107 | 108 | weight_splits = params_splits[:num_layers] 109 | bias_splits = params_splits[num_layers:] 110 | 111 | for l in range(num_layers): 112 | if l < num_layers - 1: 113 | # out_channels x in_channels x 1 x 1 114 | weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1) 115 | bias_splits[l] = bias_splits[l].reshape(num_insts * channels) 116 | else: 117 | # out_channels x in_channels x 1 x 1 118 | weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1) 119 | bias_splits[l] = bias_splits[l].reshape(num_insts) 120 | 121 | return weight_splits, bias_splits 122 | 123 | 124 | def build_dynamic_mask_head(cfg): 125 | return DynamicMaskHead(cfg) 126 | 127 | 128 | class DynamicMaskHead(nn.Module): 129 | def __init__(self, cfg): 130 | super(DynamicMaskHead, self).__init__() 131 | self.num_layers = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS 132 | self.channels = cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS 133 | self.in_channels = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS 134 | self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE 135 | self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS 136 | 137 | soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST 138 | self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2])) 139 | 140 | # boxinst configs 141 | self.boxinst_enabled = cfg.MODEL.BOXINST.ENABLED 142 | self.bottom_pixels_removed = cfg.MODEL.BOXINST.BOTTOM_PIXELS_REMOVED 143 | self.pairwise_size = cfg.MODEL.BOXINST.PAIRWISE.SIZE 144 | self.pairwise_dilation = cfg.MODEL.BOXINST.PAIRWISE.DILATION 145 | self.pairwise_color_thresh = cfg.MODEL.BOXINST.PAIRWISE.COLOR_THRESH 146 | self._warmup_iters = cfg.MODEL.BOXINST.PAIRWISE.WARMUP_ITERS 147 | 148 | # pointsup configs 149 | self.point_sup_enabled = cfg.INPUT.POINT_SUP 150 | self.point_loss_weight = cfg.MODEL.BOXINST.POINT_LOSS_WEIGHT 151 | 152 | weight_nums, bias_nums = [], [] 153 | for l in range(self.num_layers): 154 | if l == 0: 155 | if not self.disable_rel_coords: 156 | weight_nums.append((self.in_channels + 2) * self.channels) 157 | else: 158 | weight_nums.append(self.in_channels * self.channels) 159 | bias_nums.append(self.channels) 160 | elif l == self.num_layers - 1: 161 | weight_nums.append(self.channels * 1) 162 | bias_nums.append(1) 163 | else: 164 | weight_nums.append(self.channels * self.channels) 165 | bias_nums.append(self.channels) 166 | 167 | self.weight_nums = weight_nums 168 | self.bias_nums = bias_nums 169 | self.num_gen_params = sum(weight_nums) + sum(bias_nums) 170 | 171 | self.register_buffer("_iter", torch.zeros([1])) 172 | 173 | def mask_heads_forward(self, features, weights, biases, num_insts): 174 | ''' 175 | :param features: 176 | :param weights: [w0, w1, ...] 177 | :param biases: [b0, b1, ...] 178 | :return: 179 | ''' 180 | assert features.dim() == 4 181 | n_layers = len(weights) 182 | x = features 183 | for i, (w, b) in enumerate(zip(weights, biases)): 184 | x = F.conv2d( 185 | x, w, bias=b, 186 | stride=1, padding=0, 187 | groups=num_insts 188 | ) 189 | if i < n_layers - 1: 190 | x = F.relu(x) 191 | return x 192 | 193 | def mask_heads_forward_with_coords( 194 | self, mask_feats, mask_feat_stride, instances 195 | ): 196 | locations = compute_locations( 197 | mask_feats.size(2), mask_feats.size(3), 198 | stride=mask_feat_stride, device=mask_feats.device 199 | ) 200 | n_inst = len(instances) 201 | 202 | im_inds = instances.im_inds 203 | mask_head_params = instances.mask_head_params 204 | 205 | N, _, H, W = mask_feats.size() 206 | 207 | if not self.disable_rel_coords: 208 | instance_locations = instances.locations 209 | relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2) 210 | relative_coords = relative_coords.permute(0, 2, 1).float() 211 | soi = self.sizes_of_interest.float()[instances.fpn_levels] 212 | relative_coords = relative_coords / soi.reshape(-1, 1, 1) 213 | relative_coords = relative_coords.to(dtype=mask_feats.dtype) 214 | 215 | mask_head_inputs = torch.cat([ 216 | relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W) 217 | ], dim=1) 218 | else: 219 | mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W) 220 | 221 | mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W) 222 | 223 | weights, biases = parse_dynamic_params( 224 | mask_head_params, self.channels, 225 | self.weight_nums, self.bias_nums 226 | ) 227 | 228 | mask_logits = self.mask_heads_forward(mask_head_inputs, weights, biases, n_inst) 229 | 230 | mask_logits = mask_logits.reshape(-1, 1, H, W) 231 | 232 | assert mask_feat_stride >= self.mask_out_stride 233 | assert mask_feat_stride % self.mask_out_stride == 0 234 | mask_logits = aligned_bilinear(mask_logits, int(mask_feat_stride / self.mask_out_stride)) 235 | 236 | return mask_logits 237 | 238 | def __call__(self, mask_feats, mask_feat_stride, pred_instances, gt_instances=None): 239 | if self.training: 240 | self._iter += 1 241 | 242 | gt_inds = pred_instances.gt_inds 243 | if self.boxinst_enabled or (not self.boxinst_enabled and not self.point_sup_enabled): 244 | gt_bitmasks = torch.cat([per_im.gt_bitmasks for per_im in gt_instances]) 245 | gt_bitmasks = gt_bitmasks[gt_inds].unsqueeze(dim=1).to(dtype=mask_feats.dtype) 246 | 247 | losses = {} 248 | 249 | if len(pred_instances) == 0: 250 | dummy_loss = mask_feats.sum() * 0 + pred_instances.mask_head_params.sum() * 0 251 | if not self.boxinst_enabled and not self.point_sup_enabled: 252 | # fully-supervised 253 | losses["loss_mask"] = dummy_loss 254 | else: 255 | # BoxInst and/or PointSup settings 256 | if self.boxinst_enabled: 257 | losses["loss_prj"] = dummy_loss 258 | losses["loss_pairwise"] = dummy_loss 259 | if self.point_sup_enabled: 260 |
losses["loss_point"] = dummy_loss 261 | else: 262 | mask_logits = self.mask_heads_forward_with_coords( 263 | mask_feats, mask_feat_stride, pred_instances 264 | ) 265 | mask_scores = mask_logits.sigmoid() 266 | 267 | if not self.boxinst_enabled and not self.point_sup_enabled: 268 | # fully-supervised CondInst losses 269 | mask_losses = dice_coefficient(mask_scores, gt_bitmasks) 270 | loss_mask = mask_losses.mean() 271 | losses["loss_mask"] = loss_mask 272 | else: 273 | # BoxInst and/or PointSup losses 274 | if self.boxinst_enabled: 275 | # box-supervised BoxInst losses 276 | image_color_similarity = torch.cat([x.image_color_similarity for x in gt_instances]) 277 | image_color_similarity = image_color_similarity[gt_inds].to(dtype=mask_feats.dtype) 278 | 279 | loss_prj_term = compute_project_term(mask_scores, gt_bitmasks) 280 | 281 | pairwise_losses = compute_pairwise_term( 282 | mask_logits, self.pairwise_size, 283 | self.pairwise_dilation 284 | ) 285 | 286 | weights = (image_color_similarity >= self.pairwise_color_thresh).float() * gt_bitmasks.float() 287 | loss_pairwise = (pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0) 288 | 289 | warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) 290 | loss_pairwise = loss_pairwise * warmup_factor 291 | 292 | losses.update({ 293 | "loss_prj": loss_prj_term, 294 | "loss_pairwise": loss_pairwise, 295 | }) 296 | if self.point_sup_enabled: 297 | # pointly-supervised CondInst losses 298 | image_wh = torch.Tensor([mask_logits.size(3) * self.mask_out_stride, mask_logits.size(2) * self.mask_out_stride]).to(mask_logits.device) 299 | point_coords, point_labels = get_point_coords_from_point_annotation(gt_instances, gt_inds, image_wh) 300 | 301 | point_logits = point_sample( 302 | mask_logits, 303 | point_coords, 304 | mode='bilinear', 305 | align_corners=False, 306 | ) 307 | 308 | loss_point = roi_mask_point_loss(point_logits, gt_instances, point_labels) 309 | 310 | losses.update({ 311 | "loss_point": loss_point * self.point_loss_weight, 312 | }) 313 | return losses 314 | else: 315 | if len(pred_instances) > 0: 316 | mask_logits = self.mask_heads_forward_with_coords( 317 | mask_feats, mask_feat_stride, pred_instances 318 | ) 319 | pred_instances.pred_global_masks = mask_logits.sigmoid() 320 | 321 | return pred_instances 322 | -------------------------------------------------------------------------------- /src/condinst/standard/dynamic_mask_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from torch import nn 4 | 5 | from detectron2.utils.events import get_event_storage 6 | 7 | from adet.utils.comm import compute_locations, aligned_bilinear 8 | 9 | from detectron2.projects.point_rend.point_features import point_sample 10 | from detectron2.projects.point_rend.point_head import roi_mask_point_loss 11 | 12 | from detectron2.layers import cat 13 | 14 | SMOOTH = 1e-6 15 | def get_mask_iou(outputs, labels): 16 | outputs = (outputs>0.5).clone().detach().long() 17 | labels = labels.clone().detach().long() 18 | intersection = (outputs & labels).float().sum((2, 3)) 19 | union = (outputs | labels).float().sum((2, 3)) 20 | iou = (intersection + SMOOTH) / (union + SMOOTH) 21 | return iou.mean() 22 | 23 | # from detectron2.projects.point_sup.point_utils import get_point_coords_from_point_annotation 24 | def get_point_coords_from_point_annotation(instances, gt_inds, image_wh): 25 | # re-designed for condinst, image_wh (width, height) should 
be the padded size 26 | gt_point_coords = [] 27 | gt_point_labels = [] 28 | 29 | for per_im in instances: 30 | gt_point_coords.append(per_im.gt_point_coords.to(torch.float32) / image_wh) 31 | gt_point_labels.append(per_im.gt_point_labels.to(torch.float32).clone()) 32 | 33 | gt_point_coords = torch.cat(gt_point_coords) 34 | gt_point_labels = torch.cat(gt_point_labels) 35 | 36 | gt_point_coords = gt_point_coords[gt_inds] 37 | gt_point_labels = gt_point_labels[gt_inds] 38 | 39 | return gt_point_coords, gt_point_labels 40 | 41 | 42 | def compute_project_term(mask_scores, gt_bitmasks): 43 | mask_losses_y = dice_coefficient( 44 | mask_scores.max(dim=2, keepdim=True)[0], 45 | gt_bitmasks.max(dim=2, keepdim=True)[0] 46 | ) 47 | mask_losses_x = dice_coefficient( 48 | mask_scores.max(dim=3, keepdim=True)[0], 49 | gt_bitmasks.max(dim=3, keepdim=True)[0] 50 | ) 51 | return (mask_losses_x + mask_losses_y).mean() 52 | 53 | 54 | def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation): 55 | assert mask_logits.dim() == 4 56 | 57 | log_fg_prob = F.logsigmoid(mask_logits) 58 | log_bg_prob = F.logsigmoid(-mask_logits) 59 | 60 | from adet.modeling.condinst.condinst import unfold_wo_center 61 | log_fg_prob_unfold = unfold_wo_center( 62 | log_fg_prob, kernel_size=pairwise_size, 63 | dilation=pairwise_dilation 64 | ) 65 | log_bg_prob_unfold = unfold_wo_center( 66 | log_bg_prob, kernel_size=pairwise_size, 67 | dilation=pairwise_dilation 68 | ) 69 | 70 | # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) 71 | # we compute the probability in log space to avoid numerical instability 72 | log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold 73 | log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold 74 | 75 | max_ = torch.max(log_same_fg_prob, log_same_bg_prob) 76 | log_same_prob = torch.log( 77 | torch.exp(log_same_fg_prob - max_) + 78 | torch.exp(log_same_bg_prob - max_) 79 | ) + max_ 80 | 81 | # loss = -log(prob) 82 | return -log_same_prob[:, 0] 83 | 84 | 85 | def dice_coefficient(x, target): 86 | eps = 1e-5 87 | n_inst = x.size(0) 88 | x = x.reshape(n_inst, -1) 89 | target = target.reshape(n_inst, -1) 90 | intersection = (x * target).sum(dim=1) 91 | union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps 92 | loss = 1. - (2 * intersection / union) 93 | return loss 94 | 95 | 96 | def parse_dynamic_params(params, channels, weight_nums, bias_nums): 97 | assert params.dim() == 2 98 | assert len(weight_nums) == len(bias_nums) 99 | assert params.size(1) == sum(weight_nums) + sum(bias_nums) 100 | 101 | num_insts = params.size(0) 102 | num_layers = len(weight_nums) 103 | 104 | params_splits = list(torch.split_with_sizes( 105 | params, weight_nums + bias_nums, dim=1 106 | )) 107 | 108 | weight_splits = params_splits[:num_layers] 109 | bias_splits = params_splits[num_layers:] 110 | 111 | for l in range(num_layers): 112 | if l < num_layers - 1: 113 | # out_channels x in_channels x 1 x 1 114 | weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1) 115 | bias_splits[l] = bias_splits[l].reshape(num_insts * channels) 116 | else: 117 | # out_channels x in_channels x 1 x 1 118 | weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1) 119 | bias_splits[l] = bias_splits[l].reshape(num_insts) 120 | 121 | return weight_splits, bias_splits 122 | 123 | 124 | def build_dynamic_mask_head(cfg): 125 | return DynamicMaskHead(cfg) 126 | 127 | 128 | class DynamicMaskHead(nn.Module): 129 | def __init__(self, cfg): 130 | super(DynamicMaskHead, self).__init__() 131 | self.num_layers = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS 132 | self.channels = cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS 133 | self.in_channels = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS 134 | self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE 135 | self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS 136 | 137 | soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST 138 | self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2])) 139 | 140 | # boxinst configs 141 | self.boxinst_enabled = cfg.MODEL.BOXINST.ENABLED 142 | self.bottom_pixels_removed = cfg.MODEL.BOXINST.BOTTOM_PIXELS_REMOVED 143 | self.pairwise_size = cfg.MODEL.BOXINST.PAIRWISE.SIZE 144 | self.pairwise_dilation = cfg.MODEL.BOXINST.PAIRWISE.DILATION 145 | self.pairwise_color_thresh = cfg.MODEL.BOXINST.PAIRWISE.COLOR_THRESH 146 | self._warmup_iters = cfg.MODEL.BOXINST.PAIRWISE.WARMUP_ITERS 147 | 148 | # pointsup configs 149 | self.point_sup_enabled = cfg.INPUT.POINT_SUP 150 | self.point_loss_weight = cfg.MODEL.BOXINST.POINT_LOSS_WEIGHT 151 | 152 | weight_nums, bias_nums = [], [] 153 | for l in range(self.num_layers): 154 | if l == 0: 155 | if not self.disable_rel_coords: 156 | weight_nums.append((self.in_channels + 2) * self.channels) 157 | else: 158 | weight_nums.append(self.in_channels * self.channels) 159 | bias_nums.append(self.channels) 160 | elif l == self.num_layers - 1: 161 | weight_nums.append(self.channels * 1) 162 | bias_nums.append(1) 163 | else: 164 | weight_nums.append(self.channels * self.channels) 165 | bias_nums.append(self.channels) 166 | 167 | self.weight_nums = weight_nums 168 | self.bias_nums = bias_nums 169 | self.num_gen_params = sum(weight_nums) + sum(bias_nums) 170 | 171 | self.register_buffer("_iter", torch.zeros([1])) 172 | 173 | def mask_heads_forward(self, features, weights, biases, num_insts): 174 | ''' 175 | :param features: 176 | :param weights: [w0, w1, ...] 177 | :param biases: [b0, b1, ...]
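(each weights[k] is a stacked 1x1 conv kernel of shape (num_insts * out_k, in_k, 1, 1), applied below with a single F.conv2d(..., groups=num_insts) so that every instance gets its own filters)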
178 | :return: 179 | ''' 180 | assert features.dim() == 4 181 | n_layers = len(weights) 182 | x = features 183 | for i, (w, b) in enumerate(zip(weights, biases)): 184 | x = F.conv2d( 185 | x, w, bias=b, 186 | stride=1, padding=0, 187 | groups=num_insts 188 | ) 189 | if i < n_layers - 1: 190 | x = F.relu(x) 191 | return x 192 | 193 | def mask_heads_forward_with_coords( 194 | self, mask_feats, mask_feat_stride, instances 195 | ): 196 | locations = compute_locations( 197 | mask_feats.size(2), mask_feats.size(3), 198 | stride=mask_feat_stride, device=mask_feats.device 199 | ) 200 | n_inst = len(instances) 201 | 202 | im_inds = instances.im_inds 203 | mask_head_params = instances.mask_head_params 204 | 205 | N, _, H, W = mask_feats.size() 206 | 207 | if not self.disable_rel_coords: 208 | instance_locations = instances.locations 209 | relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2) 210 | relative_coords = relative_coords.permute(0, 2, 1).float() 211 | soi = self.sizes_of_interest.float()[instances.fpn_levels] 212 | relative_coords = relative_coords / soi.reshape(-1, 1, 1) 213 | relative_coords = relative_coords.to(dtype=mask_feats.dtype) 214 | 215 | mask_head_inputs = torch.cat([ 216 | relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W) 217 | ], dim=1) 218 | else: 219 | mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W) 220 | 221 | mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W) 222 | 223 | weights, biases = parse_dynamic_params( 224 | mask_head_params, self.channels, 225 | self.weight_nums, self.bias_nums 226 | ) 227 | 228 | mask_logits = self.mask_heads_forward(mask_head_inputs, weights, biases, n_inst) 229 | 230 | mask_logits = mask_logits.reshape(-1, 1, H, W) 231 | 232 | assert mask_feat_stride >= self.mask_out_stride 233 | assert mask_feat_stride % self.mask_out_stride == 0 234 | mask_logits = aligned_bilinear(mask_logits, int(mask_feat_stride / self.mask_out_stride)) 235 | 236 | return mask_logits 237 | 238 | def __call__(self, mask_feats, mask_feat_stride, pred_instances, gt_instances=None): 239 | if self.training: 240 | self._iter += 1 241 | 242 | gt_inds = pred_instances.gt_inds 243 | if self.boxinst_enabled or (not self.boxinst_enabled and not self.point_sup_enabled): 244 | gt_bitmasks = torch.cat([per_im.gt_bitmasks for per_im in gt_instances]) 245 | gt_bitmasks = gt_bitmasks[gt_inds].unsqueeze(dim=1).to(dtype=mask_feats.dtype) 246 | 247 | losses = {} 248 | 249 | if len(pred_instances) == 0: 250 | dummy_loss = mask_feats.sum() * 0 + pred_instances.mask_head_params.sum() * 0 251 | if not self.boxinst_enabled and not self.point_sup_enabled: 252 | # fully-supervised 253 | losses["loss_mask"] = dummy_loss 254 | else: 255 | # BoxInst and/or PointSup settings 256 | if self.boxinst_enabled: 257 | losses["loss_prj"] = dummy_loss 258 | losses["loss_pairwise"] = dummy_loss 259 | if self.point_sup_enabled: 260 | losses["loss_point"] = dummy_loss 261 | else: 262 | mask_logits = self.mask_heads_forward_with_coords( 263 | mask_feats, mask_feat_stride, pred_instances 264 | ) 265 | mask_scores = mask_logits.sigmoid() 266 | 267 | if not self.boxinst_enabled and not self.point_sup_enabled: 268 | # fully-supervised CondInst losses 269 | mask_losses = dice_coefficient(mask_scores, gt_bitmasks) 270 | loss_mask = mask_losses.mean() 271 | losses["loss_mask"] = loss_mask 272 | else: 273 | # BoxInst and/or PointSup losses 274 | if self.boxinst_enabled: 275 | # box-supervised BoxInst losses 276 | 
image_color_similarity = torch.cat([x.image_color_similarity for x in gt_instances]) 277 | image_color_similarity = image_color_similarity[gt_inds].to(dtype=mask_feats.dtype) 278 | 279 | loss_prj_term = compute_project_term(mask_scores, gt_bitmasks) 280 | 281 | pairwise_losses = compute_pairwise_term( 282 | mask_logits, self.pairwise_size, 283 | self.pairwise_dilation 284 | ) 285 | 286 | weights = (image_color_similarity >= self.pairwise_color_thresh).float() * gt_bitmasks.float() 287 | loss_pairwise = (pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0) 288 | 289 | warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) 290 | loss_pairwise = loss_pairwise * warmup_factor 291 | 292 | losses.update({ 293 | "loss_prj": loss_prj_term, 294 | "loss_pairwise": loss_pairwise, 295 | }) 296 | if self.point_sup_enabled: 297 | # pointly-supervised CondInst losses 298 | image_wh = torch.Tensor([mask_logits.size(3) * self.mask_out_stride, mask_logits.size(2) * self.mask_out_stride]).to(mask_logits.device) 299 | point_coords, point_labels = get_point_coords_from_point_annotation(gt_instances, gt_inds, image_wh) 300 | 301 | point_logits = point_sample( 302 | mask_logits, 303 | point_coords, 304 | mode='bilinear', 305 | align_corners=False, 306 | ) 307 | 308 | loss_point = roi_mask_point_loss(point_logits, gt_instances, point_labels) 309 | 310 | losses.update({ 311 | "loss_point": loss_point * self.point_loss_weight, 312 | }) 313 | return losses 314 | else: 315 | if len(pred_instances) > 0: 316 | mask_logits = self.mask_heads_forward_with_coords( 317 | mask_feats, mask_feat_stride, pred_instances 318 | ) 319 | pred_instances.pred_global_masks = mask_logits.sigmoid() 320 | 321 | return pred_instances 322 | -------------------------------------------------------------------------------- /src/condinst/Entropy/dynamic_mask_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from torch import nn 4 | 5 | from detectron2.utils.events import get_event_storage 6 | 7 | from adet.utils.comm import compute_locations, aligned_bilinear 8 | 9 | from detectron2.projects.point_rend.point_features import point_sample 10 | from detectron2.projects.point_rend.point_head import roi_mask_point_loss 11 | 12 | from detectron2.layers import cat 13 | 14 | 15 | SMOOTH = 1e-6 16 | def get_mask_iou(outputs, labels): 17 | outputs = (outputs>0.5).clone().detach().long() 18 | labels = labels.clone().detach().long() 19 | intersection = (outputs & labels).float().sum((2, 3)) 20 | union = (outputs | labels).float().sum((2, 3)) 21 | iou = (intersection + SMOOTH) / (union + SMOOTH) 22 | return iou.mean() 23 | 24 | 25 | # from detectron2.projects.point_sup.point_utils import get_point_coords_from_point_annotation 26 | def get_point_coords_from_point_annotation(instances, gt_inds, image_wh): 27 | # re-designed for condinst, image_wh (width, height) should be the padded size 28 | gt_point_coords = [] 29 | gt_point_labels = [] 30 | 31 | for per_im in instances: 32 | gt_point_coords.append(per_im.gt_point_coords.to(torch.float32) / image_wh) 33 | gt_point_labels.append(per_im.gt_point_labels.to(torch.float32).clone()) 34 | 35 | gt_point_coords = torch.cat(gt_point_coords) 36 | gt_point_labels = torch.cat(gt_point_labels) 37 | 38 | gt_point_coords = gt_point_coords[gt_inds] 39 | gt_point_labels = gt_point_labels[gt_inds] 40 | 41 | return gt_point_coords, gt_point_labels 42 | 43 | 44 | def compute_project_term(mask_scores, 
gt_bitmasks): 45 | mask_losses_y = dice_coefficient( 46 | mask_scores.max(dim=2, keepdim=True)[0], 47 | gt_bitmasks.max(dim=2, keepdim=True)[0] 48 | ) 49 | mask_losses_x = dice_coefficient( 50 | mask_scores.max(dim=3, keepdim=True)[0], 51 | gt_bitmasks.max(dim=3, keepdim=True)[0] 52 | ) 53 | return (mask_losses_x + mask_losses_y).mean() 54 | 55 | 56 | def compute_pairwise_term(mask_logits, pairwise_size, pairwise_dilation): 57 | assert mask_logits.dim() == 4 58 | 59 | log_fg_prob = F.logsigmoid(mask_logits) 60 | log_bg_prob = F.logsigmoid(-mask_logits) 61 | 62 | from adet.modeling.condinst.condinst import unfold_wo_center 63 | log_fg_prob_unfold = unfold_wo_center( 64 | log_fg_prob, kernel_size=pairwise_size, 65 | dilation=pairwise_dilation 66 | ) 67 | log_bg_prob_unfold = unfold_wo_center( 68 | log_bg_prob, kernel_size=pairwise_size, 69 | dilation=pairwise_dilation 70 | ) 71 | 72 | # the probability of making the same prediction = p_i * p_j + (1 - p_i) * (1 - p_j) 73 | # we compute the probability in log space to avoid numerical instability 74 | log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold 75 | log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold 76 | 77 | max_ = torch.max(log_same_fg_prob, log_same_bg_prob) 78 | log_same_prob = torch.log( 79 | torch.exp(log_same_fg_prob - max_) + 80 | torch.exp(log_same_bg_prob - max_) 81 | ) + max_ 82 | 83 | # loss = -log(prob) 84 | return -log_same_prob[:, 0] 85 | 86 | 87 | def dice_coefficient(x, target): 88 | eps = 1e-5 89 | n_inst = x.size(0) 90 | x = x.reshape(n_inst, -1) 91 | target = target.reshape(n_inst, -1) 92 | intersection = (x * target).sum(dim=1) 93 | union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps 94 | loss = 1. - (2 * intersection / union) 95 | return loss 96 | 97 | 98 | def parse_dynamic_params(params, channels, weight_nums, bias_nums): 99 | assert params.dim() == 2 100 | assert len(weight_nums) == len(bias_nums) 101 | assert params.size(1) == sum(weight_nums) + sum(bias_nums) 102 | 103 | num_insts = params.size(0) 104 | num_layers = len(weight_nums) 105 | 106 | params_splits = list(torch.split_with_sizes( 107 | params, weight_nums + bias_nums, dim=1 108 | )) 109 | 110 | weight_splits = params_splits[:num_layers] 111 | bias_splits = params_splits[num_layers:] 112 | 113 | for l in range(num_layers): 114 | if l < num_layers - 1: 115 | # out_channels x in_channels x 1 x 1 116 | weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1) 117 | bias_splits[l] = bias_splits[l].reshape(num_insts * channels) 118 | else: 119 | # out_channels x in_channels x 1 x 1 120 | weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1) 121 | bias_splits[l] = bias_splits[l].reshape(num_insts) 122 | 123 | return weight_splits, bias_splits 124 | 125 | 126 | def build_dynamic_mask_head(cfg): 127 | return DynamicMaskHead(cfg) 128 | 129 | 130 | class DynamicMaskHead(nn.Module): 131 | def __init__(self, cfg): 132 | super(DynamicMaskHead, self).__init__() 133 | self.num_layers = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS 134 | self.channels = cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS 135 | self.in_channels = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS 136 | self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE 137 | self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS 138 | 139 | soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST 140 | self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2])) 141 | 142 | # boxinst configs 143 | self.boxinst_enabled = cfg.MODEL.BOXINST.ENABLED 144 | self.bottom_pixels_removed = cfg.MODEL.BOXINST.BOTTOM_PIXELS_REMOVED 145 | self.pairwise_size = cfg.MODEL.BOXINST.PAIRWISE.SIZE 146 | self.pairwise_dilation = cfg.MODEL.BOXINST.PAIRWISE.DILATION 147 | self.pairwise_color_thresh = cfg.MODEL.BOXINST.PAIRWISE.COLOR_THRESH 148 | self._warmup_iters = cfg.MODEL.BOXINST.PAIRWISE.WARMUP_ITERS 149 | 150 | # pointsup configs 151 | self.point_sup_enabled = cfg.INPUT.POINT_SUP 152 | self.point_loss_weight = cfg.MODEL.BOXINST.POINT_LOSS_WEIGHT 153 | 154 | weight_nums, bias_nums = [], [] 155 | for l in range(self.num_layers): 156 | if l == 0: 157 | if not self.disable_rel_coords: 158 | weight_nums.append((self.in_channels + 2) * self.channels) 159 | else: 160 | weight_nums.append(self.in_channels * self.channels) 161 | bias_nums.append(self.channels) 162 | elif l == self.num_layers - 1: 163 | weight_nums.append(self.channels * 1) 164 | bias_nums.append(1) 165 | else: 166 | weight_nums.append(self.channels * self.channels) 167 | bias_nums.append(self.channels) 168 | 169 | self.weight_nums = weight_nums 170 | self.bias_nums = bias_nums 171 | self.num_gen_params = sum(weight_nums) + sum(bias_nums) 172 | 173 | self.register_buffer("_iter", torch.zeros([1])) 174 | 175 | def mask_heads_forward(self, features, weights, biases, num_insts): 176 | ''' 177 | :param features: 178 | :param weights: [w0, w1, ...] 179 | :param biases: [b0, b1, ...] 180 | :return: 181 | ''' 182 | assert features.dim() == 4 183 | n_layers = len(weights) 184 | x = features 185 | for i, (w, b) in enumerate(zip(weights, biases)): 186 | x = F.conv2d( 187 | x, w, bias=b, 188 | stride=1, padding=0, 189 | groups=num_insts 190 | ) 191 | if i < n_layers - 1: 192 | x = F.relu(x) 193 | return x 194 | 195 | def mask_heads_forward_with_coords( 196 | self, mask_feats, mask_feat_stride, instances 197 | ): 198 | locations = compute_locations( 199 | mask_feats.size(2), mask_feats.size(3), 200 | stride=mask_feat_stride, device=mask_feats.device 201 | ) 202 | n_inst = len(instances) 203 | 204 | im_inds = instances.im_inds 205 | mask_head_params = instances.mask_head_params 206 | 207 | N, _, H, W = mask_feats.size() 208 | 209 | if not self.disable_rel_coords: 210 | instance_locations = instances.locations 211 | relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2) 212 | relative_coords = relative_coords.permute(0, 2, 1).float() 213 | soi = self.sizes_of_interest.float()[instances.fpn_levels] 214 | relative_coords = relative_coords / soi.reshape(-1, 1, 1) 215 | relative_coords = relative_coords.to(dtype=mask_feats.dtype) 216 | 217 | mask_head_inputs = torch.cat([ 218 | relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W) 219 | ], dim=1) 220 | else: 221 | mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W) 222 | 223 | mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W) 224 | 225 | weights, biases = parse_dynamic_params( 226 | mask_head_params, self.channels, 227 | self.weight_nums, self.bias_nums 228 | ) 229 | 230 | mask_logits = self.mask_heads_forward(mask_head_inputs, weights, biases, n_inst) 231 | 232 | mask_logits = mask_logits.reshape(-1, 1, H, W) 233 | 234 | assert mask_feat_stride >= self.mask_out_stride 235 | assert mask_feat_stride % self.mask_out_stride == 0 236 | mask_logits = aligned_bilinear(mask_logits, int(mask_feat_stride / self.mask_out_stride)) 237 | 238 | return mask_logits 239 | 240 | def __call__(self, mask_feats, mask_feat_stride, pred_instances, gt_instances=None): 241
| if self.training: 242 | self._iter += 1 243 | 244 | gt_inds = pred_instances.gt_inds 245 | if self.boxinst_enabled: 246 | gt_bitmasks = torch.cat([per_im.gt_bitmasks for per_im in gt_instances]) 247 | gt_bitmasks = gt_bitmasks[gt_inds].unsqueeze(dim=1).to(dtype=mask_feats.dtype) 248 | 249 | losses = {} 250 | 251 | dummy_loss = mask_feats.sum() * 0 + pred_instances.mask_head_params.sum() * 0 252 | if not self.boxinst_enabled and not self.point_sup_enabled: 253 | # fully-supervised 254 | losses["loss_mask"] = dummy_loss 255 | else: 256 | # BoxInst and/or PointSup settings 257 | if self.boxinst_enabled: 258 | losses["loss_prj"] = dummy_loss 259 | losses["loss_pairwise"] = dummy_loss 260 | if self.point_sup_enabled: 261 | losses["loss_point"] = dummy_loss 262 | 263 | preds = () 264 | if len(pred_instances) != 0: 265 | mask_logits = self.mask_heads_forward_with_coords( 266 | mask_feats, mask_feat_stride, pred_instances 267 | ) 268 | mask_scores = mask_logits.sigmoid() 269 | 270 | resized_im_h, resized_im_w = gt_instances[0].image_size 271 | pred_global_masks = aligned_bilinear( 272 | mask_scores, int(self.mask_out_stride) 273 | ) 274 | pred_global_masks = pred_global_masks[:, :, :resized_im_h, :resized_im_w] 275 | 276 | preds = (gt_inds, pred_global_masks) 277 | 278 | return losses, [preds] 279 | 280 | # if not self.boxinst_enabled and not self.point_sup_enabled: 281 | # # fully-supervised CondInst losses 282 | # mask_losses = dice_coefficient(mask_scores, gt_bitmasks) 283 | # loss_mask = mask_losses.mean() 284 | # losses["loss_mask"] = loss_mask 285 | # else: 286 | # # BoxInst and/or PointSup losses 287 | # if self.boxinst_enabled: 288 | # # box-supervised BoxInst losses 289 | # image_color_similarity = torch.cat([x.image_color_similarity for x in gt_instances]) 290 | # image_color_similarity = image_color_similarity[gt_inds].to(dtype=mask_feats.dtype) 291 | 292 | # loss_prj_term = compute_project_term(mask_scores, gt_bitmasks) 293 | 294 | # pairwise_losses = compute_pairwise_term( 295 | # mask_logits, self.pairwise_size, 296 | # self.pairwise_dilation 297 | # ) 298 | 299 | # weights = (image_color_similarity >= self.pairwise_color_thresh).float() * gt_bitmasks.float() 300 | # loss_pairwise = (pairwise_losses * weights).sum() / weights.sum().clamp(min=1.0) 301 | 302 | # warmup_factor = min(self._iter.item() / float(self._warmup_iters), 1.0) 303 | # loss_pairwise = loss_pairwise * warmup_factor 304 | 305 | # losses.update({ 306 | # "loss_prj": loss_prj_term, 307 | # "loss_pairwise": loss_pairwise, 308 | # }) 309 | # if self.point_sup_enabled: 310 | # # pointly-supervised CondInst losses 311 | # image_wh = torch.Tensor([mask_logits.size(3) * self.mask_out_stride, mask_logits.size(2) * self.mask_out_stride]).to(mask_logits.device) 312 | # point_coords, point_labels = get_point_coords_from_point_annotation(gt_instances, gt_inds, image_wh) 313 | 314 | # point_logits = point_sample( 315 | # mask_logits, 316 | # point_coords, 317 | # mode='bilinear', 318 | # align_corners=False, 319 | # ) 320 | 321 | # loss_point = roi_mask_point_loss(point_logits, gt_instances, point_labels) 322 | 323 | # losses.update({ 324 | # "loss_point": loss_point * self.point_loss_weight, 325 | # }) 326 | # return losses 327 | else: 328 | if len(pred_instances) > 0: 329 | mask_logits = self.mask_heads_forward_with_coords( 330 | mask_feats, mask_feat_stride, pred_instances 331 | ) 332 | pred_instances.pred_global_masks = mask_logits.sigmoid() 333 | 334 | return pred_instances 335 | 
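The Entropy variant of the mask head above returns (gt_inds, pred_global_masks) during training instead of losses, so that a separate forward pass (cf. point_selection_from_CondInst.yaml) can rank candidate annotation points by prediction uncertainty. A minimal sketch of that ranking step follows; binary_entropy and select_next_point are hypothetical helpers for illustration, not the repository's scripts/entropy.py (whose body is not shown in this dump):

import torch

def binary_entropy(p, eps=1e-6):
    # per-pixel Bernoulli entropy of the predicted mask probability
    p = p.clamp(eps, 1.0 - eps)
    return -(p * p.log() + (1.0 - p) * (1.0 - p).log())

def select_next_point(pred_global_mask):
    # pred_global_mask: (H, W) sigmoid probabilities for one instance,
    # e.g. one slice of the pred_global_masks returned by the head above
    ent = binary_entropy(pred_global_mask)
    flat_idx = int(ent.flatten().argmax())
    y, x = divmod(flat_idx, ent.size(1))
    return x, y  # the most uncertain pixel, in (x, y) image coordinates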
-------------------------------------------------------------------------------- /src/condinst/condinst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from skimage import color 4 | 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | 9 | from detectron2.structures import ImageList 10 | from detectron2.modeling.proposal_generator import build_proposal_generator 11 | from detectron2.modeling.backbone import build_backbone 12 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 13 | from detectron2.structures.instances import Instances 14 | from detectron2.structures.masks import PolygonMasks, polygons_to_bitmask 15 | 16 | from .dynamic_mask_head import build_dynamic_mask_head 17 | from .mask_branch import build_mask_branch 18 | 19 | from adet.utils.comm import aligned_bilinear 20 | 21 | __all__ = ["CondInst"] 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def unfold_wo_center(x, kernel_size, dilation): 28 | assert x.dim() == 4 29 | assert kernel_size % 2 == 1 30 | 31 | # using SAME padding 32 | padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 33 | unfolded_x = F.unfold( 34 | x, kernel_size=kernel_size, 35 | padding=padding, 36 | dilation=dilation 37 | ) 38 | 39 | unfolded_x = unfolded_x.reshape( 40 | x.size(0), x.size(1), -1, x.size(2), x.size(3) 41 | ) 42 | 43 | # remove the center pixels 44 | size = kernel_size ** 2 45 | unfolded_x = torch.cat(( 46 | unfolded_x[:, :, :size // 2], 47 | unfolded_x[:, :, size // 2 + 1:] 48 | ), dim=2) 49 | 50 | return unfolded_x 51 | 52 | 53 | def get_images_color_similarity(images, image_masks, kernel_size, dilation): 54 | assert images.dim() == 4 55 | assert images.size(0) == 1 56 | 57 | unfolded_images = unfold_wo_center( 58 | images, kernel_size=kernel_size, dilation=dilation 59 | ) 60 | 61 | diff = images[:, :, None] - unfolded_images 62 | similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) 63 | 64 | unfolded_weights = unfold_wo_center( 65 | image_masks[None, None], kernel_size=kernel_size, 66 | dilation=dilation 67 | ) 68 | unfolded_weights = torch.max(unfolded_weights, dim=1)[0] 69 | 70 | return similarity * unfolded_weights 71 | 72 | 73 | @META_ARCH_REGISTRY.register() 74 | class CondInst(nn.Module): 75 | """ 76 | Main class for CondInst architectures (see https://arxiv.org/abs/2003.05664). 
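In this copy, point supervision (cfg.INPUT.POINT_SUP) is additionally threaded through the dynamic mask head and postprocess().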
77 | """ 78 | 79 | def __init__(self, cfg): 80 | super().__init__() 81 | self.device = torch.device(cfg.MODEL.DEVICE) 82 | 83 | self.backbone = build_backbone(cfg) 84 | self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) 85 | self.mask_head = build_dynamic_mask_head(cfg) 86 | self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape()) 87 | 88 | self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE 89 | 90 | self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS 91 | self.topk_proposals_per_im = cfg.MODEL.CONDINST.TOPK_PROPOSALS_PER_IM 92 | 93 | # boxinst configs 94 | self.boxinst_enabled = cfg.MODEL.BOXINST.ENABLED 95 | self.bottom_pixels_removed = cfg.MODEL.BOXINST.BOTTOM_PIXELS_REMOVED 96 | self.pairwise_size = cfg.MODEL.BOXINST.PAIRWISE.SIZE 97 | self.pairwise_dilation = cfg.MODEL.BOXINST.PAIRWISE.DILATION 98 | self.pairwise_color_thresh = cfg.MODEL.BOXINST.PAIRWISE.COLOR_THRESH 99 | 100 | # pointsup configs 101 | self.point_sup_enabled = cfg.INPUT.POINT_SUP 102 | 103 | # build top module 104 | in_channels = self.proposal_generator.in_channels_to_top_module 105 | 106 | self.controller = nn.Conv2d( 107 | in_channels, self.mask_head.num_gen_params, 108 | kernel_size=3, stride=1, padding=1 109 | ) 110 | torch.nn.init.normal_(self.controller.weight, std=0.01) 111 | torch.nn.init.constant_(self.controller.bias, 0) 112 | 113 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 114 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 115 | self.normalizer = lambda x: (x - pixel_mean) / pixel_std 116 | self.to(self.device) 117 | 118 | def forward(self, batched_inputs): 119 | original_images = [x["image"].to(self.device) for x in batched_inputs] 120 | 121 | # normalize images 122 | images_norm = [self.normalizer(x) for x in original_images] 123 | images_norm = ImageList.from_tensors(images_norm, self.backbone.size_divisibility) 124 | 125 | features = self.backbone(images_norm.tensor) 126 | 127 | if "instances" in batched_inputs[0]: 128 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 129 | 130 | if self.boxinst_enabled: 131 | original_image_masks = [torch.ones_like(x[0], dtype=torch.float32) for x in original_images] 132 | 133 | # mask out the bottom area where the COCO dataset probably has wrong annotations 134 | for i in range(len(original_image_masks)): 135 | im_h = batched_inputs[i]["height"] 136 | pixels_removed = int( 137 | self.bottom_pixels_removed * 138 | float(original_images[i].size(1)) / float(im_h) 139 | ) 140 | if pixels_removed > 0: 141 | original_image_masks[i][-pixels_removed:, :] = 0 142 | 143 | original_images = ImageList.from_tensors(original_images, self.backbone.size_divisibility) 144 | original_image_masks = ImageList.from_tensors( 145 | original_image_masks, self.backbone.size_divisibility, pad_value=0.0 146 | ) 147 | self.add_bitmasks_from_boxes( 148 | gt_instances, original_images.tensor, original_image_masks.tensor, 149 | original_images.tensor.size(-2), original_images.tensor.size(-1) 150 | ) 151 | else: 152 | gt_instances = None 153 | 154 | mask_feats, sem_losses = self.mask_branch(features, gt_instances) 155 | 156 | proposals, proposal_losses = self.proposal_generator( 157 | images_norm, features, gt_instances, self.controller 158 | ) 159 | 160 | if self.training: 161 | mask_losses = self._forward_mask_heads_train(proposals, mask_feats, gt_instances) 162 | 163 | losses = {} 164 | losses.update(sem_losses) 165 | losses.update(proposal_losses) 
166 | losses.update(mask_losses) 167 | return losses 168 | else: 169 | pred_instances_w_masks = self._forward_mask_heads_test(proposals, mask_feats) 170 | 171 | padded_im_h, padded_im_w = images_norm.tensor.size()[-2:] 172 | processed_results = [] 173 | for im_id, (input_per_image, image_size) in enumerate(zip(batched_inputs, images_norm.image_sizes)): 174 | height = input_per_image.get("height", image_size[0]) 175 | width = input_per_image.get("width", image_size[1]) 176 | 177 | instances_per_im = pred_instances_w_masks[pred_instances_w_masks.im_inds == im_id] 178 | instances_per_im = self.postprocess( 179 | instances_per_im, height, width, 180 | padded_im_h, padded_im_w 181 | ) 182 | 183 | processed_results.append({ 184 | "instances": instances_per_im 185 | }) 186 | 187 | return processed_results 188 | 189 | def _forward_mask_heads_train(self, proposals, mask_feats, gt_instances): 190 | # prepare the inputs for mask heads 191 | pred_instances = proposals["instances"] 192 | 193 | assert (self.max_proposals == -1) or (self.topk_proposals_per_im == -1), \ 194 | "MAX_PROPOSALS and TOPK_PROPOSALS_PER_IM cannot be used at the same time." 195 | if self.max_proposals != -1: 196 | if self.max_proposals < len(pred_instances): 197 | inds = torch.randperm(len(pred_instances), device=mask_feats.device).long() 198 | logger.info("clipping proposals from {} to {}".format( 199 | len(pred_instances), self.max_proposals 200 | )) 201 | pred_instances = pred_instances[inds[:self.max_proposals]] 202 | elif self.topk_proposals_per_im != -1: 203 | num_images = len(gt_instances) 204 | 205 | kept_instances = [] 206 | for im_id in range(num_images): 207 | instances_per_im = pred_instances[pred_instances.im_inds == im_id] 208 | if len(instances_per_im) == 0: 209 | kept_instances.append(instances_per_im) 210 | continue 211 | 212 | unique_gt_inds = instances_per_im.gt_inds.unique() 213 | num_instances_per_gt = max(int(self.topk_proposals_per_im / len(unique_gt_inds)), 1) 214 | 215 | for gt_ind in unique_gt_inds: 216 | instances_per_gt = instances_per_im[instances_per_im.gt_inds == gt_ind] 217 | 218 | if len(instances_per_gt) > num_instances_per_gt: 219 | scores = instances_per_gt.logits_pred.sigmoid().max(dim=1)[0] 220 | ctrness_pred = instances_per_gt.ctrness_pred.sigmoid() 221 | inds = (scores * ctrness_pred).topk(k=num_instances_per_gt, dim=0)[1] 222 | instances_per_gt = instances_per_gt[inds] 223 | 224 | kept_instances.append(instances_per_gt) 225 | 226 | pred_instances = Instances.cat(kept_instances) 227 | 228 | pred_instances.mask_head_params = pred_instances.top_feats 229 | 230 | loss_mask = self.mask_head( 231 | mask_feats, self.mask_branch.out_stride, 232 | pred_instances, gt_instances 233 | ) 234 | 235 | return loss_mask 236 | 237 | def _forward_mask_heads_test(self, proposals, mask_feats): 238 | # prepare the inputs for mask heads 239 | for im_id, per_im in enumerate(proposals): 240 | per_im.im_inds = per_im.locations.new_ones(len(per_im), dtype=torch.long) * im_id 241 | pred_instances = Instances.cat(proposals) 242 | pred_instances.mask_head_params = pred_instances.top_feat 243 | 244 | pred_instances_w_masks = self.mask_head( 245 | mask_feats, self.mask_branch.out_stride, pred_instances 246 | ) 247 | 248 | return pred_instances_w_masks 249 | 250 | def add_bitmasks(self, instances, im_h, im_w): 251 | for per_im_gt_inst in instances: 252 | if not per_im_gt_inst.has("gt_masks"): 253 | continue 254 | start = int(self.mask_out_stride // 2) 255 | if isinstance(per_im_gt_inst.get("gt_masks"), PolygonMasks): 256 
| polygons = per_im_gt_inst.get("gt_masks").polygons 257 | per_im_bitmasks = [] 258 | per_im_bitmasks_full = [] 259 | for per_polygons in polygons: 260 | bitmask = polygons_to_bitmask(per_polygons, im_h, im_w) 261 | bitmask = torch.from_numpy(bitmask).to(self.device).float() 262 | start = int(self.mask_out_stride // 2) 263 | bitmask_full = bitmask.clone() 264 | bitmask = bitmask[start::self.mask_out_stride, start::self.mask_out_stride] 265 | 266 | assert bitmask.size(0) * self.mask_out_stride == im_h 267 | assert bitmask.size(1) * self.mask_out_stride == im_w 268 | 269 | per_im_bitmasks.append(bitmask) 270 | per_im_bitmasks_full.append(bitmask_full) 271 | 272 | per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0) 273 | per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0) 274 | else: # RLE format bitmask 275 | bitmasks = per_im_gt_inst.get("gt_masks").tensor 276 | h, w = bitmasks.size()[1:] 277 | # pad to new size 278 | bitmasks_full = F.pad(bitmasks, (0, im_w - w, 0, im_h - h), "constant", 0) 279 | bitmasks = bitmasks_full[:, start::self.mask_out_stride, start::self.mask_out_stride] 280 | per_im_gt_inst.gt_bitmasks = bitmasks 281 | per_im_gt_inst.gt_bitmasks_full = bitmasks_full 282 | 283 | def add_bitmasks_from_boxes(self, instances, images, image_masks, im_h, im_w): 284 | stride = self.mask_out_stride 285 | start = int(stride // 2) 286 | 287 | assert images.size(2) % stride == 0 288 | assert images.size(3) % stride == 0 289 | 290 | downsampled_images = F.avg_pool2d( 291 | images.float(), kernel_size=stride, 292 | stride=stride, padding=0 293 | )[:, [2, 1, 0]] 294 | image_masks = image_masks[:, start::stride, start::stride] 295 | 296 | for im_i, per_im_gt_inst in enumerate(instances): 297 | images_lab = color.rgb2lab(downsampled_images[im_i].byte().permute(1, 2, 0).cpu().numpy()) 298 | images_lab = torch.as_tensor(images_lab, device=downsampled_images.device, dtype=torch.float32) 299 | images_lab = images_lab.permute(2, 0, 1)[None] 300 | images_color_similarity = get_images_color_similarity( 301 | images_lab, image_masks[im_i], 302 | self.pairwise_size, self.pairwise_dilation 303 | ) 304 | 305 | per_im_boxes = per_im_gt_inst.gt_boxes.tensor 306 | per_im_bitmasks = [] 307 | per_im_bitmasks_full = [] 308 | for per_box in per_im_boxes: 309 | bitmask_full = torch.zeros((im_h, im_w)).to(self.device).float() 310 | bitmask_full[int(per_box[1]):int(per_box[3] + 1), int(per_box[0]):int(per_box[2] + 1)] = 1.0 311 | 312 | bitmask = bitmask_full[start::stride, start::stride] 313 | 314 | assert bitmask.size(0) * stride == im_h 315 | assert bitmask.size(1) * stride == im_w 316 | 317 | per_im_bitmasks.append(bitmask) 318 | per_im_bitmasks_full.append(bitmask_full) 319 | 320 | per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0) 321 | per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0) 322 | per_im_gt_inst.image_color_similarity = torch.cat([ 323 | images_color_similarity for _ in range(len(per_im_gt_inst)) 324 | ], dim=0) 325 | 326 | def postprocess(self, results, output_height, output_width, padded_im_h, padded_im_w, mask_threshold=0.5): 327 | """ 328 | Resize the output instances. 329 | The input images are often resized when entering an object detector. 330 | As a result, we often need the outputs of the detector in a different 331 | resolution from its inputs. 332 | This function will resize the raw outputs of an R-CNN detector 333 | to produce outputs according to the desired output resolution. 
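Mask logits are produced at MASK_OUT_STRIDE; below they are upsampled to the padded input size, cropped to the resized image, and finally interpolated to (output_height, output_width).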
334 | Args: 335 | results (Instances): the raw outputs from the detector. 336 | `results.image_size` contains the input image resolution the detector sees. 337 | This object might be modified in-place. 338 | output_height, output_width: the desired output resolution. 339 | Returns: 340 | Instances: the resized output from the model, based on the output resolution 341 | """ 342 | scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0]) 343 | resized_im_h, resized_im_w = results.image_size 344 | results = Instances((output_height, output_width), **results.get_fields()) 345 | 346 | if results.has("pred_boxes"): 347 | output_boxes = results.pred_boxes 348 | elif results.has("proposal_boxes"): 349 | output_boxes = results.proposal_boxes 350 | 351 | output_boxes.scale(scale_x, scale_y) 352 | output_boxes.clip(results.image_size) 353 | 354 | results = results[output_boxes.nonempty()] 355 | 356 | if results.has("pred_global_masks"): 357 | mask_h, mask_w = results.pred_global_masks.size()[-2:] 358 | factor_h = padded_im_h // mask_h 359 | factor_w = padded_im_w // mask_w 360 | assert factor_h == factor_w 361 | factor = factor_h 362 | pred_global_masks = aligned_bilinear( 363 | results.pred_global_masks, factor 364 | ) 365 | pred_global_masks = pred_global_masks[:, :, :resized_im_h, :resized_im_w] 366 | pred_global_masks = F.interpolate( 367 | pred_global_masks, 368 | size=(output_height, output_width), 369 | mode="bilinear", align_corners=False 370 | ) 371 | pred_global_masks = pred_global_masks[:, 0, :, :] 372 | 373 | if self.point_sup_enabled: 374 | # filter out any mask prediction outside of predicted boxes (see PointSup) 375 | pred_boxes = results.pred_boxes.tensor 376 | for i in range(pred_global_masks.size(0)): 377 | kept_mask = torch.zeros_like(pred_global_masks[0]).to(pred_boxes.device) 378 | x0,y0,x1,y1 = int(pred_boxes[i][0]),int(pred_boxes[i][1]),int(pred_boxes[i][2]),int(pred_boxes[i][3]) 379 | kept_mask[y0:y1, x0:x1] = 1 380 | pred_global_masks[i] *= kept_mask 381 | 382 | results.pred_masks = (pred_global_masks > mask_threshold).float() 383 | 384 | return results 385 | -------------------------------------------------------------------------------- /src/condinst/standard/condinst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from skimage import color 4 | 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | 9 | from detectron2.structures import ImageList 10 | from detectron2.modeling.proposal_generator import build_proposal_generator 11 | from detectron2.modeling.backbone import build_backbone 12 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 13 | from detectron2.structures.instances import Instances 14 | from detectron2.structures.masks import PolygonMasks, polygons_to_bitmask 15 | 16 | from .dynamic_mask_head import build_dynamic_mask_head 17 | from .mask_branch import build_mask_branch 18 | 19 | from adet.utils.comm import aligned_bilinear 20 | 21 | __all__ = ["CondInst"] 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def unfold_wo_center(x, kernel_size, dilation): 28 | assert x.dim() == 4 29 | assert kernel_size % 2 == 1 30 | 31 | # using SAME padding 32 | padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 33 | unfolded_x = F.unfold( 34 | x, kernel_size=kernel_size, 35 | padding=padding, 36 | dilation=dilation 37 | ) 38 | 39 | unfolded_x = unfolded_x.reshape( 40 | 
x.size(0), x.size(1), -1, x.size(2), x.size(3) 41 | ) 42 | 43 | # remove the center pixels 44 | size = kernel_size ** 2 45 | unfolded_x = torch.cat(( 46 | unfolded_x[:, :, :size // 2], 47 | unfolded_x[:, :, size // 2 + 1:] 48 | ), dim=2) 49 | 50 | return unfolded_x 51 | 52 | 53 | def get_images_color_similarity(images, image_masks, kernel_size, dilation): 54 | assert images.dim() == 4 55 | assert images.size(0) == 1 56 | 57 | unfolded_images = unfold_wo_center( 58 | images, kernel_size=kernel_size, dilation=dilation 59 | ) 60 | 61 | diff = images[:, :, None] - unfolded_images 62 | similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) 63 | 64 | unfolded_weights = unfold_wo_center( 65 | image_masks[None, None], kernel_size=kernel_size, 66 | dilation=dilation 67 | ) 68 | unfolded_weights = torch.max(unfolded_weights, dim=1)[0] 69 | 70 | return similarity * unfolded_weights 71 | 72 | 73 | @META_ARCH_REGISTRY.register() 74 | class CondInst(nn.Module): 75 | """ 76 | Main class for CondInst architectures (see https://arxiv.org/abs/2003.05664). 77 | """ 78 | 79 | def __init__(self, cfg): 80 | super().__init__() 81 | self.device = torch.device(cfg.MODEL.DEVICE) 82 | 83 | self.backbone = build_backbone(cfg) 84 | self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) 85 | self.mask_head = build_dynamic_mask_head(cfg) 86 | self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape()) 87 | 88 | self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE 89 | 90 | self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS 91 | self.topk_proposals_per_im = cfg.MODEL.CONDINST.TOPK_PROPOSALS_PER_IM 92 | 93 | # boxinst configs 94 | self.boxinst_enabled = cfg.MODEL.BOXINST.ENABLED 95 | self.bottom_pixels_removed = cfg.MODEL.BOXINST.BOTTOM_PIXELS_REMOVED 96 | self.pairwise_size = cfg.MODEL.BOXINST.PAIRWISE.SIZE 97 | self.pairwise_dilation = cfg.MODEL.BOXINST.PAIRWISE.DILATION 98 | self.pairwise_color_thresh = cfg.MODEL.BOXINST.PAIRWISE.COLOR_THRESH 99 | 100 | # pointsup configs 101 | self.point_sup_enabled = cfg.INPUT.POINT_SUP 102 | 103 | # build top module 104 | in_channels = self.proposal_generator.in_channels_to_top_module 105 | 106 | self.controller = nn.Conv2d( 107 | in_channels, self.mask_head.num_gen_params, 108 | kernel_size=3, stride=1, padding=1 109 | ) 110 | torch.nn.init.normal_(self.controller.weight, std=0.01) 111 | torch.nn.init.constant_(self.controller.bias, 0) 112 | 113 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 114 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 115 | self.normalizer = lambda x: (x - pixel_mean) / pixel_std 116 | self.to(self.device) 117 | 118 | def forward(self, batched_inputs): 119 | original_images = [x["image"].to(self.device) for x in batched_inputs] 120 | 121 | # normalize images 122 | images_norm = [self.normalizer(x) for x in original_images] 123 | images_norm = ImageList.from_tensors(images_norm, self.backbone.size_divisibility) 124 | 125 | features = self.backbone(images_norm.tensor) 126 | 127 | if "instances" in batched_inputs[0]: 128 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 129 | 130 | if self.boxinst_enabled: 131 | original_image_masks = [torch.ones_like(x[0], dtype=torch.float32) for x in original_images] 132 | 133 | # mask out the bottom area where the COCO dataset probably has wrong annotations 134 | for i in range(len(original_image_masks)): 135 | im_h = batched_inputs[i]["height"] 136 | pixels_removed = int( 137 | 
self.bottom_pixels_removed * 138 | float(original_images[i].size(1)) / float(im_h) 139 | ) 140 | if pixels_removed > 0: 141 | original_image_masks[i][-pixels_removed:, :] = 0 142 | 143 | original_images = ImageList.from_tensors(original_images, self.backbone.size_divisibility) 144 | original_image_masks = ImageList.from_tensors( 145 | original_image_masks, self.backbone.size_divisibility, pad_value=0.0 146 | ) 147 | self.add_bitmasks_from_boxes( 148 | gt_instances, original_images.tensor, original_image_masks.tensor, 149 | original_images.tensor.size(-2), original_images.tensor.size(-1) 150 | ) 151 | else: 152 | gt_instances = None 153 | 154 | mask_feats, sem_losses = self.mask_branch(features, gt_instances) 155 | 156 | proposals, proposal_losses = self.proposal_generator( 157 | images_norm, features, gt_instances, self.controller 158 | ) 159 | 160 | if self.training: 161 | mask_losses = self._forward_mask_heads_train(proposals, mask_feats, gt_instances) 162 | 163 | losses = {} 164 | losses.update(sem_losses) 165 | losses.update(proposal_losses) 166 | losses.update(mask_losses) 167 | return losses 168 | else: 169 | pred_instances_w_masks = self._forward_mask_heads_test(proposals, mask_feats) 170 | 171 | padded_im_h, padded_im_w = images_norm.tensor.size()[-2:] 172 | processed_results = [] 173 | for im_id, (input_per_image, image_size) in enumerate(zip(batched_inputs, images_norm.image_sizes)): 174 | height = input_per_image.get("height", image_size[0]) 175 | width = input_per_image.get("width", image_size[1]) 176 | 177 | instances_per_im = pred_instances_w_masks[pred_instances_w_masks.im_inds == im_id] 178 | instances_per_im = self.postprocess( 179 | instances_per_im, height, width, 180 | padded_im_h, padded_im_w 181 | ) 182 | 183 | processed_results.append({ 184 | "instances": instances_per_im 185 | }) 186 | 187 | return processed_results 188 | 189 | def _forward_mask_heads_train(self, proposals, mask_feats, gt_instances): 190 | # prepare the inputs for mask heads 191 | pred_instances = proposals["instances"] 192 | 193 | assert (self.max_proposals == -1) or (self.topk_proposals_per_im == -1), \ 194 | "MAX_PROPOSALS and TOPK_PROPOSALS_PER_IM cannot be used at the same time." 
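# The two subsampling branches below bound the mask-head training cost: MAX_PROPOSALS keeps at most that many proposals per batch, chosen by random permutation, whereas TOPK_PROPOSALS_PER_IM keeps, for each ground-truth instance, about TOPK/num_gt proposals ranked by classification score times centerness.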
195 | if self.max_proposals != -1: 196 | if self.max_proposals < len(pred_instances): 197 | inds = torch.randperm(len(pred_instances), device=mask_feats.device).long() 198 | logger.info("clipping proposals from {} to {}".format( 199 | len(pred_instances), self.max_proposals 200 | )) 201 | pred_instances = pred_instances[inds[:self.max_proposals]] 202 | elif self.topk_proposals_per_im != -1: 203 | num_images = len(gt_instances) 204 | 205 | kept_instances = [] 206 | for im_id in range(num_images): 207 | instances_per_im = pred_instances[pred_instances.im_inds == im_id] 208 | if len(instances_per_im) == 0: 209 | kept_instances.append(instances_per_im) 210 | continue 211 | 212 | unique_gt_inds = instances_per_im.gt_inds.unique() 213 | num_instances_per_gt = max(int(self.topk_proposals_per_im / len(unique_gt_inds)), 1) 214 | 215 | for gt_ind in unique_gt_inds: 216 | instances_per_gt = instances_per_im[instances_per_im.gt_inds == gt_ind] 217 | 218 | if len(instances_per_gt) > num_instances_per_gt: 219 | scores = instances_per_gt.logits_pred.sigmoid().max(dim=1)[0] 220 | ctrness_pred = instances_per_gt.ctrness_pred.sigmoid() 221 | inds = (scores * ctrness_pred).topk(k=num_instances_per_gt, dim=0)[1] 222 | instances_per_gt = instances_per_gt[inds] 223 | 224 | kept_instances.append(instances_per_gt) 225 | 226 | pred_instances = Instances.cat(kept_instances) 227 | 228 | pred_instances.mask_head_params = pred_instances.top_feats 229 | 230 | loss_mask = self.mask_head( 231 | mask_feats, self.mask_branch.out_stride, 232 | pred_instances, gt_instances 233 | ) 234 | 235 | return loss_mask 236 | 237 | def _forward_mask_heads_test(self, proposals, mask_feats): 238 | # prepare the inputs for mask heads 239 | for im_id, per_im in enumerate(proposals): 240 | per_im.im_inds = per_im.locations.new_ones(len(per_im), dtype=torch.long) * im_id 241 | pred_instances = Instances.cat(proposals) 242 | pred_instances.mask_head_params = pred_instances.top_feat 243 | 244 | pred_instances_w_masks = self.mask_head( 245 | mask_feats, self.mask_branch.out_stride, pred_instances 246 | ) 247 | 248 | return pred_instances_w_masks 249 | 250 | def add_bitmasks(self, instances, im_h, im_w): 251 | for per_im_gt_inst in instances: 252 | if not per_im_gt_inst.has("gt_masks"): 253 | continue 254 | start = int(self.mask_out_stride // 2) 255 | if isinstance(per_im_gt_inst.get("gt_masks"), PolygonMasks): 256 | polygons = per_im_gt_inst.get("gt_masks").polygons 257 | per_im_bitmasks = [] 258 | per_im_bitmasks_full = [] 259 | for per_polygons in polygons: 260 | bitmask = polygons_to_bitmask(per_polygons, im_h, im_w) 261 | bitmask = torch.from_numpy(bitmask).to(self.device).float() 262 | start = int(self.mask_out_stride // 2) 263 | bitmask_full = bitmask.clone() 264 | bitmask = bitmask[start::self.mask_out_stride, start::self.mask_out_stride] 265 | 266 | assert bitmask.size(0) * self.mask_out_stride == im_h 267 | assert bitmask.size(1) * self.mask_out_stride == im_w 268 | 269 | per_im_bitmasks.append(bitmask) 270 | per_im_bitmasks_full.append(bitmask_full) 271 | 272 | per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0) 273 | per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0) 274 | else: # RLE format bitmask 275 | bitmasks = per_im_gt_inst.get("gt_masks").tensor 276 | h, w = bitmasks.size()[1:] 277 | # pad to new size 278 | bitmasks_full = F.pad(bitmasks, (0, im_w - w, 0, im_h - h), "constant", 0) 279 | bitmasks = bitmasks_full[:, start::self.mask_out_stride, start::self.mask_out_stride] 280 | 
per_im_gt_inst.gt_bitmasks = bitmasks 281 | per_im_gt_inst.gt_bitmasks_full = bitmasks_full 282 | 283 | def add_bitmasks_from_boxes(self, instances, images, image_masks, im_h, im_w): 284 | stride = self.mask_out_stride 285 | start = int(stride // 2) 286 | 287 | assert images.size(2) % stride == 0 288 | assert images.size(3) % stride == 0 289 | 290 | downsampled_images = F.avg_pool2d( 291 | images.float(), kernel_size=stride, 292 | stride=stride, padding=0 293 | )[:, [2, 1, 0]] 294 | image_masks = image_masks[:, start::stride, start::stride] 295 | 296 | for im_i, per_im_gt_inst in enumerate(instances): 297 | images_lab = color.rgb2lab(downsampled_images[im_i].byte().permute(1, 2, 0).cpu().numpy()) 298 | images_lab = torch.as_tensor(images_lab, device=downsampled_images.device, dtype=torch.float32) 299 | images_lab = images_lab.permute(2, 0, 1)[None] 300 | images_color_similarity = get_images_color_similarity( 301 | images_lab, image_masks[im_i], 302 | self.pairwise_size, self.pairwise_dilation 303 | ) 304 | 305 | per_im_boxes = per_im_gt_inst.gt_boxes.tensor 306 | per_im_bitmasks = [] 307 | per_im_bitmasks_full = [] 308 | for per_box in per_im_boxes: 309 | bitmask_full = torch.zeros((im_h, im_w)).to(self.device).float() 310 | bitmask_full[int(per_box[1]):int(per_box[3] + 1), int(per_box[0]):int(per_box[2] + 1)] = 1.0 311 | 312 | bitmask = bitmask_full[start::stride, start::stride] 313 | 314 | assert bitmask.size(0) * stride == im_h 315 | assert bitmask.size(1) * stride == im_w 316 | 317 | per_im_bitmasks.append(bitmask) 318 | per_im_bitmasks_full.append(bitmask_full) 319 | 320 | per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0) 321 | per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0) 322 | per_im_gt_inst.image_color_similarity = torch.cat([ 323 | images_color_similarity for _ in range(len(per_im_gt_inst)) 324 | ], dim=0) 325 | 326 | def postprocess(self, results, output_height, output_width, padded_im_h, padded_im_w, mask_threshold=0.5): 327 | """ 328 | Resize the output instances. 329 | The input images are often resized when entering an object detector. 330 | As a result, we often need the outputs of the detector in a different 331 | resolution from its inputs. 332 | This function will resize the raw outputs of an R-CNN detector 333 | to produce outputs according to the desired output resolution. 334 | Args: 335 | results (Instances): the raw outputs from the detector. 336 | `results.image_size` contains the input image resolution the detector sees. 337 | This object might be modified in-place. 338 | output_height, output_width: the desired output resolution. 
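Note that `output_height`/`output_width` are the original image dimensions recorded in the input dict, while `padded_im_h`/`padded_im_w` are the network-input dimensions after padding to `size_divisibility`.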
339 | Returns: 340 | Instances: the resized output from the model, based on the output resolution 341 | """ 342 | scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0]) 343 | resized_im_h, resized_im_w = results.image_size 344 | results = Instances((output_height, output_width), **results.get_fields()) 345 | 346 | if results.has("pred_boxes"): 347 | output_boxes = results.pred_boxes 348 | elif results.has("proposal_boxes"): 349 | output_boxes = results.proposal_boxes 350 | 351 | output_boxes.scale(scale_x, scale_y) 352 | output_boxes.clip(results.image_size) 353 | 354 | results = results[output_boxes.nonempty()] 355 | 356 | if results.has("pred_global_masks"): 357 | mask_h, mask_w = results.pred_global_masks.size()[-2:] 358 | factor_h = padded_im_h // mask_h 359 | factor_w = padded_im_w // mask_w 360 | assert factor_h == factor_w 361 | factor = factor_h 362 | pred_global_masks = aligned_bilinear( 363 | results.pred_global_masks, factor 364 | ) 365 | pred_global_masks = pred_global_masks[:, :, :resized_im_h, :resized_im_w] 366 | pred_global_masks = F.interpolate( 367 | pred_global_masks, 368 | size=(output_height, output_width), 369 | mode="bilinear", align_corners=False 370 | ) 371 | pred_global_masks = pred_global_masks[:, 0, :, :] 372 | 373 | if self.point_sup_enabled: 374 | # filter out any mask prediction outside of predicted boxes (see PointSup) 375 | pred_boxes = results.pred_boxes.tensor 376 | for i in range(pred_global_masks.size(0)): 377 | kept_mask = torch.zeros_like(pred_global_masks[0]).to(pred_boxes.device) 378 | x0,y0,x1,y1 = int(pred_boxes[i][0]),int(pred_boxes[i][1]),int(pred_boxes[i][2]),int(pred_boxes[i][3]) 379 | kept_mask[y0:y1, x0:x1] = 1 380 | pred_global_masks[i] *= kept_mask 381 | 382 | results.pred_masks = (pred_global_masks > mask_threshold).float() 383 | 384 | return results 385 | -------------------------------------------------------------------------------- /src/condinst/Entropy/condinst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from skimage import color 4 | 5 | import os 6 | import torch 7 | from torch import nn 8 | import torch.nn.functional as F 9 | 10 | from detectron2.structures import ImageList 11 | from detectron2.modeling.proposal_generator import build_proposal_generator 12 | from detectron2.modeling.backbone import build_backbone 13 | from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY 14 | from detectron2.structures.instances import Instances 15 | from detectron2.structures.masks import PolygonMasks, polygons_to_bitmask 16 | 17 | from .dynamic_mask_head import build_dynamic_mask_head 18 | from .mask_branch import build_mask_branch 19 | 20 | from adet.utils.comm import aligned_bilinear 21 | 22 | __all__ = ["CondInst"] 23 | 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | import random 28 | EPS = 1e-12 29 | def sampled_points_with_uncertainty(pred_score, ins_box):  # select the most uncertain in-box pixel as the next annotation point 30 | x = pred_score.mean(0)  # (H, W) mean sigmoid mask score over all proposals matched to this instance 31 | uncertainty = - x * torch.log(x + EPS) - (1 - x) * torch.log(1 - x + EPS)  # per-pixel binary entropy 32 | 33 | x0, y0, x1, y1 = ins_box.tolist() 34 | keep = torch.zeros_like(uncertainty).to(uncertainty.device) 35 | keep[y0:y1, x0:x1] = 1  # restrict candidates to the ground-truth box 36 | uncertainty *= keep 37 | 38 | points = (uncertainty == torch.max(uncertainty)).nonzero()  # all (y, x) positions attaining the maximum 39 | random_idx = random.randint(0, len(points)-1)  # break ties uniformly at random 40 | point = points[random_idx] 41 | return point 42 | 43 | def unfold_wo_center(x, kernel_size, dilation): 44 | assert x.dim() == 4 45 | assert kernel_size
% 2 == 1 46 | 47 | # using SAME padding 48 | padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 49 | unfolded_x = F.unfold( 50 | x, kernel_size=kernel_size, 51 | padding=padding, 52 | dilation=dilation 53 | ) 54 | 55 | unfolded_x = unfolded_x.reshape( 56 | x.size(0), x.size(1), -1, x.size(2), x.size(3) 57 | ) 58 | 59 | # remove the center pixels 60 | size = kernel_size ** 2 61 | unfolded_x = torch.cat(( 62 | unfolded_x[:, :, :size // 2], 63 | unfolded_x[:, :, size // 2 + 1:] 64 | ), dim=2) 65 | 66 | return unfolded_x 67 | 68 | 69 | def get_images_color_similarity(images, image_masks, kernel_size, dilation): 70 | assert images.dim() == 4 71 | assert images.size(0) == 1 72 | 73 | unfolded_images = unfold_wo_center( 74 | images, kernel_size=kernel_size, dilation=dilation 75 | ) 76 | 77 | diff = images[:, :, None] - unfolded_images 78 | similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) 79 | 80 | unfolded_weights = unfold_wo_center( 81 | image_masks[None, None], kernel_size=kernel_size, 82 | dilation=dilation 83 | ) 84 | unfolded_weights = torch.max(unfolded_weights, dim=1)[0] 85 | 86 | return similarity * unfolded_weights 87 | 88 | 89 | @META_ARCH_REGISTRY.register() 90 | class CondInst(nn.Module): 91 | """ 92 | Main class for CondInst architectures (see https://arxiv.org/abs/2003.05664). 93 | """ 94 | 95 | def __init__(self, cfg): 96 | super().__init__() 97 | self.device = torch.device(cfg.MODEL.DEVICE) 98 | 99 | self.backbone = build_backbone(cfg) 100 | self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) 101 | self.mask_head = build_dynamic_mask_head(cfg) 102 | self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape()) 103 | 104 | self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE 105 | 106 | self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS 107 | self.topk_proposals_per_im = cfg.MODEL.CONDINST.TOPK_PROPOSALS_PER_IM 108 | 109 | # boxinst configs 110 | self.boxinst_enabled = cfg.MODEL.BOXINST.ENABLED 111 | self.bottom_pixels_removed = cfg.MODEL.BOXINST.BOTTOM_PIXELS_REMOVED 112 | self.pairwise_size = cfg.MODEL.BOXINST.PAIRWISE.SIZE 113 | self.pairwise_dilation = cfg.MODEL.BOXINST.PAIRWISE.DILATION 114 | self.pairwise_color_thresh = cfg.MODEL.BOXINST.PAIRWISE.COLOR_THRESH 115 | 116 | # pointsup configs 117 | self.point_sup_enabled = cfg.INPUT.POINT_SUP 118 | 119 | # build top module 120 | in_channels = self.proposal_generator.in_channels_to_top_module 121 | 122 | self.controller = nn.Conv2d( 123 | in_channels, self.mask_head.num_gen_params, 124 | kernel_size=3, stride=1, padding=1 125 | ) 126 | torch.nn.init.normal_(self.controller.weight, std=0.01) 127 | torch.nn.init.constant_(self.controller.bias, 0) 128 | 129 | pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) 130 | pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) 131 | self.normalizer = lambda x: (x - pixel_mean) / pixel_std 132 | self.to(self.device) 133 | 134 | def forward(self, batched_inputs): 135 | original_images = [x["image"].to(self.device) for x in batched_inputs] 136 | 137 | # normalize images 138 | images_norm = [self.normalizer(x) for x in original_images] 139 | images_norm = ImageList.from_tensors(images_norm, self.backbone.size_divisibility) 140 | 141 | features = self.backbone(images_norm.tensor) 142 | 143 | if "instances" in batched_inputs[0]: 144 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 145 | 146 | if self.boxinst_enabled: 147 | original_image_masks = 
[torch.ones_like(x[0], dtype=torch.float32) for x in original_images] 148 | 149 | # mask out the bottom area where the COCO dataset probably has wrong annotations 150 | for i in range(len(original_image_masks)): 151 | im_h = batched_inputs[i]["height"] 152 | pixels_removed = int( 153 | self.bottom_pixels_removed * 154 | float(original_images[i].size(1)) / float(im_h) 155 | ) 156 | if pixels_removed > 0: 157 | original_image_masks[i][-pixels_removed:, :] = 0 158 | 159 | original_images = ImageList.from_tensors(original_images, self.backbone.size_divisibility) 160 | original_image_masks = ImageList.from_tensors( 161 | original_image_masks, self.backbone.size_divisibility, pad_value=0.0 162 | ) 163 | self.add_bitmasks_from_boxes( 164 | gt_instances, original_images.tensor, original_image_masks.tensor, 165 | original_images.tensor.size(-2), original_images.tensor.size(-1) 166 | ) 167 | else: 168 | gt_instances = None 169 | 170 | mask_feats, sem_losses = self.mask_branch(features, gt_instances) 171 | 172 | proposals, proposal_losses = self.proposal_generator( 173 | images_norm, features, gt_instances, self.controller 174 | ) 175 | 176 | if self.training: 177 | mask_losses, preds = self._forward_mask_heads_train(proposals, mask_feats, gt_instances) 178 | 179 | for per_im, per_im_gt_instances, pred in zip(batched_inputs, gt_instances, preds): 180 | if len(per_im_gt_instances) == 0 or len(pred) == 0: 181 | continue 182 | 183 | img_id = per_im['file_name'].split('/')[-1].split('.')[0] 184 | img_h, img_w = per_im['height'], per_im['width'] 185 | resized_h, resized_w = per_im_gt_instances.image_size 186 | if min(resized_h, resized_w) == 800: 187 | factor = min(resized_h, resized_w) * 1.0 / min(img_h, img_w) 188 | else: 189 | factor = max(resized_h, resized_w) * 1.0 / max(img_h, img_w) 190 | 191 | gt_inds, pred_global_masks = pred 192 | pred_global_masks = F.interpolate( 193 | pred_global_masks, 194 | size=(img_h, img_w), 195 | mode="bilinear", align_corners=False 196 | ) 197 | pred_global_masks = pred_global_masks[:, 0, :, :] # original image size 198 | 199 | all_points = [] 200 | for ins_idx in gt_inds.unique(): 201 | pred_score = pred_global_masks[gt_inds==ins_idx] 202 | ins_box = torch.floor(per_im_gt_instances.gt_boxes[ins_idx.item()].tensor[0] / factor).int() 203 | sampled_points = sampled_points_with_uncertainty(pred_score, ins_box) 204 | all_points.append((ins_idx.cpu().data, sampled_points.cpu().data)) 205 | torch.save(all_points, os.path.join(os.getenv('ROOT_PATH'), f'AdelaiDet/points/{img_id}.pt')) 206 | 207 | losses = {} 208 | losses.update(sem_losses) 209 | losses.update(proposal_losses) 210 | losses.update(mask_losses) 211 | return losses 212 | else: 213 | pred_instances_w_masks = self._forward_mask_heads_test(proposals, mask_feats) 214 | 215 | padded_im_h, padded_im_w = images_norm.tensor.size()[-2:] 216 | processed_results = [] 217 | for im_id, (input_per_image, image_size) in enumerate(zip(batched_inputs, images_norm.image_sizes)): 218 | height = input_per_image.get("height", image_size[0]) 219 | width = input_per_image.get("width", image_size[1]) 220 | 221 | instances_per_im = pred_instances_w_masks[pred_instances_w_masks.im_inds == im_id] 222 | instances_per_im = self.postprocess( 223 | instances_per_im, height, width, 224 | padded_im_h, padded_im_w 225 | ) 226 | 227 | processed_results.append({ 228 | "instances": instances_per_im 229 | }) 230 | 231 | return processed_results 232 | 233 | def _forward_mask_heads_train(self, proposals, mask_feats, gt_instances): 234 | # prepare the 
inputs for mask heads 235 | pred_instances = proposals["instances"] 236 | 237 | assert (self.max_proposals == -1) or (self.topk_proposals_per_im == -1), \ 238 | "MAX_PROPOSALS and TOPK_PROPOSALS_PER_IM cannot be used at the same time." 239 | if self.max_proposals != -1: 240 | if self.max_proposals < len(pred_instances): 241 | inds = torch.randperm(len(pred_instances), device=mask_feats.device).long() 242 | logger.info("clipping proposals from {} to {}".format( 243 | len(pred_instances), self.max_proposals 244 | )) 245 | pred_instances = pred_instances[inds[:self.max_proposals]] 246 | elif self.topk_proposals_per_im != -1: 247 | num_images = len(gt_instances) 248 | 249 | kept_instances = [] 250 | for im_id in range(num_images): 251 | instances_per_im = pred_instances[pred_instances.im_inds == im_id] 252 | if len(instances_per_im) == 0: 253 | kept_instances.append(instances_per_im) 254 | continue 255 | 256 | unique_gt_inds = instances_per_im.gt_inds.unique() 257 | num_instances_per_gt = max(int(self.topk_proposals_per_im / len(unique_gt_inds)), 1) 258 | 259 | for gt_ind in unique_gt_inds: 260 | instances_per_gt = instances_per_im[instances_per_im.gt_inds == gt_ind] 261 | 262 | if len(instances_per_gt) > num_instances_per_gt: 263 | scores = instances_per_gt.logits_pred.sigmoid().max(dim=1)[0] 264 | ctrness_pred = instances_per_gt.ctrness_pred.sigmoid() 265 | inds = (scores * ctrness_pred).topk(k=num_instances_per_gt, dim=0)[1] 266 | instances_per_gt = instances_per_gt[inds] 267 | 268 | kept_instances.append(instances_per_gt) 269 | 270 | pred_instances = Instances.cat(kept_instances) 271 | 272 | pred_instances.mask_head_params = pred_instances.top_feats 273 | 274 | loss_mask = self.mask_head( 275 | mask_feats, self.mask_branch.out_stride, 276 | pred_instances, gt_instances 277 | ) 278 | 279 | return loss_mask 280 | 281 | def _forward_mask_heads_test(self, proposals, mask_feats): 282 | # prepare the inputs for mask heads 283 | for im_id, per_im in enumerate(proposals): 284 | per_im.im_inds = per_im.locations.new_ones(len(per_im), dtype=torch.long) * im_id 285 | pred_instances = Instances.cat(proposals) 286 | pred_instances.mask_head_params = pred_instances.top_feat 287 | 288 | pred_instances_w_masks = self.mask_head( 289 | mask_feats, self.mask_branch.out_stride, pred_instances 290 | ) 291 | 292 | return pred_instances_w_masks 293 | 294 | def add_bitmasks(self, instances, im_h, im_w): 295 | for per_im_gt_inst in instances: 296 | if not per_im_gt_inst.has("gt_masks"): 297 | continue 298 | start = int(self.mask_out_stride // 2) 299 | if isinstance(per_im_gt_inst.get("gt_masks"), PolygonMasks): 300 | polygons = per_im_gt_inst.get("gt_masks").polygons 301 | per_im_bitmasks = [] 302 | per_im_bitmasks_full = [] 303 | for per_polygons in polygons: 304 | bitmask = polygons_to_bitmask(per_polygons, im_h, im_w) 305 | bitmask = torch.from_numpy(bitmask).to(self.device).float() 306 | start = int(self.mask_out_stride // 2) 307 | bitmask_full = bitmask.clone() 308 | bitmask = bitmask[start::self.mask_out_stride, start::self.mask_out_stride] 309 | 310 | assert bitmask.size(0) * self.mask_out_stride == im_h 311 | assert bitmask.size(1) * self.mask_out_stride == im_w 312 | 313 | per_im_bitmasks.append(bitmask) 314 | per_im_bitmasks_full.append(bitmask_full) 315 | 316 | per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0) 317 | per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0) 318 | else: # RLE format bitmask 319 | bitmasks = per_im_gt_inst.get("gt_masks").tensor 320 | h, w = 
bitmasks.size()[1:] 321 | # pad to new size 322 | bitmasks_full = F.pad(bitmasks, (0, im_w - w, 0, im_h - h), "constant", 0) 323 | bitmasks = bitmasks_full[:, start::self.mask_out_stride, start::self.mask_out_stride] 324 | per_im_gt_inst.gt_bitmasks = bitmasks 325 | per_im_gt_inst.gt_bitmasks_full = bitmasks_full 326 | 327 | def add_bitmasks_from_boxes(self, instances, images, image_masks, im_h, im_w): 328 | stride = self.mask_out_stride 329 | start = int(stride // 2) 330 | 331 | assert images.size(2) % stride == 0 332 | assert images.size(3) % stride == 0 333 | 334 | downsampled_images = F.avg_pool2d( 335 | images.float(), kernel_size=stride, 336 | stride=stride, padding=0 337 | )[:, [2, 1, 0]] 338 | image_masks = image_masks[:, start::stride, start::stride] 339 | 340 | for im_i, per_im_gt_inst in enumerate(instances): 341 | images_lab = color.rgb2lab(downsampled_images[im_i].byte().permute(1, 2, 0).cpu().numpy()) 342 | images_lab = torch.as_tensor(images_lab, device=downsampled_images.device, dtype=torch.float32) 343 | images_lab = images_lab.permute(2, 0, 1)[None] 344 | images_color_similarity = get_images_color_similarity( 345 | images_lab, image_masks[im_i], 346 | self.pairwise_size, self.pairwise_dilation 347 | ) 348 | 349 | per_im_boxes = per_im_gt_inst.gt_boxes.tensor 350 | per_im_bitmasks = [] 351 | per_im_bitmasks_full = [] 352 | for per_box in per_im_boxes: 353 | bitmask_full = torch.zeros((im_h, im_w)).to(self.device).float() 354 | bitmask_full[int(per_box[1]):int(per_box[3] + 1), int(per_box[0]):int(per_box[2] + 1)] = 1.0 355 | 356 | bitmask = bitmask_full[start::stride, start::stride] 357 | 358 | assert bitmask.size(0) * stride == im_h 359 | assert bitmask.size(1) * stride == im_w 360 | 361 | per_im_bitmasks.append(bitmask) 362 | per_im_bitmasks_full.append(bitmask_full) 363 | 364 | per_im_gt_inst.gt_bitmasks = torch.stack(per_im_bitmasks, dim=0) 365 | per_im_gt_inst.gt_bitmasks_full = torch.stack(per_im_bitmasks_full, dim=0) 366 | per_im_gt_inst.image_color_similarity = torch.cat([ 367 | images_color_similarity for _ in range(len(per_im_gt_inst)) 368 | ], dim=0) 369 | 370 | def postprocess(self, results, output_height, output_width, padded_im_h, padded_im_w, mask_threshold=0.5): 371 | """ 372 | Resize the output instances. 373 | The input images are often resized when entering an object detector. 374 | As a result, we often need the outputs of the detector in a different 375 | resolution from its inputs. 376 | This function will resize the raw outputs of an R-CNN detector 377 | to produce outputs according to the desired output resolution. 378 | Args: 379 | results (Instances): the raw outputs from the detector. 380 | `results.image_size` contains the input image resolution the detector sees. 381 | This object might be modified in-place. 382 | output_height, output_width: the desired output resolution. 
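This method is identical to the standard CondInst `postprocess`; the entropy-based point selection specific to this variant happens in `forward()` during training, before any post-processing.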
383 | Returns: 384 | Instances: the resized output from the model, based on the output resolution 385 | """ 386 | scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0]) 387 | resized_im_h, resized_im_w = results.image_size 388 | results = Instances((output_height, output_width), **results.get_fields()) 389 | 390 | if results.has("pred_boxes"): 391 | output_boxes = results.pred_boxes 392 | elif results.has("proposal_boxes"): 393 | output_boxes = results.proposal_boxes 394 | 395 | output_boxes.scale(scale_x, scale_y) 396 | output_boxes.clip(results.image_size) 397 | 398 | results = results[output_boxes.nonempty()] 399 | 400 | if results.has("pred_global_masks"): 401 | mask_h, mask_w = results.pred_global_masks.size()[-2:] 402 | factor_h = padded_im_h // mask_h 403 | factor_w = padded_im_w // mask_w 404 | assert factor_h == factor_w 405 | factor = factor_h 406 | pred_global_masks = aligned_bilinear( 407 | results.pred_global_masks, factor 408 | ) 409 | pred_global_masks = pred_global_masks[:, :, :resized_im_h, :resized_im_w] 410 | pred_global_masks = F.interpolate( 411 | pred_global_masks, 412 | size=(output_height, output_width), 413 | mode="bilinear", align_corners=False 414 | ) 415 | pred_global_masks = pred_global_masks[:, 0, :, :] 416 | 417 | if self.point_sup_enabled: 418 | # filter out any mask prediction outside of predicted boxes (see PointSup) 419 | pred_boxes = results.pred_boxes.tensor 420 | for i in range(pred_global_masks.size(0)): 421 | kept_mask = torch.zeros_like(pred_global_masks[0]).to(pred_boxes.device) 422 | x0,y0,x1,y1 = int(pred_boxes[i][0]),int(pred_boxes[i][1]),int(pred_boxes[i][2]),int(pred_boxes[i][3]) 423 | kept_mask[y0:y1, x0:x1] = 1 424 | pred_global_masks[i] *= kept_mask 425 | 426 | results.pred_masks = (pred_global_masks > mask_threshold).float() 427 | 428 | return results 429 | --------------------------------------------------------------------------------
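The Entropy variant above saves one list of `(instance_index, point)` pairs per training image to `AdelaiDet/points/{img_id}.pt`; note that `torch.save` does not create the `points/` directory, so it must exist beforehand. As a self-contained illustration of the selection rule, the sketch below re-implements the per-instance step under the assumption that `pred_score` holds sigmoid mask scores for the proposals matched to one instance; the helper name `pick_uncertain_point` is ours, not part of the repository.

import random
import torch

EPS = 1e-12

def pick_uncertain_point(pred_score: torch.Tensor, box_xyxy: torch.Tensor) -> torch.Tensor:
    """Return the (y, x) pixel with maximal predictive entropy inside a box."""
    x = pred_score.mean(0)                                   # (H, W) mean sigmoid score
    entropy = -x * torch.log(x + EPS) - (1 - x) * torch.log(1 - x + EPS)
    x0, y0, x1, y1 = box_xyxy.int().tolist()
    keep = torch.zeros_like(entropy)
    keep[y0:y1, x0:x1] = 1                                   # candidates limited to the GT box
    entropy = entropy * keep
    candidates = (entropy == entropy.max()).nonzero()        # all positions tied for the maximum
    return candidates[random.randrange(len(candidates))]     # break ties uniformly

# toy usage: two proposal masks for one instance on a 4x4 image
scores = torch.tensor([[[0.9, 0.5, 0.1, 0.0]] * 4, [[0.8, 0.6, 0.2, 0.0]] * 4], dtype=torch.float32)
print(pick_uncertain_point(scores, torch.tensor([0, 0, 4, 4])))  # a pixel in the second column, whose mean score 0.55 is closest to 0.5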