├── .gitignore ├── LICENSE ├── README.md ├── coco_rem ├── coco_evaluator.py ├── configs │ ├── README.md │ ├── common │ │ ├── coco_schedule.py │ │ ├── data │ │ │ ├── coco.py │ │ │ └── constants.py │ │ ├── models │ │ │ ├── cascade_rcnn.py │ │ │ ├── mask2former.py │ │ │ ├── mask_rcnn_fpn.py │ │ │ └── mask_rcnn_vitdet.py │ │ ├── optim.py │ │ └── train.py │ ├── convnext │ │ ├── cascade_mask_rcnn_convnext_base_1k_3x.py │ │ ├── cascade_mask_rcnn_convnext_base_22k_3x.py │ │ ├── cascade_mask_rcnn_convnext_large_22k_3x.py │ │ ├── cascade_mask_rcnn_convnext_small_1k_3x.py │ │ ├── cascade_mask_rcnn_convnext_tiny_1k_3x.py │ │ ├── cascade_mask_rcnn_convnext_xlarge_22k_3x.py │ │ └── mask_rcnn_convnext_tiny_1k_3x.py │ ├── d2lsj │ │ ├── mask_rcnn_R_101_FPN_100ep.py │ │ ├── mask_rcnn_R_101_FPN_200ep.py │ │ ├── mask_rcnn_R_101_FPN_400ep.py │ │ ├── mask_rcnn_R_50_FPN_100ep.py │ │ ├── mask_rcnn_R_50_FPN_200ep.py │ │ ├── mask_rcnn_R_50_FPN_400ep.py │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_100ep.py │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_200ep.py │ │ ├── mask_rcnn_regnetx_4gf_dds_FPN_400ep.py │ │ ├── mask_rcnn_regnety_4gf_dds_FPN_100ep.py │ │ ├── mask_rcnn_regnety_4gf_dds_FPN_200ep.py │ │ └── mask_rcnn_regnety_4gf_dds_FPN_400ep.py │ ├── d2main │ │ ├── cascade_mask_rcnn_R_50_FPN_3x.py │ │ ├── mask_rcnn_R_50_FPN_3x.py │ │ ├── scratch_mask_rcnn_R_50_FPN_9x_gn.py │ │ └── scratch_mask_rcnn_R_50_FPN_9x_syncbn.py │ ├── mask2former │ │ ├── maskformer2_R101_bs16_50ep.py │ │ ├── maskformer2_R50_bs16_50ep.py │ │ ├── maskformer2_swin_base_384_bs16_50ep.py │ │ ├── maskformer2_swin_base_IN21k_384_bs16_50ep.py │ │ ├── maskformer2_swin_large_IN21k_384_bs16_100ep.py │ │ ├── maskformer2_swin_small_bs16_50ep.py │ │ └── maskformer2_swin_tiny_bs16_50ep.py │ ├── mvitv2 │ │ ├── cascade_mask_rcnn_mvitv2_b_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_b_in21k_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_s_3x.py │ │ ├── cascade_mask_rcnn_mvitv2_t_3x.py │ │ └── mask_rcnn_mvitv2_t_3x.py │ └── vitdet │ │ ├── cascade_mask_rcnn_mvitv2_b_in21k_100ep.py │ │ ├── cascade_mask_rcnn_mvitv2_h_in21k_36ep.py │ │ ├── cascade_mask_rcnn_mvitv2_l_in21k_50ep.py │ │ ├── cascade_mask_rcnn_swin_b_in21k_50ep.py │ │ ├── cascade_mask_rcnn_swin_l_in21k_50ep.py │ │ ├── cascade_mask_rcnn_vitdet_b_100ep.py │ │ ├── cascade_mask_rcnn_vitdet_h_75ep.py │ │ ├── cascade_mask_rcnn_vitdet_l_100ep.py │ │ ├── mask_rcnn_vitdet_b_100ep.py │ │ ├── mask_rcnn_vitdet_h_75ep.py │ │ └── mask_rcnn_vitdet_l_100ep.py ├── data │ ├── builtin.py │ ├── lvis.py │ └── manual_rem.py ├── mask_visualizer.py ├── modeling │ ├── convnext.py │ ├── rcnn_refiner.py │ └── sam_refiner.py └── trainer.py ├── images ├── coco_rem_example_1.jpg └── coco_rem_example_2.jpg ├── requirements.txt ├── scripts ├── correct_labeling_errors.py ├── merge_instances.py ├── refine_boundaries.py ├── train_net.py └── visualize_coco.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.diff 2 | 3 | # compilation and distribution 4 | __pycache__ 5 | _ext 6 | *.pyc 7 | *.pyd 8 | *.so 9 | *.dll 10 | *.egg-info/ 11 | build/ 12 | dist/ 13 | wheels/ 14 | 15 | # Python virtual environments. 
16 | .env 17 | .venv 18 | env/ 19 | venv/ 20 | ENV/ 21 | env.bak/ 22 | venv.bak/ 23 | 24 | # Jupyter Notebook 25 | .ipynb_checkpoints 26 | /.virtual_documents 27 | 28 | # IPython 29 | profile_default/ 30 | ipython_config.py 31 | 32 | 33 | # pytorch/python/numpy formats 34 | *.pth 35 | *.pkl 36 | *.npy 37 | *.pt 38 | 39 | # Editor temporaries 40 | *.swn 41 | *.swo 42 | *.swp 43 | *~ 44 | 45 | # editor settings 46 | .idea 47 | .vscode 48 | _darcs 49 | pyrightconfig.json 50 | 51 | # project dirs 52 | datasets 53 | checkpoints 54 | output 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024, Karan Desai. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial 10 | portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 15 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # COCO-ReM (COCO with Refined Masks) 2 | 3 | [![Framework: PyTorch](https://img.shields.io/badge/Framework-PyTorch-orange.svg)](https://pytorch.org) [![HuggingFace Datasets](https://img.shields.io/badge/%F0%9F%A4%97-HuggingFace_Datasets-cyan.svg 4 | )](https://huggingface.co/datasets/kdexd/coco-rem) 5 | 6 | [Shweta Singh](https://www.linkedin.com/in/shweta-singh-460154284/), [Aayan Yadav](https://www.linkedin.com/in/aayanyadav09/), [Jitesh Jain](https://praeclarumjj3.github.io/), [Humphrey Shi](https://www.humphreyshi.com/home), [Justin Johnson](https://web.eecs.umich.edu/~justincj/), [Karan Desai](https://kdexd.xyz/) 7 | 8 | Equal Contribution 9 | 10 | [[`arxiv`](https://arxiv.org/abs/2403.18819)] [[`Dataset Website`](https://cocorem.xyz)] 11 | 12 | ![Random examples from COCO-ReM](./images/coco_rem_example_2.jpg) 13 | 14 | Introducing COCO-ReM, a set of high-quality instance annotations for COCO images. 15 | COCO-ReM improves on imperfections prevailing in COCO-2017 such as coarse mask boundaries, non-exhaustive annotations, 16 | inconsistent handling of occlusions, and duplicate masks. 17 | Masks in COCO-ReM have a visibly better quality than COCO-2017, as shown below. 18 | 19 | ![COCO and COCO-ReM](./images/coco_rem_example_1.jpg) 20 | 21 | ## Contents 22 | 23 | 1. [News](#news) 24 | 2. [Setup Instructions](#setup-instructions) 25 | 3. [Download COCO-ReM](#download-coco-rem) 26 | 4. [Mask Visualization](#mask-visualization) 27 | 5. [Evaluation using COCO-ReM](#evaluation-using-coco-rem) 28 | 6. 
[Training with COCO-ReM](#training-with-coco-rem) 29 | 7. [Annotation Pipeline](#annotation-pipeline) 30 | - [Stage 1: Mask Boundary Refinement (automatic step)](#stage-1-mask-boundary-refinement) 31 | - [Stage 2: Exhaustive Instance Annotation (automatic step)](#stage-2-exhaustive-instance-annotation) 32 | - [Stage 3: Correction of Labeling Errors](#stage-3-correction-of-labeling-errors) 33 | 8. [Citation](#citation) 34 | 35 | ## News 36 | 37 | - **[July 7, 2024]**: Dataset now available on [**HuggingFace**](https://huggingface.co/datasets/kdexd/coco-rem) and [**code**](https://github.com/kdexd/coco-rem) is public! 38 | - **[July 1, 2024]**: COCO-ReM is accepted to ECCV 2024! 39 | - **[March 27, 2024]**: [**Dataset website**](https://cocorem.xyz) and [**arXiv preprint**](https://arxiv.org/abs/2403.18819) are public! 40 | 41 | ## Setup Instructions 42 | 43 | Clone the repository, create a conda environment, and install all dependencies as follows: 44 | 45 | ```bash 46 | git clone https://github.com/kdexd/coco-rem.git && cd coco-rem 47 | conda create -n coco_rem python=3.10 48 | conda activate coco_rem 49 | ``` 50 | 51 | Install PyTorch and `torchvision` following the instructions on [pytorch.org](https://pytorch.org). 52 | Install Detectron2; [instructions are available here](https://github.com/facebookresearch/detectron2/blob/main/INSTALL.md). 53 | Then, install the dependencies: 54 | 55 | ```bash 56 | pip install -r requirements.txt 57 | pip install git+https://github.com/facebookresearch/segment-anything.git 58 | pip install git+https://github.com/bowenc0221/boundary-iou-api.git 59 | 60 | python setup.py develop 61 | ``` 62 | 63 | ## Download COCO-ReM 64 | 65 | COCO-ReM is hosted on Huggingface Datasets at [@kdexd/coco-rem](https://huggingface.co/datasets/kdexd/coco-rem). 66 | Download the annotation files: 67 | 68 | ```bash 69 | for name in trainrem valrem; do 70 | wget https://huggingface.co/datasets/kdexd/coco-rem/resolve/main/instances_$name.json.zip 71 | unzip instances_$name.json.zip 72 | done 73 | ``` 74 | 75 | **Dataset organization:** COCO and COCO-ReM must be organized inside the `datasets` directory as follows. 76 | 77 | ``` 78 | $PROJECT_ROOT/datasets 79 | - coco/ 80 | - train2017/ # Contains 118287 train images (.jpg files). 81 | - val2017/ # Contains 5000 val images (.jpg files). 82 | - annotations/ 83 | - instances_train2017.json 84 | - instances_val2017.json 85 | - coco_rem/ 86 | - instances_trainrem.json 87 | - instances_valrem.json 88 | - lvis/ 89 | - lvis_v1_val.json 90 | - lvis_v1_train.json 91 | ``` 92 | 93 | ----- 94 | 95 | ## Mask Visualization 96 | 97 | We include a lightweight script to quickly visualize masks of COCO-ReM and COCO-2017, 98 | for both the validation and training sets. For example, run the following command to visualize 99 | the masks of the COCO-ReM validation set: 100 | 101 | ```bash 102 | python scripts/visualize_coco.py \ 103 | --input-json datasets/coco_rem/instances_valrem.json \ 104 | --image-dir datasets/coco/val2017 \ 105 | --output visualization_output 106 | ``` 107 | 108 | Read the documentation (`python scripts/visualize_coco.py --help`) for details about other arguments. 109 | 110 | ----- 111 | 112 | ## Evaluation using COCO-ReM 113 | 114 | We support evaluating all fifty object detectors benchmarked in the paper. 115 | First, run `python checkpoints/download.py` to download all the pre-trained models 116 | from their official repositories and save them in `checkpoints/pretrained_weights`. 
117 | 118 | For example, to evaluate a [Mask R-CNN ViTDet-B model](https://arxiv.org/abs/2203.16527) using 8 GPUs 119 | and calculate average precision (AP) metrics, run the following command: 120 | 121 | ```bash 122 | python scripts/train_net.py --num-gpus 8 --eval-only \ 123 | --config coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py \ 124 | train.init_checkpoint=checkpoints/pretrained_weights/vitdet/mask_rcnn_vitdet_b_100ep.pkl \ 125 | dataloader.test.dataset.names=coco_rem_val \ 126 | train.output_dir=evaluation_results 127 | ``` 128 | 129 | ## Training with COCO-ReM 130 | 131 | We also support training ViTDet baselines on COCO-ReM using the Detectron2 library. 132 | Run the following command to train using 8 GPUs (each with at least 32GB memory): 133 | 134 | ```bash 135 | python scripts/train_net.py --num-gpus 8 \ 136 | --config coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py \ 137 | dataloader.train.dataset.names=coco_rem_train \ 138 | dataloader.test.dataset.names=coco_rem_val \ 139 | train.output_dir=training_output \ 140 | dataloader.train.total_batch_size=16 train.grad_accum_steps=4 141 | ``` 142 | 143 | For GPUs with less memory, update the parameters in the last line above: 144 | halve the batch size and double the gradient accumulation steps to obtain the same results. 145 | 146 | ## Annotation Pipeline 147 | 148 | 149 | ### Stage 1: Mask Boundary Refinement (automatic step) 150 | 151 | Download the SAM checkpoint from the [segment-anything repository](https://github.com/facebookresearch/segment-anything) and place it in the `checkpoint` folder. 152 | 153 | Run the following command to refine the boundaries of validation set masks using 8 GPUs: 154 | 155 | ```bash 156 | python scripts/refine_boundaries.py \ 157 | --input-json datasets/coco/annotations/instances_val2017.json \ 158 | --image-dir datasets/coco/val2017 \ 159 | --num-gpus 8 \ 160 | --output datasets/intermediate/cocoval_boundary_refined.json 161 | ``` 162 | 163 | Read the documentation (`python scripts/refine_boundaries.py --help`) for details about other arguments. 164 | 165 | Use the default values for the other optional arguments to follow the strategy used in the [paper](https://arxiv.org/abs/2403.18819). 166 | 167 | Run this stage for both the COCO and LVIS datasets before the merging stage. 168 | 169 | 170 | 171 | ### Stage 2: Exhaustive Instance Annotation (automatic step) 172 | 173 | Run the following command to merge LVIS annotations for the COCO validation set using the strategy described in the [paper](https://arxiv.org/abs/2403.18819): 174 | 175 | ```bash 176 | python scripts/merge_instances.py \ 177 | --coco-json datasets/intermediate/cocoval_boundary_refined.json \ 178 | --lvis-json datasets/intermediate/lvistrain_boundary_refined.json datasets/intermediate/lvisval_boundary_refined.json \ 179 | --split val \ 180 | --output datasets/intermediate/cocoval_lvis_merged.json 181 | ``` 182 | Read the documentation (`python scripts/merge_instances.py --help`) for details about the above arguments. 183 | 184 | Merging LVIS instances for the handpicked non-exhaustive `(image, category)` pairs in the validation set is done in the script of the next stage. 185 | 186 | 187 | 188 | ### Stage 3: Correction of Labeling Errors 189 | 190 | This stage is performed only for the validation set. 
191 | 192 | ``` 193 | python scripts/correct_labeling_errors.py \ 194 | --input datasets/intermediate/cocoval_lvis_merged.json \ 195 | --output datasets/cocoval_refined.json 196 | ``` 197 | **Note**: For the above json to be COCO-ReM we also have to perform the manual parts of Stage 1 and Stage 2. 198 | 199 | ## Citation 200 | 201 | If you found COCO-ReM useful in your research, please consider starring ⭐ us on GitHub and citing 📚 us in your research! 202 | 203 | ```bibtex 204 | @inproceedings{cocorem, 205 | title={Benchmarking Object Detectors with COCO: A New Path Forward}, 206 | author={Singh, Shweta and Yadav, Aayan and Jain, Jitesh and Shi, Humphrey and Johnson, Justin and Desai, Karan}, 207 | journal={ECCV}, 208 | year={2024} 209 | } 210 | ``` 211 | -------------------------------------------------------------------------------- /coco_rem/coco_evaluator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import annotations 3 | 4 | import contextlib 5 | import copy 6 | import io 7 | import itertools 8 | import json 9 | import logging 10 | import os 11 | from collections import OrderedDict 12 | 13 | import detectron2.utils.comm as comm 14 | import numpy as np 15 | import pycocotools.mask as mask_util 16 | import torch 17 | from boundary_iou.coco_instance_api.coco import COCO 18 | from boundary_iou.coco_instance_api.cocoeval import COCOeval 19 | from detectron2.data import MetadataCatalog 20 | from detectron2.evaluation.evaluator import DatasetEvaluator 21 | from detectron2.structures import BoxMode, Instances 22 | from detectron2.utils.file_io import PathManager 23 | from detectron2.utils.logger import create_small_table 24 | from tabulate import tabulate 25 | 26 | 27 | class COCOReMEvaluator(DatasetEvaluator): 28 | """ 29 | Evaluate AP for COCO instance segmentation. The metrics range from 0 to 100 30 | (instead of 0 to 1), where a -1 or NaN means the metric cannot be computed 31 | (e.g. due to no predictions made). 32 | 33 | See http://cocodataset.org/#detection-eval 34 | 35 | This implementation is functionally same as the original COCO evaluator of 36 | Detectron2 (:class:`detectron2.evaluation.COCOEvaluator`) except a few API 37 | and behavioral differences: 38 | 39 | 1. Only `Mask AP` and `Boundary AP` are supported, other metrics like `Box AP` 40 | and `Keypoint AP` are neither supported, nor calculated. 41 | 42 | 2. Max detections per image are always `[1, 10, 100]` following official COCO 43 | evaluation protocol, these are not customizable. 44 | 45 | 3. The official COCO evaluation API is used for calculating metrics, unlike 46 | Detectron2 that also allows using a fast, yet unofficial implementation. 47 | Hence, the calculated AP is suitable to report in research papers. 48 | """ 49 | 50 | def __init__(self, dataset_name: str, distributed: bool = True, output_dir=None): 51 | """ 52 | Args: 53 | dataset_name: Name of the dataset to be evaluated. It must have either 54 | registered metadata with a field named `json_file` which is a path 55 | to the COCO format annotation file. 56 | distributed: If True, will collect results from all ranks and run 57 | evaluation in the main process. Otherwise, will only evaluate 58 | the results in the current process. 59 | output_dir: An optional path to output directory where all results 60 | will be dumped as two files: 61 | 62 | 1. 
"instances_predictions.pth" a file that can be loaded with 63 | `torch.load` and contains all the results in the format they 64 | are produced by the model. 65 | 2. "coco_instances_results.json" in COCO result format. 66 | """ 67 | self._logger = logging.getLogger(__name__) 68 | self._distributed = distributed 69 | self._output_dir = output_dir 70 | self._cpu_device = torch.device("cpu") 71 | 72 | self._metadata = MetadataCatalog.get(dataset_name) 73 | json_file = PathManager.get_local_path(self._metadata.json_file) 74 | 75 | with contextlib.redirect_stdout(io.StringIO()): 76 | self._coco_api = COCO(json_file) 77 | 78 | def reset(self): 79 | self._predictions = [] 80 | 81 | def process(self, inputs, outputs): 82 | """ 83 | Args: 84 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 85 | It is a list of dict. Each dict corresponds to an image and 86 | contains keys like "height", "width", "file_name", "image_id". 87 | outputs: the outputs of a COCO model. It is a list of dicts with key 88 | "instances" that contains :class:`Instances`. 89 | """ 90 | for input, output in zip(inputs, outputs): 91 | prediction = {"image_id": input["image_id"]} 92 | 93 | if "instances" in output: 94 | instances = output["instances"].to(self._cpu_device) 95 | prediction["instances"] = instances_to_coco_json( 96 | instances, input["image_id"] 97 | ) 98 | if "proposals" in output: 99 | prediction["proposals"] = output["proposals"].to(self._cpu_device) 100 | if len(prediction) > 1: 101 | self._predictions.append(prediction) 102 | 103 | def evaluate(self, img_ids=None): 104 | """ 105 | Args: 106 | img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset 107 | """ 108 | if self._distributed: 109 | comm.synchronize() 110 | predictions = comm.gather(self._predictions, dst=0) 111 | predictions = list(itertools.chain(*predictions)) 112 | 113 | if not comm.is_main_process(): 114 | return {} 115 | else: 116 | predictions = self._predictions 117 | 118 | if len(predictions) == 0: 119 | self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") 120 | return {} 121 | 122 | if self._output_dir: 123 | PathManager.mkdirs(self._output_dir) 124 | file_path = os.path.join(self._output_dir, "instances_predictions.pth") 125 | with PathManager.open(file_path, "wb") as f: 126 | torch.save(predictions, f) 127 | 128 | self._results = OrderedDict() 129 | if "instances" in predictions[0]: 130 | self._eval_predictions(predictions, img_ids=img_ids) 131 | # Copy so the caller can do whatever with results 132 | return copy.deepcopy(self._results) 133 | 134 | def _eval_predictions(self, predictions, img_ids=None): 135 | """ 136 | Evaluate predictions. Fill self._results with the metrics of the tasks. 
137 | """ 138 | self._logger.info("Preparing results for COCO format ...") 139 | coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) 140 | 141 | # unmap the category ids for COCO 142 | if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): 143 | dataset_id_to_contiguous_id = ( 144 | self._metadata.thing_dataset_id_to_contiguous_id 145 | ) 146 | all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) 147 | num_classes = len(all_contiguous_ids) 148 | assert ( 149 | min(all_contiguous_ids) == 0 150 | and max(all_contiguous_ids) == num_classes - 1 151 | ) 152 | 153 | reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} 154 | for result in coco_results: 155 | category_id = result["category_id"] 156 | assert category_id < num_classes, ( 157 | f"A prediction has class={category_id}, " 158 | f"but the dataset only has {num_classes} classes and " 159 | f"predicted class id should be in [0, {num_classes - 1}]." 160 | ) 161 | result["category_id"] = reverse_id_mapping[category_id] 162 | 163 | if self._output_dir: 164 | file_path = os.path.join(self._output_dir, "coco_instances_results.json") 165 | self._logger.info("Saving results to {}".format(file_path)) 166 | with PathManager.open(file_path, "w") as f: 167 | f.write(json.dumps(coco_results)) 168 | f.flush() 169 | 170 | self._logger.info("Evaluating predictions with official COCO API...") 171 | 172 | for task in ["segm", "boundary"]: 173 | coco_eval = ( 174 | _evaluate_predictions_on_coco( 175 | self._coco_api, coco_results, task, img_ids=img_ids 176 | ) 177 | if len(coco_results) > 0 178 | else None # cocoapi does not handle empty results very well 179 | ) 180 | 181 | res = self._derive_coco_results( 182 | coco_eval, task, class_names=self._metadata.get("thing_classes") 183 | ) 184 | self._results[task] = res 185 | 186 | def _derive_coco_results(self, coco_eval, iou_type, class_names=None): 187 | """ 188 | Derive the desired score numbers from summarized COCOeval. 
189 | """ 190 | 191 | metrics = [ 192 | "AP", 193 | "AP50", 194 | "AP75", 195 | "AP80", 196 | "AP85", 197 | "AP90", 198 | "AP95", 199 | "APs", 200 | "APm", 201 | "APl", 202 | ] 203 | if coco_eval is None: 204 | self._logger.warn("No predictions from the model!") 205 | return {metric: float("nan") for metric in metrics} 206 | 207 | # the standard metrics 208 | results = { 209 | metric: float( 210 | coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan" 211 | ) 212 | for idx, metric in enumerate(metrics) 213 | } 214 | self._logger.info( 215 | "Evaluation results for {}: \n".format(iou_type) 216 | + create_small_table(results) 217 | ) 218 | if not np.isfinite(sum(results.values())): 219 | self._logger.info("Some metrics cannot be computed and is shown as NaN.") 220 | 221 | if class_names is None or len(class_names) <= 1: 222 | return results 223 | 224 | # Compute per-category AP 225 | precisions = coco_eval.eval["precision"] 226 | # precision has dims (iou, recall, cls, area range, max dets) 227 | assert len(class_names) == precisions.shape[2] 228 | 229 | results_per_category = [] 230 | for idx, name in enumerate(class_names): 231 | # area range index 0: all area ranges 232 | # max dets index -1: typically 100 per image 233 | precision = precisions[:, :, idx, 0, -1] 234 | precision = precision[precision > -1] 235 | ap = np.mean(precision) if precision.size else float("nan") 236 | results_per_category.append(("{}".format(name), float(ap * 100))) 237 | 238 | # tabulate it 239 | N_COLS = min(6, len(results_per_category) * 2) 240 | results_flatten = list(itertools.chain(*results_per_category)) 241 | results_2d = itertools.zip_longest( 242 | *[results_flatten[i::N_COLS] for i in range(N_COLS)] 243 | ) 244 | table = tabulate( 245 | results_2d, 246 | tablefmt="pipe", 247 | floatfmt=".3f", 248 | headers=["category", "AP"] * (N_COLS // 2), 249 | numalign="left", 250 | ) 251 | self._logger.info("Per-category {} AP: \n".format(iou_type) + table) 252 | 253 | results.update({"AP-" + name: ap for name, ap in results_per_category}) 254 | return results 255 | 256 | 257 | def instances_to_coco_json(instances: Instances, img_id: int) -> list[dict]: 258 | """ 259 | Dump an "Instances" object to a COCO-format json that's used for evaluation. 260 | """ 261 | num_instance = len(instances) 262 | if num_instance == 0: 263 | return [] 264 | 265 | boxes = instances.pred_boxes.tensor.numpy() 266 | boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) 267 | boxes = boxes.tolist() 268 | scores = instances.scores.tolist() 269 | classes = instances.pred_classes.tolist() 270 | 271 | has_mask = instances.has("pred_masks") 272 | if has_mask: 273 | # use RLE to encode the masks, because they are too large and takes memory 274 | # since this evaluator stores outputs of the entire dataset 275 | rles = [ 276 | mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] 277 | for mask in instances.pred_masks 278 | ] 279 | for rle in rles: 280 | # "counts" is an array encoded by mask_util as a byte-stream. Python3's 281 | # json writer which always produces strings cannot serialize a bytestream 282 | # unless you decode it. Thankfully, utf-8 works out (which is also what 283 | # the pycocotools/_mask.pyx does). 
284 | rle["counts"] = rle["counts"].decode("utf-8") 285 | 286 | results = [] 287 | for k in range(num_instance): 288 | result = { 289 | "image_id": img_id, 290 | "category_id": classes[k], 291 | "bbox": boxes[k], 292 | "score": scores[k], 293 | } 294 | if has_mask: 295 | result["segmentation"] = rles[k] 296 | results.append(result) 297 | return results 298 | 299 | 300 | class COCOevalHighIoU(COCOeval): 301 | def summarize(self): 302 | """ 303 | Compute and display summary metrics for evaluation results including AP 304 | with higher IOU thresholds (0.9 and 0.95). 305 | """ 306 | 307 | def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): 308 | p = self.params 309 | p.iouThrs = np.array( 310 | [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] 311 | ) 312 | 313 | iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" 314 | titleStr = "Average Precision" if ap == 1 else "Average Recall" 315 | typeStr = "(AP)" if ap == 1 else "(AR)" 316 | iouStr = ( 317 | "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) 318 | if iouThr is None 319 | else "{:0.2f}".format(iouThr) 320 | ) 321 | 322 | aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] 323 | mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] 324 | if ap == 1: 325 | # dimension of precision: [TxRxKxAxM] 326 | s = self.eval["precision"] 327 | # IoU 328 | if iouThr is not None: 329 | t = np.where(iouThr == p.iouThrs)[0] 330 | s = s[t] 331 | s = s[:, :, :, aind, mind] 332 | else: 333 | # dimension of recall: [TxKxAxM] 334 | s = self.eval["recall"] 335 | if iouThr is not None: 336 | t = np.where(iouThr == p.iouThrs)[0] 337 | s = s[t] 338 | s = s[:, :, aind, mind] 339 | if len(s[s > -1]) == 0: 340 | mean_s = -1 341 | else: 342 | mean_s = np.mean(s[s > -1]) 343 | print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) 344 | return mean_s 345 | 346 | def _summarizeDets(): 347 | stats = np.zeros((16,)) 348 | stats[0] = _summarize(1, maxDets=self.params.maxDets[2]) 349 | stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) 350 | stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) 351 | stats[3] = _summarize(1, iouThr=0.80, maxDets=self.params.maxDets[2]) 352 | stats[4] = _summarize(1, iouThr=0.85, maxDets=self.params.maxDets[2]) 353 | stats[5] = _summarize(1, iouThr=0.90, maxDets=self.params.maxDets[2]) 354 | stats[6] = _summarize(1, iouThr=0.95, maxDets=self.params.maxDets[2]) 355 | stats[7] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) 356 | stats[8] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) 357 | stats[9] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) 358 | stats[10] = _summarize(0, maxDets=self.params.maxDets[0]) 359 | stats[11] = _summarize(0, maxDets=self.params.maxDets[1]) 360 | stats[12] = _summarize(0, maxDets=self.params.maxDets[2]) 361 | stats[13] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) 362 | stats[14] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) 363 | stats[15] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) 364 | return stats 365 | 366 | if not self.eval: 367 | raise Exception("Please run accumulate() first") 368 | 369 | self.stats = _summarizeDets() 370 | 371 | def __str__(self): 372 | self.summarize() 373 | 374 | 375 | def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, img_ids=None): 376 | """ 377 | Evaluate the coco results using COCOEval API. 
378 | """ 379 | assert len(coco_results) > 0 380 | 381 | if iou_type in {"segm", "boundary"}: 382 | coco_results = copy.deepcopy(coco_results) 383 | # When evaluating mask AP, if the results contain bbox, cocoapi will 384 | # use the box area as the area of the instance, instead of the mask area. 385 | # This leads to a different definition of small/medium/large. 386 | # We remove the bbox field to let mask AP use mask area. 387 | for c in coco_results: 388 | c.pop("bbox", None) 389 | 390 | coco_dt = coco_gt.loadRes(coco_results) 391 | coco_eval = COCOevalHighIoU(coco_gt, coco_dt, iou_type) 392 | 393 | if img_ids is not None: 394 | coco_eval.params.imgIds = img_ids 395 | 396 | coco_eval.evaluate() 397 | coco_eval.accumulate() 398 | coco_eval.summarize() 399 | 400 | return coco_eval 401 | -------------------------------------------------------------------------------- /coco_rem/configs/README.md: -------------------------------------------------------------------------------- 1 | # Model Configs for Benchmarking 2 | 3 | Each sub-directory contains Detectron2 config files (`LazyConfig` format) for 4 | all model checkpoints from public Github repos building with Detectron2. 5 | 6 | - `d2main`: Detectron2 model zoo (initial baselines). 7 | - `d2lsj`: Detectron2 model zoo (new LSJ baselines). 8 | - `vitdet`: https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet 9 | - `convnext`: https://github.com/facebookresearch/convnext 10 | - `mvitv2`: https://github.com/facebookresearch/detectron2/tree/main/projects/MViTv2 11 | - `mask2former`: https://github.com/facebookresearch/Mask2Former 12 | 13 | Additionally, `common` directory has config objects that are shared across many 14 | config files. 15 | 16 | ### Note on config structure 17 | 18 | Detectron2 lazy configs are described in the official Detectron2 documentation 19 | [here](https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html). 20 | Each config file requires five objects: `dataloader`, `model`, `optimizer`, 21 | `lr_multiplier`, `train`. Some configs may exclude two objects that are not 22 | required for evaluation - `optimizer` and `lr_multiplier`. 23 | -------------------------------------------------------------------------------- /coco_rem/configs/common/coco_schedule.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.solver import WarmupParamScheduler 3 | from fvcore.common.param_scheduler import MultiStepParamScheduler 4 | 5 | 6 | def default_lsj_epoch_scheduler(epochs: int): 7 | """ 8 | Returns the config for a default multi-step LR scheduler that runs for fixed 9 | amount of COCO epochs, typically used with models using "LSJ" augmentations 10 | and training schedule (large-scale jittering augmentation and 50-400 epochs). 
11 | """ 12 | 13 | coco_100ep_iter = 184375 14 | coco_curr_iter = coco_100ep_iter * epochs // 100 15 | 16 | coco_100ep_milestones = [163889, 177546] 17 | coco_curr_milestones = [x * epochs // 100 for x in coco_100ep_milestones] 18 | 19 | lr_multiplier = L(WarmupParamScheduler)( 20 | scheduler=L(MultiStepParamScheduler)( 21 | values=[1.0, 0.1, 0.01], 22 | milestones=coco_curr_milestones, 23 | num_updates=coco_curr_iter, 24 | ), 25 | warmup_length=250 / coco_curr_iter, 26 | warmup_factor=0.001, 27 | ) 28 | return lr_multiplier 29 | 30 | 31 | lr_multiplier_75ep = default_lsj_epoch_scheduler(75) 32 | lr_multiplier_100ep = default_lsj_epoch_scheduler(100) 33 | lr_multiplier_200ep = default_lsj_epoch_scheduler(200) 34 | lr_multiplier_400ep = default_lsj_epoch_scheduler(400) 35 | -------------------------------------------------------------------------------- /coco_rem/configs/common/data/coco.py: -------------------------------------------------------------------------------- 1 | import detectron2.data.transforms as T 2 | from detectron2.config import LazyCall as L 3 | from detectron2.data import ( 4 | DatasetMapper, 5 | build_detection_test_loader, 6 | build_detection_train_loader, 7 | get_detection_dataset_dicts, 8 | ) 9 | from omegaconf import OmegaConf 10 | 11 | from coco_rem.coco_evaluator import COCOReMEvaluator 12 | 13 | dataloader = OmegaConf.create() 14 | 15 | # Mapper with large-scale jittering (LSJ) augmentation. 16 | image_size = 1024 17 | 18 | dataloader.train = L(build_detection_train_loader)( 19 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"), 20 | mapper=L(DatasetMapper)( 21 | is_train=True, 22 | augmentations=[ 23 | L(T.RandomFlip)(horizontal=True), # flip first 24 | L(T.ResizeScale)( 25 | min_scale=0.1, 26 | max_scale=2.0, 27 | target_height=image_size, 28 | target_width=image_size, 29 | ), 30 | L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), 31 | ], 32 | image_format="RGB", 33 | use_instance_mask=True, 34 | instance_mask_format="bitmask", 35 | recompute_boxes=True, 36 | ), 37 | total_batch_size=64, 38 | num_workers=4, 39 | ) 40 | 41 | # Resize shortest edge to 1024 pixels. 42 | dataloader.test = L(build_detection_test_loader)( 43 | dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False), 44 | mapper=L(DatasetMapper)( 45 | is_train=False, 46 | augmentations=[ 47 | L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), 48 | ], 49 | image_format="${...train.mapper.image_format}", 50 | ), 51 | num_workers=4, 52 | ) 53 | 54 | # Update: Custom COCO evaluator that returns exactly same results as default 55 | # evaluator, with additionally returning AP90, AP95, and Boundary AP. 56 | dataloader.evaluator = L(COCOReMEvaluator)( 57 | dataset_name="${..test.dataset.names}", 58 | output_dir="${...train.output_dir}", 59 | ) 60 | -------------------------------------------------------------------------------- /coco_rem/configs/common/data/constants.py: -------------------------------------------------------------------------------- 1 | constants = dict( 2 | imagenet_rgb256_mean=[123.675, 116.28, 103.53], 3 | imagenet_rgb256_std=[58.395, 57.12, 57.375], 4 | imagenet_bgr256_mean=[103.530, 116.280, 123.675], 5 | # When using pre-trained models in Detectron1 or any MSRA models, 6 | # std has been absorbed into its conv1 weights, so the std needs to be set 1. 
7 | # Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) 8 | imagenet_bgr256_std=[1.0, 1.0, 1.0], 9 | ) 10 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/cascade_rcnn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import ( 6 | CascadeROIHeads, 7 | FastRCNNConvFCHead, 8 | FastRCNNOutputLayers, 9 | ) 10 | 11 | from .mask_rcnn_fpn import model 12 | 13 | # arguments that don't exist for Cascade R-CNN 14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 15 | 16 | model.roi_heads.update( 17 | _target_=CascadeROIHeads, 18 | box_heads=[ 19 | L(FastRCNNConvFCHead)( 20 | input_shape=ShapeSpec(channels=256, height=7, width=7), 21 | conv_dims=[], 22 | fc_dims=[1024, 1024], 23 | ) 24 | for k in range(3) 25 | ], 26 | box_predictors=[ 27 | L(FastRCNNOutputLayers)( 28 | input_shape=ShapeSpec(channels=1024), 29 | test_score_thresh=0.05, 30 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 31 | cls_agnostic_bbox_reg=True, 32 | num_classes="${...num_classes}", 33 | ) 34 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 35 | ], 36 | proposal_matchers=[ 37 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 38 | for th in [0.5, 0.6, 0.7] 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/mask2former.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.backbone import BasicStem, ResNet 4 | from mask2former.maskformer_model import MaskFormer 5 | from mask2former.modeling.meta_arch.mask_former_head import MaskFormerHead 6 | from mask2former.modeling.pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder 7 | from mask2former.modeling.transformer_decoder import MultiScaleMaskedTransformerDecoder 8 | 9 | from ..data.constants import constants 10 | 11 | model = L(MaskFormer)( 12 | backbone=L(ResNet)( 13 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 14 | stages=L(ResNet.make_default_stages)( 15 | depth=50, 16 | stride_in_1x1=False, 17 | norm="FrozenBN", 18 | ), 19 | out_features=["res2", "res3", "res4", "res5"], 20 | ), 21 | sem_seg_head=L(MaskFormerHead)( 22 | input_shape={ 23 | "res2": L(ShapeSpec)(channels=256, stride=4), 24 | "res3": L(ShapeSpec)(channels=512, stride=8), 25 | "res4": L(ShapeSpec)(channels=1024, stride=16), 26 | "res5": L(ShapeSpec)(channels=2048, stride=32), 27 | }, 28 | num_classes=80, 29 | pixel_decoder=L(MSDeformAttnPixelDecoder)( 30 | input_shape="${..input_shape}", 31 | transformer_dropout=0.0, 32 | transformer_nheads=8, 33 | transformer_dim_feedforward=1024, 34 | transformer_enc_layers=6, 35 | conv_dim=256, 36 | mask_dim=256, 37 | norm="GN", 38 | transformer_in_features=["res3", "res4", "res5"], 39 | common_stride=4, 40 | ), 41 | loss_weight=1.0, 42 | ignore_value=255, 43 | transformer_predictor=L(MultiScaleMaskedTransformerDecoder)( 44 | in_channels="${..pixel_decoder.conv_dim}", 45 | mask_classification=True, 46 | num_classes="${..num_classes}", 47 | hidden_dim="${..pixel_decoder.conv_dim}", 48 
| num_queries="${...num_queries}", 49 | nheads=8, 50 | dim_feedforward=2048, 51 | dec_layers=9, 52 | pre_norm=False, 53 | mask_dim="${..pixel_decoder.mask_dim}", 54 | enforce_input_project=False, 55 | ), 56 | transformer_in_feature="multi_scale_pixel_decoder", 57 | ), 58 | criterion=None, 59 | num_queries=100, 60 | metadata=None, 61 | size_divisibility=32, 62 | sem_seg_postprocess_before_inference=True, 63 | object_mask_threshold=0.8, 64 | overlap_threshold=0.8, 65 | instance_on=True, 66 | semantic_on=False, 67 | panoptic_on=False, 68 | pixel_mean=constants.imagenet_rgb256_mean, 69 | pixel_std=constants.imagenet_rgb256_std, 70 | test_topk_per_image=100, 71 | ) 72 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/mask_rcnn_fpn.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.anchor_generator import DefaultAnchorGenerator 4 | from detectron2.modeling.backbone import FPN, BasicStem, ResNet 5 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 6 | from detectron2.modeling.box_regression import Box2BoxTransform 7 | from detectron2.modeling.matcher import Matcher 8 | from detectron2.modeling.meta_arch import GeneralizedRCNN 9 | from detectron2.modeling.poolers import ROIPooler 10 | from detectron2.modeling.proposal_generator import RPN, StandardRPNHead 11 | from detectron2.modeling.roi_heads import ( 12 | FastRCNNConvFCHead, 13 | FastRCNNOutputLayers, 14 | MaskRCNNConvUpsampleHead, 15 | StandardROIHeads, 16 | ) 17 | 18 | from ..data.constants import constants 19 | 20 | model = L(GeneralizedRCNN)( 21 | backbone=L(FPN)( 22 | bottom_up=L(ResNet)( 23 | stem=L(BasicStem)(in_channels=3, out_channels=64, norm="FrozenBN"), 24 | stages=L(ResNet.make_default_stages)( 25 | depth=50, 26 | stride_in_1x1=True, 27 | norm="FrozenBN", 28 | ), 29 | out_features=["res2", "res3", "res4", "res5"], 30 | ), 31 | in_features="${.bottom_up.out_features}", 32 | out_channels=256, 33 | top_block=L(LastLevelMaxPool)(), 34 | ), 35 | proposal_generator=L(RPN)( 36 | in_features=["p2", "p3", "p4", "p5", "p6"], 37 | head=L(StandardRPNHead)(in_channels=256, num_anchors=3), 38 | anchor_generator=L(DefaultAnchorGenerator)( 39 | sizes=[[32], [64], [128], [256], [512]], 40 | aspect_ratios=[0.5, 1.0, 2.0], 41 | strides=[4, 8, 16, 32, 64], 42 | offset=0.0, 43 | ), 44 | anchor_matcher=L(Matcher)( 45 | thresholds=[0.3, 0.7], labels=[0, -1, 1], allow_low_quality_matches=True 46 | ), 47 | box2box_transform=L(Box2BoxTransform)(weights=[1.0, 1.0, 1.0, 1.0]), 48 | batch_size_per_image=256, 49 | positive_fraction=0.5, 50 | pre_nms_topk=(2000, 1000), 51 | post_nms_topk=(1000, 1000), 52 | nms_thresh=0.7, 53 | ), 54 | roi_heads=L(StandardROIHeads)( 55 | num_classes=80, 56 | batch_size_per_image=512, 57 | positive_fraction=0.25, 58 | proposal_matcher=L(Matcher)( 59 | thresholds=[0.5], labels=[0, 1], allow_low_quality_matches=False 60 | ), 61 | box_in_features=["p2", "p3", "p4", "p5"], 62 | box_pooler=L(ROIPooler)( 63 | output_size=7, 64 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 65 | sampling_ratio=0, 66 | pooler_type="ROIAlignV2", 67 | ), 68 | box_head=L(FastRCNNConvFCHead)( 69 | input_shape=ShapeSpec(channels=256, height=7, width=7), 70 | conv_dims=[], 71 | fc_dims=[1024, 1024], 72 | ), 73 | box_predictor=L(FastRCNNOutputLayers)( 74 | input_shape=ShapeSpec(channels=1024), 75 | test_score_thresh=0.05, 76 | 
box2box_transform=L(Box2BoxTransform)(weights=(10, 10, 5, 5)), 77 | num_classes="${..num_classes}", 78 | ), 79 | mask_in_features=["p2", "p3", "p4", "p5"], 80 | mask_pooler=L(ROIPooler)( 81 | output_size=14, 82 | scales=(1.0 / 4, 1.0 / 8, 1.0 / 16, 1.0 / 32), 83 | sampling_ratio=0, 84 | pooler_type="ROIAlignV2", 85 | ), 86 | mask_head=L(MaskRCNNConvUpsampleHead)( 87 | input_shape=ShapeSpec(channels=256, width=14, height=14), 88 | num_classes="${..num_classes}", 89 | conv_dims=[256, 256, 256, 256, 256], 90 | ), 91 | ), 92 | pixel_mean=constants.imagenet_rgb256_mean, 93 | pixel_std=constants.imagenet_rgb256_std, 94 | input_format="RGB", 95 | ) 96 | -------------------------------------------------------------------------------- /coco_rem/configs/common/models/mask_rcnn_vitdet.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2.config import LazyCall as L 5 | from detectron2.modeling import SimpleFeaturePyramid, ViT 6 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 7 | 8 | from .mask_rcnn_fpn import model 9 | 10 | # Base 11 | embed_dim, depth, num_heads, dp = 768, 12, 12, 0.1 12 | # Creates Simple Feature Pyramid from ViT backbone 13 | model.backbone = L(SimpleFeaturePyramid)( 14 | net=L(ViT)( # Single-scale ViT backbone 15 | img_size=1024, 16 | patch_size=16, 17 | embed_dim=embed_dim, 18 | depth=depth, 19 | num_heads=num_heads, 20 | drop_path_rate=dp, 21 | window_size=14, 22 | mlp_ratio=4, 23 | qkv_bias=True, 24 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 25 | window_block_indexes=[ 26 | # 2, 5, 8 11 for global attention 27 | 0, 28 | 1, 29 | 3, 30 | 4, 31 | 6, 32 | 7, 33 | 9, 34 | 10, 35 | ], 36 | residual_block_indexes=[], 37 | use_rel_pos=True, 38 | out_feature="last_feat", 39 | ), 40 | in_feature="${.net.out_feature}", 41 | out_channels=256, 42 | scale_factors=(4.0, 2.0, 1.0, 0.5), 43 | top_block=L(LastLevelMaxPool)(), 44 | norm="LN", 45 | square_pad=1024, 46 | ) 47 | 48 | model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN" 49 | 50 | # 2conv in RPN: 51 | model.proposal_generator.head.conv_dims = [-1, -1] 52 | 53 | # 4conv1fc box head 54 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 55 | model.roi_heads.box_head.fc_dims = [1024] 56 | -------------------------------------------------------------------------------- /coco_rem/configs/common/optim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from detectron2.config import LazyCall as L 3 | from detectron2.solver.build import get_default_optimizer_params 4 | 5 | SGD = L(torch.optim.SGD)( 6 | params=L(get_default_optimizer_params)( 7 | # params.model is meant to be set to the model object, before instantiating 8 | # the optimizer. 9 | weight_decay_norm=0.0 10 | ), 11 | lr=0.02, 12 | momentum=0.9, 13 | weight_decay=1e-4, 14 | ) 15 | 16 | 17 | AdamW = L(torch.optim.AdamW)( 18 | params=L(get_default_optimizer_params)( 19 | # params.model is meant to be set to the model object, before instantiating 20 | # the optimizer. 
21 | base_lr="${..lr}", 22 | weight_decay_norm=0.0, 23 | ), 24 | lr=1e-4, 25 | betas=(0.9, 0.999), 26 | weight_decay=0.1, 27 | ) 28 | -------------------------------------------------------------------------------- /coco_rem/configs/common/train.py: -------------------------------------------------------------------------------- 1 | # Common training-related configs that are designed for "scripts/evaluate.py" 2 | # You can use your own instead, together with your own train_net.py 3 | train = dict( 4 | output_dir="./output", 5 | init_checkpoint="", 6 | max_iter=90000, 7 | amp=dict(enabled=True), # options for Automatic Mixed Precision 8 | ddp=dict( # options for DistributedDataParallel 9 | broadcast_buffers=False, 10 | find_unused_parameters=False, 11 | fp16_compression=False, 12 | ), 13 | checkpointer=dict(period=5000, max_to_keep=100), # options for PeriodicCheckpointer 14 | eval_period=5000, 15 | log_period=20, 16 | device="cuda" 17 | # ... 18 | ) 19 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_base_1k_3x.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.roi_heads import FastRCNNConvFCHead, FastRCNNOutputLayers 5 | 6 | from coco_rem.modeling.convnext import ConvNeXt 7 | 8 | from ..common.data.coco import dataloader 9 | from ..common.data.constants import constants 10 | from ..common.models.cascade_rcnn import model 11 | from ..common.train import train 12 | 13 | model.backbone.bottom_up = L(ConvNeXt)( 14 | in_chans=3, 15 | depths=[3, 3, 27, 3], 16 | dims=[128, 256, 512, 1024], 17 | drop_path_rate=0.7, 18 | layer_scale_init_value=1.0, 19 | out_features=["res2", "res3", "res4", "res5"], 20 | ) 21 | 22 | model.roi_heads.update( 23 | # 4conv1fc box heads with BatchNorm 24 | box_heads=[ 25 | L(FastRCNNConvFCHead)( 26 | input_shape=ShapeSpec(channels=256, height=7, width=7), 27 | conv_dims=[256, 256, 256, 256], 28 | fc_dims=[1024], 29 | conv_norm="SyncBN", 30 | ) 31 | for k in range(3) 32 | ], 33 | box_predictors=[ 34 | L(FastRCNNOutputLayers)( 35 | input_shape=ShapeSpec(channels=1024), 36 | test_score_thresh=0.05, 37 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 38 | # 39 | # Cascade R-CNN implementation in Detectron2 has class-agnostic box reg 40 | # but checkpoints from ConvNext repo (MMDetection) use class-specific. 41 | cls_agnostic_bbox_reg=False, 42 | num_classes="${...num_classes}", 43 | ) 44 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 45 | ], 46 | ) 47 | 48 | train.init_checkpoint = None # Load externally. 49 | train.max_iter *= 3 50 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_base_22k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | # This config is IDENTICAL to ConvNeXt-B (ImageNet-1K) - the only difference 4 | # is pre-training dataset of backbone (ImageNet-22K vs 1K) but weights in, 5 | # `train.init_checkpoint` (to be provided externally) override everything. 
6 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_large_22k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.dims = [192, 384, 768, 1536] 4 | model.backbone.bottom_up.drop_path_rate = 0.7 5 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_small_1k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.dims = [96, 192, 384, 768] 4 | model.backbone.bottom_up.drop_path_rate = 0.6 5 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_tiny_1k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depths = [3, 3, 9, 3] 4 | model.backbone.bottom_up.dims = [96, 192, 384, 768] 5 | model.backbone.bottom_up.drop_path_rate = 0.4 6 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/cascade_mask_rcnn_convnext_xlarge_22k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_convnext_base_1k_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.dims = [256, 512, 1024, 2048] 4 | model.backbone.bottom_up.drop_path_rate = 0.8 5 | -------------------------------------------------------------------------------- /coco_rem/configs/convnext/mask_rcnn_convnext_tiny_1k_3x.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | 3 | from coco_rem.modeling.convnext import ConvNeXt 4 | 5 | from ..common.data.coco import dataloader 6 | from ..common.data.constants import constants 7 | from ..common.models.mask_rcnn_fpn import model 8 | from ..common.train import train 9 | 10 | model.backbone.bottom_up = L(ConvNeXt)( 11 | in_chans=3, 12 | depths=[3, 3, 9, 3], 13 | dims=[96, 192, 384, 768], 14 | drop_path_rate=0.4, 15 | layer_scale_init_value=1.0, 16 | out_features=["res2", "res3", "res4", "res5"], 17 | ) 18 | 19 | train.init_checkpoint = None # Load externally. 
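# 3x schedule: triples the 90k-iteration base schedule from common/train.py (270k iterations total).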
20 | train.max_iter *= 3 21 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.stages.depth = 101 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_200ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_101_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_101_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_101_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.layers.batch_norm import NaiveSyncBatchNorm 2 | 3 | from ..common.coco_schedule import lr_multiplier_100ep as lr_multiplier 4 | from ..common.data.coco import dataloader 5 | from ..common.data.constants import constants 6 | from ..common.models.mask_rcnn_fpn import model 7 | from ..common.optim import SGD as optimizer 8 | from ..common.train import train 9 | 10 | dataloader.train.mapper.image_format = "BGR" 11 | model.pixel_mean = constants.imagenet_bgr256_mean 12 | model.pixel_std = constants.imagenet_bgr256_std 13 | model.input_format = "BGR" 14 | 15 | # train from scratch 16 | train.init_checkpoint = "" 17 | train.amp.enabled = True 18 | train.ddp.fp16_compression = True 19 | model.backbone.bottom_up.freeze_at = 0 20 | 21 | # SyncBN 22 | model.backbone.bottom_up.stem.norm = "SyncBN" 23 | model.backbone.bottom_up.stages.norm = "SyncBN" 24 | model.backbone.norm = "SyncBN" 25 | 26 | # Using NaiveSyncBatchNorm because heads may have empty input. That is not supported by 27 | # torch.nn.SyncBatchNorm. We can remove this after 28 | # https://github.com/pytorch/pytorch/issues/36530 is fixed. 29 | model.roi_heads.box_head.conv_norm = lambda c: NaiveSyncBatchNorm(c, stats_mode="N") 30 | model.roi_heads.mask_head.conv_norm = lambda c: NaiveSyncBatchNorm(c, stats_mode="N") 31 | 32 | # 2conv in RPN: 33 | model.proposal_generator.head.conv_dims = [-1, -1] 34 | 35 | # 4conv1fc box head 36 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 37 | model.roi_heads.box_head.fc_dims = [1024] 38 | 39 | # Equivalent to 100 epochs. 
40 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 41 | train.max_iter = 184375 42 | 43 | optimizer.lr = 0.1 44 | optimizer.weight_decay = 4e-5 45 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_200ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_R_50_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.modeling.backbone import RegNet 3 | from detectron2.modeling.backbone.regnet import ResBottleneckBlock, SimpleStem 4 | 5 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 6 | 7 | # Config source: 8 | model.backbone.bottom_up = L(RegNet)( 9 | stem_class=SimpleStem, 10 | stem_width=32, 11 | block_class=ResBottleneckBlock, 12 | depth=23, 13 | w_a=38.65, 14 | w_0=96, 15 | w_m=2.43, 16 | group_width=40, 17 | norm="SyncBN", 18 | out_features=["s1", "s2", "s3", "s4"], 19 | ) 20 | model.pixel_std = [57.375, 57.120, 58.395] 21 | 22 | # RegNets benefit from enabling cudnn benchmark mode 23 | train.cudnn_benchmark = True 24 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_200ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnetx_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnetx_4gf_dds_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnetx_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.modeling.backbone import RegNet 3 | from detectron2.modeling.backbone.regnet import ResBottleneckBlock, SimpleStem 4 | 5 | from .mask_rcnn_R_50_FPN_100ep import dataloader, model, train 6 | 7 | model.backbone.bottom_up = L(RegNet)( 8 | stem_class=SimpleStem, 9 | stem_width=32, 10 | block_class=ResBottleneckBlock, 11 | depth=22, 12 | w_a=31.41, 13 | w_0=96, 14 | w_m=2.24, 15 | group_width=64, 16 | se_ratio=0.25, 17 | norm="SyncBN", 18 | out_features=["s1", "s2", "s3", "s4"], 19 | ) 20 | model.pixel_std = [57.375, 57.120, 58.395] 21 | 22 | # RegNets benefit from enabling cudnn benchmark mode 23 | train.cudnn_benchmark = True 24 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_200ep.py: 
-------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnety_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 2 # 100ep -> 200ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2lsj/mask_rcnn_regnety_4gf_dds_FPN_400ep.py: -------------------------------------------------------------------------------- 1 | from .mask_rcnn_regnety_4gf_dds_FPN_100ep import dataloader, model, train 2 | 3 | train.max_iter *= 4 # 100ep -> 400ep 4 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/cascade_mask_rcnn_R_50_FPN_3x.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from ..common.models.cascade_rcnn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 12 | train.max_iter *= 3 13 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/mask_rcnn_R_50_FPN_3x.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from ..common.models.mask_rcnn_fpn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | model.backbone.bottom_up.freeze_at = 2 12 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 13 | train.max_iter *= 3 14 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/scratch_mask_rcnn_R_50_FPN_9x_gn.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from ..common.models.mask_rcnn_fpn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | # Handle Caffe2 model specs: 12 | model.backbone.bottom_up.stages.stride_in_1x1 = False 13 | model.pixel_std = [57.375, 57.120, 58.395] 14 | 15 | model.backbone.bottom_up.stem.norm = "GN" 16 | model.backbone.bottom_up.stages.norm = "GN" 17 | model.backbone.norm = "GN" 18 | model.roi_heads.box_head.conv_norm = "GN" 19 | model.roi_heads.mask_head.conv_norm = "GN" 20 | 21 | # 4conv1fc box head 22 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 23 | model.roi_heads.box_head.fc_dims = [1024] 24 | 25 | dataloader.train.total_batch_size = 64 26 | train.max_iter *= 9 27 | -------------------------------------------------------------------------------- /coco_rem/configs/d2main/scratch_mask_rcnn_R_50_FPN_9x_syncbn.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.data.constants import constants 3 | from 
..common.models.mask_rcnn_fpn import model 4 | from ..common.train import train 5 | 6 | dataloader.train.mapper.image_format = "BGR" 7 | model.pixel_mean = constants.imagenet_bgr256_mean 8 | model.pixel_std = constants.imagenet_bgr256_std 9 | model.input_format = "BGR" 10 | 11 | # Handle Caffe2 model specs: 12 | model.backbone.bottom_up.stages.stride_in_1x1 = False 13 | model.pixel_std = [57.375, 57.120, 58.395] 14 | 15 | model.backbone.bottom_up.stem.norm = "SyncBN" 16 | model.backbone.bottom_up.stages.norm = "SyncBN" 17 | model.backbone.norm = "SyncBN" 18 | model.roi_heads.box_head.conv_norm = "SyncBN" 19 | model.roi_heads.mask_head.conv_norm = "SyncBN" 20 | 21 | # 4conv1fc box head 22 | model.roi_heads.box_head.conv_dims = [256, 256, 256, 256] 23 | model.roi_heads.box_head.fc_dims = [1024] 24 | 25 | dataloader.train.total_batch_size = 64 26 | train.max_iter *= 9 27 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_R101_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from .maskformer2_R50_bs16_50ep import dataloader, model, train 2 | 3 | model.backbone.stages.depth = 101 4 | train.init_checkpoint = "detectron2://ImageNetPretrained/MSRA/R-101.pkl" 5 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_R50_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from ..common.models.mask2former import model 3 | from ..common.train import train 4 | 5 | # Initialization and trainer settings 6 | train.init_checkpoint = "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 7 | 8 | # Schedule 9 | # 50 ep = 368750 iters * 16 images/iter / 118000 images/ep 10 | dataloader.train.total_batch_size = 16 11 | train.max_iter = 368750 12 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_base_384_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | 4 | from .maskformer2_swin_tiny_bs16_50ep import dataloader, model, train 5 | 6 | model.backbone.depths = [2, 2, 18, 2] 7 | model.backbone.num_heads = [4, 8, 16, 32] 8 | model.backbone.window_size = 12 9 | model.backbone.embed_dim = 128 10 | model.backbone.pretrain_img_size = 384 11 | 12 | model.sem_seg_head.pixel_decoder.input_shape = { 13 | "p0": L(ShapeSpec)(channels=128, stride=4), 14 | "p1": L(ShapeSpec)(channels=256, stride=8), 15 | "p2": L(ShapeSpec)(channels=512, stride=16), 16 | "p3": L(ShapeSpec)(channels=1024, stride=32), 17 | } 18 | 19 | train.init_checkpoint = ( 20 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384.pth" 21 | ) 22 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_base_IN21k_384_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from .maskformer2_swin_base_384_bs16_50ep import dataloader, model, train 2 | 3 | train.init_checkpoint = ( 4 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384_22k.pth" 5 | ) 6 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_large_IN21k_384_bs16_100ep.py: 
-------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | 4 | from .maskformer2_swin_base_384_bs16_50ep import dataloader, model, train 5 | 6 | model.num_queries = 200 7 | model.backbone.num_heads = [6, 12, 24, 48] 8 | model.backbone.embed_dim = 192 9 | 10 | model.sem_seg_head.pixel_decoder.input_shape = { 11 | "p0": L(ShapeSpec)(channels=192, stride=4), 12 | "p1": L(ShapeSpec)(channels=384, stride=8), 13 | "p2": L(ShapeSpec)(channels=768, stride=16), 14 | "p3": L(ShapeSpec)(channels=1536, stride=32), 15 | } 16 | 17 | train.max_iter *= 2 18 | train.init_checkpoint = ( 19 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window12_384_22k.pth" 20 | ) 21 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_small_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from .maskformer2_swin_tiny_bs16_50ep import dataloader, model, train 2 | 3 | model.backbone.depths = [2, 2, 18, 2] 4 | 5 | train.init_checkpoint = ( 6 | "detectron2://ImageNetPretrained/swin/swin_small_patch4_window7_224.pth" 7 | ) 8 | -------------------------------------------------------------------------------- /coco_rem/configs/mask2former/maskformer2_swin_tiny_bs16_50ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling import SwinTransformer 4 | 5 | from .maskformer2_R50_bs16_50ep import dataloader, model, train 6 | 7 | model.backbone = L(SwinTransformer)( 8 | depths=[2, 2, 6, 2], 9 | embed_dim=96, 10 | num_heads=[3, 6, 12, 24], 11 | drop_path_rate=0.3, 12 | ) 13 | 14 | model.sem_seg_head.pixel_decoder.input_shape = { 15 | "p0": L(ShapeSpec)(channels=96, stride=4), 16 | "p1": L(ShapeSpec)(channels=192, stride=8), 17 | "p2": L(ShapeSpec)(channels=384, stride=16), 18 | "p3": L(ShapeSpec)(channels=768, stride=32), 19 | } 20 | model.sem_seg_head.pixel_decoder.transformer_in_features = ["p1", "p2", "p3"] 21 | 22 | train.init_checkpoint = ( 23 | "detectron2://ImageNetPretrained/swin/swin_tiny_patch4_window7_224.pth" 24 | ) 25 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_b_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_t_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depth = 24 4 | model.backbone.bottom_up.last_block_indexes = (1, 4, 20, 23) 5 | model.backbone.bottom_up.drop_path_rate = 0.4 6 | 7 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in1k.pyth" 8 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_b_in21k_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_b_3x import dataloader, model, train 2 | 3 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth" 4 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py: -------------------------------------------------------------------------------- 1 | from ..common.data.coco import dataloader 2 | from 
.cascade_mask_rcnn_mvitv2_b_3x import model, train 3 | 4 | model.backbone.bottom_up.embed_dim = 192 5 | model.backbone.bottom_up.depth = 80 6 | model.backbone.bottom_up.num_heads = 3 7 | model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79) 8 | model.backbone.bottom_up.drop_path_rate = 0.6 9 | model.backbone.bottom_up.use_act_checkpoint = True 10 | 11 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth" 12 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_s_3x.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_t_3x import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depth = 16 4 | model.backbone.bottom_up.last_block_indexes = (0, 2, 13, 15) 5 | 6 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_S_in1k.pyth" 7 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/cascade_mask_rcnn_mvitv2_t_3x.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import ( 6 | CascadeROIHeads, 7 | FastRCNNConvFCHead, 8 | FastRCNNOutputLayers, 9 | ) 10 | 11 | from .mask_rcnn_mvitv2_t_3x import dataloader, model, train 12 | 13 | # arguments that don't exist for Cascade R-CNN 14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 15 | 16 | model.roi_heads.update( 17 | _target_=CascadeROIHeads, 18 | box_heads=[ 19 | L(FastRCNNConvFCHead)( 20 | input_shape=ShapeSpec(channels=256, height=7, width=7), 21 | conv_dims=[256, 256, 256, 256], 22 | fc_dims=[1024], 23 | conv_norm="SyncBN", 24 | ) 25 | for _ in range(3) 26 | ], 27 | box_predictors=[ 28 | L(FastRCNNOutputLayers)( 29 | input_shape=ShapeSpec(channels=1024), 30 | test_score_thresh=0.05, 31 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 32 | cls_agnostic_bbox_reg=True, 33 | num_classes="${...num_classes}", 34 | ) 35 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 36 | ], 37 | proposal_matchers=[ 38 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 39 | for th in [0.5, 0.6, 0.7] 40 | ], 41 | ) 42 | 43 | model.roi_heads.mask_head.conv_norm = "SyncBN" 44 | 45 | # 2conv in RPN: 46 | # https://github.com/tensorflow/tpu/blob/b24729de804fdb751b06467d3dce0637fa652060/models/official/detection/modeling/architecture/heads.py#L95-L97 # noqa: E501, B950 47 | model.proposal_generator.head.conv_dims = [-1, -1] 48 | -------------------------------------------------------------------------------- /coco_rem/configs/mvitv2/mask_rcnn_mvitv2_t_3x.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2.config import LazyCall as L 5 | from detectron2.modeling import MViT 6 | 7 | from ..common.data.coco import dataloader 8 | from ..common.data.constants import constants 9 | from ..common.models.mask_rcnn_fpn import model 10 | from ..common.train import train 11 | 12 | model.backbone.bottom_up = L(MViT)( 13 | embed_dim=96, 14 | depth=10, 15 | num_heads=1, 16 | last_block_indexes=(0, 2, 7, 9), 17 | residual_pooling=True, 18 | 
drop_path_rate=0.2, 19 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 20 | out_features=("scale2", "scale3", "scale4", "scale5"), 21 | ) 22 | model.backbone.in_features = "${.bottom_up.out_features}" 23 | 24 | # Initialization and trainer settings 25 | train.amp.enabled = True 26 | train.ddp.fp16_compression = True 27 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_T_in1k.pyth" 28 | 29 | # 36 epochs 30 | train.max_iter = 67500 31 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_b_in21k_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.config import LazyCall as L 4 | from detectron2.layers import ShapeSpec 5 | from detectron2.modeling import MViT 6 | from detectron2.modeling.box_regression import Box2BoxTransform 7 | from detectron2.modeling.matcher import Matcher 8 | from detectron2.modeling.roi_heads import ( 9 | CascadeROIHeads, 10 | FastRCNNConvFCHead, 11 | FastRCNNOutputLayers, 12 | ) 13 | from torch import nn 14 | 15 | from ..common.data.coco import dataloader 16 | from ..common.data.constants import constants 17 | from ..common.models.mask_rcnn_fpn import model 18 | from ..common.train import train 19 | 20 | model.backbone.bottom_up = L(MViT)( 21 | embed_dim=96, 22 | depth=24, 23 | num_heads=1, 24 | last_block_indexes=(1, 4, 20, 23), 25 | residual_pooling=True, 26 | drop_path_rate=0.4, 27 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 28 | out_features=("scale2", "scale3", "scale4", "scale5"), 29 | ) 30 | model.backbone.in_features = "${.bottom_up.out_features}" 31 | model.backbone.square_pad = 1024 32 | 33 | # New heads and LN 34 | model.backbone.norm = "LN" # Use LN in FPN 35 | model.roi_heads.box_head.conv_norm = model.roi_heads.mask_head.conv_norm = "LN" 36 | 37 | # 2conv in RPN: 38 | model.proposal_generator.head.conv_dims = [-1, -1] 39 | 40 | # arguments that don't exist for Cascade R-CNN 41 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 42 | model.roi_heads.update( 43 | _target_=CascadeROIHeads, 44 | box_heads=[ 45 | L(FastRCNNConvFCHead)( 46 | input_shape=ShapeSpec(channels=256, height=7, width=7), 47 | conv_dims=[256, 256, 256, 256], 48 | fc_dims=[1024], 49 | conv_norm="LN", 50 | ) 51 | for _ in range(3) 52 | ], 53 | box_predictors=[ 54 | L(FastRCNNOutputLayers)( 55 | input_shape=ShapeSpec(channels=1024), 56 | test_score_thresh=0.05, 57 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 58 | cls_agnostic_bbox_reg=True, 59 | num_classes="${...num_classes}", 60 | ) 61 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 62 | ], 63 | proposal_matchers=[ 64 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 65 | for th in [0.5, 0.6, 0.7] 66 | ], 67 | ) 68 | 69 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth" 70 | 71 | # Schedule 72 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 73 | train.max_iter = 184375 74 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_h_in21k_36ep.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.embed_dim = 192 4 | model.backbone.bottom_up.depth = 80 5 | model.backbone.bottom_up.num_heads = 3 6 | 
model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79) 7 | model.backbone.bottom_up.drop_path_rate = 0.6 8 | model.backbone.bottom_up.use_act_checkpoint = True 9 | 10 | 11 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth" 12 | 13 | 14 | # 36 epochs 15 | train.max_iter = 67500 16 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_mvitv2_l_in21k_50ep.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.embed_dim = 144 4 | model.backbone.bottom_up.depth = 48 5 | model.backbone.bottom_up.num_heads = 2 6 | model.backbone.bottom_up.last_block_indexes = (1, 7, 43, 47) 7 | model.backbone.bottom_up.drop_path_rate = 0.5 8 | 9 | 10 | train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_L_in21k.pyth" 11 | 12 | train.max_iter = train.max_iter // 2 # 100ep -> 50ep 13 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_swin_b_in21k_50ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.modeling import SwinTransformer 3 | 4 | from ..common.data.coco import dataloader 5 | from ..common.train import train 6 | from .cascade_mask_rcnn_mvitv2_b_in21k_100ep import model 7 | 8 | model.backbone.bottom_up = L(SwinTransformer)( 9 | depths=[2, 2, 18, 2], 10 | drop_path_rate=0.4, 11 | embed_dim=128, 12 | num_heads=[4, 8, 16, 32], 13 | ) 14 | model.backbone.in_features = ("p0", "p1", "p2", "p3") 15 | model.backbone.square_pad = 1024 16 | 17 | train.init_checkpoint = ( 18 | "detectron2://ImageNetPretrained/swin/swin_base_patch4_window7_224_22k.pth" 19 | ) 20 | # 50 ep = (184375 / 2) iters * 64 images/iter / 118000 images/ep 21 | train.max_iter = 184375 // 2 22 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_swin_l_in21k_50ep.py: -------------------------------------------------------------------------------- 1 | from .cascade_mask_rcnn_swin_b_in21k_50ep import dataloader, model, train 2 | 3 | model.backbone.bottom_up.depths = [2, 2, 18, 2] 4 | model.backbone.bottom_up.drop_path_rate = 0.4 5 | model.backbone.bottom_up.embed_dim = 192 6 | model.backbone.bottom_up.num_heads = [6, 12, 24, 48] 7 | 8 | 9 | train.init_checkpoint = ( 10 | "detectron2://ImageNetPretrained/swin/swin_large_patch4_window7_224_22k.pth" 11 | ) 12 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_b_100ep.py: -------------------------------------------------------------------------------- 1 | from detectron2.config import LazyCall as L 2 | from detectron2.layers import ShapeSpec 3 | from detectron2.modeling.box_regression import Box2BoxTransform 4 | from detectron2.modeling.matcher import Matcher 5 | from detectron2.modeling.roi_heads import ( 6 | CascadeROIHeads, 7 | FastRCNNConvFCHead, 8 | FastRCNNOutputLayers, 9 | ) 10 | 11 | from .mask_rcnn_vitdet_b_100ep import dataloader, lr_multiplier, model, optimizer, train 12 | 13 | # arguments that don't exist for Cascade R-CNN 14 | [model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]] 15 | 16 | model.roi_heads.update( 17 | _target_=CascadeROIHeads, 18 | box_heads=[ 19 
| L(FastRCNNConvFCHead)( 20 | input_shape=ShapeSpec(channels=256, height=7, width=7), 21 | conv_dims=[256, 256, 256, 256], 22 | fc_dims=[1024], 23 | conv_norm="LN", 24 | ) 25 | for _ in range(3) 26 | ], 27 | box_predictors=[ 28 | L(FastRCNNOutputLayers)( 29 | input_shape=ShapeSpec(channels=1024), 30 | test_score_thresh=0.05, 31 | box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)), 32 | cls_agnostic_bbox_reg=True, 33 | num_classes="${...num_classes}", 34 | ) 35 | for (w1, w2) in [(10, 5), (20, 10), (30, 15)] 36 | ], 37 | proposal_matchers=[ 38 | L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) 39 | for th in [0.5, 0.6, 0.7] 40 | ], 41 | ) 42 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_h_75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from ..common.coco_schedule import lr_multiplier_75ep as lr_multiplier 6 | from .cascade_mask_rcnn_vitdet_b_100ep import dataloader, model, optimizer, train 7 | 8 | model.backbone.net.embed_dim = 1280 9 | model.backbone.net.depth = 32 10 | model.backbone.net.num_heads = 16 11 | model.backbone.net.drop_path_rate = 0.5 12 | # 7, 15, 23, 31 for global attention 13 | model.backbone.net.window_block_indexes = ( 14 | list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) 15 | ) 16 | 17 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth?matching_heuristics=True" 18 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 19 | 20 | optimizer.params.lr_factor_func = partial( 21 | get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32 22 | ) 23 | optimizer.params.overrides = {} 24 | optimizer.params.weight_decay_norm = None 25 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/cascade_mask_rcnn_vitdet_l_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from .cascade_mask_rcnn_vitdet_b_100ep import ( 6 | dataloader, 7 | lr_multiplier, 8 | model, 9 | optimizer, 10 | train, 11 | ) 12 | 13 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth?matching_heuristics=True" 14 | 15 | model.backbone.net.embed_dim = 1024 16 | model.backbone.net.depth = 24 17 | model.backbone.net.num_heads = 16 18 | model.backbone.net.drop_path_rate = 0.4 19 | # 5, 11, 17, 23 for global attention 20 | model.backbone.net.window_block_indexes = ( 21 | list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) 22 | ) 23 | 24 | optimizer.params.lr_factor_func = partial( 25 | get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24 26 | ) 27 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/mask_rcnn_vitdet_b_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from ..common.coco_schedule import lr_multiplier_100ep as lr_multiplier 6 | from ..common.data.coco import dataloader 7 | from ..common.models.mask_rcnn_vitdet import model 8 | from ..common.optim import AdamW as 
optimizer 9 | from ..common.train import train 10 | 11 | # Initialization and trainer settings 12 | train.ddp.fp16_compression = True 13 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth?matching_heuristics=True" 14 | 15 | 16 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 17 | train.max_iter = 184375 18 | 19 | # Layer-wise LR decay for ViT 20 | optimizer.params.lr_factor_func = partial( 21 | get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7 22 | ) 23 | optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} 24 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/mask_rcnn_vitdet_h_75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from ..common.coco_schedule import lr_multiplier_75ep as lr_multiplier 6 | from .mask_rcnn_vitdet_b_100ep import dataloader, model, optimizer, train 7 | 8 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth?matching_heuristics=True" 9 | 10 | model.backbone.net.embed_dim = 1280 11 | model.backbone.net.depth = 32 12 | model.backbone.net.num_heads = 16 13 | model.backbone.net.drop_path_rate = 0.5 14 | # 7, 15, 23, 31 for global attention 15 | model.backbone.net.window_block_indexes = ( 16 | list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)) 17 | ) 18 | 19 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 20 | 21 | optimizer.params.lr_factor_func = partial( 22 | get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32 23 | ) 24 | optimizer.params.overrides = {} 25 | optimizer.params.weight_decay_norm = None 26 | -------------------------------------------------------------------------------- /coco_rem/configs/vitdet/mask_rcnn_vitdet_l_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 4 | 5 | from .mask_rcnn_vitdet_b_100ep import dataloader, lr_multiplier, model, optimizer, train 6 | 7 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth?matching_heuristics=True" 8 | 9 | model.backbone.net.embed_dim = 1024 10 | model.backbone.net.depth = 24 11 | model.backbone.net.num_heads = 16 12 | model.backbone.net.drop_path_rate = 0.4 13 | # 5, 11, 17, 23 for global attention 14 | model.backbone.net.window_block_indexes = ( 15 | list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)) 16 | ) 17 | 18 | optimizer.params.lr_factor_func = partial( 19 | get_vit_lr_decay_rate, lr_decay_rate=0.8, num_layers=24 20 | ) 21 | -------------------------------------------------------------------------------- /coco_rem/data/builtin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Register COCO-ReM instances for training and evaluation. 
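
Typical usage, as a minimal sketch (assuming the default `./datasets` layout
expected by `register_all_coco_rem` below): call `register_all_coco_rem()` once
at startup, after which the splits can be loaded by name, for example through
`detectron2.data.DatasetCatalog.get("coco_rem_val")`.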
3 | """ 4 | 5 | import os 6 | 7 | from detectron2.data.datasets.coco import register_coco_instances 8 | from detectron2.data.datasets.builtin_meta import _get_builtin_metadata 9 | 10 | _PREDEFINED_SPLITS_COCO_REM = { 11 | "coco_rem_train": ("coco/train2017", "coco_rem/instances_trainrem.json"), 12 | "coco_rem_val": ("coco/val2017", "coco_rem/instances_valrem.json"), 13 | } 14 | 15 | 16 | def register_all_coco_rem(root: str = "datasets"): 17 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO_REM.items(): 18 | # Assume pre-defined datasets live in `./datasets`. 19 | register_coco_instances( 20 | key, 21 | _get_builtin_metadata("coco"), 22 | os.path.join(root, json_file) if "://" not in json_file else json_file, 23 | os.path.join(root, image_root), 24 | ) 25 | -------------------------------------------------------------------------------- /coco_rem/data/lvis.py: -------------------------------------------------------------------------------- 1 | from detectron2.data.datasets.lvis import ( 2 | get_lvis_instances_meta, 3 | register_lvis_instances, 4 | ) 5 | 6 | # This mapping is extracted from the official LVIS mapping: 7 | # https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json 8 | COCO_CATEGORIES_IN_LVIS = [ 9 | {"coco_id": 1, "lvis_id": 793, "synset": "person.n.01"}, 10 | {"coco_id": 2, "lvis_id": 94, "synset": "bicycle.n.01"}, 11 | {"coco_id": 3, "lvis_id": 207, "synset": "car.n.01"}, 12 | {"coco_id": 4, "lvis_id": 703, "synset": "motorcycle.n.01"}, 13 | {"coco_id": 5, "lvis_id": 3, "synset": "airplane.n.01"}, 14 | {"coco_id": 6, "lvis_id": 173, "synset": "bus.n.01"}, 15 | {"coco_id": 7, "lvis_id": 1115, "synset": "train.n.01"}, 16 | {"coco_id": 8, "lvis_id": 1123, "synset": "truck.n.01"}, 17 | {"coco_id": 9, "lvis_id": 118, "synset": "boat.n.01"}, 18 | {"coco_id": 10, "lvis_id": 1112, "synset": "traffic_light.n.01"}, 19 | {"coco_id": 11, "lvis_id": 445, "synset": "fireplug.n.01"}, 20 | {"coco_id": 13, "lvis_id": 1019, "synset": "stop_sign.n.01"}, 21 | {"coco_id": 14, "lvis_id": 766, "synset": "parking_meter.n.01"}, 22 | {"coco_id": 15, "lvis_id": 90, "synset": "bench.n.01"}, 23 | {"coco_id": 16, "lvis_id": 99, "synset": "bird.n.01"}, 24 | {"coco_id": 17, "lvis_id": 225, "synset": "cat.n.01"}, 25 | {"coco_id": 18, "lvis_id": 378, "synset": "dog.n.01"}, 26 | {"coco_id": 19, "lvis_id": 569, "synset": "horse.n.01"}, 27 | {"coco_id": 20, "lvis_id": 943, "synset": "sheep.n.01"}, 28 | {"coco_id": 21, "lvis_id": 80, "synset": "beef.n.01"}, 29 | {"coco_id": 22, "lvis_id": 422, "synset": "elephant.n.01"}, 30 | {"coco_id": 23, "lvis_id": 76, "synset": "bear.n.01"}, 31 | {"coco_id": 24, "lvis_id": 1202, "synset": "zebra.n.01"}, 32 | {"coco_id": 25, "lvis_id": 496, "synset": "giraffe.n.01"}, 33 | {"coco_id": 27, "lvis_id": 34, "synset": "backpack.n.01"}, 34 | {"coco_id": 28, "lvis_id": 1133, "synset": "umbrella.n.01"}, 35 | {"coco_id": 31, "lvis_id": 35, "synset": "bag.n.04"}, 36 | {"coco_id": 32, "lvis_id": 716, "synset": "necktie.n.01"}, 37 | {"coco_id": 33, "lvis_id": 36, "synset": "bag.n.06"}, 38 | {"coco_id": 34, "lvis_id": 474, "synset": "frisbee.n.01"}, 39 | {"coco_id": 35, "lvis_id": 964, "synset": "ski.n.01"}, 40 | {"coco_id": 36, "lvis_id": 976, "synset": "snowboard.n.01"}, 41 | {"coco_id": 37, "lvis_id": 41, "synset": "ball.n.06"}, 42 | {"coco_id": 38, "lvis_id": 611, "synset": "kite.n.03"}, 43 | {"coco_id": 39, "lvis_id": 58, "synset": "baseball_bat.n.01"}, 44 | {"coco_id": 40, "lvis_id": 60, "synset": "baseball_glove.n.01"}, 45 | {"coco_id": 41, 
"lvis_id": 962, "synset": "skateboard.n.01"}, 46 | {"coco_id": 42, "lvis_id": 1037, "synset": "surfboard.n.01"}, 47 | {"coco_id": 43, "lvis_id": 1079, "synset": "tennis_racket.n.01"}, 48 | {"coco_id": 44, "lvis_id": 133, "synset": "bottle.n.01"}, 49 | {"coco_id": 46, "lvis_id": 1190, "synset": "wineglass.n.01"}, 50 | {"coco_id": 47, "lvis_id": 344, "synset": "cup.n.01"}, 51 | {"coco_id": 48, "lvis_id": 469, "synset": "fork.n.01"}, 52 | {"coco_id": 49, "lvis_id": 615, "synset": "knife.n.01"}, 53 | {"coco_id": 50, "lvis_id": 1000, "synset": "spoon.n.01"}, 54 | {"coco_id": 51, "lvis_id": 139, "synset": "bowl.n.03"}, 55 | {"coco_id": 52, "lvis_id": 45, "synset": "banana.n.02"}, 56 | {"coco_id": 53, "lvis_id": 12, "synset": "apple.n.01"}, 57 | {"coco_id": 54, "lvis_id": 912, "synset": "sandwich.n.01"}, 58 | {"coco_id": 55, "lvis_id": 735, "synset": "orange.n.01"}, 59 | {"coco_id": 56, "lvis_id": 154, "synset": "broccoli.n.01"}, 60 | {"coco_id": 57, "lvis_id": 217, "synset": "carrot.n.01"}, 61 | {"coco_id": 59, "lvis_id": 816, "synset": "pizza.n.01"}, 62 | {"coco_id": 60, "lvis_id": 387, "synset": "doughnut.n.02"}, 63 | {"coco_id": 61, "lvis_id": 183, "synset": "cake.n.03"}, 64 | {"coco_id": 62, "lvis_id": 232, "synset": "chair.n.01"}, 65 | {"coco_id": 63, "lvis_id": 982, "synset": "sofa.n.01"}, 66 | {"coco_id": 64, "lvis_id": 837, "synset": "pot.n.04"}, 67 | {"coco_id": 65, "lvis_id": 77, "synset": "bed.n.01"}, 68 | {"coco_id": 67, "lvis_id": 367, "synset": "dining_table.n.01"}, 69 | {"coco_id": 70, "lvis_id": 1097, "synset": "toilet.n.02"}, 70 | {"coco_id": 72, "lvis_id": 1077, "synset": "television_receiver.n.01"}, 71 | {"coco_id": 73, "lvis_id": 631, "synset": "laptop.n.01"}, 72 | {"coco_id": 74, "lvis_id": 705, "synset": "mouse.n.04"}, 73 | {"coco_id": 75, "lvis_id": 881, "synset": "remote_control.n.01"}, 74 | {"coco_id": 76, "lvis_id": 296, "synset": "computer_keyboard.n.01"}, 75 | {"coco_id": 77, "lvis_id": 230, "synset": "cellular_telephone.n.01"}, 76 | {"coco_id": 78, "lvis_id": 687, "synset": "microwave.n.02"}, 77 | {"coco_id": 79, "lvis_id": 739, "synset": "oven.n.01"}, 78 | {"coco_id": 80, "lvis_id": 1095, "synset": "toaster.n.02"}, 79 | {"coco_id": 81, "lvis_id": 961, "synset": "sink.n.01"}, 80 | {"coco_id": 82, "lvis_id": 421, "synset": "electric_refrigerator.n.01"}, 81 | {"coco_id": 84, "lvis_id": 127, "synset": "book.n.01"}, 82 | {"coco_id": 85, "lvis_id": 271, "synset": "clock.n.01"}, 83 | {"coco_id": 86, "lvis_id": 1139, "synset": "vase.n.01"}, 84 | {"coco_id": 87, "lvis_id": 923, "synset": "scissors.n.01"}, 85 | {"coco_id": 88, "lvis_id": 1071, "synset": "teddy.n.01"}, 86 | {"coco_id": 89, "lvis_id": 534, "synset": "hand_blower.n.01"}, 87 | {"coco_id": 90, "lvis_id": 1102, "synset": "toothbrush.n.01"}, 88 | ] 89 | 90 | 91 | def register_cocofied_lvis(): 92 | # COCO-fied LVIS v1 val - instances for COCO classes, masks from LVIS. 
93 | register_lvis_instances( 94 | "lvis_v1_val_cocofied", 95 | get_lvis_instances_meta("lvis_v1_val_cocofied"), 96 | json_file="datasets/lvis/lvis_v1_val_cocofied.json", 97 | image_root="datasets/coco/", 98 | ) 99 | -------------------------------------------------------------------------------- /coco_rem/mask_visualizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import itertools 4 | from typing import Callable, Optional 5 | 6 | import cv2 7 | import matplotlib.colors as mplc 8 | import numpy as np 9 | import pycocotools.mask as mask_util 10 | import torch 11 | from detectron2.utils.visualizer import VisImage 12 | from torch.nn.functional import max_pool2d 13 | 14 | # Nice colors, taken from `colorblind` palette of the `seaborn` library with 15 | # very minor modifications for aesthetics. 16 | NICE_COLORS = [ 17 | (0.0039, 0.4509, 0.6980), # blue 18 | (0.8905, 0.5607, 0.0196), # orange 19 | (0.0078, 0.6196, 0.4509), # green 20 | (0.9400, 0.2200, 0.1000), # red 21 | (0.6500, 0.3500, 0.9000), # purple 22 | (0.6980, 1.0000, 0.3500), # lime green 23 | (0.5019, 0.8705, 0.9176), # cyan 24 | (0.7921, 0.5686, 0.3803), # brown 25 | (0.9843, 0.6862, 0.8941), # pink 26 | (0.9254, 0.8823, 0.2001), # gold 27 | ] 28 | 29 | 30 | def binarize_mask(mask_or_polygons, height: int, width: int): 31 | """ 32 | Convert input masks of any format to a binary mask (np.uint8 array with 1 33 | as foreground and 0 as background). 34 | """ 35 | m = mask_or_polygons 36 | if isinstance(m, dict): 37 | # RLEs 38 | assert "counts" in m and "size" in m 39 | if isinstance(m["counts"], list): # uncompressed RLEs 40 | h, w = m["size"] 41 | assert h == height and w == width 42 | m = mask_util.frPyObjects(m, h, w) 43 | mask = mask_util.decode(m)[:, :] 44 | 45 | if isinstance(m, list): # list[ndarray] 46 | m = mask_util.frPyObjects(m, height, width) 47 | m = mask_util.merge(m) 48 | mask = mask_util.decode(m)[:, :] 49 | 50 | if isinstance(m, np.ndarray): # assumed to be a binary mask 51 | assert m.shape[1] != 2, m.shape 52 | assert m.shape == ( 53 | height, 54 | width, 55 | ), f"mask shape: {m.shape}, target dims: {height}, {width}" 56 | mask = m.astype("uint8") 57 | 58 | return mask 59 | 60 | 61 | def _create_text_labels(classes, class_names, is_crowd=None): 62 | labels = None 63 | if classes is not None: 64 | if class_names is not None and len(class_names) > 0: 65 | labels = [class_names[i] for i in classes] 66 | else: 67 | labels = [str(i) for i in classes] 68 | 69 | if labels is not None and is_crowd is not None: 70 | labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)] 71 | return labels 72 | 73 | 74 | class MaskVisualizer: 75 | """Visualizer for labeled masks of COCO-format instance annotations.""" 76 | 77 | def __init__(self, img_rgb: np.ndarray, class_names: list[str] | None = None): 78 | """ 79 | Args: 80 | img_rgb: a numpy array of shape (H, W, C), where H and W correspond to 81 | the height and width of the image respectively. C is the number of 82 | color channels. The image is required to be in RGB format since that 83 | is a requirement of the Matplotlib library. The image is also expected 84 | to be in the range [0, 255]. 85 | class_names: List of names to associate with object class IDs of masks. 
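
        Example (a minimal usage sketch; assumes COCO-style metadata providing
        `thing_classes` and a Detectron2 dataset dict):

            >>> vis = MaskVisualizer(image_rgb, class_names=metadata.thing_classes)
            >>> out = vis.draw_dataset_dict(dataset_dict)
            >>> out.save("masks_vis.jpg")  # `out` is a detectron2 VisImage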
86 | """ 87 | self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) 88 | self.class_names = class_names 89 | self.output = VisImage(self.img) 90 | self.cpu_device = torch.device("cpu") 91 | 92 | # too small texts are useless, therefore clamp to 12 93 | self._default_font_size = max( 94 | np.sqrt(self.output.height * self.output.width) // 90, 12 95 | ) 96 | 97 | def draw_dataset_dict( 98 | self, 99 | dic, 100 | draw_labels: bool = True, 101 | label_suffix_formatter: Optional[Callable] = None, 102 | ): 103 | """ 104 | Draw annotations/segmentations in Detectron2 Dataset format. 105 | 106 | Args: 107 | dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. 108 | 109 | Returns: 110 | output (VisImage): image object with visualizations. 111 | """ 112 | annos = dic.get("annotations", None) 113 | if annos: 114 | if "segmentation" in annos[0]: 115 | masks = [x["segmentation"] for x in annos] 116 | else: 117 | masks = None 118 | 119 | if draw_labels: 120 | category_ids = [x["category_id"] for x in annos] 121 | labels = _create_text_labels( 122 | category_ids, 123 | class_names=self.class_names, 124 | is_crowd=[x.get("iscrowd", 0) for x in annos], 125 | ) 126 | 127 | if label_suffix_formatter is not None: 128 | labels = label_suffix_formatter(dic, labels) 129 | else: 130 | labels = None 131 | 132 | self.overlay_instances(labels=labels, masks=masks) 133 | 134 | return self.output 135 | 136 | def overlay_instances(self, labels=None, masks=None, alpha=0.7): 137 | """ 138 | Args: 139 | labels (list[str]): the text to be displayed for each instance. 140 | masks (masks-like object): Supported types are: 141 | 142 | * :class:`detectron2.structures.PolygonMasks`, 143 | :class:`detectron2.structures.BitMasks`. 144 | * list[list[ndarray]]: contains the segmentation masks for all objects in one image. 145 | The first level of the list corresponds to individual instances. The second 146 | level to all the polygon that compose the instance, and the third level 147 | to the polygon coordinates. The third level should have the format of 148 | [x0, y0, x1, y1, ..., xn, yn] (n >= 3). 149 | * list[ndarray]: each ndarray is a binary mask of shape (H, W). 150 | * list[dict]: each dict is a COCO-style RLE. 151 | 152 | Returns: 153 | output (VisImage): image object with visualizations. 154 | """ 155 | num_instances = 0 156 | if masks is not None: 157 | masks = [ 158 | binarize_mask(x, self.output.height, self.output.width) for x in masks 159 | ] 160 | if num_instances: 161 | assert len(masks) == num_instances 162 | else: 163 | num_instances = len(masks) 164 | 165 | if labels is not None: 166 | assert len(labels) == num_instances 167 | 168 | assigned_colors = list( 169 | itertools.islice(itertools.cycle(NICE_COLORS), num_instances) 170 | ) 171 | 172 | if num_instances == 0: 173 | return self.output 174 | 175 | # Display in largest to smallest order to reduce occlusion. 176 | areas = np.asarray([x.sum() for x in masks]) 177 | 178 | sorted_idxs = np.argsort(-areas).tolist() 179 | # Re-order overlapped instances in descending order. 
180 | labels = [labels[k] for k in sorted_idxs] if labels is not None else None 181 | masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None 182 | assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] 183 | 184 | for i in range(num_instances): 185 | color = assigned_colors[i] 186 | text = labels[i] if labels is not None else "" 187 | self.draw_binary_mask(masks[i], color, text=text, alpha=alpha) 188 | 189 | return self.output 190 | 191 | def draw_text(self, text: str, x: float, y: float) -> VisImage: 192 | # fmt: off 193 | self.output.ax.text( 194 | x, y, text, size=self._default_font_size, family="sans-serif", 195 | bbox={"facecolor": "white", "alpha": 1.0, "pad": 1.0, "edgecolor": "none"}, 196 | verticalalignment="top", horizontalalignment="center", 197 | color="black", zorder=10, 198 | ) 199 | # fmt: on 200 | return self.output 201 | 202 | def draw_binary_mask(self, binary_mask, color, text=None, alpha=0.7): 203 | """ 204 | Args: 205 | binary_mask: numpy array of shape (H, W), where H is the image height 206 | and W is the image width. Each value in the array is either a 0 207 | or 1 value of uint8 type. 208 | color: color of the mask. Refer to `matplotlib.colors` for a full list 209 | of formats that are accepted. If None, will pick a random color. 210 | text: A string to draw on the object. 211 | alpha: blending co-efficient. Smaller values => more transparent masks. 212 | 213 | Returns: 214 | output (VisImage): image object with mask drawn. 215 | """ 216 | color = mplc.to_rgb(color) 217 | 218 | mask = binary_mask.astype("uint8") # opencv needs uint8 219 | shape2d = (binary_mask.shape[0], binary_mask.shape[1]) 220 | 221 | # TODO: Use Path/PathPatch to draw vector graphics: 222 | # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon 223 | rgba = np.zeros(shape2d + (4,), dtype="float32") 224 | rgba[:, :, :3] = color 225 | rgba[:, :, 3] = (mask == 1).astype("float32") * alpha 226 | self.output.ax.imshow( 227 | rgba, extent=(0, self.output.width, self.output.height, 0) 228 | ) 229 | 230 | # Find mask boundary using dilation, then visualize as a black border. 231 | mask_tensor = torch.from_numpy(mask).float().unsqueeze(0) 232 | dilated = max_pool2d(mask_tensor, kernel_size=3, stride=1, padding=1) 233 | boundary = (dilated - mask_tensor)[0].numpy() 234 | boundary_rgba = np.zeros(shape2d + (4,), dtype="float32") 235 | boundary_rgba[:, :, 3] = boundary 236 | self.output.ax.imshow( 237 | boundary_rgba, extent=(0, self.output.width, self.output.height, 0) 238 | ) 239 | 240 | if text is not None: 241 | # TODO sometimes drawn on wrong objects. the heuristics here can improve. 242 | _num_cc, cc_labels, stats, _ = cv2.connectedComponentsWithStats( 243 | binary_mask, 8 244 | ) 245 | if stats[1:, -1].size == 0: 246 | return 247 | largest_component_id = np.argmax(stats[1:, -1]) + 1 248 | 249 | # draw text on the largest component, as well as other large components. 
250 | for cid in range(1, _num_cc): 251 | if cid == largest_component_id or stats[cid, -1] > 100000: 252 | center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] 253 | self.draw_text(text, *center) 254 | 255 | return self.output 256 | 257 | def get_output(self): 258 | return self.output 259 | -------------------------------------------------------------------------------- /coco_rem/modeling/convnext.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import partial 4 | 5 | import torch 6 | from detectron2.layers.batch_norm import LayerNorm as LayerNorm2d 7 | from detectron2.modeling.backbone import Backbone 8 | from timm.models.layers import DropPath, trunc_normal_ 9 | from torch import nn 10 | 11 | 12 | class Block(nn.Module): 13 | def __init__(self, dim, drop_path=0.0, layer_scale_init_value=1e-6): 14 | super().__init__() 15 | self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) 16 | self.norm = nn.LayerNorm(dim, eps=1e-6) 17 | 18 | self.pwconv1 = nn.Linear(dim, 4 * dim) 19 | self.act = nn.GELU() 20 | 21 | self.pwconv2 = nn.Linear(4 * dim, dim) 22 | self.gamma = ( 23 | nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True) 24 | if layer_scale_init_value > 0 25 | else None 26 | ) 27 | self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() 28 | 29 | def forward(self, x): 30 | input = x 31 | x = self.dwconv(x) 32 | x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) 33 | x = self.norm(x) 34 | x = self.pwconv1(x) 35 | x = self.act(x) 36 | x = self.pwconv2(x) 37 | if self.gamma is not None: 38 | x = self.gamma * x 39 | x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) 40 | 41 | x = input + self.drop_path(x) 42 | return x 43 | 44 | 45 | class ConvNeXt(Backbone): 46 | """ 47 | A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/abs/2201.03545 48 | """ 49 | 50 | def __init__( 51 | self, 52 | in_chans: int = 3, 53 | depths: list[int] = [3, 3, 9, 3], 54 | dims: list[int] = [96, 192, 384, 768], 55 | drop_path_rate: float = 0.0, 56 | layer_scale_init_value: float = 1e-6, 57 | out_features: list[str] | None = None, 58 | ): 59 | """ 60 | Args: 61 | in_chans: Number of input image channels. 62 | depths: Number of blocks at each stage. 63 | dims: Feature dimension at each stage. 64 | drop_path_rate: Stochastic depth rate. 65 | layer_scale_init_value: Init value for Layer Scale. 66 | out_features: Stage numbers of the outputs given to the Neck. 
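
        Example (a minimal sketch with ConvNeXt-T sized hyperparameters):

            >>> backbone = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768],
            ...                     out_features=["res2", "res3", "res4", "res5"])
            >>> feats = backbone(torch.randn(1, 3, 224, 224))
            >>> sorted(feats.keys())  # feature maps at strides 4 / 8 / 16 / 32
            ['res2', 'res3', 'res4', 'res5']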
67 | """ 68 | super().__init__() 69 | 70 | # stem and 3 intermediate downsampling conv layers 71 | self.downsample_layers = nn.ModuleList() 72 | stem = nn.Sequential( 73 | nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), 74 | LayerNorm2d(dims[0], eps=1e-6), 75 | ) 76 | 77 | self.downsample_layers.append(stem) 78 | for i in range(3): 79 | downsample_layer = nn.Sequential( 80 | LayerNorm2d(dims[i], eps=1e-6), 81 | nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), 82 | ) 83 | self.downsample_layers.append(downsample_layer) 84 | 85 | self.num_layers = len(depths) 86 | num_features = [int(dims[i] * 2**i) for i in range(self.num_layers)] 87 | self.num_features = num_features 88 | self._out_features = out_features 89 | 90 | self._out_feature_strides = {} 91 | self._out_feature_channels = {} 92 | 93 | # 4 feature resolution stages, each consisting of multiple residual blocks 94 | self.stages = nn.ModuleList() 95 | dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] 96 | cur = 0 97 | strides = [4, 4, 4, 4] 98 | for i in range(4): 99 | stage = nn.Sequential( 100 | *[ 101 | Block( 102 | dim=dims[i], 103 | drop_path=dp_rates[cur + j], 104 | layer_scale_init_value=layer_scale_init_value, 105 | ) 106 | for j in range(depths[i]) 107 | ] 108 | ) 109 | self.stages.append(stage) 110 | cur += depths[i] 111 | 112 | self._out_feature_channels[f"res{i + 2}"] = dims[i] 113 | self._out_feature_strides[f"res{i + 2}"] = strides[i] * 2**i 114 | 115 | norm_layer = partial(LayerNorm2d, eps=1e-6) 116 | for i_layer in range(4): 117 | layer = norm_layer(dims[i_layer]) 118 | layer_name = f"norm{i_layer}" 119 | self.add_module(layer_name, layer) 120 | 121 | self.apply(self._init_weights) 122 | 123 | def _init_weights(self, m): 124 | if isinstance(m, (nn.Conv2d, nn.Linear)): 125 | trunc_normal_(m.weight, std=0.02) 126 | nn.init.constant_(m.bias, 0) 127 | 128 | def init_weights(self, pretrained=None): 129 | """Initialize the weights in backbone. 130 | Args: 131 | pretrained (str, optional): Path to pre-trained weights. 132 | Defaults to None. 133 | """ 134 | 135 | def _init_weights(m): 136 | if isinstance(m, nn.Linear): 137 | trunc_normal_(m.weight, std=0.02) 138 | if isinstance(m, nn.Linear) and m.bias is not None: 139 | nn.init.constant_(m.bias, 0) 140 | elif isinstance(m, nn.LayerNorm) or isinstance(m, LayerNorm2d): 141 | nn.init.constant_(m.bias, 0) 142 | nn.init.constant_(m.weight, 1.0) 143 | 144 | self.apply(_init_weights) 145 | 146 | def forward_features(self, x): 147 | outs = {} 148 | for i in range(4): 149 | x = self.downsample_layers[i](x) 150 | x = self.stages[i](x) 151 | 152 | if f"res{i + 2}" in self._out_features: 153 | norm_layer = getattr(self, f"norm{i}") 154 | x_out = norm_layer(x) 155 | out = x_out.contiguous() 156 | outs[f"res{i + 2}"] = out 157 | 158 | return outs 159 | 160 | def forward(self, x): 161 | x = self.forward_features(x) 162 | return x 163 | -------------------------------------------------------------------------------- /coco_rem/modeling/rcnn_refiner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import annotations 3 | 4 | import torch 5 | 6 | from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN 7 | 8 | 9 | class GeneralizedRCNNRefiner(GeneralizedRCNN): 10 | """ 11 | An extension of R-CNN that produces masks conditioned on box prompts. 
This 12 | model skips the region proposal network and box ROI head, running only the 13 | mask head by cropping ROI features using input boxes. 14 | """ 15 | 16 | def forward(self, batched_inputs: list[dict[str, torch.Tensor]]): 17 | assert not self.training, "`GeneralizedRCNNRefiner` only supports inference!" 18 | 19 | # Prepare `detected_instances: list[Instances]` for `inference()` method 20 | # to get mask predictions for ground-truth boxes. 21 | detected_instances = [x.pop("instances") for x in batched_inputs] 22 | for x in detected_instances: 23 | x.pred_classes = x.gt_classes 24 | x.pred_boxes = x.gt_boxes 25 | x.scores = torch.ones_like(x.pred_classes).float() 26 | 27 | return self.inference(batched_inputs, detected_instances) 28 | -------------------------------------------------------------------------------- /coco_rem/modeling/sam_refiner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from __future__ import annotations 7 | 8 | import einops as E 9 | import torch 10 | from segment_anything import sam_model_registry 11 | from segment_anything.utils import amg 12 | from torch import nn 13 | 14 | 15 | class SamRefiner(nn.Module): 16 | """ 17 | SamRefiner: An extension of SAM that refines (low-quality) input masks via 18 | iteratively prompting boxes and points. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | arch: str, 24 | checkpoint: str, 25 | num_extra_points: int = 2, 26 | num_trials: int = 10, 27 | box_only_ids: list[int] = [], 28 | min_mask_region_area: int = 100, 29 | ): 30 | """ 31 | Args: 32 | arch: SAM image encoder architecture (vit_b, vit_l, vit_h). 33 | checkpoint: Path to .pth file containing pre-trained SAM weights. 34 | num_extra_points: Number of extra points to iteratively prompt SAM 35 | with, after the initial box prompt. Points are sampled from the 36 | error region (bitwise XOR) between SAM prediction and ground-truth. 37 | num_trials: Number of refinement trials per instance mask, to improve 38 | the overall mask quality by ensembling. 39 | box_only_ids: Category IDs for which only box prompts will used. 40 | min_mask_region_area: If >0, postprocessing will be applied to remove 41 | islands and holes in masks with area smaller than this value. 42 | However, masks smaller than `10 * min_mask_region_area` will not 43 | remain unchanged to avoid removing useful details in tiny masks. 44 | """ 45 | super().__init__() 46 | 47 | # Initialize SAM, freeze parameters, and transfer them here. 
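        # (`sam_model_registry` is the builder dict from the `segment_anything`
        # package; keys such as "vit_b" / "vit_l" / "vit_h" map to functions that
        # construct SAM and load the given checkpoint.)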
48 | _sam = sam_model_registry[arch](checkpoint) 49 | for param in _sam.parameters(): 50 | param.requires_grad = False 51 | 52 | self.image_encoder = _sam.image_encoder 53 | self.prompt_encoder = _sam.prompt_encoder 54 | self.mask_decoder = _sam.mask_decoder 55 | self.img_size = _sam.image_encoder.img_size # 1024 pixels 56 | 57 | self.register_buffer("pixel_mean", _sam.pixel_mean) 58 | self.register_buffer("pixel_std", _sam.pixel_std) 59 | 60 | self.num_extra_points = num_extra_points 61 | self.num_trials = num_trials 62 | self.box_only_ids = box_only_ids 63 | self.min_mask_region_area = min_mask_region_area 64 | 65 | @torch.no_grad() 66 | def forward( 67 | self, 68 | image: torch.Tensor, 69 | masks: torch.Tensor, 70 | category_ids: list[int], 71 | original_size: tuple[int, int], 72 | ) -> torch.Tensor: 73 | """ 74 | Regenerate an input mask by iteratively prompting points to SAM, same as 75 | the training procedure of SAM. This is done for multiple trials and masks 76 | are combining by averaging and thresholding in order to reduce variance. 77 | """ 78 | 79 | # Normalize pixel values and pad to a square input. 80 | input_size = image.shape[-2:] 81 | image = (image[None, ...] - self.pixel_mean) / self.pixel_std 82 | padh = self.img_size - image.shape[-2] 83 | padw = self.img_size - image.shape[-1] 84 | image = nn.functional.pad(image, (0, padw, 0, padh)) 85 | 86 | image_embeddings = self.image_encoder(image) 87 | all_masks = masks # Rename for convenience. 88 | 89 | all_refined_masks = [] 90 | for src_mask, category_id in zip(all_masks, category_ids): 91 | xp = 0 if category_id in self.box_only_ids else self.num_extra_points 92 | 93 | # Repeat a single mask `num_trials` times to perform refinement trials 94 | # within the same batch. 95 | src_mask = E.repeat(src_mask, "h w -> n h w", n=self.num_trials) 96 | 97 | box_prompt = self._get_box_prompt(src_mask) 98 | 99 | # Iteratively prompt SAM with points sampled from error regions of 100 | # predicted masks. This is same as SAM's training procedure. The first 101 | # iteration will only use a box prompt. 102 | point_prompts, mask_prompt = None, None 103 | 104 | for _ in range(xp + 1): 105 | # Pass all prompts: points, initial box, logits from prev step. 106 | sparse_embeddings, dense_embeddings = self.prompt_encoder( 107 | point_prompts, box_prompt, mask_prompt 108 | ) 109 | 110 | low_res_masks, _ = self.mask_decoder( 111 | image_embeddings=image_embeddings, 112 | image_pe=self.prompt_encoder.get_dense_pe(), 113 | sparse_prompt_embeddings=sparse_embeddings, 114 | dense_prompt_embeddings=dense_embeddings, 115 | multimask_output=False, 116 | ) 117 | refined_masks = nn.functional.interpolate( 118 | low_res_masks, 119 | (self.img_size, self.img_size), 120 | mode="bilinear", 121 | align_corners=False, 122 | ) 123 | refined_masks = refined_masks[..., : input_size[0], : input_size[1]] 124 | 125 | # Use source mask if SAM returned empty mask (happens for tiny boxes). 126 | if (refined_masks > 0).sum() == 0: 127 | refined_masks = src_mask[:, None, ...].float() 128 | 129 | # Update point prompts and mask prompt for next iteration. 130 | point_prompts = sample_point_from_error_region( 131 | src_mask, refined_masks[:, 0], point_prompts 132 | ) 133 | mask_prompt = low_res_masks 134 | 135 | # Resize the refine masks to original size, then ensemble the trials 136 | # by thresholding at zero, then taking a majority vote. 
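            # (Concretely, with the default num_trials = 10 a pixel survives the
            # majority vote only if at least 6 of the 10 thresholded trial masks
            # mark it as foreground, since the mean over trials must exceed 0.5.)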
137 | refined_masks = nn.functional.interpolate( 138 | refined_masks, original_size, mode="bilinear", align_corners=False 139 | ) 140 | refined_masks = (refined_masks > 0).float() 141 | refined_mask = E.reduce(refined_masks, "n 1 h w -> h w", "mean") 142 | refined_mask = refined_mask > 0.5 143 | 144 | # Remove spurious islands/holes for large enough masks. 145 | _area = self.min_mask_region_area 146 | if _area > 0 and refined_mask.sum() > 10 * _area: 147 | _mask = refined_mask.cpu().numpy() 148 | 149 | _mask, _ = amg.remove_small_regions(_mask, _area, mode="holes") 150 | _mask, _ = amg.remove_small_regions(_mask, _area, mode="islands") 151 | refined_mask = torch.from_numpy(_mask).to(refined_mask.device) 152 | 153 | all_refined_masks.append(refined_mask) 154 | 155 | all_refined_masks = torch.stack(all_refined_masks) 156 | return all_refined_masks 157 | 158 | def _get_box_prompt(self, mask: torch.Tensor): 159 | """ 160 | Make a box prompt to SAM, which is a bounding box of mask that is expanded 161 | using random noise, same as SAM's training procedure. 162 | 163 | Noise values drawn from Gaussian distributions having zero mean and 164 | standard deviation equal to 10% of box edge size, up to maximum 10 pixels. 165 | """ 166 | box_prompt = amg.batched_mask_to_box(mask.bool()).float() 167 | 168 | box_w = box_prompt[:, 2] - box_prompt[:, 0] 169 | box_h = box_prompt[:, 3] - box_prompt[:, 1] 170 | noise_std = torch.stack([box_w, box_h, box_w, box_h], dim=1) 171 | noise_std = torch.clamp(noise_std * 0.1, max=10.0) 172 | noise_mean = torch.zeros_like(box_prompt) 173 | 174 | random_noise = torch.normal(noise_mean, noise_std) 175 | 176 | box_prompt[:, :2] = box_prompt[:, :2] - random_noise[:, :2].abs() 177 | box_prompt[:, 2:] = box_prompt[:, 2:] + random_noise[:, 2:].abs() 178 | box_prompt = box_prompt.clamp(min=0.0, max=self.img_size - 1) 179 | return box_prompt 180 | 181 | 182 | def sample_point_from_error_region( 183 | reference_masks: torch.Tensor, 184 | predicted_masks: torch.Tensor | None = None, 185 | previous_prompts: tuple[torch.Tensor, torch.Tensor] | None = None, 186 | ) -> tuple[torch.Tensor, torch.Tensor]: 187 | """ 188 | Sample random points from the error regions between some reference masks 189 | (e.g. ground-truth) and predicted masks by SAM. Newly sampled points are 190 | labeled foreground (1) or background (0) depending on the pixel value in 191 | reference mask. This function simulates interactive segmentation setup for 192 | training SAM, as described in Segment Anything paper. 193 | 194 | Args: 195 | reference_masks: Batch of masks as a tensor of shape `(B, H, W)` containing 196 | pixel values in `{1, 0}` or `{True, False}` denoting foreground region. 197 | predicted_masks: Batch of masks predicted by SAM having same shape as the 198 | reference masks. This tensor may have real-valued logits, which will 199 | be internally binarized by thresholding at 0. 200 | previous_prompts: Optional tuple of `(point_coords, point_labels)` giving 201 | point prompts to SAM used in previous interactive iterations. 202 | 203 | Return: 204 | next_prompts: Tuple of `(point_coords, point_labels)` with newly sampled 205 | point co-ordinates and labels appended to `previous_prompts`. 206 | """ 207 | # If predicted masks are not provided, assume that SAM predicted an empty mask. 208 | # This lets us sample a random point from anywhere inside the reference masks. 
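    # (One point per mask pair is drawn from the XOR region: a pixel SAM missed
    # has reference value 1 and is prompted as foreground, while a false-positive
    # pixel has reference value 0 and is prompted as background.)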
209 | if predicted_masks is None: 210 | predicted_masks = torch.zeros_like(reference_masks) 211 | 212 | points, point_labels = [], [] 213 | for ref_mask, pr_mask in zip(reference_masks, predicted_masks): 214 | # Sample from the error region between given masks. 215 | error_region = torch.logical_xor(ref_mask > 0, pr_mask > 0) 216 | yx_choices = error_region.nonzero() 217 | 218 | # If there is no error region, sample from anywhere in GT mask. 219 | if len(yx_choices) == 0: 220 | yx_choices = ref_mask.nonzero() 221 | 222 | if len(yx_choices) == 0: 223 | yx_choices = torch.zeros((1, 2), device=ref_mask.device).long() 224 | 225 | idx = torch.randint(len(yx_choices), size=(1,)).item() 226 | point_xy = yx_choices[idx, [1, 0]] 227 | point_label = ref_mask[point_xy[1], point_xy[0]] 228 | 229 | points.append(point_xy) 230 | point_labels.append(point_label) 231 | 232 | points = E.rearrange(torch.stack(points), "b xy -> b 1 xy") 233 | point_labels = E.rearrange(torch.stack(point_labels).long(), "b -> b 1") 234 | 235 | # Append currently sampled points to previous prompts. 236 | if previous_prompts is not None: 237 | previous_points, previous_labels = previous_prompts 238 | points = torch.cat([previous_points, points], dim=1) 239 | point_labels = torch.cat([previous_labels, point_labels], dim=1) 240 | 241 | return (points, point_labels) 242 | -------------------------------------------------------------------------------- /coco_rem/trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from __future__ import annotations 3 | 4 | import time 5 | from contextlib import nullcontext 6 | 7 | import torch 8 | from detectron2.engine import SimpleTrainer 9 | from detectron2.utils.events import get_event_storage 10 | from torch.cuda.amp import GradScaler, autocast 11 | from torch.nn.parallel import DistributedDataParallel 12 | 13 | 14 | class AMPWithGradAccumTrainer(SimpleTrainer): 15 | """ 16 | Like :class:`SimpleTrainer`, but uses PyTorch's native automatic mixed precision 17 | in the training loop and gradient accumulation after every `N` batches. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | model, 23 | data_loader, 24 | optimizer, 25 | gather_metric_period: int = 1, 26 | grad_scaler: GradScaler | None = None, 27 | precision: torch.dtype = torch.float16, 28 | log_grad_scaler: bool = False, 29 | grad_accum_steps: int = 1, 30 | ): 31 | """ 32 | Args: 33 | model, data_loader, optimizer, gather_metric_period: 34 | same as in :class:`SimpleTrainer`. 35 | grad_scaler: torch GradScaler to automatically scale gradients. 36 | precision: torch.dtype as the target precision to cast to in computations. 37 | grad_accum_steps: Number of gradient accumulation steps. 38 | """ 39 | unsupported = ( 40 | "AMPTrainer does not support single-process multi-device training!" 41 | ) 42 | if isinstance(model, DistributedDataParallel): 43 | assert not (model.device_ids and len(model.device_ids) > 1), unsupported 44 | 45 | super().__init__(model, data_loader, optimizer, gather_metric_period) 46 | 47 | if grad_scaler is None: 48 | grad_scaler = GradScaler() 49 | self.grad_scaler = grad_scaler 50 | self.precision = precision 51 | self.log_grad_scaler = log_grad_scaler 52 | 53 | assert grad_accum_steps >= 1, "grad_accum_steps must be >= 1." 
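The loss normalization used by `run_step` below (each micro-batch loss is divided by `grad_accum_steps` before `backward`) makes N accumulated backward passes match a single pass over the averaged loss; a small, hedged numeric sketch with made-up losses:

```python
import torch

x = torch.ones(4, requires_grad=True)
losses = [x.sum() * w for w in (1.0, 2.0, 3.0, 4.0)]  # pretend per-micro-batch losses

for loss in losses:          # accumulate over 4 micro-batches
    (loss / 4).backward()    # normalize each loss by the number of accumulation steps

# Gradient equals that of the averaged loss: (1 + 2 + 3 + 4) / 4 = 2.5 per element.
assert torch.allclose(x.grad, torch.full_like(x, 2.5))
```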
54 | self.grad_accum_steps = grad_accum_steps 55 | self.grad_sync_manager = _GradAccumSyncManager(model, grad_accum_steps) 56 | 57 | def run_step(self): 58 | """ 59 | Implement the AMP training logic along with gradient accumulation. 60 | """ 61 | assert self.model.training, "[AMPTrainer] model was changed to eval mode!" 62 | 63 | start = time.perf_counter() 64 | self.optimizer.zero_grad() 65 | 66 | # Record data loading time for all batches during gradient accumulation. 67 | total_data_time = 0.0 68 | prev_data_time = start 69 | 70 | for _ in range(self.grad_accum_steps): 71 | # Load batch and accumulate total time to load all batches throughout 72 | # all steps of gradient accumulation. 73 | data = next(self._data_loader_iter) 74 | current_data_time = time.perf_counter() 75 | total_data_time += current_data_time - prev_data_time 76 | prev_data_time = current_data_time 77 | 78 | with self.grad_sync_manager, autocast(dtype=self.precision): 79 | loss_dict = self.model(data) 80 | if isinstance(loss_dict, torch.Tensor): 81 | losses = loss_dict 82 | loss_dict = {"total_loss": loss_dict} 83 | else: 84 | losses = sum(loss_dict.values()) 85 | 86 | normalized_losses = losses / self.grad_accum_steps 87 | self.grad_scaler.scale(normalized_losses).backward() 88 | 89 | if self.log_grad_scaler: 90 | storage = get_event_storage() 91 | storage.put_scalar("[metric]grad_scaler", self.grad_scaler.get_scale()) 92 | 93 | self.after_backward() 94 | 95 | if self.async_write_metrics: 96 | # write metrics asynchronically 97 | self.concurrent_executor.submit( 98 | self._write_metrics, loss_dict, total_data_time, iter=self.iter 99 | ) 100 | else: 101 | self._write_metrics(loss_dict, total_data_time) 102 | 103 | self.grad_scaler.step(self.optimizer) 104 | self.grad_scaler.update() 105 | 106 | def state_dict(self): 107 | ret = super().state_dict() 108 | ret["grad_scaler"] = self.grad_scaler.state_dict() 109 | return ret 110 | 111 | def load_state_dict(self, state_dict): 112 | super().load_state_dict(state_dict) 113 | self.grad_scaler.load_state_dict(state_dict["grad_scaler"]) 114 | 115 | 116 | class _GradAccumSyncManager: 117 | """ 118 | Distributed training with gradient accumulation can cause huge slowdowns if 119 | gradient synchronization is not done properly. This context manager does it. 120 | When using DDP and accumulation for `N` steps, gradients are not averaged 121 | across process for first `N - 1` steps. This context manager behaves as 122 | a no-op (`nullcontext`) when any of these conditions are true: 123 | 124 | - Training with single GPU or CPU only (`model` is not DDP object) 125 | - DDP with static graph (see https://github.com/pytorch/pytorch/issues/80832) 126 | - No gradient accumulation across multiple steps (`num_steps = 1`) 127 | """ 128 | 129 | def __init__(self, model, num_steps: int): 130 | """ 131 | Args: 132 | model: PyTorch module that is being trained with gradient accumulation. 133 | num_steps: Number of batches processed to accumulate gradients. 
134 | """ 135 | self.num_steps = num_steps 136 | self.step = 0 137 | 138 | # DDP's `no_sync()` is a single-use context manager, so store a factory and 139 | # create a fresh context whenever gradient synchronization must be skipped. 140 | if isinstance(model, DistributedDataParallel) and not model.static_graph: 141 | self._no_sync_factory = model.no_sync 142 | else: 143 | self._no_sync_factory = nullcontext 144 | 145 | def __enter__(self): 146 | # Skip gradient sync for all but the last accumulation step. The inner 147 | # context must be entered explicitly for `no_sync` to take effect. 148 | if self.step < self.num_steps - 1: 149 | self._active = self._no_sync_factory() 150 | else: 151 | self._active = nullcontext() 152 | return self._active.__enter__() 153 | 154 | def __exit__(self, *args, **kwargs): 155 | self.step = (self.step + 1) % self.num_steps 156 | return self._active.__exit__(*args, **kwargs) 157 | -------------------------------------------------------------------------------- /images/coco_rem_example_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdexd/coco-rem/73e38364a787b34cbcd846739f196f066430279b/images/coco_rem_example_1.jpg -------------------------------------------------------------------------------- /images/coco_rem_example_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdexd/coco-rem/73e38364a787b34cbcd846739f196f066430279b/images/coco_rem_example_2.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fvcore==0.1.5.post20221221 2 | hydra-core>=1.1 3 | numpy==1.24.1 4 | omegaconf>=2.1 5 | pycocotools>=2.0 6 | einops>=0.6 7 | wget>=3.0 8 | -------------------------------------------------------------------------------- /scripts/correct_labeling_errors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add and remove a few instances from a given COCO JSON, and mark a few other 3 | instances as 'crowd' objects because their masks cover multiple instances. 4 | """ 5 | 6 | import argparse 7 | import json 8 | 9 | import torch 10 | from pycocotools import mask as mask_utils 11 | from segment_anything.utils import amg 12 | 13 | import coco_rem.data.manual_rem as inv 14 | 15 | 16 | parser = argparse.ArgumentParser(description=__doc__) 17 | parser.add_argument( 18 | "--input", 19 | default="datasets/coco_rem/instances_valrem_interim.json", 20 | help="COCO-ReM JSON file to apply all manual corrections.", 21 | ) 22 | parser.add_argument( 23 | "--output", required=True, help="Path to save output annotations JSON." 24 | ) 25 | 26 | 27 | def main(_A: argparse.Namespace): 28 | coco_json = json.load(open(_A.input)) 29 | print(f"Number of instances in input JSON: {len(coco_json['annotations'])}") 30 | 31 | # ------------------------------------------------------------------------ 32 | # Step 1: Remove some instances. 33 | remove_info_tuples = [ 34 | (x["image_id"], x["source"], x["source_id"]) for x in inv.INSTANCES_TO_REMOVE 35 | ] 36 | coco_json["annotations"] = [ 37 | x 38 | for x in coco_json["annotations"] 39 | if (x["image_id"], x["source"], x["source_id"]) not in remove_info_tuples 40 | ] 41 | 42 | num_instances = len(coco_json["annotations"]) 43 | print(f"Removed a few instances; updated JSON has {num_instances} instances.") 44 | 45 | # ------------------------------------------------------------------------ 46 | # Step 2: Set 'iscrowd = 1' for a few instances. 
47 | crowd_info_tuples = [ 48 | (x["image_id"], x["source"], x["source_id"]) for x in inv.INSTANCES_TO_CROWD 49 | ] 50 | for ann in coco_json["annotations"]: 51 | if (ann["image_id"], ann["source"], ann["source_id"]) in crowd_info_tuples: 52 | ann["iscrowd"] = 1 53 | 54 | print(f"Set 'iscrowd = 1' for {len(inv.INSTANCES_TO_CROWD)} instances.") 55 | 56 | # ------------------------------------------------------------------------ 57 | # Step 3: Add some instances. 58 | for idx, ann in enumerate(inv.INSTANCES_TO_ADD): 59 | # Convert compressed RLE to mask. 60 | binary_mask = mask_utils.decode(ann["segmentation"]) 61 | binary_mask = torch.from_numpy(binary_mask) 62 | 63 | # Convert torch tensor to uncompressed RLE. 64 | ann["segmentation"] = amg.mask_to_rle_pytorch(binary_mask[None, ...])[0] 65 | 66 | # Fill other attributes for the annotation - a unique ID, source, bbox, 67 | # area, and `iscrowd = 0`. 68 | bbox_xyxy = amg.batched_mask_to_box(binary_mask[None, ...])[0] 69 | 70 | # Convert bounding box from XYXY to XYWH format. 71 | x1, y1, x2, y2 = bbox_xyxy.tolist() 72 | ann["bbox"] = [x1, y1, x2 - x1 + 1, y2 - y1 + 1] 73 | 74 | ann["area"] = amg.area_from_rle(ann["segmentation"]) 75 | ann["id"] = 2024000000 + idx 76 | ann["source_id"] = ann["id"] 77 | ann["source"] = "manual" 78 | ann["iscrowd"] = 0 79 | 80 | coco_json["annotations"].append(ann) 81 | 82 | num_instances = len(coco_json["annotations"]) 83 | print(f"Added few instances, updated JSON has {num_instances} instances.") 84 | 85 | json.dump(coco_json, open(_A.output, "w")) 86 | print(f"Saved the updated annotations JSON at {_A.output}!") 87 | 88 | 89 | if __name__ == "__main__": 90 | args = parser.parse_args() 91 | main(args) 92 | -------------------------------------------------------------------------------- /scripts/merge_instances.py: -------------------------------------------------------------------------------- 1 | """ 2 | Merge LVIS instance annotations for COCO categories (a.k.a "COCO-fied LVIS") into 3 | the original COCO instance annotations. 4 | 5 | COCO instance annotations are inconsistent, sometimes covering multiple objects 6 | into one mask. LVIS offers a much stronger guarantee that instances are labeled 7 | individually and exhaustively. For any `(image, category)` pair, if COCO-fied LVIS 8 | has instance annotations, then they replace the corresponding COCO annotations. 9 | """ 10 | 11 | from __future__ import annotations 12 | 13 | import argparse 14 | import copy 15 | import json 16 | from collections import defaultdict 17 | 18 | from coco_rem.data.lvis import COCO_CATEGORIES_IN_LVIS 19 | 20 | parser = argparse.ArgumentParser(description=__doc__) 21 | _AA = parser.add_argument 22 | _AA("--coco-json", help="Path to COCO annotations JSON file.") 23 | _AA( 24 | "--lvis-json", 25 | nargs=2, 26 | help="Paths to LVIS train and val JSON files.", 27 | default=["datasets/lvis/lvis_v1_train.json", "datasets/lvis/lvis_v1_val.json"], 28 | ) 29 | _AA("--split", choices=["train", "val"], help="Which dataset split to pre-process?") 30 | _AA("--output", required=True, help="Path to save the output annotations JSON.") 31 | 32 | 33 | def make_cocofied_lvis(lvis_json_paths: list[str], split: str): 34 | """ 35 | Load LVIS instance annotations and filter them to keep instance annotations 36 | of the COCO categories for all images belonging to a COCO split (train/val). 37 | Category IDs in the output JSON are same as COCO IDs. 
38 | """ 39 | 40 | lvis_images, lvis_annos = [], [] 41 | for _path in lvis_json_paths: 42 | lvis_json = json.load(open(_path)) 43 | lvis_images.extend(lvis_json.pop("images")) 44 | lvis_annos.extend(lvis_json.pop("annotations")) 45 | 46 | # LVIS train/val splits are different than COCO (but in total, they cover the 47 | # same set of images). So we load both train and val annotations, then retain 48 | # images and their instances of the desired COCO split. 49 | keep_ids = set([x["id"] for x in lvis_images if split in x["coco_url"]]) 50 | lvis_images = [x for x in lvis_images if x["id"] in keep_ids] 51 | lvis_annos = [x for x in lvis_annos if x["image_id"] in keep_ids] 52 | 53 | # Replace the category ID in instance annotation (LVIS -> COCO), and remove 54 | # LVIS instances that do not represent COCO categories. 55 | lvis_to_coco_id = {x["lvis_id"]: x["coco_id"] for x in COCO_CATEGORIES_IN_LVIS} 56 | lvis_annos = [x for x in lvis_annos if x["category_id"] in lvis_to_coco_id] 57 | for ann in lvis_annos: 58 | ann["category_id"] = lvis_to_coco_id[ann["category_id"]] 59 | 60 | # Replace category IDs in the "negative categories" list per image, like above. 61 | for image in lvis_images: 62 | for key in ["not_exhaustive_category_ids", "neg_category_ids"]: 63 | image[key] = [x for x in image[key] if x in lvis_to_coco_id] 64 | image[key] = [lvis_to_coco_id[x] for x in image[key]] 65 | 66 | # Transfer metadata from original LVIS json to COCOfied LVIS json. 67 | cocofied_lvis = copy.deepcopy(lvis_json) 68 | cocofied_lvis["images"] = lvis_images 69 | cocofied_lvis["annotations"] = lvis_annos 70 | 71 | # Update category IDs of LVIS categories. 72 | cocofied_lvis["categories"] = [ 73 | x for x in cocofied_lvis["categories"] if x["id"] in lvis_to_coco_id 74 | ] 75 | for ann in cocofied_lvis["categories"]: 76 | ann["id"] = lvis_to_coco_id[ann["id"]] 77 | 78 | print(f"COCO-fied LVIS stats for COCO {split} split:") 79 | print(f" - Number of images = {len(lvis_images)}") 80 | print(f" - Number of annotations = {len(lvis_annos)}") 81 | 82 | return cocofied_lvis 83 | 84 | 85 | def main(_A: argparse.Namespace): 86 | coco_json = json.load(open(_A.coco_json)) 87 | lvis_json = make_cocofied_lvis(_A.lvis_json, _A.split) 88 | 89 | # Make a mapping from `(image_id, category_id) -> list[instances]` for both, 90 | # COCO and LVIS. 91 | coco_instances_dict = defaultdict(list) 92 | for ann in coco_json["annotations"]: 93 | # Mark the source of every annotation before merging. 94 | ann["source"] = "coco" 95 | ann["source_id"] = ann["id"] 96 | coco_instances_dict[(ann["image_id"], ann["category_id"])].append(ann) 97 | 98 | lvis_instances_dict = defaultdict(list) 99 | for ann in lvis_json["annotations"]: 100 | ann["source"] = "lvis" 101 | ann["source_id"] = ann["id"] 102 | lvis_instances_dict[(ann["image_id"], ann["category_id"])].append(ann) 103 | 104 | # ------------------------------------------------------------------------ 105 | # For val set, remove all COCO-fied LVIS annotations for `(image, category)` 106 | # pair if instances are not annotated exhaustively. 
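To make the replacement rule applied further below concrete, here is a toy, hedged example for one `(image, category)` pair (annotation dicts abbreviated; the scenario is invented): COCO drew a single mask over a group of three donuts, while COCO-fied LVIS annotated them individually, so the LVIS instances win.

```python
anns_in_coco = [{"id": 1, "source": "coco"}]
anns_in_lvis = [{"id": 7, "source": "lvis"}, {"id": 8, "source": "lvis"}, {"id": 9, "source": "lvis"}]

merged = anns_in_lvis if len(anns_in_lvis) > len(anns_in_coco) else anns_in_coco
assert [a["source"] for a in merged] == ["lvis", "lvis", "lvis"]
```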
107 | if _A.split == "val": 108 | _remove = [ 109 | (image_info["id"], category_id) 110 | for image_info in lvis_json["images"] 111 | for category_id in image_info["not_exhaustive_category_ids"] 112 | ] 113 | lvis_instances_dict = { 114 | k: v for k, v in lvis_instances_dict.items() if k not in _remove 115 | } 116 | 117 | # ------------------------------------------------------------------------ 118 | # If `(image, category)` tuple has more LVIS instances than COCO instances 119 | # then all COCO instances will be replaced by LVIS instances. 120 | merged_annotations = [] 121 | for (image_id, category_id), anns_in_coco in coco_instances_dict.items(): 122 | anns_in_lvis = lvis_instances_dict.get((image_id, category_id), []) 123 | 124 | if len(anns_in_lvis) > len(anns_in_coco): 125 | merged_annotations.extend(anns_in_lvis) 126 | else: 127 | merged_annotations.extend(anns_in_coco) 128 | 129 | # Some `(image, category)` instances of LVIS are completely absent in COCO. 130 | # Add all of these while merging. 131 | for (image_id, category_id), anns_in_lvis in lvis_instances_dict.items(): 132 | if (image_id, category_id) not in coco_instances_dict: 133 | merged_annotations.extend(anns_in_lvis) 134 | 135 | coco_json["annotations"] = merged_annotations 136 | 137 | # Re-assign annotation IDs after merging. 138 | image_id_to_anns_coco = defaultdict(list) 139 | for ann in coco_json["annotations"]: 140 | image_id_to_anns_coco[ann["image_id"]].append(ann) 141 | 142 | for image_id, anns_in_coco in image_id_to_anns_coco.items(): 143 | for idx, ann in enumerate(anns_in_coco): 144 | ann["id"] = image_id * 1000 + idx 145 | 146 | # ------------------------------------------------------------------------ 147 | # Calculate number of annotations sourced from COCO/LVIS. 148 | num_coco_src = len([x for x in merged_annotations if x["source"] == "coco"]) 149 | num_lvis_src = len([x for x in merged_annotations if x["source"] == "lvis"]) 150 | 151 | print(f"Final COCO {_A.split} split statistics after merging:") 152 | print(f" - Number of images = {len(coco_json['images'])}") 153 | print(f" - Number of annotations = {len(coco_json['annotations'])}") 154 | print(f" - Annotations from COCO = {num_coco_src}") 155 | print(f" - Annotations from LVIS = {num_lvis_src}") 156 | 157 | json.dump(coco_json, open(_A.output, "w")) 158 | print(f"Saved the merged annotations JSON to {_A.output}") 159 | 160 | 161 | if __name__ == "__main__": 162 | _A = parser.parse_args() 163 | 164 | # Log all command-line arguments. 165 | print("Running with arguments:") 166 | for key, value in vars(_A).items(): 167 | print(f"{key:<10}: {value}") 168 | 169 | main(_A) 170 | -------------------------------------------------------------------------------- /scripts/refine_boundaries.py: -------------------------------------------------------------------------------- 1 | """ 2 | Refine mask boundaries of input COCO JSON to obtain COCO-ReM. Refinement is done 3 | using the `SamRefiner` module in this package. 
4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | import argparse 9 | import json 10 | import os 11 | from collections import defaultdict 12 | 13 | import torch 14 | from detectron2 import engine 15 | from detectron2.data.detection_utils import read_image 16 | from detectron2.data.transforms import ResizeShortestEdge 17 | from detectron2.structures import polygons_to_bitmask 18 | from detectron2.utils import comm 19 | from segment_anything.utils import amg 20 | from tqdm import tqdm 21 | 22 | from coco_rem.modeling.sam_refiner import SamRefiner 23 | 24 | # Add documentation of `SamRefiner` to this script documentation, so argparse can 25 | # display it with `--help`. 26 | __doc__ += f"\n\n{SamRefiner.__doc__}\n{SamRefiner.__init__.__doc__}" 27 | 28 | # fmt: off 29 | parser = argparse.ArgumentParser( 30 | description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter 31 | ) 32 | _AA = parser.add_argument 33 | _AA("--input-json", required=True, help="Path to COCO annotations JSON.") 34 | _AA("--image-dir", default="datasets/coco/val2017", help="COCO image directory.") 35 | _AA("--num-gpus", type=int, default=0, help="Number of GPUs for parallelization.") 36 | _AA("--output", required=True, help="Path to save output annotations JSON.") 37 | 38 | group = parser.add_argument_group("Input arguments to `SamRefiner`.") 39 | group.add_argument("--arch", default="vit_h", choices=["vit_b", "vit_l", "vit_h"]) 40 | group.add_argument("--checkpoint", default="checkpoints/sam_vit_h_4b8939.pth") 41 | group.add_argument("--num-extra-points", type=int, default=2) 42 | group.add_argument("--num-trials", type=int, default=10) 43 | group.add_argument( 44 | "--box-only-names", nargs="+", 45 | default=["bed", "bicycle", "bowl", "dining table", "motorcycle", "scissors"], 46 | help="COCO category names for which we only use box prompts.", 47 | ) 48 | # fmt: on 49 | 50 | 51 | def main(_A: argparse.Namespace): 52 | device = torch.device("cpu") 53 | if torch.cuda.is_available(): 54 | device = torch.cuda.current_device() 55 | 56 | # ------------------------------------------------------------------------ 57 | coco_json = json.load(open(_A.input_json)) 58 | 59 | # Make a mapping between image ID and all instance annotations. 60 | image_id_annotations = defaultdict(list) 61 | for ann in coco_json["annotations"]: 62 | image_id_annotations[ann["image_id"]].append(ann) 63 | 64 | image_id_annotations = list(image_id_annotations.items()) 65 | 66 | # Shard the dataset so each GPU only refines masks for a subset of images. 67 | WORLD_SIZE = comm.get_world_size() 68 | RANK = comm.get_rank() 69 | 70 | image_id_annotations = image_id_annotations[RANK::WORLD_SIZE] 71 | print(f"GPU {RANK}/{WORLD_SIZE} will process {len(image_id_annotations)} images.") 72 | 73 | # Get a list of category IDs for which only box prompts will be used. 74 | cat_id_map = {x["name"]: x["id"] for x in coco_json["categories"]} 75 | box_only_ids = [cat_id_map[x] for x in _A.box_only_names] 76 | 77 | # ------------------------------------------------------------------------ 78 | # Instantiate model and input tranform (resize longest side to 1024 pixels). 
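A quick, hedged illustration of the comment above: constructing `ResizeShortestEdge` with `size == max_size == 1024` (as `preprocess` is built below) effectively scales the longest image side to 1024 pixels; the 480x640 input shape here is arbitrary.

```python
import numpy as np
from detectron2.data.transforms import ResizeShortestEdge

aug = ResizeShortestEdge(1024, max_size=1024)
dummy = np.zeros((480, 640, 3), dtype=np.uint8)
tfm = aug.get_transform(dummy)
print(tfm.new_h, tfm.new_w)  # 768 1024 -> the longer (640 px) side is scaled up to 1024
```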
79 | refiner = SamRefiner( 80 | _A.arch, _A.checkpoint, _A.num_extra_points, _A.num_trials, box_only_ids 81 | ) 82 | refiner = refiner.eval().to(device) 83 | 84 | preprocess = ResizeShortestEdge(refiner.img_size, max_size=refiner.img_size) 85 | 86 | # ------------------------------------------------------------------------ 87 | for image_id, annotations in tqdm(image_id_annotations, "Refining masks"): 88 | image_path = os.path.join(_A.image_dir, f"{image_id:0>12d}.jpg") 89 | image = read_image(image_path, "RGB") 90 | original_hw = image.shape[:2] 91 | 92 | # Pre-process image and masks. 93 | transform = preprocess.get_transform(image) 94 | image = transform.apply_image(image) 95 | 96 | # Get image height/width before and after applying resize transform. 97 | resized_hw = image.shape[:2] 98 | 99 | # Convert image to NCHW format tensor, RGB values in 0-255). 100 | image = torch.as_tensor(image, device=device) 101 | image = image.permute(2, 0, 1).contiguous() 102 | 103 | # Make batches of source masks (NHW bool tensor). 104 | source_masks = [ann["segmentation"] for ann in annotations] 105 | for idx, segm in enumerate(source_masks): 106 | if isinstance(segm, list): 107 | # Polygons. 108 | polygons = [torch.as_tensor(p).view(-1, 2) for p in segm] 109 | polygons = [p.view(-1) for p in transform.apply_polygons(polygons)] 110 | segm = polygons_to_bitmask(polygons, *resized_hw) 111 | elif isinstance(segm, dict): 112 | # RLE. 113 | segm = amg.rle_to_mask(segm).astype("uint8") 114 | segm = transform.apply_segmentation(segm) 115 | 116 | source_masks[idx] = torch.as_tensor(segm).bool() 117 | 118 | source_masks = torch.stack(source_masks).to(device) 119 | # -------------------------------------------------------------------- 120 | 121 | category_ids = [ann["category_id"] for ann in annotations] 122 | refined_masks = refiner(image, source_masks, category_ids, original_hw) 123 | 124 | # Get tight boxes enclosing refined masks, then convert masks to RLE. 125 | refined_boxes_xyxy = amg.batched_mask_to_box(refined_masks) 126 | refined_masks = amg.mask_to_rle_pytorch(refined_masks) 127 | 128 | # Replace the source masks with refined masks in COCO annotations. 129 | # NOTE: Keep "crowd" annotations unchanged as they don't participate in 130 | # the calculation of COCO AP. 131 | for idx, ann in enumerate(annotations): 132 | if ann.get("iscrowd", 0) != 1: 133 | ann["segmentation"] = refined_masks[idx] 134 | ann["area"] = amg.area_from_rle(refined_masks[idx]) 135 | 136 | # Recompute box enclosing the refined mask. 137 | x1, y1, x2, y2 = refined_boxes_xyxy[idx].tolist() 138 | ann["bbox"] = [x1, y1, x2 - x1 + 1, y2 - y1 + 1] 139 | 140 | # ------------------------------------------------------------------------ 141 | 142 | # Combine the refined masks from all GPU processes to main process. 143 | all_refined_annotations = [] 144 | for _, annotations in image_id_annotations: 145 | all_refined_annotations.extend(annotations) 146 | 147 | all_refined_annotations = comm.gather(all_refined_annotations, dst=0) 148 | 149 | # In main process, replace annotations in COCO JSON and save to output. 
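For context on the gather call above and the flattening loop that follows (a hedged sketch of how this script relies on `detectron2.utils.comm.gather`): only the destination rank receives data, as a list with one entry per process, which is why the main process below extends from a list of per-rank lists.

```python
# With 2 GPUs, each having refined its own shard of images:
#   rank 0: gathered == [anns_from_rank_0, anns_from_rank_1]   (list of per-rank lists)
#   rank 1: gathered == []                                      (non-destination ranks get nothing)
gathered = comm.gather(all_refined_annotations, dst=0)
flat = [ann for per_rank in gathered for ann in per_rank]
```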
150 | if comm.is_main_process(): 151 | coco_json["annotations"] = [] 152 | for ann_list in all_refined_annotations: 153 | coco_json["annotations"].extend(ann_list) 154 | 155 | os.makedirs(os.path.dirname(_A.output), exist_ok=True) 156 | json.dump(coco_json, open(_A.output, "w")) 157 | print(f"Saved annotations JSON with refined masks to {_A.output}") 158 | 159 | comm.synchronize() 160 | print(f"GPU {RANK}/{WORLD_SIZE}: Refinement complete!") 161 | 162 | 163 | if __name__ == "__main__": 164 | _A = parser.parse_args() 165 | 166 | print("Running with arguments:") 167 | for key, value in vars(_A).items(): 168 | print(f"{key:<30}: {value}") 169 | 170 | engine.launch(main, num_gpus_per_machine=_A.num_gpus, dist_url="auto", args=(_A,)) 171 | -------------------------------------------------------------------------------- /scripts/train_net.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train or evaluation a model using Detectron2-style lazy config. 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import argparse 8 | import json 9 | import logging 10 | import warnings 11 | 12 | import torch 13 | from detectron2 import engine 14 | from detectron2.checkpoint import DetectionCheckpointer 15 | from detectron2.config import LazyConfig, instantiate 16 | from detectron2.engine import hooks 17 | from detectron2.engine.defaults import create_ddp_model 18 | from detectron2.evaluation import inference_on_dataset, print_csv_format 19 | from detectron2.evaluation.testing import flatten_results_dict 20 | from detectron2.utils import comm 21 | 22 | from coco_rem.data.builtin import register_all_coco_rem 23 | from coco_rem.trainer import AMPWithGradAccumTrainer 24 | 25 | warnings.filterwarnings("ignore") 26 | logger = logging.getLogger("detectron2") 27 | 28 | 29 | parser = engine.default_argument_parser(__doc__) 30 | _AA = parser.add_argument 31 | _AA("--checkpoint-period", type=int, default=5000, help="Checkpoint saving period.") 32 | _AA("--log-period", type=int, default=10, help="Log training progress periodically.") 33 | 34 | 35 | def do_test(_C, model): 36 | data_loader = instantiate(_C.dataloader.test) 37 | evaluator = instantiate(_C.dataloader.evaluator) 38 | 39 | results = inference_on_dataset(model, data_loader, evaluator) 40 | print_csv_format(results) 41 | return results 42 | 43 | 44 | def main(_A: argparse.Namespace): 45 | # Register COCO-ReM dataset splits before starting the training job. 
46 | register_all_coco_rem() 47 | 48 | _C = LazyConfig.load(_A.config_file) 49 | _C = LazyConfig.apply_overrides(_C, _A.opts) 50 | 51 | engine.default_setup(_C, _A) 52 | 53 | device = torch.cuda.current_device() if _A.num_gpus != 0 else torch.device("cpu") 54 | 55 | model = instantiate(_C.model).to(device) 56 | logger.info("Model:\n{}".format(model)) 57 | 58 | model = create_ddp_model(model) 59 | DetectionCheckpointer(model).load(_C.train.get("init_checkpoint", None)) 60 | 61 | if _A.eval_only: 62 | results = do_test(_C, model) 63 | if comm.is_main_process(): 64 | results = flatten_results_dict(results) 65 | json.dump(results, open(f"{_C.train.output_dir}/eval_results.json", "w")) 66 | return 67 | 68 | train_loader = instantiate(_C.dataloader.train) 69 | 70 | _C.optimizer.params.model = model 71 | optim = instantiate(_C.optimizer) 72 | 73 | trainer_cls = AMPWithGradAccumTrainer if _C.train.amp else engine.SimpleTrainer 74 | trainer = trainer_cls( 75 | model, train_loader, optim, grad_accum_steps=_C.train.get("grad_accum_steps", 1) 76 | ) 77 | checkpointer = DetectionCheckpointer(model, _C.train.output_dir, trainer=trainer) 78 | 79 | trainer.register_hooks( 80 | [ 81 | hooks.IterationTimer(), 82 | hooks.LRScheduler(scheduler=instantiate(_C.lr_multiplier)), 83 | hooks.PeriodicCheckpointer(checkpointer, _A.checkpoint_period) 84 | if comm.is_main_process() 85 | else None, 86 | hooks.EvalHook(_A.checkpoint_period, lambda: do_test(_C, model)), 87 | hooks.PeriodicWriter( 88 | engine.default_writers(_C.train.output_dir, _C.train.max_iter), 89 | period=_A.log_period, 90 | ) 91 | if comm.is_main_process() 92 | else None, 93 | ] 94 | ) 95 | 96 | checkpointer.resume_or_load(_C.train.init_checkpoint, resume=_A.resume) 97 | if _A.resume and checkpointer.has_checkpoint(): 98 | # The checkpoint stores the training iteration that just finished, thus we start 99 | # at the next iteration 100 | start_iter = trainer.iter + 1 101 | else: 102 | start_iter = 0 103 | trainer.train(start_iter, _C.train.max_iter) 104 | 105 | 106 | if __name__ == "__main__": 107 | _A = parser.parse_args() 108 | engine.launch( 109 | main, 110 | num_gpus_per_machine=_A.num_gpus, 111 | num_machines=_A.num_machines, 112 | machine_rank=_A.machine_rank, 113 | dist_url=_A.dist_url, 114 | args=(_A,), 115 | ) 116 | -------------------------------------------------------------------------------- /scripts/visualize_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualize instances from a COCO annotations JSON (COCO-2017 or COCO-ReM). 3 | """ 4 | 5 | import argparse 6 | import logging 7 | import os 8 | 9 | import numpy as np 10 | from detectron2.data import DatasetCatalog, MetadataCatalog 11 | from detectron2.data import detection_utils as utils 12 | from detectron2.data.datasets import load_coco_json 13 | from tqdm import tqdm 14 | 15 | from coco_rem.mask_visualizer import MaskVisualizer 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | # fmt: off 20 | parser = argparse.ArgumentParser(description=__doc__) 21 | _AA = parser.add_argument 22 | _AA( 23 | "--input-json", default="datasets/coco/annotations/instances_val2017.json", 24 | help="Path to JSON file containing COCO annotations." 
25 | ) 26 | _AA( 27 | "--image-dir", default="datasets/coco/val2017", 28 | help="Path to directory containing COCO images.", 29 | ) 30 | _AA("--draw-labels", action="store_true", help="Whether to draw labels on masks.") 31 | _AA("--class-name", help="If provided, visualize masks of this class only.") 32 | 33 | _AA("--output", default="./viz", help="Path to output (saving) dir.") 34 | _AA("--filename-suffix", help="Add a suffix to saved image file name.") 35 | # fmt: on 36 | 37 | 38 | def add_id_to_labels(dic, labels): 39 | labels = [f"{lbl} ({x['id']})" for lbl, x in zip(labels, dic["annotations"])] 40 | return labels 41 | 42 | 43 | if __name__ == "__main__": 44 | _A = parser.parse_args() 45 | print("Arguments: " + str(_A)) 46 | 47 | # Register the input COCO JSON file as a Detectron2 dataset to load nicely 48 | # formatted dataset dicts for visualization. 49 | # Extra annotation keys: all possible keys added in generated JSON files. 50 | name = "coco_or_lvis_v1_cocofied_to_visualize" 51 | extra_keys = ["source", "source_id", "id"] 52 | 53 | DatasetCatalog.register( 54 | name, lambda: load_coco_json(_A.input_json, _A.image_dir, name, extra_keys) 55 | ) 56 | # ------------------------------------------------------------------------ 57 | # Fix seed for reproducible colors. 58 | np.random.seed(0) 59 | 60 | dataset_dicts = DatasetCatalog.get(name) 61 | class_names = MetadataCatalog.get("coco_2017_val").thing_classes 62 | os.makedirs(_A.output, exist_ok=True) 63 | 64 | for ddict in tqdm(dataset_dicts): 65 | if _A.class_name is not None: 66 | ddict["annotations"] = [ 67 | ann 68 | for ann in ddict["annotations"] 69 | if class_names[ann["category_id"]] == _A.class_name 70 | ] 71 | 72 | if len(ddict["annotations"]) > 0: 73 | img = utils.read_image(ddict["file_name"], "RGB") 74 | visualizer = MaskVisualizer(img, class_names) 75 | vis_image = visualizer.draw_dataset_dict( 76 | ddict, _A.draw_labels, label_suffix_formatter=add_id_to_labels 77 | ) 78 | 79 | # Save the visualized image. 80 | filepath = os.path.join(_A.output, os.path.basename(ddict["file_name"])) 81 | if _A.class_name is not None: 82 | filepath = filepath.replace(".jpg", f"_{_A.class_name}.jpg") 83 | 84 | if _A.filename_suffix is not None: 85 | filepath = filepath.replace(".jpg", f"_{_A.filename_suffix}.jpg") 86 | 87 | vis_image.save(filepath) 88 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import find_packages, setup 3 | 4 | setup( 5 | name="coco_rem", 6 | version="0.1", 7 | python_requires=">=3.8", 8 | zip_safe=True, 9 | packages=find_packages(include=["coco_rem"]), 10 | ) 11 | --------------------------------------------------------------------------------